├── .gitignore
├── LICENSE
├── Lavis
    ├── evaluate.py
    ├── lavis
    │   ├── __init__.py
    │   ├── common
    │   │   ├── config.py
    │   │   ├── dist_utils.py
    │   │   ├── gradcam.py
    │   │   ├── logger.py
    │   │   ├── optims.py
    │   │   ├── registry.py
    │   │   ├── utils.py
    │   │   └── vqa_tools
    │   │   │   ├── __init__.py
    │   │   │   ├── vqa.py
    │   │   │   └── vqa_eval.py
    │   ├── configs
    │   │   ├── datasets
    │   │   │   ├── aokvqa
    │   │   │   │   ├── defaults.yaml
    │   │   │   │   └── eval_aokvqa.yaml
    │   │   │   ├── avsd
    │   │   │   │   └── defaults_dial.yaml
    │   │   │   ├── coco
    │   │   │   │   ├── defaults_cap.yaml
    │   │   │   │   ├── defaults_ret.yaml
    │   │   │   │   ├── defaults_vqa.yaml
    │   │   │   │   └── eval_vqa.yaml
    │   │   │   ├── conceptual_caption
    │   │   │   │   ├── defaults_12m.yaml
    │   │   │   │   └── defaults_3m.yaml
    │   │   │   ├── didemo
    │   │   │   │   └── defaults_ret.yaml
    │   │   │   ├── flickr30k
    │   │   │   │   └── defaults.yaml
    │   │   │   ├── gqa
    │   │   │   │   ├── balanced_testdev.yaml
    │   │   │   │   ├── balanced_val.yaml
    │   │   │   │   └── defaults.yaml
    │   │   │   ├── imagenet
    │   │   │   │   └── defaults.yaml
    │   │   │   ├── laion
    │   │   │   │   └── defaults_2B_multi.yaml
    │   │   │   ├── msrvtt
    │   │   │   │   ├── defaults_cap.yaml
    │   │   │   │   ├── defaults_qa.yaml
    │   │   │   │   └── defaults_ret.yaml
    │   │   │   ├── msvd
    │   │   │   │   ├── defaults_cap.yaml
    │   │   │   │   └── defaults_qa.yaml
    │   │   │   ├── nlvr
    │   │   │   │   └── defaults.yaml
    │   │   │   ├── nocaps
    │   │   │   │   └── defaults.yaml
    │   │   │   ├── okvqa
    │   │   │   │   └── defaults.yaml
    │   │   │   ├── sbu_caption
    │   │   │   │   └── defaults.yaml
    │   │   │   ├── snli_ve
    │   │   │   │   └── defaults.yaml
    │   │   │   ├── vatex
    │   │   │   │   └── defaults_cap.yaml
    │   │   │   └── vg
    │   │   │   │   ├── defaults_caption.yaml
    │   │   │   │   └── defaults_vqa.yaml
    │   │   ├── default.yaml
    │   │   └── models
    │   │   │   ├── albef_classification_ve.yaml
    │   │   │   ├── albef_feature_extractor.yaml
    │   │   │   ├── albef_nlvr.yaml
    │   │   │   ├── albef_pretrain_base.yaml
    │   │   │   ├── albef_retrieval_coco.yaml
    │   │   │   ├── albef_retrieval_flickr.yaml
    │   │   │   ├── albef_vqav2.yaml
    │   │   │   ├── alpro_qa_msrvtt.yaml
    │   │   │   ├── alpro_qa_msvd.yaml
    │   │   │   ├── alpro_retrieval_didemo.yaml
    │   │   │   ├── alpro_retrieval_msrvtt.yaml
    │   │   │   ├── bert_config.json
    │   │   │   ├── bert_config_alpro.json
    │   │   │   ├── blip2
    │   │   │   ├── blip2_caption_flant5xl.yaml
    │   │   │   ├── blip2_caption_opt2.7b.yaml
    │   │   │   ├── blip2_caption_opt6.7b.yaml
    │   │   │   ├── blip2_coco.yaml
    │   │   │   ├── blip2_pretrain.yaml
    │   │   │   ├── blip2_pretrain_flant5xl.yaml
    │   │   │   ├── blip2_pretrain_flant5xl_vitL.yaml
    │   │   │   ├── blip2_pretrain_flant5xxl.yaml
    │   │   │   ├── blip2_pretrain_opt2.7b.yaml
    │   │   │   ├── blip2_pretrain_opt6.7b.yaml
    │   │   │   └── blip2_pretrain_vitL.yaml
    │   │   │   ├── blip_caption_base_coco.yaml
    │   │   │   ├── blip_caption_large_coco.yaml
    │   │   │   ├── blip_classification_base.yaml
    │   │   │   ├── blip_feature_extractor_base.yaml
    │   │   │   ├── blip_itm_base.yaml
    │   │   │   ├── blip_itm_large.yaml
    │   │   │   ├── blip_nlvr.yaml
    │   │   │   ├── blip_pretrain_base.yaml
    │   │   │   ├── blip_pretrain_large.yaml
    │   │   │   ├── blip_retrieval_coco.yaml
    │   │   │   ├── blip_retrieval_flickr.yaml
    │   │   │   ├── blip_vqa_aokvqa.yaml
    │   │   │   ├── blip_vqa_okvqa.yaml
    │   │   │   ├── blip_vqav2.yaml
    │   │   │   ├── clip
    │   │   │   ├── RN101-quickgelu.json
    │   │   │   ├── RN101.json
    │   │   │   ├── RN50-quickgelu.json
    │   │   │   ├── RN50.json
    │   │   │   ├── RN50x16.json
    │   │   │   ├── RN50x4.json
    │   │   │   ├── ViT-B-16-plus-240.json
    │   │   │   ├── ViT-B-16-plus.json
    │   │   │   ├── ViT-B-16.json
    │   │   │   ├── ViT-B-32-plus-256.json
    │   │   │   ├── ViT-B-32-quickgelu.json
    │   │   │   ├── ViT-B-32.json
    │   │   │   ├── ViT-H-14.json
    │   │   │   ├── ViT-H-16.json
    │   │   │   ├── ViT-L-14-280.json
    │   │   │   ├── ViT-L-14-336.json
    │   │   │   ├── ViT-L-14.json
    │   │   │   ├── ViT-L-16-320.json
    │   │   │   ├── ViT-L-16.json
    │   │   │   ├── ViT-g-14.json
    │   │   │   ├── timm-efficientnetv2_rw_s.json
    │   │   │   ├── timm-resnet50d.json
    │   │   │   ├── timm-resnetaa50d.json
    │   │   │   ├── timm-resnetblur50.json
    │   │   │   ├── timm-swin_base_patch4_window7_224.json
    │   │   │   ├── timm-vit_base_patch16_224.json
    │   │   │   ├── timm-vit_base_patch32_224.json
    │   │   │   └── timm-vit_small_patch16_224.json
    │   │   │   ├── clip_resnet50.yaml
    │   │   │   ├── clip_vit_base16.yaml
    │   │   │   ├── clip_vit_base32.yaml
    │   │   │   ├── clip_vit_large14.yaml
    │   │   │   ├── clip_vit_large14_336.yaml
    │   │   │   ├── gpt_dialogue_base.yaml
    │   │   │   ├── img2prompt-vqa
    │   │   │   └── img2prompt_vqa_base.yaml
    │   │   │   ├── med_config.json
    │   │   │   ├── med_config_albef.json
    │   │   │   ├── med_large_config.json
    │   │   │   └── pnp-vqa
    │   │   │   ├── pnp_vqa_3b.yaml
    │   │   │   ├── pnp_vqa_base.yaml
    │   │   │   ├── pnp_vqa_large.yaml
    │   │   │   ├── unifiedqav2_3b_config.json
    │   │   │   ├── unifiedqav2_base_config.json
    │   │   │   └── unifiedqav2_large_config.json
    │   ├── datasets
    │   │   ├── builders
    │   │   │   ├── __init__.py
    │   │   │   ├── base_dataset_builder.py
    │   │   │   ├── caption_builder.py
    │   │   │   ├── classification_builder.py
    │   │   │   ├── dialogue_builder.py
    │   │   │   ├── image_text_pair_builder.py
    │   │   │   ├── imagefolder_builder.py
    │   │   │   ├── retrieval_builder.py
    │   │   │   ├── video_qa_builder.py
    │   │   │   └── vqa_builder.py
    │   │   ├── data_utils.py
    │   │   ├── datasets
    │   │   │   ├── aok_vqa_datasets.py
    │   │   │   ├── avsd_dialogue_datasets.py
    │   │   │   ├── base_dataset.py
    │   │   │   ├── caption_datasets.py
    │   │   │   ├── coco_caption_datasets.py
    │   │   │   ├── coco_vqa_datasets.py
    │   │   │   ├── dataloader_utils.py
    │   │   │   ├── dialogue_datasets.py
    │   │   │   ├── gqa_datasets.py
    │   │   │   ├── image_text_pair_datasets.py
    │   │   │   ├── imagefolder_dataset.py
    │   │   │   ├── laion_dataset.py
    │   │   │   ├── multimodal_classification_datasets.py
    │   │   │   ├── nlvr_datasets.py
    │   │   │   ├── retrieval_datasets.py
    │   │   │   ├── snli_ve_datasets.py
    │   │   │   ├── vg_vqa_datasets.py
    │   │   │   ├── video_caption_datasets.py
    │   │   │   ├── video_vqa_datasets.py
    │   │   │   └── vqa_datasets.py
    │   │   └── download_scripts
    │   │   │   ├── DownloadConceptualCaptions
    │   │   │   ├── LICENSE
    │   │   │   ├── README.md
    │   │   │   ├── create_annotation_12m.ipynb
    │   │   │   ├── create_annotation_3m.ipynb
    │   │   │   ├── download_data_cc12m.py
    │   │   │   └── download_data_cc3m.py
    │   │   │   ├── download_coco.py
    │   │   │   ├── download_didemo.py
    │   │   │   ├── download_flickr.py
    │   │   │   ├── download_gqa.py
    │   │   │   ├── download_msrvtt.py
    │   │   │   ├── download_msvd.py
    │   │   │   ├── download_nocaps.py
    │   │   │   ├── download_sbu.py
    │   │   │   └── download_vg.py
    │   ├── models
    │   │   ├── __init__.py
    │   │   ├── albef_models
    │   │   │   ├── __init__.py
    │   │   │   ├── albef_classification.py
    │   │   │   ├── albef_feature_extractor.py
    │   │   │   ├── albef_nlvr.py
    │   │   │   ├── albef_outputs.py
    │   │   │   ├── albef_pretrain.py
    │   │   │   ├── albef_retrieval.py
    │   │   │   └── albef_vqa.py
    │   │   ├── alpro_models
    │   │   │   ├── __init__.py
    │   │   │   ├── alpro_outputs.py
    │   │   │   ├── alpro_qa.py
    │   │   │   └── alpro_retrieval.py
    │   │   ├── base_model.py
    │   │   ├── blip2_models
    │   │   │   ├── Qformer.py
    │   │   │   ├── __init__.py
    │   │   │   ├── blip2.py
    │   │   │   ├── blip2_image_text_matching.py
    │   │   │   ├── blip2_opt.py
    │   │   │   ├── blip2_qformer.py
    │   │   │   ├── blip2_t5.py
    │   │   │   ├── blip2_t5_par.py
    │   │   │   ├── modeling_opt.py
    │   │   │   └── modeling_t5.py
    │   │   ├── blip_models
    │   │   │   ├── __init__.py
    │   │   │   ├── blip.py
    │   │   │   ├── blip_caption.py
    │   │   │   ├── blip_classification.py
    │   │   │   ├── blip_feature_extractor.py
    │   │   │   ├── blip_image_text_matching.py
    │   │   │   ├── blip_nlvr.py
    │   │   │   ├── blip_outputs.py
    │   │   │   ├── blip_pretrain.py
    │   │   │   ├── blip_retrieval.py
    │   │   │   ├── blip_vqa.py
    │   │   │   └── nlvr_encoder.py
    │   │   ├── clip_models
    │   │   │   ├── __init__.py
    │   │   │   ├── bpe_simple_vocab_16e6.txt.gz
    │   │   │   ├── clip_outputs.py
    │   │   │   ├── loss.py
    │   │   │   ├── model.py
    │   │   │   ├── pics
    │   │   │   │   └── CLIP.png
    │   │   │   ├── pretrained.py
    │   │   │   ├── timm_model.py
    │   │   │   ├── tokenizer.py
    │   │   │   ├── transform.py
    │   │   │   └── utils.py
    │   │   ├── clip_vit.py
    │   │   ├── eva_vit.py
    │   │   ├── gpt_models
    │   │   │   └── gpt_dialogue.py
    │   │   ├── img2prompt_models
    │   │   │   ├── __init__.py
    │   │   │   └── img2prompt_vqa.py
    │   │   ├── med.py
    │   │   ├── pnp_vqa_models
    │   │   │   ├── __init__.py
    │   │   │   ├── pnp_unifiedqav2_fid.py
    │   │   │   └── pnp_vqa.py
    │   │   ├── timesformer
    │   │   │   ├── __init__.py
    │   │   │   ├── conv2d_same.py
    │   │   │   ├── features.py
    │   │   │   ├── helpers.py
    │   │   │   ├── linear.py
    │   │   │   ├── vit.py
    │   │   │   └── vit_utils.py
    │   │   └── vit.py
    │   ├── processors
    │   │   ├── __init__.py
    │   │   ├── alpro_processors.py
    │   │   ├── base_processor.py
    │   │   ├── blip_processors.py
    │   │   ├── clip_processors.py
    │   │   ├── functional_video.py
    │   │   ├── gpt_processors.py
    │   │   ├── randaugment.py
    │   │   └── transforms_video.py
    │   ├── projects
    │   │   └── blip2
    │   │   │   ├── direct_aokvqa_zeroshot_flant5xl_eval.yaml
    │   │   │   ├── mc_aokvqa_zeroshot_flant5xl_eval.yaml
    │   │   │   └── vqav2_zeroshot_flant5xl_eval.yaml
    │   ├── runners
    │   │   ├── __init__.py
    │   │   ├── runner_base.py
    │   │   └── runner_iter.py
    │   └── tasks
    │   │   ├── __init__.py
    │   │   ├── base_task.py
    │   │   ├── captioning.py
    │   │   ├── dialogue.py
    │   │   ├── image_text_pretrain.py
    │   │   ├── multimodal_classification.py
    │   │   ├── retrieval.py
    │   │   ├── vqa.py
    │   │   └── vqa_reading_comprehension.py
    ├── requirements.txt
    ├── setup.py
    └── train.py
├── MiniGPT-4
    ├── demo.py
    ├── environment.yml
    ├── evaluate.py
    ├── minigpt4
    │   ├── __init__.py
    │   ├── common
    │   │   ├── __init__.py
    │   │   ├── config.py
    │   │   ├── dist_utils.py
    │   │   ├── gradcam.py
    │   │   ├── logger.py
    │   │   ├── optims.py
    │   │   ├── registry.py
    │   │   ├── utils.py
    │   │   └── vqa_tools
    │   │   │   ├── __init__.py
    │   │   │   ├── vqa.py
    │   │   │   └── vqa_eval.py
    │   ├── configs
    │   │   ├── datasets
    │   │   │   ├── aokvqa
    │   │   │   │   ├── defaults.yaml
    │   │   │   │   └── eval_aokvqa.yaml
    │   │   │   ├── cc_sbu
    │   │   │   │   ├── align.yaml
    │   │   │   │   └── defaults.yaml
    │   │   │   ├── coco
    │   │   │   │   ├── defaults_cap.yaml
    │   │   │   │   ├── defaults_ret.yaml
    │   │   │   │   ├── defaults_vqa.yaml
    │   │   │   │   └── eval_vqa.yaml
    │   │   │   └── laion
    │   │   │   │   └── defaults.yaml
    │   │   ├── default.yaml
    │   │   └── models
    │   │   │   ├── minigpt4_llama2.yaml
    │   │   │   └── minigpt4_vicuna0.yaml
    │   ├── conversation
    │   │   ├── __init__.py
    │   │   └── conversation.py
    │   ├── datasets
    │   │   ├── __init__.py
    │   │   ├── builders
    │   │   │   ├── __init__.py
    │   │   │   ├── base_dataset_builder.py
    │   │   │   ├── image_text_pair_builder.py
    │   │   │   └── vqa_builder.py
    │   │   ├── data_utils.py
    │   │   └── datasets
    │   │   │   ├── __init__.py
    │   │   │   ├── aok_vqa_datasets.py
    │   │   │   ├── base_dataset.py
    │   │   │   ├── caption_datasets.py
    │   │   │   ├── cc_sbu_dataset.py
    │   │   │   ├── coco_vqa_datasets.py
    │   │   │   ├── dataloader_utils.py
    │   │   │   ├── laion_dataset.py
    │   │   │   ├── old_dataloader_utils.py
    │   │   │   └── vqa_datasets.py
    │   ├── models
    │   │   ├── Qformer.py
    │   │   ├── __init__.py
    │   │   ├── base_model.py
    │   │   ├── blip2.py
    │   │   ├── blip2_outputs.py
    │   │   ├── eva_vit.py
    │   │   ├── mini_gpt4.py
    │   │   └── modeling_llama.py
    │   ├── processors
    │   │   ├── __init__.py
    │   │   ├── base_processor.py
    │   │   ├── blip_processors.py
    │   │   └── randaugment.py
    │   ├── projects
    │   │   └── minigpt4
    │   │   │   ├── conv_direct_aokvqa.yaml
    │   │   │   ├── conv_mc_aokvqa.yaml
    │   │   │   └── conv_vqav2.yaml
    │   ├── runners
    │   │   ├── __init__.py
    │   │   └── runner_base.py
    │   └── tasks
    │   │   ├── __init__.py
    │   │   ├── base_task.py
    │   │   ├── image_text_pretrain.py
    │   │   └── vqa.py
    └── train.py
├── README.md
└── assets
    ├── README.md
    ├── intro.png
    └── pipeline.png

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Archiki Prasad

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/Lavis/lavis/__init__.py:
--------------------------------------------------------------------------------
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import os
import sys

from omegaconf import OmegaConf

from lavis.common.registry import registry

from lavis.datasets.builders import *
from lavis.models import *
from lavis.processors import *
from lavis.tasks import *


root_dir = os.path.dirname(os.path.abspath(__file__))
default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml"))

registry.register_path("library_root", root_dir)
repo_root = os.path.join(root_dir, "..")
registry.register_path("repo_root", repo_root)
cache_root = os.path.join(repo_root, default_cfg.env.cache_root)
registry.register_path("cache_root", cache_root)

registry.register("MAX_INT", sys.maxsize)
registry.register("SPLIT_NAMES", ["train", "val", "test"])
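Note that this `__init__.py` is a side-effecting import: it loads `configs/default.yaml` and populates the shared registry with paths and constants. A minimal sketch of reading those values back, assuming the registry exposes `get_path()`/`get()` as counterparts to `register_path()`/`register()` (as in upstream LAVIS):

```python
import lavis  # noqa: F401 -- importing lavis runs the registrations above
from lavis.common.registry import registry

cache_root = registry.get_path("cache_root")  # e.g. <repo_root>/.cache/lavis
splits = registry.get("SPLIT_NAMES")          # ["train", "val", "test"]
print(cache_root, splits)
```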
--------------------------------------------------------------------------------
/Lavis/lavis/common/gradcam.py:
--------------------------------------------------------------------------------
import numpy as np
from matplotlib import pyplot as plt
from scipy.ndimage import filters
from skimage import transform as skimage_transform


def getAttMap(img, attMap, blur=True, overlap=True):
    attMap -= attMap.min()
    if attMap.max() > 0:
        attMap /= attMap.max()
    attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant")
    if blur:
        attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2]))
        attMap -= attMap.min()
        attMap /= attMap.max()
    cmap = plt.get_cmap("jet")
    attMapV = cmap(attMap)
    attMapV = np.delete(attMapV, 3, 2)
    if overlap:
        attMap = (
            1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img
            + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV
        )
    return attMap
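A hypothetical usage sketch for `getAttMap` (not part of the repo): it assumes `img` is an HxWx3 float array in [0, 1] and `att` is any 2-D attention map (e.g. from GradCAM); the function rescales and resizes the map itself before overlaying it as a jet heatmap.

```python
import numpy as np
from matplotlib import pyplot as plt
from lavis.common.gradcam import getAttMap

img = np.random.rand(224, 224, 3)  # stand-in for a normalized RGB image
att = np.random.rand(24, 24)       # stand-in for a low-res attention map

overlay = getAttMap(img, att, blur=True, overlap=True)
plt.imshow(overlay)
plt.axis("off")
plt.show()
```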
--------------------------------------------------------------------------------
/Lavis/lavis/common/vqa_tools/__init__.py:
--------------------------------------------------------------------------------
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

__author__ = "aagrawal"

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/aokvqa/defaults.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  aok_vqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json
          storage:
            - aokvqa/annotations/aokvqa_v1p0_train.json
        val:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/specialized_vocab_train.json
          storage:
            - aokvqa/annotations/aokvqa_v1p0_val.json
            - aokvqa/annotations/specialized_vocab_train_lavis.json
            # - aokvqa/annotations/large_vocab_train_lavis.json
        test:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/specialized_vocab_train.json
          storage:
            - aokvqa/annotations/aokvqa_v1p0_test.json
            - aokvqa/annotations/specialized_vocab_train_lavis.json
      images:
        storage: coco/images/

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/aokvqa/eval_aokvqa.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  aok_vqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        val:
          url:
            - aokvqa/annotations/aokvqa_v1p0_val.json
            - aokvqa/annotations/specialized_vocab_train_lavis.json
          storage:
            - aokvqa/annotations/aokvqa_v1p0_val.json
            - aokvqa/annotations/specialized_vocab_train_lavis.json
      images:
        storage: coco/images/
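The dataset YAMLs in this tree are plain OmegaConf files: `url` names where each annotation file is downloaded from, and `storage` names where it lives under the registered cache root. A minimal sketch of inspecting one directly (the path assumes you run from the repo root):

```python
from omegaconf import OmegaConf

cfg = OmegaConf.load("Lavis/lavis/configs/datasets/aokvqa/defaults.yaml")
info = cfg.datasets.aok_vqa.build_info

print(cfg.datasets.aok_vqa.data_type)    # "images"
print(info.annotations.train.url[0])     # download URL for the train split
print(info.annotations.train.storage[0]) # path relative to the cache root
```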
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/avsd/defaults_dial.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  avsd_dialogue: # name of the dataset builder
    dataset_card: dataset_card/avsd_dialogue.md
    data_type: features #extracted features of videos (I3D, VGGish) # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_train.json
          storage: avsd/annotations/train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_val.json
          storage: avsd/annotations/val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_test.json
          storage: avsd/annotations/test.json
      features:
        storage: avsd/features/

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/coco/defaults_cap.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  coco_caption: # name of the dataset builder
    dataset_card: dataset_card/coco_caption.md
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
          md5: aa31ac474cf6250ebb81d18348a07ed8
          storage: coco/annotations/coco_karpathy_train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
          md5: b273847456ef5580e33713b1f7de52a0
          storage: coco/annotations/coco_karpathy_val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
          md5: 3ff34b0ef2db02d01c37399f6a2a6cd1
          storage: coco/annotations/coco_karpathy_test.json
      images:
        storage: coco/images/
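The `md5` fields above record checksums for the downloaded Karpathy-split annotation files. An illustrative helper for verifying one against the YAML (not the repo's own verification code, which may differ):

```python
import hashlib

def md5_matches(path, expected_md5):
    # Hash the file in 1 MiB chunks and compare with the md5 from the YAML.
    digest = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_md5

# e.g. md5_matches(".cache/lavis/coco/annotations/coco_karpathy_train.json",
#                  "aa31ac474cf6250ebb81d18348a07ed8")
```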
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/coco/defaults_ret.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  coco_retrieval:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
          md5: aa31ac474cf6250ebb81d18348a07ed8
          storage: coco/annotations/coco_karpathy_train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
          md5: b273847456ef5580e33713b1f7de52a0
          storage: coco/annotations/coco_karpathy_val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
          md5: 3ff34b0ef2db02d01c37399f6a2a6cd1
          storage: coco/annotations/coco_karpathy_test.json
      images:
        storage: coco/images/

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/coco/eval_vqa.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  coco_vqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        val:
          url:
            - coco/annotations/vqa_val_eval.json
            - coco/annotations/answer_list.json
            - coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json
            - coco/annotations/v2_mscoco_val2014_annotations.json
          storage:
            - coco/annotations/vqa_val_eval.json
            - coco/annotations/answer_list.json
            - coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json
            - coco/annotations/v2_mscoco_val2014_annotations.json
      images:
        storage: coco/images/

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/conceptual_caption/defaults_12m.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  conceptual_caption_12m:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - /export/home/workspace/datasets/cc12m.json
          storage:
            - conceptual_caption/annotations/cc12m.json
      images:
        storage: conceptual_caption/images_12m
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/conceptual_caption/defaults_3m.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  conceptual_caption_3m:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - /export/home/workspace/datasets/cc3m.json
          storage:
            - conceptual_caption/annotations/cc3m.json
      images:
        storage: conceptual_caption/images

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/didemo/defaults_ret.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  didemo_retrieval: # name of the dataset builder
    # data_dir: ${env.data_dir}/datasets
    data_type: videos # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_train.json
          storage: didemo/annotations/retrieval_train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_val.json
          storage: didemo/annotations/retrieval_val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_test.json
          storage: didemo/annotations/retrieval_test.json
      videos:
        storage: didemo/videos
        # storage: /export/share/dongxuli/data/didemo_retrieval/videos

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/flickr30k/defaults.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  flickr30k:
    # data_dir: ${env.data_dir}/datasets
    data_type: images

    build_info:
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_train.json
          storage: flickr30k/annotations/train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_val.json
          storage: flickr30k/annotations/val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_test.json
          storage: flickr30k/annotations/test.json
      images:
        storage: flickr30k/images
        # storage: /export/share/datasets/vision/flickr30k
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/gqa/balanced_testdev.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  gqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
          storage:
            - gqa/annotations/train_balanced_questions.json
        val:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/testdev_balanced_questions.json
          storage:
            - gqa/annotations/testdev_balanced_questions.json
        test:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json
          storage:
            - gqa/annotations/test_balanced_questions.json
      images:
        storage: gqa/images/

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/gqa/balanced_val.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  gqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
          storage:
            - gqa/annotations/train_balanced_questions.json
        val:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/val_balanced_questions.json
          storage:
            - gqa/annotations/val_balanced_questions.json
        test:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json
          storage:
            - gqa/annotations/test_balanced_questions.json
      images:
        storage: gqa/images/
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/gqa/defaults.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  gqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - /export/share/datasets/vision/GQA/questions1.2/train_all_questions/train_all_questions_0.json
            - /export/share/datasets/vision/GQA/questions1.2/val_all_questions.json
          storage:
            - gqa/annotations/train_all_questions_0.json
            - gqa/annotations/val_all_questions.json
        val:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/large_vocab_train_lavis.json
          storage:
            - aokvqa/annotations/aokvqa_v1p0_val.json
            - aokvqa/annotations/large_vocab_train_lavis.json
        test:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/large_vocab_train_lavis.json
          storage:
            - aokvqa/annotations/aokvqa_v1p0_test.json
            - aokvqa/annotations/large_vocab_train_lavis.json
      images:
        storage: gqa/images/

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/imagenet/defaults.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  imagenet:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      splits: ["val"]
      images:
        storage: /export/share/datasets/vision/imagenet

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/laion/defaults_2B_multi.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  laion2B_multi:

    data_type: images

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar
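The `laion2B_multi` storage entry above is a webdataset-style brace pattern naming 1744 tar shards rather than a single file. A minimal sketch (not from the repo) of expanding it by hand:

```python
# {00000..01743} is an inclusive range, so there are 1744 shards in total.
pattern_dir = "/export/laion/laion2B-multi/part-00000"
shards = [f"{pattern_dir}/{i:05d}.tar" for i in range(0, 1744)]

print(len(shards))  # 1744
print(shards[0])    # /export/laion/laion2B-multi/part-00000/00000.tar
print(shards[-1])   # /export/laion/laion2B-multi/part-00000/01743.tar
```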
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/msrvtt/defaults_cap.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  msrvtt_cap: # name of the dataset builder
    # data_dir: ${env.data_dir}/datasets
    data_type: videos # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_train.json
          storage: msrvtt/annotations/cap_train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_val.json
          storage: msrvtt/annotations/cap_val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_test.json
          storage: msrvtt/annotations/cap_test.json
      videos:
        storage: msrvtt/videos

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/msrvtt/defaults_qa.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  msrvtt_qa: # name of the dataset builder
    # data_dir: ${env.data_dir}/datasets
    data_type: videos # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json
          storage: msrvtt/annotations/qa_train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json
          storage: msrvtt/annotations/qa_val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json
          storage: msrvtt/annotations/qa_test.json
        ans2label:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json
          storage: msrvtt/annotations/qa_ans2label.json
      videos:
        storage: msrvtt/videos
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/msrvtt/defaults_ret.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  msrvtt_retrieval: # name of the dataset builder
    # data_dir: ${env.data_dir}/datasets
    data_type: videos # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_train.json
          storage: msrvtt/annotations/retrieval_train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_val.json
          storage: msrvtt/annotations/retrieval_val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_test.json
          storage: msrvtt/annotations/retrieval_test.json
      videos:
        storage: msrvtt/videos

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/msvd/defaults_cap.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  msvd_cap: # name of the dataset builder
    # data_dir: ${env.data_dir}/datasets
    data_type: videos # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_train.json
          storage: msvd/annotations/cap_train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_val.json
          storage: msvd/annotations/cap_val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_test.json
          storage: msvd/annotations/cap_test.json
      videos:
        storage: msvd/videos
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/msvd/defaults_qa.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  msvd_qa: # name of the dataset builder
    # data_dir: ${env.data_dir}/datasets
    data_type: videos # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json
          storage: msvd/annotations/qa_train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json
          storage: msvd/annotations/qa_val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json
          storage: msvd/annotations/qa_test.json
        ans2label:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json
          storage: msvd/annotations/qa_ans2label.json
      videos:
        storage: msvd/videos

    instance_id_key: question_id

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/nlvr/defaults.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  nlvr:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_train.json
          storage: nlvr/annotations/train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json
          storage: nlvr/annotations/dev.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json
          storage: nlvr/annotations/test.json
      images:
        storage: /export/share/datasets/vision/NLVR2/
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/nocaps/defaults.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  nocaps: # name of the dataset builder
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_val.json
          storage: nocaps/annotations/nocaps_val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_test.json
          storage: nocaps/annotations/nocaps_test.json
      images:
        storage: nocaps/images
        # storage: /export/share/datasets/vision/nocaps/

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/okvqa/defaults.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  ok_vqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            # TODO make this order insensitive
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
            # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
            # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
          storage:
            - okvqa/annotations/okvqa_train.json
            # - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json
            # - okvqa/annotations/mscoco_train2014_annotations.json
        test:
          url:
            # TODO make this order insensitive
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
          storage:
            - okvqa/annotations/vqa_val_eval.json
            - okvqa/annotations/answer_list.json
            - okvqa/annotations/OpenEnded_mscoco_val2014_questions.json
            - okvqa/annotations/mscoco_val2014_annotations.json
      images:
        storage: coco/images/
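The "TODO make this order insensitive" comments above exist because `url` and `storage` are paired positionally: the i-th URL is saved to the i-th storage path. An illustrative sketch of that pairing (hypothetical helper, not the repo's actual builder code):

```python
import os
import urllib.request

def fetch_annotations(urls, storage_paths, cache_root):
    # urls[i] is downloaded to cache_root/storage_paths[i]; reordering either
    # list without the other would silently mismatch files.
    for url, rel_path in zip(urls, storage_paths):
        dst = os.path.join(cache_root, rel_path)
        os.makedirs(os.path.dirname(dst), exist_ok=True)
        if not os.path.exists(dst) and url.startswith("http"):
            urllib.request.urlretrieve(url, dst)
```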
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/sbu_caption/defaults.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  sbu_caption:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/sbu/sbu.json
            # - /export/share/dongxuli/data/lavis/sbu/annotation/sbu.json
          storage:
            - sbu_captions/annotations/sbu.json
      images:
        storage: sbu_captions/images
        # storage: /export/share/datasets/vision_language/sbu_resize

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/snli_ve/defaults.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  snli_ve:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: /export/share/dongxuli/data/lavis/snli/annotation/ve_train.json
          storage: snli/annotations/ve_train.json
        val:
          url: /export/share/dongxuli/data/lavis/snli/annotation/ve_dev.json
          storage: snli/annotations/ve_dev.json
        test:
          url: /export/share/dongxuli/data/lavis/snli/annotation/ve_test.json
          storage: snli/annotations/ve_test.json
      images:
        storage: flickr30k/images/flickr30k-images
        # storage: /export/share/datasets/vision/flickr30k/flickr30k-images

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/vatex/defaults_cap.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  msvd_cap: # name of the dataset builder
    # data_dir: ${env.data_dir}/datasets
    data_type: videos # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json
          storage: vatex/annotations/cap_train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json
          storage: vatex/annotations/cap_val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json
          storage: vatex/annotations/cap_test.json
      videos:
        storage: /export/share/dongxuli/data/vatex
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/vg/defaults_caption.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  vg_caption:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_caption.json
          storage: vg/annotations/vg_caption.json
      images:
        storage: vg/images/

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/vg/defaults_vqa.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  vg_vqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_qa.json
          storage: vg/annotations/vg_qa.json
      images:
        storage: vg/images/

--------------------------------------------------------------------------------
/Lavis/lavis/configs/default.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

env:
  # For default users
  # cache_root: "cache"
  # For internal use with persistent storage
  cache_root: ".cache/lavis"
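The model YAMLs that follow back LAVIS's registered (name, model_type) pairs; their `preprocess` sections describe the processors returned alongside the model. A hedged loading sketch that mirrors upstream LAVIS usage (not verified against this fork's modifications):

```python
import torch
from lavis.models import load_model_and_preprocess

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# "blip2_t5" / "pretrain_flant5xl" is the upstream pairing backed by
# configs/models/blip2/blip2_pretrain_flant5xl.yaml in the tree above.
model, vis_processors, txt_processors = load_model_and_preprocess(
    name="blip2_t5",
    model_type="pretrain_flant5xl",
    is_eval=True,
    device=device,
)
```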
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/albef_classification_ve.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: albef_classification
  load_finetuned: True

  finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_snli_ve_lavis.pt"
  pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"

  num_classes: 3

  use_distill: True
  momentum: 0.995
  alpha: 0.4

  # vit encoder
  vit_type: "base"
  vit_grad_ckpt: False
  vit_ckpt_layer: 0
  vit_layer_norm_epsilon: 1e-6

  image_size: 384

  # bert config
  med_config_path: "configs/models/med_config_albef.json"

preprocess:
  vis_processor:
    train:
      name: "blip_image_train"
    eval:
      name: "blip_image_eval"
  text_processor:
    train:
      name: "blip_caption"
    eval:
      name: "blip_caption"

--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/albef_feature_extractor.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: albef_pretrain
  pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"

  # vit encoder
  vit_type: "base"
  image_size: 224
  vit_ckpt_layer: 0
  vit_drop_path_rate: 0
  vit_layer_norm_epsilon: 1e-6
  vit_grad_ckpt: False

  # bert config
  med_config_path: "configs/models/med_config_albef.json"

  embed_dim: 256

preprocess:
  vis_processor:
    eval:
      name: "blip_image_eval"
      image_size: 224
  text_processor:
    eval:
      name: "blip_caption"
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/albef_nlvr.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: albef_nlvr
  load_finetuned: True

  pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/pretrain_model_nlvr.pth"
  finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_nlvr_lavis.pt"

  num_classes: 2

  use_distill: True
  momentum: 0.995
  alpha: 0.4

  # vit encoder
  vit_type: "base"
  vit_grad_ckpt: False
  vit_ckpt_layer: 0
  vit_layer_norm_epsilon: 1e-6

  image_size: 384

  # bert config
  med_config_path: "configs/models/med_config_albef.json"

preprocess:
  vis_processor:
    train:
      name: "blip_image_train"
      image_size: 384
    eval:
      name: "blip_image_eval"
      image_size: 384
  text_processor:
    train:
      name: "blip_caption"
    eval:
      name: "blip_caption"

--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/albef_pretrain_base.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: albef_pretrain

  load_pretrained: True
  pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"

  # vit encoder
  vit_type: "base"
  image_size: 224
  vit_ckpt_layer: 0
  vit_drop_path_rate: 0
  vit_layer_norm_epsilon: 1e-6
  vit_grad_ckpt: False

  # bert config
  med_config_path: "configs/models/med_config_albef.json"
  mlm_mask_prob: 0.15

  embed_dim: 256
  momentum: 0.995
  alpha: 0.4
  temp: 0.07

  max_txt_len: 30

preprocess:
  vis_processor:
    train:
      name: "blip_image_train"
      image_size: 256
  text_processor:
    train:
      name: "blip_caption"
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/albef_retrieval_coco.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: albef_retrieval
  load_finetuned: True

  pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
  finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_coco_retrieval_lavis.pt"

  queue_size: 65536

  # vit encoder
  vit_type: "base"
  image_size: 384
  vit_ckpt_layer: 0
  vit_drop_path_rate: 0
  vit_layer_norm_epsilon: 1e-6
  vit_grad_ckpt: False

  # bert config
  med_config_path: "configs/models/med_config_albef.json"

  embed_dim: 256
  momentum: 0.995
  alpha: 0.4
  temp: 0.07
  use_distill: True

  max_txt_len: 30

preprocess:
  vis_processor:
    train:
      name: "blip_image_train"
      image_size: 384
    eval:
      name: "blip_image_eval"
      image_size: 384
  text_processor:
    train:
      name: "blip_caption"
    eval:
      name: "blip_caption"

--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/albef_retrieval_flickr.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: albef_retrieval
  load_finetuned: True

  pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
  finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_flickr_retrieval_lavis.pt

  queue_size: 65536

  # vit encoder
  vit_type: "base"
  image_size: 384
  vit_ckpt_layer: 0
  vit_drop_path_rate: 0
  vit_layer_norm_epsilon: 1e-6
  vit_grad_ckpt: False

  # bert config
  med_config_path: "configs/models/med_config_albef.json"

  embed_dim: 256
  momentum: 0.995
  alpha: 0.4
  temp: 0.07
  use_distill: True

  max_txt_len: 30

preprocess:
  vis_processor:
    train:
      name: "blip_image_train"
      image_size: 384
    eval:
      name: "blip_image_eval"
      image_size: 384
  text_processor:
    train:
      name: "blip_caption"
    eval:
      name: "blip_caption"
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_vqav2_lavis.pt" 12 | 13 | use_distill: True 14 | momentum: 0.995 15 | alpha: 0.4 16 | 17 | # vit encoder 18 | vit_type: "base" 19 | vit_grad_ckpt: False 20 | vit_ckpt_layer: 0 21 | vit_layer_norm_epsilon: 1e-6 22 | 23 | image_size: 384 24 | 25 | # bert config 26 | med_config_path: "configs/models/med_config_albef.json" 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 384 33 | eval: 34 | name: "blip_image_eval" 35 | image_size: 384 36 | text_processor: 37 | train: 38 | name: "blip_question" 39 | eval: 40 | name: "blip_question" 41 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/alpro_qa_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | num_classes: 1500 9 | 10 | load_finetuned: True 11 | 12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_qa.pth" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 14 | 15 | timesformer: 16 | n_frms: 16 17 | image_size: 224 18 | 19 | patch_size: 16 20 | attn_drop_rate: 0. 21 | drop_rate: 0. 22 | drop_path_rate: 0.1 23 | 24 | use_grad_ckpt: True 25 | ckpt_layer: 12 26 | 27 | # bert config 28 | med_config_path: "configs/models/bert_config_alpro.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "alpro_video_train" 34 | n_frms: 16 35 | image_size: 224 36 | eval: 37 | name: "alpro_video_eval" 38 | n_frms: 16 39 | image_size: 224 40 | text_processor: 41 | train: 42 | name: "blip_caption" 43 | eval: 44 | name: "blip_caption" 45 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/alpro_qa_msvd.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | num_classes: 2423 9 | 10 | load_finetuned: True 11 | 12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msvd_qa.pth" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 14 | 15 | timesformer: 16 | n_frms: 16 17 | image_size: 224 18 | 19 | patch_size: 16 20 | attn_drop_rate: 0. 21 | drop_rate: 0. 
22 | drop_path_rate: 0.1 23 | use_grad_ckpt: True 24 | ckpt_layer: 12 25 | 26 | # bert config 27 | med_config_path: "configs/models/bert_config_alpro.json" 28 | 29 | preprocess: 30 | vis_processor: 31 | train: 32 | name: "alpro_video_train" 33 | n_frms: 16 34 | image_size: 224 35 | eval: 36 | name: "alpro_video_eval" 37 | n_frms: 16 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/alpro_retrieval_didemo.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_didemo_retrieval.pt" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0. 21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | eval: 30 | name: "alpro_video_eval" 31 | n_frms: 8 32 | image_size: 224 33 | text_processor: 34 | eval: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/alpro_retrieval_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_retrieval.pt" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0.
21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "alpro_video_train" 31 | n_frms: 8 32 | image_size: 224 33 | eval: 34 | name: "alpro_video_eval" 35 | n_frms: 8 36 | image_size: 224 37 | text_processor: 38 | train: 39 | name: "blip_caption" 40 | eval: 41 | name: "blip_caption" 42 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/bert_config_alpro.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": true, 18 | "type_vocab_size": 2, 19 | "vocab_size": 30522, 20 | "encoder_width": 768, 21 | "add_cross_attention": false, 22 | "fusion_layer": 6 23 | } -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_flant5xl 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_flant5xl.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_opt2.7b 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt2.7b.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_opt6.7b 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt6.7b.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip2/blip2_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: coco 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_finetune_coco.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: True 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 364 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 364 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip2/blip2_pretrain.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 224 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 224 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl_vitL.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | vit_model: "clip_L" 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xxl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xxl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt2.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt6.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_vitL.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | vit_model: "clip_L" 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | 25 | preprocess: 26 | vis_processor: 27 | train: 28 | name: "blip_image_train" 29 | image_size: 224 30 | eval: 31 | name: "blip_image_eval" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_caption_base_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_caption_base.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | 18 | image_size: 384 19 | 20 | # bert config 21 | med_config_path: "configs/models/med_config.json" 22 | 23 | # generation configs 24 | prompt: "a picture of " 25 | 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | eval: 32 | name: "blip_image_eval" 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | prompt: "a picture of " 37 | eval: 38 | name: "blip_caption" 39 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_caption_large_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth" 12 | 13 | vit_type: "large" 14 | vit_grad_ckpt: True 15 | vit_ckpt_layer: 5 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | # generation configs 23 | prompt: "a picture of " 24 | 25 | 26 | preprocess: 27 | vis_processor: 28 | train: 29 | name: "blip_image_train" 30 | eval: 31 | name: "blip_image_eval" 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | prompt: "a picture of " 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_classification_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_classification 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | 10 | use_distill: True 11 | momentum: 0.995 12 | alpha: 0.4 13 | 14 | # vit encoder 15 | vit_type: "base" 16 | vit_grad_ckpt: False 17 | vit_ckpt_layer: 0 18 | 19 | image_size: 384 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_feature_extractor_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | vit_grad_ckpt: False 13 | vit_ckpt_layer: 0 14 | 15 | image_size: 224 16 | 17 | # bert config 18 | med_config_path: "configs/models/med_config.json" 19 | 20 | embed_dim: 256 21 | 22 | preprocess: 23 | vis_processor: 24 | eval: 25 | name: "blip_image_eval" 26 | image_size: 224 27 | text_processor: 28 | eval: 29 | name: "blip_caption" 30 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_itm_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_itm_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "large" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_nlvr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_nlvr 8 | model_type: nlvr 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_nlvr.pth" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 13 | 14 | num_classes: 2 15 | 16 | # vit encoder 17 | vit_type: "base" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | 22 | image_size: 384 23 | 24 | # bert config 25 | med_config_path: "configs/models/med_config.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 224 18 | alpha: 0.4 19 | 20 | # bert config 21 | med_config_path: "configs/models/bert_config.json" 22 | 23 | embed_dim: 256 24 | 25 | # generation configs 26 | prompt: "a picture of " 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_pretrain_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | # vit encoder 10 | vit_type: "large" 11 | vit_grad_ckpt: True 12 | vit_ckpt_layer: 5 13 | 14 | image_size: 224 15 | 16 | # bert config 17 | med_config_path: "configs/models/med_large_config.json" 18 | 19 | embed_dim: 256 20 | 21 | # generation configs 22 | prompt: "a picture of " 23 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_retrieval_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | vit_grad_ckpt: True 18 | vit_ckpt_layer: 4 19 | 20 | image_size: 384 21 | 22 | # bert config 23 | med_config_path: "configs/models/med_config.json" 24 | 25 | embed_dim: 256 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_retrieval_flickr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_flickr_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | alpha: 0.4 15 | 16 | negative_all_rank: False 17 | 18 | # vit encoder 19 | vit_type: "base" 20 | vit_grad_ckpt: True 21 | vit_ckpt_layer: 4 22 | 23 | image_size: 384 24 | 25 | # bert config 26 | med_config_path: "configs/models/med_config.json" 27 | 28 | embed_dim: 256 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 384 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 384 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_vqa_aokvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_aokvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_vqa_okvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_okvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 
| "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-B-16-plus-240.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 240, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-B-16-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-B-32-plus-256.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-H-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-L-14-280.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 280, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-L-14.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-L-16-320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 320, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-L-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/timm-efficientnetv2_rw_s.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "efficientnetv2_rw_s", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 288 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/timm-resnet50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnet50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/timm-resnetaa50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetaa50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | 
"heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/timm-resnetblur50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetblur50", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/timm-swin_base_patch4_window7_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "swin_base_patch4_window7_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/timm-vit_base_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_base_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/timm-vit_base_patch32_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_base_patch32_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/timm-vit_small_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_small_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip_resnet50.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: RN50 10 | 11 | pretrained: openai 12 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip_vit_base16.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-B-16 10 | 11 | pretrained: openai 12 | 13 | preprocess: 14 | vis_processor: 15 | eval: 16 | name: "clip_image_eval" 17 | image_size: 224 18 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip_vit_base32.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-B-32 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 224 53 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip_vit_large14.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 224 53 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip_vit_large14_336.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 336 53 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/gpt_dialogue_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: gpt_dialogue 8 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 10 | 11 | len_tokenizer: 50264 # 50257 tokens from gpt2 default tokenizer + additional special tokens 12 | 13 | len_video_ft: 4224 # i3d_rgb: 2048 i3d_flow: 2048 vggish: 128 14 | 15 | preprocess: 16 | vis_processor: 17 | train: 18 | name: "gpt_video_ft" 19 | eval: 20 | name: "gpt_video_ft" 21 | text_processor: 22 | train: 23 | name: "gpt_dialogue" 24 | eval: 25 | name: "gpt_dialogue" -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/img2prompt-vqa/img2prompt_vqa_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: img2prompt_vqa 8 | model_type: base 9 | 10 | image_question_matching_model: 11 | arch: blip_image_text_matching 12 | load_finetuned: True 13 | 14 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" 15 | 16 | # vit encoder 17 | vit_type: "large" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | 21 | image_size: 384 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_large_config.json" 25 | 26 | embed_dim: 256 27 | 28 | image_captioning_model: 29 | arch: blip_caption 30 | load_finetuned: True 31 | 32 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" 33 | 34 | vit_type: "large" 35 | vit_grad_ckpt: True 36 | vit_ckpt_layer: 5 37 | 38 | image_size: 384 39 | 40 | # bert config 41 | med_config_path: "configs/models/med_large_config.json" 42 | 43 | # generation configs 44 | prompt: "a picture of " 45 | 46 | question_generation_model: 47 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/projects/img2prompt/T5_large_QG.pth" 48 | 49 | 50 | 51 | preprocess: 52 | vis_processor: 53 | eval: 54 | name: "blip_image_eval" 55 | image_size: 384 56 | text_processor: 57 | eval: 58 | name: "blip_caption" 59 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/med_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/med_config_albef.json:
-------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true, 21 | "fusion_layer": 6 22 | } -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/med_large_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 1024, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/pnp-vqa/pnp_vqa_3b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: 3b 9 | 10 | image_question_matching_model: 11 | arch: blip_image_text_matching 12 | load_finetuned: True 13 | 14 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" 15 | 16 | # vit encoder 17 | vit_type: "large" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | 21 | image_size: 384 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_large_config.json" 25 | 26 | embed_dim: 256 27 | 28 | image_captioning_model: 29 | arch: blip_caption 30 | load_finetuned: True 31 | 32 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" 33 | 34 | vit_type: "large" 35 | vit_grad_ckpt: True 36 | vit_ckpt_layer: 5 37 | 38 | image_size: 384 39 | 40 | # bert config 41 | med_config_path: "configs/models/med_large_config.json" 42 | 43 | # generation configs 44 | prompt: "a picture of " 45 | 46 | question_answering_model: 47 | arch: pnp_unifiedqav2_fid 48 | 49 | pretrained: "allenai/unifiedqa-v2-t5-3b-1363200" 50 | 51 | t5_config_path: "configs/models/pnp-vqa/unifiedqav2_3b_config.json" 52 | 53 | preprocess: 54 | vis_processor: 55 | eval: 56 | name: "blip_image_eval" 57 | image_size: 384 58 | text_processor: 59 | eval: 60 | name: "blip_caption" 61 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/pnp-vqa/pnp_vqa_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
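# --- Editor's note (hedged) ---
# The three pnp_vqa_*.yaml configs share the matching and captioning models and
# differ only in the UnifiedQAv2 reader size. A rough usage sketch; the
# registered name "pnp_vqa" follows `arch` below, while the predict_answers
# keyword arguments (num_captions, num_patches) are assumptions about the
# Plug-and-Play VQA pipeline (match -> caption -> read) and should be checked
# against the model class:
#
#   from lavis.models import load_model_and_preprocess
#
#   model, vis_processors, txt_processors = load_model_and_preprocess(
#       name="pnp_vqa", model_type="base", is_eval=True
#   )
#   # raw_image: a PIL.Image loaded elsewhere
#   image = vis_processors["eval"](raw_image).unsqueeze(0)
#   question = txt_processors["eval"]("What is the cat doing?")
#   answers = model.predict_answers(
#       samples={"image": image, "text_input": [question]},
#       num_captions=50,  # captions generated per image (assumed default)
#       num_patches=20,   # patches kept by the matching model (assumed)
#   )
# --- end editor's note ---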
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: base 9 | 10 | image_question_matching_model: 11 | arch: blip_image_text_matching 12 | load_finetuned: True 13 | 14 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" 15 | 16 | # vit encoder 17 | vit_type: "large" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | 21 | image_size: 384 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_large_config.json" 25 | 26 | embed_dim: 256 27 | 28 | image_captioning_model: 29 | arch: blip_caption 30 | load_finetuned: True 31 | 32 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" 33 | 34 | vit_type: "large" 35 | vit_grad_ckpt: True 36 | vit_ckpt_layer: 5 37 | 38 | image_size: 384 39 | 40 | # bert config 41 | med_config_path: "configs/models/med_large_config.json" 42 | 43 | # generation configs 44 | prompt: "a picture of " 45 | question_answering_model: 46 | arch: pnp_unifiedqav2_fid 47 | 48 | pretrained: "allenai/unifiedqa-v2-t5-base-1363200" 49 | 50 | t5_config_path: "configs/models/pnp-vqa/unifiedqav2_base_config.json" 51 | 52 | preprocess: 53 | vis_processor: 54 | eval: 55 | name: "blip_image_eval" 56 | image_size: 384 57 | text_processor: 58 | eval: 59 | name: "blip_caption" 60 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/pnp-vqa/pnp_vqa_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: large 9 | 10 | image_question_matching_model: 11 | arch: blip_image_text_matching 12 | load_finetuned: True 13 | 14 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" 15 | 16 | # vit encoder 17 | vit_type: "large" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | 21 | image_size: 384 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_large_config.json" 25 | 26 | embed_dim: 256 27 | 28 | image_captioning_model: 29 | arch: blip_caption 30 | load_finetuned: True 31 | 32 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" 33 | 34 | vit_type: "large" 35 | vit_grad_ckpt: True 36 | vit_ckpt_layer: 5 37 | 38 | image_size: 384 39 | 40 | # bert config 41 | med_config_path: "configs/models/med_large_config.json" 42 | 43 | # generation configs 44 | prompt: "a picture of " 45 | 46 | question_answering_model: 47 | arch: pnp_unifiedqav2_fid 48 | 49 | pretrained: "allenai/unifiedqa-v2-t5-large-1363200" 50 | 51 | t5_config_path: "configs/models/pnp-vqa/unifiedqav2_large_config.json" 52 | 53 | preprocess: 54 | vis_processor: 55 | eval: 56 | name: "blip_image_eval" 57 | image_size: 384 58 | text_processor: 59 | eval: 60 | name: "blip_caption" 61 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/pnp-vqa/unifiedqav2_3b_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "T5ForConditionalGeneration" 4 | ], 5 | "d_ff": 16384, 6 | "d_kv": 128, 7 | "d_model": 1024, 8 | "decoder_start_token_id": 0, 9 | "dense_act_fn": "relu", 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 1, 12 | "feed_forward_proj": "relu", 13 | "gradient_checkpointing": false, 14 | "initializer_factor": 1.0, 15 | "is_encoder_decoder": true, 16 | "is_gated_act": false, 17 | "layer_norm_epsilon": 1e-06, 18 | "model_type": "t5", 19 | "n_positions": 512, 20 | "num_decoder_layers": 24, 21 | "num_heads": 32, 22 | "num_layers": 24, 23 | "output_past": true, 24 | "pad_token_id": 0, 25 | "relative_attention_max_distance": 128, 26 | "relative_attention_num_buckets": 32, 27 | "task_specific_params": { 28 | "summarization": { 29 | "early_stopping": true, 30 | "length_penalty": 2.0, 31 | "max_length": 200, 32 | "min_length": 30, 33 | "no_repeat_ngram_size": 3, 34 | "num_beams": 4, 35 | "prefix": "summarize: " 36 | }, 37 | "translation_en_to_de": { 38 | "early_stopping": true, 39 | "max_length": 300, 40 | "num_beams": 4, 41 | "prefix": "translate English to German: " 42 | }, 43 | "translation_en_to_fr": { 44 | "early_stopping": true, 45 | "max_length": 300, 46 | "num_beams": 4, 47 | "prefix": "translate English to French: " 48 | }, 49 | "translation_en_to_ro": { 50 | "early_stopping": true, 51 | "max_length": 300, 52 | "num_beams": 4, 53 | "prefix": "translate English to Romanian: " 54 | } 55 | }, 56 | "torch_dtype": "float32", 57 | "transformers_version": "4.21.3", 58 | "use_cache": true, 59 | "vocab_size": 32128 60 | } -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/pnp-vqa/unifiedqav2_base_config.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"architectures": [ 3 | "T5ForConditionalGeneration" 4 | ], 5 | "d_ff": 3072, 6 | "d_kv": 64, 7 | "d_model": 768, 8 | "decoder_start_token_id": 0, 9 | "dense_act_fn": "relu", 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 1, 12 | "feed_forward_proj": "relu", 13 | "gradient_checkpointing": false, 14 | "initializer_factor": 1.0, 15 | "is_encoder_decoder": true, 16 | "is_gated_act": false, 17 | "layer_norm_epsilon": 1e-06, 18 | "model_type": "t5", 19 | "n_positions": 512, 20 | "num_decoder_layers": 12, 21 | "num_heads": 12, 22 | "num_layers": 12, 23 | "output_past": true, 24 | "pad_token_id": 0, 25 | "relative_attention_max_distance": 128, 26 | "relative_attention_num_buckets": 32, 27 | "task_specific_params": { 28 | "summarization": { 29 | "early_stopping": true, 30 | "length_penalty": 2.0, 31 | "max_length": 200, 32 | "min_length": 30, 33 | "no_repeat_ngram_size": 3, 34 | "num_beams": 4, 35 | "prefix": "summarize: " 36 | }, 37 | "translation_en_to_de": { 38 | "early_stopping": true, 39 | "max_length": 300, 40 | "num_beams": 4, 41 | "prefix": "translate English to German: " 42 | }, 43 | "translation_en_to_fr": { 44 | "early_stopping": true, 45 | "max_length": 300, 46 | "num_beams": 4, 47 | "prefix": "translate English to French: " 48 | }, 49 | "translation_en_to_ro": { 50 | "early_stopping": true, 51 | "max_length": 300, 52 | "num_beams": 4, 53 | "prefix": "translate English to Romanian: " 54 | } 55 | }, 56 | "transformers_version": "4.21.3", 57 | "use_cache": true, 58 | "vocab_size": 32128 59 | } -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/pnp-vqa/unifiedqav2_large_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "T5ForConditionalGeneration" 4 | ], 5 | "d_ff": 4096, 6 | "d_kv": 64, 7 | "d_model": 1024, 8 | "decoder_start_token_id": 0, 9 | "dense_act_fn": "relu", 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 1, 12 | "feed_forward_proj": "relu", 13 | "gradient_checkpointing": false, 14 | "initializer_factor": 1.0, 15 | "is_encoder_decoder": true, 16 | "is_gated_act": false, 17 | "layer_norm_epsilon": 1e-06, 18 | "model_type": "t5", 19 | "n_positions": 512, 20 | "num_decoder_layers": 24, 21 | "num_heads": 16, 22 | "num_layers": 24, 23 | "output_past": true, 24 | "pad_token_id": 0, 25 | "relative_attention_max_distance": 128, 26 | "relative_attention_num_buckets": 32, 27 | "task_specific_params": { 28 | "summarization": { 29 | "early_stopping": true, 30 | "length_penalty": 2.0, 31 | "max_length": 200, 32 | "min_length": 30, 33 | "no_repeat_ngram_size": 3, 34 | "num_beams": 4, 35 | "prefix": "summarize: " 36 | }, 37 | "translation_en_to_de": { 38 | "early_stopping": true, 39 | "max_length": 300, 40 | "num_beams": 4, 41 | "prefix": "translate English to German: " 42 | }, 43 | "translation_en_to_fr": { 44 | "early_stopping": true, 45 | "max_length": 300, 46 | "num_beams": 4, 47 | "prefix": "translate English to French: " 48 | }, 49 | "translation_en_to_ro": { 50 | "early_stopping": true, 51 | "max_length": 300, 52 | "num_beams": 4, 53 | "prefix": "translate English to Romanian: " 54 | } 55 | }, 56 | "transformers_version": "4.21.3", 57 | "use_cache": true, 58 | "vocab_size": 32128 59 | } -------------------------------------------------------------------------------- /Lavis/lavis/datasets/builders/caption_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 
3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder 9 | from lavis.datasets.datasets.coco_caption_datasets import ( 10 | COCOCapDataset, 11 | COCOCapEvalDataset, 12 | NoCapsEvalDataset, 13 | ) 14 | 15 | from lavis.common.registry import registry 16 | from lavis.datasets.datasets.video_caption_datasets import ( 17 | VideoCaptionDataset, 18 | VideoCaptionEvalDataset, 19 | ) 20 | 21 | 22 | @registry.register_builder("coco_caption") 23 | class COCOCapBuilder(BaseDatasetBuilder): 24 | train_dataset_cls = COCOCapDataset 25 | eval_dataset_cls = COCOCapEvalDataset 26 | 27 | DATASET_CONFIG_DICT = { 28 | "default": "configs/datasets/coco/defaults_cap.yaml", 29 | } 30 | 31 | 32 | @registry.register_builder("nocaps") 33 | class NoCapsBuilder(BaseDatasetBuilder): 34 | eval_dataset_cls = NoCapsEvalDataset 35 | 36 | DATASET_CONFIG_DICT = { 37 | "default": "configs/datasets/nocaps/defaults.yaml", 38 | } 39 | 40 | 41 | @registry.register_builder("msrvtt_caption") 42 | class MSRVTTCapBuilder(BaseDatasetBuilder): 43 | train_dataset_cls = VideoCaptionDataset 44 | eval_dataset_cls = VideoCaptionEvalDataset 45 | 46 | DATASET_CONFIG_DICT = { 47 | "default": "configs/datasets/msrvtt/defaults_cap.yaml", 48 | } 49 | 50 | 51 | @registry.register_builder("msvd_caption") 52 | class MSVDCapBuilder(BaseDatasetBuilder): 53 | train_dataset_cls = VideoCaptionDataset 54 | eval_dataset_cls = VideoCaptionEvalDataset 55 | 56 | DATASET_CONFIG_DICT = { 57 | "default": "configs/datasets/msvd/defaults_cap.yaml", 58 | } 59 | 60 | 61 | @registry.register_builder("vatex_caption") 62 | class VATEXCapBuilder(BaseDatasetBuilder): 63 | train_dataset_cls = VideoCaptionDataset 64 | eval_dataset_cls = VideoCaptionEvalDataset 65 | 66 | DATASET_CONFIG_DICT = { 67 | "default": "configs/datasets/vatex/defaults_cap.yaml", 68 | } 69 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/builders/classification_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved.
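[Editor's note, hedged: builders like the ones in caption_builder.py above are
normally reached through the registry rather than imported directly. A minimal
sketch, assuming the convenience wrapper `load_dataset` exported by
lavis.datasets.builders (verify the import before relying on it):

    from lavis.datasets.builders import load_dataset

    # the name must match a @registry.register_builder(...) key
    coco_cap = load_dataset("coco_caption")
    print(coco_cap.keys())      # typically dict_keys(['train', 'val', 'test'])
    print(coco_cap["train"][0]) # a single sample dict
End of note.]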
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder 10 | from lavis.datasets.datasets.nlvr_datasets import NLVRDataset, NLVREvalDataset 11 | from lavis.datasets.datasets.snli_ve_datasets import SNLIVisualEntialmentDataset 12 | 13 | 14 | @registry.register_builder("nlvr") 15 | class NLVRBuilder(BaseDatasetBuilder): 16 | train_dataset_cls = NLVRDataset 17 | eval_dataset_cls = NLVREvalDataset 18 | 19 | DATASET_CONFIG_DICT = {"default": "configs/datasets/nlvr/defaults.yaml"} 20 | 21 | 22 | @registry.register_builder("snli_ve") 23 | class SNLIVisualEntailmentBuilder(BaseDatasetBuilder): 24 | train_dataset_cls = SNLIVisualEntialmentDataset 25 | eval_dataset_cls = SNLIVisualEntialmentDataset 26 | 27 | DATASET_CONFIG_DICT = {"default": "configs/datasets/snli_ve/defaults.yaml"} 28 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/builders/dialogue_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder 10 | from lavis.datasets.datasets.avsd_dialogue_datasets import ( 11 | AVSDDialDataset, 12 | AVSDDialEvalDataset, 13 | ) 14 | 15 | 16 | @registry.register_builder("avsd_dialogue") 17 | class AVSDDialBuilder(BaseDatasetBuilder): 18 | train_dataset_cls = AVSDDialDataset 19 | eval_dataset_cls = AVSDDialEvalDataset 20 | 21 | DATASET_CONFIG_DICT = {"default": "configs/datasets/avsd/defaults_dial.yaml"} 22 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/builders/retrieval_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder 9 | from lavis.datasets.datasets.retrieval_datasets import ( 10 | RetrievalDataset, 11 | RetrievalEvalDataset, 12 | VideoRetrievalDataset, 13 | VideoRetrievalEvalDataset, 14 | ) 15 | 16 | from lavis.common.registry import registry 17 | 18 | 19 | @registry.register_builder("msrvtt_retrieval") 20 | class MSRVTTRetrievalBuilder(BaseDatasetBuilder): 21 | train_dataset_cls = VideoRetrievalDataset 22 | eval_dataset_cls = VideoRetrievalEvalDataset 23 | 24 | DATASET_CONFIG_DICT = {"default": "configs/datasets/msrvtt/defaults_ret.yaml"} 25 | 26 | 27 | @registry.register_builder("didemo_retrieval") 28 | class DiDeMoRetrievalBuilder(BaseDatasetBuilder): 29 | train_dataset_cls = VideoRetrievalDataset 30 | eval_dataset_cls = VideoRetrievalEvalDataset 31 | 32 | DATASET_CONFIG_DICT = {"default": "configs/datasets/didemo/defaults_ret.yaml"} 33 | 34 | 35 | @registry.register_builder("coco_retrieval") 36 | class COCORetrievalBuilder(BaseDatasetBuilder): 37 | train_dataset_cls = RetrievalDataset 38 | eval_dataset_cls = RetrievalEvalDataset 39 | 40 | DATASET_CONFIG_DICT = {"default": "configs/datasets/coco/defaults_ret.yaml"} 41 | 42 | 43 | @registry.register_builder("flickr30k") 44 | class Flickr30kBuilder(BaseDatasetBuilder): 45 | train_dataset_cls = RetrievalDataset 46 | eval_dataset_cls = RetrievalEvalDataset 47 | 48 | DATASET_CONFIG_DICT = {"default": "configs/datasets/flickr30k/defaults.yaml"} 49 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/builders/video_qa_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.common.utils import get_cache_path 10 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder 11 | from lavis.datasets.datasets.video_vqa_datasets import VideoQADataset 12 | 13 | 14 | class VideoQABuilder(BaseDatasetBuilder): 15 | train_dataset_cls = VideoQADataset 16 | eval_dataset_cls = VideoQADataset 17 | 18 | def build(self): 19 | datasets = super().build() 20 | 21 | ans2label = self.config.build_info.annotations.get("ans2label") 22 | if ans2label is None: 23 | raise ValueError("ans2label is not specified in build_info.") 24 | 25 | ans2label = get_cache_path(ans2label.storage) 26 | 27 | for split in datasets: 28 | datasets[split]._build_class_labels(ans2label) 29 | 30 | return datasets 31 | 32 | 33 | @registry.register_builder("msrvtt_qa") 34 | class MSRVTTQABuilder(VideoQABuilder): 35 | DATASET_CONFIG_DICT = { 36 | "default": "configs/datasets/msrvtt/defaults_qa.yaml", 37 | } 38 | 39 | 40 | @registry.register_builder("msvd_qa") 41 | class MSVDQABuilder(VideoQABuilder): 42 | DATASET_CONFIG_DICT = { 43 | "default": "configs/datasets/msvd/defaults_qa.yaml", 44 | } 45 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/builders/vqa_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 
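[Editor's note, hedged: VideoQABuilder above converts an answer vocabulary into
classification labels after the base build step. A tiny self-contained sketch
of the file format it expects; the path and answers are illustrative only:

    import json, tempfile

    # a stand-in ans2label JSON, mapping answer strings to label ids
    ans2label = {"yes": 0, "no": 1, "blue": 2}
    with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
        json.dump(ans2label, f)

    # build() resolves build_info.annotations.ans2label.storage to a cached
    # path like f.name and calls _build_class_labels(path) on every split;
    # unseen answers then map to len(class_labels) via _get_answer_label
    # (see video_vqa_datasets.py later in this dump).
End of note.]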
3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder 9 | 10 | from lavis.common.registry import registry 11 | from lavis.datasets.datasets.aok_vqa_datasets import AOKVQADataset, AOKVQAEvalDataset 12 | from lavis.datasets.datasets.coco_vqa_datasets import COCOVQADataset, COCOVQAEvalDataset 13 | from lavis.datasets.datasets.vg_vqa_datasets import VGVQADataset 14 | from lavis.datasets.datasets.gqa_datasets import GQADataset, GQAEvalDataset 15 | 16 | 17 | @registry.register_builder("coco_vqa") 18 | class COCOVQABuilder(BaseDatasetBuilder): 19 | train_dataset_cls = COCOVQADataset 20 | eval_dataset_cls = COCOVQAEvalDataset 21 | 22 | DATASET_CONFIG_DICT = { 23 | "default": "configs/datasets/coco/defaults_vqa.yaml", 24 | "eval": "configs/datasets/coco/eval_vqa.yaml", 25 | } 26 | 27 | 28 | @registry.register_builder("vg_vqa") 29 | class VGVQABuilder(BaseDatasetBuilder): 30 | train_dataset_cls = VGVQADataset 31 | DATASET_CONFIG_DICT = {"default": "configs/datasets/vg/defaults_vqa.yaml"} 32 | 33 | 34 | @registry.register_builder("ok_vqa") 35 | class OKVQABuilder(COCOVQABuilder): 36 | DATASET_CONFIG_DICT = { 37 | "default": "configs/datasets/okvqa/defaults.yaml", 38 | } 39 | 40 | 41 | @registry.register_builder("aok_vqa") 42 | class AOKVQABuilder(BaseDatasetBuilder): 43 | train_dataset_cls = AOKVQADataset 44 | eval_dataset_cls = AOKVQAEvalDataset 45 | 46 | DATASET_CONFIG_DICT = { 47 | "default": "configs/datasets/aokvqa/defaults.yaml", 48 | "eval": "configs/datasets/aokvqa/eval_aokvqa.yaml", 49 | } 50 | 51 | 52 | @registry.register_builder("gqa") 53 | class GQABuilder(BaseDatasetBuilder): 54 | train_dataset_cls = GQADataset 55 | eval_dataset_cls = GQAEvalDataset 56 | 57 | DATASET_CONFIG_DICT = { 58 | "default": "configs/datasets/gqa/defaults.yaml", 59 | "balanced_val": "configs/datasets/gqa/balanced_val.yaml", 60 | "balanced_testdev": "configs/datasets/gqa/balanced_testdev.yaml", 61 | } -------------------------------------------------------------------------------- /Lavis/lavis/datasets/datasets/image_text_pair_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | from collections import OrderedDict 10 | 11 | from lavis.datasets.datasets.base_dataset import BaseDataset 12 | from PIL import Image 13 | 14 | 15 | class __DisplMixin: 16 | def displ_item(self, index): 17 | sample, ann = self.__getitem__(index), self.annotation[index] 18 | 19 | return OrderedDict( 20 | { 21 | "file": os.path.basename(ann["image"]), 22 | "caption": ann["caption"], 23 | "image": sample["image"], 24 | } 25 | ) 26 | 27 | 28 | class ImageTextPairDataset(BaseDataset, __DisplMixin): 29 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 30 | """ 31 | vis_root (string): Root directory of images (e.g. 
coco/images/) 32 | ann_paths (list of str): paths to the annotation files 33 | """ 34 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 35 | 36 | def __getitem__(self, index): 37 | 38 | # TODO this assumes image input, not general enough 39 | ann = self.annotation[index] 40 | 41 | image_path = os.path.join(self.vis_root, ann["image"]) 42 | image = Image.open(image_path).convert("RGB") 43 | 44 | image = self.vis_processor(image) 45 | caption = self.text_processor(ann["caption"]) 46 | 47 | return {"image": image, "text_input": caption} 48 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/datasets/imagefolder_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | from collections import OrderedDict 10 | 11 | from lavis.datasets.datasets.base_dataset import BaseDataset 12 | from PIL import Image 13 | from torchvision import datasets 14 | 15 | 16 | class ImageFolderDataset(BaseDataset): 17 | def __init__(self, vis_processor, vis_root, classnames=None, **kwargs): 18 | super().__init__(vis_processor=vis_processor, vis_root=vis_root) 19 | 20 | self.inner_dataset = datasets.ImageFolder(vis_root) 21 | 22 | self.annotation = [ 23 | {"image": elem[0], "label": elem[1], "image_id": elem[0]} 24 | for elem in self.inner_dataset.imgs 25 | ] 26 | 27 | self.classnames = classnames if classnames is not None else [] 28 | 29 | self._add_instance_ids() 30 | 31 | def __len__(self): 32 | return len(self.inner_dataset) 33 | 34 | def __getitem__(self, index): 35 | ann = self.annotation[index] 36 | 37 | img_fn = ann["image"] 38 | image_path = os.path.join(self.vis_root, img_fn) 39 | image = Image.open(image_path).convert("RGB") 40 | 41 | image = self.vis_processor(image) 42 | 43 | return { 44 | "image": image, 45 | "label": ann["label"], 46 | "image_id": ann["image_id"], 47 | "instance_id": ann["instance_id"], 48 | } 49 | 50 | def displ_item(self, index): 51 | sample, ann = self.__getitem__(index), self.annotation[index] 52 | 53 | return OrderedDict( 54 | { 55 | "file": ann["image"], 56 | "label": self.classnames[ann["label"]], 57 | "image": sample["image"], 58 | } 59 | ) 60 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/datasets/laion_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved.
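[Editor's note, hedged: a minimal sketch of driving ImageFolderDataset above
directly, outside the builder machinery. The torchvision transform stands in
for a LAVIS vis_processor (both are callables on a PIL image), and the paths
and class names are illustrative only:

    from torchvision import transforms
    from lavis.datasets.datasets.imagefolder_dataset import ImageFolderDataset

    vis_processor = transforms.Compose(
        [transforms.Resize((224, 224)), transforms.ToTensor()]
    )
    ds = ImageFolderDataset(
        vis_processor=vis_processor,
        vis_root="/path/to/images",          # hypothetical ImageFolder root
        classnames=["class_a", "class_b"],   # aligned with folder order
    )
    sample = ds[0]
    # {"image": FloatTensor, "label": int, "image_id": str, "instance_id": str}
End of note.]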
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import webdataset as wds 9 | from lavis.datasets.datasets.base_dataset import BaseDataset 10 | 11 | 12 | class LaionDataset(BaseDataset): 13 | def __init__(self, vis_processor, text_processor, location): 14 | super().__init__(vis_processor=vis_processor, text_processor=text_processor) 15 | 16 | self.inner_dataset = wds.DataPipeline( 17 | wds.ResampledShards(location), 18 | wds.tarfile_to_samples(handler=wds.warn_and_continue), 19 | wds.shuffle(1000, handler=wds.warn_and_continue), 20 | wds.decode("pilrgb", handler=wds.warn_and_continue), 21 | wds.to_tuple("jpg", "json", handler=wds.warn_and_continue), 22 | wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue), 23 | wds.map(self.to_dict, handler=wds.warn_and_continue), 24 | ) 25 | 26 | def to_dict(self, sample): 27 | return { 28 | "image": sample[0], 29 | "text_input": self.text_processor(sample[1]["caption"]), 30 | } 31 | 32 | 33 | if __name__ == "__main__": 34 | from torchvision import transforms 35 | 36 | def to_image_text_pair(sample): 37 | return sample[0], sample[1]["caption"] 38 | 39 | normalize = transforms.Normalize( 40 | (0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711) 41 | ) 42 | 43 | transform_train = transforms.Compose( 44 | [ 45 | transforms.RandomResizedCrop(256, scale=(0.2, 1.0)), 46 | transforms.RandomHorizontalFlip(), 47 | transforms.ToTensor(), 48 | normalize, 49 | ] 50 | ) 51 | 52 | dataset = LaionDataset( 53 | vis_processor=transform_train, 54 | text_processor=lambda x: x, 55 | location="/export/laion/laion2B-multi/part-00000/{00000..01743}.tar", 56 | ) 57 | 58 | import torch 59 | 60 | loader = torch.utils.data.DataLoader(dataset.inner_dataset, batch_size=2) 61 | 62 | print(next(iter(loader))["text_input"]) 63 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/datasets/multimodal_classification_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from abc import abstractmethod 9 | from lavis.datasets.datasets.base_dataset import BaseDataset 10 | 11 | 12 | class MultimodalClassificationDataset(BaseDataset): 13 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 14 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 15 | 16 | self.class_labels = None 17 | 18 | @abstractmethod 19 | def _build_class_labels(self): 20 | pass 21 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/datasets/snli_ve_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | from collections import OrderedDict 10 | 11 | from lavis.datasets.datasets.multimodal_classification_datasets import ( 12 | MultimodalClassificationDataset, 13 | ) 14 | from PIL import Image 15 | 16 | 17 | class __DisplMixin: 18 | def displ_item(self, index): 19 | sample, ann = self.__getitem__(index), self.annotation[index] 20 | 21 | return OrderedDict( 22 | { 23 | "file": os.path.basename(ann["image"]), 24 | "sentence": ann["sentence"], 25 | "label": ann["label"], 26 | "image": sample["image"], 27 | } 28 | ) 29 | 30 | 31 | class SNLIVisualEntialmentDataset(MultimodalClassificationDataset, __DisplMixin): 32 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 33 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 34 | 35 | self.class_labels = self._build_class_labels() 36 | 37 | def _build_class_labels(self): 38 | return {"contradiction": 0, "neutral": 1, "entailment": 2} 39 | 40 | def __getitem__(self, index): 41 | ann = self.annotation[index] 42 | 43 | image_id = ann["image"] 44 | image_path = os.path.join(self.vis_root, "%s.jpg" % image_id) 45 | image = Image.open(image_path).convert("RGB") 46 | 47 | image = self.vis_processor(image) 48 | sentence = self.text_processor(ann["sentence"]) 49 | 50 | return { 51 | "image": image, 52 | "text_input": sentence, 53 | "label": self.class_labels[ann["label"]], 54 | "image_id": image_id, 55 | "instance_id": ann["instance_id"], 56 | } 57 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/datasets/vg_vqa_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | 10 | from PIL import Image 11 | 12 | from lavis.datasets.datasets.vqa_datasets import VQADataset 13 | 14 | 15 | class VGVQADataset(VQADataset): 16 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 17 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 18 | 19 | def __getitem__(self, index): 20 | ann = self.annotation[index] 21 | 22 | image_path = os.path.join(self.vis_root, ann["image"]) 23 | image = Image.open(image_path).convert("RGB") 24 | 25 | image = self.vis_processor(image) 26 | question = self.text_processor(ann["question"]) 27 | 28 | answers = [ann["answer"]] 29 | # TODO this should be configured better 30 | weights = [0.2] 31 | 32 | return { 33 | "image": image, 34 | "text_input": question, 35 | "answers": answers, 36 | "weights": weights, 37 | } 38 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/datasets/video_caption_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
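[Editor's note, hedged: SNLIVisualEntialmentDataset above fixes the SNLI-VE
label space at construction time. A tiny sketch of that contract, mirroring
its _build_class_labels; the annotation dict is a toy example:

    class_labels = {"contradiction": 0, "neutral": 1, "entailment": 2}
    ann = {"sentence": "A dog is running.", "label": "entailment"}
    target = class_labels[ann["label"]]  # -> 2, returned as sample["label"]

The hard-coded weights = [0.2] in VGVQADataset above is an answer-confidence
constant (its TODO says it should be configurable), not part of the VQA
annotation format.]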
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | from lavis.datasets.datasets.base_dataset import BaseDataset 10 | 11 | from lavis.datasets.datasets.caption_datasets import CaptionDataset 12 | 13 | 14 | class VideoCaptionDataset(CaptionDataset): 15 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 16 | """ 17 | vis_root (string): Root directory of videos (e.g. msrvtt/videos/) 18 | ann_paths (list of str): paths to the annotation files 19 | 20 | """ 21 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 22 | 23 | def __getitem__(self, index): 24 | 25 | ann = self.annotation[index] 26 | 27 | vname = ann["video"] 28 | video_path = os.path.join(self.vis_root, vname) 29 | 30 | video = self.vis_processor(video_path) 31 | caption = self.text_processor(ann["caption"]) 32 | 33 | # "image_id" is kept to stay compatible with the COCO evaluation format 34 | return { 35 | "video": video, 36 | "text_input": caption, 37 | "image_id": self.img_ids[ann["image_id"]], 38 | } 39 | 40 | 41 | class VideoCaptionEvalDataset(BaseDataset): 42 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 43 | """ 44 | vis_root (string): Root directory of videos (e.g. msrvtt/videos/) 45 | ann_paths (list of str): paths to the annotation files 46 | 47 | """ 48 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 49 | 50 | def __getitem__(self, index): 51 | 52 | ann = self.annotation[index] 53 | 54 | vname = ann["video"] 55 | video_path = os.path.join(self.vis_root, vname) 56 | 57 | video = self.vis_processor(video_path) 58 | 59 | return { 60 | "video": video, 61 | "image_id": ann["image_id"], 62 | "instance_id": ann["instance_id"], 63 | } 64 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/datasets/video_vqa_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import json 9 | import os 10 | from collections import OrderedDict 11 | 12 | from lavis.datasets.datasets.multimodal_classification_datasets import ( 13 | MultimodalClassificationDataset, 14 | ) 15 | 16 | 17 | class __DisplMixin: 18 | def displ_item(self, index): 19 | ann = self.annotation[index] 20 | 21 | vname = ann["video"] 22 | vpath = os.path.join(self.vis_root, vname) 23 | 24 | return OrderedDict( 25 | {"file": vpath, "question": ann["question"], "answer": ann["answer"]} 26 | ) 27 | 28 | 29 | class VideoQADataset(MultimodalClassificationDataset, __DisplMixin): 30 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 31 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 32 | 33 | def _build_class_labels(self, ans_path): 34 | ans2label = json.load(open(ans_path)) 35 | 36 | self.class_labels = ans2label 37 | 38 | def _get_answer_label(self, answer): 39 | if answer in self.class_labels: 40 | return self.class_labels[answer] 41 | else: 42 | return len(self.class_labels) 43 | 44 | def __getitem__(self, index): 45 | assert ( 46 | self.class_labels 47 | ), f"class_labels of {__class__.__name__} is not built yet."
48 | 49 | ann = self.annotation[index] 50 | 51 | vname = ann["video"] 52 | vpath = os.path.join(self.vis_root, vname) 53 | 54 | frms = self.vis_processor(vpath) 55 | question = self.text_processor(ann["question"]) 56 | 57 | return { 58 | "video": frms, 59 | "text_input": question, 60 | "answers": self._get_answer_label(ann["answer"]), 61 | "question_id": ann["question_id"], 62 | "instance_id": ann["instance_id"], 63 | } 64 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/datasets/vqa_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import torch 9 | 10 | from lavis.datasets.datasets.base_dataset import BaseDataset 11 | 12 | 13 | class VQADataset(BaseDataset): 14 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 15 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 16 | 17 | def collater(self, samples): 18 | image_list, question_list, answer_list, weight_list = [], [], [], [] 19 | 20 | num_answers = [] 21 | 22 | for sample in samples: 23 | image_list.append(sample["image"]) 24 | question_list.append(sample["text_input"].capitalize()) 25 | 26 | weight_list.extend(sample["weights"]) 27 | 28 | answers = sample["answers"] 29 | 30 | answer_list.extend(answers) 31 | num_answers.append(len(answers)) 32 | 33 | return { 34 | "image": torch.stack(image_list, dim=0), 35 | "text_input": question_list, 36 | "answer": answer_list, 37 | "weight": torch.Tensor(weight_list), 38 | "n_answers": torch.LongTensor(num_answers), 39 | } 40 | 41 | 42 | class VQAEvalDataset(BaseDataset): 43 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 44 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 45 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/download_scripts/DownloadConceptualCaptions/LICENSE: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Dongxu Li, Junnan Li, Hung Le, Guangsen Wang, Silvio Savarese, Steven Hoi. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | MIT License 6 | 7 | Copyright (c) 2019 Igor Brigadir 8 | 9 | Permission is hereby granted, free of charge, to any person obtaining a copy 10 | of this software and associated documentation files (the "Software"), to deal 11 | in the Software without restriction, including without limitation the rights 12 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | copies of the Software, and to permit persons to whom the Software is 14 | furnished to do so, subject to the following conditions: 15 | 16 | The above copyright notice and this permission notice shall be included in all 17 | copies or substantial portions of the Software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 22 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | SOFTWARE. 26 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/download_scripts/DownloadConceptualCaptions/README.md: -------------------------------------------------------------------------------- 1 | 7 | 8 | # Download Conceptual Captions Data 9 | 10 | Place data from: https://ai.google.com/research/ConceptualCaptions/download in this folder 11 | 12 | `Train_GCC-training.tsv / cc3m.tsv` Training Split (3,318,333) 13 | 14 | Run `download_data_cc3m.py` or `download_data_cc12m.py`. 15 | 16 | Images will be stored in the default LAVIS cache folders. You can stop and resume; the settings for splitting downloads into chunks/threads are not optimal, but they maxed out my connection, so I kept them as is. 17 | 18 | Note: a previous version of this script used a different file-naming scheme. This has changed, so if you resume a download started with the old scheme you will get duplicates. 19 | 20 | Some downloads will fail and return web pages instead; these will need to be cleaned up later. See `downloaded_validation_report.tsv` after the download finishes for HTTP errors. Around 8% of images are gone, based on validation-set results. Setting the user agent might fix some errors as well, though it is unclear whether any sites reject requests based on it. 21 | 22 | It should take about a day or two to download the training data; keep an eye on disk space. 23 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/download_scripts/download_coco.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | from pathlib import Path 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from lavis.common.utils import ( 14 | cleanup_dir, 15 | download_and_extract_archive, 16 | get_abs_path, 17 | get_cache_path, 18 | ) 19 | 20 | 21 | DATA_URL = { 22 | "train": "http://images.cocodataset.org/zips/train2014.zip", # md5: 0da8c0bd3d6becc4dcb32757491aca88 23 | "val": "http://images.cocodataset.org/zips/val2014.zip", # md5: a3d79f5ed8d289b7a7554ce06a5782b3 24 | "test": "http://images.cocodataset.org/zips/test2014.zip", # md5: 04127eef689ceac55e3a572c2c92f264 25 | "test2015": "http://images.cocodataset.org/zips/test2015.zip", # md5: 04127eef689ceac55e3a572c2c92f264 26 | } 27 | 28 | 29 | def download_datasets(root, url): 30 | download_and_extract_archive(url=url, download_root=root, extract_root=storage_dir) 31 | 32 | 33 | if __name__ == "__main__": 34 | 35 | config_path = get_abs_path("configs/datasets/coco/defaults_cap.yaml") 36 | 37 | storage_dir = OmegaConf.load( 38 | config_path 39 | ).datasets.coco_caption.build_info.images.storage 40 | 41 | download_dir = Path(get_cache_path(storage_dir)).parent / "download" 42 | storage_dir = Path(get_cache_path(storage_dir)) 43 | 44 | if storage_dir.exists(): 45 | print(f"Dataset already exists at {storage_dir}. 
Aborting.") 46 | exit(0) 47 | 48 | try: 49 | for k, v in DATA_URL.items(): 50 | print("Downloading {} to {}".format(v, download_dir)) 51 | download_datasets(download_dir, v) 52 | except Exception as e: 53 | # remove download dir if failed 54 | cleanup_dir(download_dir) 55 | print("Failed to download or extract datasets. Aborting.") 56 | 57 | cleanup_dir(download_dir) 58 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/download_scripts/download_didemo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | from pathlib import Path 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from lavis.common.utils import ( 14 | cleanup_dir, 15 | download_and_extract_archive, 16 | get_abs_path, 17 | get_cache_path, 18 | ) 19 | 20 | DATA_URL = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/didemo_videos.tar.gz" 21 | 22 | 23 | def download_datasets(root, url): 24 | """ 25 | Download the DiDeMo video archive and expand it 26 | into the folder provided as a parameter 27 | """ 28 | download_and_extract_archive(url=url, download_root=root) 29 | 30 | 31 | def move_files(download_path, storage_path): 32 | """ 33 | Move files from download_path to storage_path 34 | """ 35 | print("Moving to {}".format(storage_path)) 36 | 37 | os.makedirs(storage_path, exist_ok=True) 38 | 39 | for file_name in os.listdir(download_path): 40 | os.rename( 41 | os.path.join(download_path, file_name), 42 | os.path.join(storage_path, file_name), 43 | ) 44 | 45 | 46 | if __name__ == "__main__": 47 | 48 | config_path = get_abs_path("configs/datasets/didemo/defaults_ret.yaml") 49 | 50 | storage_dir = OmegaConf.load( 51 | config_path 52 | ).datasets.didemo_retrieval.build_info.videos.storage 53 | 54 | download_dir = Path(get_cache_path(storage_dir)).parent / "download" 55 | storage_dir = Path(get_cache_path(storage_dir)) 56 | 57 | if storage_dir.exists(): 58 | print(f"Dataset already exists at {storage_dir}. Aborting.") 59 | exit(0) 60 | 61 | try: 62 | print("Downloading {} to {}".format(DATA_URL, download_dir)) 63 | download_datasets(download_dir, DATA_URL) 64 | except Exception as e: 65 | # remove download dir if failed 66 | cleanup_dir(download_dir) 67 | print("Failed to download or extract datasets. Aborting.") 68 | 69 | move_files(download_dir / "videos", storage_dir) 70 | cleanup_dir(download_dir) 71 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/download_scripts/download_flickr.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved.
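[Editor's note, hedged: the download_* scripts in this folder share one
pattern -- resolve the storage path from the dataset YAML, download into a
sibling "download" directory, then extract or move into the cache. A condensed
sketch of that pattern, with the config path and keys taken from
download_coco.py above:

    from pathlib import Path
    from omegaconf import OmegaConf
    from lavis.common.utils import get_abs_path, get_cache_path

    cfg = OmegaConf.load(get_abs_path("configs/datasets/coco/defaults_cap.yaml"))
    storage = Path(get_cache_path(cfg.datasets.coco_caption.build_info.images.storage))
    download_dir = storage.parent / "download"
    # then download_and_extract_archive(...) and finally cleanup_dir(download_dir)

Each script is meant to be run directly, e.g.
`python lavis/datasets/download_scripts/download_coco.py`.
End of note.]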
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | from pathlib import Path 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from lavis.common.utils import ( 14 | cleanup_dir, 15 | get_abs_path, 16 | get_cache_path, 17 | ) 18 | 19 | import opendatasets as od 20 | 21 | 22 | DATA_URL = "https://www.kaggle.com/datasets/hsankesara/flickr-image-dataset" 23 | 24 | print( 25 | """ 26 | To download the dataset, you need to have a Kaggle account and the associated key. 27 | See https://www.kaggle.com/docs/api to create account and a new API token. 28 | """ 29 | ) 30 | 31 | 32 | def move_directory(src_dir, dst_dir): 33 | """ 34 | Move files from download_path to storage_path 35 | """ 36 | print("Moving to {}".format(dst_dir)) 37 | 38 | os.makedirs(dst_dir, exist_ok=True) 39 | 40 | for file_name in os.listdir(src_dir): 41 | os.rename( 42 | os.path.join(src_dir, file_name), 43 | os.path.join(dst_dir, file_name), 44 | ) 45 | 46 | 47 | if __name__ == "__main__": 48 | 49 | config_path = get_abs_path("configs/datasets/flickr30k/defaults.yaml") 50 | 51 | storage_dir = OmegaConf.load( 52 | config_path 53 | ).datasets.flickr30k.build_info.images.storage 54 | 55 | storage_dir = Path(get_cache_path(storage_dir)) 56 | download_dir = storage_dir.parent / "download" 57 | 58 | if storage_dir.exists(): 59 | print(f"Dataset already exists at {storage_dir}. Aborting.") 60 | exit(0) 61 | 62 | os.makedirs(download_dir) 63 | 64 | try: 65 | print("Downloading {} to {}".format(DATA_URL, download_dir)) 66 | od.download(DATA_URL, download_dir) 67 | except Exception as e: 68 | print(e) 69 | # remove download dir if failed 70 | cleanup_dir(download_dir) 71 | exit(1) 72 | 73 | move_directory( 74 | download_dir / "flickr-image-dataset" / "flickr30k_images" / "flickr30k_images", 75 | storage_dir / "flickr30k-images", 76 | ) 77 | 78 | cleanup_dir(download_dir) 79 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/download_scripts/download_gqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | from pathlib import Path 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from lavis.common.utils import ( 14 | cleanup_dir, 15 | download_and_extract_archive, 16 | get_abs_path, 17 | get_cache_path, 18 | ) 19 | 20 | 21 | DATA_URL = "https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip" 22 | 23 | 24 | def download_datasets(root, url): 25 | download_and_extract_archive(url=url, download_root=root, extract_root=storage_dir.parent) 26 | 27 | 28 | if __name__ == "__main__": 29 | 30 | config_path = get_abs_path("configs/datasets/gqa/defaults.yaml") 31 | 32 | storage_dir = OmegaConf.load( 33 | config_path 34 | ).datasets.gqa.build_info.images.storage 35 | 36 | download_dir = Path(get_cache_path(storage_dir)).parent / "download" 37 | storage_dir = Path(get_cache_path(storage_dir)) 38 | 39 | if storage_dir.exists(): 40 | print(f"Dataset already exists at {storage_dir}. 
Aborting.") 41 | exit(0) 42 | 43 | try: 44 | print("Downloading {}".format(DATA_URL)) 45 | download_datasets(download_dir, DATA_URL) 46 | except Exception as e: 47 | # remove download dir if failed 48 | cleanup_dir(download_dir) 49 | print("Failed to download or extract datasets. Aborting.") 50 | 51 | cleanup_dir(download_dir) 52 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/download_scripts/download_msvd.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | from pathlib import Path 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from lavis.common.utils import ( 14 | cleanup_dir, 15 | download_and_extract_archive, 16 | get_abs_path, 17 | get_cache_path, 18 | ) 19 | 20 | 21 | DATA_URL = "https://www.cs.utexas.edu/users/ml/clamp/videoDescription/YouTubeClips.tar" 22 | 23 | 24 | def download_datasets(root, url): 25 | download_and_extract_archive(url=url, download_root=root) 26 | 27 | 28 | def move_files(download_path, storage_path): 29 | """ 30 | Move files from download_path to storage_path 31 | """ 32 | print("Moving to {}".format(storage_path)) 33 | 34 | os.makedirs(storage_path, exist_ok=True) 35 | 36 | for file_name in os.listdir(download_path): 37 | os.rename( 38 | os.path.join(download_path, file_name), 39 | os.path.join(storage_path, file_name), 40 | ) 41 | 42 | 43 | if __name__ == "__main__": 44 | 45 | config_path = get_abs_path("configs/datasets/msvd/defaults_cap.yaml") 46 | 47 | storage_dir = OmegaConf.load( 48 | config_path 49 | ).datasets.msvd_cap.build_info.videos.storage 50 | 51 | download_dir = Path(get_cache_path(storage_dir)).parent / "download" 52 | storage_dir = Path(get_cache_path(storage_dir)) 53 | 54 | if storage_dir.exists(): 55 | print(f"Dataset already exists at {storage_dir}. Aborting.") 56 | exit(0) 57 | 58 | try: 59 | print("Downloading {}".format(DATA_URL)) 60 | download_datasets(download_dir, DATA_URL) 61 | except Exception as e: 62 | # remove download dir if failed 63 | cleanup_dir(download_dir) 64 | print("Failed to download or extract datasets. Aborting.") 65 | 66 | move_files(download_dir / "YouTubeClips", storage_dir) 67 | cleanup_dir(download_dir) 68 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/download_scripts/download_vg.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | from pathlib import Path 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from lavis.common.utils import ( 14 | cleanup_dir, 15 | download_and_extract_archive, 16 | get_abs_path, 17 | get_cache_path, 18 | ) 19 | 20 | 21 | DATA_URL = { 22 | "train": "https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip", 23 | "train2": "https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip", 24 | } 25 | 26 | 27 | def download_datasets(root, url): 28 | download_and_extract_archive(url=url, download_root=root, extract_root=storage_dir) 29 | 30 | 31 | if __name__ == "__main__": 32 | 33 | config_path = get_abs_path("configs/datasets/vg/defaults_caption.yaml") 34 | 35 | storage_dir = OmegaConf.load( 36 | config_path 37 | ).datasets.vg_caption.build_info.images.storage 38 | 39 | download_dir = Path(get_cache_path(storage_dir)).parent / "download" 40 | storage_dir = Path(get_cache_path(storage_dir)) 41 | 42 | if storage_dir.exists(): 43 | print(f"Dataset already exists at {storage_dir}. Aborting.") 44 | exit(0) 45 | 46 | try: 47 | for k, v in DATA_URL.items(): 48 | print("Downloading {} to {}".format(v, download_dir)) 49 | download_datasets(download_dir, v) 50 | except Exception as e: 51 | # remove download dir if failed 52 | cleanup_dir(download_dir) 53 | print("Failed to download or extract datasets. Aborting.") 54 | 55 | cleanup_dir(download_dir) 56 | -------------------------------------------------------------------------------- /Lavis/lavis/models/alpro_models/alpro_outputs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from dataclasses import dataclass 9 | from typing import Optional 10 | 11 | import torch 12 | from transformers.modeling_outputs import ( 13 | BaseModelOutputWithPoolingAndCrossAttentions, 14 | ModelOutput, 15 | ) 16 | 17 | 18 | @dataclass 19 | class AlproSimilarity(ModelOutput): 20 | sim_v2t: torch.FloatTensor = None 21 | sim_t2v: torch.FloatTensor = None 22 | 23 | sim_v2t_targets: Optional[torch.FloatTensor] = None 24 | sim_t2v_targets: Optional[torch.FloatTensor] = None 25 | 26 | 27 | @dataclass 28 | class AlproIntermediateOutput(ModelOutput): 29 | # uni-modal features 30 | video_embeds: torch.FloatTensor = None 31 | text_embeds: Optional[torch.FloatTensor] = None 32 | 33 | # intermediate outputs of multimodal encoder 34 | encoder_output: Optional[BaseModelOutputWithPoolingAndCrossAttentions] = None 35 | encoder_output_neg: Optional[BaseModelOutputWithPoolingAndCrossAttentions] = None 36 | 37 | vtm_logits: Optional[torch.FloatTensor] = None 38 | vtm_labels: Optional[torch.LongTensor] = None 39 | 40 | 41 | @dataclass 42 | class AlproOutput(ModelOutput): 43 | # some finetuned models (e.g. AlproQA) do not compute similarity, thus optional.
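# Editor's note (hedged): a consumer-side sketch of this output container.
# The field names mirror the dataclass below; treating forward() as returning
# an AlproOutput is an assumption based on the LAVIS task loops, not shown here:
#
#   output = model(samples)             # an AlproOutput
#   total_loss = output.loss            # sum of the loss_* terms that were computed
#   if output.sims is not None:         # retrieval-style models only
#       v2t, t2v = output.sims.sim_v2t, output.sims.sim_t2v
#   inter = output.intermediate_output  # AlproIntermediateOutput above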
44 | sims: Optional[AlproSimilarity] = None 45 | 46 | intermediate_output: AlproIntermediateOutput = None 47 | 48 | loss: Optional[torch.FloatTensor] = None 49 | 50 | loss_vtc: Optional[torch.FloatTensor] = None 51 | 52 | loss_vtm: Optional[torch.FloatTensor] = None 53 | 54 | loss_mlm: Optional[torch.FloatTensor] = None 55 | 56 | 57 | @dataclass 58 | class AlproOutputWithLogits(AlproOutput): 59 | logits: torch.FloatTensor = None 60 | -------------------------------------------------------------------------------- /Lavis/lavis/models/blip2_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/Lavis/lavis/models/blip2_models/__init__.py -------------------------------------------------------------------------------- /Lavis/lavis/models/clip_models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | 7 | Based on https://github.com/mlfoundations/open_clip 8 | """ 9 | 10 | """ OpenAI pretrained model functions 11 | Adapted from https://github.com/mlfoundations/open_clip and https://github.com/openai/CLIP. 12 | 13 | Originally MIT License, Copyright (c) 2021 OpenAI. 14 | """ 15 | -------------------------------------------------------------------------------- /Lavis/lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/Lavis/lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /Lavis/lavis/models/clip_models/clip_outputs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | 7 | Based on https://github.com/mlfoundations/open_clip 8 | """ 9 | 10 | from dataclasses import dataclass 11 | 12 | from typing import Optional 13 | 14 | import torch 15 | from transformers.modeling_outputs import ModelOutput 16 | 17 | 18 | @dataclass 19 | class ClipOutputFeatures(ModelOutput): 20 | """ 21 | Data class of features from the CLIP feature extractor.
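(The *_proj fields are assumed, from their names, to hold the embeddings after CLIP's projection heads, i.e. the vectors used to compute image-text similarity; the unprojected *_embeds fields are documented below.)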
22 | 23 | Args: 24 | image_embeds: `torch.FloatTensor` of shape `(batch_size, 1, embed_dim)`, `optional` 25 | image_embeds_proj: `torch.FloatTensor` of shape `(batch_size, 1, feature_dim)`, `optional` 26 | text_embeds: `torch.FloatTensor` of shape `(batch_size, 1, embed_dim)`, `optional` 27 | text_embeds_proj: `torch.FloatTensor` of shape `(batch_size, 1, feature_dim)`, `optional` 28 | """ 29 | 30 | image_embeds: Optional[torch.FloatTensor] = None 31 | image_embeds_proj: Optional[torch.FloatTensor] = None 32 | 33 | text_embeds: Optional[torch.FloatTensor] = None 34 | text_embeds_proj: Optional[torch.FloatTensor] = None 35 | 36 | 37 | @dataclass 38 | class ClipOutput(ModelOutput): 39 | intermediate_output: Optional[ClipOutputFeatures] = None 40 | 41 | logit_scale_exp: Optional[torch.FloatTensor] = None 42 | 43 | loss: Optional[torch.FloatTensor] = None 44 | -------------------------------------------------------------------------------- /Lavis/lavis/models/clip_models/pics/CLIP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/Lavis/lavis/models/clip_models/pics/CLIP.png -------------------------------------------------------------------------------- /Lavis/lavis/models/clip_models/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | 7 | Based on https://github.com/mlfoundations/open_clip 8 | """ 9 | 10 | from torch import nn as nn 11 | from torchvision.ops.misc import FrozenBatchNorm2d 12 | 13 | 14 | def freeze_batch_norm_2d(module, module_match={}, name=""): 15 | """ 16 | Converts all `BatchNorm2d` and `SyncBatchNorm` layers of provided module into `FrozenBatchNorm2d`. If `module` is 17 | itself an instance of either `BatchNorm2d` or `SyncBatchNorm`, it is converted into `FrozenBatchNorm2d` and 18 | returned. Otherwise, the module is walked recursively and submodules are converted in place. 19 | Args: 20 | module (torch.nn.Module): Any PyTorch module.
21 | module_match (dict): Dictionary of full module names to freeze (all if empty) 22 | name (str): Full module name (prefix) 23 | Returns: 24 | torch.nn.Module: Resulting module 25 | Inspired by https://github.com/pytorch/pytorch/blob/a5895f85be0f10212791145bfedc0261d364f103/torch/nn/modules/batchnorm.py#L762 26 | """ 27 | res = module 28 | is_match = True 29 | if module_match: 30 | is_match = name in module_match 31 | if is_match and isinstance( 32 | module, (nn.modules.batchnorm.BatchNorm2d, nn.modules.batchnorm.SyncBatchNorm) 33 | ): 34 | res = FrozenBatchNorm2d(module.num_features) 35 | res.num_features = module.num_features 36 | res.affine = module.affine 37 | if module.affine: 38 | res.weight.data = module.weight.data.clone().detach() 39 | res.bias.data = module.bias.data.clone().detach() 40 | res.running_mean.data = module.running_mean.data 41 | res.running_var.data = module.running_var.data 42 | res.eps = module.eps 43 | else: 44 | for child_name, child in module.named_children(): 45 | full_child_name = ".".join([name, child_name]) if name else child_name 46 | new_child = freeze_batch_norm_2d(child, module_match, full_child_name) 47 | if new_child is not child: 48 | res.add_module(child_name, new_child) 49 | return res 50 | -------------------------------------------------------------------------------- /Lavis/lavis/models/img2prompt_models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import torch 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /Lavis/lavis/models/pnp_vqa_models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import torch 9 | 10 | 11 | def prepare_qa_input(sample, num_captions, num_captions_fid): 12 | """ 13 | Build fusion-in-decoder style reader inputs: the top `num_captions` captions for each 14 | question are grouped into chunks of `num_captions_fid`, each chunk is prefixed with the 15 | lowercased question as "<question> \\n <caption chunk>", and the resulting list of 16 | chunks is stored in sample['question_captions']. 17 | """ 18 | sample_question_captions = [] 19 | 20 | for question, captions in zip(sample['text_input'], sample['captions']): 21 | assert isinstance(captions, list) 22 | question_captions = [] 23 | question_caption = '' 24 | for cap_id, cap_ in enumerate(captions[0:num_captions]): 25 | question_caption += (cap_.strip() + '. ') 26 | if (cap_id + 1) != num_captions and ((cap_id + 1) % num_captions_fid == 0): 27 | question_caption = question.lower().strip() + " \\n " + question_caption.lower().strip() 28 | question_captions.append(question_caption) 29 | question_caption = '' 30 | if (cap_id + 1) == num_captions: 31 | question_caption = question.lower().strip() + " \\n " + question_caption.lower().strip() 32 | question_captions.append(question_caption) 33 | sample_question_captions.append(question_captions) 34 | 35 | sample['question_captions'] = sample_question_captions 36 | -------------------------------------------------------------------------------- /Lavis/lavis/models/timesformer/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | 7 | Based on https://github.com/facebookresearch/TimeSformer 8 | """ 9 | -------------------------------------------------------------------------------- /Lavis/lavis/models/timesformer/linear.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | """ Linear layer (alternate definition) 9 | """ 10 | import torch 11 | import torch.nn.functional as F 12 | from torch import nn as nn 13 | 14 | 15 | class Linear(nn.Linear): 16 | def forward(self, input: torch.Tensor) -> torch.Tensor: 17 | if torch.jit.is_scripting(): 18 | bias = self.bias.to(dtype=input.dtype) if self.bias is not None else None 19 | return F.linear(input, self.weight.to(dtype=input.dtype), bias=bias) 20 | else: 21 | return F.linear(input, self.weight, self.bias) 22 | -------------------------------------------------------------------------------- /Lavis/lavis/processors/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.processors.base_processor import BaseProcessor 9 | 10 | from lavis.processors.alpro_processors import ( 11 | AlproVideoTrainProcessor, 12 | AlproVideoEvalProcessor, 13 | ) 14 | from lavis.processors.blip_processors import ( 15 | BlipImageTrainProcessor, 16 | Blip2ImageTrainProcessor, 17 | BlipImageEvalProcessor, 18 | BlipCaptionProcessor, 19 | ) 20 | from lavis.processors.gpt_processors import ( 21 | GPTVideoFeatureProcessor, 22 | GPTDialogueProcessor, 23 | ) 24 | from lavis.processors.clip_processors import ClipImageTrainProcessor 25 | 26 | from lavis.common.registry import registry 27 | 28 | __all__ = [ 29 | "BaseProcessor", 30 | # ALPRO 31 | "AlproVideoTrainProcessor", 32 | "AlproVideoEvalProcessor", 33 | # BLIP 34 | "BlipImageTrainProcessor", 35 | "Blip2ImageTrainProcessor", 36 | "BlipImageEvalProcessor", 37 | "BlipCaptionProcessor", 38 | "ClipImageTrainProcessor", 39 | # GPT 40 | "GPTVideoFeatureProcessor", 41 | "GPTDialogueProcessor", 42 | ] 43 | 44 | 45 | def load_processor(name, cfg=None): 46 | """ 47 | Example 48 | 49 | >>> processor = load_processor("alpro_video_train", cfg=None) 50 | """ 51 | processor = registry.get_processor_class(name).from_config(cfg) 52 | 53 | return processor 54 | -------------------------------------------------------------------------------- /Lavis/lavis/processors/base_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from omegaconf import OmegaConf 9 | 10 | 11 | class BaseProcessor: 12 | def __init__(self): 13 | self.transform = lambda x: x 14 | return 15 | 16 | def __call__(self, item): 17 | return self.transform(item) 18 | 19 | @classmethod 20 | def from_config(cls, cfg=None): 21 | return cls() 22 | 23 | def build(self, **kwargs): 24 | cfg = OmegaConf.create(kwargs) 25 | 26 | return self.from_config(cfg) 27 | -------------------------------------------------------------------------------- /Lavis/lavis/projects/blip2/direct_aokvqa_zeroshot_flant5xl_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | # Overall Accuracy is: 41.22 7 | 8 | model: 9 | arch: blip2_t5_par 10 | model_type: pretrain_flant5xl 11 | use_grad_checkpoint: False 12 | max_txt_len: 128 13 | prompt: "Question: {} Short Answer: " 14 | multiple_choice: False 15 | 16 | keyword_pipeline: True 17 | reason: True 18 | paraphrase: False 19 | 20 | ext_paraphrase: False 21 | par_num_beams: 5 22 | num_add_candidates: 4 23 | 24 | perform_selection: False 25 | selection_criterion: 'Aconf' 26 | calibrate: False 27 | perform_ensembling: False 28 | dropout_aggregate: False 29 | 30 | constrained: True 31 | verbose: False 32 | 33 | use_caption: False 34 | use_promptcap: False 35 | alt_device: 0 36 | 37 | # for OKVQA evaluation 38 | apply_lemmatizer: False 39 | 40 | datasets: 41 | aok_vqa: # name of the dataset builder 42 | type: eval 43 | vis_processor: 44 | eval: 45 | name: "blip_image_eval" 46 | image_size: 224 47 | text_processor: 48 | eval: 49 | name: "blip_question" 50 | 51 | 52 | run: 53 | task: aok_vqa 54 | # optimization-specific 55 | batch_size_train: 16 56 | batch_size_eval: 10 57 | num_workers: 4 58 | 59 | # inference-specific 60 | max_len: 10 61 | min_len: 1 62 | num_beams: 5 63 | inference_method: "generate" 64 | 65 | seed: 42 66 | output_dir: "output/BLIP2/AOKVQA-direct" 67 | 68 | evaluate: True 69 | test_splits: ["val"] 70 | 71 | # distribution-specific 72 | device: "cuda" 73 | world_size: 1 74 | dist_url: "env://" 75 | distributed: True 76 | -------------------------------------------------------------------------------- /Lavis/lavis/projects/blip2/mc_aokvqa_zeroshot_flant5xl_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | 7 | model: 8 | arch: blip2_t5_par 9 | model_type: pretrain_flant5xl 10 | use_grad_checkpoint: False 11 | max_txt_len: 128 12 | prompt: "Based on this information, select the correct answer to the question from the options.\nQuestion: {}\nOptions: A. {}, B. {}, C. {}, D. 
{}\nAnswer: Option " 13 | multiple_choice: True 14 | 15 | keyword_pipeline: False 16 | reason: False 17 | paraphrase: False 18 | 19 | ext_paraphrase: False 20 | par_num_beams: 5 21 | num_add_candidates: 0 22 | 23 | perform_selection: False 24 | selection_criterion: 'Aconf' 25 | calibrate: False 26 | perform_ensembling: False 27 | dropout_aggregate: False 28 | 29 | constrained: True 30 | verbose: False 31 | 32 | use_caption: False 33 | use_promptcap: False 34 | alt_device: 0 35 | 36 | # for OKVQA evaluation 37 | apply_lemmatizer: False 38 | 39 | datasets: 40 | aok_vqa: # name of the dataset builder 41 | type: eval 42 | vis_processor: 43 | eval: 44 | name: "blip_image_eval" 45 | image_size: 224 46 | text_processor: 47 | eval: 48 | name: "blip_question" 49 | # build_info: 50 | # images: 51 | # storage: '/export/share/datasets/vision/coco/images/' 52 | 53 | run: 54 | task: mc_aok_vqa 55 | # optimization-specific 56 | batch_size_train: 16 57 | batch_size_eval: 24 58 | num_workers: 4 59 | 60 | # inference-specific 61 | max_len: 10 62 | min_len: 1 63 | num_beams: 5 64 | inference_method: "generate" 65 | 66 | seed: 42 67 | output_dir: "output/BLIP2/AOKVQA-MC" 68 | 69 | evaluate: True 70 | test_splits: ["val"] 71 | 72 | # distribution-specific 73 | device: "cuda" 74 | world_size: 1 75 | dist_url: "env://" 76 | distributed: True 77 | -------------------------------------------------------------------------------- /Lavis/lavis/projects/blip2/vqav2_zeroshot_flant5xl_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | 7 | model: 8 | arch: blip2_t5_par 9 | model_type: pretrain_flant5xl 10 | use_grad_checkpoint: False 11 | prompt: "Question: {} Short answer:" 12 | max_txt_len: 256 13 | 14 | keyword_pipeline: True 15 | reason: True 16 | paraphrase: False 17 | ext_paraphrase: True 18 | par_num_beams: 5 19 | num_add_candidates: 4 20 | 21 | perform_selection: False 22 | selection_criterion: 'Aconf' 23 | calibrate: False 24 | perform_ensembling: False 25 | dropout_aggregate: False 26 | 27 | constrained: True 28 | verbose: True 29 | 30 | use_caption: False 31 | use_promptcap: False 32 | alt_device: 0 33 | 34 | datasets: 35 | coco_vqa: # name of the dataset builder 36 | type: eval 37 | vis_processor: 38 | eval: 39 | name: "blip_image_eval" 40 | image_size: 224 41 | text_processor: 42 | eval: 43 | name: "blip_question" 44 | # build_info: 45 | # images: 46 | # storage: '/export/share/datasets/vision/coco/images/' 47 | 48 | run: 49 | task: vqa 50 | # optimization-specific 51 | batch_size_train: 16 52 | batch_size_eval: 16 53 | num_workers: 4 54 | 55 | # inference-specific 56 | max_len: 10 57 | min_len: 1 58 | num_beams: 5 59 | inference_method: "generate" 60 | #"Short answer:" 61 | 62 | seed: 42 63 | output_dir: "output/BLIP2/VQA" 64 | 65 | evaluate: True 66 | test_splits: ["val"] 67 | 68 | # distribution-specific 69 | device: "cuda" 70 | world_size: 1 71 | dist_url: "env://" 72 | distributed: True 73 | -------------------------------------------------------------------------------- /Lavis/lavis/runners/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.runners.runner_base import RunnerBase 9 | from lavis.runners.runner_iter import RunnerIter 10 | 11 | __all__ = ["RunnerBase", "RunnerIter"] 12 | -------------------------------------------------------------------------------- /Lavis/lavis/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.tasks.base_task import BaseTask 10 | from lavis.tasks.captioning import CaptionTask 11 | from lavis.tasks.image_text_pretrain import ImageTextPretrainTask 12 | from lavis.tasks.multimodal_classification import ( 13 | MultimodalClassificationTask, 14 | ) 15 | from lavis.tasks.retrieval import RetrievalTask 16 | from lavis.tasks.vqa import VQATask, GQATask, AOKVQATask, MultiChoiceAOKVQATask 17 | from lavis.tasks.vqa_reading_comprehension import VQARCTask, GQARCTask 18 | from lavis.tasks.dialogue import DialogueTask 19 | 20 | 21 | def setup_task(cfg): 22 | assert "task" in cfg.run_cfg, "Task name must be provided." 23 | 24 | task_name = cfg.run_cfg.task 25 | task = registry.get_task_class(task_name).setup_task(cfg=cfg) 26 | assert task is not None, "Task {} not properly registered.".format(task_name) 27 | 28 | return task 29 | 30 | 31 | __all__ = [ 32 | "BaseTask", 33 | "AOKVQATask", 34 | "RetrievalTask", 35 | "CaptionTask", 36 | "VQATask", 37 | "GQATask", 38 | "VQARCTask", 39 | "GQARCTask", 40 | "MultimodalClassificationTask", 41 | "MultiChoiceAOKVQATask", 42 | # "VideoQATask", 43 | # "VisualEntailmentTask", 44 | "ImageTextPretrainTask", 45 | "DialogueTask", 46 | ] 47 | -------------------------------------------------------------------------------- /Lavis/lavis/tasks/image_text_pretrain.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.tasks.base_task import BaseTask 10 | 11 | 12 | @registry.register_task("image_text_pretrain") 13 | class ImageTextPretrainTask(BaseTask): 14 | def __init__(self): 15 | super().__init__() 16 | 17 | def evaluation(self, model, data_loader, cuda_enabled=True): 18 | pass 19 | -------------------------------------------------------------------------------- /Lavis/requirements.txt: -------------------------------------------------------------------------------- 1 | contexttimer 2 | decord 3 | einops>=0.4.1 4 | fairscale==0.4.4 5 | ftfy 6 | iopath 7 | ipython 8 | omegaconf 9 | opencv-python-headless==4.5.5.64 10 | opendatasets 11 | packaging 12 | pandas 13 | plotly 14 | pre-commit 15 | pycocoevalcap 16 | pycocotools 17 | python-magic 18 | scikit-image 19 | sentencepiece 20 | spacy 21 | streamlit 22 | timm==0.4.12 23 | torch>=1.10.0 24 | torchvision 25 | tqdm 26 | transformers>=4.25.0,<4.27 27 | webdataset 28 | wheel 29 | rake-nltk 30 | -------------------------------------------------------------------------------- /Lavis/setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from setuptools import setup, find_namespace_packages 9 | import platform 10 | 11 | DEPENDENCY_LINKS = [] 12 | if platform.system() == "Windows": 13 | DEPENDENCY_LINKS.append("https://download.pytorch.org/whl/torch_stable.html") 14 | 15 | 16 | def fetch_requirements(filename): 17 | with open(filename) as f: 18 | return [ln.strip() for ln in f.read().split("\n")] 19 | 20 | 21 | setup( 22 | name="salesforce-lavis", 23 | version="1.0.1", 24 | author="Dongxu Li, Junnan Li, Hung Le, Guangsen Wang, Silvio Savarese, Steven C.H. 
Hoi", 25 | description="LAVIS - A One-stop Library for Language-Vision Intelligence", 26 | long_description=open("README.md", "r", encoding="utf-8").read(), 27 | long_description_content_type="text/markdown", 28 | keywords="Vision-Language, Multimodal, Image Captioning, Generative AI, Deep Learning, Library, PyTorch", 29 | license="3-Clause BSD", 30 | packages=find_namespace_packages(include="lavis.*"), 31 | install_requires=fetch_requirements("requirements.txt"), 32 | python_requires=">=3.7.0", 33 | include_package_data=True, 34 | dependency_links=DEPENDENCY_LINKS, 35 | zip_safe=False, 36 | ) 37 | -------------------------------------------------------------------------------- /MiniGPT-4/environment.yml: -------------------------------------------------------------------------------- 1 | name: minigpt4 2 | channels: 3 | - pytorch 4 | - defaults 5 | - anaconda 6 | dependencies: 7 | - python=3.9 8 | - cudatoolkit 9 | - pip 10 | - pytorch=1.12.1 11 | - pytorch-mutex=1.0=cuda 12 | - torchaudio=0.12.1 13 | - torchvision=0.13.1 14 | - pip: 15 | - accelerate==0.16.0 16 | - aiohttp==3.8.4 17 | - aiosignal==1.3.1 18 | - async-timeout==4.0.2 19 | - attrs==22.2.0 20 | - bitsandbytes==0.37.0 21 | - cchardet==2.1.7 22 | - chardet==5.1.0 23 | - contourpy==1.0.7 24 | - cycler==0.11.0 25 | - filelock==3.9.0 26 | - fonttools==4.38.0 27 | - frozenlist==1.3.3 28 | - huggingface-hub==0.13.4 29 | - importlib-resources==5.12.0 30 | - kiwisolver==1.4.4 31 | - matplotlib==3.7.0 32 | - multidict==6.0.4 33 | - openai==0.27.0 34 | - packaging==23.0 35 | - psutil==5.9.4 36 | - pycocotools==2.0.6 37 | - pyparsing==3.0.9 38 | - python-dateutil==2.8.2 39 | - pyyaml==6.0 40 | - regex==2022.10.31 41 | - tokenizers==0.13.2 42 | - tqdm==4.64.1 43 | - transformers==4.28.0 44 | - timm==0.6.13 45 | - spacy==3.5.1 46 | - webdataset==0.2.48 47 | - scikit-learn==1.2.2 48 | - scipy==1.10.1 49 | - yarl==1.8.2 50 | - zipp==3.14.0 51 | - omegaconf==2.3.0 52 | - opencv-python==4.7.0.72 53 | - iopath==0.1.10 54 | - decord==0.6.0 55 | - tenacity==8.2.2 56 | - peft 57 | - pycocoevalcap 58 | - sentence-transformers 59 | - umap-learn 60 | - notebook 61 | - gradio==3.24.1 62 | - gradio-client==0.0.8 63 | - wandb 64 | - rake-nltk 65 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from minigpt4.common.registry import registry 14 | 15 | from minigpt4.datasets.builders import * 16 | from minigpt4.models import * 17 | from minigpt4.processors import * 18 | from minigpt4.tasks import * 19 | 20 | 21 | root_dir = os.path.dirname(os.path.abspath(__file__)) 22 | default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml")) 23 | 24 | registry.register_path("library_root", root_dir) 25 | repo_root = os.path.join(root_dir, "..") 26 | registry.register_path("repo_root", repo_root) 27 | cache_root = os.path.join(repo_root, default_cfg.env.cache_root) 28 | registry.register_path("cache_root", cache_root) 29 | 30 | registry.register("MAX_INT", sys.maxsize) 31 | registry.register("SPLIT_NAMES", ["train", "val", "test"]) 32 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/MiniGPT-4/minigpt4/common/__init__.py -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/common/gradcam.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | from scipy.ndimage import filters 4 | from skimage import transform as skimage_transform 5 | 6 | 7 | def getAttMap(img, attMap, blur=True, overlap=True): 8 | attMap -= attMap.min() 9 | if attMap.max() > 0: 10 | attMap /= attMap.max() 11 | attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant") 12 | if blur: 13 | attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2])) 14 | attMap -= attMap.min() 15 | attMap /= attMap.max() 16 | cmap = plt.get_cmap("jet") 17 | attMapV = cmap(attMap) 18 | attMapV = np.delete(attMapV, 3, 2) 19 | if overlap: 20 | attMap = ( 21 | 1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img 22 | + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV 23 | ) 24 | return attMap 25 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/common/vqa_tools/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | __author__ = "aagrawal" 9 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/configs/datasets/aokvqa/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | aok_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json 17 | storage: 18 | - aokvqa/annotations/aokvqa_v1p0_train.json 19 | val: 20 | url: 21 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json 22 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/specialized_vocab_train.json 23 | storage: 24 | - aokvqa/annotations/aokvqa_v1p0_val.json 25 | - aokvqa/annotations/specialized_vocab_train_lavis.json 26 | # - aokvqa/annotations/large_vocab_train_lavis.json 27 | test: 28 | url: 29 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json 30 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/specialized_vocab_train.json 31 | storage: 32 | - aokvqa/annotations/aokvqa_v1p0_test.json 33 | - aokvqa/annotations/specialized_vocab_train_lavis.json 34 | images: 35 | storage: coco/images/ 36 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/configs/datasets/aokvqa/eval_aokvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | aok_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | val: 15 | url: 16 | - aokvqa/annotations/aokvqa_v1p0_val.json 17 | - aokvqa/annotations/specialized_vocab_train_lavis.json 18 | storage: 19 | - aokvqa/annotations/aokvqa_v1p0_val.json 20 | - aokvqa/annotations/specialized_vocab_train_lavis.json 21 | # - aokvqa/annotations/large_vocab_train_lavis.json 22 | images: 23 | storage: coco/images/ 24 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/configs/datasets/cc_sbu/align.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | cc_sbu_align: 3 | data_type: images 4 | build_info: 5 | storage: /path/to/cc_sbu_align/ 6 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/configs/datasets/cc_sbu/defaults.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | cc_sbu: 3 | data_type: images 4 | build_info: 5 | storage: /path/to/cc_sbu_dataset/{00000..01255}.tar 6 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/configs/datasets/coco/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_caption: # name of the dataset builder 8 | dataset_card: dataset_card/coco_caption.md 9 | # data_dir: ${env.data_dir}/datasets 10 | data_type: images # [images|videos|features] 11 | 12 | build_info: 13 | # Be careful not to append minus sign (-) before split to avoid itemizing 14 | annotations: 15 | train: 16 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json 17 | md5: aa31ac474cf6250ebb81d18348a07ed8 18 | storage: coco/annotations/coco_karpathy_train.json 19 | val: 20 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json 21 | md5: b273847456ef5580e33713b1f7de52a0 22 | storage: coco/annotations/coco_karpathy_val.json 23 | test: 24 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json 25 | md5: 3ff34b0ef2db02d01c37399f6a2a6cd1 26 | storage: coco/annotations/coco_karpathy_test.json 27 | images: 28 | storage: coco/images/ 29 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/configs/datasets/coco/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_retrieval: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json 16 | md5: aa31ac474cf6250ebb81d18348a07ed8 17 | storage: coco/annotations/coco_karpathy_train.json 18 | val: 19 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json 20 | md5: b273847456ef5580e33713b1f7de52a0 21 | storage: coco/annotations/coco_karpathy_val.json 22 | test: 23 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json 24 | md5: 3ff34b0ef2db02d01c37399f6a2a6cd1 25 | storage: coco/annotations/coco_karpathy_test.json 26 | images: 27 | storage: coco/images/ 28 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/configs/datasets/coco/eval_vqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | val: 15 | url: 16 | - coco/annotations/vqa_val_eval.json 17 | - coco/annotations/answer_list.json 18 | - coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json 19 | - coco/annotations/v2_mscoco_val2014_annotations.json 20 | 21 | storage: 22 | - coco/annotations/vqa_val_eval.json 23 | - coco/annotations/answer_list.json 24 | - coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json 25 | - coco/annotations/v2_mscoco_val2014_annotations.json 26 | images: 27 | storage: coco/images/ 28 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/configs/datasets/laion/defaults.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | laion: 3 | data_type: images 4 | build_info: 5 | storage: /path/to/laion_dataset/{00000..10488}.tar 6 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/configs/default.yaml: -------------------------------------------------------------------------------- 1 | env: 2 | # For default users 3 | # cache_root: "cache" 4 | # For internal use with persistent storage 5 | cache_root: ".cache/lavis" 6 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/configs/models/minigpt4_llama2.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: mini_gpt4 3 | 4 | # vit encoder 5 | image_size: 224 6 | drop_path_rate: 0 7 | use_grad_checkpoint: False 8 | vit_precision: "fp16" 9 | freeze_vit: True 10 | has_qformer: False 11 | 12 | # generation configs 13 | prompt: "" 14 | 15 | llama_model: 'meta-llama/Llama-2-7b-chat-hf' 16 | 17 | preprocess: 18 | vis_processor: 19 | train: 20 | name: "blip2_image_train" 21 | image_size: 224 22 | eval: 23 | name: "blip2_image_eval" 24 | image_size: 224 25 | text_processor: 26 | train: 27 | name: "blip_caption" 28 | eval: 29 | name: "blip_caption" 30 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/configs/models/minigpt4_vicuna0.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: mini_gpt4 3 | 4 | # vit encoder 5 | image_size: 224 6 | drop_path_rate: 0 7 | use_grad_checkpoint: False 8 | vit_precision: "fp16" 9 | freeze_vit: True 10 | freeze_qformer: True 11 | 12 | # Q-Former 13 | num_query_token: 32 14 | 15 | # generation configs 16 | prompt: "" 17 | 18 | llama_model: "Vision-CAIR/vicuna-7b" 19 | 20 | preprocess: 21 | vis_processor: 22 | train: 23 | name: "blip2_image_train" 24 | image_size: 224 25 | eval: 26 | name: "blip2_image_eval" 27 | image_size: 224 28 | text_processor: 29 | train: 30 | name: "blip_caption" 31 | eval: 32 | name: "blip_caption" 33 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/conversation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/MiniGPT-4/minigpt4/conversation/__init__.py 
-------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/MiniGPT-4/minigpt4/datasets/__init__.py -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/datasets/builders/vqa_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from minigpt4.datasets.builders.base_dataset_builder import BaseDatasetBuilder 9 | 10 | from minigpt4.common.registry import registry 11 | from minigpt4.datasets.datasets.aok_vqa_datasets import AOKVQADataset, AOKVQAEvalDataset 12 | from minigpt4.datasets.datasets.coco_vqa_datasets import COCOVQADataset, COCOVQAEvalDataset 13 | 14 | 15 | @registry.register_builder("coco_vqa") 16 | class COCOVQABuilder(BaseDatasetBuilder): 17 | train_dataset_cls = COCOVQADataset 18 | eval_dataset_cls = COCOVQAEvalDataset 19 | 20 | DATASET_CONFIG_DICT = { 21 | "default": "configs/datasets/coco/defaults_vqa.yaml", 22 | "eval": "configs/datasets/coco/eval_vqa.yaml", 23 | } 24 | 25 | 26 | @registry.register_builder("ok_vqa") 27 | class OKVQABuilder(COCOVQABuilder): 28 | DATASET_CONFIG_DICT = { 29 | "default": "configs/datasets/okvqa/defaults.yaml", 30 | } 31 | 32 | 33 | @registry.register_builder("aok_vqa") 34 | class AOKVQABuilder(BaseDatasetBuilder): 35 | train_dataset_cls = AOKVQADataset 36 | eval_dataset_cls = AOKVQAEvalDataset 37 | 38 | DATASET_CONFIG_DICT = { 39 | "default": "configs/datasets/aokvqa/defaults.yaml", 40 | "eval": "configs/datasets/aokvqa/eval_aokvqa.yaml", 41 | } 42 | 43 | 44 | 45 | # @registry.register_builder("gqa") 46 | # class GQABuilder(BaseDatasetBuilder): 47 | # train_dataset_cls = GQADataset 48 | # eval_dataset_cls = GQAEvalDataset 49 | 50 | # DATASET_CONFIG_DICT = { 51 | # "default": "configs/datasets/gqa/defaults.yaml", 52 | # "balanced_val": "configs/datasets/gqa/balanced_val.yaml", 53 | # "balanced_testdev": "configs/datasets/gqa/balanced_testdev.yaml", 54 | # } -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/datasets/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/MiniGPT-4/minigpt4/datasets/datasets/__init__.py -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/datasets/datasets/base_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import json 9 | from typing import Iterable 10 | 11 | from torch.utils.data import Dataset, ConcatDataset 12 | from torch.utils.data.dataloader import default_collate 13 | 14 | 15 | class BaseDataset(Dataset): 16 | def __init__( 17 | self, vis_processor=None, text_processor=None, vis_root=None, ann_paths=[] 18 | ): 19 | """ 20 | vis_root (string): Root directory of images (e.g. coco/images/) 21 | ann_paths (list): paths to the annotation files 22 | """ 23 | self.vis_root = vis_root 24 | self.annotation = [] 25 | for ann_path in ann_paths: 26 | self.annotation.extend(json.load(open(ann_path, "r"))['annotations']) 27 | 28 | self.vis_processor = vis_processor 29 | self.text_processor = text_processor 30 | 31 | self._add_instance_ids() 32 | 33 | def __len__(self): 34 | return len(self.annotation) 35 | 36 | def collater(self, samples): 37 | return default_collate(samples) 38 | 39 | def set_processors(self, vis_processor, text_processor): 40 | self.vis_processor = vis_processor 41 | self.text_processor = text_processor 42 | 43 | def _add_instance_ids(self, key="instance_id"): 44 | for idx, ann in enumerate(self.annotation): 45 | ann[key] = str(idx) 46 | 47 | 48 | class ConcatDataset(ConcatDataset): 49 | def __init__(self, datasets: Iterable[Dataset]) -> None: 50 | super().__init__(datasets) 51 | 52 | def collater(self, samples): 53 | # TODO For now only supports datasets with same underlying collater implementations 54 | 55 | all_keys = set() 56 | for s in samples: 57 | all_keys.update(s) 58 | 59 | shared_keys = all_keys 60 | for s in samples: 61 | shared_keys = shared_keys & set(s.keys()) 62 | 63 | samples_shared_keys = [] 64 | for s in samples: 65 | samples_shared_keys.append({k: s[k] for k in s.keys() if k in shared_keys}) 66 | 67 | return self.datasets[0].collater(samples_shared_keys) 68 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/datasets/datasets/cc_sbu_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | from PIL import Image 3 | import webdataset as wds 4 | from minigpt4.datasets.datasets.base_dataset import BaseDataset 5 | from minigpt4.datasets.datasets.caption_datasets import CaptionDataset 6 | 7 | 8 | class CCSBUDataset(BaseDataset): 9 | def __init__(self, vis_processor, text_processor, location): 10 | super().__init__(vis_processor=vis_processor, text_processor=text_processor) 11 | 12 | self.inner_dataset = wds.DataPipeline( 13 | wds.ResampledShards(location), 14 | wds.tarfile_to_samples(handler=wds.warn_and_continue), 15 | wds.shuffle(1000, handler=wds.warn_and_continue), 16 | wds.decode("pilrgb", handler=wds.warn_and_continue), 17 | wds.to_tuple("jpg", "json", handler=wds.warn_and_continue), 18 | wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue), 19 | wds.map(self.to_dict, handler=wds.warn_and_continue), 20 | ) 21 | 22 | def to_dict(self, sample): 23 | return { 24 | "image": sample[0], 25 | "answer": self.text_processor(sample[1]["caption"]), 26 | } 27 | 28 | 29 | class CCSBUAlignDataset(CaptionDataset): 30 | 31 | def __getitem__(self, index): 32 | 33 | # TODO this assumes image input, not general enough 34 | ann = self.annotation[index] 35 | 36 | img_file = '{}.jpg'.format(ann["image_id"]) 37 | image_path = os.path.join(self.vis_root, img_file) 38 | image = 
Image.open(image_path).convert("RGB") 39 | 40 | image = self.vis_processor(image) 41 | caption = ann["caption"] 42 | 43 | return { 44 | "image": image, 45 | "answer": caption, 46 | "image_id": self.img_ids[ann["image_id"]], 47 | } -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/datasets/datasets/laion_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import webdataset as wds 9 | from minigpt4.datasets.datasets.base_dataset import BaseDataset 10 | 11 | 12 | class LaionDataset(BaseDataset): 13 | def __init__(self, vis_processor, text_processor, location): 14 | super().__init__(vis_processor=vis_processor, text_processor=text_processor) 15 | 16 | self.inner_dataset = wds.DataPipeline( 17 | wds.ResampledShards(location), 18 | wds.tarfile_to_samples(handler=wds.warn_and_continue), 19 | wds.shuffle(1000, handler=wds.warn_and_continue), 20 | wds.decode("pilrgb", handler=wds.warn_and_continue), 21 | wds.to_tuple("jpg", "json", handler=wds.warn_and_continue), 22 | wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue), 23 | wds.map(self.to_dict, handler=wds.warn_and_continue), 24 | ) 25 | 26 | def to_dict(self, sample): 27 | return { 28 | "image": sample[0], 29 | "answer": self.text_processor(sample[1]["caption"]), 30 | } 31 | 32 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/datasets/datasets/vqa_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import torch 9 | 10 | from minigpt4.datasets.datasets.base_dataset import BaseDataset 11 | 12 | 13 | class VQADataset(BaseDataset): 14 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 15 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 16 | 17 | def collater(self, samples): 18 | image_list, question_list, answer_list, weight_list = [], [], [], [] 19 | 20 | num_answers = [] 21 | 22 | for sample in samples: 23 | image_list.append(sample["image"]) 24 | question_list.append(sample["text_input"]) 25 | 26 | weight_list.extend(sample["weights"]) 27 | 28 | answers = sample["answers"] 29 | 30 | answer_list.extend(answers) 31 | num_answers.append(len(answers)) 32 | 33 | return { 34 | "image": torch.stack(image_list, dim=0), 35 | "text_input": question_list, 36 | "answer": answer_list, 37 | "weight": torch.Tensor(weight_list), 38 | "n_answers": torch.LongTensor(num_answers), 39 | } 40 | 41 | 42 | class VQAEvalDataset(BaseDataset): 43 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 44 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 45 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/processors/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from minigpt4.processors.base_processor import BaseProcessor 9 | from minigpt4.processors.blip_processors import ( 10 | Blip2ImageTrainProcessor, 11 | Blip2ImageEvalProcessor, 12 | BlipCaptionProcessor, 13 | ) 14 | 15 | from minigpt4.common.registry import registry 16 | 17 | __all__ = [ 18 | "BaseProcessor", 19 | "Blip2ImageTrainProcessor", 20 | "Blip2ImageEvalProcessor", 21 | "BlipCaptionProcessor", 22 | ] 23 | 24 | 25 | def load_processor(name, cfg=None): 26 | """ 27 | Example 28 | 29 | >>> processor = load_processor("alpro_video_train", cfg=None) 30 | """ 31 | processor = registry.get_processor_class(name).from_config(cfg) 32 | 33 | return processor 34 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/processors/base_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from omegaconf import OmegaConf 9 | 10 | 11 | class BaseProcessor: 12 | def __init__(self): 13 | self.transform = lambda x: x 14 | return 15 | 16 | def __call__(self, item): 17 | return self.transform(item) 18 | 19 | @classmethod 20 | def from_config(cls, cfg=None): 21 | return cls() 22 | 23 | def build(self, **kwargs): 24 | cfg = OmegaConf.create(kwargs) 25 | 26 | return self.from_config(cfg) 27 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/projects/minigpt4/conv_direct_aokvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: mini_gpt4 8 | model_type: pretrain_vicuna0 9 | end_sym: "###" 10 | max_txt_len: 256 11 | ckpt: '.cache/hub/models--minigpt4--vicuna-7b/pretrained_minigpt4_vicuna_7b.pth' 12 | use_grad_checkpoint: False 13 | answer_refinement_prompt: " Assistant: {}\n###Human: Shorten your answer to the question as much as possible, preferably only 1 word."
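# NOTE (assumed semantics, inferred from the surrounding keys): the refinement prompt above drives a second conversational turn in which the model's free-form answer replaces {} and the model is asked to shorten it to a VQA-style answer; conversation: True below enables this two-turn flow.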
14 | prompt_template: "### Human: ### Human: Based on the image, answer the question below.\nQuestion: {}" 15 | process_answer: True 16 | answer_processor: 'aok-vqa' 17 | conversation: True 18 | multiple_choice: False 19 | 20 | keyword_pipeline: True 21 | reason: True 22 | paraphrase: False 23 | 24 | perform_selection: False 25 | selection_criterion: 'Aconf' 26 | perform_ensembling: False 27 | 28 | ext_paraphrase: False 29 | par_num_beams: 5 30 | num_add_candidates: 4 31 | verbose: True 32 | alt_device: 0 33 | 34 | datasets: 35 | aok_vqa: # name of the dataset builder 36 | type: eval 37 | vis_processor: 38 | eval: 39 | name: "blip2_image_eval" 40 | image_size: 224 41 | text_processor: 42 | eval: 43 | name: "blip_question" 44 | build_info: 45 | images: 46 | storage: '.cache/lavis/coco/images/' 47 | 48 | run: 49 | task: aok_vqa 50 | # optimization-specific 51 | batch_size_train: 16 52 | batch_size_eval: 5 53 | num_workers: 4 54 | 55 | # inference-specific 56 | max_len: 30 57 | min_len: 1 58 | num_beams: 5 59 | inference_method: "generate" 60 | 61 | seed: 42 62 | output_dir: "output/Vicuna7B/AOK-VQA" 63 | 64 | evaluate: True 65 | test_splits: ["val"] 66 | 67 | # distribution-specific 68 | device: "cuda" 69 | world_size: 1 70 | dist_url: "env://" 71 | distributed: True 72 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/projects/minigpt4/conv_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | 7 | model: 8 | arch: mini_gpt4 9 | model_type: pretrain_vicuna0 10 | end_sym: "###" 11 | max_txt_len: 200 12 | ckpt: '.cache/hub/models--minigpt4--vicuna-7b/pretrained_minigpt4_vicuna_7b.pth' 13 | use_grad_checkpoint: False 14 | answer_refinement_prompt: " Assistant: {}\n###Human: Shorten your answer to the question as much as possible, preferably only 1 word."
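# NOTE (assumed semantics, inferred from the flag names): keyword_pipeline and reason below turn on RepARe's keyword extraction and rationale generation for building rephrased question candidates, num_add_candidates sets how many extra candidates are generated per question, and perform_selection with selection_criterion 'Aconf' would keep the candidate with the highest answer confidence when enabled.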
15 | prompt_template: "### Human: ### Human: Based on the image, answer the question below.\nQuestion: {}" 16 | process_answer: True 17 | answer_processor: 'vqa' 18 | conversation: True 19 | 20 | ext_paraphrase: False 21 | par_num_beams: 5 22 | num_add_candidates: 4 23 | 24 | keyword_pipeline: True 25 | reason: True 26 | paraphrase: False 27 | 28 | perform_selection: False 29 | selection_criterion: 'Aconf' 30 | perform_ensembling: False 31 | 32 | verbose: True 33 | alt_device: 0 34 | 35 | datasets: 36 | coco_vqa: # name of the dataset builder 37 | type: eval 38 | vis_processor: 39 | eval: 40 | name: "blip2_image_eval" 41 | image_size: 224 42 | text_processor: 43 | eval: 44 | name: "blip_question" 45 | build_info: 46 | images: 47 | storage: '.cache/lavis/coco/images/' 48 | 49 | run: 50 | task: vqa 51 | # optimization-specific 52 | batch_size_train: 16 53 | batch_size_eval: 4 54 | num_workers: 4 55 | 56 | # inference-specific 57 | max_len: 50 58 | min_len: 1 59 | num_beams: 5 60 | inference_method: "generate" 61 | 62 | seed: 42 63 | output_dir: "output/Vicuna7B/VQA" 64 | 65 | evaluate: True 66 | test_splits: ["val"] 67 | 68 | # distribution-specific 69 | device: "cuda" 70 | world_size: 1 71 | dist_url: "env://" 72 | distributed: True 73 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/runners/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from minigpt4.runners.runner_base import RunnerBase 9 | 10 | __all__ = ["RunnerBase"] 11 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from minigpt4.common.registry import registry 9 | from minigpt4.tasks.base_task import BaseTask 10 | from minigpt4.tasks.image_text_pretrain import ImageTextPretrainTask 11 | from minigpt4.tasks.vqa import VQATask, AOKVQATask, MultiChoiceAOKVQATask 12 | 13 | 14 | def setup_task(cfg): 15 | assert "task" in cfg.run_cfg, "Task name must be provided." 16 | 17 | task_name = cfg.run_cfg.task 18 | task = registry.get_task_class(task_name).setup_task(cfg=cfg) 19 | assert task is not None, "Task {} not properly registered.".format(task_name) 20 | 21 | return task 22 | 23 | 24 | __all__ = [ 25 | "BaseTask", 26 | "ImageTextPretrainTask", 27 | "AOKVQATask", 28 | "VQATask", 29 | "MultiChoiceAOKVQATask", 30 | ] 31 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/tasks/image_text_pretrain.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from minigpt4.common.registry import registry 9 | from minigpt4.tasks.base_task import BaseTask 10 | 11 | 12 | @registry.register_task("image_text_pretrain") 13 | class ImageTextPretrainTask(BaseTask): 14 | def __init__(self): 15 | super().__init__() 16 | 17 | def evaluation(self, model, data_loader, cuda_enabled=True): 18 | pass 19 | -------------------------------------------------------------------------------- /assets/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/assets/README.md -------------------------------------------------------------------------------- /assets/intro.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/assets/intro.png -------------------------------------------------------------------------------- /assets/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/assets/pipeline.png --------------------------------------------------------------------------------