├── .gitignore
├── LICENSE
├── Lavis
    ├── evaluate.py
    ├── lavis
    │   ├── __init__.py
    │   ├── common
    │   │   ├── config.py
    │   │   ├── dist_utils.py
    │   │   ├── gradcam.py
    │   │   ├── logger.py
    │   │   ├── optims.py
    │   │   ├── registry.py
    │   │   ├── utils.py
    │   │   └── vqa_tools
    │   │   │   ├── __init__.py
    │   │   │   ├── vqa.py
    │   │   │   └── vqa_eval.py
    │   ├── configs
    │   │   ├── datasets
    │   │   │   ├── aokvqa
    │   │   │   │   ├── defaults.yaml
    │   │   │   │   └── eval_aokvqa.yaml
    │   │   │   ├── avsd
    │   │   │   │   └── defaults_dial.yaml
    │   │   │   ├── coco
    │   │   │   │   ├── defaults_cap.yaml
    │   │   │   │   ├── defaults_ret.yaml
    │   │   │   │   ├── defaults_vqa.yaml
    │   │   │   │   └── eval_vqa.yaml
    │   │   │   ├── conceptual_caption
    │   │   │   │   ├── defaults_12m.yaml
    │   │   │   │   └── defaults_3m.yaml
    │   │   │   ├── didemo
    │   │   │   │   └── defaults_ret.yaml
    │   │   │   ├── flickr30k
    │   │   │   │   └── defaults.yaml
    │   │   │   ├── gqa
    │   │   │   │   ├── balanced_testdev.yaml
    │   │   │   │   ├── balanced_val.yaml
    │   │   │   │   └── defaults.yaml
    │   │   │   ├── imagenet
    │   │   │   │   └── defaults.yaml
    │   │   │   ├── laion
    │   │   │   │   └── defaults_2B_multi.yaml
    │   │   │   ├── msrvtt
    │   │   │   │   ├── defaults_cap.yaml
    │   │   │   │   ├── defaults_qa.yaml
    │   │   │   │   └── defaults_ret.yaml
    │   │   │   ├── msvd
    │   │   │   │   ├── defaults_cap.yaml
    │   │   │   │   └── defaults_qa.yaml
    │   │   │   ├── nlvr
    │   │   │   │   └── defaults.yaml
    │   │   │   ├── nocaps
    │   │   │   │   └── defaults.yaml
    │   │   │   ├── okvqa
    │   │   │   │   └── defaults.yaml
    │   │   │   ├── sbu_caption
    │   │   │   │   └── defaults.yaml
    │   │   │   ├── snli_ve
    │   │   │   │   └── defaults.yaml
    │   │   │   ├── vatex
    │   │   │   │   └── defaults_cap.yaml
    │   │   │   └── vg
    │   │   │   │   ├── defaults_caption.yaml
    │   │   │   │   └── defaults_vqa.yaml
    │   │   ├── default.yaml
    │   │   └── models
    │   │   │   ├── albef_classification_ve.yaml
    │   │   │   ├── albef_feature_extractor.yaml
    │   │   │   ├── albef_nlvr.yaml
    │   │   │   ├── albef_pretrain_base.yaml
    │   │   │   ├── albef_retrieval_coco.yaml
    │   │   │   ├── albef_retrieval_flickr.yaml
    │   │   │   ├── albef_vqav2.yaml
    │   │   │   ├── alpro_qa_msrvtt.yaml
    │   │   │   ├── alpro_qa_msvd.yaml
    │   │   │   ├── alpro_retrieval_didemo.yaml
    │   │   │   ├── alpro_retrieval_msrvtt.yaml
    │   │   │   ├── bert_config.json
    │   │   │   ├── bert_config_alpro.json
    │   │   │   ├── blip2
    │   │   │   ├── blip2_caption_flant5xl.yaml
    │   │   │   ├── blip2_caption_opt2.7b.yaml
    │   │   │   ├── blip2_caption_opt6.7b.yaml
    │   │   │   ├── blip2_coco.yaml
    │   │   │   ├── blip2_pretrain.yaml
    │   │   │   ├── blip2_pretrain_flant5xl.yaml
    │   │   │   ├── blip2_pretrain_flant5xl_vitL.yaml
    │   │   │   ├── blip2_pretrain_flant5xxl.yaml
    │   │   │   ├── blip2_pretrain_opt2.7b.yaml
    │   │   │   ├── blip2_pretrain_opt6.7b.yaml
    │   │   │   └── blip2_pretrain_vitL.yaml
    │   │   │   ├── blip_caption_base_coco.yaml
    │   │   │   ├── blip_caption_large_coco.yaml
    │   │   │   ├── blip_classification_base.yaml
    │   │   │   ├── blip_feature_extractor_base.yaml
    │   │   │   ├── blip_itm_base.yaml
    │   │   │   ├── blip_itm_large.yaml
    │   │   │   ├── blip_nlvr.yaml
    │   │   │   ├── blip_pretrain_base.yaml
    │   │   │   ├── blip_pretrain_large.yaml
    │   │   │   ├── blip_retrieval_coco.yaml
    │   │   │   ├── blip_retrieval_flickr.yaml
    │   │   │   ├── blip_vqa_aokvqa.yaml
    │   │   │   ├── blip_vqa_okvqa.yaml
    │   │   │   ├── blip_vqav2.yaml
    │   │   │   ├── clip
    │   │   │   ├── RN101-quickgelu.json
    │   │   │   ├── RN101.json
    │   │   │   ├── RN50-quickgelu.json
    │   │   │   ├── RN50.json
    │   │   │   ├── RN50x16.json
    │   │   │   ├── RN50x4.json
    │   │   │   ├── ViT-B-16-plus-240.json
    │   │   │   ├── ViT-B-16-plus.json
    │   │   │   ├── ViT-B-16.json
    │   │   │   ├── ViT-B-32-plus-256.json
    │   │   │   ├── ViT-B-32-quickgelu.json
    │   │   │   ├── ViT-B-32.json
    │   │   │   ├── ViT-H-14.json
    │   │   │   ├── ViT-H-16.json
    │   │   │   ├── ViT-L-14-280.json
    │   │   │   ├── ViT-L-14-336.json
    │   │   │   ├── ViT-L-14.json
    │   │   │   ├── ViT-L-16-320.json
    │   │   │   ├── ViT-L-16.json
    │   │   │   ├── ViT-g-14.json
    │   │   │   ├── timm-efficientnetv2_rw_s.json
    │   │   │   ├── timm-resnet50d.json
    │   │   │   ├── timm-resnetaa50d.json
    │   │   │   ├── timm-resnetblur50.json
    │   │   │   ├── timm-swin_base_patch4_window7_224.json
    │   │   │   ├── timm-vit_base_patch16_224.json
    │   │   │   ├── timm-vit_base_patch32_224.json
    │   │   │   └── timm-vit_small_patch16_224.json
    │   │   │   ├── clip_resnet50.yaml
    │   │   │   ├── clip_vit_base16.yaml
    │   │   │   ├── clip_vit_base32.yaml
    │   │   │   ├── clip_vit_large14.yaml
    │   │   │   ├── clip_vit_large14_336.yaml
    │   │   │   ├── gpt_dialogue_base.yaml
    │   │   │   ├── img2prompt-vqa
    │   │   │   └── img2prompt_vqa_base.yaml
    │   │   │   ├── med_config.json
    │   │   │   ├── med_config_albef.json
    │   │   │   ├── med_large_config.json
    │   │   │   └── pnp-vqa
    │   │   │   ├── pnp_vqa_3b.yaml
    │   │   │   ├── pnp_vqa_base.yaml
    │   │   │   ├── pnp_vqa_large.yaml
    │   │   │   ├── unifiedqav2_3b_config.json
    │   │   │   ├── unifiedqav2_base_config.json
    │   │   │   └── unifiedqav2_large_config.json
    │   ├── datasets
    │   │   ├── builders
    │   │   │   ├── __init__.py
    │   │   │   ├── base_dataset_builder.py
    │   │   │   ├── caption_builder.py
    │   │   │   ├── classification_builder.py
    │   │   │   ├── dialogue_builder.py
    │   │   │   ├── image_text_pair_builder.py
    │   │   │   ├── imagefolder_builder.py
    │   │   │   ├── retrieval_builder.py
    │   │   │   ├── video_qa_builder.py
    │   │   │   └── vqa_builder.py
    │   │   ├── data_utils.py
    │   │   ├── datasets
    │   │   │   ├── aok_vqa_datasets.py
    │   │   │   ├── avsd_dialogue_datasets.py
    │   │   │   ├── base_dataset.py
    │   │   │   ├── caption_datasets.py
    │   │   │   ├── coco_caption_datasets.py
    │   │   │   ├── coco_vqa_datasets.py
    │   │   │   ├── dataloader_utils.py
    │   │   │   ├── dialogue_datasets.py
    │   │   │   ├── gqa_datasets.py
    │   │   │   ├── image_text_pair_datasets.py
    │   │   │   ├── imagefolder_dataset.py
    │   │   │   ├── laion_dataset.py
    │   │   │   ├── multimodal_classification_datasets.py
    │   │   │   ├── nlvr_datasets.py
    │   │   │   ├── retrieval_datasets.py
    │   │   │   ├── snli_ve_datasets.py
    │   │   │   ├── vg_vqa_datasets.py
    │   │   │   ├── video_caption_datasets.py
    │   │   │   ├── video_vqa_datasets.py
    │   │   │   └── vqa_datasets.py
    │   │   └── download_scripts
    │   │   │   ├── DownloadConceptualCaptions
    │   │   │   ├── LICENSE
    │   │   │   ├── README.md
    │   │   │   ├── create_annotation_12m.ipynb
    │   │   │   ├── create_annotation_3m.ipynb
    │   │   │   ├── download_data_cc12m.py
    │   │   │   └── download_data_cc3m.py
    │   │   │   ├── download_coco.py
    │   │   │   ├── download_didemo.py
    │   │   │   ├── download_flickr.py
    │   │   │   ├── download_gqa.py
    │   │   │   ├── download_msrvtt.py
    │   │   │   ├── download_msvd.py
    │   │   │   ├── download_nocaps.py
    │   │   │   ├── download_sbu.py
    │   │   │   └── download_vg.py
    │   ├── models
    │   │   ├── __init__.py
    │   │   ├── albef_models
    │   │   │   ├── __init__.py
    │   │   │   ├── albef_classification.py
    │   │   │   ├── albef_feature_extractor.py
    │   │   │   ├── albef_nlvr.py
    │   │   │   ├── albef_outputs.py
    │   │   │   ├── albef_pretrain.py
    │   │   │   ├── albef_retrieval.py
    │   │   │   └── albef_vqa.py
    │   │   ├── alpro_models
    │   │   │   ├── __init__.py
    │   │   │   ├── alpro_outputs.py
    │   │   │   ├── alpro_qa.py
    │   │   │   └── alpro_retrieval.py
    │   │   ├── base_model.py
    │   │   ├── blip2_models
    │   │   │   ├── Qformer.py
    │   │   │   ├── __init__.py
    │   │   │   ├── blip2.py
    │   │   │   ├── blip2_image_text_matching.py
    │   │   │   ├── blip2_opt.py
    │   │   │   ├── blip2_qformer.py
    │   │   │   ├── blip2_t5.py
    │   │   │   ├── blip2_t5_par.py
    │   │   │   ├── modeling_opt.py
    │   │   │   └── modeling_t5.py
    │   │   ├── blip_models
    │   │   │   ├── __init__.py
    │   │   │   ├── blip.py
    │   │   │   ├── blip_caption.py
    │   │   │   ├── blip_classification.py
    │   │   │   ├── blip_feature_extractor.py
    │   │   │   ├── blip_image_text_matching.py
    │   │   │   ├── blip_nlvr.py
    │   │   │   ├── blip_outputs.py
    │   │   │   ├── blip_pretrain.py
    │   │   │   ├── blip_retrieval.py
    │   │   │   ├── blip_vqa.py
    │   │   │   └── nlvr_encoder.py
    │   │   ├── clip_models
    │   │   │   ├── __init__.py
    │   │   │   ├── bpe_simple_vocab_16e6.txt.gz
    │   │   │   ├── clip_outputs.py
    │   │   │   ├── loss.py
    │   │   │   ├── model.py
    │   │   │   ├── pics
    │   │   │   │   └── CLIP.png
    │   │   │   ├── pretrained.py
    │   │   │   ├── timm_model.py
    │   │   │   ├── tokenizer.py
    │   │   │   ├── transform.py
    │   │   │   └── utils.py
    │   │   ├── clip_vit.py
    │   │   ├── eva_vit.py
    │   │   ├── gpt_models
    │   │   │   └── gpt_dialogue.py
    │   │   ├── img2prompt_models
    │   │   │   ├── __init__.py
    │   │   │   └── img2prompt_vqa.py
    │   │   ├── med.py
    │   │   ├── pnp_vqa_models
    │   │   │   ├── __init__.py
    │   │   │   ├── pnp_unifiedqav2_fid.py
    │   │   │   └── pnp_vqa.py
    │   │   ├── timesformer
    │   │   │   ├── __init__.py
    │   │   │   ├── conv2d_same.py
    │   │   │   ├── features.py
    │   │   │   ├── helpers.py
    │   │   │   ├── linear.py
    │   │   │   ├── vit.py
    │   │   │   └── vit_utils.py
    │   │   └── vit.py
    │   ├── processors
    │   │   ├── __init__.py
    │   │   ├── alpro_processors.py
    │   │   ├── base_processor.py
    │   │   ├── blip_processors.py
    │   │   ├── clip_processors.py
    │   │   ├── functional_video.py
    │   │   ├── gpt_processors.py
    │   │   ├── randaugment.py
    │   │   └── transforms_video.py
    │   ├── projects
    │   │   └── blip2
    │   │   │   ├── direct_aokvqa_zeroshot_flant5xl_eval.yaml
    │   │   │   ├── mc_aokvqa_zeroshot_flant5xl_eval.yaml
    │   │   │   └── vqav2_zeroshot_flant5xl_eval.yaml
    │   ├── runners
    │   │   ├── __init__.py
    │   │   ├── runner_base.py
    │   │   └── runner_iter.py
    │   └── tasks
    │   │   ├── __init__.py
    │   │   ├── base_task.py
    │   │   ├── captioning.py
    │   │   ├── dialogue.py
    │   │   ├── image_text_pretrain.py
    │   │   ├── multimodal_classification.py
    │   │   ├── retrieval.py
    │   │   ├── vqa.py
    │   │   └── vqa_reading_comprehension.py
    ├── requirements.txt
    ├── setup.py
    └── train.py
├── MiniGPT-4
    ├── demo.py
    ├── environment.yml
    ├── evaluate.py
    ├── minigpt4
    │   ├── __init__.py
    │   ├── common
    │   │   ├── __init__.py
    │   │   ├── config.py
    │   │   ├── dist_utils.py
    │   │   ├── gradcam.py
    │   │   ├── logger.py
    │   │   ├── optims.py
    │   │   ├── registry.py
    │   │   ├── utils.py
    │   │   └── vqa_tools
    │   │   │   ├── __init__.py
    │   │   │   ├── vqa.py
    │   │   │   └── vqa_eval.py
    │   ├── configs
    │   │   ├── datasets
    │   │   │   ├── aokvqa
    │   │   │   │   ├── defaults.yaml
    │   │   │   │   └── eval_aokvqa.yaml
    │   │   │   ├── cc_sbu
    │   │   │   │   ├── align.yaml
    │   │   │   │   └── defaults.yaml
    │   │   │   ├── coco
    │   │   │   │   ├── defaults_cap.yaml
    │   │   │   │   ├── defaults_ret.yaml
    │   │   │   │   ├── defaults_vqa.yaml
    │   │   │   │   └── eval_vqa.yaml
    │   │   │   └── laion
    │   │   │   │   └── defaults.yaml
    │   │   ├── default.yaml
    │   │   └── models
    │   │   │   ├── minigpt4_llama2.yaml
    │   │   │   └── minigpt4_vicuna0.yaml
    │   ├── conversation
    │   │   ├── __init__.py
    │   │   └── conversation.py
    │   ├── datasets
    │   │   ├── __init__.py
    │   │   ├── builders
    │   │   │   ├── __init__.py
    │   │   │   ├── base_dataset_builder.py
    │   │   │   ├── image_text_pair_builder.py
    │   │   │   └── vqa_builder.py
    │   │   ├── data_utils.py
    │   │   └── datasets
    │   │   │   ├── __init__.py
    │   │   │   ├── aok_vqa_datasets.py
    │   │   │   ├── base_dataset.py
    │   │   │   ├── caption_datasets.py
    │   │   │   ├── cc_sbu_dataset.py
    │   │   │   ├── coco_vqa_datasets.py
    │   │   │   ├── dataloader_utils.py
    │   │   │   ├── laion_dataset.py
    │   │   │   ├── old_dataloader_utils.py
    │   │   │   └── vqa_datasets.py
    │   ├── models
    │   │   ├── Qformer.py
    │   │   ├── __init__.py
    │   │   ├── base_model.py
    │   │   ├── blip2.py
    │   │   ├── blip2_outputs.py
    │   │   ├── eva_vit.py
    │   │   ├── mini_gpt4.py
    │   │   └── modeling_llama.py
    │   ├── processors
    │   │   ├── __init__.py
    │   │   ├── base_processor.py
    │   │   ├── blip_processors.py
    │   │   └── randaugment.py
    │   ├── projects
    │   │   └── minigpt4
    │   │   │   ├── conv_direct_aokvqa.yaml
    │   │   │   ├── conv_mc_aokvqa.yaml
    │   │   │   └── conv_vqav2.yaml
    │   ├── runners
    │   │   ├── __init__.py
    │   │   └── runner_base.py
    │   └── tasks
    │   │   ├── __init__.py
    │   │   ├── base_task.py
    │   │   ├── image_text_pretrain.py
    │   │   └── vqa.py
    └── train.py
├── README.md
└── assets
    ├── README.md
    ├── intro.png
    └── pipeline.png

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Archiki Prasad

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/Lavis/lavis/__init__.py:
--------------------------------------------------------------------------------
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import os
import sys

from omegaconf import OmegaConf

from lavis.common.registry import registry

from lavis.datasets.builders import *
from lavis.models import *
from lavis.processors import *
from lavis.tasks import *


root_dir = os.path.dirname(os.path.abspath(__file__))
default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml"))

registry.register_path("library_root", root_dir)
repo_root = os.path.join(root_dir, "..")
registry.register_path("repo_root", repo_root)
cache_root = os.path.join(repo_root, default_cfg.env.cache_root)
registry.register_path("cache_root", cache_root)

registry.register("MAX_INT", sys.maxsize)
registry.register("SPLIT_NAMES", ["train", "val", "test"])
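Note that this `__init__.py` is a side-effecting import: it loads `configs/default.yaml` and populates the shared registry with paths and constants. A minimal sketch of reading those values back, assuming the registry exposes `get_path()`/`get()` as counterparts to `register_path()`/`register()` (as in upstream LAVIS):

```python
import lavis  # noqa: F401 -- importing lavis runs the registrations above
from lavis.common.registry import registry

cache_root = registry.get_path("cache_root")  # e.g. <repo_root>/.cache/lavis
splits = registry.get("SPLIT_NAMES")          # ["train", "val", "test"]
print(cache_root, splits)
```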
--------------------------------------------------------------------------------
/Lavis/lavis/common/gradcam.py:
--------------------------------------------------------------------------------
import numpy as np
from matplotlib import pyplot as plt
from scipy.ndimage import filters
from skimage import transform as skimage_transform


def getAttMap(img, attMap, blur=True, overlap=True):
    attMap -= attMap.min()
    if attMap.max() > 0:
        attMap /= attMap.max()
    attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant")
    if blur:
        attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2]))
        attMap -= attMap.min()
        attMap /= attMap.max()
    cmap = plt.get_cmap("jet")
    attMapV = cmap(attMap)
    attMapV = np.delete(attMapV, 3, 2)
    if overlap:
        attMap = (
            1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img
            + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV
        )
    return attMap
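A hypothetical usage sketch for `getAttMap` (not part of the repo): it assumes `img` is an HxWx3 float array in [0, 1] and `att` is any 2-D attention map (e.g. from GradCAM); the function rescales and resizes the map itself before overlaying it as a jet heatmap.

```python
import numpy as np
from matplotlib import pyplot as plt
from lavis.common.gradcam import getAttMap

img = np.random.rand(224, 224, 3)  # stand-in for a normalized RGB image
att = np.random.rand(24, 24)       # stand-in for a low-res attention map

overlay = getAttMap(img, att, blur=True, overlap=True)
plt.imshow(overlay)
plt.axis("off")
plt.show()
```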
--------------------------------------------------------------------------------
/Lavis/lavis/common/vqa_tools/__init__.py:
--------------------------------------------------------------------------------
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

__author__ = "aagrawal"

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/aokvqa/defaults.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  aok_vqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json
          storage:
            - aokvqa/annotations/aokvqa_v1p0_train.json
        val:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/specialized_vocab_train.json
          storage:
            - aokvqa/annotations/aokvqa_v1p0_val.json
            - aokvqa/annotations/specialized_vocab_train_lavis.json
            # - aokvqa/annotations/large_vocab_train_lavis.json
        test:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/specialized_vocab_train.json
          storage:
            - aokvqa/annotations/aokvqa_v1p0_test.json
            - aokvqa/annotations/specialized_vocab_train_lavis.json
      images:
        storage: coco/images/

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/aokvqa/eval_aokvqa.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  aok_vqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        val:
          url:
            - aokvqa/annotations/aokvqa_v1p0_val.json
            - aokvqa/annotations/specialized_vocab_train_lavis.json
          storage:
            - aokvqa/annotations/aokvqa_v1p0_val.json
            - aokvqa/annotations/specialized_vocab_train_lavis.json
      images:
        storage: coco/images/
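The dataset YAMLs in this tree are plain OmegaConf files: `url` names where each annotation file is downloaded from, and `storage` names where it lives under the registered cache root. A minimal sketch of inspecting one directly (the path assumes you run from the repo root):

```python
from omegaconf import OmegaConf

cfg = OmegaConf.load("Lavis/lavis/configs/datasets/aokvqa/defaults.yaml")
info = cfg.datasets.aok_vqa.build_info

print(cfg.datasets.aok_vqa.data_type)    # "images"
print(info.annotations.train.url[0])     # download URL for the train split
print(info.annotations.train.storage[0]) # path relative to the cache root
```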
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/avsd/defaults_dial.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  avsd_dialogue: # name of the dataset builder
    dataset_card: dataset_card/avsd_dialogue.md
    data_type: features #extracted features of videos (I3D, VGGish) # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_train.json
          storage: avsd/annotations/train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_val.json
          storage: avsd/annotations/val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_test.json
          storage: avsd/annotations/test.json
      features:
        storage: avsd/features/

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/coco/defaults_cap.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  coco_caption: # name of the dataset builder
    dataset_card: dataset_card/coco_caption.md
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
          md5: aa31ac474cf6250ebb81d18348a07ed8
          storage: coco/annotations/coco_karpathy_train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
          md5: b273847456ef5580e33713b1f7de52a0
          storage: coco/annotations/coco_karpathy_val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
          md5: 3ff34b0ef2db02d01c37399f6a2a6cd1
          storage: coco/annotations/coco_karpathy_test.json
      images:
        storage: coco/images/
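The `md5` fields above record checksums for the downloaded Karpathy-split annotation files. An illustrative helper for verifying one against the YAML (not the repo's own verification code, which may differ):

```python
import hashlib

def md5_matches(path, expected_md5):
    # Hash the file in 1 MiB chunks and compare with the md5 from the YAML.
    digest = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_md5

# e.g. md5_matches(".cache/lavis/coco/annotations/coco_karpathy_train.json",
#                  "aa31ac474cf6250ebb81d18348a07ed8")
```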
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/coco/defaults_ret.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  coco_retrieval:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
          md5: aa31ac474cf6250ebb81d18348a07ed8
          storage: coco/annotations/coco_karpathy_train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
          md5: b273847456ef5580e33713b1f7de52a0
          storage: coco/annotations/coco_karpathy_val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
          md5: 3ff34b0ef2db02d01c37399f6a2a6cd1
          storage: coco/annotations/coco_karpathy_test.json
      images:
        storage: coco/images/

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/coco/eval_vqa.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  coco_vqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        val:
          url:
            - coco/annotations/vqa_val_eval.json
            - coco/annotations/answer_list.json
            - coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json
            - coco/annotations/v2_mscoco_val2014_annotations.json
          storage:
            - coco/annotations/vqa_val_eval.json
            - coco/annotations/answer_list.json
            - coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json
            - coco/annotations/v2_mscoco_val2014_annotations.json
      images:
        storage: coco/images/

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/conceptual_caption/defaults_12m.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  conceptual_caption_12m:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - /export/home/workspace/datasets/cc12m.json
          storage:
            - conceptual_caption/annotations/cc12m.json
      images:
        storage: conceptual_caption/images_12m
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/conceptual_caption/defaults_3m.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  conceptual_caption_3m:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - /export/home/workspace/datasets/cc3m.json
          storage:
            - conceptual_caption/annotations/cc3m.json
      images:
        storage: conceptual_caption/images

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/didemo/defaults_ret.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  didemo_retrieval: # name of the dataset builder
    # data_dir: ${env.data_dir}/datasets
    data_type: videos # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_train.json
          storage: didemo/annotations/retrieval_train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_val.json
          storage: didemo/annotations/retrieval_val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_test.json
          storage: didemo/annotations/retrieval_test.json
      videos:
        storage: didemo/videos
        # storage: /export/share/dongxuli/data/didemo_retrieval/videos

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/flickr30k/defaults.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  flickr30k:
    # data_dir: ${env.data_dir}/datasets
    data_type: images

    build_info:
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_train.json
          storage: flickr30k/annotations/train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_val.json
          storage: flickr30k/annotations/val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_test.json
          storage: flickr30k/annotations/test.json
      images:
        storage: flickr30k/images
        # storage: /export/share/datasets/vision/flickr30k
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/gqa/balanced_testdev.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  gqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
          storage:
            - gqa/annotations/train_balanced_questions.json
        val:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/testdev_balanced_questions.json
          storage:
            - gqa/annotations/testdev_balanced_questions.json
        test:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json
          storage:
            - gqa/annotations/test_balanced_questions.json
      images:
        storage: gqa/images/

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/gqa/balanced_val.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  gqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
          storage:
            - gqa/annotations/train_balanced_questions.json
        val:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/val_balanced_questions.json
          storage:
            - gqa/annotations/val_balanced_questions.json
        test:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json
          storage:
            - gqa/annotations/test_balanced_questions.json
      images:
        storage: gqa/images/
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/gqa/defaults.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  gqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - /export/share/datasets/vision/GQA/questions1.2/train_all_questions/train_all_questions_0.json
            - /export/share/datasets/vision/GQA/questions1.2/val_all_questions.json
          storage:
            - gqa/annotations/train_all_questions_0.json
            - gqa/annotations/val_all_questions.json
        val:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/large_vocab_train_lavis.json
          storage:
            - aokvqa/annotations/aokvqa_v1p0_val.json
            - aokvqa/annotations/large_vocab_train_lavis.json
        test:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/large_vocab_train_lavis.json
          storage:
            - aokvqa/annotations/aokvqa_v1p0_test.json
            - aokvqa/annotations/large_vocab_train_lavis.json
      images:
        storage: gqa/images/

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/imagenet/defaults.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  imagenet:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      splits: ["val"]
      images:
        storage: /export/share/datasets/vision/imagenet

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/laion/defaults_2B_multi.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  laion2B_multi:

    data_type: images

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar
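The `laion2B_multi` storage entry above is a webdataset-style brace pattern naming 1744 tar shards rather than a single file. A minimal sketch (not from the repo) of expanding it by hand:

```python
# {00000..01743} is an inclusive range, so there are 1744 shards in total.
pattern_dir = "/export/laion/laion2B-multi/part-00000"
shards = [f"{pattern_dir}/{i:05d}.tar" for i in range(0, 1744)]

print(len(shards))  # 1744
print(shards[0])    # /export/laion/laion2B-multi/part-00000/00000.tar
print(shards[-1])   # /export/laion/laion2B-multi/part-00000/01743.tar
```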
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/msrvtt/defaults_cap.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  msrvtt_cap: # name of the dataset builder
    # data_dir: ${env.data_dir}/datasets
    data_type: videos # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_train.json
          storage: msrvtt/annotations/cap_train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_val.json
          storage: msrvtt/annotations/cap_val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_test.json
          storage: msrvtt/annotations/cap_test.json
      videos:
        storage: msrvtt/videos

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/msrvtt/defaults_qa.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  msrvtt_qa: # name of the dataset builder
    # data_dir: ${env.data_dir}/datasets
    data_type: videos # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json
          storage: msrvtt/annotations/qa_train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json
          storage: msrvtt/annotations/qa_val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json
          storage: msrvtt/annotations/qa_test.json
        ans2label:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json
          storage: msrvtt/annotations/qa_ans2label.json
      videos:
        storage: msrvtt/videos
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/msrvtt/defaults_ret.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  msrvtt_retrieval: # name of the dataset builder
    # data_dir: ${env.data_dir}/datasets
    data_type: videos # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_train.json
          storage: msrvtt/annotations/retrieval_train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_val.json
          storage: msrvtt/annotations/retrieval_val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_test.json
          storage: msrvtt/annotations/retrieval_test.json
      videos:
        storage: msrvtt/videos

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/msvd/defaults_cap.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  msvd_cap: # name of the dataset builder
    # data_dir: ${env.data_dir}/datasets
    data_type: videos # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_train.json
          storage: msvd/annotations/cap_train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_val.json
          storage: msvd/annotations/cap_val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_test.json
          storage: msvd/annotations/cap_test.json
      videos:
        storage: msvd/videos
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/msvd/defaults_qa.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  msvd_qa: # name of the dataset builder
    # data_dir: ${env.data_dir}/datasets
    data_type: videos # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json
          storage: msvd/annotations/qa_train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json
          storage: msvd/annotations/qa_val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json
          storage: msvd/annotations/qa_test.json
        ans2label:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json
          storage: msvd/annotations/qa_ans2label.json
      videos:
        storage: msvd/videos

    instance_id_key: question_id

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/nlvr/defaults.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  nlvr:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_train.json
          storage: nlvr/annotations/train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json
          storage: nlvr/annotations/dev.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json
          storage: nlvr/annotations/test.json
      images:
        storage: /export/share/datasets/vision/NLVR2/
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/nocaps/defaults.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  nocaps: # name of the dataset builder
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_val.json
          storage: nocaps/annotations/nocaps_val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_test.json
          storage: nocaps/annotations/nocaps_test.json
      images:
        storage: nocaps/images
        # storage: /export/share/datasets/vision/nocaps/

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/okvqa/defaults.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  ok_vqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            # TODO make this order insensitive
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
            # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
            # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
          storage:
            - okvqa/annotations/okvqa_train.json
            # - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json
            # - okvqa/annotations/mscoco_train2014_annotations.json
        test:
          url:
            # TODO make this order insensitive
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
          storage:
            - okvqa/annotations/vqa_val_eval.json
            - okvqa/annotations/answer_list.json
            - okvqa/annotations/OpenEnded_mscoco_val2014_questions.json
            - okvqa/annotations/mscoco_val2014_annotations.json
      images:
        storage: coco/images/
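The "TODO make this order insensitive" comments above exist because `url` and `storage` are paired positionally: the i-th URL is saved to the i-th storage path. An illustrative sketch of that pairing (hypothetical helper, not the repo's actual builder code):

```python
import os
import urllib.request

def fetch_annotations(urls, storage_paths, cache_root):
    # urls[i] is downloaded to cache_root/storage_paths[i]; reordering either
    # list without the other would silently mismatch files.
    for url, rel_path in zip(urls, storage_paths):
        dst = os.path.join(cache_root, rel_path)
        os.makedirs(os.path.dirname(dst), exist_ok=True)
        if not os.path.exists(dst) and url.startswith("http"):
            urllib.request.urlretrieve(url, dst)
```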
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/sbu_caption/defaults.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  sbu_caption:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/sbu/sbu.json
            # - /export/share/dongxuli/data/lavis/sbu/annotation/sbu.json
          storage:
            - sbu_captions/annotations/sbu.json
      images:
        storage: sbu_captions/images
        # storage: /export/share/datasets/vision_language/sbu_resize

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/snli_ve/defaults.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  snli_ve:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: /export/share/dongxuli/data/lavis/snli/annotation/ve_train.json
          storage: snli/annotations/ve_train.json
        val:
          url: /export/share/dongxuli/data/lavis/snli/annotation/ve_dev.json
          storage: snli/annotations/ve_dev.json
        test:
          url: /export/share/dongxuli/data/lavis/snli/annotation/ve_test.json
          storage: snli/annotations/ve_test.json
      images:
        storage: flickr30k/images/flickr30k-images
        # storage: /export/share/datasets/vision/flickr30k/flickr30k-images

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/vatex/defaults_cap.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  msvd_cap: # name of the dataset builder
    # data_dir: ${env.data_dir}/datasets
    data_type: videos # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json
          storage: vatex/annotations/cap_train.json
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json
          storage: vatex/annotations/cap_val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json
          storage: vatex/annotations/cap_test.json
      videos:
        storage: /export/share/dongxuli/data/vatex
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/vg/defaults_caption.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  vg_caption:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_caption.json
          storage: vg/annotations/vg_caption.json
      images:
        storage: vg/images/

--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/vg/defaults_vqa.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  vg_vqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_qa.json
          storage: vg/annotations/vg_qa.json
      images:
        storage: vg/images/

--------------------------------------------------------------------------------
/Lavis/lavis/configs/default.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

env:
  # For default users
  # cache_root: "cache"
  # For internal use with persistent storage
  cache_root: ".cache/lavis"
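The model YAMLs that follow back LAVIS's registered (name, model_type) pairs; their `preprocess` sections describe the processors returned alongside the model. A hedged loading sketch that mirrors upstream LAVIS usage (not verified against this fork's modifications):

```python
import torch
from lavis.models import load_model_and_preprocess

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# "blip2_t5" / "pretrain_flant5xl" is the upstream pairing backed by
# configs/models/blip2/blip2_pretrain_flant5xl.yaml in the tree above.
model, vis_processors, txt_processors = load_model_and_preprocess(
    name="blip2_t5",
    model_type="pretrain_flant5xl",
    is_eval=True,
    device=device,
)
```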
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/albef_classification_ve.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: albef_classification
  load_finetuned: True

  finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_snli_ve_lavis.pt"
  pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"

  num_classes: 3

  use_distill: True
  momentum: 0.995
  alpha: 0.4

  # vit encoder
  vit_type: "base"
  vit_grad_ckpt: False
  vit_ckpt_layer: 0
  vit_layer_norm_epsilon: 1e-6

  image_size: 384

  # bert config
  med_config_path: "configs/models/med_config_albef.json"

preprocess:
  vis_processor:
    train:
      name: "blip_image_train"
    eval:
      name: "blip_image_eval"
  text_processor:
    train:
      name: "blip_caption"
    eval:
      name: "blip_caption"

--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/albef_feature_extractor.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: albef_pretrain
  pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"

  # vit encoder
  vit_type: "base"
  image_size: 224
  vit_ckpt_layer: 0
  vit_drop_path_rate: 0
  vit_layer_norm_epsilon: 1e-6
  vit_grad_ckpt: False

  # bert config
  med_config_path: "configs/models/med_config_albef.json"

  embed_dim: 256

preprocess:
  vis_processor:
    eval:
      name: "blip_image_eval"
      image_size: 224
  text_processor:
    eval:
      name: "blip_caption"
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/albef_nlvr.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: albef_nlvr
  load_finetuned: True

  pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/pretrain_model_nlvr.pth"
  finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_nlvr_lavis.pt"

  num_classes: 2

  use_distill: True
  momentum: 0.995
  alpha: 0.4

  # vit encoder
  vit_type: "base"
  vit_grad_ckpt: False
  vit_ckpt_layer: 0
  vit_layer_norm_epsilon: 1e-6

  image_size: 384

  # bert config
  med_config_path: "configs/models/med_config_albef.json"

preprocess:
  vis_processor:
    train:
      name: "blip_image_train"
      image_size: 384
    eval:
      name: "blip_image_eval"
      image_size: 384
  text_processor:
    train:
      name: "blip_caption"
    eval:
      name: "blip_caption"

--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/albef_pretrain_base.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: albef_pretrain

  load_pretrained: True
  pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"

  # vit encoder
  vit_type: "base"
  image_size: 224
  vit_ckpt_layer: 0
  vit_drop_path_rate: 0
  vit_layer_norm_epsilon: 1e-6
  vit_grad_ckpt: False

  # bert config
  med_config_path: "configs/models/med_config_albef.json"
  mlm_mask_prob: 0.15

  embed_dim: 256
  momentum: 0.995
  alpha: 0.4
  temp: 0.07

  max_txt_len: 30

preprocess:
  vis_processor:
    train:
      name: "blip_image_train"
      image_size: 256
  text_processor:
    train:
      name: "blip_caption"
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/albef_retrieval_coco.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: albef_retrieval
  load_finetuned: True

  pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
  finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_coco_retrieval_lavis.pt"

  queue_size: 65536

  # vit encoder
  vit_type: "base"
  image_size: 384
  vit_ckpt_layer: 0
  vit_drop_path_rate: 0
  vit_layer_norm_epsilon: 1e-6
  vit_grad_ckpt: False

  # bert config
  med_config_path: "configs/models/med_config_albef.json"

  embed_dim: 256
  momentum: 0.995
  alpha: 0.4
  temp: 0.07
  use_distill: True

  max_txt_len: 30

preprocess:
  vis_processor:
    train:
      name: "blip_image_train"
      image_size: 384
    eval:
      name: "blip_image_eval"
      image_size: 384
  text_processor:
    train:
      name: "blip_caption"
    eval:
      name: "blip_caption"

--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/albef_retrieval_flickr.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: albef_retrieval
  load_finetuned: True

  pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
  finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_flickr_retrieval_lavis.pt

  queue_size: 65536

  # vit encoder
  vit_type: "base"
  image_size: 384
  vit_ckpt_layer: 0
  vit_drop_path_rate: 0
  vit_layer_norm_epsilon: 1e-6
  vit_grad_ckpt: False

  # bert config
  med_config_path: "configs/models/med_config_albef.json"

  embed_dim: 256
  momentum: 0.995
  alpha: 0.4
  temp: 0.07
  use_distill: True

  max_txt_len: 30

preprocess:
  vis_processor:
    train:
      name: "blip_image_train"
      image_size: 384
    eval:
      name: "blip_image_eval"
      image_size: 384
  text_processor:
    train:
      name: "blip_caption"
    eval:
      name: "blip_caption"
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_vqav2_lavis.pt" 12 | 13 | use_distill: True 14 | momentum: 0.995 15 | alpha: 0.4 16 | 17 | # vit encoder 18 | vit_type: "base" 19 | vit_grad_ckpt: False 20 | vit_ckpt_layer: 0 21 | vit_layer_norm_epsilon: 1e-6 22 | 23 | image_size: 384 24 | 25 | # bert config 26 | med_config_path: "configs/models/med_config_albef.json" 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 384 33 | eval: 34 | name: "blip_image_eval" 35 | image_size: 384 36 | text_processor: 37 | train: 38 | name: "blip_question" 39 | eval: 40 | name: "blip_question" 41 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/alpro_qa_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | num_classes: 1500 9 | 10 | load_finetuned: True 11 | 12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_qa.pth" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 14 | 15 | timesformer: 16 | n_frms: 16 17 | image_size: 224 18 | 19 | patch_size: 16 20 | attn_drop_rate: 0. 21 | drop_rate: 0. 22 | drop_path_rate: 0.1 23 | 24 | use_grad_ckpt: True 25 | ckpt_layer: 12 26 | 27 | # bert config 28 | med_config_path: "configs/models/bert_config_alpro.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "alpro_video_train" 34 | n_frms: 16 35 | image_size: 224 36 | eval: 37 | name: "alpro_video_eval" 38 | n_frms: 16 39 | image_size: 224 40 | text_processor: 41 | train: 42 | name: "blip_caption" 43 | eval: 44 | name: "blip_caption" 45 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/alpro_qa_msvd.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | num_classes: 2423 9 | 10 | load_finetuned: True 11 | 12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msvd_qa.pth" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 14 | 15 | timesformer: 16 | n_frms: 16 17 | image_size: 224 18 | 19 | patch_size: 16 20 | attn_drop_rate: 0. 21 | drop_rate: 0. 
22 | drop_path_rate: 0.1 23 | use_grad_ckpt: True 24 | ckpt_layer: 12 25 | 26 | # bert config 27 | med_config_path: "configs/models/bert_config_alpro.json" 28 | 29 | preprocess: 30 | vis_processor: 31 | train: 32 | name: "alpro_video_train" 33 | n_frms: 16 34 | image_size: 224 35 | eval: 36 | name: "alpro_video_eval" 37 | n_frms: 16 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/alpro_retrieval_didemo.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_didemo_retrieval.pt" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0. 21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | eval: 30 | name: "alpro_video_eval" 31 | n_frms: 8 32 | image_size: 224 33 | text_processor: 34 | eval: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/alpro_retrieval_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_retrieval.pt" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0.
21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "alpro_video_train" 31 | n_frms: 8 32 | image_size: 224 33 | eval: 34 | name: "alpro_video_eval" 35 | n_frms: 8 36 | image_size: 224 37 | text_processor: 38 | train: 39 | name: "blip_caption" 40 | eval: 41 | name: "blip_caption" 42 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/bert_config_alpro.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": true, 18 | "type_vocab_size": 2, 19 | "vocab_size": 30522, 20 | "encoder_width": 768, 21 | "add_cross_attention": false, 22 | "fusion_layer": 6 23 | } -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_flant5xl 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_flant5xl.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_opt2.7b 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt2.7b.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_opt6.7b 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt6.7b.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip2/blip2_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: coco 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_finetune_coco.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: True 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 364 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 364 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip2/blip2_pretrain.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 224 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 224 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl_vitL.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | vit_model: "clip_L" 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xxl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xxl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt2.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt6.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_vitL.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | vit_model: "clip_L" 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | 25 | preprocess: 26 | vis_processor: 27 | train: 28 | name: "blip_image_train" 29 | image_size: 224 30 | eval: 31 | name: "blip_image_eval" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_caption_base_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_caption_base.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | 18 | image_size: 384 19 | 20 | # bert config 21 | med_config_path: "configs/models/med_config.json" 22 | 23 | # generation configs 24 | prompt: "a picture of " 25 | 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | eval: 32 | name: "blip_image_eval" 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | prompt: "a picture of " 37 | eval: 38 | name: "blip_caption" 39 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_caption_large_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth" 12 | 13 | vit_type: "large" 14 | vit_grad_ckpt: True 15 | vit_ckpt_layer: 5 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | # generation configs 23 | prompt: "a picture of " 24 | 25 | 26 | preprocess: 27 | vis_processor: 28 | train: 29 | name: "blip_image_train" 30 | eval: 31 | name: "blip_image_eval" 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | prompt: "a picture of " 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_classification_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_classification 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | 10 | use_distill: True 11 | momentum: 0.995 12 | alpha: 0.4 13 | 14 | # vit encoder 15 | vit_type: "base" 16 | vit_grad_ckpt: False 17 | vit_ckpt_layer: 0 18 | 19 | image_size: 384 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_feature_extractor_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | vit_grad_ckpt: False 13 | vit_ckpt_layer: 0 14 | 15 | image_size: 224 16 | 17 | # bert config 18 | med_config_path: "configs/models/med_config.json" 19 | 20 | embed_dim: 256 21 | 22 | preprocess: 23 | vis_processor: 24 | eval: 25 | name: "blip_image_eval" 26 | image_size: 224 27 | text_processor: 28 | eval: 29 | name: "blip_caption" 30 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_itm_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_itm_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "large" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_nlvr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_nlvr 8 | model_type: nlvr 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_nlvr.pth" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 13 | 14 | num_classes: 2 15 | 16 | # vit encoder 17 | vit_type: "base" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | 22 | image_size: 384 23 | 24 | # bert config 25 | med_config_path: "configs/models/med_config.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 224 18 | alpha: 0.4 19 | 20 | # bert config 21 | med_config_path: "configs/models/bert_config.json" 22 | 23 | embed_dim: 256 24 | 25 | # generation configs 26 | prompt: "a picture of " 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_pretrain_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | # vit encoder 10 | vit_type: "large" 11 | vit_grad_ckpt: True 12 | vit_ckpt_layer: 5 13 | 14 | image_size: 224 15 | 16 | # bert config 17 | med_config_path: "configs/models/med_large_config.json" 18 | 19 | embed_dim: 256 20 | 21 | # generation configs 22 | prompt: "a picture of " 23 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_retrieval_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | vit_grad_ckpt: True 18 | vit_ckpt_layer: 4 19 | 20 | image_size: 384 21 | 22 | # bert config 23 | med_config_path: "configs/models/med_config.json" 24 | 25 | embed_dim: 256 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_retrieval_flickr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_flickr_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | alpha: 0.4 15 | 16 | negative_all_rank: False 17 | 18 | # vit encoder 19 | vit_type: "base" 20 | vit_grad_ckpt: True 21 | vit_ckpt_layer: 4 22 | 23 | image_size: 384 24 | 25 | # bert config 26 | med_config_path: "configs/models/med_config.json" 27 | 28 | embed_dim: 256 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 384 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 384 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_vqa_aokvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_aokvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_vqa_okvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_okvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/blip_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 
| "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-B-16-plus-240.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 240, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-B-16-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-B-32-plus-256.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-H-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-L-14-280.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 280, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-L-14.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-L-16-320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 320, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-L-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/ViT-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/timm-efficientnetv2_rw_s.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "efficientnetv2_rw_s", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 288 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/timm-resnet50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnet50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/timm-resnetaa50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetaa50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | 
"heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/timm-resnetblur50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetblur50", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/timm-swin_base_patch4_window7_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "swin_base_patch4_window7_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/timm-vit_base_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_base_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/timm-vit_base_patch32_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_base_patch32_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip/timm-vit_small_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_small_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip_resnet50.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: RN50 10 | 11 | pretrained: openai 12 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip_vit_base16.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-B-16 10 | 11 | pretrained: openai 12 | 13 | preprocess: 14 | vis_processor: 15 | eval: 16 | name: "clip_image_eval" 17 | image_size: 224 18 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip_vit_base32.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-B-32 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 224 53 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip_vit_large14.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 224 53 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/clip_vit_large14_336.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 336 53 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/gpt_dialogue_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: gpt_dialogue 8 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 10 | 11 | len_tokenizer: 50264 # 50257 tokens from gpt2 default tokenizer + additional special tokens 12 | 13 | len_video_ft: 4224 # i3d_rgb: 2048 i3d_flow: 2048 vggish: 128 14 | 15 | preprocess: 16 | vis_processor: 17 | train: 18 | name: "gpt_video_ft" 19 | eval: 20 | name: "gpt_video_ft" 21 | text_processor: 22 | train: 23 | name: "gpt_dialogue" 24 | eval: 25 | name: "gpt_dialogue" -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/img2prompt-vqa/img2prompt_vqa_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: img2prompt_vqa 8 | model_type: base 9 | 10 | image_question_matching_model: 11 | arch: blip_image_text_matching 12 | load_finetuned: True 13 | 14 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" 15 | 16 | # vit encoder 17 | vit_type: "large" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | 21 | image_size: 384 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_large_config.json" 25 | 26 | embed_dim: 256 27 | 28 | image_captioning_model: 29 | arch: blip_caption 30 | load_finetuned: True 31 | 32 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" 33 | 34 | vit_type: "large" 35 | vit_grad_ckpt: True 36 | vit_ckpt_layer: 5 37 | 38 | image_size: 384 39 | 40 | # bert config 41 | med_config_path: "configs/models/med_large_config.json" 42 | 43 | # generation configs 44 | prompt: "a picture of " 45 | 46 | question_generation_model: 47 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/projects/img2prompt/T5_large_QG.pth" 48 | 49 | 50 | 51 | preprocess: 52 | vis_processor: 53 | eval: 54 | name: "blip_image_eval" 55 | image_size: 384 56 | text_processor: 57 | eval: 58 | name: "blip_caption" 59 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/med_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/med_config_albef.json:
-------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true, 21 | "fusion_layer": 6 22 | } -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/med_large_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 1024, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/pnp-vqa/pnp_vqa_3b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: 3b 9 | 10 | image_question_matching_model: 11 | arch: blip_image_text_matching 12 | load_finetuned: True 13 | 14 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" 15 | 16 | # vit encoder 17 | vit_type: "large" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | 21 | image_size: 384 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_large_config.json" 25 | 26 | embed_dim: 256 27 | 28 | image_captioning_model: 29 | arch: blip_caption 30 | load_finetuned: True 31 | 32 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" 33 | 34 | vit_type: "large" 35 | vit_grad_ckpt: True 36 | vit_ckpt_layer: 5 37 | 38 | image_size: 384 39 | 40 | # bert config 41 | med_config_path: "configs/models/med_large_config.json" 42 | 43 | # generation configs 44 | prompt: "a picture of " 45 | 46 | question_answering_model: 47 | arch: pnp_unifiedqav2_fid 48 | 49 | pretrained: "allenai/unifiedqa-v2-t5-3b-1363200" 50 | 51 | t5_config_path: "configs/models/pnp-vqa/unifiedqav2_3b_config.json" 52 | 53 | preprocess: 54 | vis_processor: 55 | eval: 56 | name: "blip_image_eval" 57 | image_size: 384 58 | text_processor: 59 | eval: 60 | name: "blip_caption" 61 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/pnp-vqa/pnp_vqa_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
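# --- Editor's note (hedged) ---
# The three pnp_vqa_*.yaml configs share the matching and captioning models and
# differ only in the UnifiedQAv2 reader size. A rough usage sketch; the
# registered name "pnp_vqa" follows `arch` below, while the predict_answers
# keyword arguments (num_captions, num_patches) are assumptions about the
# Plug-and-Play VQA pipeline (match -> caption -> read) and should be checked
# against the model class:
#
#   from lavis.models import load_model_and_preprocess
#
#   model, vis_processors, txt_processors = load_model_and_preprocess(
#       name="pnp_vqa", model_type="base", is_eval=True
#   )
#   # raw_image: a PIL.Image loaded elsewhere
#   image = vis_processors["eval"](raw_image).unsqueeze(0)
#   question = txt_processors["eval"]("What is the cat doing?")
#   answers = model.predict_answers(
#       samples={"image": image, "text_input": [question]},
#       num_captions=50,  # captions generated per image (assumed default)
#       num_patches=20,   # patches kept by the matching model (assumed)
#   )
# --- end editor's note ---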
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: base 9 | 10 | image_question_matching_model: 11 | arch: blip_image_text_matching 12 | load_finetuned: True 13 | 14 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" 15 | 16 | # vit encoder 17 | vit_type: "large" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | 21 | image_size: 384 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_large_config.json" 25 | 26 | embed_dim: 256 27 | 28 | image_captioning_model: 29 | arch: blip_caption 30 | load_finetuned: True 31 | 32 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" 33 | 34 | vit_type: "large" 35 | vit_grad_ckpt: True 36 | vit_ckpt_layer: 5 37 | 38 | image_size: 384 39 | 40 | # bert config 41 | med_config_path: "configs/models/med_large_config.json" 42 | 43 | # generation configs 44 | prompt: "a picture of " 45 | question_answering_model: 46 | arch: pnp_unifiedqav2_fid 47 | 48 | pretrained: "allenai/unifiedqa-v2-t5-base-1363200" 49 | 50 | t5_config_path: "configs/models/pnp-vqa/unifiedqav2_base_config.json" 51 | 52 | preprocess: 53 | vis_processor: 54 | eval: 55 | name: "blip_image_eval" 56 | image_size: 384 57 | text_processor: 58 | eval: 59 | name: "blip_caption" 60 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/pnp-vqa/pnp_vqa_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: large 9 | 10 | image_question_matching_model: 11 | arch: blip_image_text_matching 12 | load_finetuned: True 13 | 14 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" 15 | 16 | # vit encoder 17 | vit_type: "large" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | 21 | image_size: 384 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_large_config.json" 25 | 26 | embed_dim: 256 27 | 28 | image_captioning_model: 29 | arch: blip_caption 30 | load_finetuned: True 31 | 32 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" 33 | 34 | vit_type: "large" 35 | vit_grad_ckpt: True 36 | vit_ckpt_layer: 5 37 | 38 | image_size: 384 39 | 40 | # bert config 41 | med_config_path: "configs/models/med_large_config.json" 42 | 43 | # generation configs 44 | prompt: "a picture of " 45 | 46 | question_answering_model: 47 | arch: pnp_unifiedqav2_fid 48 | 49 | pretrained: "allenai/unifiedqa-v2-t5-large-1363200" 50 | 51 | t5_config_path: "configs/models/pnp-vqa/unifiedqav2_large_config.json" 52 | 53 | preprocess: 54 | vis_processor: 55 | eval: 56 | name: "blip_image_eval" 57 | image_size: 384 58 | text_processor: 59 | eval: 60 | name: "blip_caption" 61 | -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/pnp-vqa/unifiedqav2_3b_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "T5ForConditionalGeneration" 4 | ], 5 | "d_ff": 16384, 6 | "d_kv": 128, 7 | "d_model": 1024, 8 | "decoder_start_token_id": 0, 9 | "dense_act_fn": "relu", 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 1, 12 | "feed_forward_proj": "relu", 13 | "gradient_checkpointing": false, 14 | "initializer_factor": 1.0, 15 | "is_encoder_decoder": true, 16 | "is_gated_act": false, 17 | "layer_norm_epsilon": 1e-06, 18 | "model_type": "t5", 19 | "n_positions": 512, 20 | "num_decoder_layers": 24, 21 | "num_heads": 32, 22 | "num_layers": 24, 23 | "output_past": true, 24 | "pad_token_id": 0, 25 | "relative_attention_max_distance": 128, 26 | "relative_attention_num_buckets": 32, 27 | "task_specific_params": { 28 | "summarization": { 29 | "early_stopping": true, 30 | "length_penalty": 2.0, 31 | "max_length": 200, 32 | "min_length": 30, 33 | "no_repeat_ngram_size": 3, 34 | "num_beams": 4, 35 | "prefix": "summarize: " 36 | }, 37 | "translation_en_to_de": { 38 | "early_stopping": true, 39 | "max_length": 300, 40 | "num_beams": 4, 41 | "prefix": "translate English to German: " 42 | }, 43 | "translation_en_to_fr": { 44 | "early_stopping": true, 45 | "max_length": 300, 46 | "num_beams": 4, 47 | "prefix": "translate English to French: " 48 | }, 49 | "translation_en_to_ro": { 50 | "early_stopping": true, 51 | "max_length": 300, 52 | "num_beams": 4, 53 | "prefix": "translate English to Romanian: " 54 | } 55 | }, 56 | "torch_dtype": "float32", 57 | "transformers_version": "4.21.3", 58 | "use_cache": true, 59 | "vocab_size": 32128 60 | } -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/pnp-vqa/unifiedqav2_base_config.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"architectures": [ 3 | "T5ForConditionalGeneration" 4 | ], 5 | "d_ff": 3072, 6 | "d_kv": 64, 7 | "d_model": 768, 8 | "decoder_start_token_id": 0, 9 | "dense_act_fn": "relu", 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 1, 12 | "feed_forward_proj": "relu", 13 | "gradient_checkpointing": false, 14 | "initializer_factor": 1.0, 15 | "is_encoder_decoder": true, 16 | "is_gated_act": false, 17 | "layer_norm_epsilon": 1e-06, 18 | "model_type": "t5", 19 | "n_positions": 512, 20 | "num_decoder_layers": 12, 21 | "num_heads": 12, 22 | "num_layers": 12, 23 | "output_past": true, 24 | "pad_token_id": 0, 25 | "relative_attention_max_distance": 128, 26 | "relative_attention_num_buckets": 32, 27 | "task_specific_params": { 28 | "summarization": { 29 | "early_stopping": true, 30 | "length_penalty": 2.0, 31 | "max_length": 200, 32 | "min_length": 30, 33 | "no_repeat_ngram_size": 3, 34 | "num_beams": 4, 35 | "prefix": "summarize: " 36 | }, 37 | "translation_en_to_de": { 38 | "early_stopping": true, 39 | "max_length": 300, 40 | "num_beams": 4, 41 | "prefix": "translate English to German: " 42 | }, 43 | "translation_en_to_fr": { 44 | "early_stopping": true, 45 | "max_length": 300, 46 | "num_beams": 4, 47 | "prefix": "translate English to French: " 48 | }, 49 | "translation_en_to_ro": { 50 | "early_stopping": true, 51 | "max_length": 300, 52 | "num_beams": 4, 53 | "prefix": "translate English to Romanian: " 54 | } 55 | }, 56 | "transformers_version": "4.21.3", 57 | "use_cache": true, 58 | "vocab_size": 32128 59 | } -------------------------------------------------------------------------------- /Lavis/lavis/configs/models/pnp-vqa/unifiedqav2_large_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "T5ForConditionalGeneration" 4 | ], 5 | "d_ff": 4096, 6 | "d_kv": 64, 7 | "d_model": 1024, 8 | "decoder_start_token_id": 0, 9 | "dense_act_fn": "relu", 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 1, 12 | "feed_forward_proj": "relu", 13 | "gradient_checkpointing": false, 14 | "initializer_factor": 1.0, 15 | "is_encoder_decoder": true, 16 | "is_gated_act": false, 17 | "layer_norm_epsilon": 1e-06, 18 | "model_type": "t5", 19 | "n_positions": 512, 20 | "num_decoder_layers": 24, 21 | "num_heads": 16, 22 | "num_layers": 24, 23 | "output_past": true, 24 | "pad_token_id": 0, 25 | "relative_attention_max_distance": 128, 26 | "relative_attention_num_buckets": 32, 27 | "task_specific_params": { 28 | "summarization": { 29 | "early_stopping": true, 30 | "length_penalty": 2.0, 31 | "max_length": 200, 32 | "min_length": 30, 33 | "no_repeat_ngram_size": 3, 34 | "num_beams": 4, 35 | "prefix": "summarize: " 36 | }, 37 | "translation_en_to_de": { 38 | "early_stopping": true, 39 | "max_length": 300, 40 | "num_beams": 4, 41 | "prefix": "translate English to German: " 42 | }, 43 | "translation_en_to_fr": { 44 | "early_stopping": true, 45 | "max_length": 300, 46 | "num_beams": 4, 47 | "prefix": "translate English to French: " 48 | }, 49 | "translation_en_to_ro": { 50 | "early_stopping": true, 51 | "max_length": 300, 52 | "num_beams": 4, 53 | "prefix": "translate English to Romanian: " 54 | } 55 | }, 56 | "transformers_version": "4.21.3", 57 | "use_cache": true, 58 | "vocab_size": 32128 59 | } -------------------------------------------------------------------------------- /Lavis/lavis/datasets/builders/caption_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 
3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder 9 | from lavis.datasets.datasets.coco_caption_datasets import ( 10 | COCOCapDataset, 11 | COCOCapEvalDataset, 12 | NoCapsEvalDataset, 13 | ) 14 | 15 | from lavis.common.registry import registry 16 | from lavis.datasets.datasets.video_caption_datasets import ( 17 | VideoCaptionDataset, 18 | VideoCaptionEvalDataset, 19 | ) 20 | 21 | 22 | @registry.register_builder("coco_caption") 23 | class COCOCapBuilder(BaseDatasetBuilder): 24 | train_dataset_cls = COCOCapDataset 25 | eval_dataset_cls = COCOCapEvalDataset 26 | 27 | DATASET_CONFIG_DICT = { 28 | "default": "configs/datasets/coco/defaults_cap.yaml", 29 | } 30 | 31 | 32 | @registry.register_builder("nocaps") 33 | class NoCapsBuilder(BaseDatasetBuilder): 34 | eval_dataset_cls = NoCapsEvalDataset 35 | 36 | DATASET_CONFIG_DICT = { 37 | "default": "configs/datasets/nocaps/defaults.yaml", 38 | } 39 | 40 | 41 | @registry.register_builder("msrvtt_caption") 42 | class MSRVTTCapBuilder(BaseDatasetBuilder): 43 | train_dataset_cls = VideoCaptionDataset 44 | eval_dataset_cls = VideoCaptionEvalDataset 45 | 46 | DATASET_CONFIG_DICT = { 47 | "default": "configs/datasets/msrvtt/defaults_cap.yaml", 48 | } 49 | 50 | 51 | @registry.register_builder("msvd_caption") 52 | class MSVDCapBuilder(BaseDatasetBuilder): 53 | train_dataset_cls = VideoCaptionDataset 54 | eval_dataset_cls = VideoCaptionEvalDataset 55 | 56 | DATASET_CONFIG_DICT = { 57 | "default": "configs/datasets/msvd/defaults_cap.yaml", 58 | } 59 | 60 | 61 | @registry.register_builder("vatex_caption") 62 | class VATEXCapBuilder(BaseDatasetBuilder): 63 | train_dataset_cls = VideoCaptionDataset 64 | eval_dataset_cls = VideoCaptionEvalDataset 65 | 66 | DATASET_CONFIG_DICT = { 67 | "default": "configs/datasets/vatex/defaults_cap.yaml", 68 | } 69 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/builders/classification_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved.
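[Editor's note, hedged: builders like the ones in caption_builder.py above are
normally reached through the registry rather than imported directly. A minimal
sketch, assuming the convenience wrapper `load_dataset` exported by
lavis.datasets.builders (verify the import before relying on it):

    from lavis.datasets.builders import load_dataset

    # the name must match a @registry.register_builder(...) key
    coco_cap = load_dataset("coco_caption")
    print(coco_cap.keys())      # typically dict_keys(['train', 'val', 'test'])
    print(coco_cap["train"][0]) # a single sample dict
End of note.]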
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder 10 | from lavis.datasets.datasets.nlvr_datasets import NLVRDataset, NLVREvalDataset 11 | from lavis.datasets.datasets.snli_ve_datasets import SNLIVisualEntialmentDataset 12 | 13 | 14 | @registry.register_builder("nlvr") 15 | class NLVRBuilder(BaseDatasetBuilder): 16 | train_dataset_cls = NLVRDataset 17 | eval_dataset_cls = NLVREvalDataset 18 | 19 | DATASET_CONFIG_DICT = {"default": "configs/datasets/nlvr/defaults.yaml"} 20 | 21 | 22 | @registry.register_builder("snli_ve") 23 | class SNLIVisualEntailmentBuilder(BaseDatasetBuilder): 24 | train_dataset_cls = SNLIVisualEntialmentDataset 25 | eval_dataset_cls = SNLIVisualEntialmentDataset 26 | 27 | DATASET_CONFIG_DICT = {"default": "configs/datasets/snli_ve/defaults.yaml"} 28 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/builders/dialogue_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder 10 | from lavis.datasets.datasets.avsd_dialogue_datasets import ( 11 | AVSDDialDataset, 12 | AVSDDialEvalDataset, 13 | ) 14 | 15 | 16 | @registry.register_builder("avsd_dialogue") 17 | class AVSDDialBuilder(BaseDatasetBuilder): 18 | train_dataset_cls = AVSDDialDataset 19 | eval_dataset_cls = AVSDDialEvalDataset 20 | 21 | DATASET_CONFIG_DICT = {"default": "configs/datasets/avsd/defaults_dial.yaml"} 22 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/builders/retrieval_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder 9 | from lavis.datasets.datasets.retrieval_datasets import ( 10 | RetrievalDataset, 11 | RetrievalEvalDataset, 12 | VideoRetrievalDataset, 13 | VideoRetrievalEvalDataset, 14 | ) 15 | 16 | from lavis.common.registry import registry 17 | 18 | 19 | @registry.register_builder("msrvtt_retrieval") 20 | class MSRVTTRetrievalBuilder(BaseDatasetBuilder): 21 | train_dataset_cls = VideoRetrievalDataset 22 | eval_dataset_cls = VideoRetrievalEvalDataset 23 | 24 | DATASET_CONFIG_DICT = {"default": "configs/datasets/msrvtt/defaults_ret.yaml"} 25 | 26 | 27 | @registry.register_builder("didemo_retrieval") 28 | class DiDeMoRetrievalBuilder(BaseDatasetBuilder): 29 | train_dataset_cls = VideoRetrievalDataset 30 | eval_dataset_cls = VideoRetrievalEvalDataset 31 | 32 | DATASET_CONFIG_DICT = {"default": "configs/datasets/didemo/defaults_ret.yaml"} 33 | 34 | 35 | @registry.register_builder("coco_retrieval") 36 | class COCORetrievalBuilder(BaseDatasetBuilder): 37 | train_dataset_cls = RetrievalDataset 38 | eval_dataset_cls = RetrievalEvalDataset 39 | 40 | DATASET_CONFIG_DICT = {"default": "configs/datasets/coco/defaults_ret.yaml"} 41 | 42 | 43 | @registry.register_builder("flickr30k") 44 | class Flickr30kBuilder(BaseDatasetBuilder): 45 | train_dataset_cls = RetrievalDataset 46 | eval_dataset_cls = RetrievalEvalDataset 47 | 48 | DATASET_CONFIG_DICT = {"default": "configs/datasets/flickr30k/defaults.yaml"} 49 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/builders/video_qa_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.common.utils import get_cache_path 10 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder 11 | from lavis.datasets.datasets.video_vqa_datasets import VideoQADataset 12 | 13 | 14 | class VideoQABuilder(BaseDatasetBuilder): 15 | train_dataset_cls = VideoQADataset 16 | eval_dataset_cls = VideoQADataset 17 | 18 | def build(self): 19 | datasets = super().build() 20 | 21 | ans2label = self.config.build_info.annotations.get("ans2label") 22 | if ans2label is None: 23 | raise ValueError("ans2label is not specified in build_info.") 24 | 25 | ans2label = get_cache_path(ans2label.storage) 26 | 27 | for split in datasets: 28 | datasets[split]._build_class_labels(ans2label) 29 | 30 | return datasets 31 | 32 | 33 | @registry.register_builder("msrvtt_qa") 34 | class MSRVTTQABuilder(VideoQABuilder): 35 | DATASET_CONFIG_DICT = { 36 | "default": "configs/datasets/msrvtt/defaults_qa.yaml", 37 | } 38 | 39 | 40 | @registry.register_builder("msvd_qa") 41 | class MSVDQABuilder(VideoQABuilder): 42 | DATASET_CONFIG_DICT = { 43 | "default": "configs/datasets/msvd/defaults_qa.yaml", 44 | } 45 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/builders/vqa_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 
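[Editor's note, hedged: VideoQABuilder above converts an answer vocabulary into
classification labels after the base build step. A tiny self-contained sketch
of the file format it expects; the path and answers are illustrative only:

    import json, tempfile

    # a stand-in ans2label JSON, mapping answer strings to label ids
    ans2label = {"yes": 0, "no": 1, "blue": 2}
    with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
        json.dump(ans2label, f)

    # build() resolves build_info.annotations.ans2label.storage to a cached
    # path like f.name and calls _build_class_labels(path) on every split;
    # unseen answers then map to len(class_labels) via _get_answer_label
    # (see video_vqa_datasets.py later in this dump).
End of note.]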
3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder 9 | 10 | from lavis.common.registry import registry 11 | from lavis.datasets.datasets.aok_vqa_datasets import AOKVQADataset, AOKVQAEvalDataset 12 | from lavis.datasets.datasets.coco_vqa_datasets import COCOVQADataset, COCOVQAEvalDataset 13 | from lavis.datasets.datasets.vg_vqa_datasets import VGVQADataset 14 | from lavis.datasets.datasets.gqa_datasets import GQADataset, GQAEvalDataset 15 | 16 | 17 | @registry.register_builder("coco_vqa") 18 | class COCOVQABuilder(BaseDatasetBuilder): 19 | train_dataset_cls = COCOVQADataset 20 | eval_dataset_cls = COCOVQAEvalDataset 21 | 22 | DATASET_CONFIG_DICT = { 23 | "default": "configs/datasets/coco/defaults_vqa.yaml", 24 | "eval": "configs/datasets/coco/eval_vqa.yaml", 25 | } 26 | 27 | 28 | @registry.register_builder("vg_vqa") 29 | class VGVQABuilder(BaseDatasetBuilder): 30 | train_dataset_cls = VGVQADataset 31 | DATASET_CONFIG_DICT = {"default": "configs/datasets/vg/defaults_vqa.yaml"} 32 | 33 | 34 | @registry.register_builder("ok_vqa") 35 | class OKVQABuilder(COCOVQABuilder): 36 | DATASET_CONFIG_DICT = { 37 | "default": "configs/datasets/okvqa/defaults.yaml", 38 | } 39 | 40 | 41 | @registry.register_builder("aok_vqa") 42 | class AOKVQABuilder(BaseDatasetBuilder): 43 | train_dataset_cls = AOKVQADataset 44 | eval_dataset_cls = AOKVQAEvalDataset 45 | 46 | DATASET_CONFIG_DICT = { 47 | "default": "configs/datasets/aokvqa/defaults.yaml", 48 | "eval": "configs/datasets/aokvqa/eval_aokvqa.yaml", 49 | } 50 | 51 | 52 | @registry.register_builder("gqa") 53 | class GQABuilder(BaseDatasetBuilder): 54 | train_dataset_cls = GQADataset 55 | eval_dataset_cls = GQAEvalDataset 56 | 57 | DATASET_CONFIG_DICT = { 58 | "default": "configs/datasets/gqa/defaults.yaml", 59 | "balanced_val": "configs/datasets/gqa/balanced_val.yaml", 60 | "balanced_testdev": "configs/datasets/gqa/balanced_testdev.yaml", 61 | } -------------------------------------------------------------------------------- /Lavis/lavis/datasets/datasets/image_text_pair_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | from collections import OrderedDict 10 | 11 | from lavis.datasets.datasets.base_dataset import BaseDataset 12 | from PIL import Image 13 | 14 | 15 | class __DisplMixin: 16 | def displ_item(self, index): 17 | sample, ann = self.__getitem__(index), self.annotation[index] 18 | 19 | return OrderedDict( 20 | { 21 | "file": os.path.basename(ann["image"]), 22 | "caption": ann["caption"], 23 | "image": sample["image"], 24 | } 25 | ) 26 | 27 | 28 | class ImageTextPairDataset(BaseDataset, __DisplMixin): 29 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 30 | """ 31 | vis_root (string): Root directory of images (e.g. 
coco/images/) 32 | ann_paths (list of str): paths to the annotation files 33 | """ 34 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 35 | 36 | def __getitem__(self, index): 37 | 38 | # TODO this assumes image input, not general enough 39 | ann = self.annotation[index] 40 | 41 | image_path = os.path.join(self.vis_root, ann["image"]) 42 | image = Image.open(image_path).convert("RGB") 43 | 44 | image = self.vis_processor(image) 45 | caption = self.text_processor(ann["caption"]) 46 | 47 | return {"image": image, "text_input": caption} 48 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/datasets/imagefolder_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | from collections import OrderedDict 10 | 11 | from lavis.datasets.datasets.base_dataset import BaseDataset 12 | from PIL import Image 13 | from torchvision import datasets 14 | 15 | 16 | class ImageFolderDataset(BaseDataset): 17 | def __init__(self, vis_processor, vis_root, classnames=None, **kwargs): 18 | super().__init__(vis_processor=vis_processor, vis_root=vis_root) 19 | 20 | self.inner_dataset = datasets.ImageFolder(vis_root) 21 | 22 | self.annotation = [ 23 | {"image": elem[0], "label": elem[1], "image_id": elem[0]} 24 | for elem in self.inner_dataset.imgs 25 | ] 26 | 27 | self.classnames = classnames if classnames is not None else [] 28 | 29 | self._add_instance_ids() 30 | 31 | def __len__(self): 32 | return len(self.inner_dataset) 33 | 34 | def __getitem__(self, index): 35 | ann = self.annotation[index] 36 | 37 | img_fn = ann["image"] 38 | image_path = os.path.join(self.vis_root, img_fn) 39 | image = Image.open(image_path).convert("RGB") 40 | 41 | image = self.vis_processor(image) 42 | 43 | return { 44 | "image": image, 45 | "label": ann["label"], 46 | "image_id": ann["image_id"], 47 | "instance_id": ann["instance_id"], 48 | } 49 | 50 | def displ_item(self, index): 51 | sample, ann = self.__getitem__(index), self.annotation[index] 52 | 53 | return OrderedDict( 54 | { 55 | "file": ann["image"], 56 | "label": self.classnames[ann["label"]], 57 | "image": sample["image"], 58 | } 59 | ) 60 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/datasets/laion_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved.
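[Editor's note, hedged: a minimal sketch of driving ImageFolderDataset above
directly, outside the builder machinery. The torchvision transform stands in
for a LAVIS vis_processor (both are callables on a PIL image), and the paths
and class names are illustrative only:

    from torchvision import transforms
    from lavis.datasets.datasets.imagefolder_dataset import ImageFolderDataset

    vis_processor = transforms.Compose(
        [transforms.Resize((224, 224)), transforms.ToTensor()]
    )
    ds = ImageFolderDataset(
        vis_processor=vis_processor,
        vis_root="/path/to/images",          # hypothetical ImageFolder root
        classnames=["class_a", "class_b"],   # aligned with folder order
    )
    sample = ds[0]
    # {"image": FloatTensor, "label": int, "image_id": str, "instance_id": str}
End of note.]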
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import webdataset as wds 9 | from lavis.datasets.datasets.base_dataset import BaseDataset 10 | 11 | 12 | class LaionDataset(BaseDataset): 13 | def __init__(self, vis_processor, text_processor, location): 14 | super().__init__(vis_processor=vis_processor, text_processor=text_processor) 15 | 16 | self.inner_dataset = wds.DataPipeline( 17 | wds.ResampledShards(location), 18 | wds.tarfile_to_samples(handler=wds.warn_and_continue), 19 | wds.shuffle(1000, handler=wds.warn_and_continue), 20 | wds.decode("pilrgb", handler=wds.warn_and_continue), 21 | wds.to_tuple("jpg", "json", handler=wds.warn_and_continue), 22 | wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue), 23 | wds.map(self.to_dict, handler=wds.warn_and_continue), 24 | ) 25 | 26 | def to_dict(self, sample): 27 | return { 28 | "image": sample[0], 29 | "text_input": self.text_processor(sample[1]["caption"]), 30 | } 31 | 32 | 33 | if __name__ == "__main__": 34 | from torchvision import transforms 35 | 36 | def to_image_text_pair(sample): 37 | return sample[0], sample[1]["caption"] 38 | 39 | normalize = transforms.Normalize( 40 | (0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711) 41 | ) 42 | 43 | transform_train = transforms.Compose( 44 | [ 45 | transforms.RandomResizedCrop(256, scale=(0.2, 1.0)), 46 | transforms.RandomHorizontalFlip(), 47 | transforms.ToTensor(), 48 | normalize, 49 | ] 50 | ) 51 | 52 | dataset = LaionDataset( 53 | vis_processor=transform_train, 54 | text_processor=lambda x: x, 55 | location="/export/laion/laion2B-multi/part-00000/{00000..01743}.tar", 56 | ) 57 | 58 | import torch 59 | 60 | loader = torch.utils.data.DataLoader(dataset.inner_dataset, batch_size=2) 61 | 62 | print(next(iter(loader))["text_input"]) 63 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/datasets/multimodal_classification_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from abc import abstractmethod 9 | from lavis.datasets.datasets.base_dataset import BaseDataset 10 | 11 | 12 | class MultimodalClassificationDataset(BaseDataset): 13 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 14 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 15 | 16 | self.class_labels = None 17 | 18 | @abstractmethod 19 | def _build_class_labels(self): 20 | pass 21 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/datasets/snli_ve_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | from collections import OrderedDict 10 | 11 | from lavis.datasets.datasets.multimodal_classification_datasets import ( 12 | MultimodalClassificationDataset, 13 | ) 14 | from PIL import Image 15 | 16 | 17 | class __DisplMixin: 18 | def displ_item(self, index): 19 | sample, ann = self.__getitem__(index), self.annotation[index] 20 | 21 | return OrderedDict( 22 | { 23 | "file": os.path.basename(ann["image"]), 24 | "sentence": ann["sentence"], 25 | "label": ann["label"], 26 | "image": sample["image"], 27 | } 28 | ) 29 | 30 | 31 | class SNLIVisualEntialmentDataset(MultimodalClassificationDataset, __DisplMixin): 32 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 33 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 34 | 35 | self.class_labels = self._build_class_labels() 36 | 37 | def _build_class_labels(self): 38 | return {"contradiction": 0, "neutral": 1, "entailment": 2} 39 | 40 | def __getitem__(self, index): 41 | ann = self.annotation[index] 42 | 43 | image_id = ann["image"] 44 | image_path = os.path.join(self.vis_root, "%s.jpg" % image_id) 45 | image = Image.open(image_path).convert("RGB") 46 | 47 | image = self.vis_processor(image) 48 | sentence = self.text_processor(ann["sentence"]) 49 | 50 | return { 51 | "image": image, 52 | "text_input": sentence, 53 | "label": self.class_labels[ann["label"]], 54 | "image_id": image_id, 55 | "instance_id": ann["instance_id"], 56 | } 57 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/datasets/vg_vqa_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | 10 | from PIL import Image 11 | 12 | from lavis.datasets.datasets.vqa_datasets import VQADataset 13 | 14 | 15 | class VGVQADataset(VQADataset): 16 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 17 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 18 | 19 | def __getitem__(self, index): 20 | ann = self.annotation[index] 21 | 22 | image_path = os.path.join(self.vis_root, ann["image"]) 23 | image = Image.open(image_path).convert("RGB") 24 | 25 | image = self.vis_processor(image) 26 | question = self.text_processor(ann["question"]) 27 | 28 | answers = [ann["answer"]] 29 | # TODO this should be configured better 30 | weights = [0.2] 31 | 32 | return { 33 | "image": image, 34 | "text_input": question, 35 | "answers": answers, 36 | "weights": weights, 37 | } 38 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/datasets/video_caption_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
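[Editor's note, hedged: SNLIVisualEntialmentDataset above fixes the SNLI-VE
label space at construction time. A tiny sketch of that contract, mirroring
its _build_class_labels; the annotation dict is a toy example:

    class_labels = {"contradiction": 0, "neutral": 1, "entailment": 2}
    ann = {"sentence": "A dog is running.", "label": "entailment"}
    target = class_labels[ann["label"]]  # -> 2, returned as sample["label"]

The hard-coded weights = [0.2] in VGVQADataset above is an answer-confidence
constant (its TODO says it should be configurable), not part of the VQA
annotation format.]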
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | from lavis.datasets.datasets.base_dataset import BaseDataset 10 | 11 | from lavis.datasets.datasets.caption_datasets import CaptionDataset 12 | 13 | 14 | class VideoCaptionDataset(CaptionDataset): 15 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 16 | """ 17 | vis_root (string): Root directory of videos (e.g. msrvtt/videos/) 18 | ann_paths (list of str): paths to the annotation files 19 | 20 | """ 21 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 22 | 23 | def __getitem__(self, index): 24 | 25 | ann = self.annotation[index] 26 | 27 | vname = ann["video"] 28 | video_path = os.path.join(self.vis_root, vname) 29 | 30 | video = self.vis_processor(video_path) 31 | caption = self.text_processor(ann["caption"]) 32 | 33 | # "image_id" is kept to stay compatible with the COCO evaluation format 34 | return { 35 | "video": video, 36 | "text_input": caption, 37 | "image_id": self.img_ids[ann["image_id"]], 38 | } 39 | 40 | 41 | class VideoCaptionEvalDataset(BaseDataset): 42 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 43 | """ 44 | vis_root (string): Root directory of videos (e.g. msrvtt/videos/) 45 | ann_paths (list of str): paths to the annotation files 46 | 47 | """ 48 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 49 | 50 | def __getitem__(self, index): 51 | 52 | ann = self.annotation[index] 53 | 54 | vname = ann["video"] 55 | video_path = os.path.join(self.vis_root, vname) 56 | 57 | video = self.vis_processor(video_path) 58 | 59 | return { 60 | "video": video, 61 | "image_id": ann["image_id"], 62 | "instance_id": ann["instance_id"], 63 | } 64 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/datasets/video_vqa_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import json 9 | import os 10 | from collections import OrderedDict 11 | 12 | from lavis.datasets.datasets.multimodal_classification_datasets import ( 13 | MultimodalClassificationDataset, 14 | ) 15 | 16 | 17 | class __DisplMixin: 18 | def displ_item(self, index): 19 | ann = self.annotation[index] 20 | 21 | vname = ann["video"] 22 | vpath = os.path.join(self.vis_root, vname) 23 | 24 | return OrderedDict( 25 | {"file": vpath, "question": ann["question"], "answer": ann["answer"]} 26 | ) 27 | 28 | 29 | class VideoQADataset(MultimodalClassificationDataset, __DisplMixin): 30 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 31 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 32 | 33 | def _build_class_labels(self, ans_path): 34 | ans2label = json.load(open(ans_path)) 35 | 36 | self.class_labels = ans2label 37 | 38 | def _get_answer_label(self, answer): 39 | if answer in self.class_labels: 40 | return self.class_labels[answer] 41 | else: 42 | return len(self.class_labels) 43 | 44 | def __getitem__(self, index): 45 | assert ( 46 | self.class_labels 47 | ), f"class_labels of {__class__.__name__} is not built yet."
48 | 49 | ann = self.annotation[index] 50 | 51 | vname = ann["video"] 52 | vpath = os.path.join(self.vis_root, vname) 53 | 54 | frms = self.vis_processor(vpath) 55 | question = self.text_processor(ann["question"]) 56 | 57 | return { 58 | "video": frms, 59 | "text_input": question, 60 | "answers": self._get_answer_label(ann["answer"]), 61 | "question_id": ann["question_id"], 62 | "instance_id": ann["instance_id"], 63 | } 64 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/datasets/vqa_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import torch 9 | 10 | from lavis.datasets.datasets.base_dataset import BaseDataset 11 | 12 | 13 | class VQADataset(BaseDataset): 14 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 15 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 16 | 17 | def collater(self, samples): 18 | image_list, question_list, answer_list, weight_list = [], [], [], [] 19 | 20 | num_answers = [] 21 | 22 | for sample in samples: 23 | image_list.append(sample["image"]) 24 | question_list.append(sample["text_input"].capitalize()) 25 | 26 | weight_list.extend(sample["weights"]) 27 | 28 | answers = sample["answers"] 29 | 30 | answer_list.extend(answers) 31 | num_answers.append(len(answers)) 32 | 33 | return { 34 | "image": torch.stack(image_list, dim=0), 35 | "text_input": question_list, 36 | "answer": answer_list, 37 | "weight": torch.Tensor(weight_list), 38 | "n_answers": torch.LongTensor(num_answers), 39 | } 40 | 41 | 42 | class VQAEvalDataset(BaseDataset): 43 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 44 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 45 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/download_scripts/DownloadConceptualCaptions/LICENSE: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Dongxu Li, Junnan Li, Hung Le, Guangsen Wang, Silvio Savarese, Steven Hoi. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | MIT License 6 | 7 | Copyright (c) 2019 Igor Brigadir 8 | 9 | Permission is hereby granted, free of charge, to any person obtaining a copy 10 | of this software and associated documentation files (the "Software"), to deal 11 | in the Software without restriction, including without limitation the rights 12 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | copies of the Software, and to permit persons to whom the Software is 14 | furnished to do so, subject to the following conditions: 15 | 16 | The above copyright notice and this permission notice shall be included in all 17 | copies or substantial portions of the Software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 22 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | SOFTWARE. 26 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/download_scripts/DownloadConceptualCaptions/README.md: -------------------------------------------------------------------------------- 1 | 7 | 8 | # Download Conceptual Captions Data 9 | 10 | Place data from: https://ai.google.com/research/ConceptualCaptions/download in this folder 11 | 12 | `Train_GCC-training.tsv / cc3m.tsv` Training Split (3,318,333) 13 | 14 | Run `download_data_cc3m.py` or `download_data_cc12m.py`. 15 | 16 | Images will be stored in the default LAVIS cache folders. You can stop and resume; the settings for splitting downloads into chunks/threads are not optimal, but they maxed out my connection, so I kept them as is. 17 | 18 | Note: a previous version of this script used a different file-naming scheme. This has changed, so if you resume a download started with the old scheme you will get duplicates. 19 | 20 | Some downloads will fail and return web pages instead; these will need to be cleaned up later. See `downloaded_validation_report.tsv` after the download finishes for HTTP errors. Around 8% of images are gone, based on validation-set results. Setting the user agent might fix some errors as well, though it is unclear whether any sites reject requests based on it. 21 | 22 | It should take about a day or two to download the training data; keep an eye on disk space. 23 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/download_scripts/download_coco.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | from pathlib import Path 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from lavis.common.utils import ( 14 | cleanup_dir, 15 | download_and_extract_archive, 16 | get_abs_path, 17 | get_cache_path, 18 | ) 19 | 20 | 21 | DATA_URL = { 22 | "train": "http://images.cocodataset.org/zips/train2014.zip", # md5: 0da8c0bd3d6becc4dcb32757491aca88 23 | "val": "http://images.cocodataset.org/zips/val2014.zip", # md5: a3d79f5ed8d289b7a7554ce06a5782b3 24 | "test": "http://images.cocodataset.org/zips/test2014.zip", # md5: 04127eef689ceac55e3a572c2c92f264 25 | "test2015": "http://images.cocodataset.org/zips/test2015.zip", # md5: 04127eef689ceac55e3a572c2c92f264 26 | } 27 | 28 | 29 | def download_datasets(root, url): 30 | download_and_extract_archive(url=url, download_root=root, extract_root=storage_dir) 31 | 32 | 33 | if __name__ == "__main__": 34 | 35 | config_path = get_abs_path("configs/datasets/coco/defaults_cap.yaml") 36 | 37 | storage_dir = OmegaConf.load( 38 | config_path 39 | ).datasets.coco_caption.build_info.images.storage 40 | 41 | download_dir = Path(get_cache_path(storage_dir)).parent / "download" 42 | storage_dir = Path(get_cache_path(storage_dir)) 43 | 44 | if storage_dir.exists(): 45 | print(f"Dataset already exists at {storage_dir}. 
Aborting.") 46 | exit(0) 47 | 48 | try: 49 | for k, v in DATA_URL.items(): 50 | print("Downloading {} to {}".format(v, download_dir)) 51 | download_datasets(download_dir, v) 52 | except Exception as e: 53 | # remove download dir if failed 54 | cleanup_dir(download_dir) 55 | print("Failed to download or extract datasets. Aborting.") 56 | 57 | cleanup_dir(download_dir) 58 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/download_scripts/download_didemo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | from pathlib import Path 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from lavis.common.utils import ( 14 | cleanup_dir, 15 | download_and_extract_archive, 16 | get_abs_path, 17 | get_cache_path, 18 | ) 19 | 20 | DATA_URL = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/didemo_videos.tar.gz" 21 | 22 | 23 | def download_datasets(root, url): 24 | """ 25 | Download the DiDeMo video archive and expand it 26 | into the folder provided as a parameter 27 | """ 28 | download_and_extract_archive(url=url, download_root=root) 29 | 30 | 31 | def move_files(download_path, storage_path): 32 | """ 33 | Move files from download_path to storage_path 34 | """ 35 | print("Moving to {}".format(storage_path)) 36 | 37 | os.makedirs(storage_path, exist_ok=True) 38 | 39 | for file_name in os.listdir(download_path): 40 | os.rename( 41 | os.path.join(download_path, file_name), 42 | os.path.join(storage_path, file_name), 43 | ) 44 | 45 | 46 | if __name__ == "__main__": 47 | 48 | config_path = get_abs_path("configs/datasets/didemo/defaults_ret.yaml") 49 | 50 | storage_dir = OmegaConf.load( 51 | config_path 52 | ).datasets.didemo_retrieval.build_info.videos.storage 53 | 54 | download_dir = Path(get_cache_path(storage_dir)).parent / "download" 55 | storage_dir = Path(get_cache_path(storage_dir)) 56 | 57 | if storage_dir.exists(): 58 | print(f"Dataset already exists at {storage_dir}. Aborting.") 59 | exit(0) 60 | 61 | try: 62 | print("Downloading {} to {}".format(DATA_URL, download_dir)) 63 | download_datasets(download_dir, DATA_URL) 64 | except Exception as e: 65 | # remove download dir if failed 66 | cleanup_dir(download_dir) 67 | print("Failed to download or extract datasets. Aborting.") 68 | 69 | move_files(download_dir / "videos", storage_dir) 70 | cleanup_dir(download_dir) 71 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/download_scripts/download_flickr.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved.
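[Editor's note, hedged: the download_* scripts in this folder share one
pattern -- resolve the storage path from the dataset YAML, download into a
sibling "download" directory, then extract or move into the cache. A condensed
sketch of that pattern, with the config path and keys taken from
download_coco.py above:

    from pathlib import Path
    from omegaconf import OmegaConf
    from lavis.common.utils import get_abs_path, get_cache_path

    cfg = OmegaConf.load(get_abs_path("configs/datasets/coco/defaults_cap.yaml"))
    storage = Path(get_cache_path(cfg.datasets.coco_caption.build_info.images.storage))
    download_dir = storage.parent / "download"
    # then download_and_extract_archive(...) and finally cleanup_dir(download_dir)

Each script is meant to be run directly, e.g.
`python lavis/datasets/download_scripts/download_coco.py`.
End of note.]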
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | from pathlib import Path 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from lavis.common.utils import ( 14 | cleanup_dir, 15 | get_abs_path, 16 | get_cache_path, 17 | ) 18 | 19 | import opendatasets as od 20 | 21 | 22 | DATA_URL = "https://www.kaggle.com/datasets/hsankesara/flickr-image-dataset" 23 | 24 | print( 25 | """ 26 | To download the dataset, you need to have a Kaggle account and the associated key. 27 | See https://www.kaggle.com/docs/api to create account and a new API token. 28 | """ 29 | ) 30 | 31 | 32 | def move_directory(src_dir, dst_dir): 33 | """ 34 | Move files from download_path to storage_path 35 | """ 36 | print("Moving to {}".format(dst_dir)) 37 | 38 | os.makedirs(dst_dir, exist_ok=True) 39 | 40 | for file_name in os.listdir(src_dir): 41 | os.rename( 42 | os.path.join(src_dir, file_name), 43 | os.path.join(dst_dir, file_name), 44 | ) 45 | 46 | 47 | if __name__ == "__main__": 48 | 49 | config_path = get_abs_path("configs/datasets/flickr30k/defaults.yaml") 50 | 51 | storage_dir = OmegaConf.load( 52 | config_path 53 | ).datasets.flickr30k.build_info.images.storage 54 | 55 | storage_dir = Path(get_cache_path(storage_dir)) 56 | download_dir = storage_dir.parent / "download" 57 | 58 | if storage_dir.exists(): 59 | print(f"Dataset already exists at {storage_dir}. Aborting.") 60 | exit(0) 61 | 62 | os.makedirs(download_dir) 63 | 64 | try: 65 | print("Downloading {} to {}".format(DATA_URL, download_dir)) 66 | od.download(DATA_URL, download_dir) 67 | except Exception as e: 68 | print(e) 69 | # remove download dir if failed 70 | cleanup_dir(download_dir) 71 | exit(1) 72 | 73 | move_directory( 74 | download_dir / "flickr-image-dataset" / "flickr30k_images" / "flickr30k_images", 75 | storage_dir / "flickr30k-images", 76 | ) 77 | 78 | cleanup_dir(download_dir) 79 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/download_scripts/download_gqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | from pathlib import Path 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from lavis.common.utils import ( 14 | cleanup_dir, 15 | download_and_extract_archive, 16 | get_abs_path, 17 | get_cache_path, 18 | ) 19 | 20 | 21 | DATA_URL = "https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip" 22 | 23 | 24 | def download_datasets(root, url): 25 | download_and_extract_archive(url=url, download_root=root, extract_root=storage_dir.parent) 26 | 27 | 28 | if __name__ == "__main__": 29 | 30 | config_path = get_abs_path("configs/datasets/gqa/defaults.yaml") 31 | 32 | storage_dir = OmegaConf.load( 33 | config_path 34 | ).datasets.gqa.build_info.images.storage 35 | 36 | download_dir = Path(get_cache_path(storage_dir)).parent / "download" 37 | storage_dir = Path(get_cache_path(storage_dir)) 38 | 39 | if storage_dir.exists(): 40 | print(f"Dataset already exists at {storage_dir}. 
Aborting.") 41 | exit(0) 42 | 43 | try: 44 | print("Downloading {}".format(DATA_URL)) 45 | download_datasets(download_dir, DATA_URL) 46 | except Exception as e: 47 | # remove download dir if failed 48 | cleanup_dir(download_dir) 49 | print("Failed to download or extract datasets. Aborting.") 50 | 51 | cleanup_dir(download_dir) 52 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/download_scripts/download_msvd.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | from pathlib import Path 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from lavis.common.utils import ( 14 | cleanup_dir, 15 | download_and_extract_archive, 16 | get_abs_path, 17 | get_cache_path, 18 | ) 19 | 20 | 21 | DATA_URL = "https://www.cs.utexas.edu/users/ml/clamp/videoDescription/YouTubeClips.tar" 22 | 23 | 24 | def download_datasets(root, url): 25 | download_and_extract_archive(url=url, download_root=root) 26 | 27 | 28 | def move_files(download_path, storage_path): 29 | """ 30 | Move files from download_path to storage_path 31 | """ 32 | print("Moving to {}".format(storage_path)) 33 | 34 | os.makedirs(storage_path, exist_ok=True) 35 | 36 | for file_name in os.listdir(download_path): 37 | os.rename( 38 | os.path.join(download_path, file_name), 39 | os.path.join(storage_path, file_name), 40 | ) 41 | 42 | 43 | if __name__ == "__main__": 44 | 45 | config_path = get_abs_path("configs/datasets/msvd/defaults_cap.yaml") 46 | 47 | storage_dir = OmegaConf.load( 48 | config_path 49 | ).datasets.msvd_cap.build_info.videos.storage 50 | 51 | download_dir = Path(get_cache_path(storage_dir)).parent / "download" 52 | storage_dir = Path(get_cache_path(storage_dir)) 53 | 54 | if storage_dir.exists(): 55 | print(f"Dataset already exists at {storage_dir}. Aborting.") 56 | exit(0) 57 | 58 | try: 59 | print("Downloading {}".format(DATA_URL)) 60 | download_datasets(download_dir, DATA_URL) 61 | except Exception as e: 62 | # remove download dir if failed 63 | cleanup_dir(download_dir) 64 | print("Failed to download or extract datasets. Aborting.") 65 | 66 | move_files(download_dir / "YouTubeClips", storage_dir) 67 | cleanup_dir(download_dir) 68 | -------------------------------------------------------------------------------- /Lavis/lavis/datasets/download_scripts/download_vg.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | from pathlib import Path 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from lavis.common.utils import ( 14 | cleanup_dir, 15 | download_and_extract_archive, 16 | get_abs_path, 17 | get_cache_path, 18 | ) 19 | 20 | 21 | DATA_URL = { 22 | "train": "https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip", 23 | "train2": "https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip", 24 | } 25 | 26 | 27 | def download_datasets(root, url): 28 | download_and_extract_archive(url=url, download_root=root, extract_root=storage_dir) 29 | 30 | 31 | if __name__ == "__main__": 32 | 33 | config_path = get_abs_path("configs/datasets/vg/defaults_caption.yaml") 34 | 35 | storage_dir = OmegaConf.load( 36 | config_path 37 | ).datasets.vg_caption.build_info.images.storage 38 | 39 | download_dir = Path(get_cache_path(storage_dir)).parent / "download" 40 | storage_dir = Path(get_cache_path(storage_dir)) 41 | 42 | if storage_dir.exists(): 43 | print(f"Dataset already exists at {storage_dir}. Aborting.") 44 | exit(0) 45 | 46 | try: 47 | for k, v in DATA_URL.items(): 48 | print("Downloading {} to {}".format(v, download_dir)) 49 | download_datasets(download_dir, v) 50 | except Exception as e: 51 | # remove download dir if failed 52 | cleanup_dir(download_dir) 53 | print("Failed to download or extract datasets. Aborting.") 54 | 55 | cleanup_dir(download_dir) 56 | -------------------------------------------------------------------------------- /Lavis/lavis/models/alpro_models/alpro_outputs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from dataclasses import dataclass 9 | from typing import Optional 10 | 11 | import torch 12 | from transformers.modeling_outputs import ( 13 | BaseModelOutputWithPoolingAndCrossAttentions, 14 | ModelOutput, 15 | ) 16 | 17 | 18 | @dataclass 19 | class AlproSimilarity(ModelOutput): 20 | sim_v2t: torch.FloatTensor = None 21 | sim_t2v: torch.FloatTensor = None 22 | 23 | sim_v2t_targets: Optional[torch.FloatTensor] = None 24 | sim_t2v_targets: Optional[torch.FloatTensor] = None 25 | 26 | 27 | @dataclass 28 | class AlproIntermediateOutput(ModelOutput): 29 | # uni-modal features 30 | video_embeds: torch.FloatTensor = None 31 | text_embeds: Optional[torch.FloatTensor] = None 32 | 33 | # intermediate outputs of multimodal encoder 34 | encoder_output: Optional[BaseModelOutputWithPoolingAndCrossAttentions] = None 35 | encoder_output_neg: Optional[BaseModelOutputWithPoolingAndCrossAttentions] = None 36 | 37 | vtm_logits: Optional[torch.FloatTensor] = None 38 | vtm_labels: Optional[torch.LongTensor] = None 39 | 40 | 41 | @dataclass 42 | class AlproOutput(ModelOutput): 43 | # some finetuned models (e.g. AlproQA) do not compute similarity, thus optional.
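# Editor's note (hedged): a consumer-side sketch of this output container.
# The field names mirror the dataclass below; treating forward() as returning
# an AlproOutput is an assumption based on the LAVIS task loops, not shown here:
#
#   output = model(samples)             # an AlproOutput
#   total_loss = output.loss            # sum of the loss_* terms that were computed
#   if output.sims is not None:         # retrieval-style models only
#       v2t, t2v = output.sims.sim_v2t, output.sims.sim_t2v
#   inter = output.intermediate_output  # AlproIntermediateOutput above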
44 | sims: Optional[AlproSimilarity] = None 45 | 46 | intermediate_output: AlproIntermediateOutput = None 47 | 48 | loss: Optional[torch.FloatTensor] = None 49 | 50 | loss_vtc: Optional[torch.FloatTensor] = None 51 | 52 | loss_vtm: Optional[torch.FloatTensor] = None 53 | 54 | loss_mlm: Optional[torch.FloatTensor] = None 55 | 56 | 57 | @dataclass 58 | class AlproOutputWithLogits(AlproOutput): 59 | logits: torch.FloatTensor = None 60 | -------------------------------------------------------------------------------- /Lavis/lavis/models/blip2_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/Lavis/lavis/models/blip2_models/__init__.py -------------------------------------------------------------------------------- /Lavis/lavis/models/clip_models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | 7 | Based on https://github.com/mlfoundations/open_clip 8 | """ 9 | 10 | """ OpenAI pretrained model functions 11 | Adapted from https://github.com/mlfoundations/open_clip and https://github.com/openai/CLIP. 12 | 13 | Originally MIT License, Copyright (c) 2021 OpenAI. 14 | """ 15 | -------------------------------------------------------------------------------- /Lavis/lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/Lavis/lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /Lavis/lavis/models/clip_models/clip_outputs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | 7 | Based on https://github.com/mlfoundations/open_clip 8 | """ 9 | 10 | from dataclasses import dataclass 11 | 12 | from typing import Optional 13 | 14 | import torch 15 | from transformers.modeling_outputs import ModelOutput 16 | 17 | 18 | @dataclass 19 | class ClipOutputFeatures(ModelOutput): 20 | """ 21 | Data class of features from the CLIP feature extractor.
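(The *_proj fields are assumed, from their names, to hold the embeddings after CLIP's projection heads, i.e. the vectors used to compute image-text similarity; the unprojected *_embeds fields are documented below.)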
22 | 23 | Args: 24 | image_embeds: `torch.FloatTensor` of shape `(batch_size, 1, embed_dim)`, `optional` 25 | image_embeds_proj: `torch.FloatTensor` of shape `(batch_size, 1, feature_dim)`, `optional` 26 | text_embeds: `torch.FloatTensor` of shape `(batch_size, 1, embed_dim)`, `optional` 27 | text_embeds_proj: `torch.FloatTensor` of shape `(batch_size, 1, feature_dim)`, `optional` 28 | """ 29 | 30 | image_embeds: Optional[torch.FloatTensor] = None 31 | image_embeds_proj: Optional[torch.FloatTensor] = None 32 | 33 | text_embeds: Optional[torch.FloatTensor] = None 34 | text_embeds_proj: Optional[torch.FloatTensor] = None 35 | 36 | 37 | @dataclass 38 | class ClipOutput(ModelOutput): 39 | intermediate_output: Optional[ClipOutputFeatures] = None 40 | 41 | logit_scale_exp: Optional[torch.FloatTensor] = None 42 | 43 | loss: Optional[torch.FloatTensor] = None 44 | -------------------------------------------------------------------------------- /Lavis/lavis/models/clip_models/pics/CLIP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/Lavis/lavis/models/clip_models/pics/CLIP.png -------------------------------------------------------------------------------- /Lavis/lavis/models/clip_models/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | 7 | Based on https://github.com/mlfoundations/open_clip 8 | """ 9 | 10 | from torch import nn as nn 11 | from torchvision.ops.misc import FrozenBatchNorm2d 12 | 13 | 14 | def freeze_batch_norm_2d(module, module_match={}, name=""): 15 | """ 16 | Converts all `BatchNorm2d` and `SyncBatchNorm` layers of provided module into `FrozenBatchNorm2d`. If `module` is 17 | itself an instance of either `BatchNorm2d` or `SyncBatchNorm`, it is converted into `FrozenBatchNorm2d` and 18 | returned. Otherwise, the module is walked recursively and submodules are converted in place. 19 | Args: 20 | module (torch.nn.Module): Any PyTorch module.
21 | module_match (dict): Dictionary of full module names to freeze (all if empty) 22 | name (str): Full module name (prefix) 23 | Returns: 24 | torch.nn.Module: Resulting module 25 | Inspired by https://github.com/pytorch/pytorch/blob/a5895f85be0f10212791145bfedc0261d364f103/torch/nn/modules/batchnorm.py#L762 26 | """ 27 | res = module 28 | is_match = True 29 | if module_match: 30 | is_match = name in module_match 31 | if is_match and isinstance( 32 | module, (nn.modules.batchnorm.BatchNorm2d, nn.modules.batchnorm.SyncBatchNorm) 33 | ): 34 | res = FrozenBatchNorm2d(module.num_features) 35 | res.num_features = module.num_features 36 | res.affine = module.affine 37 | if module.affine: 38 | res.weight.data = module.weight.data.clone().detach() 39 | res.bias.data = module.bias.data.clone().detach() 40 | res.running_mean.data = module.running_mean.data 41 | res.running_var.data = module.running_var.data 42 | res.eps = module.eps 43 | else: 44 | for child_name, child in module.named_children(): 45 | full_child_name = ".".join([name, child_name]) if name else child_name 46 | new_child = freeze_batch_norm_2d(child, module_match, full_child_name) 47 | if new_child is not child: 48 | res.add_module(child_name, new_child) 49 | return res 50 | -------------------------------------------------------------------------------- /Lavis/lavis/models/img2prompt_models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import torch 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /Lavis/lavis/models/pnp_vqa_models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import torch 9 | 10 | 11 | def prepare_qa_input(sample, num_captions, num_captions_fid): 12 | """ 13 | Build fusion-in-decoder style reader inputs: the top `num_captions` captions for each 14 | question are grouped into chunks of `num_captions_fid`, each chunk is prefixed with the 15 | lowercased question as "<question> \\n <caption chunk>", and the resulting list of 16 | chunks is stored in sample['question_captions']. 17 | """ 18 | sample_question_captions = [] 19 | 20 | for question, captions in zip(sample['text_input'], sample['captions']): 21 | assert isinstance(captions, list) 22 | question_captions = [] 23 | question_caption = '' 24 | for cap_id, cap_ in enumerate(captions[0:num_captions]): 25 | question_caption += (cap_.strip() + '. ') 26 | if (cap_id + 1) != num_captions and ((cap_id + 1) % num_captions_fid == 0): 27 | question_caption = question.lower().strip() + " \\n " + question_caption.lower().strip() 28 | question_captions.append(question_caption) 29 | question_caption = '' 30 | if (cap_id + 1) == num_captions: 31 | question_caption = question.lower().strip() + " \\n " + question_caption.lower().strip() 32 | question_captions.append(question_caption) 33 | sample_question_captions.append(question_captions) 34 | 35 | sample['question_captions'] = sample_question_captions 36 | -------------------------------------------------------------------------------- /Lavis/lavis/models/timesformer/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | 7 | Based on https://github.com/facebookresearch/TimeSformer 8 | """ 9 | -------------------------------------------------------------------------------- /Lavis/lavis/models/timesformer/linear.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | """ Linear layer (alternate definition) 9 | """ 10 | import torch 11 | import torch.nn.functional as F 12 | from torch import nn as nn 13 | 14 | 15 | class Linear(nn.Linear): 16 | def forward(self, input: torch.Tensor) -> torch.Tensor: 17 | if torch.jit.is_scripting(): 18 | bias = self.bias.to(dtype=input.dtype) if self.bias is not None else None 19 | return F.linear(input, self.weight.to(dtype=input.dtype), bias=bias) 20 | else: 21 | return F.linear(input, self.weight, self.bias) 22 | -------------------------------------------------------------------------------- /Lavis/lavis/processors/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.processors.base_processor import BaseProcessor 9 | 10 | from lavis.processors.alpro_processors import ( 11 | AlproVideoTrainProcessor, 12 | AlproVideoEvalProcessor, 13 | ) 14 | from lavis.processors.blip_processors import ( 15 | BlipImageTrainProcessor, 16 | Blip2ImageTrainProcessor, 17 | BlipImageEvalProcessor, 18 | BlipCaptionProcessor, 19 | ) 20 | from lavis.processors.gpt_processors import ( 21 | GPTVideoFeatureProcessor, 22 | GPTDialogueProcessor, 23 | ) 24 | from lavis.processors.clip_processors import ClipImageTrainProcessor 25 | 26 | from lavis.common.registry import registry 27 | 28 | __all__ = [ 29 | "BaseProcessor", 30 | # ALPRO 31 | "AlproVideoTrainProcessor", 32 | "AlproVideoEvalProcessor", 33 | # BLIP 34 | "BlipImageTrainProcessor", 35 | "Blip2ImageTrainProcessor", 36 | "BlipImageEvalProcessor", 37 | "BlipCaptionProcessor", 38 | "ClipImageTrainProcessor", 39 | # GPT 40 | "GPTVideoFeatureProcessor", 41 | "GPTDialogueProcessor", 42 | ] 43 | 44 | 45 | def load_processor(name, cfg=None): 46 | """ 47 | Example 48 | 49 | >>> processor = load_processor("alpro_video_train", cfg=None) 50 | """ 51 | processor = registry.get_processor_class(name).from_config(cfg) 52 | 53 | return processor 54 | -------------------------------------------------------------------------------- /Lavis/lavis/processors/base_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from omegaconf import OmegaConf 9 | 10 | 11 | class BaseProcessor: 12 | def __init__(self): 13 | self.transform = lambda x: x 14 | return 15 | 16 | def __call__(self, item): 17 | return self.transform(item) 18 | 19 | @classmethod 20 | def from_config(cls, cfg=None): 21 | return cls() 22 | 23 | def build(self, **kwargs): 24 | cfg = OmegaConf.create(kwargs) 25 | 26 | return self.from_config(cfg) 27 | -------------------------------------------------------------------------------- /Lavis/lavis/projects/blip2/direct_aokvqa_zeroshot_flant5xl_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | # Overall Accuracy is: 41.22 7 | 8 | model: 9 | arch: blip2_t5_par 10 | model_type: pretrain_flant5xl 11 | use_grad_checkpoint: False 12 | max_txt_len: 128 13 | prompt: "Question: {} Short Answer: " 14 | multiple_choice: False 15 | 16 | keyword_pipeline: True 17 | reason: True 18 | paraphrase: False 19 | 20 | ext_paraphrase: False 21 | par_num_beams: 5 22 | num_add_candidates: 4 23 | 24 | perform_selection: False 25 | selection_criterion: 'Aconf' 26 | calibrate: False 27 | perform_ensembling: False 28 | dropout_aggregate: False 29 | 30 | constrained: True 31 | verbose: False 32 | 33 | use_caption: False 34 | use_promptcap: False 35 | alt_device: 0 36 | 37 | # for OKVQA evaluation 38 | apply_lemmatizer: False 39 | 40 | datasets: 41 | aok_vqa: # name of the dataset builder 42 | type: eval 43 | vis_processor: 44 | eval: 45 | name: "blip_image_eval" 46 | image_size: 224 47 | text_processor: 48 | eval: 49 | name: "blip_question" 50 | 51 | 52 | run: 53 | task: aok_vqa 54 | # optimization-specific 55 | batch_size_train: 16 56 | batch_size_eval: 10 57 | num_workers: 4 58 | 59 | # inference-specific 60 | max_len: 10 61 | min_len: 1 62 | num_beams: 5 63 | inference_method: "generate" 64 | 65 | seed: 42 66 | output_dir: "output/BLIP2/AOKVQA-direct" 67 | 68 | evaluate: True 69 | test_splits: ["val"] 70 | 71 | # distribution-specific 72 | device: "cuda" 73 | world_size: 1 74 | dist_url: "env://" 75 | distributed: True 76 | -------------------------------------------------------------------------------- /Lavis/lavis/projects/blip2/mc_aokvqa_zeroshot_flant5xl_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | 7 | model: 8 | arch: blip2_t5_par 9 | model_type: pretrain_flant5xl 10 | use_grad_checkpoint: False 11 | max_txt_len: 128 12 | prompt: "Based on this information, select the correct answer to the question from the options.\nQuestion: {}\nOptions: A. {}, B. {}, C. {}, D. 
{}\nAnswer: Option " 13 | multiple_choice: True 14 | 15 | keyword_pipeline: False 16 | reason: False 17 | paraphrase: False 18 | 19 | ext_paraphrase: False 20 | par_num_beams: 5 21 | num_add_candidates: 0 22 | 23 | perform_selection: False 24 | selection_criterion: 'Aconf' 25 | calibrate: False 26 | perform_ensembling: False 27 | dropout_aggregate: False 28 | 29 | constrained: True 30 | verbose: False 31 | 32 | use_caption: False 33 | use_promptcap: False 34 | alt_device: 0 35 | 36 | # for OKVQA evaluation 37 | apply_lemmatizer: False 38 | 39 | datasets: 40 | aok_vqa: # name of the dataset builder 41 | type: eval 42 | vis_processor: 43 | eval: 44 | name: "blip_image_eval" 45 | image_size: 224 46 | text_processor: 47 | eval: 48 | name: "blip_question" 49 | # build_info: 50 | # images: 51 | # storage: '/export/share/datasets/vision/coco/images/' 52 | 53 | run: 54 | task: mc_aok_vqa 55 | # optimization-specific 56 | batch_size_train: 16 57 | batch_size_eval: 24 58 | num_workers: 4 59 | 60 | # inference-specific 61 | max_len: 10 62 | min_len: 1 63 | num_beams: 5 64 | inference_method: "generate" 65 | 66 | seed: 42 67 | output_dir: "output/BLIP2/AOKVQA-MC" 68 | 69 | evaluate: True 70 | test_splits: ["val"] 71 | 72 | # distribution-specific 73 | device: "cuda" 74 | world_size: 1 75 | dist_url: "env://" 76 | distributed: True 77 | -------------------------------------------------------------------------------- /Lavis/lavis/projects/blip2/vqav2_zeroshot_flant5xl_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | 7 | model: 8 | arch: blip2_t5_par 9 | model_type: pretrain_flant5xl 10 | use_grad_checkpoint: False 11 | prompt: "Question: {} Short answer:" 12 | max_txt_len: 256 13 | 14 | keyword_pipeline: True 15 | reason: True 16 | paraphrase: False 17 | ext_paraphrase: True 18 | par_num_beams: 5 19 | num_add_candidates: 4 20 | 21 | perform_selection: False 22 | selection_criterion: 'Aconf' 23 | calibrate: False 24 | perform_ensembling: False 25 | dropout_aggregate: False 26 | 27 | constrained: True 28 | verbose: True 29 | 30 | use_caption: False 31 | use_promptcap: False 32 | alt_device: 0 33 | 34 | datasets: 35 | coco_vqa: # name of the dataset builder 36 | type: eval 37 | vis_processor: 38 | eval: 39 | name: "blip_image_eval" 40 | image_size: 224 41 | text_processor: 42 | eval: 43 | name: "blip_question" 44 | # build_info: 45 | # images: 46 | # storage: '/export/share/datasets/vision/coco/images/' 47 | 48 | run: 49 | task: vqa 50 | # optimization-specific 51 | batch_size_train: 16 52 | batch_size_eval: 16 53 | num_workers: 4 54 | 55 | # inference-specific 56 | max_len: 10 57 | min_len: 1 58 | num_beams: 5 59 | inference_method: "generate" 60 | #"Short answer:" 61 | 62 | seed: 42 63 | output_dir: "output/BLIP2/VQA" 64 | 65 | evaluate: True 66 | test_splits: ["val"] 67 | 68 | # distribution-specific 69 | device: "cuda" 70 | world_size: 1 71 | dist_url: "env://" 72 | distributed: True 73 | -------------------------------------------------------------------------------- /Lavis/lavis/runners/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.runners.runner_base import RunnerBase 9 | from lavis.runners.runner_iter import RunnerIter 10 | 11 | __all__ = ["RunnerBase", "RunnerIter"] 12 | -------------------------------------------------------------------------------- /Lavis/lavis/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.tasks.base_task import BaseTask 10 | from lavis.tasks.captioning import CaptionTask 11 | from lavis.tasks.image_text_pretrain import ImageTextPretrainTask 12 | from lavis.tasks.multimodal_classification import ( 13 | MultimodalClassificationTask, 14 | ) 15 | from lavis.tasks.retrieval import RetrievalTask 16 | from lavis.tasks.vqa import VQATask, GQATask, AOKVQATask, MultiChoiceAOKVQATask 17 | from lavis.tasks.vqa_reading_comprehension import VQARCTask, GQARCTask 18 | from lavis.tasks.dialogue import DialogueTask 19 | 20 | 21 | def setup_task(cfg): 22 | assert "task" in cfg.run_cfg, "Task name must be provided." 23 | 24 | task_name = cfg.run_cfg.task 25 | task = registry.get_task_class(task_name).setup_task(cfg=cfg) 26 | assert task is not None, "Task {} not properly registered.".format(task_name) 27 | 28 | return task 29 | 30 | 31 | __all__ = [ 32 | "BaseTask", 33 | "AOKVQATask", 34 | "RetrievalTask", 35 | "CaptionTask", 36 | "VQATask", 37 | "GQATask", 38 | "VQARCTask", 39 | "GQARCTask", 40 | "MultimodalClassificationTask", 41 | "MultiChoiceAOKVQATask", 42 | # "VideoQATask", 43 | # "VisualEntailmentTask", 44 | "ImageTextPretrainTask", 45 | "DialogueTask", 46 | ] 47 | -------------------------------------------------------------------------------- /Lavis/lavis/tasks/image_text_pretrain.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.tasks.base_task import BaseTask 10 | 11 | 12 | @registry.register_task("image_text_pretrain") 13 | class ImageTextPretrainTask(BaseTask): 14 | def __init__(self): 15 | super().__init__() 16 | 17 | def evaluation(self, model, data_loader, cuda_enabled=True): 18 | pass 19 | -------------------------------------------------------------------------------- /Lavis/requirements.txt: -------------------------------------------------------------------------------- 1 | contexttimer 2 | decord 3 | einops>=0.4.1 4 | fairscale==0.4.4 5 | ftfy 6 | iopath 7 | ipython 8 | omegaconf 9 | opencv-python-headless==4.5.5.64 10 | opendatasets 11 | packaging 12 | pandas 13 | plotly 14 | pre-commit 15 | pycocoevalcap 16 | pycocotools 17 | python-magic 18 | scikit-image 19 | sentencepiece 20 | spacy 21 | streamlit 22 | timm==0.4.12 23 | torch>=1.10.0 24 | torchvision 25 | tqdm 26 | transformers>=4.25.0,<4.27 27 | webdataset 28 | wheel 29 | rake-nltk 30 | -------------------------------------------------------------------------------- /Lavis/setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from setuptools import setup, find_namespace_packages 9 | import platform 10 | 11 | DEPENDENCY_LINKS = [] 12 | if platform.system() == "Windows": 13 | DEPENDENCY_LINKS.append("https://download.pytorch.org/whl/torch_stable.html") 14 | 15 | 16 | def fetch_requirements(filename): 17 | with open(filename) as f: 18 | return [ln.strip() for ln in f.read().split("\n")] 19 | 20 | 21 | setup( 22 | name="salesforce-lavis", 23 | version="1.0.1", 24 | author="Dongxu Li, Junnan Li, Hung Le, Guangsen Wang, Silvio Savarese, Steven C.H. 
Hoi", 25 | description="LAVIS - A One-stop Library for Language-Vision Intelligence", 26 | long_description=open("README.md", "r", encoding="utf-8").read(), 27 | long_description_content_type="text/markdown", 28 | keywords="Vision-Language, Multimodal, Image Captioning, Generative AI, Deep Learning, Library, PyTorch", 29 | license="3-Clause BSD", 30 | packages=find_namespace_packages(include="lavis.*"), 31 | install_requires=fetch_requirements("requirements.txt"), 32 | python_requires=">=3.7.0", 33 | include_package_data=True, 34 | dependency_links=DEPENDENCY_LINKS, 35 | zip_safe=False, 36 | ) 37 | -------------------------------------------------------------------------------- /MiniGPT-4/environment.yml: -------------------------------------------------------------------------------- 1 | name: minigpt4 2 | channels: 3 | - pytorch 4 | - defaults 5 | - anaconda 6 | dependencies: 7 | - python=3.9 8 | - cudatoolkit 9 | - pip 10 | - pytorch=1.12.1 11 | - pytorch-mutex=1.0=cuda 12 | - torchaudio=0.12.1 13 | - torchvision=0.13.1 14 | - pip: 15 | - accelerate==0.16.0 16 | - aiohttp==3.8.4 17 | - aiosignal==1.3.1 18 | - async-timeout==4.0.2 19 | - attrs==22.2.0 20 | - bitsandbytes==0.37.0 21 | - cchardet==2.1.7 22 | - chardet==5.1.0 23 | - contourpy==1.0.7 24 | - cycler==0.11.0 25 | - filelock==3.9.0 26 | - fonttools==4.38.0 27 | - frozenlist==1.3.3 28 | - huggingface-hub==0.13.4 29 | - importlib-resources==5.12.0 30 | - kiwisolver==1.4.4 31 | - matplotlib==3.7.0 32 | - multidict==6.0.4 33 | - openai==0.27.0 34 | - packaging==23.0 35 | - psutil==5.9.4 36 | - pycocotools==2.0.6 37 | - pyparsing==3.0.9 38 | - python-dateutil==2.8.2 39 | - pyyaml==6.0 40 | - regex==2022.10.31 41 | - tokenizers==0.13.2 42 | - tqdm==4.64.1 43 | - transformers==4.28.0 44 | - timm==0.6.13 45 | - spacy==3.5.1 46 | - webdataset==0.2.48 47 | - scikit-learn==1.2.2 48 | - scipy==1.10.1 49 | - yarl==1.8.2 50 | - zipp==3.14.0 51 | - omegaconf==2.3.0 52 | - opencv-python==4.7.0.72 53 | - iopath==0.1.10 54 | - decord==0.6.0 55 | - tenacity==8.2.2 56 | - peft 57 | - pycocoevalcap 58 | - sentence-transformers 59 | - umap-learn 60 | - notebook 61 | - gradio==3.24.1 62 | - gradio-client==0.0.8 63 | - wandb 64 | - rake-nltk 65 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from minigpt4.common.registry import registry 14 | 15 | from minigpt4.datasets.builders import * 16 | from minigpt4.models import * 17 | from minigpt4.processors import * 18 | from minigpt4.tasks import * 19 | 20 | 21 | root_dir = os.path.dirname(os.path.abspath(__file__)) 22 | default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml")) 23 | 24 | registry.register_path("library_root", root_dir) 25 | repo_root = os.path.join(root_dir, "..") 26 | registry.register_path("repo_root", repo_root) 27 | cache_root = os.path.join(repo_root, default_cfg.env.cache_root) 28 | registry.register_path("cache_root", cache_root) 29 | 30 | registry.register("MAX_INT", sys.maxsize) 31 | registry.register("SPLIT_NAMES", ["train", "val", "test"]) 32 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/MiniGPT-4/minigpt4/common/__init__.py -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/common/gradcam.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | from scipy.ndimage import filters 4 | from skimage import transform as skimage_transform 5 | 6 | 7 | def getAttMap(img, attMap, blur=True, overlap=True): 8 | attMap -= attMap.min() 9 | if attMap.max() > 0: 10 | attMap /= attMap.max() 11 | attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant") 12 | if blur: 13 | attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2])) 14 | attMap -= attMap.min() 15 | attMap /= attMap.max() 16 | cmap = plt.get_cmap("jet") 17 | attMapV = cmap(attMap) 18 | attMapV = np.delete(attMapV, 3, 2) 19 | if overlap: 20 | attMap = ( 21 | 1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img 22 | + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV 23 | ) 24 | return attMap 25 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/common/vqa_tools/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | __author__ = "aagrawal" 9 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/configs/datasets/aokvqa/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | aok_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json 17 | storage: 18 | - aokvqa/annotations/aokvqa_v1p0_train.json 19 | val: 20 | url: 21 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json 22 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/specialized_vocab_train.json 23 | storage: 24 | - aokvqa/annotations/aokvqa_v1p0_val.json 25 | - aokvqa/annotations/specialized_vocab_train_lavis.json 26 | # - aokvqa/annotations/large_vocab_train_lavis.json 27 | test: 28 | url: 29 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json 30 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/specialized_vocab_train.json 31 | storage: 32 | - aokvqa/annotations/aokvqa_v1p0_test.json 33 | - aokvqa/annotations/specialized_vocab_train_lavis.json 34 | images: 35 | storage: coco/images/ 36 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/configs/datasets/aokvqa/eval_aokvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | aok_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | val: 15 | url: 16 | - aokvqa/annotations/aokvqa_v1p0_val.json 17 | - aokvqa/annotations/specialized_vocab_train_lavis.json 18 | storage: 19 | - aokvqa/annotations/aokvqa_v1p0_val.json 20 | - aokvqa/annotations/specialized_vocab_train_lavis.json 21 | # - aokvqa/annotations/large_vocab_train_lavis.json 22 | images: 23 | storage: coco/images/ 24 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/configs/datasets/cc_sbu/align.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | cc_sbu_align: 3 | data_type: images 4 | build_info: 5 | storage: /path/to/cc_sbu_align/ 6 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/configs/datasets/cc_sbu/defaults.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | cc_sbu: 3 | data_type: images 4 | build_info: 5 | storage: /path/to/cc_sbu_dataset/{00000..01255}.tar 6 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/configs/datasets/coco/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_caption: # name of the dataset builder 8 | dataset_card: dataset_card/coco_caption.md 9 | # data_dir: ${env.data_dir}/datasets 10 | data_type: images # [images|videos|features] 11 | 12 | build_info: 13 | # Be careful not to append minus sign (-) before split to avoid itemizing 14 | annotations: 15 | train: 16 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json 17 | md5: aa31ac474cf6250ebb81d18348a07ed8 18 | storage: coco/annotations/coco_karpathy_train.json 19 | val: 20 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json 21 | md5: b273847456ef5580e33713b1f7de52a0 22 | storage: coco/annotations/coco_karpathy_val.json 23 | test: 24 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json 25 | md5: 3ff34b0ef2db02d01c37399f6a2a6cd1 26 | storage: coco/annotations/coco_karpathy_test.json 27 | images: 28 | storage: coco/images/ 29 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/configs/datasets/coco/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_retrieval: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json 16 | md5: aa31ac474cf6250ebb81d18348a07ed8 17 | storage: coco/annotations/coco_karpathy_train.json 18 | val: 19 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json 20 | md5: b273847456ef5580e33713b1f7de52a0 21 | storage: coco/annotations/coco_karpathy_val.json 22 | test: 23 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json 24 | md5: 3ff34b0ef2db02d01c37399f6a2a6cd1 25 | storage: coco/annotations/coco_karpathy_test.json 26 | images: 27 | storage: coco/images/ 28 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/configs/datasets/coco/eval_vqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | val: 15 | url: 16 | - coco/annotations/vqa_val_eval.json 17 | - coco/annotations/answer_list.json 18 | - coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json 19 | - coco/annotations/v2_mscoco_val2014_annotations.json 20 | 21 | storage: 22 | - coco/annotations/vqa_val_eval.json 23 | - coco/annotations/answer_list.json 24 | - coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json 25 | - coco/annotations/v2_mscoco_val2014_annotations.json 26 | images: 27 | storage: coco/images/ 28 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/configs/datasets/laion/defaults.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | laion: 3 | data_type: images 4 | build_info: 5 | storage: /path/to/laion_dataset/{00000..10488}.tar 6 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/configs/default.yaml: -------------------------------------------------------------------------------- 1 | env: 2 | # For default users 3 | # cache_root: "cache" 4 | # For internal use with persistent storage 5 | cache_root: ".cache/lavis" 6 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/configs/models/minigpt4_llama2.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: mini_gpt4 3 | 4 | # vit encoder 5 | image_size: 224 6 | drop_path_rate: 0 7 | use_grad_checkpoint: False 8 | vit_precision: "fp16" 9 | freeze_vit: True 10 | has_qformer: False 11 | 12 | # generation configs 13 | prompt: "" 14 | 15 | llama_model: 'meta-llama/Llama-2-7b-chat-hf' 16 | 17 | preprocess: 18 | vis_processor: 19 | train: 20 | name: "blip2_image_train" 21 | image_size: 224 22 | eval: 23 | name: "blip2_image_eval" 24 | image_size: 224 25 | text_processor: 26 | train: 27 | name: "blip_caption" 28 | eval: 29 | name: "blip_caption" 30 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/configs/models/minigpt4_vicuna0.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: mini_gpt4 3 | 4 | # vit encoder 5 | image_size: 224 6 | drop_path_rate: 0 7 | use_grad_checkpoint: False 8 | vit_precision: "fp16" 9 | freeze_vit: True 10 | freeze_qformer: True 11 | 12 | # Q-Former 13 | num_query_token: 32 14 | 15 | # generation configs 16 | prompt: "" 17 | 18 | llama_model: "Vision-CAIR/vicuna-7b" 19 | 20 | preprocess: 21 | vis_processor: 22 | train: 23 | name: "blip2_image_train" 24 | image_size: 224 25 | eval: 26 | name: "blip2_image_eval" 27 | image_size: 224 28 | text_processor: 29 | train: 30 | name: "blip_caption" 31 | eval: 32 | name: "blip_caption" 33 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/conversation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/MiniGPT-4/minigpt4/conversation/__init__.py 
-------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/MiniGPT-4/minigpt4/datasets/__init__.py -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/datasets/builders/vqa_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from minigpt4.datasets.builders.base_dataset_builder import BaseDatasetBuilder 9 | 10 | from minigpt4.common.registry import registry 11 | from minigpt4.datasets.datasets.aok_vqa_datasets import AOKVQADataset, AOKVQAEvalDataset 12 | from minigpt4.datasets.datasets.coco_vqa_datasets import COCOVQADataset, COCOVQAEvalDataset 13 | 14 | 15 | @registry.register_builder("coco_vqa") 16 | class COCOVQABuilder(BaseDatasetBuilder): 17 | train_dataset_cls = COCOVQADataset 18 | eval_dataset_cls = COCOVQAEvalDataset 19 | 20 | DATASET_CONFIG_DICT = { 21 | "default": "configs/datasets/coco/defaults_vqa.yaml", 22 | "eval": "configs/datasets/coco/eval_vqa.yaml", 23 | } 24 | 25 | 26 | @registry.register_builder("ok_vqa") 27 | class OKVQABuilder(COCOVQABuilder): 28 | DATASET_CONFIG_DICT = { 29 | "default": "configs/datasets/okvqa/defaults.yaml", 30 | } 31 | 32 | 33 | @registry.register_builder("aok_vqa") 34 | class AOKVQABuilder(BaseDatasetBuilder): 35 | train_dataset_cls = AOKVQADataset 36 | eval_dataset_cls = AOKVQAEvalDataset 37 | 38 | DATASET_CONFIG_DICT = { 39 | "default": "configs/datasets/aokvqa/defaults.yaml", 40 | "eval": "configs/datasets/aokvqa/eval_aokvqa.yaml", 41 | } 42 | 43 | 44 | 45 | # @registry.register_builder("gqa") 46 | # class GQABuilder(BaseDatasetBuilder): 47 | # train_dataset_cls = GQADataset 48 | # eval_dataset_cls = GQAEvalDataset 49 | 50 | # DATASET_CONFIG_DICT = { 51 | # "default": "configs/datasets/gqa/defaults.yaml", 52 | # "balanced_val": "configs/datasets/gqa/balanced_val.yaml", 53 | # "balanced_testdev": "configs/datasets/gqa/balanced_testdev.yaml", 54 | # } -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/datasets/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/MiniGPT-4/minigpt4/datasets/datasets/__init__.py -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/datasets/datasets/base_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import json 9 | from typing import Iterable 10 | 11 | from torch.utils.data import Dataset, ConcatDataset 12 | from torch.utils.data.dataloader import default_collate 13 | 14 | 15 | class BaseDataset(Dataset): 16 | def __init__( 17 | self, vis_processor=None, text_processor=None, vis_root=None, ann_paths=[] 18 | ): 19 | """ 20 | vis_root (string): Root directory of images (e.g. coco/images/) 21 | ann_paths (list): paths to the annotation files 22 | """ 23 | self.vis_root = vis_root 24 | self.annotation = [] 25 | for ann_path in ann_paths: 26 | self.annotation.extend(json.load(open(ann_path, "r"))['annotations']) 27 | 28 | self.vis_processor = vis_processor 29 | self.text_processor = text_processor 30 | 31 | self._add_instance_ids() 32 | 33 | def __len__(self): 34 | return len(self.annotation) 35 | 36 | def collater(self, samples): 37 | return default_collate(samples) 38 | 39 | def set_processors(self, vis_processor, text_processor): 40 | self.vis_processor = vis_processor 41 | self.text_processor = text_processor 42 | 43 | def _add_instance_ids(self, key="instance_id"): 44 | for idx, ann in enumerate(self.annotation): 45 | ann[key] = str(idx) 46 | 47 | 48 | class ConcatDataset(ConcatDataset): 49 | def __init__(self, datasets: Iterable[Dataset]) -> None: 50 | super().__init__(datasets) 51 | 52 | def collater(self, samples): 53 | # TODO For now only supports datasets with same underlying collater implementations 54 | 55 | all_keys = set() 56 | for s in samples: 57 | all_keys.update(s) 58 | 59 | shared_keys = all_keys 60 | for s in samples: 61 | shared_keys = shared_keys & set(s.keys()) 62 | 63 | samples_shared_keys = [] 64 | for s in samples: 65 | samples_shared_keys.append({k: s[k] for k in s.keys() if k in shared_keys}) 66 | 67 | return self.datasets[0].collater(samples_shared_keys) 68 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/datasets/datasets/cc_sbu_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | from PIL import Image 3 | import webdataset as wds 4 | from minigpt4.datasets.datasets.base_dataset import BaseDataset 5 | from minigpt4.datasets.datasets.caption_datasets import CaptionDataset 6 | 7 | 8 | class CCSBUDataset(BaseDataset): 9 | def __init__(self, vis_processor, text_processor, location): 10 | super().__init__(vis_processor=vis_processor, text_processor=text_processor) 11 | 12 | self.inner_dataset = wds.DataPipeline( 13 | wds.ResampledShards(location), 14 | wds.tarfile_to_samples(handler=wds.warn_and_continue), 15 | wds.shuffle(1000, handler=wds.warn_and_continue), 16 | wds.decode("pilrgb", handler=wds.warn_and_continue), 17 | wds.to_tuple("jpg", "json", handler=wds.warn_and_continue), 18 | wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue), 19 | wds.map(self.to_dict, handler=wds.warn_and_continue), 20 | ) 21 | 22 | def to_dict(self, sample): 23 | return { 24 | "image": sample[0], 25 | "answer": self.text_processor(sample[1]["caption"]), 26 | } 27 | 28 | 29 | class CCSBUAlignDataset(CaptionDataset): 30 | 31 | def __getitem__(self, index): 32 | 33 | # TODO this assumes image input, not general enough 34 | ann = self.annotation[index] 35 | 36 | img_file = '{}.jpg'.format(ann["image_id"]) 37 | image_path = os.path.join(self.vis_root, img_file) 38 | image = 
Image.open(image_path).convert("RGB") 39 | 40 | image = self.vis_processor(image) 41 | caption = ann["caption"] 42 | 43 | return { 44 | "image": image, 45 | "answer": caption, 46 | "image_id": self.img_ids[ann["image_id"]], 47 | } -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/datasets/datasets/laion_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import webdataset as wds 9 | from minigpt4.datasets.datasets.base_dataset import BaseDataset 10 | 11 | 12 | class LaionDataset(BaseDataset): 13 | def __init__(self, vis_processor, text_processor, location): 14 | super().__init__(vis_processor=vis_processor, text_processor=text_processor) 15 | 16 | self.inner_dataset = wds.DataPipeline( 17 | wds.ResampledShards(location), 18 | wds.tarfile_to_samples(handler=wds.warn_and_continue), 19 | wds.shuffle(1000, handler=wds.warn_and_continue), 20 | wds.decode("pilrgb", handler=wds.warn_and_continue), 21 | wds.to_tuple("jpg", "json", handler=wds.warn_and_continue), 22 | wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue), 23 | wds.map(self.to_dict, handler=wds.warn_and_continue), 24 | ) 25 | 26 | def to_dict(self, sample): 27 | return { 28 | "image": sample[0], 29 | "answer": self.text_processor(sample[1]["caption"]), 30 | } 31 | 32 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/datasets/datasets/vqa_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import torch 9 | 10 | from minigpt4.datasets.datasets.base_dataset import BaseDataset 11 | 12 | 13 | class VQADataset(BaseDataset): 14 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 15 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 16 | 17 | def collater(self, samples): 18 | image_list, question_list, answer_list, weight_list = [], [], [], [] 19 | 20 | num_answers = [] 21 | 22 | for sample in samples: 23 | image_list.append(sample["image"]) 24 | question_list.append(sample["text_input"]) 25 | 26 | weight_list.extend(sample["weights"]) 27 | 28 | answers = sample["answers"] 29 | 30 | answer_list.extend(answers) 31 | num_answers.append(len(answers)) 32 | 33 | return { 34 | "image": torch.stack(image_list, dim=0), 35 | "text_input": question_list, 36 | "answer": answer_list, 37 | "weight": torch.Tensor(weight_list), 38 | "n_answers": torch.LongTensor(num_answers), 39 | } 40 | 41 | 42 | class VQAEvalDataset(BaseDataset): 43 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 44 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 45 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/processors/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from minigpt4.processors.base_processor import BaseProcessor 9 | from minigpt4.processors.blip_processors import ( 10 | Blip2ImageTrainProcessor, 11 | Blip2ImageEvalProcessor, 12 | BlipCaptionProcessor, 13 | ) 14 | 15 | from minigpt4.common.registry import registry 16 | 17 | __all__ = [ 18 | "BaseProcessor", 19 | "Blip2ImageTrainProcessor", 20 | "Blip2ImageEvalProcessor", 21 | "BlipCaptionProcessor", 22 | ] 23 | 24 | 25 | def load_processor(name, cfg=None): 26 | """ 27 | Example 28 | 29 | >>> processor = load_processor("alpro_video_train", cfg=None) 30 | """ 31 | processor = registry.get_processor_class(name).from_config(cfg) 32 | 33 | return processor 34 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/processors/base_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from omegaconf import OmegaConf 9 | 10 | 11 | class BaseProcessor: 12 | def __init__(self): 13 | self.transform = lambda x: x 14 | return 15 | 16 | def __call__(self, item): 17 | return self.transform(item) 18 | 19 | @classmethod 20 | def from_config(cls, cfg=None): 21 | return cls() 22 | 23 | def build(self, **kwargs): 24 | cfg = OmegaConf.create(kwargs) 25 | 26 | return self.from_config(cfg) 27 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/projects/minigpt4/conv_direct_aokvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: mini_gpt4 8 | model_type: pretrain_vicuna0 9 | end_sym: "###" 10 | max_txt_len: 256 11 | ckpt: '.cache/hub/models--minigpt4--vicuna-7b/pretrained_minigpt4_vicuna_7b.pth' 12 | use_grad_checkpoint: False 13 | answer_refinement_prompt: " Assistant: {}\n###Human: Shorten your answer to the question as much as possible, preferably only 1 word."
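# NOTE (assumed semantics, inferred from the surrounding keys): the refinement prompt above drives a second conversational turn in which the model's free-form answer replaces {} and the model is asked to shorten it to a VQA-style answer; conversation: True below enables this two-turn flow.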
14 | prompt_template: "### Human: ### Human: Based on the image, answer the question below.\nQuestion: {}" 15 | process_answer: True 16 | answer_processor: 'aok-vqa' 17 | conversation: True 18 | multiple_choice: False 19 | 20 | keyword_pipeline: True 21 | reason: True 22 | paraphrase: False 23 | 24 | perform_selection: False 25 | selection_criterion: 'Aconf' 26 | perform_ensembling: False 27 | 28 | ext_paraphrase: False 29 | par_num_beams: 5 30 | num_add_candidates: 4 31 | verbose: True 32 | alt_device: 0 33 | 34 | datasets: 35 | aok_vqa: # name of the dataset builder 36 | type: eval 37 | vis_processor: 38 | eval: 39 | name: "blip2_image_eval" 40 | image_size: 224 41 | text_processor: 42 | eval: 43 | name: "blip_question" 44 | build_info: 45 | images: 46 | storage: '.cache/lavis/coco/images/' 47 | 48 | run: 49 | task: aok_vqa 50 | # optimization-specific 51 | batch_size_train: 16 52 | batch_size_eval: 5 53 | num_workers: 4 54 | 55 | # inference-specific 56 | max_len: 30 57 | min_len: 1 58 | num_beams: 5 59 | inference_method: "generate" 60 | 61 | seed: 42 62 | output_dir: "output/Vicuna7B/AOK-VQA" 63 | 64 | evaluate: True 65 | test_splits: ["val"] 66 | 67 | # distribution-specific 68 | device: "cuda" 69 | world_size: 1 70 | dist_url: "env://" 71 | distributed: True 72 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/projects/minigpt4/conv_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | 7 | model: 8 | arch: mini_gpt4 9 | model_type: pretrain_vicuna0 10 | end_sym: "###" 11 | max_txt_len: 200 12 | ckpt: '.cache/hub/models--minigpt4--vicuna-7b/pretrained_minigpt4_vicuna_7b.pth' 13 | use_grad_checkpoint: False 14 | answer_refinement_prompt: " Assistant: {}\n###Human: Shorten your answer to the question as much as possible, preferably only 1 word."
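# NOTE (assumed semantics, inferred from the flag names): keyword_pipeline and reason below turn on RepARe's keyword extraction and rationale generation for building rephrased question candidates, num_add_candidates sets how many extra candidates are generated per question, and perform_selection with selection_criterion 'Aconf' would keep the candidate with the highest answer confidence when enabled.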
15 | prompt_template: "### Human: ### Human: Based on the image, answer the question below.\nQuestion: {}" 16 | process_answer: True 17 | answer_processor: 'vqa' 18 | conversation: True 19 | 20 | ext_paraphrase: False 21 | par_num_beams: 5 22 | num_add_candidates: 4 23 | 24 | keyword_pipeline: True 25 | reason: True 26 | paraphrase: False 27 | 28 | perform_selection: False 29 | selection_criterion: 'Aconf' 30 | perform_ensembling: False 31 | 32 | verbose: True 33 | alt_device: 0 34 | 35 | datasets: 36 | coco_vqa: # name of the dataset builder 37 | type: eval 38 | vis_processor: 39 | eval: 40 | name: "blip2_image_eval" 41 | image_size: 224 42 | text_processor: 43 | eval: 44 | name: "blip_question" 45 | build_info: 46 | images: 47 | storage: '.cache/lavis/coco/images/' 48 | 49 | run: 50 | task: vqa 51 | # optimization-specific 52 | batch_size_train: 16 53 | batch_size_eval: 4 54 | num_workers: 4 55 | 56 | # inference-specific 57 | max_len: 50 58 | min_len: 1 59 | num_beams: 5 60 | inference_method: "generate" 61 | 62 | seed: 42 63 | output_dir: "output/Vicuna7B/VQA" 64 | 65 | evaluate: True 66 | test_splits: ["val"] 67 | 68 | # distribution-specific 69 | device: "cuda" 70 | world_size: 1 71 | dist_url: "env://" 72 | distributed: True 73 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/runners/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from minigpt4.runners.runner_base import RunnerBase 9 | 10 | __all__ = ["RunnerBase"] 11 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from minigpt4.common.registry import registry 9 | from minigpt4.tasks.base_task import BaseTask 10 | from minigpt4.tasks.image_text_pretrain import ImageTextPretrainTask 11 | from minigpt4.tasks.vqa import VQATask, AOKVQATask, MultiChoiceAOKVQATask 12 | 13 | 14 | def setup_task(cfg): 15 | assert "task" in cfg.run_cfg, "Task name must be provided." 16 | 17 | task_name = cfg.run_cfg.task 18 | task = registry.get_task_class(task_name).setup_task(cfg=cfg) 19 | assert task is not None, "Task {} not properly registered.".format(task_name) 20 | 21 | return task 22 | 23 | 24 | __all__ = [ 25 | "BaseTask", 26 | "ImageTextPretrainTask", 27 | "AOKVQATask", 28 | "VQATask", 29 | "MultiChoiceAOKVQATask", 30 | ] 31 | -------------------------------------------------------------------------------- /MiniGPT-4/minigpt4/tasks/image_text_pretrain.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from minigpt4.common.registry import registry 9 | from minigpt4.tasks.base_task import BaseTask 10 | 11 | 12 | @registry.register_task("image_text_pretrain") 13 | class ImageTextPretrainTask(BaseTask): 14 | def __init__(self): 15 | super().__init__() 16 | 17 | def evaluation(self, model, data_loader, cuda_enabled=True): 18 | pass 19 | -------------------------------------------------------------------------------- /assets/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/assets/README.md -------------------------------------------------------------------------------- /assets/intro.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/assets/intro.png -------------------------------------------------------------------------------- /assets/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/assets/pipeline.png --------------------------------------------------------------------------------