├── .gitignore
├── LICENSE
├── Lavis
│   ├── evaluate.py
│   ├── lavis
│   │   ├── __init__.py
│   │   ├── common
│   │   │   ├── config.py
│   │   │   ├── dist_utils.py
│   │   │   ├── gradcam.py
│   │   │   ├── logger.py
│   │   │   ├── optims.py
│   │   │   ├── registry.py
│   │   │   ├── utils.py
│   │   │   └── vqa_tools
│   │   │       ├── __init__.py
│   │   │       ├── vqa.py
│   │   │       └── vqa_eval.py
│   │   ├── configs
│   │   │   ├── datasets
│   │   │   │   ├── aokvqa
│   │   │   │   │   ├── defaults.yaml
│   │   │   │   │   └── eval_aokvqa.yaml
│   │   │   │   ├── avsd
│   │   │   │   │   └── defaults_dial.yaml
│   │   │   │   ├── coco
│   │   │   │   │   ├── defaults_cap.yaml
│   │   │   │   │   ├── defaults_ret.yaml
│   │   │   │   │   ├── defaults_vqa.yaml
│   │   │   │   │   └── eval_vqa.yaml
│   │   │   │   ├── conceptual_caption
│   │   │   │   │   ├── defaults_12m.yaml
│   │   │   │   │   └── defaults_3m.yaml
│   │   │   │   ├── didemo
│   │   │   │   │   └── defaults_ret.yaml
│   │   │   │   ├── flickr30k
│   │   │   │   │   └── defaults.yaml
│   │   │   │   ├── gqa
│   │   │   │   │   ├── balanced_testdev.yaml
│   │   │   │   │   ├── balanced_val.yaml
│   │   │   │   │   └── defaults.yaml
│   │   │   │   ├── imagenet
│   │   │   │   │   └── defaults.yaml
│   │   │   │   ├── laion
│   │   │   │   │   └── defaults_2B_multi.yaml
│   │   │   │   ├── msrvtt
│   │   │   │   │   ├── defaults_cap.yaml
│   │   │   │   │   ├── defaults_qa.yaml
│   │   │   │   │   └── defaults_ret.yaml
│   │   │   │   ├── msvd
│   │   │   │   │   ├── defaults_cap.yaml
│   │   │   │   │   └── defaults_qa.yaml
│   │   │   │   ├── nlvr
│   │   │   │   │   └── defaults.yaml
│   │   │   │   ├── nocaps
│   │   │   │   │   └── defaults.yaml
│   │   │   │   ├── okvqa
│   │   │   │   │   └── defaults.yaml
│   │   │   │   ├── sbu_caption
│   │   │   │   │   └── defaults.yaml
│   │   │   │   ├── snli_ve
│   │   │   │   │   └── defaults.yaml
│   │   │   │   ├── vatex
│   │   │   │   │   └── defaults_cap.yaml
│   │   │   │   └── vg
│   │   │   │       ├── defaults_caption.yaml
│   │   │   │       └── defaults_vqa.yaml
│   │   │   ├── default.yaml
│   │   │   └── models
│   │   │       ├── albef_classification_ve.yaml
│   │   │       ├── albef_feature_extractor.yaml
│   │   │       ├── albef_nlvr.yaml
│   │   │       ├── albef_pretrain_base.yaml
│   │   │       ├── albef_retrieval_coco.yaml
│   │   │       ├── albef_retrieval_flickr.yaml
│   │   │       ├── albef_vqav2.yaml
│   │   │       ├── alpro_qa_msrvtt.yaml
│   │   │       ├── alpro_qa_msvd.yaml
│   │   │       ├── alpro_retrieval_didemo.yaml
│   │   │       ├── alpro_retrieval_msrvtt.yaml
│   │   │       ├── bert_config.json
│   │   │       ├── bert_config_alpro.json
│   │   │       ├── blip2
│   │   │       │   ├── blip2_caption_flant5xl.yaml
│   │   │       │   ├── blip2_caption_opt2.7b.yaml
│   │   │       │   ├── blip2_caption_opt6.7b.yaml
│   │   │       │   ├── blip2_coco.yaml
│   │   │       │   ├── blip2_pretrain.yaml
│   │   │       │   ├── blip2_pretrain_flant5xl.yaml
│   │   │       │   ├── blip2_pretrain_flant5xl_vitL.yaml
│   │   │       │   ├── blip2_pretrain_flant5xxl.yaml
│   │   │       │   ├── blip2_pretrain_opt2.7b.yaml
│   │   │       │   ├── blip2_pretrain_opt6.7b.yaml
│   │   │       │   └── blip2_pretrain_vitL.yaml
│   │   │       ├── blip_caption_base_coco.yaml
│   │   │       ├── blip_caption_large_coco.yaml
│   │   │       ├── blip_classification_base.yaml
│   │   │       ├── blip_feature_extractor_base.yaml
│   │   │       ├── blip_itm_base.yaml
│   │   │       ├── blip_itm_large.yaml
│   │   │       ├── blip_nlvr.yaml
│   │   │       ├── blip_pretrain_base.yaml
│   │   │       ├── blip_pretrain_large.yaml
│   │   │       ├── blip_retrieval_coco.yaml
│   │   │       ├── blip_retrieval_flickr.yaml
│   │   │       ├── blip_vqa_aokvqa.yaml
│   │   │       ├── blip_vqa_okvqa.yaml
│   │   │       ├── blip_vqav2.yaml
│   │   │       ├── clip
│   │   │       │   ├── RN101-quickgelu.json
│   │   │       │   ├── RN101.json
│   │   │       │   ├── RN50-quickgelu.json
│   │   │       │   ├── RN50.json
│   │   │       │   ├── RN50x16.json
│   │   │       │   ├── RN50x4.json
│   │   │       │   ├── ViT-B-16-plus-240.json
│   │   │       │   ├── ViT-B-16-plus.json
│   │   │       │   ├── ViT-B-16.json
│   │   │       │   ├── ViT-B-32-plus-256.json
│   │   │       │   ├── ViT-B-32-quickgelu.json
│   │   │       │   ├── ViT-B-32.json
│   │   │       │   ├── ViT-H-14.json
│   │   │       │   ├── ViT-H-16.json
│   │   │       │   ├── ViT-L-14-280.json
│   │   │       │   ├── ViT-L-14-336.json
│   │   │       │   ├── ViT-L-14.json
│   │   │       │   ├── ViT-L-16-320.json
│   │   │       │   ├── ViT-L-16.json
│   │   │       │   ├── ViT-g-14.json
│   │   │       │   ├── timm-efficientnetv2_rw_s.json
│   │   │       │   ├── timm-resnet50d.json
│   │   │       │   ├── timm-resnetaa50d.json
│   │   │       │   ├── timm-resnetblur50.json
│   │   │       │   ├── timm-swin_base_patch4_window7_224.json
│   │   │       │   ├── timm-vit_base_patch16_224.json
│   │   │       │   ├── timm-vit_base_patch32_224.json
│   │   │       │   └── timm-vit_small_patch16_224.json
│   │   │       ├── clip_resnet50.yaml
│   │   │       ├── clip_vit_base16.yaml
│   │   │       ├── clip_vit_base32.yaml
│   │   │       ├── clip_vit_large14.yaml
│   │   │       ├── clip_vit_large14_336.yaml
│   │   │       ├── gpt_dialogue_base.yaml
│   │   │       ├── img2prompt-vqa
│   │   │       │   └── img2prompt_vqa_base.yaml
│   │   │       ├── med_config.json
│   │   │       ├── med_config_albef.json
│   │   │       ├── med_large_config.json
│   │   │       └── pnp-vqa
│   │   │           ├── pnp_vqa_3b.yaml
│   │   │           ├── pnp_vqa_base.yaml
│   │   │           ├── pnp_vqa_large.yaml
│   │   │           ├── unifiedqav2_3b_config.json
│   │   │           ├── unifiedqav2_base_config.json
│   │   │           └── unifiedqav2_large_config.json
│   │   ├── datasets
│   │   │   ├── builders
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base_dataset_builder.py
│   │   │   │   ├── caption_builder.py
│   │   │   │   ├── classification_builder.py
│   │   │   │   ├── dialogue_builder.py
│   │   │   │   ├── image_text_pair_builder.py
│   │   │   │   ├── imagefolder_builder.py
│   │   │   │   ├── retrieval_builder.py
│   │   │   │   ├── video_qa_builder.py
│   │   │   │   └── vqa_builder.py
│   │   │   ├── data_utils.py
│   │   │   ├── datasets
│   │   │   │   ├── aok_vqa_datasets.py
│   │   │   │   ├── avsd_dialogue_datasets.py
│   │   │   │   ├── base_dataset.py
│   │   │   │   ├── caption_datasets.py
│   │   │   │   ├── coco_caption_datasets.py
│   │   │   │   ├── coco_vqa_datasets.py
│   │   │   │   ├── dataloader_utils.py
│   │   │   │   ├── dialogue_datasets.py
│   │   │   │   ├── gqa_datasets.py
│   │   │   │   ├── image_text_pair_datasets.py
│   │   │   │   ├── imagefolder_dataset.py
│   │   │   │   ├── laion_dataset.py
│   │   │   │   ├── multimodal_classification_datasets.py
│   │   │   │   ├── nlvr_datasets.py
│   │   │   │   ├── retrieval_datasets.py
│   │   │   │   ├── snli_ve_datasets.py
│   │   │   │   ├── vg_vqa_datasets.py
│   │   │   │   ├── video_caption_datasets.py
│   │   │   │   ├── video_vqa_datasets.py
│   │   │   │   └── vqa_datasets.py
│   │   │   └── download_scripts
│   │   │       ├── DownloadConceptualCaptions
│   │   │       │   ├── LICENSE
│   │   │       │   ├── README.md
│   │   │       │   ├── create_annotation_12m.ipynb
│   │   │       │   ├── create_annotation_3m.ipynb
│   │   │       │   ├── download_data_cc12m.py
│   │   │       │   └── download_data_cc3m.py
│   │   │       ├── download_coco.py
│   │   │       ├── download_didemo.py
│   │   │       ├── download_flickr.py
│   │   │       ├── download_gqa.py
│   │   │       ├── download_msrvtt.py
│   │   │       ├── download_msvd.py
│   │   │       ├── download_nocaps.py
│   │   │       ├── download_sbu.py
│   │   │       └── download_vg.py
│   │   ├── models
│   │   │   ├── __init__.py
│   │   │   ├── albef_models
│   │   │   │   ├── __init__.py
│   │   │   │   ├── albef_classification.py
│   │   │   │   ├── albef_feature_extractor.py
│   │   │   │   ├── albef_nlvr.py
│   │   │   │   ├── albef_outputs.py
│   │   │   │   ├── albef_pretrain.py
│   │   │   │   ├── albef_retrieval.py
│   │   │   │   └── albef_vqa.py
│   │   │   ├── alpro_models
│   │   │   │   ├── __init__.py
│   │   │   │   ├── alpro_outputs.py
│   │   │   │   ├── alpro_qa.py
│   │   │   │   └── alpro_retrieval.py
│   │   │   ├── base_model.py
│   │   │   ├── blip2_models
│   │   │   │   ├── Qformer.py
│   │   │   │   ├── __init__.py
│   │   │   │   ├── blip2.py
│   │   │   │   ├── blip2_image_text_matching.py
│   │   │   │   ├── blip2_opt.py
│   │   │   │   ├── blip2_qformer.py
│   │   │   │   ├── blip2_t5.py
│   │   │   │   ├── blip2_t5_par.py
│   │   │   │   ├── modeling_opt.py
│   │   │   │   └── modeling_t5.py
│   │   │   ├── blip_models
│   │   │   │   ├── __init__.py
│   │   │   │   ├── blip.py
│   │   │   │   ├── blip_caption.py
│   │   │   │   ├── blip_classification.py
│   │   │   │   ├── blip_feature_extractor.py
│   │   │   │   ├── blip_image_text_matching.py
│   │   │   │   ├── blip_nlvr.py
│   │   │   │   ├── blip_outputs.py
│   │   │   │   ├── blip_pretrain.py
│   │   │   │   ├── blip_retrieval.py
│   │   │   │   ├── blip_vqa.py
│   │   │   │   └── nlvr_encoder.py
│   │   │   ├── clip_models
│   │   │   │   ├── __init__.py
│   │   │   │   ├── bpe_simple_vocab_16e6.txt.gz
│   │   │   │   ├── clip_outputs.py
│   │   │   │   ├── loss.py
│   │   │   │   ├── model.py
│   │   │   │   ├── pics
│   │   │   │   │   └── CLIP.png
│   │   │   │   ├── pretrained.py
│   │   │   │   ├── timm_model.py
│   │   │   │   ├── tokenizer.py
│   │   │   │   ├── transform.py
│   │   │   │   └── utils.py
│   │   │   ├── clip_vit.py
│   │   │   ├── eva_vit.py
│   │   │   ├── gpt_models
│   │   │   │   └── gpt_dialogue.py
│   │   │   ├── img2prompt_models
│   │   │   │   ├── __init__.py
│   │   │   │   └── img2prompt_vqa.py
│   │   │   ├── med.py
│   │   │   ├── pnp_vqa_models
│   │   │   │   ├── __init__.py
│   │   │   │   ├── pnp_unifiedqav2_fid.py
│   │   │   │   └── pnp_vqa.py
│   │   │   ├── timesformer
│   │   │   │   ├── __init__.py
│   │   │   │   ├── conv2d_same.py
│   │   │   │   ├── features.py
│   │   │   │   ├── helpers.py
│   │   │   │   ├── linear.py
│   │   │   │   ├── vit.py
│   │   │   │   └── vit_utils.py
│   │   │   └── vit.py
│   │   ├── processors
│   │   │   ├── __init__.py
│   │   │   ├── alpro_processors.py
│   │   │   ├── base_processor.py
│   │   │   ├── blip_processors.py
│   │   │   ├── clip_processors.py
│   │   │   ├── functional_video.py
│   │   │   ├── gpt_processors.py
│   │   │   ├── randaugment.py
│   │   │   └── transforms_video.py
│   │   ├── projects
│   │   │   └── blip2
│   │   │       ├── direct_aokvqa_zeroshot_flant5xl_eval.yaml
│   │   │       ├── mc_aokvqa_zeroshot_flant5xl_eval.yaml
│   │   │       └── vqav2_zeroshot_flant5xl_eval.yaml
│   │   ├── runners
│   │   │   ├── __init__.py
│   │   │   ├── runner_base.py
│   │   │   └── runner_iter.py
│   │   └── tasks
│   │       ├── __init__.py
│   │       ├── base_task.py
│   │       ├── captioning.py
│   │       ├── dialogue.py
│   │       ├── image_text_pretrain.py
│   │       ├── multimodal_classification.py
│   │       ├── retrieval.py
│   │       ├── vqa.py
│   │       └── vqa_reading_comprehension.py
│   ├── requirements.txt
│   ├── setup.py
│   └── train.py
├── MiniGPT-4
│   ├── demo.py
│   ├── environment.yml
│   ├── evaluate.py
│   ├── minigpt4
│   │   ├── __init__.py
│   │   ├── common
│   │   │   ├── __init__.py
│   │   │   ├── config.py
│   │   │   ├── dist_utils.py
│   │   │   ├── gradcam.py
│   │   │   ├── logger.py
│   │   │   ├── optims.py
│   │   │   ├── registry.py
│   │   │   ├── utils.py
│   │   │   └── vqa_tools
│   │   │       ├── __init__.py
│   │   │       ├── vqa.py
│   │   │       └── vqa_eval.py
│   │   ├── configs
│   │   │   ├── datasets
│   │   │   │   ├── aokvqa
│   │   │   │   │   ├── defaults.yaml
│   │   │   │   │   └── eval_aokvqa.yaml
│   │   │   │   ├── cc_sbu
│   │   │   │   │   ├── align.yaml
│   │   │   │   │   └── defaults.yaml
│   │   │   │   ├── coco
│   │   │   │   │   ├── defaults_cap.yaml
│   │   │   │   │   ├── defaults_ret.yaml
│   │   │   │   │   ├── defaults_vqa.yaml
│   │   │   │   │   └── eval_vqa.yaml
│   │   │   │   └── laion
│   │   │   │       └── defaults.yaml
│   │   │   ├── default.yaml
│   │   │   └── models
│   │   │       ├── minigpt4_llama2.yaml
│   │   │       └── minigpt4_vicuna0.yaml
│   │   ├── conversation
│   │   │   ├── __init__.py
│   │   │   └── conversation.py
│   │   ├── datasets
│   │   │   ├── __init__.py
│   │   │   ├── builders
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base_dataset_builder.py
│   │   │   │   ├── image_text_pair_builder.py
│   │   │   │   └── vqa_builder.py
│   │   │   ├── data_utils.py
│   │   │   └── datasets
│   │   │       ├── __init__.py
│   │   │       ├── aok_vqa_datasets.py
│   │   │       ├── base_dataset.py
│   │   │       ├── caption_datasets.py
│   │   │       ├── cc_sbu_dataset.py
│   │   │       ├── coco_vqa_datasets.py
│   │   │       ├── dataloader_utils.py
│   │   │       ├── laion_dataset.py
│   │   │       ├── old_dataloader_utils.py
│   │   │       └── vqa_datasets.py
│   │   ├── models
│   │   │   ├── Qformer.py
│   │   │   ├── __init__.py
│   │   │   ├── base_model.py
│   │   │   ├── blip2.py
│   │   │   ├── blip2_outputs.py
│   │   │   ├── eva_vit.py
│   │   │   ├── mini_gpt4.py
│   │   │   └── modeling_llama.py
│   │   ├── processors
│   │   │   ├── __init__.py
│   │   │   ├── base_processor.py
│   │   │   ├── blip_processors.py
│   │   │   └── randaugment.py
│   │   ├── projects
│   │   │   └── minigpt4
│   │   │       ├── conv_direct_aokvqa.yaml
│   │   │       ├── conv_mc_aokvqa.yaml
│   │   │       └── conv_vqav2.yaml
│   │   ├── runners
│   │   │   ├── __init__.py
│   │   │   └── runner_base.py
│   │   └── tasks
│   │       ├── __init__.py
│   │       ├── base_task.py
│   │       ├── image_text_pretrain.py
│   │       └── vqa.py
│   └── train.py
├── README.md
└── assets
    ├── README.md
    ├── intro.png
    └── pipeline.png
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Archiki Prasad
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Lavis/lavis/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | import os
9 | import sys
10 |
11 | from omegaconf import OmegaConf
12 |
13 | from lavis.common.registry import registry
14 |
15 | from lavis.datasets.builders import *
16 | from lavis.models import *
17 | from lavis.processors import *
18 | from lavis.tasks import *
19 |
20 |
21 | root_dir = os.path.dirname(os.path.abspath(__file__))
22 | default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml"))
23 |
24 | registry.register_path("library_root", root_dir)
25 | repo_root = os.path.join(root_dir, "..")
26 | registry.register_path("repo_root", repo_root)
27 | cache_root = os.path.join(repo_root, default_cfg.env.cache_root)
28 | registry.register_path("cache_root", cache_root)
29 |
30 | registry.register("MAX_INT", sys.maxsize)
31 | registry.register("SPLIT_NAMES", ["train", "val", "test"])
32 |
--------------------------------------------------------------------------------
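A minimal usage sketch, not part of the repo: importing lavis executes the registration above, after which the shared registry can be queried. The registry.get and registry.get_path accessors are assumed from lavis/common/registry.py.

    import lavis  # triggers builder/model/processor/task registration on import
    from lavis.common.registry import registry

    print(registry.get_path("cache_root"))  # e.g. <repo_root>/.cache/lavis
    print(registry.get("SPLIT_NAMES"))      # ["train", "val", "test"]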
/Lavis/lavis/common/gradcam.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from matplotlib import pyplot as plt
3 | from scipy.ndimage import filters
4 | from skimage import transform as skimage_transform
5 |
6 |
7 | def getAttMap(img, attMap, blur=True, overlap=True):
8 | attMap -= attMap.min()
9 | if attMap.max() > 0:
10 | attMap /= attMap.max()
11 | attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant")
12 | if blur:
13 | attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2]))
14 | attMap -= attMap.min()
15 | attMap /= attMap.max()
16 | cmap = plt.get_cmap("jet")
17 | attMapV = cmap(attMap)
18 | attMapV = np.delete(attMapV, 3, 2)
19 | if overlap:
20 | attMap = (
21 | 1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img
22 | + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV
23 | )
24 | return attMap
25 |
--------------------------------------------------------------------------------
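A short illustrative call to getAttMap, not from the repo: it expects a float RGB image in [0, 1] and a coarse attention map, resizes the map to the image, optionally blurs it, and blends it with a jet colormap. The arrays below are stand-ins for real model outputs.

    import numpy as np
    from lavis.common.gradcam import getAttMap

    img = np.random.rand(224, 224, 3)  # stand-in for a normalized RGB image
    att = np.random.rand(12, 12)       # stand-in for a patch-level attention map
    overlay = getAttMap(img, att, blur=True, overlap=True)
    print(overlay.shape)               # (224, 224, 3): heatmap blended onto img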
/Lavis/lavis/common/vqa_tools/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | __author__ = "aagrawal"
9 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/aokvqa/defaults.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | aok_vqa:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url:
16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json
17 | storage:
18 | - aokvqa/annotations/aokvqa_v1p0_train.json
19 | val:
20 | url:
21 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
22 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/specialized_vocab_train.json
23 | storage:
24 | - aokvqa/annotations/aokvqa_v1p0_val.json
25 | - aokvqa/annotations/specialized_vocab_train_lavis.json
26 | # - aokvqa/annotations/large_vocab_train_lavis.json
27 | test:
28 | url:
29 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json
30 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/specialized_vocab_train.json
31 | storage:
32 | - aokvqa/annotations/aokvqa_v1p0_test.json
33 | - aokvqa/annotations/specialized_vocab_train_lavis.json
34 | images:
35 | storage: coco/images/
36 |
--------------------------------------------------------------------------------
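For orientation, an illustration rather than repo code: each url above is fetched to its matching storage path, and relative storage paths are resolved against the registered cache root (the download logic itself lives in lavis/datasets/builders/base_dataset_builder.py). The train annotation therefore lands roughly as follows.

    import os
    from lavis.common.registry import registry

    cache_root = registry.get_path("cache_root")           # set in lavis/__init__.py
    storage = "aokvqa/annotations/aokvqa_v1p0_train.json"  # from the YAML above
    print(os.path.join(cache_root, storage))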
/Lavis/lavis/configs/datasets/aokvqa/eval_aokvqa.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | aok_vqa:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | val:
15 | url:
16 | - aokvqa/annotations/aokvqa_v1p0_val.json
17 | - aokvqa/annotations/specialized_vocab_train_lavis.json
18 | storage:
19 | - aokvqa/annotations/aokvqa_v1p0_val.json
20 | - aokvqa/annotations/specialized_vocab_train_lavis.json
21 | images:
22 | storage: coco/images/
23 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/avsd/defaults_dial.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | avsd_dialogue: # name of the dataset builder
8 | dataset_card: dataset_card/avsd_dialogue.md
9 | data_type: features # extracted features of videos (I3D, VGGish) # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_train.json
16 | storage: avsd/annotations/train.json
17 | val:
18 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_val.json
19 | storage: avsd/annotations/val.json
20 | test:
21 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_test.json
22 | storage: avsd/annotations/test.json
23 | features:
24 | storage: avsd/features/
25 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/coco/defaults_cap.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | coco_caption: # name of the dataset builder
8 | dataset_card: dataset_card/coco_caption.md
9 | # data_dir: ${env.data_dir}/datasets
10 | data_type: images # [images|videos|features]
11 |
12 | build_info:
13 | # Be careful not to append minus sign (-) before split to avoid itemizing
14 | annotations:
15 | train:
16 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
17 | md5: aa31ac474cf6250ebb81d18348a07ed8
18 | storage: coco/annotations/coco_karpathy_train.json
19 | val:
20 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
21 | md5: b273847456ef5580e33713b1f7de52a0
22 | storage: coco/annotations/coco_karpathy_val.json
23 | test:
24 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
25 | md5: 3ff34b0ef2db02d01c37399f6a2a6cd1
26 | storage: coco/annotations/coco_karpathy_test.json
27 | images:
28 | storage: coco/images/
29 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/coco/defaults_ret.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | coco_retrieval:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
16 | md5: aa31ac474cf6250ebb81d18348a07ed8
17 | storage: coco/annotations/coco_karpathy_train.json
18 | val:
19 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
20 | md5: b273847456ef5580e33713b1f7de52a0
21 | storage: coco/annotations/coco_karpathy_val.json
22 | test:
23 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
24 | md5: 3ff34b0ef2db02d01c37399f6a2a6cd1
25 | storage: coco/annotations/coco_karpathy_test.json
26 | images:
27 | storage: coco/images/
28 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/coco/eval_vqa.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | coco_vqa:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | val:
15 | url:
16 | - coco/annotations/vqa_val_eval.json
17 | - coco/annotations/answer_list.json
18 | - coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json
19 | - coco/annotations/v2_mscoco_val2014_annotations.json
20 | storage:
21 | - coco/annotations/vqa_val_eval.json
22 | - coco/annotations/answer_list.json
23 | - coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json
24 | - coco/annotations/v2_mscoco_val2014_annotations.json
25 | images:
26 | storage: coco/images/
27 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/conceptual_caption/defaults_12m.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | conceptual_caption_12m:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url:
16 | - /export/home/workspace/datasets/cc12m.json
17 | storage:
18 | - conceptual_caption/annotations/cc12m.json
19 | images:
20 | storage: conceptual_caption/images_12m
21 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/conceptual_caption/defaults_3m.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | conceptual_caption_3m:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url:
16 | - /export/home/workspace/datasets/cc3m.json
17 | storage:
18 | - conceptual_caption/annotations/cc3m.json
19 | images:
20 | storage: conceptual_caption/images
21 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/didemo/defaults_ret.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | didemo_retrieval: # name of the dataset builder
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: videos # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_train.json
16 | storage: didemo/annotations/retrieval_train.json
17 | val:
18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_val.json
19 | storage: didemo/annotations/retrieval_val.json
20 | test:
21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_test.json
22 | storage: didemo/annotations/retrieval_test.json
23 | videos:
24 | storage: didemo/videos
25 | # storage: /export/share/dongxuli/data/didemo_retrieval/videos
26 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/flickr30k/defaults.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | flickr30k:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images
10 |
11 | build_info:
12 | annotations:
13 | train:
14 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_train.json
15 | storage: flickr30k/annotations/train.json
16 | val:
17 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_val.json
18 | storage: flickr30k/annotations/val.json
19 | test:
20 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_test.json
21 | storage: flickr30k/annotations/test.json
22 | images:
23 | storage: flickr30k/images
24 | # storage: /export/share/datasets/vision/flickr30k
25 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/gqa/balanced_testdev.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | gqa:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url:
16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
17 | storage:
18 | - gqa/annotations/train_balanced_questions.json
19 | val:
20 | url:
21 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/testdev_balanced_questions.json
22 | storage:
23 | - gqa/annotations/testdev_balanced_questions.json
24 | test:
25 | url:
26 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json
27 | storage:
28 | - gqa/annotations/test_balanced_questions.json
29 | images:
30 | storage: gqa/images/
31 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/gqa/balanced_val.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | gqa:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url:
16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
17 | storage:
18 | - gqa/annotations/train_balanced_questions.json
19 | val:
20 | url:
21 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/val_balanced_questions.json
22 | storage:
23 | - gqa/annotations/val_balanced_questions.json
24 | test:
25 | url:
26 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json
27 | storage:
28 | - gqa/annotations/test_balanced_questions.json
29 | images:
30 | storage: gqa/images/
31 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/gqa/defaults.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | gqa:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url:
16 | - /export/share/datasets/vision/GQA/questions1.2/train_all_questions/train_all_questions_0.json
17 | - /export/share/datasets/vision/GQA/questions1.2/val_all_questions.json
18 | storage:
19 | - gqa/annotations/train_all_questions_0.json
20 | - gqa/annotations/val_all_questions.json
21 | val:
22 | url:
23 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
24 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/large_vocab_train_lavis.json
25 | storage:
26 | - aokvqa/annotations/aokvqa_v1p0_val.json
27 | - aokvqa/annotations/large_vocab_train_lavis.json
28 | test:
29 | url:
30 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json
31 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/large_vocab_train_lavis.json
32 | storage:
33 | - aokvqa/annotations/aokvqa_v1p0_test.json
34 | - aokvqa/annotations/large_vocab_train_lavis.json
35 | images:
36 | storage: gqa/images/
37 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/imagenet/defaults.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | imagenet:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | splits: ["val"]
14 | images:
15 | storage: /export/share/datasets/vision/imagenet
16 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/laion/defaults_2B_multi.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | laion2B_multi:
8 |
9 | data_type: images
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar
14 |
--------------------------------------------------------------------------------
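The storage value above uses shell-style brace notation for webdataset shards; a quick sketch, not repo code, of what the pattern covers:

    # {00000..01743}.tar enumerates 1744 shard files, 00000.tar through 01743.tar
    shards = [f"/export/laion/laion2B-multi/part-00000/{i:05d}.tar" for i in range(1744)]
    print(len(shards), shards[0], shards[-1])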
/Lavis/lavis/configs/datasets/msrvtt/defaults_cap.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | msrvtt_cap: # name of the dataset builder
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: videos # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_train.json
16 | storage: msrvtt/annotations/cap_train.json
17 | val:
18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_val.json
19 | storage: msrvtt/annotations/cap_val.json
20 | test:
21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_test.json
22 | storage: msrvtt/annotations/cap_test.json
23 | videos:
24 | storage: msrvtt/videos
25 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/msrvtt/defaults_qa.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | msrvtt_qa: # name of the dataset builder
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: videos # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json
16 | storage: msrvtt/annotations/qa_train.json
17 | val:
18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json
19 | storage: msrvtt/annotations/qa_val.json
20 | test:
21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json
22 | storage: msrvtt/annotations/qa_test.json
23 | ans2label:
24 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json
25 | storage: msrvtt/annotations/qa_ans2label.json
26 | videos:
27 | storage: msrvtt/videos
28 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/msrvtt/defaults_ret.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | msrvtt_retrieval: # name of the dataset builder
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: videos # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_train.json
16 | storage: msrvtt/annotations/retrieval_train.json
17 | val:
18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_val.json
19 | storage: msrvtt/annotations/retrieval_val.json
20 | test:
21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_test.json
22 | storage: msrvtt/annotations/retrieval_test.json
23 | videos:
24 | storage: msrvtt/videos
25 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/msvd/defaults_cap.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | msvd_cap: # name of the dataset builder
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: videos # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_train.json
16 | storage: msvd/annotations/cap_train.json
17 | val:
18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_val.json
19 | storage: msvd/annotations/cap_val.json
20 | test:
21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_test.json
22 | storage: msvd/annotations/cap_test.json
23 | videos:
24 | storage: msvd/videos
25 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/msvd/defaults_qa.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | msvd_qa: # name of the dataset builder
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: videos # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json
16 | storage: msvd/annotations/qa_train.json
17 | val:
18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json
19 | storage: msvd/annotations/qa_val.json
20 | test:
21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json
22 | storage: msvd/annotations/qa_test.json
23 | ans2label:
24 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json
25 | storage: msvd/annotations/qa_ans2label.json
26 | videos:
27 | storage: msvd/videos
28 |
29 | instance_id_key: question_id
30 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/nlvr/defaults.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | nlvr:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_train.json
16 | storage: nlvr/annotations/train.json
17 | val:
18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json
19 | storage: nlvr/annotations/dev.json
20 | test:
21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json
22 | storage: nlvr/annotations/test.json
23 | images:
24 | storage: /export/share/datasets/vision/NLVR2/
25 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/nocaps/defaults.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | nocaps: # name of the dataset builder
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | val:
15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_val.json
16 | storage: nocaps/annotations/nocaps_val.json
17 | test:
18 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_test.json
19 | storage: nocaps/annotations/nocaps_test.json
20 | images:
21 | storage: nocaps/images
22 | # storage: /export/share/datasets/vision/nocaps/
23 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/okvqa/defaults.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | ok_vqa:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url:
16 | # TODO make this order insensitive
17 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
18 | # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
19 | # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
20 | storage:
21 | - okvqa/annotations/okvqa_train.json
22 | # - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json
23 | # - okvqa/annotations/mscoco_train2014_annotations.json
24 | test:
25 | url:
26 | # TODO make this order insensitive
27 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
28 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
29 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
30 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
31 | storage:
32 | - okvqa/annotations/vqa_val_eval.json
33 | - okvqa/annotations/answer_list.json
34 | - okvqa/annotations/OpenEnded_mscoco_val2014_questions.json
35 | - okvqa/annotations/mscoco_val2014_annotations.json
36 | images:
37 | storage: coco/images/
38 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/sbu_caption/defaults.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | sbu_caption:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url:
16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/sbu/sbu.json
17 | # - /export/share/dongxuli/data/lavis/sbu/annotation/sbu.json
18 | storage:
19 | - sbu_captions/annotations/sbu.json
20 | images:
21 | storage: sbu_captions/images
22 | # storage: /export/share/datasets/vision_language/sbu_resize
23 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/snli_ve/defaults.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | snli_ve:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_train.json
16 | storage: snli/annotations/ve_train.json
17 | val:
18 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_dev.json
19 | storage: snli/annotations/ve_dev.json
20 | test:
21 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_test.json
22 | storage: snli/annotations/ve_test.json
23 | images:
24 | storage: flickr30k/images/flickr30k-images
25 | # storage: /export/share/datasets/vision/flickr30k/flickr30k-images
26 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/vatex/defaults_cap.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | vatex_cap: # name of the dataset builder
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: videos # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json
16 | storage: vatex/annotations/cap_train.json
17 | val:
18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json
19 | storage: vatex/annotations/cap_val.json
20 | test:
21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json
22 | storage: vatex/annotations/cap_test.json
23 | videos:
24 | storage: /export/share/dongxuli/data/vatex
25 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/vg/defaults_caption.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | vg_caption:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_caption.json
16 | storage: vg/annotations/vg_caption.json
17 | images:
18 | storage: vg/images/
19 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/datasets/vg/defaults_vqa.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | vg_vqa:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_qa.json
16 | storage: vg/annotations/vg_qa.json
17 | images:
18 | storage: vg/images/
19 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/default.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | env:
7 | # For default users
8 | # cache_root: "cache"
9 | # For internal use with persistent storage
10 | cache_root: ".cache/lavis"
11 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/albef_classification_ve.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: albef_classification
8 | load_finetuned: True
9 |
10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_snli_ve_lavis.pt"
11 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
12 |
13 | num_classes: 3
14 |
15 | use_distill: True
16 | momentum: 0.995
17 | alpha: 0.4
18 |
19 | # vit encoder
20 | vit_type: "base"
21 | vit_grad_ckpt: False
22 | vit_ckpt_layer: 0
23 | vit_layer_norm_epsilon: 1e-6
24 |
25 | image_size: 384
26 |
27 | # bert config
28 | med_config_path: "configs/models/med_config_albef.json"
29 |
30 | preprocess:
31 | vis_processor:
32 | train:
33 | name: "blip_image_train"
34 | eval:
35 | name: "blip_image_eval"
36 | text_processor:
37 | train:
38 | name: "blip_caption"
39 | eval:
40 | name: "blip_caption"
41 |
--------------------------------------------------------------------------------
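A hedged loading sketch, not from the repo: it assumes the standard LAVIS entry point load_model_and_preprocess and that this config is registered as model type "ve" under the albef_classification architecture named above.

    import torch
    from lavis.models import load_model_and_preprocess

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, vis_processors, txt_processors = load_model_and_preprocess(
        name="albef_classification", model_type="ve", is_eval=True, device=device
    )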
/Lavis/lavis/configs/models/albef_feature_extractor.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: albef_pretrain
8 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
9 |
10 | # vit encoder
11 | vit_type: "base"
12 | image_size: 224
13 | vit_ckpt_layer: 0
14 | vit_drop_path_rate: 0
15 | vit_layer_norm_epsilon: 1e-6
16 | vit_grad_ckpt: False
17 |
18 | # bert config
19 | med_config_path: "configs/models/med_config_albef.json"
20 |
21 | embed_dim: 256
22 |
23 | preprocess:
24 | vis_processor:
25 | eval:
26 | name: "blip_image_eval"
27 | image_size: 224
28 | text_processor:
29 | eval:
30 | name: "blip_caption"
31 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/albef_nlvr.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: albef_nlvr
8 | load_finetuned: True
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/pretrain_model_nlvr.pth"
11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_nlvr_lavis.pt"
12 |
13 | num_classes: 2
14 |
15 | use_distill: True
16 | momentum: 0.995
17 | alpha: 0.4
18 |
19 | # vit encoder
20 | vit_type: "base"
21 | vit_grad_ckpt: False
22 | vit_ckpt_layer: 0
23 | vit_layer_norm_epsilon: 1e-6
24 |
25 | image_size: 384
26 |
27 | # bert config
28 | med_config_path: "configs/models/med_config_albef.json"
29 |
30 | preprocess:
31 | vis_processor:
32 | train:
33 | name: "blip_image_train"
34 | image_size: 384
35 | eval:
36 | name: "blip_image_eval"
37 | image_size: 384
38 | text_processor:
39 | train:
40 | name: "blip_caption"
41 | eval:
42 | name: "blip_caption"
43 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/albef_pretrain_base.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: albef_pretrain
8 |
9 | load_pretrained: True
10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
11 |
12 | # vit encoder
13 | vit_type: "base"
14 | image_size: 224
15 | vit_ckpt_layer: 0
16 | vit_drop_path_rate: 0
17 | vit_layer_norm_epsilon: 1e-6
18 | vit_grad_ckpt: False
19 |
20 | # bert config
21 | med_config_path: "configs/models/med_config_albef.json"
22 | mlm_mask_prob: 0.15
23 |
24 | embed_dim: 256
25 | momentum: 0.995
26 | alpha: 0.4
27 | temp: 0.07
28 |
29 | max_txt_len: 30
30 |
31 | preprocess:
32 | vis_processor:
33 | train:
34 | name: "blip_image_train"
35 | image_size: 256
36 | text_processor:
37 | train:
38 | name: "blip_caption"
39 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/albef_retrieval_coco.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: albef_retrieval
8 | load_finetuned: True
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_coco_retrieval_lavis.pt"
12 |
13 | queue_size: 65536
14 |
15 | # vit encoder
16 | vit_type: "base"
17 | image_size: 384
18 | vit_ckpt_layer: 0
19 | vit_drop_path_rate: 0
20 | vit_layer_norm_epsilon: 1e-6
21 | vit_grad_ckpt: False
22 |
23 | # bert config
24 | med_config_path: "configs/models/med_config_albef.json"
25 |
26 | embed_dim: 256
27 | momentum: 0.995
28 | alpha: 0.4
29 | temp: 0.07
30 | use_distill: True
31 |
32 | max_txt_len: 30
33 |
34 | preprocess:
35 | vis_processor:
36 | train:
37 | name: "blip_image_train"
38 | image_size: 384
39 | eval:
40 | name: "blip_image_eval"
41 | image_size: 384
42 | text_processor:
43 | train:
44 | name: "blip_caption"
45 | eval:
46 | name: "blip_caption"
47 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/albef_retrieval_flickr.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: albef_retrieval
8 | load_finetuned: True
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
11 | finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_flickr_retrieval_lavis.pt
12 |
13 | queue_size: 65536
14 |
15 | # vit encoder
16 | vit_type: "base"
17 | image_size: 384
18 | vit_ckpt_layer: 0
19 | vit_drop_path_rate: 0
20 | vit_layer_norm_epsilon: 1e-6
21 | vit_grad_ckpt: False
22 |
23 | # bert config
24 | med_config_path: "configs/models/med_config_albef.json"
25 |
26 | embed_dim: 256
27 | momentum: 0.995
28 | alpha: 0.4
29 | temp: 0.07
30 | use_distill: True
31 |
32 | max_txt_len: 30
33 |
34 | preprocess:
35 | vis_processor:
36 | train:
37 | name: "blip_image_train"
38 | image_size: 384
39 | eval:
40 | name: "blip_image_eval"
41 | image_size: 384
42 | text_processor:
43 | train:
44 | name: "blip_caption"
45 | eval:
46 | name: "blip_caption"
47 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/albef_vqav2.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: albef_vqa
8 | load_finetuned: True
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_vqav2_lavis.pt"
12 |
13 | use_distill: True
14 | momentum: 0.995
15 | alpha: 0.4
16 |
17 | # vit encoder
18 | vit_type: "base"
19 | vit_grad_ckpt: False
20 | vit_ckpt_layer: 0
21 | vit_layer_norm_epsilon: 1e-6
22 |
23 | image_size: 384
24 |
25 | # bert config
26 | med_config_path: "configs/models/med_config_albef.json"
27 |
28 | preprocess:
29 | vis_processor:
30 | train:
31 | name: "blip_image_train"
32 | image_size: 384
33 | eval:
34 | name: "blip_image_eval"
35 | image_size: 384
36 | text_processor:
37 | train:
38 | name: "blip_question"
39 | eval:
40 | name: "blip_question"
41 |
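42 | # use_distill turns on the momentum-teacher distillation loss (weighted by alpha);
43 | # momentum is the teacher's EMA decay rate. Loading sketch (assumed zoo keys
44 | # name="albef_vqa", model_type="vqav2"):
45 | #   from lavis.models import load_model_and_preprocess
46 | #   model, vis, txt = load_model_and_preprocess("albef_vqa", "vqav2", is_eval=True)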
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/alpro_qa_msrvtt.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: alpro_qa
8 | num_classes: 1500
9 |
10 | load_finetuned: True
11 |
12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_qa.pth"
13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt"
14 |
15 | timesformer:
16 | n_frms: 16
17 | image_size: 224
18 |
19 | patch_size: 16
20 | attn_drop_rate: 0.
21 | drop_rate: 0.
22 | drop_path_rate: 0.1
23 |
24 | use_grad_ckpt: True
25 | ckpt_layer: 12
26 |
27 | # bert config
28 | med_config_path: "configs/models/bert_config_alpro.json"
29 |
30 | preprocess:
31 | vis_processor:
32 | train:
33 | name: "alpro_video_train"
34 | n_frms: 16
35 | image_size: 224
36 | eval:
37 | name: "alpro_video_eval"
38 | n_frms: 16
39 | image_size: 224
40 | text_processor:
41 | train:
42 | name: "blip_caption"
43 | eval:
44 | name: "blip_caption"
45 |
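46 | # num_classes is the size of the MSRVTT-QA answer vocabulary (QA is cast as
47 | # 1500-way classification); n_frms is the number of frames the TimeSformer samples
48 | # per clip and must match the video processors below. Loading sketch (assumed zoo
49 | # keys name="alpro_qa", model_type="msrvtt"):
50 | #   from lavis.models import load_model_and_preprocess
51 | #   model, vis, txt = load_model_and_preprocess("alpro_qa", "msrvtt", is_eval=True)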
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/alpro_qa_msvd.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: alpro_qa
8 | num_classes: 2423
9 |
10 | load_finetuned: True
11 |
12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msvd_qa.pth"
13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt"
14 |
15 | timesformer:
16 | n_frms: 16
17 | image_size: 224
18 |
19 | patch_size: 16
20 | attn_drop_rate: 0.
21 | drop_rate: 0.
22 | drop_path_rate: 0.1
23 | use_grad_ckpt: True
24 | ckpt_layer: 12
25 |
26 | # bert config
27 | med_config_path: "configs/models/bert_config_alpro.json"
28 |
29 | preprocess:
30 | vis_processor:
31 | train:
32 | name: "alpro_video_train"
33 | n_frms: 16
34 | image_size: 224
35 | eval:
36 | name: "alpro_video_eval"
37 | n_frms: 16
38 | image_size: 224
39 | text_processor:
40 | train:
41 | name: "blip_caption"
42 | eval:
43 | name: "blip_caption"
44 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/alpro_retrieval_didemo.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: alpro_retrieval
8 |
9 | load_finetuned: True
10 |
11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_didemo_retrieval.pt"
12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt"
13 |
14 | timesformer:
15 | n_frms: 8
16 | image_size: 224
17 |
18 | patch_size: 16
19 | attn_drop_rate: 0.
20 | drop_rate: 0.
21 | drop_path_rate: 0.1
22 | use_grad_ckpt: False
23 |
24 | # bert config
25 | med_config_path: "configs/models/bert_config_alpro.json"
26 |
27 | preprocess:
28 | vis_processor:
29 | eval:
30 | name: "alpro_video_eval"
31 | n_frms: 8
32 | image_size: 224
33 | text_processor:
34 | eval:
35 | name: "blip_caption"
36 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/alpro_retrieval_msrvtt.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: alpro_retrieval
8 |
9 | load_finetuned: True
10 |
11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_retrieval.pt"
12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt"
13 |
14 | timesformer:
15 | n_frms: 8
16 | image_size: 224
17 |
18 | patch_size: 16
19 | attn_drop_rate: 0.
20 | drop_rate: 0.
21 | drop_path_rate: 0.1
22 | use_grad_ckpt: False
23 |
24 | # bert config
25 | med_config_path: "configs/models/bert_config_alpro.json"
26 |
27 | preprocess:
28 | vis_processor:
29 | train:
30 | name: "alpro_video_train"
31 | n_frms: 8
32 | image_size: 224
33 | eval:
34 | name: "alpro_video_eval"
35 | n_frms: 8
36 | image_size: 224
37 | text_processor:
38 | train:
39 | name: "blip_caption"
40 | eval:
41 | name: "blip_caption"
42 |
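43 | # Loading sketch (assumed zoo keys name="alpro_retrieval", model_type="msrvtt");
44 | # the 8-frame processors above must agree with timesformer.n_frms:
45 | #   from lavis.models import load_model_and_preprocess
46 | #   model, vis, txt = load_model_and_preprocess("alpro_retrieval", "msrvtt", is_eval=True)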
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/bert_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "BertModel"
4 | ],
5 | "attention_probs_dropout_prob": 0.1,
6 | "hidden_act": "gelu",
7 | "hidden_dropout_prob": 0.1,
8 | "hidden_size": 768,
9 | "initializer_range": 0.02,
10 | "intermediate_size": 3072,
11 | "layer_norm_eps": 1e-12,
12 | "max_position_embeddings": 512,
13 | "model_type": "bert",
14 | "num_attention_heads": 12,
15 | "num_hidden_layers": 12,
16 | "pad_token_id": 0,
17 | "add_type_embeddings": false,
18 | "vocab_size": 30522,
19 | "encoder_width": 768,
20 | "add_cross_attention": true
21 | }
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/bert_config_alpro.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "BertModel"
4 | ],
5 | "attention_probs_dropout_prob": 0.1,
6 | "hidden_act": "gelu",
7 | "hidden_dropout_prob": 0.1,
8 | "hidden_size": 768,
9 | "initializer_range": 0.02,
10 | "intermediate_size": 3072,
11 | "layer_norm_eps": 1e-12,
12 | "max_position_embeddings": 512,
13 | "model_type": "bert",
14 | "num_attention_heads": 12,
15 | "num_hidden_layers": 12,
16 | "pad_token_id": 0,
17 | "add_type_embeddings": true,
18 | "type_vocab_size": 2,
19 | "vocab_size": 30522,
20 | "encoder_width": 768,
21 | "add_cross_attention": false,
22 | "fusion_layer": 6
23 | }
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: caption_coco_flant5xl
8 | load_finetuned: True
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth"
11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_flant5xl.pth"
12 |
13 | # vit encoder
14 | image_size: 364
15 | drop_path_rate: 0
16 | use_grad_checkpoint: False
17 | vit_precision: "fp32"
18 | freeze_vit: False
19 |
20 | # Q-Former
21 | num_query_token: 32
22 |
23 | # T5
24 | t5_model: "google/flan-t5-xl"
25 |
26 | # generation configs
27 | prompt: "a photo of"
28 |
29 |
30 | preprocess:
31 | vis_processor:
32 | train:
33 | name: "blip_image_train"
34 | image_size: 364
35 | eval:
36 | name: "blip_image_eval"
37 | image_size: 364
38 | text_processor:
39 | train:
40 | name: "blip_caption"
41 | eval:
42 | name: "blip_caption"
43 |
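44 | # Captioning sketch (assumed zoo keys name="blip2_t5", model_type="caption_coco_flant5xl");
45 | # the prompt above ("a photo of") is prepended at generation time:
46 | #   from lavis.models import load_model_and_preprocess
47 | #   model, vis, txt = load_model_and_preprocess("blip2_t5", "caption_coco_flant5xl", is_eval=True)
48 | #   captions = model.generate({"image": vis["eval"](raw_image).unsqueeze(0)})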
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: caption_coco_opt2.7b
8 | load_finetuned: True
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth"
11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt2.7b.pth"
12 |
13 | # vit encoder
14 | image_size: 364
15 | drop_path_rate: 0
16 | use_grad_checkpoint: False
17 | vit_precision: "fp32"
18 | freeze_vit: False
19 |
20 | # Q-Former
21 | num_query_token: 32
22 |
23 | # OPT
24 | opt_model: "facebook/opt-2.7b"
25 |
26 | # generation configs
27 | prompt: "a photo of"
28 |
29 |
30 | preprocess:
31 | vis_processor:
32 | train:
33 | name: "blip_image_train"
34 | image_size: 364
35 | eval:
36 | name: "blip_image_eval"
37 | image_size: 364
38 | text_processor:
39 | train:
40 | name: "blip_caption"
41 | eval:
42 | name: "blip_caption"
43 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: caption_coco_opt6.7b
8 | load_finetuned: True
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth"
11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt6.7b.pth"
12 |
13 | # vit encoder
14 | image_size: 364
15 | drop_path_rate: 0
16 | use_grad_checkpoint: False
17 | vit_precision: "fp32"
18 | freeze_vit: False
19 |
20 | # Q-Former
21 | num_query_token: 32
22 |
23 | # OPT
24 | opt_model: "facebook/opt-6.7b"
25 |
26 | # generation configs
27 | prompt: "a photo of"
28 |
29 |
30 | preprocess:
31 | vis_processor:
32 | train:
33 | name: "blip_image_train"
34 | image_size: 364
35 | eval:
36 | name: "blip_image_eval"
37 | image_size: 364
38 | text_processor:
39 | train:
40 | name: "blip_caption"
41 | eval:
42 | name: "blip_caption"
43 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/blip2/blip2_coco.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: coco
8 | load_finetuned: True
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth"
11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_finetune_coco.pth"
12 |
13 | # vit encoder
14 | image_size: 364
15 | drop_path_rate: 0
16 | use_grad_checkpoint: True
17 | vit_precision: "fp32"
18 | freeze_vit: False
19 |
20 | # Q-Former
21 | num_query_token: 32
22 |
23 |
24 | preprocess:
25 | vis_processor:
26 | train:
27 | name: "blip_image_train"
28 | image_size: 364
29 | eval:
30 | name: "blip_image_eval"
31 | image_size: 364
32 | text_processor:
33 | train:
34 | name: "blip_caption"
35 | eval:
36 | name: "blip_caption"
37 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/blip2/blip2_pretrain.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: pretrain
8 | load_finetuned: False
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth"
11 | finetuned: ""
12 |
13 | # vit encoder
14 | image_size: 224
15 | drop_path_rate: 0
16 | use_grad_checkpoint: False
17 | vit_precision: "fp16"
18 | freeze_vit: True
19 |
20 | # Q-Former
21 | num_query_token: 32
22 |
23 |
24 | preprocess:
25 | vis_processor:
26 | train:
27 | name: "blip_image_train"
28 | image_size: 224
29 | eval:
30 | name: "blip_image_eval"
31 | image_size: 224
32 | text_processor:
33 | train:
34 | name: "blip_caption"
35 | eval:
36 | name: "blip_caption"
37 |
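38 | # First-stage BLIP-2: the ViT is kept frozen (freeze_vit) and run in fp16
39 | # (vit_precision), while num_query_token learned Q-Former queries attend to its
40 | # output. Loading sketch (assumed zoo keys name="blip2", model_type="pretrain"):
41 | #   from lavis.models import load_model_and_preprocess
42 | #   model, vis, txt = load_model_and_preprocess("blip2", "pretrain", is_eval=True)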
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: pretrain_flant5xl
8 | load_finetuned: False
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth"
11 | finetuned: ""
12 |
13 | # vit encoder
14 | image_size: 224
15 | drop_path_rate: 0
16 | use_grad_checkpoint: False
17 | vit_precision: "fp16"
18 | freeze_vit: True
19 |
20 | # Q-Former
21 | num_query_token: 32
22 |
23 | # T5
24 | t5_model: "google/flan-t5-xl"
25 |
26 | # generation configs
27 | prompt: ""
28 |
29 |
30 | preprocess:
31 | vis_processor:
32 | train:
33 | name: "blip_image_train"
34 | image_size: 224
35 | eval:
36 | name: "blip_image_eval"
37 | image_size: 224
38 | text_processor:
39 | train:
40 | name: "blip_caption"
41 | eval:
42 | name: "blip_caption"
43 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: pretrain_flant5xl
8 | load_finetuned: False
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl_vitL.pth"
11 | finetuned: ""
12 |
13 | # vit encoder
14 | vit_model: "clip_L"
15 | image_size: 224
16 | drop_path_rate: 0
17 | use_grad_checkpoint: False
18 | vit_precision: "fp16"
19 | freeze_vit: True
20 |
21 | # Q-Former
22 | num_query_token: 32
23 |
24 | # T5
25 | t5_model: "google/flan-t5-xl"
26 |
27 | # generation configs
28 | prompt: ""
29 |
30 |
31 | preprocess:
32 | vis_processor:
33 | train:
34 | name: "blip_image_train"
35 | image_size: 224
36 | eval:
37 | name: "blip_image_eval"
38 | image_size: 224
39 | text_processor:
40 | train:
41 | name: "blip_caption"
42 | eval:
43 | name: "blip_caption"
44 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: pretrain_flant5xxl
8 | load_finetuned: False
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth"
11 | finetuned: ""
12 |
13 | # vit encoder
14 | image_size: 224
15 | drop_path_rate: 0
16 | use_grad_checkpoint: False
17 | vit_precision: "fp16"
18 | freeze_vit: True
19 |
20 | # Q-Former
21 | num_query_token: 32
22 |
23 | # T5
24 | t5_model: "google/flan-t5-xxl"
25 |
26 | # generation configs
27 | prompt: ""
28 |
29 |
30 | preprocess:
31 | vis_processor:
32 | train:
33 | name: "blip_image_train"
34 | image_size: 224
35 | eval:
36 | name: "blip_image_eval"
37 | image_size: 224
38 | text_processor:
39 | train:
40 | name: "blip_caption"
41 | eval:
42 | name: "blip_caption"
43 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: pretrain_opt2.7b
8 | load_finetuned: False
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth"
11 | finetuned: ""
12 |
13 | # vit encoder
14 | image_size: 224
15 | drop_path_rate: 0
16 | use_grad_checkpoint: False
17 | vit_precision: "fp16"
18 | freeze_vit: True
19 |
20 | # Q-Former
21 | num_query_token: 32
22 |
23 | # OPT
24 | opt_model: "facebook/opt-2.7b"
25 |
26 | # generation configs
27 | prompt: ""
28 |
29 |
30 | preprocess:
31 | vis_processor:
32 | train:
33 | name: "blip_image_train"
34 | image_size: 224
35 | eval:
36 | name: "blip_image_eval"
37 | image_size: 224
38 | text_processor:
39 | train:
40 | name: "blip_caption"
41 | eval:
42 | name: "blip_caption"
43 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: pretrain_opt6.7b
8 | load_finetuned: False
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth"
11 | finetuned: ""
12 |
13 | # vit encoder
14 | image_size: 224
15 | drop_path_rate: 0
16 | use_grad_checkpoint: False
17 | vit_precision: "fp16"
18 | freeze_vit: True
19 |
20 | # Q-Former
21 | num_query_token: 32
22 |
23 | # OPT
24 | opt_model: "facebook/opt-6.7b"
25 |
26 | # generation configs
27 | prompt: ""
28 |
29 |
30 | preprocess:
31 | vis_processor:
32 | train:
33 | name: "blip_image_train"
34 | image_size: 224
35 | eval:
36 | name: "blip_image_eval"
37 | image_size: 224
38 | text_processor:
39 | train:
40 | name: "blip_caption"
41 | eval:
42 | name: "blip_caption"
43 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: pretrain
8 | load_finetuned: False
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_vitL.pth"
11 | finetuned: ""
12 |
13 | # vit encoder
14 | vit_model: "clip_L"
15 | image_size: 224
16 | drop_path_rate: 0
17 | use_grad_checkpoint: False
18 | vit_precision: "fp16"
19 | freeze_vit: True
20 |
21 | # Q-Former
22 | num_query_token: 32
23 |
24 |
25 | preprocess:
26 | vis_processor:
27 | train:
28 | name: "blip_image_train"
29 | image_size: 224
30 | eval:
31 | name: "blip_image_eval"
32 | image_size: 224
33 | text_processor:
34 | train:
35 | name: "blip_caption"
36 | eval:
37 | name: "blip_caption"
38 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/blip_caption_base_coco.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_caption
8 | load_finetuned: True
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth"
11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_caption_base.pth"
12 |
13 | # vit encoder
14 | vit_type: "base"
15 | vit_grad_ckpt: False
16 | vit_ckpt_layer: 0
17 |
18 | image_size: 384
19 |
20 | # bert config
21 | med_config_path: "configs/models/med_config.json"
22 |
23 | # generation configs
24 | prompt: "a picture of "
25 |
26 |
27 | preprocess:
28 | vis_processor:
29 | train:
30 | name: "blip_image_train"
31 | eval:
32 | name: "blip_image_eval"
33 | text_processor:
34 | train:
35 | name: "blip_caption"
36 | prompt: "a picture of "
37 | eval:
38 | name: "blip_caption"
39 |
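40 | # The trailing space in prompt ("a picture of ") is deliberate so generated text
41 | # continues the phrase cleanly. Captioning sketch (assumed zoo keys
42 | # name="blip_caption", model_type="base_coco"):
43 | #   from lavis.models import load_model_and_preprocess
44 | #   model, vis, txt = load_model_and_preprocess("blip_caption", "base_coco", is_eval=True)
45 | #   captions = model.generate({"image": vis["eval"](raw_image).unsqueeze(0)})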
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/blip_caption_large_coco.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_caption
8 | load_finetuned: True
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large.pth"
11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth"
12 |
13 | vit_type: "large"
14 | vit_grad_ckpt: True
15 | vit_ckpt_layer: 5
16 |
17 | image_size: 384
18 |
19 | # bert config
20 | med_config_path: "configs/models/med_large_config.json"
21 |
22 | # generation configs
23 | prompt: "a picture of "
24 |
25 |
26 | preprocess:
27 | vis_processor:
28 | train:
29 | name: "blip_image_train"
30 | eval:
31 | name: "blip_image_eval"
32 | text_processor:
33 | train:
34 | name: "blip_caption"
35 | prompt: "a picture of "
36 | eval:
37 | name: "blip_caption"
38 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/blip_classification_base.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_classification
8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth"
9 |
10 | use_distill: True
11 | momentum: 0.995
12 | alpha: 0.4
13 |
14 | # vit encoder
15 | vit_type: "base"
16 | vit_grad_ckpt: False
17 | vit_ckpt_layer: 0
18 |
19 | image_size: 384
20 |
21 | # bert config
22 | med_config_path: "configs/models/med_config.json"
23 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/blip_feature_extractor_base.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_pretrain
8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth"
9 |
10 | # vit encoder
11 | vit_type: "base"
12 | vit_grad_ckpt: False
13 | vit_ckpt_layer: 0
14 |
15 | image_size: 224
16 |
17 | # bert config
18 | med_config_path: "configs/models/med_config.json"
19 |
20 | embed_dim: 256
21 |
22 | preprocess:
23 | vis_processor:
24 | eval:
25 | name: "blip_image_eval"
26 | image_size: 224
27 | text_processor:
28 | eval:
29 | name: "blip_caption"
30 |
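31 | # Feature-extraction sketch (assumed zoo keys name="blip_feature_extractor",
32 | # model_type="base"); mode can be "image", "text", or "multimodal":
33 | #   from lavis.models import load_model_and_preprocess
34 | #   model, vis, txt = load_model_and_preprocess("blip_feature_extractor", "base", is_eval=True)
35 | #   feats = model.extract_features({"image": img, "text_input": [cap]}, mode="multimodal")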
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/blip_itm_base.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_image_text_matching
8 |
9 | load_finetuned: True
10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth"
11 |
12 | # vit encoder
13 | vit_type: "base"
14 | vit_grad_ckpt: False
15 | vit_ckpt_layer: 0
16 |
17 | image_size: 384
18 |
19 | # bert config
20 | med_config_path: "configs/models/med_config.json"
21 |
22 | embed_dim: 256
23 |
24 | preprocess:
25 | vis_processor:
26 | eval:
27 | name: "blip_image_eval"
28 | image_size: 384
29 | text_processor:
30 | eval:
31 | name: "blip_caption"
32 |
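33 | # Matching sketch (assumed zoo keys name="blip_image_text_matching", model_type="base");
34 | # match_head="itm" returns matched/unmatched logits, "itc" a cosine similarity:
35 | #   from lavis.models import load_model_and_preprocess
36 | #   model, vis, txt = load_model_and_preprocess("blip_image_text_matching", "base", is_eval=True)
37 | #   itm_out = model({"image": img, "text_input": cap}, match_head="itm")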
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/blip_itm_large.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_image_text_matching
8 |
9 | load_finetuned: True
10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco.pth"
11 |
12 | # vit encoder
13 | vit_type: "large"
14 | vit_grad_ckpt: False
15 | vit_ckpt_layer: 0
16 |
17 | image_size: 384
18 |
19 | # bert config
20 | med_config_path: "configs/models/med_large_config.json"
21 |
22 | embed_dim: 256
23 |
24 | preprocess:
25 | vis_processor:
26 | eval:
27 | name: "blip_image_eval"
28 | image_size: 384
29 | text_processor:
30 | eval:
31 | name: "blip_caption"
32 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/blip_nlvr.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_nlvr
8 | model_type: nlvr
9 | load_finetuned: True
10 |
11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_nlvr.pth"
12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth"
13 |
14 | num_classes: 2
15 |
16 | # vit encoder
17 | vit_type: "base"
18 | vit_grad_ckpt: False
19 | vit_ckpt_layer: 0
20 | vit_layer_norm_epsilon: 1e-6
21 |
22 | image_size: 384
23 |
24 | # bert config
25 | med_config_path: "configs/models/med_config.json"
26 |
27 | preprocess:
28 | vis_processor:
29 | train:
30 | name: "blip_image_train"
31 | image_size: 384
32 | eval:
33 | name: "blip_image_eval"
34 | image_size: 384
35 | text_processor:
36 | train:
37 | name: "blip_caption"
38 | eval:
39 | name: "blip_caption"
40 |
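41 | # NLVR2 is binary visual reasoning over image pairs, hence num_classes: 2.
42 | # Loading sketch (assumed zoo keys name="blip_nlvr", model_type="nlvr"):
43 | #   from lavis.models import load_model_and_preprocess
44 | #   model, vis, txt = load_model_and_preprocess("blip_nlvr", "nlvr", is_eval=True)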
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/blip_pretrain_base.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_pretrain
8 |
9 | load_pretrained: True
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth"
11 |
12 | # vit encoder
13 | vit_type: "base"
14 | vit_grad_ckpt: False
15 | vit_ckpt_layer: 0
16 |
17 | image_size: 224
18 | alpha: 0.4
19 |
20 | # bert config
21 | med_config_path: "configs/models/bert_config.json"
22 |
23 | embed_dim: 256
24 |
25 | # generation configs
26 | prompt: "a picture of "
27 |
28 | preprocess:
29 | vis_processor:
30 | train:
31 | name: "blip_image_train"
32 | image_size: 224
33 | text_processor:
34 | train:
35 | name: "blip_caption"
36 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/blip_pretrain_large.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_pretrain
8 |
9 | # vit encoder
10 | vit_type: "large"
11 | vit_grad_ckpt: True
12 | vit_ckpt_layer: 5
13 |
14 | image_size: 224
15 |
16 | # bert config
17 | med_config_path: "configs/models/med_large_config.json"
18 |
19 | embed_dim: 256
20 |
21 | # generation configs
22 | prompt: "a picture of "
23 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/blip_retrieval_coco.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_retrieval
8 | load_finetuned: True
9 |
10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_retrieval.pth"
11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth"
12 |
13 | queue_size: 57600
14 |
15 | # vit encoder
16 | vit_type: "base"
17 | vit_grad_ckpt: True
18 | vit_ckpt_layer: 4
19 |
20 | image_size: 384
21 |
22 | # bert config
23 | med_config_path: "configs/models/med_config.json"
24 |
25 | embed_dim: 256
26 |
27 | preprocess:
28 | vis_processor:
29 | train:
30 | name: "blip_image_train"
31 | image_size: 384
32 | eval:
33 | name: "blip_image_eval"
34 | image_size: 384
35 | text_processor:
36 | train:
37 | name: "blip_caption"
38 | eval:
39 | name: "blip_caption"
40 |
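41 | # queue_size (57600) is smaller than ALBEF's 65536; the queue length has to be
42 | # divisible by the global training batch size, so it is tuned to the run's batch
43 | # configuration. Loading sketch (assumed zoo keys name="blip_retrieval", model_type="coco"):
44 | #   from lavis.models import load_model_and_preprocess
45 | #   model, vis, txt = load_model_and_preprocess("blip_retrieval", "coco", is_eval=True)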
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/blip_retrieval_flickr.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_retrieval
8 | load_finetuned: True
9 |
10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_flickr_retrieval.pth"
11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth"
12 |
13 | queue_size: 57600
14 | alpha: 0.4
15 |
16 | negative_all_rank: False
17 |
18 | # vit encoder
19 | vit_type: "base"
20 | vit_grad_ckpt: True
21 | vit_ckpt_layer: 4
22 |
23 | image_size: 384
24 |
25 | # bert config
26 | med_config_path: "configs/models/med_config.json"
27 |
28 | embed_dim: 256
29 |
30 | preprocess:
31 | vis_processor:
32 | train:
33 | name: "blip_image_train"
34 | image_size: 384
35 | eval:
36 | name: "blip_image_eval"
37 | image_size: 384
38 | text_processor:
39 | train:
40 | name: "blip_caption"
41 | eval:
42 | name: "blip_caption"
43 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/blip_vqa_aokvqa.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_vqa
8 | load_finetuned: True
9 |
10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_aokvqa.pth"
11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth"
12 |
13 | # vit encoder
14 | vit_type: "base"
15 | vit_grad_ckpt: False
16 | vit_ckpt_layer: 0
17 | vit_drop_path_rate: 0.1
18 |
19 | image_size: 480
20 |
21 | # bert config
22 | med_config_path: "configs/models/med_config.json"
23 |
24 | preprocess:
25 | vis_processor:
26 | train:
27 | name: "blip_image_train"
28 | image_size: 480
29 | eval:
30 | name: "blip_image_eval"
31 | image_size: 480
32 | text_processor:
33 | train:
34 | name: "blip_question"
35 | eval:
36 | name: "blip_question"
37 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/blip_vqa_okvqa.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_vqa
8 | load_finetuned: True
9 |
10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_okvqa.pth"
11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth"
12 |
13 | # vit encoder
14 | vit_type: "base"
15 | vit_grad_ckpt: False
16 | vit_ckpt_layer: 0
17 | vit_drop_path_rate: 0.1
18 |
19 | image_size: 480
20 |
21 | # bert config
22 | med_config_path: "configs/models/med_config.json"
23 |
24 | preprocess:
25 | vis_processor:
26 | train:
27 | name: "blip_image_train"
28 | image_size: 480
29 | eval:
30 | name: "blip_image_eval"
31 | image_size: 480
32 | text_processor:
33 | train:
34 | name: "blip_question"
35 | eval:
36 | name: "blip_question"
37 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/blip_vqav2.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_vqa
8 | load_finetuned: True
9 |
10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth"
11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth"
12 |
13 | # vit encoder
14 | vit_type: "base"
15 | vit_grad_ckpt: False
16 | vit_ckpt_layer: 0
17 | vit_drop_path_rate: 0.1
18 |
19 | image_size: 480
20 |
21 | # bert config
22 | med_config_path: "configs/models/med_config.json"
23 |
24 | preprocess:
25 | vis_processor:
26 | train:
27 | name: "blip_image_train"
28 | image_size: 480
29 | eval:
30 | name: "blip_image_eval"
31 | image_size: 480
32 | text_processor:
33 | train:
34 | name: "blip_question"
35 | eval:
36 | name: "blip_question"
37 |
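38 | # BLIP VQA generates answers rather than classifying them, so inference goes
39 | # through predict_answers. Sketch (assumed zoo keys name="blip_vqa", model_type="vqav2"):
40 | #   from lavis.models import load_model_and_preprocess
41 | #   model, vis, txt = load_model_and_preprocess("blip_vqa", "vqav2", is_eval=True)
42 | #   answers = model.predict_answers({"image": img, "text_input": q}, inference_method="generate")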
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/RN101-quickgelu.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": [
7 | 3,
8 | 4,
9 | 23,
10 | 3
11 | ],
12 | "width": 64,
13 | "patch_size": null
14 | },
15 | "text_cfg": {
16 | "context_length": 77,
17 | "vocab_size": 49408,
18 | "width": 512,
19 | "heads": 8,
20 | "layers": 12
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/RN101.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": [
6 | 3,
7 | 4,
8 | 23,
9 | 3
10 | ],
11 | "width": 64,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 512,
18 | "heads": 8,
19 | "layers": 12
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/RN50-quickgelu.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": [
7 | 3,
8 | 4,
9 | 6,
10 | 3
11 | ],
12 | "width": 64,
13 | "patch_size": null
14 | },
15 | "text_cfg": {
16 | "context_length": 77,
17 | "vocab_size": 49408,
18 | "width": 512,
19 | "heads": 8,
20 | "layers": 12
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/RN50.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": [
6 | 3,
7 | 4,
8 | 6,
9 | 3
10 | ],
11 | "width": 64,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 512,
18 | "heads": 8,
19 | "layers": 12
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/RN50x16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 384,
5 | "layers": [
6 | 6,
7 | 8,
8 | 18,
9 | 8
10 | ],
11 | "width": 96,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 768,
18 | "heads": 12,
19 | "layers": 12
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/RN50x4.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "image_size": 288,
5 | "layers": [
6 | 4,
7 | 6,
8 | 10,
9 | 6
10 | ],
11 | "width": 80,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 640,
18 | "heads": 10,
19 | "layers": 12
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/ViT-B-16-plus-240.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "image_size": 240,
5 | "layers": 12,
6 | "width": 896,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 640,
13 | "heads": 10,
14 | "layers": 12
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/ViT-B-16-plus.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 896,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 640,
13 | "heads": 10,
14 | "layers": 12
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/ViT-B-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 512,
13 | "heads": 8,
14 | "layers": 12
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/ViT-B-32-plus-256.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "image_size": 256,
5 | "layers": 12,
6 | "width": 896,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 640,
13 | "heads": 10,
14 | "layers": 12
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/ViT-B-32-quickgelu.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": 12,
7 | "width": 768,
8 | "patch_size": 32
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/ViT-B-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 512,
13 | "heads": 8,
14 | "layers": 12
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/ViT-H-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 32,
6 | "width": 1280,
7 | "head_width": 80,
8 | "patch_size": 14
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 1024,
14 | "heads": 16,
15 | "layers": 24
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/ViT-H-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 32,
6 | "width": 1280,
7 | "head_width": 80,
8 | "patch_size": 16
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 1024,
14 | "heads": 16,
15 | "layers": 24
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/ViT-L-14-280.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 280,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 14
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/ViT-L-14-336.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 336,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 14
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/ViT-L-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 14
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/ViT-L-16-320.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 320,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/ViT-L-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/ViT-g-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 40,
6 | "width": 1408,
7 | "head_width": 88,
8 | "mlp_ratio": 4.3637,
9 | "patch_size": 14
10 | },
11 | "text_cfg": {
12 | "context_length": 77,
13 | "vocab_size": 49408,
14 | "width": 1024,
15 | "heads": 16,
16 | "layers": 24
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/timm-efficientnetv2_rw_s.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "timm_model_name": "efficientnetv2_rw_s",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "abs_attn",
7 | "timm_proj": "",
8 | "image_size": 288
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 768,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/timm-resnet50d.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "timm_model_name": "resnet50d",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "abs_attn",
7 | "timm_proj": "",
8 | "image_size": 224
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/timm-resnetaa50d.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "timm_model_name": "resnetaa50d",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "abs_attn",
7 | "timm_proj": "",
8 | "image_size": 224
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/timm-resnetblur50.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "timm_model_name": "resnetblur50",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "abs_attn",
7 | "timm_proj": "",
8 | "image_size": 224
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/timm-swin_base_patch4_window7_224.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "timm_model_name": "swin_base_patch4_window7_224",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "image_size": 224
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/timm-vit_base_patch16_224.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "timm_model_name": "vit_base_patch16_224",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "image_size": 224
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/timm-vit_base_patch32_224.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "timm_model_name": "vit_base_patch32_224",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "image_size": 224
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip/timm-vit_small_patch16_224.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "timm_model_name": "vit_small_patch16_224",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "image_size": 224
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip_resnet50.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: clip
8 |
9 | model_type: RN50
10 |
11 | pretrained: openai
12 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip_vit_base16.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: clip
8 |
9 | model_type: ViT-B-16
10 |
11 | pretrained: openai
12 |
13 | preprocess:
14 | vis_processor:
15 | eval:
16 | name: "clip_image_eval"
17 | image_size: 224
18 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip_vit_base32.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: clip
8 |
9 | model_type: ViT-B-32
10 | # ['RN50',
11 | # 'RN50-quickgelu',
12 | # 'RN50x4',
13 | # 'RN50x16',
14 | # 'RN101',
15 | # 'RN101-quickgelu',
16 | # 'timm-efficientnetv2_rw_s',
17 | # 'timm-resnet50d',
18 | # 'timm-resnetaa50d',
19 | # 'timm-resnetblur50',
20 | # 'timm-swin_base_patch4_window7_224',
21 | # 'timm-vit_base_patch16_224',
22 | # 'timm-vit_base_patch32_224',
23 | # 'timm-vit_small_patch16_224',
24 | # 'ViT-B-16',
25 | # 'ViT-B-16-plus',
26 | # 'ViT-B-16-plus-240',
27 | # 'ViT-B-32',
28 | # 'ViT-B-32-plus-256',
29 | # 'ViT-B-32-quickgelu',
30 | # 'ViT-g-14',
31 | # 'ViT-H-14',
32 | # 'ViT-H-16',
33 | # 'ViT-L-14',
34 | # 'ViT-L-14-280',
35 | # 'ViT-L-14-336',
36 | # 'ViT-L-16',
37 | # 'ViT-L-16-320']
38 |
39 | pretrained: openai
40 | # "openai"
41 | # following not available for all models
42 | # "yfcc15m"
43 | # "cc12m"
44 | # "laion400m_e31"
45 | # "laion400m_e32"
46 | # "laion400m_avg"
47 |
48 | preprocess:
49 | vis_processor:
50 | eval:
51 | name: "clip_image_eval"
52 | image_size: 224
53 |
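54 | # Loading sketch (an assumption: the zoo exposes this as name="clip_feature_extractor",
55 | # model_type="ViT-B-32"; weights resolve through open_clip's "openai" pretrained tag):
56 | #   from lavis.models import load_model_and_preprocess
57 | #   model, vis, txt = load_model_and_preprocess("clip_feature_extractor", "ViT-B-32", is_eval=True)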
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip_vit_large14.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: clip
8 |
9 | model_type: ViT-L-14
10 | # ['RN50',
11 | # 'RN50-quickgelu',
12 | # 'RN50x4',
13 | # 'RN50x16',
14 | # 'RN101',
15 | # 'RN101-quickgelu',
16 | # 'timm-efficientnetv2_rw_s',
17 | # 'timm-resnet50d',
18 | # 'timm-resnetaa50d',
19 | # 'timm-resnetblur50',
20 | # 'timm-swin_base_patch4_window7_224',
21 | # 'timm-vit_base_patch16_224',
22 | # 'timm-vit_base_patch32_224',
23 | # 'timm-vit_small_patch16_224',
24 | # 'ViT-B-16',
25 | # 'ViT-B-16-plus',
26 | # 'ViT-B-16-plus-240',
27 | # 'ViT-B-32',
28 | # 'ViT-B-32-plus-256',
29 | # 'ViT-B-32-quickgelu',
30 | # 'ViT-g-14',
31 | # 'ViT-H-14',
32 | # 'ViT-H-16',
33 | # 'ViT-L-14',
34 | # 'ViT-L-14-280',
35 | # 'ViT-L-14-336',
36 | # 'ViT-L-16',
37 | # 'ViT-L-16-320']
38 |
39 | pretrained: openai
40 | # "openai"
41 | # following not available for all models
42 | # "yfcc15m"
43 | # "cc12m"
44 | # "laion400m_e31"
45 | # "laion400m_e32"
46 | # "laion400m_avg"
47 |
48 | preprocess:
49 | vis_processor:
50 | eval:
51 | name: "clip_image_eval"
52 | image_size: 224
53 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/clip_vit_large14_336.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: clip
8 |
9 | model_type: ViT-L-14-336
10 | # ['RN50',
11 | # 'RN50-quickgelu',
12 | # 'RN50x4',
13 | # 'RN50x16',
14 | # 'RN101',
15 | # 'RN101-quickgelu',
16 | # 'timm-efficientnetv2_rw_s',
17 | # 'timm-resnet50d',
18 | # 'timm-resnetaa50d',
19 | # 'timm-resnetblur50',
20 | # 'timm-swin_base_patch4_window7_224',
21 | # 'timm-vit_base_patch16_224',
22 | # 'timm-vit_base_patch32_224',
23 | # 'timm-vit_small_patch16_224',
24 | # 'ViT-B-16',
25 | # 'ViT-B-16-plus',
26 | # 'ViT-B-16-plus-240',
27 | # 'ViT-B-32',
28 | # 'ViT-B-32-plus-256',
29 | # 'ViT-B-32-quickgelu',
30 | # 'ViT-g-14',
31 | # 'ViT-H-14',
32 | # 'ViT-H-16',
33 | # 'ViT-L-14',
34 | # 'ViT-L-14-280',
35 | # 'ViT-L-14-336',
36 | # 'ViT-L-16',
37 | # 'ViT-L-16-320']
38 |
39 | pretrained: openai
40 | # "openai"
41 | # following not available for all models
42 | # "yfcc15m"
43 | # "cc12m"
44 | # "laion400m_e31"
45 | # "laion400m_e32"
46 | # "laion400m_avg"
47 |
48 | preprocess:
49 | vis_processor:
50 | eval:
51 | name: "clip_image_eval"
52 | image_size: 336
53 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/gpt_dialogue_base.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: gpt_dialogue
8 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth"
9 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth"
10 |
11 | len_tokenizer: 50264 # 50257 tokens from gpt2 default tokenizer + additional special tokens
12 |
13 | len_video_ft: 4224 # i3d_rgb: 2048 i3d_flow: 2048 vggish: 128
14 |
15 | preprocess:
16 | vis_processor:
17 | train:
18 | name: "gpt_video_ft"
19 | eval:
20 | name: "gpt_video_ft"
21 | text_processor:
22 | train:
23 | name: "gpt_dialogue"
24 | eval:
25 | name: "gpt_dialogue"
--------------------------------------------------------------------------------
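
The len_tokenizer comment above reflects the usual Hugging Face pattern: start from GPT-2's 50257-token vocabulary and register extra special tokens. A sketch with placeholder token names (the seven tokens below are illustrative; the dialogue model defines its own):

from transformers import GPT2Tokenizer

# Illustrative only: the arithmetic 50257 + 7 = 50264 is the point here,
# not the specific token strings, which are hypothetical.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
assert len(tokenizer) == 50257

tokenizer.add_special_tokens(
    {"additional_special_tokens": ["<video>", "<cap>", "<q>", "<a>", "<s0>", "<s1>", "<pad>"]}
)
assert len(tokenizer) == 50264  # matches len_tokenizer in the config above
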
/Lavis/lavis/configs/models/img2prompt-vqa/img2prompt_vqa_base.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: img2prompt_vqa
8 | model_type: base
9 |
10 | image_question_matching_model:
11 | arch: blip_image_text_matching
12 | load_finetuned: True
13 |
14 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth"
15 |
16 | # vit encoder
17 | vit_type: "large"
18 | vit_grad_ckpt: False
19 | vit_ckpt_layer: 0
20 |
21 | image_size: 384
22 |
23 | # bert config
24 | med_config_path: "configs/models/med_large_config.json"
25 |
26 | embed_dim: 256
27 |
28 | image_captioning_model:
29 | arch: blip_caption
30 | load_finetuned: True
31 |
32 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth"
33 |
34 | vit_type: "large"
35 | vit_grad_ckpt: True
36 | vit_ckpt_layer: 5
37 |
38 | image_size: 384
39 |
40 | # bert config
41 | med_config_path: "configs/models/med_large_config.json"
42 |
43 | # generation configs
44 | prompt: "a picture of "
45 |
46 |   question_generation_model:
47 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/projects/img2prompt/T5_large_QG.pth"
48 |
49 |
50 |
51 | preprocess:
52 | vis_processor:
53 | eval:
54 | name: "blip_image_eval"
55 | image_size: 384
56 | text_processor:
57 | eval:
58 | name: "blip_caption"
59 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/med_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "BertModel"
4 | ],
5 | "attention_probs_dropout_prob": 0.1,
6 | "hidden_act": "gelu",
7 | "hidden_dropout_prob": 0.1,
8 | "hidden_size": 768,
9 | "initializer_range": 0.02,
10 | "intermediate_size": 3072,
11 | "layer_norm_eps": 1e-12,
12 | "max_position_embeddings": 512,
13 | "model_type": "bert",
14 | "num_attention_heads": 12,
15 | "num_hidden_layers": 12,
16 | "pad_token_id": 0,
17 | "add_type_embeddings": false,
18 | "vocab_size": 30524,
19 | "encoder_width": 768,
20 | "add_cross_attention": true
21 | }
--------------------------------------------------------------------------------
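
This MED (mixture of encoder-decoder) config is structurally a plain BERT config; vocab_size is 30524 rather than BERT-base's 30522 because BLIP appends extra special tokens. As a sketch, it can be inspected with Hugging Face transformers (the path assumes you run from the root of a LAVIS checkout):

from transformers import BertConfig

# Sketch: read the MED config as a generic BertConfig; LAVIS feeds it to its
# own MED BERT variant, which also consumes the nonstandard keys
# (encoder_width, add_type_embeddings).
cfg = BertConfig.from_json_file("lavis/configs/models/med_config.json")
print(cfg.hidden_size, cfg.num_hidden_layers, cfg.vocab_size)  # 768 12 30524
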
/Lavis/lavis/configs/models/med_config_albef.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "BertModel"
4 | ],
5 | "attention_probs_dropout_prob": 0.1,
6 | "hidden_act": "gelu",
7 | "hidden_dropout_prob": 0.1,
8 | "hidden_size": 768,
9 | "initializer_range": 0.02,
10 | "intermediate_size": 3072,
11 | "layer_norm_eps": 1e-12,
12 | "max_position_embeddings": 512,
13 | "model_type": "bert",
14 | "num_attention_heads": 12,
15 | "num_hidden_layers": 12,
16 | "pad_token_id": 0,
17 | "add_type_embeddings": false,
18 | "vocab_size": 30522,
19 | "encoder_width": 768,
20 | "add_cross_attention": true,
21 | "fusion_layer": 6
22 | }
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/med_large_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "BertModel"
4 | ],
5 | "attention_probs_dropout_prob": 0.1,
6 | "hidden_act": "gelu",
7 | "hidden_dropout_prob": 0.1,
8 | "hidden_size": 768,
9 | "initializer_range": 0.02,
10 | "intermediate_size": 3072,
11 | "layer_norm_eps": 1e-12,
12 | "max_position_embeddings": 512,
13 | "model_type": "bert",
14 | "num_attention_heads": 12,
15 | "num_hidden_layers": 12,
16 | "pad_token_id": 0,
17 | "add_type_embeddings": false,
18 | "vocab_size": 30524,
19 | "encoder_width": 1024,
20 | "add_cross_attention": true
21 | }
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/pnp-vqa/pnp_vqa_3b.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: pnp_vqa
8 | model_type: 3b
9 |
10 | image_question_matching_model:
11 | arch: blip_image_text_matching
12 | load_finetuned: True
13 |
14 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth"
15 |
16 | # vit encoder
17 | vit_type: "large"
18 | vit_grad_ckpt: False
19 | vit_ckpt_layer: 0
20 |
21 | image_size: 384
22 |
23 | # bert config
24 | med_config_path: "configs/models/med_large_config.json"
25 |
26 | embed_dim: 256
27 |
28 | image_captioning_model:
29 | arch: blip_caption
30 | load_finetuned: True
31 |
32 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth"
33 |
34 | vit_type: "large"
35 | vit_grad_ckpt: True
36 | vit_ckpt_layer: 5
37 |
38 | image_size: 384
39 |
40 | # bert config
41 | med_config_path: "configs/models/med_large_config.json"
42 |
43 | # generation configs
44 | prompt: "a picture of "
45 |
46 | question_answering_model:
47 | arch: pnp_unifiedqav2_fid
48 |
49 | pretrained: "allenai/unifiedqa-v2-t5-3b-1363200"
50 |
51 | t5_config_path: "configs/models/pnp-vqa/unifiedqav2_3b_config.json"
52 |
53 | preprocess:
54 | vis_processor:
55 | eval:
56 | name: "blip_image_eval"
57 | image_size: 384
58 | text_processor:
59 | eval:
60 | name: "blip_caption"
61 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/pnp-vqa/pnp_vqa_base.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: pnp_vqa
8 | model_type: base
9 |
10 | image_question_matching_model:
11 | arch: blip_image_text_matching
12 | load_finetuned: True
13 |
14 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth"
15 |
16 | # vit encoder
17 | vit_type: "large"
18 | vit_grad_ckpt: False
19 | vit_ckpt_layer: 0
20 |
21 | image_size: 384
22 |
23 | # bert config
24 | med_config_path: "configs/models/med_large_config.json"
25 |
26 | embed_dim: 256
27 |
28 | image_captioning_model:
29 | arch: blip_caption
30 | load_finetuned: True
31 |
32 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth"
33 |
34 | vit_type: "large"
35 | vit_grad_ckpt: True
36 | vit_ckpt_layer: 5
37 |
38 | image_size: 384
39 |
40 | # bert config
41 | med_config_path: "configs/models/med_large_config.json"
42 |
43 | # generation configs
44 | prompt: "a picture of "
45 | question_answering_model:
46 | arch: pnp_unifiedqav2_fid
47 |
48 | pretrained: "allenai/unifiedqa-v2-t5-base-1363200"
49 |
50 | t5_config_path: "configs/models/pnp-vqa/unifiedqav2_base_config.json"
51 |
52 | preprocess:
53 | vis_processor:
54 | eval:
55 | name: "blip_image_eval"
56 | image_size: 384
57 | text_processor:
58 | eval:
59 | name: "blip_caption"
60 |
--------------------------------------------------------------------------------
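
The pnp_vqa variants in this directory differ only in their question_answering_model; the model_type key selects among them. A sketch, assuming a standard LAVIS install where the model is registered as "pnp_vqa":

import torch
from lavis.models import load_model_and_preprocess

# Sketch: model_type="base" resolves to the config above; "large" and "3b"
# pick the sibling configs in this directory.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, vis_processors, txt_processors = load_model_and_preprocess(
    name="pnp_vqa", model_type="base", is_eval=True, device=device
)
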
/Lavis/lavis/configs/models/pnp-vqa/pnp_vqa_large.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: pnp_vqa
8 | model_type: large
9 |
10 | image_question_matching_model:
11 | arch: blip_image_text_matching
12 | load_finetuned: True
13 |
14 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth"
15 |
16 | # vit encoder
17 | vit_type: "large"
18 | vit_grad_ckpt: False
19 | vit_ckpt_layer: 0
20 |
21 | image_size: 384
22 |
23 | # bert config
24 | med_config_path: "configs/models/med_large_config.json"
25 |
26 | embed_dim: 256
27 |
28 | image_captioning_model:
29 | arch: blip_caption
30 | load_finetuned: True
31 |
32 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth"
33 |
34 | vit_type: "large"
35 | vit_grad_ckpt: True
36 | vit_ckpt_layer: 5
37 |
38 | image_size: 384
39 |
40 | # bert config
41 | med_config_path: "configs/models/med_large_config.json"
42 |
43 | # generation configs
44 | prompt: "a picture of "
45 |
46 | question_answering_model:
47 | arch: pnp_unifiedqav2_fid
48 |
49 | pretrained: "allenai/unifiedqa-v2-t5-large-1363200"
50 |
51 | t5_config_path: "configs/models/pnp-vqa/unifiedqav2_large_config.json"
52 |
53 | preprocess:
54 | vis_processor:
55 | eval:
56 | name: "blip_image_eval"
57 | image_size: 384
58 | text_processor:
59 | eval:
60 | name: "blip_caption"
61 |
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/pnp-vqa/unifiedqav2_3b_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "T5ForConditionalGeneration"
4 | ],
5 | "d_ff": 16384,
6 | "d_kv": 128,
7 | "d_model": 1024,
8 | "decoder_start_token_id": 0,
9 | "dense_act_fn": "relu",
10 | "dropout_rate": 0.1,
11 | "eos_token_id": 1,
12 | "feed_forward_proj": "relu",
13 | "gradient_checkpointing": false,
14 | "initializer_factor": 1.0,
15 | "is_encoder_decoder": true,
16 | "is_gated_act": false,
17 | "layer_norm_epsilon": 1e-06,
18 | "model_type": "t5",
19 | "n_positions": 512,
20 | "num_decoder_layers": 24,
21 | "num_heads": 32,
22 | "num_layers": 24,
23 | "output_past": true,
24 | "pad_token_id": 0,
25 | "relative_attention_max_distance": 128,
26 | "relative_attention_num_buckets": 32,
27 | "task_specific_params": {
28 | "summarization": {
29 | "early_stopping": true,
30 | "length_penalty": 2.0,
31 | "max_length": 200,
32 | "min_length": 30,
33 | "no_repeat_ngram_size": 3,
34 | "num_beams": 4,
35 | "prefix": "summarize: "
36 | },
37 | "translation_en_to_de": {
38 | "early_stopping": true,
39 | "max_length": 300,
40 | "num_beams": 4,
41 | "prefix": "translate English to German: "
42 | },
43 | "translation_en_to_fr": {
44 | "early_stopping": true,
45 | "max_length": 300,
46 | "num_beams": 4,
47 | "prefix": "translate English to French: "
48 | },
49 | "translation_en_to_ro": {
50 | "early_stopping": true,
51 | "max_length": 300,
52 | "num_beams": 4,
53 | "prefix": "translate English to Romanian: "
54 | }
55 | },
56 | "torch_dtype": "float32",
57 | "transformers_version": "4.21.3",
58 | "use_cache": true,
59 | "vocab_size": 32128
60 | }
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/pnp-vqa/unifiedqav2_base_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "T5ForConditionalGeneration"
4 | ],
5 | "d_ff": 3072,
6 | "d_kv": 64,
7 | "d_model": 768,
8 | "decoder_start_token_id": 0,
9 | "dense_act_fn": "relu",
10 | "dropout_rate": 0.1,
11 | "eos_token_id": 1,
12 | "feed_forward_proj": "relu",
13 | "gradient_checkpointing": false,
14 | "initializer_factor": 1.0,
15 | "is_encoder_decoder": true,
16 | "is_gated_act": false,
17 | "layer_norm_epsilon": 1e-06,
18 | "model_type": "t5",
19 | "n_positions": 512,
20 | "num_decoder_layers": 12,
21 | "num_heads": 12,
22 | "num_layers": 12,
23 | "output_past": true,
24 | "pad_token_id": 0,
25 | "relative_attention_max_distance": 128,
26 | "relative_attention_num_buckets": 32,
27 | "task_specific_params": {
28 | "summarization": {
29 | "early_stopping": true,
30 | "length_penalty": 2.0,
31 | "max_length": 200,
32 | "min_length": 30,
33 | "no_repeat_ngram_size": 3,
34 | "num_beams": 4,
35 | "prefix": "summarize: "
36 | },
37 | "translation_en_to_de": {
38 | "early_stopping": true,
39 | "max_length": 300,
40 | "num_beams": 4,
41 | "prefix": "translate English to German: "
42 | },
43 | "translation_en_to_fr": {
44 | "early_stopping": true,
45 | "max_length": 300,
46 | "num_beams": 4,
47 | "prefix": "translate English to French: "
48 | },
49 | "translation_en_to_ro": {
50 | "early_stopping": true,
51 | "max_length": 300,
52 | "num_beams": 4,
53 | "prefix": "translate English to Romanian: "
54 | }
55 | },
56 | "transformers_version": "4.21.3",
57 | "use_cache": true,
58 | "vocab_size": 32128
59 | }
--------------------------------------------------------------------------------
/Lavis/lavis/configs/models/pnp-vqa/unifiedqav2_large_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "T5ForConditionalGeneration"
4 | ],
5 | "d_ff": 4096,
6 | "d_kv": 64,
7 | "d_model": 1024,
8 | "decoder_start_token_id": 0,
9 | "dense_act_fn": "relu",
10 | "dropout_rate": 0.1,
11 | "eos_token_id": 1,
12 | "feed_forward_proj": "relu",
13 | "gradient_checkpointing": false,
14 | "initializer_factor": 1.0,
15 | "is_encoder_decoder": true,
16 | "is_gated_act": false,
17 | "layer_norm_epsilon": 1e-06,
18 | "model_type": "t5",
19 | "n_positions": 512,
20 | "num_decoder_layers": 24,
21 | "num_heads": 16,
22 | "num_layers": 24,
23 | "output_past": true,
24 | "pad_token_id": 0,
25 | "relative_attention_max_distance": 128,
26 | "relative_attention_num_buckets": 32,
27 | "task_specific_params": {
28 | "summarization": {
29 | "early_stopping": true,
30 | "length_penalty": 2.0,
31 | "max_length": 200,
32 | "min_length": 30,
33 | "no_repeat_ngram_size": 3,
34 | "num_beams": 4,
35 | "prefix": "summarize: "
36 | },
37 | "translation_en_to_de": {
38 | "early_stopping": true,
39 | "max_length": 300,
40 | "num_beams": 4,
41 | "prefix": "translate English to German: "
42 | },
43 | "translation_en_to_fr": {
44 | "early_stopping": true,
45 | "max_length": 300,
46 | "num_beams": 4,
47 | "prefix": "translate English to French: "
48 | },
49 | "translation_en_to_ro": {
50 | "early_stopping": true,
51 | "max_length": 300,
52 | "num_beams": 4,
53 | "prefix": "translate English to Romanian: "
54 | }
55 | },
56 | "transformers_version": "4.21.3",
57 | "use_cache": true,
58 | "vocab_size": 32128
59 | }
--------------------------------------------------------------------------------
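
The three unifiedqav2_*_config.json files are stock T5 architecture configs for the UnifiedQA-v2 reader; the pretrained names in the yaml files (e.g. allenai/unifiedqa-v2-t5-base-1363200) supply the weights. A sketch of instantiating the architecture alone, without downloading a checkpoint (the path assumes a LAVIS checkout root):

from transformers import T5Config, T5ForConditionalGeneration

# Sketch: builds a randomly initialized T5 reader from the config file only.
cfg = T5Config.from_json_file("lavis/configs/models/pnp-vqa/unifiedqav2_base_config.json")
model = T5ForConditionalGeneration(cfg)
print(f"{model.num_parameters() / 1e6:.0f}M parameters")  # roughly 220M for base
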
/Lavis/lavis/datasets/builders/caption_builder.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder
9 | from lavis.datasets.datasets.coco_caption_datasets import (
10 | COCOCapDataset,
11 | COCOCapEvalDataset,
12 | NoCapsEvalDataset,
13 | )
14 |
15 | from lavis.common.registry import registry
16 | from lavis.datasets.datasets.video_caption_datasets import (
17 | VideoCaptionDataset,
18 | VideoCaptionEvalDataset,
19 | )
20 |
21 |
22 | @registry.register_builder("coco_caption")
23 | class COCOCapBuilder(BaseDatasetBuilder):
24 | train_dataset_cls = COCOCapDataset
25 | eval_dataset_cls = COCOCapEvalDataset
26 |
27 | DATASET_CONFIG_DICT = {
28 | "default": "configs/datasets/coco/defaults_cap.yaml",
29 | }
30 |
31 |
32 | @registry.register_builder("nocaps")
33 | class NoCapsBuilder(BaseDatasetBuilder):
34 | eval_dataset_cls = NoCapsEvalDataset
35 |
36 | DATASET_CONFIG_DICT = {
37 | "default": "configs/datasets/nocaps/defaults.yaml",
38 | }
39 |
40 |
41 | @registry.register_builder("msrvtt_caption")
42 | class MSRVTTCapBuilder(BaseDatasetBuilder):
43 | train_dataset_cls = VideoCaptionDataset
44 | eval_dataset_cls = VideoCaptionEvalDataset
45 |
46 | DATASET_CONFIG_DICT = {
47 | "default": "configs/datasets/msrvtt/defaults_cap.yaml",
48 | }
49 |
50 |
51 | @registry.register_builder("msvd_caption")
52 | class MSVDCapBuilder(BaseDatasetBuilder):
53 | train_dataset_cls = VideoCaptionDataset
54 | eval_dataset_cls = VideoCaptionEvalDataset
55 |
56 | DATASET_CONFIG_DICT = {
57 | "default": "configs/datasets/msvd/defaults_cap.yaml",
58 | }
59 |
60 |
61 | @registry.register_builder("vatex_caption")
62 | class VATEXCapBuilder(BaseDatasetBuilder):
63 | train_dataset_cls = VideoCaptionDataset
64 | eval_dataset_cls = VideoCaptionEvalDataset
65 |
66 | DATASET_CONFIG_DICT = {
67 | "default": "configs/datasets/vatex/defaults_cap.yaml",
68 | }
69 |
--------------------------------------------------------------------------------
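
The strings passed to registry.register_builder above are the public dataset names that the LAVIS loader resolves. A sketch, assuming the standard load_dataset helper and that the COCO images and annotations are already in the LAVIS cache:

from lavis.datasets.builders import load_dataset

# Sketch: "coco_caption" resolves to COCOCapBuilder, which reads
# configs/datasets/coco/defaults_cap.yaml and returns one dataset per split.
coco = load_dataset("coco_caption")
print(coco.keys())       # e.g. dict_keys(['train', 'val', 'test'])
print(coco["train"][0])  # e.g. {'image': ..., 'text_input': ..., 'image_id': ...}
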
/Lavis/lavis/datasets/builders/classification_builder.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from lavis.common.registry import registry
9 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder
10 | from lavis.datasets.datasets.nlvr_datasets import NLVRDataset, NLVREvalDataset
11 | from lavis.datasets.datasets.snli_ve_datasets import SNLIVisualEntialmentDataset
12 |
13 |
14 | @registry.register_builder("nlvr")
15 | class NLVRBuilder(BaseDatasetBuilder):
16 | train_dataset_cls = NLVRDataset
17 | eval_dataset_cls = NLVREvalDataset
18 |
19 | DATASET_CONFIG_DICT = {"default": "configs/datasets/nlvr/defaults.yaml"}
20 |
21 |
22 | @registry.register_builder("snli_ve")
23 | class SNLIVisualEntailmentBuilder(BaseDatasetBuilder):
24 | train_dataset_cls = SNLIVisualEntialmentDataset
25 | eval_dataset_cls = SNLIVisualEntialmentDataset
26 |
27 | DATASET_CONFIG_DICT = {"default": "configs/datasets/snli_ve/defaults.yaml"}
28 |
--------------------------------------------------------------------------------
/Lavis/lavis/datasets/builders/dialogue_builder.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from lavis.common.registry import registry
9 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder
10 | from lavis.datasets.datasets.avsd_dialogue_datasets import (
11 | AVSDDialDataset,
12 | AVSDDialEvalDataset,
13 | )
14 |
15 |
16 | @registry.register_builder("avsd_dialogue")
17 | class AVSDDialBuilder(BaseDatasetBuilder):
18 | train_dataset_cls = AVSDDialDataset
19 | eval_dataset_cls = AVSDDialEvalDataset
20 |
21 | DATASET_CONFIG_DICT = {"default": "configs/datasets/avsd/defaults_dial.yaml"}
22 |
--------------------------------------------------------------------------------
/Lavis/lavis/datasets/builders/retrieval_builder.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder
9 | from lavis.datasets.datasets.retrieval_datasets import (
10 | RetrievalDataset,
11 | RetrievalEvalDataset,
12 | VideoRetrievalDataset,
13 | VideoRetrievalEvalDataset,
14 | )
15 |
16 | from lavis.common.registry import registry
17 |
18 |
19 | @registry.register_builder("msrvtt_retrieval")
20 | class MSRVTTRetrievalBuilder(BaseDatasetBuilder):
21 | train_dataset_cls = VideoRetrievalDataset
22 | eval_dataset_cls = VideoRetrievalEvalDataset
23 |
24 | DATASET_CONFIG_DICT = {"default": "configs/datasets/msrvtt/defaults_ret.yaml"}
25 |
26 |
27 | @registry.register_builder("didemo_retrieval")
28 | class DiDeMoRetrievalBuilder(BaseDatasetBuilder):
29 | train_dataset_cls = VideoRetrievalDataset
30 | eval_dataset_cls = VideoRetrievalEvalDataset
31 |
32 | DATASET_CONFIG_DICT = {"default": "configs/datasets/didemo/defaults_ret.yaml"}
33 |
34 |
35 | @registry.register_builder("coco_retrieval")
36 | class COCORetrievalBuilder(BaseDatasetBuilder):
37 | train_dataset_cls = RetrievalDataset
38 | eval_dataset_cls = RetrievalEvalDataset
39 |
40 | DATASET_CONFIG_DICT = {"default": "configs/datasets/coco/defaults_ret.yaml"}
41 |
42 |
43 | @registry.register_builder("flickr30k")
44 | class Flickr30kBuilder(BaseDatasetBuilder):
45 | train_dataset_cls = RetrievalDataset
46 | eval_dataset_cls = RetrievalEvalDataset
47 |
48 | DATASET_CONFIG_DICT = {"default": "configs/datasets/flickr30k/defaults.yaml"}
49 |
--------------------------------------------------------------------------------
/Lavis/lavis/datasets/builders/video_qa_builder.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from lavis.common.registry import registry
9 | from lavis.common.utils import get_cache_path
10 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder
11 | from lavis.datasets.datasets.video_vqa_datasets import VideoQADataset
12 |
13 |
14 | class VideoQABuilder(BaseDatasetBuilder):
15 | train_dataset_cls = VideoQADataset
16 | eval_dataset_cls = VideoQADataset
17 |
18 | def build(self):
19 | datasets = super().build()
20 |
21 | ans2label = self.config.build_info.annotations.get("ans2label")
22 | if ans2label is None:
23 | raise ValueError("ans2label is not specified in build_info.")
24 |
25 | ans2label = get_cache_path(ans2label.storage)
26 |
27 | for split in datasets:
28 | datasets[split]._build_class_labels(ans2label)
29 |
30 | return datasets
31 |
32 |
33 | @registry.register_builder("msrvtt_qa")
34 | class MSRVTTQABuilder(VideoQABuilder):
35 | DATASET_CONFIG_DICT = {
36 | "default": "configs/datasets/msrvtt/defaults_qa.yaml",
37 | }
38 |
39 |
40 | @registry.register_builder("msvd_qa")
41 | class MSVDQABuilder(VideoQABuilder):
42 | DATASET_CONFIG_DICT = {
43 | "default": "configs/datasets/msvd/defaults_qa.yaml",
44 | }
45 |
--------------------------------------------------------------------------------
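
The build() override above expects build_info.annotations.ans2label to point at a JSON file mapping answer strings to class indices; VideoQADataset._build_class_labels (in lavis/datasets/datasets/video_vqa_datasets.py, shown later) loads it verbatim, and _get_answer_label maps unseen answers to index len(class_labels). A sketch of the expected file format, with illustrative answers and a hypothetical filename:

import json

# Sketch: a flat {answer string -> class index} mapping.
ans2label = {"yes": 0, "no": 1, "two": 2, "playing": 3}
with open("msvd_ans2label.json", "w") as f:  # hypothetical filename
    json.dump(ans2label, f)
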
/Lavis/lavis/datasets/builders/vqa_builder.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder
9 |
10 | from lavis.common.registry import registry
11 | from lavis.datasets.datasets.aok_vqa_datasets import AOKVQADataset, AOKVQAEvalDataset
12 | from lavis.datasets.datasets.coco_vqa_datasets import COCOVQADataset, COCOVQAEvalDataset
13 | from lavis.datasets.datasets.vg_vqa_datasets import VGVQADataset
14 | from lavis.datasets.datasets.gqa_datasets import GQADataset, GQAEvalDataset
15 |
16 |
17 | @registry.register_builder("coco_vqa")
18 | class COCOVQABuilder(BaseDatasetBuilder):
19 | train_dataset_cls = COCOVQADataset
20 | eval_dataset_cls = COCOVQAEvalDataset
21 |
22 | DATASET_CONFIG_DICT = {
23 | "default": "configs/datasets/coco/defaults_vqa.yaml",
24 | "eval": "configs/datasets/coco/eval_vqa.yaml",
25 | }
26 |
27 |
28 | @registry.register_builder("vg_vqa")
29 | class VGVQABuilder(BaseDatasetBuilder):
30 | train_dataset_cls = VGVQADataset
31 | DATASET_CONFIG_DICT = {"default": "configs/datasets/vg/defaults_vqa.yaml"}
32 |
33 |
34 | @registry.register_builder("ok_vqa")
35 | class OKVQABuilder(COCOVQABuilder):
36 | DATASET_CONFIG_DICT = {
37 | "default": "configs/datasets/okvqa/defaults.yaml",
38 | }
39 |
40 |
41 | @registry.register_builder("aok_vqa")
42 | class AOKVQABuilder(BaseDatasetBuilder):
43 | train_dataset_cls = AOKVQADataset
44 | eval_dataset_cls = AOKVQAEvalDataset
45 |
46 | DATASET_CONFIG_DICT = {
47 | "default": "configs/datasets/aokvqa/defaults.yaml",
48 | "eval": "configs/datasets/aokvqa/eval_aokvqa.yaml",
49 | }
50 |
51 |
52 | @registry.register_builder("gqa")
53 | class GQABuilder(BaseDatasetBuilder):
54 | train_dataset_cls = GQADataset
55 | eval_dataset_cls = GQAEvalDataset
56 |
57 | DATASET_CONFIG_DICT = {
58 | "default": "configs/datasets/gqa/defaults.yaml",
59 | "balanced_val": "configs/datasets/gqa/balanced_val.yaml",
60 | "balanced_testdev": "configs/datasets/gqa/balanced_testdev.yaml",
61 | }
--------------------------------------------------------------------------------
/Lavis/lavis/datasets/datasets/image_text_pair_datasets.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | import os
9 | from collections import OrderedDict
10 |
11 | from lavis.datasets.datasets.base_dataset import BaseDataset
12 | from PIL import Image
13 |
14 |
15 | class __DisplMixin:
16 | def displ_item(self, index):
17 | sample, ann = self.__getitem__(index), self.annotation[index]
18 |
19 | return OrderedDict(
20 | {
21 | "file": os.path.basename(ann["image"]),
22 | "caption": ann["caption"],
23 | "image": sample["image"],
24 | }
25 | )
26 |
27 |
28 | class ImageTextPairDataset(BaseDataset, __DisplMixin):
29 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
30 | """
31 | vis_root (string): Root directory of images (e.g. coco/images/)
32 |         ann_paths (list of string): paths to the annotation files
33 | """
34 | super().__init__(vis_processor, text_processor, vis_root, ann_paths)
35 |
36 | def __getitem__(self, index):
37 |
38 | # TODO this assumes image input, not general enough
39 | ann = self.annotation[index]
40 |
41 | image_path = os.path.join(self.vis_root, ann["image"])
42 | image = Image.open(image_path).convert("RGB")
43 |
44 | image = self.vis_processor(image)
45 | caption = self.text_processor(ann["caption"])
46 |
47 | return {"image": image, "text_input": caption}
48 |
--------------------------------------------------------------------------------
/Lavis/lavis/datasets/datasets/imagefolder_dataset.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | import os
9 | from collections import OrderedDict
10 |
11 | from lavis.datasets.datasets.base_dataset import BaseDataset
12 | from PIL import Image
13 | from torchvision import datasets
14 |
15 |
16 | class ImageFolderDataset(BaseDataset):
17 | def __init__(self, vis_processor, vis_root, classnames=[], **kwargs):
18 | super().__init__(vis_processor=vis_processor, vis_root=vis_root)
19 |
20 | self.inner_dataset = datasets.ImageFolder(vis_root)
21 |
22 | self.annotation = [
23 | {"image": elem[0], "label": elem[1], "image_id": elem[0]}
24 | for elem in self.inner_dataset.imgs
25 | ]
26 |
27 | self.classnames = classnames
28 |
29 | self._add_instance_ids()
30 |
31 | def __len__(self):
32 | return len(self.inner_dataset)
33 |
34 | def __getitem__(self, index):
35 | ann = self.annotation[index]
36 |
37 | img_fn = ann["image"]
38 | image_path = os.path.join(self.vis_root, img_fn)
39 | image = Image.open(image_path).convert("RGB")
40 |
41 | image = self.vis_processor(image)
42 |
43 | return {
44 | "image": image,
45 | "label": ann["label"],
46 | "image_id": ann["image_id"],
47 | "instance_id": ann["instance_id"],
48 | }
49 |
50 | def displ_item(self, index):
51 | sample, ann = self.__getitem__(index), self.annotation[index]
52 |
53 | return OrderedDict(
54 | {
55 | "file": ann["image"],
56 | "label": self.classnames[ann["label"]],
57 | "image": sample["image"],
58 | }
59 | )
60 |
--------------------------------------------------------------------------------
/Lavis/lavis/datasets/datasets/laion_dataset.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | import webdataset as wds
9 | from lavis.datasets.datasets.base_dataset import BaseDataset
10 |
11 |
12 | class LaionDataset(BaseDataset):
13 | def __init__(self, vis_processor, text_processor, location):
14 | super().__init__(vis_processor=vis_processor, text_processor=text_processor)
15 |
16 | self.inner_dataset = wds.DataPipeline(
17 | wds.ResampledShards(location),
18 | wds.tarfile_to_samples(handler=wds.warn_and_continue),
19 | wds.shuffle(1000, handler=wds.warn_and_continue),
20 | wds.decode("pilrgb", handler=wds.warn_and_continue),
21 | wds.to_tuple("jpg", "json", handler=wds.warn_and_continue),
22 | wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue),
23 | wds.map(self.to_dict, handler=wds.warn_and_continue),
24 | )
25 |
26 | def to_dict(self, sample):
27 | return {
28 | "image": sample[0],
29 | "text_input": self.text_processor(sample[1]["caption"]),
30 | }
31 |
32 |
33 | if __name__ == "__main__":
34 | from torchvision import transforms
35 |
36 | def to_image_text_pair(sample):
37 | return sample[0], sample[1]["caption"]
38 |
39 | normalize = transforms.Normalize(
40 | (0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)
41 | )
42 |
43 | transform_train = transforms.Compose(
44 | [
45 | transforms.RandomResizedCrop(256, scale=(0.2, 1.0)),
46 | transforms.RandomHorizontalFlip(),
47 | transforms.ToTensor(),
48 | normalize,
49 | ]
50 | )
51 |
52 | dataset = LaionDataset(
53 | vis_processor=transform_train,
54 | text_processor=lambda x: x,
55 | location="/export/laion/laion2B-multi/part-00000/{00000..01743}.tar",
56 | )
57 |
58 | import torch
59 |
60 | loader = torch.utils.data.DataLoader(dataset.inner_dataset, batch_size=2)
61 |
62 | print(next(iter(loader))["text_input"])
63 |
--------------------------------------------------------------------------------
/Lavis/lavis/datasets/datasets/multimodal_classification_datasets.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from abc import abstractmethod
9 | from lavis.datasets.datasets.base_dataset import BaseDataset
10 |
11 |
12 | class MultimodalClassificationDataset(BaseDataset):
13 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
14 | super().__init__(vis_processor, text_processor, vis_root, ann_paths)
15 |
16 | self.class_labels = None
17 |
18 | @abstractmethod
19 | def _build_class_labels(self):
20 | pass
21 |
--------------------------------------------------------------------------------
/Lavis/lavis/datasets/datasets/snli_ve_datasets.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | import os
9 | from collections import OrderedDict
10 |
11 | from lavis.datasets.datasets.multimodal_classification_datasets import (
12 | MultimodalClassificationDataset,
13 | )
14 | from PIL import Image
15 |
16 |
17 | class __DisplMixin:
18 | def displ_item(self, index):
19 | sample, ann = self.__getitem__(index), self.annotation[index]
20 |
21 | return OrderedDict(
22 | {
23 | "file": os.path.basename(ann["image"]),
24 | "sentence": ann["sentence"],
25 | "label": ann["label"],
26 | "image": sample["image"],
27 | }
28 | )
29 |
30 |
31 | class SNLIVisualEntialmentDataset(MultimodalClassificationDataset, __DisplMixin):
32 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
33 | super().__init__(vis_processor, text_processor, vis_root, ann_paths)
34 |
35 | self.class_labels = self._build_class_labels()
36 |
37 | def _build_class_labels(self):
38 | return {"contradiction": 0, "neutral": 1, "entailment": 2}
39 |
40 | def __getitem__(self, index):
41 | ann = self.annotation[index]
42 |
43 | image_id = ann["image"]
44 | image_path = os.path.join(self.vis_root, "%s.jpg" % image_id)
45 | image = Image.open(image_path).convert("RGB")
46 |
47 | image = self.vis_processor(image)
48 | sentence = self.text_processor(ann["sentence"])
49 |
50 | return {
51 | "image": image,
52 | "text_input": sentence,
53 | "label": self.class_labels[ann["label"]],
54 | "image_id": image_id,
55 | "instance_id": ann["instance_id"],
56 | }
57 |
--------------------------------------------------------------------------------
/Lavis/lavis/datasets/datasets/vg_vqa_datasets.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | import os
9 |
10 | from PIL import Image
11 |
12 | from lavis.datasets.datasets.vqa_datasets import VQADataset
13 |
14 |
15 | class VGVQADataset(VQADataset):
16 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
17 | super().__init__(vis_processor, text_processor, vis_root, ann_paths)
18 |
19 | def __getitem__(self, index):
20 | ann = self.annotation[index]
21 |
22 | image_path = os.path.join(self.vis_root, ann["image"])
23 | image = Image.open(image_path).convert("RGB")
24 |
25 | image = self.vis_processor(image)
26 | question = self.text_processor(ann["question"])
27 |
28 | answers = [ann["answer"]]
29 | # TODO this should be configured better
30 | weights = [0.2]
31 |
32 | return {
33 | "image": image,
34 | "text_input": question,
35 | "answers": answers,
36 | "weights": weights,
37 | }
38 |
--------------------------------------------------------------------------------
/Lavis/lavis/datasets/datasets/video_caption_datasets.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | import os
9 | from lavis.datasets.datasets.base_dataset import BaseDataset
10 |
11 | from lavis.datasets.datasets.caption_datasets import CaptionDataset
12 |
13 |
14 | class VideoCaptionDataset(CaptionDataset):
15 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
16 | """
17 | vis_root (string): Root directory of images (e.g. coco/images/)
18 |         ann_paths (list of string): paths to the annotation files
19 | split (string): val or test
20 | """
21 | super().__init__(vis_processor, text_processor, vis_root, ann_paths)
22 |
23 | def __getitem__(self, index):
24 |
25 | ann = self.annotation[index]
26 |
27 | vname = ann["video"]
28 | video_path = os.path.join(self.vis_root, vname)
29 |
30 | video = self.vis_processor(video_path)
31 | caption = self.text_processor(ann["caption"])
32 |
33 | # "image_id" is kept to stay compatible with the COCO evaluation format
34 | return {
35 | "video": video,
36 | "text_input": caption,
37 | "image_id": self.img_ids[ann["image_id"]],
38 | }
39 |
40 |
41 | class VideoCaptionEvalDataset(BaseDataset):
42 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
43 | """
44 | vis_root (string): Root directory of images (e.g. coco/images/)
45 |         ann_paths (list of string): paths to the annotation files
46 | split (string): val or test
47 | """
48 | super().__init__(vis_processor, text_processor, vis_root, ann_paths)
49 |
50 | def __getitem__(self, index):
51 |
52 | ann = self.annotation[index]
53 |
54 | vname = ann["video"]
55 | video_path = os.path.join(self.vis_root, vname)
56 |
57 | video = self.vis_processor(video_path)
58 |
59 | return {
60 | "video": video,
61 | "image_id": ann["image_id"],
62 | "instance_id": ann["instance_id"],
63 | }
64 |
--------------------------------------------------------------------------------
/Lavis/lavis/datasets/datasets/video_vqa_datasets.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | import json
9 | import os
10 | from collections import OrderedDict
11 |
12 | from lavis.datasets.datasets.multimodal_classification_datasets import (
13 | MultimodalClassificationDataset,
14 | )
15 |
16 |
17 | class __DisplMixin:
18 | def displ_item(self, index):
19 | ann = self.annotation[index]
20 |
21 | vname = ann["video"]
22 | vpath = os.path.join(self.vis_root, vname)
23 |
24 | return OrderedDict(
25 | {"file": vpath, "question": ann["question"], "answer": ann["answer"]}
26 | )
27 |
28 |
29 | class VideoQADataset(MultimodalClassificationDataset, __DisplMixin):
30 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
31 | super().__init__(vis_processor, text_processor, vis_root, ann_paths)
32 |
33 | def _build_class_labels(self, ans_path):
34 | ans2label = json.load(open(ans_path))
35 |
36 | self.class_labels = ans2label
37 |
38 | def _get_answer_label(self, answer):
39 | if answer in self.class_labels:
40 | return self.class_labels[answer]
41 | else:
42 | return len(self.class_labels)
43 |
44 | def __getitem__(self, index):
45 | assert (
46 | self.class_labels
47 | ), f"class_labels of {__class__.__name__} is not built yet."
48 |
49 | ann = self.annotation[index]
50 |
51 | vname = ann["video"]
52 | vpath = os.path.join(self.vis_root, vname)
53 |
54 | frms = self.vis_processor(vpath)
55 | question = self.text_processor(ann["question"])
56 |
57 | return {
58 | "video": frms,
59 | "text_input": question,
60 | "answers": self._get_answer_label(ann["answer"]),
61 | "question_id": ann["question_id"],
62 | "instance_id": ann["instance_id"],
63 | }
64 |
--------------------------------------------------------------------------------
/Lavis/lavis/datasets/datasets/vqa_datasets.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | import torch
9 |
10 | from lavis.datasets.datasets.base_dataset import BaseDataset
11 |
12 |
13 | class VQADataset(BaseDataset):
14 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
15 | super().__init__(vis_processor, text_processor, vis_root, ann_paths)
16 |
17 | def collater(self, samples):
18 | image_list, question_list, answer_list, weight_list = [], [], [], []
19 |
20 | num_answers = []
21 |
22 | for sample in samples:
23 | image_list.append(sample["image"])
24 | question_list.append(sample["text_input"].capitalize())
25 |
26 | weight_list.extend(sample["weights"])
27 |
28 | answers = sample["answers"]
29 |
30 | answer_list.extend(answers)
31 | num_answers.append(len(answers))
32 |
33 | return {
34 | "image": torch.stack(image_list, dim=0),
35 | "text_input": question_list,
36 | "answer": answer_list,
37 | "weight": torch.Tensor(weight_list),
38 | "n_answers": torch.LongTensor(num_answers),
39 | }
40 |
41 |
42 | class VQAEvalDataset(BaseDataset):
43 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
44 | super().__init__(vis_processor, text_processor, vis_root, ann_paths)
45 |
--------------------------------------------------------------------------------
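
Because each question can carry a variable number of weighted answers, the collater above flattens all answers into one list and records per-question counts in n_answers so the loss can re-split them. A toy sketch of what a two-sample batch collates to under those rules:

import torch

# Toy batch: question 1 has two weighted answers, question 2 has one.
samples = [
    {"image": torch.zeros(3, 384, 384), "text_input": "what is this?",
     "answers": ["cat", "kitten"], "weights": [0.6, 0.4]},
    {"image": torch.zeros(3, 384, 384), "text_input": "how many dogs?",
     "answers": ["2"], "weights": [1.0]},
]

batch = {
    "image": torch.stack([s["image"] for s in samples], dim=0),  # (2, 3, 384, 384)
    "text_input": [s["text_input"].capitalize() for s in samples],
    "answer": [a for s in samples for a in s["answers"]],        # ['cat', 'kitten', '2']
    "weight": torch.Tensor([w for s in samples for w in s["weights"]]),
    "n_answers": torch.LongTensor([len(s["answers"]) for s in samples]),  # tensor([2, 1])
}
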
/Lavis/lavis/datasets/download_scripts/DownloadConceptualCaptions/LICENSE:
--------------------------------------------------------------------------------
1 | // Copyright 2022 Dongxu Li, Junnan Li, Hung Le, Guangsen Wang, Silvio Savarese, Steven Hoi. All rights reserved.
2 | // Use of this source code is governed by a BSD-style
3 | // license that can be found in the LICENSE file.
4 |
5 | MIT License
6 |
7 | Copyright (c) 2019 Igor Brigadir
8 |
9 | Permission is hereby granted, free of charge, to any person obtaining a copy
10 | of this software and associated documentation files (the "Software"), to deal
11 | in the Software without restriction, including without limitation the rights
12 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 | copies of the Software, and to permit persons to whom the Software is
14 | furnished to do so, subject to the following conditions:
15 |
16 | The above copyright notice and this permission notice shall be included in all
17 | copies or substantial portions of the Software.
18 |
19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25 | SOFTWARE.
26 |
--------------------------------------------------------------------------------
/Lavis/lavis/datasets/download_scripts/DownloadConceptualCaptions/README.md:
--------------------------------------------------------------------------------
1 |
7 |
8 | # Download Conceptual Captions Data
9 |
10 | Place data from: https://ai.google.com/research/ConceptualCaptions/download in this folder
11 |
12 | `Train_GCC-training.tsv / cc3m.tsv` Training Split (3,318,333)
13 |
14 | Run `download_data_cc3m.py` or `download_data_cc12m.py`.
15 |
16 | Images will be stored in the default LAVIS cache folders. You can stop and resume the download; the settings for splitting downloads into chunks/threads are not optimal, but they maxed out my connection, so I kept them as is.
17 |
18 | Note: a previous version of this script used a different file-naming scheme. If you are resuming a download started under the old scheme, you will get duplicates.
19 |
20 | Some downloads will fail and return web pages instead of images; these need to be cleaned up afterwards. See `downloaded_validation_report.tsv` once the download finishes for HTTP errors. Around 8% of images are gone, based on validation-set results. Setting a user agent might fix some errors as well, though it is unclear whether any sites reject requests based on it.
21 |
22 | Downloading the training data should take about a day or two; keep an eye on disk space.
23 |
--------------------------------------------------------------------------------
/Lavis/lavis/datasets/download_scripts/download_coco.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | import os
9 | from pathlib import Path
10 |
11 | from omegaconf import OmegaConf
12 |
13 | from lavis.common.utils import (
14 | cleanup_dir,
15 | download_and_extract_archive,
16 | get_abs_path,
17 | get_cache_path,
18 | )
19 |
20 |
21 | DATA_URL = {
22 | "train": "http://images.cocodataset.org/zips/train2014.zip", # md5: 0da8c0bd3d6becc4dcb32757491aca88
23 | "val": "http://images.cocodataset.org/zips/val2014.zip", # md5: a3d79f5ed8d289b7a7554ce06a5782b3
24 | "test": "http://images.cocodataset.org/zips/test2014.zip", # md5: 04127eef689ceac55e3a572c2c92f264
25 | "test2015": "http://images.cocodataset.org/zips/test2015.zip", # md5: 04127eef689ceac55e3a572c2c92f264
26 | }
27 |
28 |
29 | def download_datasets(root, url):
30 | download_and_extract_archive(url=url, download_root=root, extract_root=storage_dir)
31 |
32 |
33 | if __name__ == "__main__":
34 |
35 | config_path = get_abs_path("configs/datasets/coco/defaults_cap.yaml")
36 |
37 | storage_dir = OmegaConf.load(
38 | config_path
39 | ).datasets.coco_caption.build_info.images.storage
40 |
41 | download_dir = Path(get_cache_path(storage_dir)).parent / "download"
42 | storage_dir = Path(get_cache_path(storage_dir))
43 |
44 | if storage_dir.exists():
45 | print(f"Dataset already exists at {storage_dir}. Aborting.")
46 | exit(0)
47 |
48 | try:
49 | for k, v in DATA_URL.items():
50 | print("Downloading {} to {}".format(v, k))
51 | download_datasets(download_dir, v)
52 | except Exception as e:
53 | # remove download dir if failed
54 | cleanup_dir(download_dir)
55 |         print("Failed to download or extract datasets. Aborting.")
56 |
57 | cleanup_dir(download_dir)
58 |
--------------------------------------------------------------------------------
/Lavis/lavis/datasets/download_scripts/download_didemo.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | import os
9 | from pathlib import Path
10 |
11 | from omegaconf import OmegaConf
12 |
13 | from lavis.common.utils import (
14 | cleanup_dir,
15 | download_and_extract_archive,
16 | get_abs_path,
17 | get_cache_path,
18 | )
19 |
20 | DATA_URL = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/didemo_videos.tar.gz"
21 |
22 |
23 | def download_datasets(root, url):
24 | """
25 |     Download the DiDeMo video archive and expand it
26 |     in the folder provided as parameter
27 | """
28 | download_and_extract_archive(url=url, download_root=root)
29 |
30 |
31 | def move_files(download_path, storage_path):
32 | """
33 | Move files from download_path to storage_path
34 | """
35 | print("Moving to {}".format(storage_path))
36 |
37 | os.makedirs(storage_path, exist_ok=True)
38 |
39 | for file_name in os.listdir(download_path):
40 | os.rename(
41 | os.path.join(download_path, file_name),
42 | os.path.join(storage_path, file_name),
43 | )
44 |
45 |
46 | if __name__ == "__main__":
47 |
48 | config_path = get_abs_path("configs/datasets/didemo/defaults_ret.yaml")
49 |
50 | storage_dir = OmegaConf.load(
51 | config_path
52 | ).datasets.didemo_retrieval.build_info.videos.storage
53 |
54 | download_dir = Path(get_cache_path(storage_dir)).parent / "download"
55 | storage_dir = Path(get_cache_path(storage_dir))
56 |
57 | if storage_dir.exists():
58 | print(f"Dataset already exists at {storage_dir}. Aborting.")
59 | exit(0)
60 |
61 | try:
62 | print("Downloading {} to {}".format(DATA_URL, download_dir))
63 | download_datasets(download_dir, DATA_URL)
64 | except Exception as e:
65 | # remove download dir if failed
66 | cleanup_dir(download_dir)
67 |         print("Failed to download or extract datasets. Aborting.")
68 |
69 | move_files(download_dir / "videos", storage_dir)
70 | cleanup_dir(download_dir)
71 |
--------------------------------------------------------------------------------
/Lavis/lavis/datasets/download_scripts/download_flickr.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | import os
9 | from pathlib import Path
10 |
11 | from omegaconf import OmegaConf
12 |
13 | from lavis.common.utils import (
14 | cleanup_dir,
15 | get_abs_path,
16 | get_cache_path,
17 | )
18 |
19 | import opendatasets as od
20 |
21 |
22 | DATA_URL = "https://www.kaggle.com/datasets/hsankesara/flickr-image-dataset"
23 |
24 | print(
25 | """
26 | To download the dataset, you need to have a Kaggle account and the associated key.
27 | See https://www.kaggle.com/docs/api to create account and a new API token.
28 | """
29 | )
30 |
31 |
32 | def move_directory(src_dir, dst_dir):
33 | """
34 | Move files from download_path to storage_path
35 | """
36 | print("Moving to {}".format(dst_dir))
37 |
38 | os.makedirs(dst_dir, exist_ok=True)
39 |
40 | for file_name in os.listdir(src_dir):
41 | os.rename(
42 | os.path.join(src_dir, file_name),
43 | os.path.join(dst_dir, file_name),
44 | )
45 |
46 |
47 | if __name__ == "__main__":
48 |
49 | config_path = get_abs_path("configs/datasets/flickr30k/defaults.yaml")
50 |
51 | storage_dir = OmegaConf.load(
52 | config_path
53 | ).datasets.flickr30k.build_info.images.storage
54 |
55 | storage_dir = Path(get_cache_path(storage_dir))
56 | download_dir = storage_dir.parent / "download"
57 |
58 | if storage_dir.exists():
59 | print(f"Dataset already exists at {storage_dir}. Aborting.")
60 | exit(0)
61 |
62 | os.makedirs(download_dir)
63 |
64 | try:
65 | print("Downloading {} to {}".format(DATA_URL, download_dir))
66 | od.download(DATA_URL, download_dir)
67 | except Exception as e:
68 | print(e)
69 | # remove download dir if failed
70 | cleanup_dir(download_dir)
71 | exit(1)
72 |
73 | move_directory(
74 | download_dir / "flickr-image-dataset" / "flickr30k_images" / "flickr30k_images",
75 | storage_dir / "flickr30k-images",
76 | )
77 |
78 | cleanup_dir(download_dir)
79 |
--------------------------------------------------------------------------------
/Lavis/lavis/datasets/download_scripts/download_gqa.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | import os
9 | from pathlib import Path
10 |
11 | from omegaconf import OmegaConf
12 |
13 | from lavis.common.utils import (
14 | cleanup_dir,
15 | download_and_extract_archive,
16 | get_abs_path,
17 | get_cache_path,
18 | )
19 |
20 |
21 | DATA_URL = "https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip"
22 |
23 |
24 | def download_datasets(root, url):
25 | download_and_extract_archive(url=url, download_root=root, extract_root=storage_dir.parent)
26 |
27 |
28 | if __name__ == "__main__":
29 |
30 | config_path = get_abs_path("configs/datasets/gqa/defaults.yaml")
31 |
32 | storage_dir = OmegaConf.load(
33 | config_path
34 | ).datasets.gqa.build_info.images.storage
35 |
36 | download_dir = Path(get_cache_path(storage_dir)).parent / "download"
37 | storage_dir = Path(get_cache_path(storage_dir))
38 |
39 | if storage_dir.exists():
40 | print(f"Dataset already exists at {storage_dir}. Aborting.")
41 | exit(0)
42 |
43 | try:
44 | print("Downloading {}".format(DATA_URL))
45 | download_datasets(download_dir, DATA_URL)
46 | except Exception as e:
47 | # remove download dir if failed
48 | cleanup_dir(download_dir)
49 |         print("Failed to download or extract datasets. Aborting.")
50 |
51 | cleanup_dir(download_dir)
52 |
--------------------------------------------------------------------------------
/Lavis/lavis/datasets/download_scripts/download_msvd.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | import os
9 | from pathlib import Path
10 |
11 | from omegaconf import OmegaConf
12 |
13 | from lavis.common.utils import (
14 | cleanup_dir,
15 | download_and_extract_archive,
16 | get_abs_path,
17 | get_cache_path,
18 | )
19 |
20 |
21 | DATA_URL = "https://www.cs.utexas.edu/users/ml/clamp/videoDescription/YouTubeClips.tar"
22 |
23 |
24 | def download_datasets(root, url):
25 | download_and_extract_archive(url=url, download_root=root)
26 |
27 |
28 | def move_files(download_path, storage_path):
29 | """
30 | Move files from download_path to storage_path
31 | """
32 | print("Moving to {}".format(storage_path))
33 |
34 | os.makedirs(storage_path, exist_ok=True)
35 |
36 | for file_name in os.listdir(download_path):
37 | os.rename(
38 | os.path.join(download_path, file_name),
39 | os.path.join(storage_path, file_name),
40 | )
41 |
42 |
43 | if __name__ == "__main__":
44 |
45 | config_path = get_abs_path("configs/datasets/msvd/defaults_cap.yaml")
46 |
47 | storage_dir = OmegaConf.load(
48 | config_path
49 | ).datasets.msvd_cap.build_info.videos.storage
50 |
51 | download_dir = Path(get_cache_path(storage_dir)).parent / "download"
52 | storage_dir = Path(get_cache_path(storage_dir))
53 |
54 | if storage_dir.exists():
55 | print(f"Dataset already exists at {storage_dir}. Aborting.")
56 | exit(0)
57 |
58 | try:
59 | print("Downloading {}".format(DATA_URL))
60 | download_datasets(download_dir, DATA_URL)
61 | except Exception as e:
62 | # remove the download dir and abort if the download failed
63 | cleanup_dir(download_dir)
64 | print("Failed to download or extract datasets: {}. Aborting.".format(e))
65 | exit(1)
66 | move_files(download_dir / "YouTubeClips", storage_dir)
67 | cleanup_dir(download_dir)
68 |
--------------------------------------------------------------------------------
/Lavis/lavis/datasets/download_scripts/download_vg.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | import os
9 | from pathlib import Path
10 |
11 | from omegaconf import OmegaConf
12 |
13 | from lavis.common.utils import (
14 | cleanup_dir,
15 | download_and_extract_archive,
16 | get_abs_path,
17 | get_cache_path,
18 | )
19 |
20 |
21 | DATA_URL = {
22 | "train": "https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip",
23 | "train2": "https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip",
24 | }
25 |
26 |
27 | def download_datasets(root, url):
28 | download_and_extract_archive(url=url, download_root=root, extract_root=storage_dir)  # storage_dir is assigned under __main__ below
29 |
30 |
31 | if __name__ == "__main__":
32 |
33 | config_path = get_abs_path("configs/datasets/vg/defaults_caption.yaml")
34 |
35 | storage_dir = OmegaConf.load(
36 | config_path
37 | ).datasets.vg_caption.build_info.images.storage
38 |
39 | download_dir = Path(get_cache_path(storage_dir)).parent / "download"
40 | storage_dir = Path(get_cache_path(storage_dir))
41 |
42 | if storage_dir.exists():
43 | print(f"Dataset already exists at {storage_dir}. Aborting.")
44 | exit(0)
45 |
46 | try:
47 | for k, v in DATA_URL.items():
48 | print("Downloading {} to {}".format(v, k))
49 | download_datasets(download_dir, v)
50 | except Exception as e:
51 | # remove the download dir and abort if the download failed
52 | cleanup_dir(download_dir)
53 | print("Failed to download or extract datasets: {}. Aborting.".format(e))
54 | exit(1)
55 | cleanup_dir(download_dir)
56 |
--------------------------------------------------------------------------------
/Lavis/lavis/models/alpro_models/alpro_outputs.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from dataclasses import dataclass
9 | from typing import Optional
10 |
11 | import torch
12 | from transformers.modeling_outputs import (
13 | BaseModelOutputWithPoolingAndCrossAttentions,
14 | ModelOutput,
15 | )
16 |
17 |
18 | @dataclass
19 | class AlproSimilarity(ModelOutput):
20 | sim_v2t: torch.FloatTensor = None
21 | sim_t2v: torch.FloatTensor = None
22 |
23 | sim_v2t_targets: Optional[torch.FloatTensor] = None
24 | sim_t2v_targets: Optional[torch.FloatTensor] = None
25 |
26 |
27 | @dataclass
28 | class AlproIntermediateOutput(ModelOutput):
29 | # uni-modal features
30 | video_embeds: torch.FloatTensor = None
31 | text_embeds: Optional[torch.FloatTensor] = None
32 |
33 | # intermediate outputs of multimodal encoder
34 | encoder_output: Optional[BaseModelOutputWithPoolingAndCrossAttentions] = None
35 | encoder_output_neg: Optional[BaseModelOutputWithPoolingAndCrossAttentions] = None
36 |
37 | vtm_logits: Optional[torch.FloatTensor] = None
38 | vtm_labels: Optional[torch.LongTensor] = None
39 |
40 |
41 | @dataclass
42 | class AlproOutput(ModelOutput):
43 | # some finetuned models do not compute similarity, thus optional.
44 | sims: Optional[AlproSimilarity] = None
45 |
46 | intermediate_output: AlproIntermediateOutput = None
47 |
48 | loss: Optional[torch.FloatTensor] = None
49 |
50 | loss_vtc: Optional[torch.FloatTensor] = None
51 |
52 | loss_vtm: Optional[torch.FloatTensor] = None
53 |
54 | loss_mlm: Optional[torch.FloatTensor] = None
55 |
56 |
57 | @dataclass
58 | class AlproOutputWithLogits(AlproOutput):
59 | logits: torch.FloatTensor = None
60 |
--------------------------------------------------------------------------------
/Lavis/lavis/models/blip2_models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/Lavis/lavis/models/blip2_models/__init__.py
--------------------------------------------------------------------------------
/Lavis/lavis/models/clip_models/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 |
7 | Based on https://github.com/mlfoundations/open_clip
8 | """
9 |
10 | """ OpenAI pretrained model functions
11 | Adapted from https://github.com/mlfoundations/open_clip and https://github.com/openai/CLIP.
12 |
13 | Originally MIT License, Copyright (c) 2021 OpenAI.
14 | """
15 |
--------------------------------------------------------------------------------
/Lavis/lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/Lavis/lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz
--------------------------------------------------------------------------------
/Lavis/lavis/models/clip_models/clip_outputs.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 |
7 | Based on https://github.com/mlfoundations/open_clip
8 | """
9 |
10 | from dataclasses import dataclass
11 |
12 | from typing import Optional
13 |
14 | import torch
15 | from transformers.modeling_outputs import ModelOutput
16 |
17 |
18 | @dataclass
19 | class ClipOutputFeatures(ModelOutput):
20 | """
21 | Data class of features from the CLIP feature extractor.
22 |
23 | Args:
24 | image_embeds: `torch.FloatTensor` of shape `(batch_size, 1, embed_dim)`, `optional`
25 | image_embeds_proj: `torch.FloatTensor` of shape `(batch_size, 1, proj_dim)`, `optional`
26 | text_embeds: `torch.FloatTensor` of shape `(batch_size, 1, embed_dim)`, `optional`
27 | text_embeds_proj: `torch.FloatTensor` of shape `(batch_size, 1, proj_dim)`, `optional`
28 | """
29 |
30 | image_embeds: Optional[torch.FloatTensor] = None
31 | image_embeds_proj: Optional[torch.FloatTensor] = None
32 |
33 | text_embeds: Optional[torch.FloatTensor] = None
34 | text_embeds_proj: Optional[torch.FloatTensor] = None
35 |
36 |
37 | @dataclass
38 | class ClipOutput(ModelOutput):
39 | intermediate_output: Optional[ClipOutputFeatures] = None
40 |
41 | logit_scale_exp: Optional[torch.FloatTensor] = None
42 |
43 | loss: Optional[torch.FloatTensor] = None
44 |
--------------------------------------------------------------------------------
/Lavis/lavis/models/clip_models/pics/CLIP.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/Lavis/lavis/models/clip_models/pics/CLIP.png
--------------------------------------------------------------------------------
/Lavis/lavis/models/clip_models/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 |
7 | Based on https://github.com/mlfoundations/open_clip
8 | """
9 |
10 | from torch import nn as nn
11 | from torchvision.ops.misc import FrozenBatchNorm2d
12 |
13 |
14 | def freeze_batch_norm_2d(module, module_match={}, name=""):
15 | """
16 | Converts all `BatchNorm2d` and `SyncBatchNorm` layers of provided module into `FrozenBatchNorm2d`. If `module` is
17 | itself an instance of either `BatchNorm2d` or `SyncBatchNorm`, it is converted into `FrozenBatchNorm2d` and
18 | returned. Otherwise, the module is walked recursively and submodules are converted in place.
19 | Args:
20 | module (torch.nn.Module): Any PyTorch module.
21 | module_match (dict): Dictionary of full module names to freeze (all if empty)
22 | name (str): Full module name (prefix)
23 | Returns:
24 | torch.nn.Module: Resulting module
25 | Inspired by https://github.com/pytorch/pytorch/blob/a5895f85be0f10212791145bfedc0261d364f103/torch/nn/modules/batchnorm.py#L762
26 | """
27 | res = module
28 | is_match = True
29 | if module_match:
30 | is_match = name in module_match
31 | if is_match and isinstance(
32 | module, (nn.modules.batchnorm.BatchNorm2d, nn.modules.batchnorm.SyncBatchNorm)
33 | ):
34 | res = FrozenBatchNorm2d(module.num_features)
35 | res.num_features = module.num_features
36 | res.affine = module.affine
37 | if module.affine:
38 | res.weight.data = module.weight.data.clone().detach()
39 | res.bias.data = module.bias.data.clone().detach()
40 | res.running_mean.data = module.running_mean.data
41 | res.running_var.data = module.running_var.data
42 | res.eps = module.eps
43 | else:
44 | for child_name, child in module.named_children():
45 | full_child_name = ".".join([name, child_name]) if name else child_name
46 | new_child = freeze_batch_norm_2d(child, module_match, full_child_name)
47 | if new_child is not child:
48 | res.add_module(child_name, new_child)
49 | return res
50 |
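51 | # Illustrative usage (added note; the model below is an assumption, not part of
52 | # this repo): with an empty module_match, every BatchNorm2d in a torchvision
53 | # ResNet is frozen in place before fine-tuning:
54 | #
55 | #   from torchvision.models import resnet50
56 | #   model = resnet50(weights=None)
57 | #   model = freeze_batch_norm_2d(model)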
--------------------------------------------------------------------------------
/Lavis/lavis/models/img2prompt_models/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | import torch
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/Lavis/lavis/models/pnp_vqa_models/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | import torch
9 |
10 |
11 | def prepare_qa_input(sample, num_captions, num_captions_fid):
12 | sample_question_captions = []
13 |
14 | for question, captions in zip(sample['text_input'], sample['captions']):
15 | assert isinstance(captions, list)
16 | question_captions = []
17 | question_caption = ''
18 | for cap_id, cap_ in enumerate(captions[0:num_captions]):
19 | question_caption += (cap_.strip() + '. ')
20 | if (cap_id + 1) != num_captions and ((cap_id + 1) % num_captions_fid == 0):
21 | question_caption = question.lower().strip() + " \\n " + question_caption.lower().strip()
22 | question_captions.append(question_caption)
23 | question_caption = ''
24 | if (cap_id + 1) == num_captions:
25 | question_caption = question.lower().strip() + " \\n " + question_caption.lower().strip()
26 | question_captions.append(question_caption)
27 | sample_question_captions.append(question_captions)
28 |
29 | sample['question_captions'] = sample_question_captions
30 |
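31 | # Worked example (added note; values assumed): with num_captions=4 and
32 | # num_captions_fid=2, each question yields 2 strings, each of the form
33 | # "<question> \n <cap_i>. <cap_j>." (the literal " \n " is the separator a
34 | # FiD-style reader expects). Note the function mutates sample in place,
35 | # writing the results to sample['question_captions'].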
--------------------------------------------------------------------------------
/Lavis/lavis/models/timesformer/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 |
7 | Based on https://github.com/facebookresearch/TimeSformer
8 | """
9 |
--------------------------------------------------------------------------------
/Lavis/lavis/models/timesformer/linear.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | """ Linear layer (alternate definition)
9 | """
10 | import torch
11 | import torch.nn.functional as F
12 | from torch import nn as nn
13 |
14 |
15 | class Linear(nn.Linear):
16 | def forward(self, input: torch.Tensor) -> torch.Tensor:
17 | if torch.jit.is_scripting():
18 | bias = self.bias.to(dtype=input.dtype) if self.bias is not None else None
19 | return F.linear(input, self.weight.to(dtype=input.dtype), bias=bias)
20 | else:
21 | return F.linear(input, self.weight, self.bias)
22 |
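23 | # Note added for clarity: AMP autocast is not applied under TorchScript, so when
24 | # scripting, the weight and bias are cast to the input dtype by hand; in eager
25 | # mode this layer behaves exactly like nn.Linear.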
--------------------------------------------------------------------------------
/Lavis/lavis/processors/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from lavis.processors.base_processor import BaseProcessor
9 |
10 | from lavis.processors.alpro_processors import (
11 | AlproVideoTrainProcessor,
12 | AlproVideoEvalProcessor,
13 | )
14 | from lavis.processors.blip_processors import (
15 | BlipImageTrainProcessor,
16 | Blip2ImageTrainProcessor,
17 | BlipImageEvalProcessor,
18 | BlipCaptionProcessor,
19 | )
20 | from lavis.processors.gpt_processors import (
21 | GPTVideoFeatureProcessor,
22 | GPTDialogueProcessor,
23 | )
24 | from lavis.processors.clip_processors import ClipImageTrainProcessor
25 |
26 | from lavis.common.registry import registry
27 |
28 | __all__ = [
29 | "BaseProcessor",
30 | # ALPRO
31 | "AlproVideoTrainProcessor",
32 | "AlproVideoEvalProcessor",
33 | # BLIP
34 | "BlipImageTrainProcessor",
35 | "Blip2ImageTrainProcessor",
36 | "BlipImageEvalProcessor",
37 | "BlipCaptionProcessor",
38 | "ClipImageTrainProcessor",
39 | # GPT
40 | "GPTVideoFeatureProcessor",
41 | "GPTDialogueProcessor",
42 | ]
43 |
44 |
45 | def load_processor(name, cfg=None):
46 | """
47 | Example
48 |
49 | >>> processor = load_processor("alpro_video_train", cfg=None)
50 | """
51 | processor = registry.get_processor_class(name).from_config(cfg)
52 |
53 | return processor
54 |
--------------------------------------------------------------------------------
/Lavis/lavis/processors/base_processor.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from omegaconf import OmegaConf
9 |
10 |
11 | class BaseProcessor:
12 | def __init__(self):
13 | self.transform = lambda x: x
14 | return
15 |
16 | def __call__(self, item):
17 | return self.transform(item)
18 |
19 | @classmethod
20 | def from_config(cls, cfg=None):
21 | return cls()
22 |
23 | def build(self, **kwargs):
24 | cfg = OmegaConf.create(kwargs)
25 |
26 | return self.from_config(cfg)
27 |
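28 | # Note added for clarity: the base processor is an identity transform, i.e.
29 | # BaseProcessor()(x) returns x unchanged; subclasses override self.transform
30 | # (and from_config) to implement real preprocessing.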
--------------------------------------------------------------------------------
/Lavis/lavis/projects/blip2/direct_aokvqa_zeroshot_flant5xl_eval.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | # Overall Accuracy is: 41.22
7 |
8 | model:
9 | arch: blip2_t5_par
10 | model_type: pretrain_flant5xl
11 | use_grad_checkpoint: False
12 | max_txt_len: 128
13 | prompt: "Question: {} Short Answer: "
14 | multiple_choice: False
15 |
16 | keyword_pipeline: True
17 | reason: True
18 | paraphrase: False
19 |
20 | ext_paraphrase: False
21 | par_num_beams: 5
22 | num_add_candidates: 4
23 |
24 | perform_selection: False
25 | selection_criterion: 'Aconf'
26 | calibrate: False
27 | perform_ensembling: False
28 | dropout_aggregate: False
29 |
30 | constrained: True
31 | verbose: False
32 |
33 | use_caption: False
34 | use_promptcap: False
35 | alt_device: 0
36 |
37 | # for OKVQA evaluation
38 | apply_lemmatizer: False
39 |
40 | datasets:
41 | aok_vqa: # name of the dataset builder
42 | type: eval
43 | vis_processor:
44 | eval:
45 | name: "blip_image_eval"
46 | image_size: 224
47 | text_processor:
48 | eval:
49 | name: "blip_question"
50 |
51 |
52 | run:
53 | task: aok_vqa
54 | # optimization-specific
55 | batch_size_train: 16
56 | batch_size_eval: 10
57 | num_workers: 4
58 |
59 | # inference-specific
60 | max_len: 10
61 | min_len: 1
62 | num_beams: 5
63 | inference_method: "generate"
64 |
65 | seed: 42
66 | output_dir: "output/BLIP2/AOKVQA-direct"
67 |
68 | evaluate: True
69 | test_splits: ["val"]
70 |
71 | # distribution-specific
72 | device: "cuda"
73 | world_size: 1
74 | dist_url: "env://"
75 | distributed: True
76 |
--------------------------------------------------------------------------------
/Lavis/lavis/projects/blip2/mc_aokvqa_zeroshot_flant5xl_eval.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 |
7 | model:
8 | arch: blip2_t5_par
9 | model_type: pretrain_flant5xl
10 | use_grad_checkpoint: False
11 | max_txt_len: 128
12 | prompt: "Based on this information, select the correct answer to the question from the options.\nQuestion: {}\nOptions: A. {}, B. {}, C. {}, D. {}\nAnswer: Option "
13 | multiple_choice: True
14 |
15 | keyword_pipeline: False
16 | reason: False
17 | paraphrase: False
18 |
19 | ext_paraphrase: False
20 | par_num_beams: 5
21 | num_add_candidates: 0
22 |
23 | perform_selection: False
24 | selection_criterion: 'Aconf'
25 | calibrate: False
26 | perform_ensembling: False
27 | dropout_aggregate: False
28 |
29 | constrained: True
30 | verbose: False
31 |
32 | use_caption: False
33 | use_promptcap: False
34 | alt_device: 0
35 |
36 | # for OKVQA evaluation
37 | apply_lemmatizer: False
38 |
39 | datasets:
40 | aok_vqa: # name of the dataset builder
41 | type: eval
42 | vis_processor:
43 | eval:
44 | name: "blip_image_eval"
45 | image_size: 224
46 | text_processor:
47 | eval:
48 | name: "blip_question"
49 | # build_info:
50 | # images:
51 | # storage: '/export/share/datasets/vision/coco/images/'
52 |
53 | run:
54 | task: mc_aok_vqa
55 | # optimization-specific
56 | batch_size_train: 16
57 | batch_size_eval: 24
58 | num_workers: 4
59 |
60 | # inference-specific
61 | max_len: 10
62 | min_len: 1
63 | num_beams: 5
64 | inference_method: "generate"
65 |
66 | seed: 42
67 | output_dir: "output/BLIP2/AOKVQA-MC"
68 |
69 | evaluate: True
70 | test_splits: ["val"]
71 |
72 | # distribution-specific
73 | device: "cuda"
74 | world_size: 1
75 | dist_url: "env://"
76 | distributed: True
77 |
--------------------------------------------------------------------------------
/Lavis/lavis/projects/blip2/vqav2_zeroshot_flant5xl_eval.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 |
7 | model:
8 | arch: blip2_t5_par
9 | model_type: pretrain_flant5xl
10 | use_grad_checkpoint: False
11 | prompt: "Question: {} Short answer:"
12 | max_txt_len: 256
13 |
14 | keyword_pipeline: True
15 | reason: True
16 | paraphrase: False
17 | ext_paraphrase: True
18 | par_num_beams: 5
19 | num_add_candidates: 4
20 |
21 | perform_selection: False
22 | selection_criterion: 'Aconf'
23 | calibrate: False
24 | perform_ensembling: False
25 | dropout_aggregate: False
26 |
27 | constrained: True
28 | verbose: True
29 |
30 | use_caption: False
31 | use_promptcap: False
32 | alt_device: 0
33 |
34 | datasets:
35 | coco_vqa: # name of the dataset builder
36 | type: eval
37 | vis_processor:
38 | eval:
39 | name: "blip_image_eval"
40 | image_size: 224
41 | text_processor:
42 | eval:
43 | name: "blip_question"
44 | # build_info:
45 | # images:
46 | # storage: '/export/share/datasets/vision/coco/images/'
47 |
48 | run:
49 | task: vqa
50 | # optimization-specific
51 | batch_size_train: 16
52 | batch_size_eval: 16
53 | num_workers: 4
54 |
55 | # inference-specific
56 | max_len: 10
57 | min_len: 1
58 | num_beams: 5
59 | inference_method: "generate"
60 | #"Short answer:"
61 |
62 | seed: 42
63 | output_dir: "output/BLIP2/VQA"
64 |
65 | evaluate: True
66 | test_splits: ["val"]
67 |
68 | # distribution-specific
69 | device: "cuda"
70 | world_size: 1
71 | dist_url: "env://"
72 | distributed: True
73 |
--------------------------------------------------------------------------------
/Lavis/lavis/runners/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from lavis.runners.runner_base import RunnerBase
9 | from lavis.runners.runner_iter import RunnerIter
10 |
11 | __all__ = ["RunnerBase", "RunnerIter"]
12 |
--------------------------------------------------------------------------------
/Lavis/lavis/tasks/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from lavis.common.registry import registry
9 | from lavis.tasks.base_task import BaseTask
10 | from lavis.tasks.captioning import CaptionTask
11 | from lavis.tasks.image_text_pretrain import ImageTextPretrainTask
12 | from lavis.tasks.multimodal_classification import (
13 | MultimodalClassificationTask,
14 | )
15 | from lavis.tasks.retrieval import RetrievalTask
16 | from lavis.tasks.vqa import VQATask, GQATask, AOKVQATask, MultiChoiceAOKVQATask
17 | from lavis.tasks.vqa_reading_comprehension import VQARCTask, GQARCTask
18 | from lavis.tasks.dialogue import DialogueTask
19 |
20 |
21 | def setup_task(cfg):
22 | assert "task" in cfg.run_cfg, "Task name must be provided."
23 |
24 | task_name = cfg.run_cfg.task
25 | task = registry.get_task_class(task_name).setup_task(cfg=cfg)
26 | assert task is not None, "Task {} not properly registered.".format(task_name)
27 |
28 | return task
29 |
30 |
31 | __all__ = [
32 | "BaseTask",
33 | "AOKVQATask",
34 | "RetrievalTask",
35 | "CaptionTask",
36 | "VQATask",
37 | "GQATask",
38 | "VQARCTask",
39 | "GQARCTask",
40 | "MultimodalClassificationTask",
41 | "MultiChoiceAOKVQATask",
42 | # "VideoQATask",
43 | # "VisualEntailmentTask",
44 | "ImageTextPretrainTask",
45 | "DialogueTask",
46 | ]
47 |
--------------------------------------------------------------------------------
/Lavis/lavis/tasks/image_text_pretrain.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from lavis.common.registry import registry
9 | from lavis.tasks.base_task import BaseTask
10 |
11 |
12 | @registry.register_task("image_text_pretrain")
13 | class ImageTextPretrainTask(BaseTask):
14 | def __init__(self):
15 | super().__init__()
16 |
17 | def evaluation(self, model, data_loader, cuda_enabled=True):
18 | pass
19 |
--------------------------------------------------------------------------------
/Lavis/requirements.txt:
--------------------------------------------------------------------------------
1 | contexttimer
2 | decord
3 | einops>=0.4.1
4 | fairscale==0.4.4
5 | ftfy
6 | iopath
7 | ipython
8 | omegaconf
9 | opencv-python-headless==4.5.5.64
10 | opendatasets
11 | packaging
12 | pandas
13 | plotly
14 | pre-commit
15 | pycocoevalcap
16 | pycocotools
17 | python-magic
18 | scikit-image
19 | sentencepiece
20 | spacy
21 | streamlit
22 | timm==0.4.12
23 | torch>=1.10.0
24 | torchvision
25 | tqdm
26 | transformers>=4.25.0,<4.27
27 | webdataset
28 | wheel
29 | rake-nltk
30 |
--------------------------------------------------------------------------------
/Lavis/setup.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from setuptools import setup, find_namespace_packages
9 | import platform
10 |
11 | DEPENDENCY_LINKS = []
12 | if platform.system() == "Windows":
13 | DEPENDENCY_LINKS.append("https://download.pytorch.org/whl/torch_stable.html")
14 |
15 |
16 | def fetch_requirements(filename):
17 | with open(filename) as f:
18 | return [ln.strip() for ln in f.read().split("\n") if ln.strip()]  # skip blank lines
19 |
20 |
21 | setup(
22 | name="salesforce-lavis",
23 | version="1.0.1",
24 | author="Dongxu Li, Junnan Li, Hung Le, Guangsen Wang, Silvio Savarese, Steven C.H. Hoi",
25 | description="LAVIS - A One-stop Library for Language-Vision Intelligence",
26 | long_description=open("README.md", "r", encoding="utf-8").read(),
27 | long_description_content_type="text/markdown",
28 | keywords="Vision-Language, Multimodal, Image Captioning, Generative AI, Deep Learning, Library, PyTorch",
29 | license="3-Clause BSD",
30 | packages=find_namespace_packages(include="lavis.*"),
31 | install_requires=fetch_requirements("requirements.txt"),
32 | python_requires=">=3.7.0",
33 | include_package_data=True,
34 | dependency_links=DEPENDENCY_LINKS,
35 | zip_safe=False,
36 | )
37 |
--------------------------------------------------------------------------------
/MiniGPT-4/environment.yml:
--------------------------------------------------------------------------------
1 | name: minigpt4
2 | channels:
3 | - pytorch
4 | - defaults
5 | - anaconda
6 | dependencies:
7 | - python=3.9
8 | - cudatoolkit
9 | - pip
10 | - pytorch=1.12.1
11 | - pytorch-mutex=1.0=cuda
12 | - torchaudio=0.12.1
13 | - torchvision=0.13.1
14 | - pip:
15 | - accelerate==0.16.0
16 | - aiohttp==3.8.4
17 | - aiosignal==1.3.1
18 | - async-timeout==4.0.2
19 | - attrs==22.2.0
20 | - bitsandbytes==0.37.0
21 | - cchardet==2.1.7
22 | - chardet==5.1.0
23 | - contourpy==1.0.7
24 | - cycler==0.11.0
25 | - filelock==3.9.0
26 | - fonttools==4.38.0
27 | - frozenlist==1.3.3
28 | - huggingface-hub==0.13.4
29 | - importlib-resources==5.12.0
30 | - kiwisolver==1.4.4
31 | - matplotlib==3.7.0
32 | - multidict==6.0.4
33 | - openai==0.27.0
34 | - packaging==23.0
35 | - psutil==5.9.4
36 | - pycocotools==2.0.6
37 | - pyparsing==3.0.9
38 | - python-dateutil==2.8.2
39 | - pyyaml==6.0
40 | - regex==2022.10.31
41 | - tokenizers==0.13.2
42 | - tqdm==4.64.1
43 | - transformers==4.28.0
44 | - timm==0.6.13
45 | - spacy==3.5.1
46 | - webdataset==0.2.48
47 | - scikit-learn==1.2.2
48 | - scipy==1.10.1
49 | - yarl==1.8.2
50 | - zipp==3.14.0
51 | - omegaconf==2.3.0
52 | - opencv-python==4.7.0.72
53 | - iopath==0.1.10
54 | - decord==0.6.0
55 | - tenacity==8.2.2
56 | - peft
57 | - pycocoevalcap
58 | - sentence-transformers
59 | - umap-learn
60 | - notebook
61 | - gradio==3.24.1
62 | - gradio-client==0.0.8
63 | - wandb
64 | - rake-nltk
65 |
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | import os
9 | import sys
10 |
11 | from omegaconf import OmegaConf
12 |
13 | from minigpt4.common.registry import registry
14 |
15 | from minigpt4.datasets.builders import *
16 | from minigpt4.models import *
17 | from minigpt4.processors import *
18 | from minigpt4.tasks import *
19 |
20 |
21 | root_dir = os.path.dirname(os.path.abspath(__file__))
22 | default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml"))
23 |
24 | registry.register_path("library_root", root_dir)
25 | repo_root = os.path.join(root_dir, "..")
26 | registry.register_path("repo_root", repo_root)
27 | cache_root = os.path.join(repo_root, default_cfg.env.cache_root)
28 | registry.register_path("cache_root", cache_root)
29 |
30 | registry.register("MAX_INT", sys.maxsize)
31 | registry.register("SPLIT_NAMES", ["train", "val", "test"])
32 |
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/MiniGPT-4/minigpt4/common/__init__.py
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/common/gradcam.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from matplotlib import pyplot as plt
3 | from scipy.ndimage import filters
4 | from skimage import transform as skimage_transform
5 |
6 |
7 | def getAttMap(img, attMap, blur=True, overlap=True):
8 | attMap -= attMap.min()
9 | if attMap.max() > 0:
10 | attMap /= attMap.max()
11 | attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant")
12 | if blur:
13 | attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2]))
14 | attMap -= attMap.min()
15 | attMap /= attMap.max()
16 | cmap = plt.get_cmap("jet")
17 | attMapV = cmap(attMap)
18 | attMapV = np.delete(attMapV, 3, 2)
19 | if overlap:
20 | attMap = (
21 | 1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img
22 | + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV
23 | )
24 | return attMap
25 |
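26 | # Illustrative usage (added note; shapes assumed from the code above): img is an
27 | # HxWx3 float array scaled to [0, 1] and attMap an HxW attention map; with
28 | # overlap=True, the return value is the jet-colored map blended onto img:
29 | #
30 | #   overlay = getAttMap(img, attMap, blur=True, overlap=True)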
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/common/vqa_tools/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | __author__ = "aagrawal"
9 |
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/configs/datasets/aokvqa/defaults.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | aok_vqa:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to prepend a minus sign (-) before the split name; YAML would then parse it as a list item.
13 | annotations:
14 | train:
15 | url:
16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json
17 | storage:
18 | - aokvqa/annotations/aokvqa_v1p0_train.json
19 | val:
20 | url:
21 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
22 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/specialized_vocab_train.json
23 | storage:
24 | - aokvqa/annotations/aokvqa_v1p0_val.json
25 | - aokvqa/annotations/specialized_vocab_train_lavis.json
26 | # - aokvqa/annotations/large_vocab_train_lavis.json
27 | test:
28 | url:
29 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json
30 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/specialized_vocab_train.json
31 | storage:
32 | - aokvqa/annotations/aokvqa_v1p0_test.json
33 | - aokvqa/annotations/specialized_vocab_train_lavis.json
34 | images:
35 | storage: coco/images/
36 |
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/configs/datasets/aokvqa/eval_aokvqa.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | aok_vqa:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to prepend a minus sign (-) before the split name; YAML would then parse it as a list item.
13 | annotations:
14 | val:
15 | url:
16 | - aokvqa/annotations/aokvqa_v1p0_val.json
17 | - aokvqa/annotations/specialized_vocab_train_lavis.json
18 | storage:
19 | - aokvqa/annotations/aokvqa_v1p0_val.json
20 | - aokvqa/annotations/specialized_vocab_train_lavis.json
21 | # - aokvqa/annotations/large_vocab_train_lavis.json
22 | images:
23 | storage: coco/images/
24 |
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/configs/datasets/cc_sbu/align.yaml:
--------------------------------------------------------------------------------
1 | datasets:
2 | cc_sbu_align:
3 | data_type: images
4 | build_info:
5 | storage: /path/to/cc_sbu_align/
6 |
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/configs/datasets/cc_sbu/defaults.yaml:
--------------------------------------------------------------------------------
1 | datasets:
2 | cc_sbu:
3 | data_type: images
4 | build_info:
5 | storage: /path/to/cc_sbu_dataset/{00000..01255}.tar
6 |
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/configs/datasets/coco/defaults_cap.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | coco_caption: # name of the dataset builder
8 | dataset_card: dataset_card/coco_caption.md
9 | # data_dir: ${env.data_dir}/datasets
10 | data_type: images # [images|videos|features]
11 |
12 | build_info:
13 | # Be careful not to prepend a minus sign (-) before the split name; YAML would then parse it as a list item.
14 | annotations:
15 | train:
16 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
17 | md5: aa31ac474cf6250ebb81d18348a07ed8
18 | storage: coco/annotations/coco_karpathy_train.json
19 | val:
20 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
21 | md5: b273847456ef5580e33713b1f7de52a0
22 | storage: coco/annotations/coco_karpathy_val.json
23 | test:
24 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
25 | md5: 3ff34b0ef2db02d01c37399f6a2a6cd1
26 | storage: coco/annotations/coco_karpathy_test.json
27 | images:
28 | storage: coco/images/
29 |
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/configs/datasets/coco/defaults_ret.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | coco_retrieval:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to prepend a minus sign (-) before the split name; YAML would then parse it as a list item.
13 | annotations:
14 | train:
15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
16 | md5: aa31ac474cf6250ebb81d18348a07ed8
17 | storage: coco/annotations/coco_karpathy_train.json
18 | val:
19 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
20 | md5: b273847456ef5580e33713b1f7de52a0
21 | storage: coco/annotations/coco_karpathy_val.json
22 | test:
23 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
24 | md5: 3ff34b0ef2db02d01c37399f6a2a6cd1
25 | storage: coco/annotations/coco_karpathy_test.json
26 | images:
27 | storage: coco/images/
28 |
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/configs/datasets/coco/eval_vqa.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | coco_vqa:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to prepend a minus sign (-) before the split name; YAML would then parse it as a list item.
13 | annotations:
14 | val:
15 | url:
16 | - coco/annotations/vqa_val_eval.json
17 | - coco/annotations/answer_list.json
18 | - coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json
19 | - coco/annotations/v2_mscoco_val2014_annotations.json
20 |
21 | storage:
22 | - coco/annotations/vqa_val_eval.json
23 | - coco/annotations/answer_list.json
24 | - coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json
25 | - coco/annotations/v2_mscoco_val2014_annotations.json
26 | images:
27 | storage: coco/images/
28 |
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/configs/datasets/laion/defaults.yaml:
--------------------------------------------------------------------------------
1 | datasets:
2 | laion:
3 | data_type: images
4 | build_info:
5 | storage: /path/to/laion_dataset/{00000..10488}.tar
6 |
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/configs/default.yaml:
--------------------------------------------------------------------------------
1 | env:
2 | # For default users
3 | # cache_root: "cache"
4 | # For internal use with persistent storage
5 | cache_root: ".cache/lavis"
6 |
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/configs/models/minigpt4_llama2.yaml:
--------------------------------------------------------------------------------
1 | model:
2 | arch: mini_gpt4
3 |
4 | # vit encoder
5 | image_size: 224
6 | drop_path_rate: 0
7 | use_grad_checkpoint: False
8 | vit_precision: "fp16"
9 | freeze_vit: True
10 | has_qformer: False
11 |
12 | # generation configs
13 | prompt: ""
14 |
15 | llama_model: 'meta-llama/Llama-2-7b-chat-hf'
16 |
17 | preprocess:
18 | vis_processor:
19 | train:
20 | name: "blip2_image_train"
21 | image_size: 224
22 | eval:
23 | name: "blip2_image_eval"
24 | image_size: 224
25 | text_processor:
26 | train:
27 | name: "blip_caption"
28 | eval:
29 | name: "blip_caption"
30 |
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/configs/models/minigpt4_vicuna0.yaml:
--------------------------------------------------------------------------------
1 | model:
2 | arch: mini_gpt4
3 |
4 | # vit encoder
5 | image_size: 224
6 | drop_path_rate: 0
7 | use_grad_checkpoint: False
8 | vit_precision: "fp16"
9 | freeze_vit: True
10 | freeze_qformer: True
11 |
12 | # Q-Former
13 | num_query_token: 32
14 |
15 | # generation configs
16 | prompt: ""
17 |
18 | llama_model: "Vision-CAIR/vicuna-7b"
19 |
20 | preprocess:
21 | vis_processor:
22 | train:
23 | name: "blip2_image_train"
24 | image_size: 224
25 | eval:
26 | name: "blip2_image_eval"
27 | image_size: 224
28 | text_processor:
29 | train:
30 | name: "blip_caption"
31 | eval:
32 | name: "blip_caption"
33 |
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/conversation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/MiniGPT-4/minigpt4/conversation/__init__.py
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/datasets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/MiniGPT-4/minigpt4/datasets/__init__.py
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/datasets/builders/vqa_builder.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from minigpt4.datasets.builders.base_dataset_builder import BaseDatasetBuilder
9 |
10 | from minigpt4.common.registry import registry
11 | from minigpt4.datasets.datasets.aok_vqa_datasets import AOKVQADataset, AOKVQAEvalDataset
12 | from minigpt4.datasets.datasets.coco_vqa_datasets import COCOVQADataset, COCOVQAEvalDataset
13 |
14 |
15 | @registry.register_builder("coco_vqa")
16 | class COCOVQABuilder(BaseDatasetBuilder):
17 | train_dataset_cls = COCOVQADataset
18 | eval_dataset_cls = COCOVQAEvalDataset
19 |
20 | DATASET_CONFIG_DICT = {
21 | "default": "configs/datasets/coco/defaults_vqa.yaml",
22 | "eval": "configs/datasets/coco/eval_vqa.yaml",
23 | }
24 |
25 |
26 | @registry.register_builder("ok_vqa")
27 | class OKVQABuilder(COCOVQABuilder):
28 | DATASET_CONFIG_DICT = {
29 | "default": "configs/datasets/okvqa/defaults.yaml",
30 | }
31 |
32 |
33 | @registry.register_builder("aok_vqa")
34 | class AOKVQABuilder(BaseDatasetBuilder):
35 | train_dataset_cls = AOKVQADataset
36 | eval_dataset_cls = AOKVQAEvalDataset
37 |
38 | DATASET_CONFIG_DICT = {
39 | "default": "configs/datasets/aokvqa/defaults.yaml",
40 | "eval": "configs/datasets/aokvqa/eval_aokvqa.yaml",
41 | }
42 |
43 |
44 |
45 | # @registry.register_builder("gqa")
46 | # class GQABuilder(BaseDatasetBuilder):
47 | # train_dataset_cls = GQADataset
48 | # eval_dataset_cls = GQAEvalDataset
49 |
50 | # DATASET_CONFIG_DICT = {
51 | # "default": "configs/datasets/gqa/defaults.yaml",
52 | # "balanced_val": "configs/datasets/gqa/balanced_val.yaml",
53 | # "balanced_testdev": "configs/datasets/gqa/balanced_testdev.yaml",
54 | # }
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/datasets/datasets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/MiniGPT-4/minigpt4/datasets/datasets/__init__.py
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/datasets/datasets/base_dataset.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | import json
9 | from typing import Iterable
10 |
11 | from torch.utils.data import Dataset, ConcatDataset
12 | from torch.utils.data.dataloader import default_collate
13 |
14 |
15 | class BaseDataset(Dataset):
16 | def __init__(
17 | self, vis_processor=None, text_processor=None, vis_root=None, ann_paths=[]
18 | ):
19 | """
20 | vis_root (string): Root directory of images (e.g. coco/images/)
21 | ann_root (string): directory to store the annotation file
22 | """
23 | self.vis_root = vis_root
24 | self.annotation = []
25 | for ann_path in ann_paths:
26 | self.annotation.extend(json.load(open(ann_path, "r"))['annotations'])
27 |
28 | self.vis_processor = vis_processor
29 | self.text_processor = text_processor
30 |
31 | self._add_instance_ids()
32 |
33 | def __len__(self):
34 | return len(self.annotation)
35 |
36 | def collater(self, samples):
37 | return default_collate(samples)
38 |
39 | def set_processors(self, vis_processor, text_processor):
40 | self.vis_processor = vis_processor
41 | self.text_processor = text_processor
42 |
43 | def _add_instance_ids(self, key="instance_id"):
44 | for idx, ann in enumerate(self.annotation):
45 | ann[key] = str(idx)
46 |
47 |
48 | class ConcatDataset(ConcatDataset):
49 | def __init__(self, datasets: Iterable[Dataset]) -> None:
50 | super().__init__(datasets)
51 |
52 | def collater(self, samples):
53 | # TODO For now only supports datasets with same underlying collater implementations
54 |
55 | all_keys = set()
56 | for s in samples:
57 | all_keys.update(s)
58 |
59 | shared_keys = all_keys
60 | for s in samples:
61 | shared_keys = shared_keys & set(s.keys())
62 |
63 | samples_shared_keys = []
64 | for s in samples:
65 | samples_shared_keys.append({k: s[k] for k in s.keys() if k in shared_keys})
66 |
67 | return self.datasets[0].collater(samples_shared_keys)
68 |
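69 | # Note added for clarity: this collater batches only the keys present in every
70 | # sample (the intersection computed above) and delegates to the first dataset's
71 | # collater, so all member datasets must collate those shared keys compatibly.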
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/datasets/datasets/cc_sbu_dataset.py:
--------------------------------------------------------------------------------
1 | import os
2 | from PIL import Image
3 | import webdataset as wds
4 | from minigpt4.datasets.datasets.base_dataset import BaseDataset
5 | from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
6 |
7 |
8 | class CCSBUDataset(BaseDataset):
9 | def __init__(self, vis_processor, text_processor, location):
10 | super().__init__(vis_processor=vis_processor, text_processor=text_processor)
11 |
12 | self.inner_dataset = wds.DataPipeline(
13 | wds.ResampledShards(location),
14 | wds.tarfile_to_samples(handler=wds.warn_and_continue),
15 | wds.shuffle(1000, handler=wds.warn_and_continue),
16 | wds.decode("pilrgb", handler=wds.warn_and_continue),
17 | wds.to_tuple("jpg", "json", handler=wds.warn_and_continue),
18 | wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue),
19 | wds.map(self.to_dict, handler=wds.warn_and_continue),
20 | )
21 |
22 | def to_dict(self, sample):
23 | return {
24 | "image": sample[0],
25 | "answer": self.text_processor(sample[1]["caption"]),
26 | }
27 |
28 |
29 | class CCSBUAlignDataset(CaptionDataset):
30 |
31 | def __getitem__(self, index):
32 |
33 | # TODO this assumes image input, not general enough
34 | ann = self.annotation[index]
35 |
36 | img_file = '{}.jpg'.format(ann["image_id"])
37 | image_path = os.path.join(self.vis_root, img_file)
38 | image = Image.open(image_path).convert("RGB")
39 |
40 | image = self.vis_processor(image)
41 | caption = ann["caption"]
42 |
43 | return {
44 | "image": image,
45 | "answer": caption,
46 | "image_id": self.img_ids[ann["image_id"]],
47 | }
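48 |
49 | # Note added for clarity: CCSBUDataset streams (jpg, json) pairs from webdataset
50 | # tar shards; wds.map_tuple applies vis_processor to the image element only, and
51 | # to_dict reshapes each pair into the {"image", "answer"} schema used in training.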
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/datasets/datasets/laion_dataset.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | import webdataset as wds
9 | from minigpt4.datasets.datasets.base_dataset import BaseDataset
10 |
11 |
12 | class LaionDataset(BaseDataset):
13 | def __init__(self, vis_processor, text_processor, location):
14 | super().__init__(vis_processor=vis_processor, text_processor=text_processor)
15 |
16 | self.inner_dataset = wds.DataPipeline(
17 | wds.ResampledShards(location),
18 | wds.tarfile_to_samples(handler=wds.warn_and_continue),
19 | wds.shuffle(1000, handler=wds.warn_and_continue),
20 | wds.decode("pilrgb", handler=wds.warn_and_continue),
21 | wds.to_tuple("jpg", "json", handler=wds.warn_and_continue),
22 | wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue),
23 | wds.map(self.to_dict, handler=wds.warn_and_continue),
24 | )
25 |
26 | def to_dict(self, sample):
27 | return {
28 | "image": sample[0],
29 | "answer": self.text_processor(sample[1]["caption"]),
30 | }
31 |
32 |
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/datasets/datasets/vqa_datasets.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | import torch
9 |
10 | from minigpt4.datasets.datasets.base_dataset import BaseDataset
11 |
12 |
13 | class VQADataset(BaseDataset):
14 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
15 | super().__init__(vis_processor, text_processor, vis_root, ann_paths)
16 |
17 | def collater(self, samples):
18 | image_list, question_list, answer_list, weight_list = [], [], [], []
19 |
20 | num_answers = []
21 |
22 | for sample in samples:
23 | image_list.append(sample["image"])
24 | question_list.append(sample["text_input"])
25 |
26 | weight_list.extend(sample["weights"])
27 |
28 | answers = sample["answers"]
29 |
30 | answer_list.extend(answers)
31 | num_answers.append(len(answers))
32 |
33 | return {
34 | "image": torch.stack(image_list, dim=0),
35 | "text_input": question_list,
36 | "answer": answer_list,
37 | "weight": torch.Tensor(weight_list),
38 | "n_answers": torch.LongTensor(num_answers),
39 | }
40 |
41 |
42 | class VQAEvalDataset(BaseDataset):
43 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
44 | super().__init__(vis_processor, text_processor, vis_root, ann_paths)
45 |
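46 | # Note added for clarity: VQADataset.collater flattens the per-sample answer
47 | # lists into a single "answer" list and records each sample's answer count in
48 | # "n_answers", so downstream losses can regroup answers per question.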
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/processors/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from minigpt4.processors.base_processor import BaseProcessor
9 | from minigpt4.processors.blip_processors import (
10 | Blip2ImageTrainProcessor,
11 | Blip2ImageEvalProcessor,
12 | BlipCaptionProcessor,
13 | )
14 |
15 | from minigpt4.common.registry import registry
16 |
17 | __all__ = [
18 | "BaseProcessor",
19 | "Blip2ImageTrainProcessor",
20 | "Blip2ImageEvalProcessor",
21 | "BlipCaptionProcessor",
22 | ]
23 |
24 |
25 | def load_processor(name, cfg=None):
26 | """
27 | Example
28 |
29 | >>> processor = load_processor("blip_caption", cfg=None)
30 | """
31 | processor = registry.get_processor_class(name).from_config(cfg)
32 |
33 | return processor
34 |
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/processors/base_processor.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from omegaconf import OmegaConf
9 |
10 |
11 | class BaseProcessor:
12 | def __init__(self):
13 | self.transform = lambda x: x
14 | return
15 |
16 | def __call__(self, item):
17 | return self.transform(item)
18 |
19 | @classmethod
20 | def from_config(cls, cfg=None):
21 | return cls()
22 |
23 | def build(self, **kwargs):
24 | cfg = OmegaConf.create(kwargs)
25 |
26 | return self.from_config(cfg)
27 |
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/projects/minigpt4/conv_direct_aokvqa.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: mini_gpt4
8 | model_type: pretrain_vicuna0
9 | end_sym: "###"
10 | max_txt_len: 256
11 | ckpt: '.cache/hub/models--minigpt4--vicuna-7b/pretrained_minigpt4_vicuna_7b.pth'
12 | use_grad_checkpoint: False
13 | answer_refinement_prompt: " Assistant: {}\n###Human: Shorten your answer to the question as much as possible, preferably only 1 word."
14 | prompt_template: "### Human:\n### Human: Based on the image, answer the question below.\nQuestion: {}"
15 | process_answer: True
16 | answer_processor: 'aok-vqa'
17 | conversation: True
18 | multiple_choice: False
19 |
20 | keyword_pipeline: True
21 | reason: True
22 | paraphrase: False
23 |
24 | perform_selection: False
25 | selection_criterion: 'Aconf'
26 | perform_ensembling: False
27 |
28 | ext_paraphrase: False
29 | par_num_beams: 5
30 | num_add_candidates: 4
31 | verbose: True
32 | alt_device: 0
33 |
34 | datasets:
35 | aok_vqa: # name of the dataset builder
36 | type: eval
37 | vis_processor:
38 | eval:
39 | name: "blip2_image_eval"
40 | image_size: 224
41 | text_processor:
42 | eval:
43 | name: "blip_question"
44 | build_info:
45 | images:
46 | storage: '.cache/lavis/coco/images/'
47 |
48 | run:
49 | task: aok_vqa
50 | # optimization-specific
51 | batch_size_train: 16
52 | batch_size_eval: 5
53 | num_workers: 4
54 |
55 | # inference-specific
56 | max_len: 30
57 | min_len: 1
58 | num_beams: 5
59 | inference_method: "generate"
60 |
61 | seed: 42
62 | output_dir: "output/Vicuna7B/AOK-VQA"
63 |
64 | evaluate: True
65 | test_splits: ["val"]
66 |
67 | # distribution-specific
68 | device: "cuda"
69 | world_size: 1
70 | dist_url: "env://"
71 | distributed: True
72 |
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/projects/minigpt4/conv_vqav2.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 |
7 | model:
8 | arch: mini_gpt4
9 | model_type: pretrain_vicuna0
10 | end_sym: "###"
11 | max_txt_len: 200
12 | ckpt: '.cache/hub/models--minigpt4--vicuna-7b/pretrained_minigpt4_vicuna_7b.pth'
13 | use_grad_checkpoint: False
14 | answer_refinement_prompt: " Assistant: {}\n###Human: Shorten your answer to the question as much as possible, preferably only 1 word."
15 | prompt_template: "### Human: ### Human: Based on the image, answer the question below.\nQuestion: {}"
16 | process_answer: True
17 | answer_processor: 'vqa'
18 | conversation: True
19 |
20 | ext_paraphrase: False
21 | par_num_beams: 5
22 | num_add_candidates: 4
23 |
24 | keyword_pipeline: True
25 | reason: True
26 | paraphrase: False
27 |
28 | perform_selection: False
29 | selection_criterion: 'Aconf'
30 | perform_ensembling: False
31 |
32 | verbose: True
33 | alt_device: 0
34 |
35 | datasets:
36 | coco_vqa: # name of the dataset builder
37 | type: eval
38 | vis_processor:
39 | eval:
40 | name: "blip2_image_eval"
41 | image_size: 224
42 | text_processor:
43 | eval:
44 | name: "blip_question"
45 | build_info:
46 | images:
47 | storage: '.cache/lavis/coco/images/'
48 |
49 | run:
50 | task: vqa
51 | # optimization-specific
52 | batch_size_train: 16
53 | batch_size_eval: 4
54 | num_workers: 4
55 |
56 | # inference-specific
57 | max_len: 50
58 | min_len: 1
59 | num_beams: 5
60 | inference_method: "generate"
61 |
62 | seed: 42
63 | output_dir: "output/Vicuna7B/VQA"
64 |
65 | evaluate: True
66 | test_splits: ["val"]
67 |
68 | # distribution-specific
69 | device: "cuda"
70 | world_size: 1
71 | dist_url: "env://"
72 | distributed: True
73 |
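
The VQAv2 config above differs from the A-OKVQA one mainly in the dataset builder (`coco_vqa` vs. `aok_vqa`), the run task, the `answer_processor`, and the generation lengths; the two prompt strings are shared. The sketch below, with a made-up question and raw answer, shows how the templates compose a two-turn exchange: the first turn poses the question, and `answer_refinement_prompt` appends the model's long answer plus an instruction to shorten it.

```python
# Illustrative only: the question and the raw model answer are invented.
first_turn = ("### Human: ### Human: Based on the image, answer the "
              "question below.\nQuestion: {}")
refine = (" Assistant: {}\n###Human: Shorten your answer to the question "
          "as much as possible, preferably only 1 word.")

prompt = first_turn.format("What is the man holding?")
prompt += refine.format("The man appears to be holding a red umbrella.")
print(prompt)
```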
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/runners/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from minigpt4.runners.runner_base import RunnerBase
9 |
10 | __all__ = ["RunnerBase"]
11 |
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/tasks/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from minigpt4.common.registry import registry
9 | from minigpt4.tasks.base_task import BaseTask
10 | from minigpt4.tasks.image_text_pretrain import ImageTextPretrainTask
11 | from minigpt4.tasks.vqa import VQATask, AOKVQATask, MultiChoiceAOKVQATask
12 |
13 |
14 | def setup_task(cfg):
15 | assert "task" in cfg.run_cfg, "Task name must be provided."
16 |
17 | task_name = cfg.run_cfg.task
18 | task = registry.get_task_class(task_name).setup_task(cfg=cfg)
19 | assert task is not None, "Task {} not properly registered.".format(task_name)
20 |
21 | return task
22 |
23 |
24 | __all__ = [
25 | "BaseTask",
26 | "ImageTextPretrainTask",
27 | "AOKVQATask",
28 | "VQATask",
29 | "MultiChoiceAOKVQATask",
30 | ]
31 |
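
`setup_task` is a thin dispatch: it reads `run_cfg.task` from the config, resolves the task class through the registry, and delegates construction to that class's own `setup_task`. A hedged sketch follows, assuming `BaseTask` provides the usual LAVIS-style `setup_task(cls, **kwargs)` classmethod (returning `cls()`) and using a minimal stand-in for `minigpt4.common.config.Config`:

```python
# Hedged sketch: resolve a registered task by name. _Cfg is a made-up
# stand-in; only the run_cfg attribute is needed by setup_task.
from omegaconf import OmegaConf
from minigpt4.tasks import setup_task


class _Cfg:
    run_cfg = OmegaConf.create({"task": "image_text_pretrain"})


task = setup_task(_Cfg())
print(type(task).__name__)  # -> ImageTextPretrainTask
```

The lookup succeeds because `"image_text_pretrain"` is registered by the decorator in the next file.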
--------------------------------------------------------------------------------
/MiniGPT-4/minigpt4/tasks/image_text_pretrain.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from minigpt4.common.registry import registry
9 | from minigpt4.tasks.base_task import BaseTask
10 |
11 |
12 | @registry.register_task("image_text_pretrain")
13 | class ImageTextPretrainTask(BaseTask):
14 | def __init__(self):
15 | super().__init__()
16 |
17 | def evaluation(self, model, data_loader, cuda_enabled=True):
18 | pass
19 |
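
The `evaluation` override is a deliberate no-op, since pretraining has no evaluation loop. Registering an additional task follows the same decorator convention; the task name and class below are hypothetical, shown only to illustrate the pattern:

```python
# Illustrative pattern (not in the repo): a new task registers itself
# under a name that setup_task can later resolve via the registry.
from minigpt4.common.registry import registry
from minigpt4.tasks.base_task import BaseTask


@registry.register_task("my_custom_eval")  # hypothetical name
class MyCustomEvalTask(BaseTask):
    def evaluation(self, model, data_loader, cuda_enabled=True):
        # Run the model over the loader and collect outputs here.
        ...
```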
--------------------------------------------------------------------------------
/assets/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/assets/README.md
--------------------------------------------------------------------------------
/assets/intro.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/assets/intro.png
--------------------------------------------------------------------------------
/assets/pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/archiki/RepARe/d45028cebda5b9102dcfba3ff26d94eda1d083b6/assets/pipeline.png
--------------------------------------------------------------------------------