├── README.md ├── adversarial_images ├── .DS_Store ├── clean.jpeg ├── prompt_constrained_16.bmp ├── prompt_constrained_32.bmp ├── prompt_constrained_64.bmp └── prompt_unconstrained.bmp ├── assets ├── .DS_Store ├── attack.png ├── crime.png ├── demo.png ├── gender.png ├── human_race.png ├── race.png ├── religious-1.png └── religious-2.png ├── blip_utils └── visual_attacker.py ├── cal_metrics.py ├── demo.py ├── environment.yml ├── eval_configs ├── llama.yaml └── minigpt4_eval.yaml ├── get_metric.py ├── harmful_corpus ├── README.md ├── derogatory_corpus.csv ├── manual_harmful_instructions.csv └── red_teaming_prompts.jsonl ├── instructblip_inference.py ├── instructblip_visual_attack.py ├── lavis ├── .DS_Store ├── LICENSE_Lavis.md ├── __init__.py ├── common │ ├── config.py │ ├── dist_utils.py │ ├── gradcam.py │ ├── logger.py │ ├── optims.py │ ├── registry.py │ ├── utils.py │ └── vqa_tools │ │ ├── __init__.py │ │ ├── vqa.py │ │ └── vqa_eval.py ├── configs │ ├── datasets │ │ ├── aokvqa │ │ │ └── defaults.yaml │ │ ├── avsd │ │ │ └── defaults_dial.yaml │ │ ├── coco │ │ │ ├── defaults_cap.yaml │ │ │ ├── defaults_ret.yaml │ │ │ ├── defaults_vqa.yaml │ │ │ └── eval_vqa.yaml │ │ ├── conceptual_caption │ │ │ ├── defaults_12m.yaml │ │ │ └── defaults_3m.yaml │ │ ├── didemo │ │ │ └── defaults_ret.yaml │ │ ├── flickr30k │ │ │ └── defaults.yaml │ │ ├── gqa │ │ │ ├── balanced_testdev.yaml │ │ │ ├── balanced_val.yaml │ │ │ └── defaults.yaml │ │ ├── imagenet │ │ │ └── defaults.yaml │ │ ├── laion │ │ │ └── defaults_2B_multi.yaml │ │ ├── msrvtt │ │ │ ├── defaults_cap.yaml │ │ │ ├── defaults_qa.yaml │ │ │ └── defaults_ret.yaml │ │ ├── msvd │ │ │ ├── defaults_cap.yaml │ │ │ └── defaults_qa.yaml │ │ ├── nlvr │ │ │ └── defaults.yaml │ │ ├── nocaps │ │ │ └── defaults.yaml │ │ ├── okvqa │ │ │ └── defaults.yaml │ │ ├── sbu_caption │ │ │ └── defaults.yaml │ │ ├── snli_ve │ │ │ └── defaults.yaml │ │ ├── vatex │ │ │ └── defaults_cap.yaml │ │ └── vg │ │ │ ├── defaults_caption.yaml │ │ │ └── defaults_vqa.yaml │ ├── default.yaml │ └── models │ │ ├── albef_classification_ve.yaml │ │ ├── albef_feature_extractor.yaml │ │ ├── albef_nlvr.yaml │ │ ├── albef_pretrain_base.yaml │ │ ├── albef_retrieval_coco.yaml │ │ ├── albef_retrieval_flickr.yaml │ │ ├── albef_vqav2.yaml │ │ ├── alpro_qa_msrvtt.yaml │ │ ├── alpro_qa_msvd.yaml │ │ ├── alpro_retrieval_didemo.yaml │ │ ├── alpro_retrieval_msrvtt.yaml │ │ ├── bert_config.json │ │ ├── bert_config_alpro.json │ │ ├── blip2 │ │ ├── blip2_caption_flant5xl.yaml │ │ ├── blip2_caption_opt2.7b.yaml │ │ ├── blip2_caption_opt6.7b.yaml │ │ ├── blip2_coco.yaml │ │ ├── blip2_instruct_flant5xl.yaml │ │ ├── blip2_instruct_flant5xxl.yaml │ │ ├── blip2_instruct_vicuna13b.yaml │ │ ├── blip2_instruct_vicuna7b.yaml │ │ ├── blip2_pretrain.yaml │ │ ├── blip2_pretrain_flant5xl.yaml │ │ ├── blip2_pretrain_flant5xl_vitL.yaml │ │ ├── blip2_pretrain_flant5xxl.yaml │ │ ├── blip2_pretrain_llama7b.yaml │ │ ├── blip2_pretrain_opt2.7b.yaml │ │ ├── blip2_pretrain_opt6.7b.yaml │ │ └── blip2_pretrain_vitL.yaml │ │ ├── blip_caption_base_coco.yaml │ │ ├── blip_caption_large_coco.yaml │ │ ├── blip_classification_base.yaml │ │ ├── blip_feature_extractor_base.yaml │ │ ├── blip_itm_base.yaml │ │ ├── blip_itm_large.yaml │ │ ├── blip_nlvr.yaml │ │ ├── blip_pretrain_base.yaml │ │ ├── blip_pretrain_large.yaml │ │ ├── blip_retrieval_coco.yaml │ │ ├── blip_retrieval_flickr.yaml │ │ ├── blip_vqa_aokvqa.yaml │ │ ├── blip_vqa_okvqa.yaml │ │ ├── blip_vqav2.yaml │ │ ├── clip │ │ ├── RN101-quickgelu.json │ │ ├── RN101.json │ │ ├── 
RN50-quickgelu.json │ │ ├── RN50.json │ │ ├── RN50x16.json │ │ ├── RN50x4.json │ │ ├── ViT-B-16-plus-240.json │ │ ├── ViT-B-16-plus.json │ │ ├── ViT-B-16.json │ │ ├── ViT-B-32-plus-256.json │ │ ├── ViT-B-32-quickgelu.json │ │ ├── ViT-B-32.json │ │ ├── ViT-H-14.json │ │ ├── ViT-H-16.json │ │ ├── ViT-L-14-280.json │ │ ├── ViT-L-14-336.json │ │ ├── ViT-L-14.json │ │ ├── ViT-L-16-320.json │ │ ├── ViT-L-16.json │ │ ├── ViT-g-14.json │ │ ├── timm-efficientnetv2_rw_s.json │ │ ├── timm-resnet50d.json │ │ ├── timm-resnetaa50d.json │ │ ├── timm-resnetblur50.json │ │ ├── timm-swin_base_patch4_window7_224.json │ │ ├── timm-vit_base_patch16_224.json │ │ ├── timm-vit_base_patch32_224.json │ │ └── timm-vit_small_patch16_224.json │ │ ├── clip_resnet50.yaml │ │ ├── clip_vit_base16.yaml │ │ ├── clip_vit_base32.yaml │ │ ├── clip_vit_large14.yaml │ │ ├── clip_vit_large14_336.yaml │ │ ├── gpt_dialogue_base.yaml │ │ ├── img2prompt-vqa │ │ └── img2prompt_vqa_base.yaml │ │ ├── med_config.json │ │ ├── med_config_albef.json │ │ ├── med_large_config.json │ │ └── pnp-vqa │ │ ├── pnp_vqa_3b.yaml │ │ ├── pnp_vqa_base.yaml │ │ ├── pnp_vqa_large.yaml │ │ ├── unifiedqav2_3b_config.json │ │ ├── unifiedqav2_base_config.json │ │ └── unifiedqav2_large_config.json ├── datasets │ ├── builders │ │ ├── __init__.py │ │ ├── base_dataset_builder.py │ │ ├── caption_builder.py │ │ ├── classification_builder.py │ │ ├── dialogue_builder.py │ │ ├── image_text_pair_builder.py │ │ ├── imagefolder_builder.py │ │ ├── retrieval_builder.py │ │ ├── video_qa_builder.py │ │ └── vqa_builder.py │ ├── data_utils.py │ ├── datasets │ │ ├── aok_vqa_datasets.py │ │ ├── avsd_dialogue_datasets.py │ │ ├── base_dataset.py │ │ ├── caption_datasets.py │ │ ├── coco_caption_datasets.py │ │ ├── coco_vqa_datasets.py │ │ ├── dataloader_utils.py │ │ ├── dialogue_datasets.py │ │ ├── gqa_datasets.py │ │ ├── image_text_pair_datasets.py │ │ ├── imagefolder_dataset.py │ │ ├── laion_dataset.py │ │ ├── multimodal_classification_datasets.py │ │ ├── nlvr_datasets.py │ │ ├── retrieval_datasets.py │ │ ├── snli_ve_datasets.py │ │ ├── vg_vqa_datasets.py │ │ ├── video_caption_datasets.py │ │ ├── video_vqa_datasets.py │ │ └── vqa_datasets.py │ └── download_scripts │ │ ├── DownloadConceptualCaptions │ │ ├── LICENSE │ │ ├── README.md │ │ ├── create_annotation_12m.ipynb │ │ ├── create_annotation_3m.ipynb │ │ ├── download_data_cc12m.py │ │ └── download_data_cc3m.py │ │ ├── download_coco.py │ │ ├── download_didemo.py │ │ ├── download_flickr.py │ │ ├── download_gqa.py │ │ ├── download_msrvtt.py │ │ ├── download_msvd.py │ │ ├── download_nocaps.py │ │ ├── download_sbu.py │ │ └── download_vg.py ├── models │ ├── __init__.py │ ├── albef_models │ │ ├── __init__.py │ │ ├── albef_classification.py │ │ ├── albef_feature_extractor.py │ │ ├── albef_nlvr.py │ │ ├── albef_outputs.py │ │ ├── albef_pretrain.py │ │ ├── albef_retrieval.py │ │ └── albef_vqa.py │ ├── alpro_models │ │ ├── __init__.py │ │ ├── alpro_outputs.py │ │ ├── alpro_qa.py │ │ └── alpro_retrieval.py │ ├── base_model.py │ ├── blip2_models │ │ ├── Qformer.py │ │ ├── __init__.py │ │ ├── blip2.py │ │ ├── blip2_image_text_matching.py │ │ ├── blip2_opt.py │ │ ├── blip2_qformer.py │ │ ├── blip2_t5.py │ │ ├── blip2_t5_instruct.py │ │ ├── blip2_vicuna_instruct.py │ │ ├── modeling_llama.py │ │ ├── modeling_opt.py │ │ └── modeling_t5.py │ ├── blip_models │ │ ├── __init__.py │ │ ├── blip.py │ │ ├── blip_caption.py │ │ ├── blip_classification.py │ │ ├── blip_feature_extractor.py │ │ ├── blip_image_text_matching.py │ │ ├── blip_nlvr.py │ │ ├── 
blip_outputs.py │ │ ├── blip_pretrain.py │ │ ├── blip_retrieval.py │ │ ├── blip_vqa.py │ │ └── nlvr_encoder.py │ ├── clip_models │ │ ├── __init__.py │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── clip_outputs.py │ │ ├── loss.py │ │ ├── model.py │ │ ├── pics │ │ │ └── CLIP.png │ │ ├── pretrained.py │ │ ├── timm_model.py │ │ ├── tokenizer.py │ │ ├── transform.py │ │ └── utils.py │ ├── clip_vit.py │ ├── eva_vit.py │ ├── gpt_models │ │ └── gpt_dialogue.py │ ├── img2prompt_models │ │ ├── __init__.py │ │ └── img2prompt_vqa.py │ ├── med.py │ ├── pnp_vqa_models │ │ ├── __init__.py │ │ ├── pnp_unifiedqav2_fid.py │ │ └── pnp_vqa.py │ ├── timesformer │ │ ├── __init__.py │ │ ├── conv2d_same.py │ │ ├── features.py │ │ ├── helpers.py │ │ ├── linear.py │ │ ├── vit.py │ │ └── vit_utils.py │ └── vit.py ├── processors │ ├── __init__.py │ ├── alpro_processors.py │ ├── base_processor.py │ ├── blip_processors.py │ ├── clip_processors.py │ ├── functional_video.py │ ├── gpt_processors.py │ ├── randaugment.py │ └── transforms_video.py ├── projects │ ├── albef │ │ ├── eval │ │ │ ├── nlvr_eval.yaml │ │ │ ├── ret_coco_eval.yaml │ │ │ ├── ret_flickr30k_eval.yaml │ │ │ ├── snli_ve_eval.yaml │ │ │ ├── vqa_test.yaml │ │ │ └── vqa_val.yaml │ │ └── train │ │ │ ├── aokvqa_ft.yaml │ │ │ ├── nlvr_ft.yaml │ │ │ ├── okvqa_ft.yaml │ │ │ ├── pretrain.yaml │ │ │ ├── ret_coco_ft.yaml │ │ │ ├── ret_flickr30k_ft.yaml │ │ │ ├── snli_ve_ft.yaml │ │ │ └── vqa_ft.yaml │ ├── alpro │ │ ├── eval │ │ │ ├── didemo_ret_eval.yaml │ │ │ ├── msrvtt_qa_eval.yaml │ │ │ ├── msrvtt_ret_eval.yaml │ │ │ └── msvd_qa_eval.yaml │ │ └── train │ │ │ ├── didemo_ret_ft.yaml │ │ │ ├── msrvtt_qa_ft.yaml │ │ │ ├── msrvtt_retrieval_ft.yaml │ │ │ └── msvd_qa_ft.yaml │ ├── blip │ │ ├── coco_cap_ft_iter.yaml │ │ ├── eval │ │ │ ├── aokvqa_eval.yaml │ │ │ ├── caption_coco_eval.yaml │ │ │ ├── caption_coco_eval_large.yaml │ │ │ ├── nlvr_eval.yaml │ │ │ ├── nocaps_eval.yaml │ │ │ ├── okvqa_eval.yaml │ │ │ ├── ret_coco_eval.yaml │ │ │ ├── ret_flickr_eval.yaml │ │ │ └── vqav2_eval.yaml │ │ └── train │ │ │ ├── aokvqa_ft.yaml │ │ │ ├── caption_coco_ft.yaml │ │ │ ├── caption_coco_large_ft.yaml │ │ │ ├── nlvr_ft.yaml │ │ │ ├── okvqa_ft.yaml │ │ │ ├── pretrain_14m.yaml │ │ │ ├── retrieval_coco_ft.yaml │ │ │ ├── retrieval_flickr_ft.yaml │ │ │ └── vqav2_ft.yaml │ ├── blip2 │ │ ├── eval │ │ │ ├── caption_coco_flant5xl_eval.yaml │ │ │ ├── caption_coco_opt2.7b_eval.yaml │ │ │ ├── caption_coco_opt6.7b_eval.yaml │ │ │ ├── gqa_zeroshot_flant5xl_eval.yaml │ │ │ ├── okvqa_zeroshot_flant5xl_eval.yaml │ │ │ ├── ret_coco_eval.yaml │ │ │ ├── ret_flickr_eval.yaml │ │ │ ├── vqav2_zeroshot_flant5xl_eval.yaml │ │ │ └── vqav2_zeroshot_opt_eval.yaml │ │ └── train │ │ │ ├── caption_coco_ft.yaml │ │ │ ├── pretrain_stage1.yaml │ │ │ ├── pretrain_stage2.yaml │ │ │ └── retrieval_coco_ft.yaml │ ├── clip │ │ ├── exp_coco_ret_eval.yaml │ │ ├── exp_flickr_ret_eval.yaml │ │ └── exp_imnet_zs_eval.yaml │ ├── gpt │ │ ├── eval │ │ │ └── dialogue_avsd_eval.yaml │ │ └── train │ │ │ └── dialogue_avsd_ft.yaml │ └── pnp-vqa │ │ └── eval │ │ ├── gqa_eval.yaml │ │ ├── gqa_eval_3b.yaml │ │ ├── gqa_eval_large.yaml │ │ ├── okvqa_eval.yaml │ │ ├── okvqa_eval_3b.yaml │ │ ├── okvqa_eval_large.yaml │ │ ├── vqav2_eval.yaml │ │ ├── vqav2_eval_3b.yaml │ │ ├── vqav2_eval_large.yaml │ │ ├── vqav2_test_eval.yaml │ │ ├── vqav2_test_eval_3b.yaml │ │ └── vqav2_test_eval_large.yaml ├── runners │ ├── __init__.py │ ├── runner_base.py │ └── runner_iter.py └── tasks │ ├── __init__.py │ ├── base_task.py │ ├── captioning.py │ ├── dialogue.py 
│ ├── image_text_pretrain.py │ ├── multimodal_classification.py │ ├── retrieval.py │ ├── vqa.py │ └── vqa_reading_comprehension.py ├── llava_llama_2 ├── LICENSE ├── __init__.py ├── constants.py ├── conversation.py ├── eval │ ├── README.md │ ├── eval_gpt_review.py │ ├── eval_gpt_review_bench.py │ ├── eval_gpt_review_visual.py │ ├── eval_science_qa.py │ ├── eval_science_qa_gpt4.py │ ├── eval_science_qa_gpt4_requery.py │ ├── generate_webpage_data_from_table.py │ ├── model_qa.py │ ├── model_vqa.py │ ├── model_vqa_ds.py │ ├── model_vqa_science.py │ ├── qa_baseline_gpt35.py │ ├── run_llava.py │ ├── summarize_gpt_review.py │ ├── table │ │ ├── answer │ │ │ ├── answer_alpaca-13b.jsonl │ │ │ ├── answer_bard.jsonl │ │ │ ├── answer_gpt35.jsonl │ │ │ ├── answer_llama-13b.jsonl │ │ │ └── answer_vicuna-13b.jsonl │ │ ├── caps_boxes_coco2014_val_80.jsonl │ │ ├── model.jsonl │ │ ├── prompt.jsonl │ │ ├── question.jsonl │ │ ├── results │ │ │ └── test_sqa_llava_13b_v0.json │ │ ├── review │ │ │ ├── review_alpaca-13b_vicuna-13b.jsonl │ │ │ ├── review_bard_vicuna-13b.jsonl │ │ │ ├── review_gpt35_vicuna-13b.jsonl │ │ │ └── review_llama-13b_vicuna-13b.jsonl │ │ ├── reviewer.jsonl │ │ └── rule.json │ └── webpage │ │ ├── figures │ │ ├── alpaca.png │ │ ├── bard.jpg │ │ ├── chatgpt.svg │ │ ├── llama.jpg │ │ ├── swords_FILL0_wght300_GRAD0_opsz48.svg │ │ └── vicuna.jpeg │ │ ├── index.html │ │ ├── script.js │ │ └── styles.css ├── mm_utils.py ├── model │ ├── __init__.py │ ├── apply_delta.py │ ├── builder.py │ ├── consolidate.py │ ├── language_model │ │ ├── llava_llama.py │ │ ├── llava_mpt.py │ │ └── mpt │ │ │ ├── adapt_tokenizer.py │ │ │ ├── attention.py │ │ │ ├── blocks.py │ │ │ ├── configuration_mpt.py │ │ │ ├── custom_embedding.py │ │ │ ├── flash_attn_triton.py │ │ │ ├── hf_prefixlm_converter.py │ │ │ ├── meta_init_context.py │ │ │ ├── modeling_mpt.py │ │ │ ├── norm.py │ │ │ └── param_init_fns.py │ ├── llava_arch.py │ ├── make_delta.py │ ├── multimodal_encoder │ │ ├── builder.py │ │ └── clip_encoder.py │ └── utils.py ├── serve │ ├── __init__.py │ ├── cli.py │ ├── controller.py │ ├── examples │ │ ├── extreme_ironing.jpg │ │ └── waterview.jpg │ ├── gradio_web_server.py │ ├── model_worker.py │ ├── register_worker.py │ └── test_message.py ├── train │ ├── llama_flash_attn_monkey_patch.py │ ├── llava_trainer.py │ ├── train.py │ └── train_mem.py └── utils.py ├── llava_llama_2_utils ├── generator.py ├── prompt_wrapper.py └── visual_attacker.py ├── llava_llama_v2_inference.py ├── llava_llama_v2_visual_attack.py ├── metric ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-38.pyc │ └── perspective_api.cpython-38.pyc ├── detoxify.py └── perspective_api.py ├── minigpt4 ├── .DS_Store ├── LICENSE_MiniGPT-4.md ├── __init__.py ├── __pycache__ │ └── __init__.cpython-39.pyc ├── common │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── config.cpython-39.pyc │ │ ├── dist_utils.cpython-39.pyc │ │ ├── logger.cpython-39.pyc │ │ ├── registry.cpython-39.pyc │ │ └── utils.cpython-39.pyc │ ├── config.py │ ├── dist_utils.py │ ├── gradcam.py │ ├── logger.py │ ├── optims.py │ ├── registry.py │ └── utils.py ├── configs │ ├── .DS_Store │ ├── datasets │ │ ├── cc_sbu │ │ │ ├── align.yaml │ │ │ └── defaults.yaml │ │ └── laion │ │ │ └── defaults.yaml │ ├── default.yaml │ └── models │ │ ├── llama.yaml │ │ └── minigpt4.yaml ├── conversation │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ └── conversation.cpython-39.pyc │ └── conversation.py ├── datasets │ ├── __init__.py │ ├── __pycache__ │ │ ├── 
__init__.cpython-39.pyc │ │ └── data_utils.cpython-39.pyc │ ├── builders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-39.pyc │ │ │ ├── base_dataset_builder.cpython-39.pyc │ │ │ ├── custom_builder.cpython-39.pyc │ │ │ └── image_text_pair_builder.cpython-39.pyc │ │ ├── base_dataset_builder.py │ │ └── image_text_pair_builder.py │ ├── data_utils.py │ └── datasets │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── base_dataset.cpython-39.pyc │ │ ├── caption_datasets.cpython-39.pyc │ │ ├── cc_sbu_dataset.cpython-39.pyc │ │ ├── dataloader_utils.cpython-39.pyc │ │ └── laion_dataset.cpython-39.pyc │ │ ├── base_dataset.py │ │ ├── caption_datasets.py │ │ ├── cc_sbu_dataset.py │ │ ├── dataloader_utils.py │ │ └── laion_dataset.py ├── models │ ├── .DS_Store │ ├── Qformer.py │ ├── __init__.py │ ├── __pycache__ │ │ ├── Qformer.cpython-39.pyc │ │ ├── __init__.cpython-39.pyc │ │ ├── base_model.cpython-39.pyc │ │ ├── blip2.cpython-39.pyc │ │ ├── eva_vit.cpython-39.pyc │ │ ├── mini_gpt4.cpython-39.pyc │ │ └── modeling_llama.cpython-39.pyc │ ├── base_model.py │ ├── blip2.py │ ├── blip2_outputs.py │ ├── eva_vit.py │ ├── mini_gpt4.py │ └── modeling_llama.py ├── processors │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── base_processor.cpython-39.pyc │ │ ├── blip_processors.cpython-39.pyc │ │ └── randaugment.cpython-39.pyc │ ├── base_processor.py │ ├── blip_processors.py │ └── randaugment.py ├── runners │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ └── runner_base.cpython-39.pyc │ └── runner_base.py └── tasks │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-39.pyc │ ├── base_task.cpython-39.pyc │ └── image_text_pretrain.cpython-39.pyc │ ├── base_task.py │ └── image_text_pretrain.py ├── minigpt_inference.py ├── minigpt_red_teaming.py ├── minigpt_test_manual_prompts_text_llm.py ├── minigpt_test_manual_prompts_visual_llm.py ├── minigpt_textual_attack.py ├── minigpt_utils ├── generator.py ├── prompt_wrapper.py ├── text_attacker.py └── visual_attacker.py └── minigpt_visual_attack.py /adversarial_images/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/adversarial_images/.DS_Store -------------------------------------------------------------------------------- /adversarial_images/clean.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/adversarial_images/clean.jpeg -------------------------------------------------------------------------------- /adversarial_images/prompt_constrained_16.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/adversarial_images/prompt_constrained_16.bmp -------------------------------------------------------------------------------- /adversarial_images/prompt_constrained_32.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/adversarial_images/prompt_constrained_32.bmp 
-------------------------------------------------------------------------------- /adversarial_images/prompt_constrained_64.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/adversarial_images/prompt_constrained_64.bmp -------------------------------------------------------------------------------- /adversarial_images/prompt_unconstrained.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/adversarial_images/prompt_unconstrained.bmp -------------------------------------------------------------------------------- /assets/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/assets/.DS_Store -------------------------------------------------------------------------------- /assets/attack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/assets/attack.png -------------------------------------------------------------------------------- /assets/crime.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/assets/crime.png -------------------------------------------------------------------------------- /assets/demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/assets/demo.png -------------------------------------------------------------------------------- /assets/gender.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/assets/gender.png -------------------------------------------------------------------------------- /assets/human_race.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/assets/human_race.png -------------------------------------------------------------------------------- /assets/race.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/assets/race.png -------------------------------------------------------------------------------- /assets/religious-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/assets/religious-1.png 
-------------------------------------------------------------------------------- /assets/religious-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/assets/religious-2.png -------------------------------------------------------------------------------- /eval_configs/llama.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: mini_gpt4 3 | model_type: pretrain_llama 4 | freeze_vit: True 5 | freeze_qformer: True 6 | max_txt_len: 160 7 | end_sym: "###" 8 | low_resource: True 9 | prompt_path: "" 10 | prompt_template: '###Human: {} ###Assistant: ' 11 | ckpt: 'ckpts/pretrained_minigpt4.pth' 12 | 13 | 14 | datasets: 15 | cc_sbu_align: 16 | vis_processor: 17 | train: 18 | name: "blip2_image_eval" 19 | image_size: 224 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | 24 | run: 25 | task: image_text_pretrain 26 | -------------------------------------------------------------------------------- /eval_configs/minigpt4_eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: mini_gpt4 3 | model_type: pretrain_vicuna 4 | freeze_vit: True 5 | freeze_qformer: True 6 | max_txt_len: 160 7 | end_sym: "###" 8 | low_resource: True 9 | prompt_path: "" 10 | prompt_template: '###Human: {} ###Assistant: ' 11 | ckpt: 'ckpts/pretrained_minigpt4.pth' 12 | 13 | 14 | datasets: 15 | cc_sbu_align: 16 | vis_processor: 17 | train: 18 | name: "blip2_image_eval" 19 | image_size: 224 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | 24 | run: 25 | task: image_text_pretrain 26 | -------------------------------------------------------------------------------- /harmful_corpus/README.md: -------------------------------------------------------------------------------- 1 | # Harmful Corpus for Research Purposes 2 | 3 | **Warning: This directory contains materials that are toxic and offensive in nature.** 4 | 5 | `derogatory_corpus.csv` contains a small corpus of derogatory content targeting women, African Americans, and the human race. This corpus is used for optimizing adversarial examples and is only intended for research purposes. 6 | 7 | `manual_harmful_instructions.csv` contains 40 examples of harmful instructions for evaluation purposes. 8 | 9 | **These materials do not, in any way, reflect the personal views or beliefs of the authors.** 10 | -------------------------------------------------------------------------------- /lavis/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/lavis/.DS_Store -------------------------------------------------------------------------------- /lavis/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from lavis.common.registry import registry 14 | 15 | from lavis.datasets.builders import * 16 | from lavis.models import * 17 | from lavis.processors import * 18 | from lavis.tasks import * 19 | 20 | 21 | root_dir = os.path.dirname(os.path.abspath(__file__)) 22 | default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml")) 23 | 24 | registry.register_path("library_root", root_dir) 25 | repo_root = os.path.join(root_dir, "..") 26 | registry.register_path("repo_root", repo_root) 27 | cache_root = os.path.join(repo_root, default_cfg.env.cache_root) 28 | registry.register_path("cache_root", cache_root) 29 | 30 | registry.register("MAX_INT", sys.maxsize) 31 | registry.register("SPLIT_NAMES", ["train", "val", "test"]) 32 | -------------------------------------------------------------------------------- /lavis/common/gradcam.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | from scipy.ndimage import filters 4 | from skimage import transform as skimage_transform 5 | 6 | 7 | def getAttMap(img, attMap, blur=True, overlap=True): 8 | attMap -= attMap.min() 9 | if attMap.max() > 0: 10 | attMap /= attMap.max() 11 | attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant") 12 | if blur: 13 | attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2])) 14 | attMap -= attMap.min() 15 | attMap /= attMap.max() 16 | cmap = plt.get_cmap("jet") 17 | attMapV = cmap(attMap) 18 | attMapV = np.delete(attMapV, 3, 2) 19 | if overlap: 20 | attMap = ( 21 | 1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img 22 | + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV 23 | ) 24 | return attMap 25 | -------------------------------------------------------------------------------- /lavis/common/vqa_tools/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | __author__ = "aagrawal" 9 | -------------------------------------------------------------------------------- /lavis/configs/datasets/avsd/defaults_dial.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | avsd_dialogue: # name of the dataset builder 8 | dataset_card: dataset_card/avsd_dialogue.md 9 | data_type: features #extracted features of videos (I3D, VGGish) # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_train.json 16 | storage: avsd/annotations/train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_val.json 19 | storage: avsd/annotations/val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_test.json 22 | storage: avsd/annotations/test.json 23 | features: 24 | storage: avsd/features/ 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/coco/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_caption: # name of the dataset builder 8 | dataset_card: dataset_card/coco_caption.md 9 | # data_dir: ${env.data_dir}/datasets 10 | data_type: images # [images|videos|features] 11 | 12 | build_info: 13 | # Be careful not to append minus sign (-) before split to avoid itemizing 14 | annotations: 15 | train: 16 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json 17 | md5: aa31ac474cf6250ebb81d18348a07ed8 18 | storage: coco/annotations/coco_karpathy_train.json 19 | val: 20 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json 21 | md5: b273847456ef5580e33713b1f7de52a0 22 | storage: coco/annotations/coco_karpathy_val.json 23 | test: 24 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json 25 | md5: 3ff34b0ef2db02d01c37399f6a2a6cd1 26 | storage: coco/annotations/coco_karpathy_test.json 27 | images: 28 | storage: coco/images/ 29 | -------------------------------------------------------------------------------- /lavis/configs/datasets/coco/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_retrieval: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json 16 | md5: aa31ac474cf6250ebb81d18348a07ed8 17 | storage: coco/annotations/coco_karpathy_train.json 18 | val: 19 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json 20 | md5: b273847456ef5580e33713b1f7de52a0 21 | storage: coco/annotations/coco_karpathy_val.json 22 | test: 23 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json 24 | md5: 3ff34b0ef2db02d01c37399f6a2a6cd1 25 | storage: coco/annotations/coco_karpathy_test.json 26 | images: 27 | storage: coco/images/ 28 | -------------------------------------------------------------------------------- /lavis/configs/datasets/coco/eval_vqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | val: 15 | url: 16 | # TODO make this order insensitive 17 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val_eval.json 18 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json 19 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_OpenEnded_mscoco_val2014_questions.json 20 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_mscoco_val2014_annotations.json 21 | storage: 22 | - coco/annotations/vqa_val_eval.json 23 | - coco/annotations/answer_list.json 24 | - coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json 25 | - coco/annotations/v2_mscoco_val2014_annotations.json 26 | images: 27 | storage: coco/images/ 28 | -------------------------------------------------------------------------------- /lavis/configs/datasets/conceptual_caption/defaults_12m.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | conceptual_caption_12m: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - /export/home/workspace/datasets/cc12m.json 17 | storage: 18 | - conceptual_caption/annotations/cc12m.json 19 | images: 20 | storage: conceptual_caption/images_12m 21 | -------------------------------------------------------------------------------- /lavis/configs/datasets/conceptual_caption/defaults_3m.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | conceptual_caption_3m: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - /export/home/workspace/datasets/cc3m.json 17 | storage: 18 | - conceptual_caption/annotations/cc3m.json 19 | images: 20 | storage: conceptual_caption/images 21 | -------------------------------------------------------------------------------- /lavis/configs/datasets/didemo/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | didemo_retrieval: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_train.json 16 | storage: didemo/annotations/retrieval_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_val.json 19 | storage: didemo/annotations/retrieval_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_test.json 22 | storage: didemo/annotations/retrieval_test.json 23 | videos: 24 | storage: didemo/videos 25 | # storage: /export/share/dongxuli/data/didemo_retrieval/videos 26 | -------------------------------------------------------------------------------- /lavis/configs/datasets/flickr30k/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | flickr30k: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images 10 | 11 | build_info: 12 | annotations: 13 | train: 14 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_train.json 15 | storage: flickr30k/annotations/train.json 16 | val: 17 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_val.json 18 | storage: flickr30k/annotations/val.json 19 | test: 20 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_test.json 21 | storage: flickr30k/annotations/test.json 22 | images: 23 | storage: flickr30k/images 24 | # storage: /export/share/datasets/vision/flickr30k 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/gqa/balanced_testdev.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | gqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json 17 | storage: 18 | - gqa/annotations/train_balanced_questions.json 19 | val: 20 | url: 21 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/testdev_balanced_questions.json 22 | storage: 23 | - gqa/annotations/testdev_balanced_questions.json 24 | test: 25 | url: 26 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json 27 | storage: 28 | - gqa/annotations/test_balanced_questions.json 29 | images: 30 | storage: gqa/images/ 31 | -------------------------------------------------------------------------------- /lavis/configs/datasets/gqa/balanced_val.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | gqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json 17 | storage: 18 | - gqa/annotations/train_balanced_questions.json 19 | val: 20 | url: 21 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/val_balanced_questions.json 22 | storage: 23 | - gqa/annotations/val_balanced_questions.json 24 | test: 25 | url: 26 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json 27 | storage: 28 | - gqa/annotations/test_balanced_questions.json 29 | images: 30 | storage: gqa/images/ 31 | -------------------------------------------------------------------------------- /lavis/configs/datasets/imagenet/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | imagenet: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | splits: ["val"] 14 | images: 15 | storage: /export/share/datasets/vision/imagenet 16 | -------------------------------------------------------------------------------- /lavis/configs/datasets/laion/defaults_2B_multi.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | laion2B_multi: 8 | 9 | data_type: images 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar 14 | -------------------------------------------------------------------------------- /lavis/configs/datasets/msrvtt/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msrvtt_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_train.json 16 | storage: msrvtt/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_val.json 19 | storage: msrvtt/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_test.json 22 | storage: msrvtt/annotations/cap_test.json 23 | videos: 24 | storage: msrvtt/videos 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/msrvtt/defaults_qa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msrvtt_qa: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json 16 | storage: msrvtt/annotations/qa_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json 19 | storage: msrvtt/annotations/qa_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json 22 | storage: msrvtt/annotations/qa_test.json 23 | ans2label: 24 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json 25 | storage: msrvtt/annotations/qa_ans2label.json 26 | videos: 27 | storage: msrvtt/videos 28 | -------------------------------------------------------------------------------- /lavis/configs/datasets/msrvtt/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msrvtt_retrieval: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_train.json 16 | storage: msrvtt/annotations/retrieval_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_val.json 19 | storage: msrvtt/annotations/retrieval_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_test.json 22 | storage: msrvtt/annotations/retrieval_test.json 23 | videos: 24 | storage: msrvtt/videos 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/msvd/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_train.json 16 | storage: msvd/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_val.json 19 | storage: msvd/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_test.json 22 | storage: msvd/annotations/cap_test.json 23 | videos: 24 | storage: msvd/videos 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/msvd/defaults_qa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_qa: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json 16 | storage: msvd/annotations/qa_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json 19 | storage: msvd/annotations/qa_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json 22 | storage: msvd/annotations/qa_test.json 23 | ans2label: 24 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json 25 | storage: msvd/annotations/qa_ans2label.json 26 | videos: 27 | storage: msvd/videos 28 | 29 | instance_id_key: question_id 30 | -------------------------------------------------------------------------------- /lavis/configs/datasets/nlvr/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nlvr: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_train.json 16 | storage: nlvr/annotations/train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json 19 | storage: nlvr/annotations/dev.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json 22 | storage: nlvr/annotations/test.json 23 | images: 24 | storage: /export/share/datasets/vision/NLVR2/ 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/nocaps/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nocaps: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | val: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_val.json 16 | storage: nocaps/annotations/nocaps_val.json 17 | test: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_test.json 19 | storage: nocaps/annotations/nocaps_test.json 20 | images: 21 | storage: nocaps/images 22 | # storage: /export/share/datasets/vision/nocaps/ 23 | -------------------------------------------------------------------------------- /lavis/configs/datasets/sbu_caption/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | sbu_caption: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/sbu/sbu.json 17 | # - /export/share/dongxuli/data/lavis/sbu/annotation/sbu.json 18 | storage: 19 | - sbu_captions/annotations/sbu.json 20 | images: 21 | storage: sbu_captions/images 22 | # storage: /export/share/datasets/vision_language/sbu_resize 23 | -------------------------------------------------------------------------------- /lavis/configs/datasets/snli_ve/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | snli_ve: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_train.json 16 | storage: snli/annotations/ve_train.json 17 | val: 18 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_dev.json 19 | storage: snli/annotations/ve_dev.json 20 | test: 21 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_test.json 22 | storage: snli/annotations/ve_test.json 23 | images: 24 | storage: flickr30k/images/flickr30k-images 25 | # storage: /export/share/datasets/vision/flickr30k/flickr30k-images 26 | -------------------------------------------------------------------------------- /lavis/configs/datasets/vatex/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json 16 | storage: vatex/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json 19 | storage: vatex/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json 22 | storage: vatex/annotations/cap_test.json 23 | videos: 24 | storage: /export/share/dongxuli/data/vatex 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/vg/defaults_caption.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | vg_caption: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_caption.json 16 | storage: vg/annotations/vg_caption.json 17 | images: 18 | storage: vg/images/ 19 | -------------------------------------------------------------------------------- /lavis/configs/datasets/vg/defaults_vqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | vg_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_qa.json 16 | storage: vg/annotations/vg_qa.json 17 | images: 18 | storage: vg/images/ 19 | -------------------------------------------------------------------------------- /lavis/configs/default.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | env: 7 | # For default users 8 | # cache_root: "cache" 9 | # For internal use with persistent storage 10 | cache_root: "/export/home/.cache/lavis" 11 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_classification_ve.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_classification 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_snli_ve_lavis.pt" 11 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 12 | 13 | num_classes: 3 14 | 15 | use_distill: True 16 | momentum: 0.995 17 | alpha: 0.4 18 | 19 | # vit encoder 20 | vit_type: "base" 21 | vit_grad_ckpt: False 22 | vit_ckpt_layer: 0 23 | vit_layer_norm_epsilon: 1e-6 24 | 25 | image_size: 384 26 | 27 | # bert config 28 | med_config_path: "configs/models/med_config_albef.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | eval: 35 | name: "blip_image_eval" 36 | text_processor: 37 | train: 38 | name: "blip_caption" 39 | eval: 40 | name: "blip_caption" 41 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_feature_extractor.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | image_size: 224 13 | vit_ckpt_layer: 0 14 | vit_drop_path_rate: 0 15 | vit_layer_norm_epsilon: 1e-6 16 | vit_grad_ckpt: False 17 | 18 | # bert config 19 | med_config_path: "configs/models/med_config_albef.json" 20 | 21 | embed_dim: 256 22 | 23 | preprocess: 24 | vis_processor: 25 | eval: 26 | name: "blip_image_eval" 27 | image_size: 224 28 | text_processor: 29 | eval: 30 | name: "blip_caption" 31 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_nlvr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_nlvr 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/pretrain_model_nlvr.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_nlvr_lavis.pt" 12 | 13 | num_classes: 2 14 | 15 | use_distill: True 16 | momentum: 0.995 17 | alpha: 0.4 18 | 19 | # vit encoder 20 | vit_type: "base" 21 | vit_grad_ckpt: False 22 | vit_ckpt_layer: 0 23 | vit_layer_norm_epsilon: 1e-6 24 | 25 | image_size: 384 26 | 27 | # bert config 28 | med_config_path: "configs/models/med_config_albef.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 384 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 384 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | image_size: 224 15 | vit_ckpt_layer: 0 16 | vit_drop_path_rate: 0 17 | vit_layer_norm_epsilon: 1e-6 18 | vit_grad_ckpt: False 19 | 20 | # bert config 21 | med_config_path: "configs/models/med_config_albef.json" 22 | mlm_mask_prob: 0.15 23 | 24 | embed_dim: 256 25 | momentum: 0.995 26 | alpha: 0.4 27 | temp: 0.07 28 | 29 | max_txt_len: 30 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 256 36 | text_processor: 37 | train: 38 | name: "blip_caption" 39 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_retrieval_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_coco_retrieval_lavis.pt" 12 | 13 | queue_size: 65536 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | image_size: 384 18 | vit_ckpt_layer: 0 19 | vit_drop_path_rate: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | vit_grad_ckpt: False 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_config_albef.json" 25 | 26 | embed_dim: 256 27 | momentum: 0.995 28 | alpha: 0.4 29 | temp: 0.07 30 | use_distill: True 31 | 32 | max_txt_len: 30 33 | 34 | preprocess: 35 | vis_processor: 36 | train: 37 | name: "blip_image_train" 38 | image_size: 384 39 | eval: 40 | name: "blip_image_eval" 41 | image_size: 384 42 | text_processor: 43 | train: 44 | name: "blip_caption" 45 | eval: 46 | name: "blip_caption" 47 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_retrieval_flickr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_flickr_retrieval_lavis.pt 12 | 13 | queue_size: 65536 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | image_size: 384 18 | vit_ckpt_layer: 0 19 | vit_drop_path_rate: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | vit_grad_ckpt: False 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_config_albef.json" 25 | 26 | embed_dim: 256 27 | momentum: 0.995 28 | alpha: 0.4 29 | temp: 0.07 30 | use_distill: True 31 | 32 | max_txt_len: 30 33 | 34 | preprocess: 35 | vis_processor: 36 | train: 37 | name: "blip_image_train" 38 | image_size: 384 39 | eval: 40 | name: "blip_image_eval" 41 | image_size: 384 42 | text_processor: 43 | train: 44 | name: "blip_caption" 45 | eval: 46 | name: "blip_caption" 47 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_vqav2_lavis.pt" 12 | 13 | use_distill: True 14 | momentum: 0.995 15 | alpha: 0.4 16 | 17 | # vit encoder 18 | vit_type: "base" 19 | vit_grad_ckpt: False 20 | vit_ckpt_layer: 0 21 | vit_layer_norm_epsilon: 1e-6 22 | 23 | image_size: 384 24 | 25 | # bert config 26 | med_config_path: "configs/models/med_config_albef.json" 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 384 33 | eval: 34 | name: "blip_image_eval" 35 | image_size: 384 36 | text_processor: 37 | train: 38 | name: "blip_question" 39 | eval: 40 | name: "blip_question" 41 | -------------------------------------------------------------------------------- /lavis/configs/models/alpro_qa_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | num_classes: 1500 9 | 10 | load_finetuned: True 11 | 12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_qa.pth" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 14 | 15 | timesformer: 16 | n_frms: 16 17 | image_size: 224 18 | 19 | patch_size: 16 20 | attn_drop_rate: 0. 21 | drop_rate: 0. 22 | drop_path_rate: 0.1 23 | 24 | use_grad_ckpt: True 25 | ckpt_layer: 12 26 | 27 | # bert config 28 | med_config_path: "configs/models/bert_config_alpro.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "alpro_video_train" 34 | n_frms: 16 35 | image_size: 224 36 | eval: 37 | name: "alpro_video_eval" 38 | n_frms: 16 39 | image_size: 224 40 | text_processor: 41 | train: 42 | name: "blip_caption" 43 | eval: 44 | name: "blip_caption" 45 | -------------------------------------------------------------------------------- /lavis/configs/models/alpro_qa_msvd.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | num_classes: 2423 9 | 10 | load_finetuned: True 11 | 12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msvd_qa.pth" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 14 | 15 | timesformer: 16 | n_frms: 16 17 | image_size: 224 18 | 19 | patch_size: 16 20 | attn_drop_rate: 0. 21 | drop_rate: 0. 
22 | drop_path_rate: 0.1 23 | use_grad_ckpt: True 24 | ckpt_layer: 12 25 | 26 | # bert config 27 | med_config_path: "configs/models/bert_config_alpro.json" 28 | 29 | preprocess: 30 | vis_processor: 31 | train: 32 | name: "alpro_video_train" 33 | n_frms: 16 34 | image_size: 224 35 | eval: 36 | name: "alpro_video_eval" 37 | n_frms: 16 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /lavis/configs/models/alpro_retrieval_didemo.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_didemo_retrieval.pt 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0. 21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | eval: 30 | name: "alpro_video_eval" 31 | n_frms: 8 32 | image_size: 224 33 | text_processor: 34 | eval: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /lavis/configs/models/alpro_retrieval_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_retrieval.pt" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0. 
21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "alpro_video_train" 31 | n_frms: 8 32 | image_size: 224 33 | eval: 34 | name: "alpro_video_eval" 35 | n_frms: 8 36 | image_size: 224 37 | text_processor: 38 | train: 39 | name: "blip_caption" 40 | eval: 41 | name: "blip_caption" 42 | -------------------------------------------------------------------------------- /lavis/configs/models/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /lavis/configs/models/bert_config_alpro.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": true, 18 | "type_vocab_size": 2, 19 | "vocab_size": 30522, 20 | "encoder_width": 768, 21 | "add_cross_attention": false, 22 | "fusion_layer": 6 23 | } -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_caption_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_flant5xl 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_flant5xl.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_opt2.7b 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt2.7b.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_opt6.7b 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt6.7b.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: coco 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_finetune_coco.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: True 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 364 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 364 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: flant5xl 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxl_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: flant5xxl 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxxl_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xxl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna13b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "./ckpts/vicuna-13b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna7b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "./llm/vicuna-7b" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 224 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 224 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl_vitL.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | vit_model: "clip_L" 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xxl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xxl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_llama7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip2_llama 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # LLM 24 | llm_model: "/export/home/project/stanford_alpaca/llama_7B" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip2_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt2.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt6.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_vitL.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_vitL.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | vit_model: "clip_L" 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | 25 | preprocess: 26 | vis_processor: 27 | train: 28 | name: "blip_image_train" 29 | image_size: 224 30 | eval: 31 | name: "blip_image_eval" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_caption_base_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_caption_base.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | 18 | image_size: 384 19 | 20 | # bert config 21 | med_config_path: "configs/models/med_config.json" 22 | 23 | # generation configs 24 | prompt: "a picture of " 25 | 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | eval: 32 | name: "blip_image_eval" 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | prompt: "a picture of " 37 | eval: 38 | name: "blip_caption" 39 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_caption_large_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth" 12 | 13 | vit_type: "large" 14 | vit_grad_ckpt: True 15 | vit_ckpt_layer: 5 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | # generation configs 23 | prompt: "a picture of " 24 | 25 | 26 | preprocess: 27 | vis_processor: 28 | train: 29 | name: "blip_image_train" 30 | eval: 31 | name: "blip_image_eval" 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | prompt: "a picture of " 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_classification_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_classification 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | 10 | use_distill: True 11 | momentum: 0.995 12 | alpha: 0.4 13 | 14 | # vit encoder 15 | vit_type: "base" 16 | vit_grad_ckpt: False 17 | vit_ckpt_layer: 0 18 | 19 | image_size: 384 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_feature_extractor_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | vit_grad_ckpt: False 13 | vit_ckpt_layer: 0 14 | 15 | image_size: 224 16 | 17 | # bert config 18 | med_config_path: "configs/models/med_config.json" 19 | 20 | embed_dim: 256 21 | 22 | preprocess: 23 | vis_processor: 24 | eval: 25 | name: "blip_image_eval" 26 | image_size: 224 27 | text_processor: 28 | eval: 29 | name: "blip_caption" 30 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_itm_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_itm_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "large" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_nlvr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_nlvr 8 | model_type: nlvr 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_nlvr.pth" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 13 | 14 | num_classes: 2 15 | 16 | # vit encoder 17 | vit_type: "base" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | 22 | image_size: 384 23 | 24 | # bert config 25 | med_config_path: "configs/models/med_config.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 224 18 | alpha: 0.4 19 | 20 | # bert config 21 | med_config_path: "configs/models/bert_config.json" 22 | 23 | embed_dim: 256 24 | 25 | # generation configs 26 | prompt: "a picture of " 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_pretrain_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | # vit encoder 10 | vit_type: "large" 11 | vit_grad_ckpt: True 12 | vit_ckpt_layer: 5 13 | 14 | image_size: 224 15 | 16 | # bert config 17 | med_config_path: "configs/models/med_large_config.json" 18 | 19 | embed_dim: 256 20 | 21 | # generation configs 22 | prompt: "a picture of " 23 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_retrieval_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | vit_grad_ckpt: True 18 | vit_ckpt_layer: 4 19 | 20 | image_size: 384 21 | 22 | # bert config 23 | med_config_path: "configs/models/med_config.json" 24 | 25 | embed_dim: 256 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_retrieval_flickr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_flickr_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | alpha: 0.4 15 | 16 | negative_all_rank: False 17 | 18 | # vit encoder 19 | vit_type: "base" 20 | vit_grad_ckpt: True 21 | vit_ckpt_layer: 4 22 | 23 | image_size: 384 24 | 25 | # bert config 26 | med_config_path: "configs/models/med_config.json" 27 | 28 | embed_dim: 256 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 384 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 384 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_vqa_aokvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_aokvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_vqa_okvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_okvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 
17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-16-plus-240.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 240, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-16-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-32-plus-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | 
-------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-H-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-14-280.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 280, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-16-320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 320, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- 
/lavis/configs/models/clip/ViT-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-efficientnetv2_rw_s.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "efficientnetv2_rw_s", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 288 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-resnet50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnet50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-resnetaa50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetaa50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-resnetblur50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetblur50", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-swin_base_patch4_window7_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "swin_base_patch4_window7_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-vit_base_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | 
"vision_cfg": { 4 | "timm_model_name": "vit_base_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-vit_base_patch32_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_base_patch32_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-vit_small_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_small_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip_resnet50.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: RN50 10 | 11 | pretrained: openai 12 | -------------------------------------------------------------------------------- /lavis/configs/models/clip_vit_base16.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-B-16 10 | 11 | pretrained: openai 12 | 13 | preprocess: 14 | vis_processor: 15 | eval: 16 | name: "clip_image_eval" 17 | image_size: 224 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip_vit_base32.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-B-32 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 224 53 | -------------------------------------------------------------------------------- /lavis/configs/models/clip_vit_large14.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 224 53 | -------------------------------------------------------------------------------- /lavis/configs/models/clip_vit_large14_336.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 336 53 | -------------------------------------------------------------------------------- /lavis/configs/models/gpt_dialogue_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: gpt_dialogue 8 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 10 | 11 | len_tokenizer: 50264 # 50257 tokens from gpt2 default tokenizer + additional special tokens 12 | 13 | len_video_ft: 4224 # i3d_rgb: 2048 i3d_flow: 2048 vggish: 128 14 | 15 | preprocess: 16 | vis_processor: 17 | train: 18 | name: "gpt_video_ft" 19 | eval: 20 | name: "gpt_video_ft" 21 | text_processor: 22 | train: 23 | name: "gpt_dialogue" 24 | eval: 25 | name: "gpt_dialogue" -------------------------------------------------------------------------------- /lavis/configs/models/med_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /lavis/configs/models/med_config_albef.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 
1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true, 21 | "fusion_layer": 6 22 | } -------------------------------------------------------------------------------- /lavis/configs/models/med_large_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 1024, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /lavis/datasets/builders/classification_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder 10 | from lavis.datasets.datasets.nlvr_datasets import NLVRDataset, NLVREvalDataset 11 | from lavis.datasets.datasets.snli_ve_datasets import SNLIVisualEntialmentDataset 12 | 13 | 14 | @registry.register_builder("nlvr") 15 | class NLVRBuilder(BaseDatasetBuilder): 16 | train_dataset_cls = NLVRDataset 17 | eval_dataset_cls = NLVREvalDataset 18 | 19 | DATASET_CONFIG_DICT = {"default": "configs/datasets/nlvr/defaults.yaml"} 20 | 21 | 22 | @registry.register_builder("snli_ve") 23 | class SNLIVisualEntailmentBuilder(BaseDatasetBuilder): 24 | train_dataset_cls = SNLIVisualEntialmentDataset 25 | eval_dataset_cls = SNLIVisualEntialmentDataset 26 | 27 | DATASET_CONFIG_DICT = {"default": "configs/datasets/snli_ve/defaults.yaml"} 28 | -------------------------------------------------------------------------------- /lavis/datasets/builders/dialogue_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder 10 | from lavis.datasets.datasets.avsd_dialogue_datasets import ( 11 | AVSDDialDataset, 12 | AVSDDialEvalDataset, 13 | ) 14 | 15 | 16 | @registry.register_builder("avsd_dialogue") 17 | class AVSDDialBuilder(BaseDatasetBuilder): 18 | train_dataset_cls = AVSDDialDataset 19 | eval_dataset_cls = AVSDDialEvalDataset 20 | 21 | DATASET_CONFIG_DICT = {"default": "configs/datasets/avsd/defaults_dial.yaml"} 22 | -------------------------------------------------------------------------------- /lavis/datasets/datasets/multimodal_classification_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from abc import abstractmethod 9 | from lavis.datasets.datasets.base_dataset import BaseDataset 10 | 11 | 12 | class MultimodalClassificationDataset(BaseDataset): 13 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 14 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 15 | 16 | self.class_labels = None 17 | 18 | @abstractmethod 19 | def _build_class_labels(self): 20 | pass 21 | -------------------------------------------------------------------------------- /lavis/datasets/datasets/vg_vqa_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | 10 | from PIL import Image 11 | 12 | from lavis.datasets.datasets.vqa_datasets import VQADataset 13 | 14 | 15 | class VGVQADataset(VQADataset): 16 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 17 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 18 | 19 | def __getitem__(self, index): 20 | ann = self.annotation[index] 21 | 22 | image_path = os.path.join(self.vis_root, ann["image"]) 23 | image = Image.open(image_path).convert("RGB") 24 | 25 | image = self.vis_processor(image) 26 | question = self.text_processor(ann["question"]) 27 | 28 | answers = [ann["answer"]] 29 | # TODO this should be configured better 30 | weights = [0.2] 31 | 32 | return { 33 | "image": image, 34 | "text_input": question, 35 | "answers": answers, 36 | "weights": weights, 37 | } 38 | -------------------------------------------------------------------------------- /lavis/datasets/download_scripts/DownloadConceptualCaptions/LICENSE: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Dongxu Li, Junnan Li, Hung Le, Guangsen Wang, Silvio Savarese, Steven Hoi. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | MIT License 6 | 7 | Copyright (c) 2019 Igor Brigadir 8 | 9 | Permission is hereby granted, free of charge, to any person obtaining a copy 10 | of this software and associated documentation files (the "Software"), to deal 11 | in the Software without restriction, including without limitation the rights 12 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | copies of the Software, and to permit persons to whom the Software is 14 | furnished to do so, subject to the following conditions: 15 | 16 | The above copyright notice and this permission notice shall be included in all 17 | copies or substantial portions of the Software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | SOFTWARE. 26 | -------------------------------------------------------------------------------- /lavis/datasets/download_scripts/DownloadConceptualCaptions/README.md: -------------------------------------------------------------------------------- 1 | 7 | 8 | # Download Conceptual Captions Data 9 | 10 | Place the data from https://ai.google.com/research/ConceptualCaptions/download in this folder. 11 | 12 | `Train_GCC-training.tsv / cc3m.tsv` Training Split (3,318,333) 13 | 14 | Run `download_data_cc3m.py` or `download_data_cc12m.py`. 15 | 16 | Images will be stored in the default LAVIS cache folders. You can stop and resume the download. The settings for splitting downloads into chunks / threads are not optimal, but they saturated the original author's connection and were therefore left as is. 17 | 18 | Note: a previous version of this script used a different file naming scheme. This has since changed, so resuming a download started with the old scheme will produce duplicates. 19 | 20 | Some downloads will fail and return web pages instead of images; these need to be cleaned up afterwards. See `downloaded_validation_report.tsv` after the download completes for HTTP errors. Based on validation-set results, around 8% of the images are no longer available. Setting a user agent might fix some of these errors, though it is unclear whether any sites actually reject requests based on it. 21 | 22 | Downloading the training data should take about a day or two; keep an eye on disk space. 23 | -------------------------------------------------------------------------------- /lavis/models/blip2_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/lavis/models/blip2_models/__init__.py -------------------------------------------------------------------------------- /lavis/models/clip_models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | 7 | Based on https://github.com/mlfoundations/open_clip 8 | """ 9 | 10 | """ OpenAI pretrained model functions 11 | Adapted from https://github.com/mlfoundations/open_clip and https://github.com/openai/CLIP. 12 | 13 | Originally MIT License, Copyright (c) 2021 OpenAI. 14 | """ 15 | -------------------------------------------------------------------------------- /lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /lavis/models/clip_models/pics/CLIP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/lavis/models/clip_models/pics/CLIP.png -------------------------------------------------------------------------------- /lavis/models/img2prompt_models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import torch 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /lavis/models/pnp_vqa_models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import torch 9 | 10 | 11 | def prepare_qa_input(sample, num_captions, num_captions_fid): 12 | sample_question_captions = [] 13 | 14 | for question, captions in zip(sample['text_input'], sample['captions']): 15 | assert isinstance(captions, list) 16 | question_captions = [] 17 | question_caption = '' 18 | for cap_id, cap_ in enumerate(captions[0:num_captions]): 19 | question_caption += (cap_.strip() + '. ') 20 | if (cap_id + 1) != num_captions and ((cap_id + 1) % num_captions_fid == 0): 21 | question_caption = question.lower().strip() + " \\n " + question_caption.lower().strip() 22 | question_captions.append(question_caption) 23 | question_caption = '' 24 | if (cap_id + 1) == num_captions: 25 | question_caption = question.lower().strip() + " \\n " + question_caption.lower().strip() 26 | question_captions.append(question_caption) 27 | sample_question_captions.append(question_captions) 28 | 29 | sample['question_captions'] = sample_question_captions 30 | -------------------------------------------------------------------------------- /lavis/models/timesformer/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | 7 | Based on https://github.com/facebookresearch/TimeSformer 8 | """ 9 | -------------------------------------------------------------------------------- /lavis/models/timesformer/linear.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | """ Linear layer (alternate definition) 9 | """ 10 | import torch 11 | import torch.nn.functional as F 12 | from torch import nn as nn 13 | 14 | 15 | class Linear(nn.Linear): 16 | def forward(self, input: torch.Tensor) -> torch.Tensor: 17 | if torch.jit.is_scripting(): 18 | bias = self.bias.to(dtype=input.dtype) if self.bias is not None else None 19 | return F.linear(input, self.weight.to(dtype=input.dtype), bias=bias) 20 | else: 21 | return F.linear(input, self.weight, self.bias) 22 | -------------------------------------------------------------------------------- /lavis/processors/base_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from omegaconf import OmegaConf 9 | 10 | 11 | class BaseProcessor: 12 | def __init__(self): 13 | self.transform = lambda x: x 14 | return 15 | 16 | def __call__(self, item): 17 | return self.transform(item) 18 | 19 | @classmethod 20 | def from_config(cls, cfg=None): 21 | return cls() 22 | 23 | def build(self, **kwargs): 24 | cfg = OmegaConf.create(kwargs) 25 | 26 | return self.from_config(cfg) 27 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/nlvr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_nlvr 8 | model_type: nlvr 9 | 10 | datasets: 11 | nlvr: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: multimodal_classification 22 | 23 | batch_size_train: 16 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | seed: 42 28 | output_dir: "output/ALBEF/NLVR" 29 | 30 | evaluate: True 31 | test_splits: ["val", "test"] 32 | 33 | # distribution-specific 34 | device: "cuda" 35 | world_size: 1 36 | dist_url: "env://" 37 | distributed: True 38 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/ret_coco_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | model_type: coco 9 | 10 | datasets: 11 | coco_retrieval: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: retrieval 22 | 23 | # dataloading 24 | num_workers: 4 25 | batch_size_train: 32 26 | batch_size_eval: 64 27 | 28 | test_splits: ["test"] 29 | 30 | # distribution 31 | device: "cuda" 32 | world_size: 1 33 | dist_url: "env://" 34 | distributed: True 35 | use_dist_eval_sampler: False 36 | 37 | # model specific 38 | k_test: 128 39 | 40 | # misc 41 | seed: 42 42 | output_dir: "output/ALBEF/Retrieval_COCO" 43 | 44 | evaluate: True 45 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/ret_flickr30k_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | model_type: flickr 9 | 10 | datasets: 11 | flickr30k: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: retrieval 22 | 23 | # dataloading 24 | num_workers: 4 25 | batch_size_train: 32 26 | batch_size_eval: 64 27 | 28 | test_splits: ["test"] 29 | 30 | # distribution 31 | device: "cuda" 32 | world_size: 1 33 | dist_url: "env://" 34 | distributed: True 35 | use_dist_eval_sampler: False 36 | 37 | # model specific 38 | k_test: 128 39 | 40 | # misc 41 | seed: 42 42 | output_dir: "output/ALBEF/Retrieval_Flickr30k" 43 | 44 | evaluate: True 45 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/snli_ve_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_classification 8 | model_type: ve 9 | 10 | datasets: 11 | snli_ve: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | text_processor: 16 | eval: 17 | name: "blip_caption" 18 | 19 | run: 20 | task: multimodal_classification 21 | # optimization-specific 22 | batch_size_train: 32 23 | batch_size_eval: 64 24 | num_workers: 4 25 | 26 | seed: 42 27 | output_dir: "output/ALBEF/SNLI_VE" 28 | 29 | evaluate: True 30 | test_splits: ["val", "test"] 31 | 32 | # distribution-specific 33 | device: "cuda" 34 | world_size: 1 35 | dist_url: "env://" 36 | distributed: True 37 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/vqa_test.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | model_type: vqav2 9 | 10 | image_size: 384 11 | 12 | 13 | datasets: 14 | coco_vqa: # name of the dataset builder 15 | vis_processor: 16 | eval: 17 | name: "blip_image_eval" 18 | image_size: 384 19 | text_processor: 20 | eval: 21 | name: "blip_question" 22 | 23 | run: 24 | task: vqa 25 | 26 | # optimization-specific 27 | batch_size_train: 16 28 | batch_size_eval: 64 29 | num_workers: 4 30 | 31 | # inference-specific 32 | max_len: 10 33 | min_len: 1 34 | num_beams: 3 35 | num_ans_candidates: 128 36 | inference_method: "rank" 37 | 38 | seed: 42 39 | output_dir: "output/ALBEF/VQA" 40 | 41 | evaluate: True 42 | train_splits: ["train"] 43 | test_splits: ["test"] 44 | 45 | # distribution-specific 46 | device: "cuda" 47 | world_size: 1 48 | dist_url: "env://" 49 | distributed: True 50 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/vqa_val.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | model_type: vqav2 9 | 10 | image_size: 384 11 | 12 | datasets: 13 | coco_vqa: # name of the dataset builder 14 | type: eval 15 | vis_processor: 16 | eval: 17 | name: "blip_image_eval" 18 | image_size: 384 19 | text_processor: 20 | eval: 21 | name: "blip_question" 22 | 23 | run: 24 | task: vqa 25 | 26 | # optimization-specific 27 | batch_size_train: 16 28 | batch_size_eval: 64 29 | num_workers: 4 30 | 31 | # inference-specific 32 | max_len: 10 33 | min_len: 1 34 | num_beams: 3 35 | num_ans_candidates: 128 36 | inference_method: "rank" 37 | 38 | seed: 42 39 | output_dir: "output/ALBEF/VQA" 40 | 41 | evaluate: True 42 | test_splits: ["val"] 43 | 44 | # distribution-specific 45 | device: "cuda" 46 | world_size: 1 47 | dist_url: "env://" 48 | distributed: True 49 | -------------------------------------------------------------------------------- /lavis/projects/albef/train/nlvr_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_nlvr 8 | model_type: nlvr 9 | load_finetuned: False 10 | 11 | datasets: 12 | nlvr: # name of the dataset builder 13 | vis_processor: 14 | train: 15 | name: "blip_image_train" 16 | image_size: 384 17 | eval: 18 | name: "blip_image_eval" 19 | image_size: 384 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | task: multimodal_classification 28 | # optimization-specific 29 | lr_sched: "linear_warmup_cosine_lr" 30 | init_lr: 2e-5 31 | min_lr: 1e-6 32 | weight_decay: 0.02 33 | warmup_lr: 1e-5 34 | warmup_steps: 650 35 | max_epoch: 10 36 | batch_size_train: 16 37 | batch_size_eval: 64 38 | num_workers: 4 39 | 40 | seed: 42 41 | output_dir: "output/ALBEF/NLVR" 42 | 43 | amp: False 44 | resume_ckpt_path: null 45 | 46 | evaluate: False 47 | train_splits: ["train"] 48 | valid_splits: ["val", "test"] 49 | test_splits: ["test"] 50 | 51 | # distribution-specific 52 | device: "cuda" 53 | world_size: 1 54 | dist_url: "env://" 55 | distributed: True 56 | -------------------------------------------------------------------------------- /lavis/projects/albef/train/snli_ve_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_classification 8 | model_type: ve 9 | load_finetuned: False 10 | num_classes: 3 11 | 12 | datasets: 13 | snli_ve: # name of the dataset builder 14 | vis_processor: 15 | train: 16 | name: "blip_image_train" 17 | eval: 18 | name: "blip_image_eval" 19 | text_processor: 20 | train: 21 | name: "blip_caption" 22 | eval: 23 | name: "blip_caption" 24 | 25 | run: 26 | task: multimodal_classification 27 | # optimization-specific 28 | lr_sched: "linear_warmup_cosine_lr" 29 | init_lr: 2e-5 30 | min_lr: 0 31 | weight_decay: 0.05 32 | max_epoch: 10 33 | batch_size_train: 32 34 | batch_size_eval: 64 35 | num_workers: 4 36 | 37 | seed: 42 38 | output_dir: "output/ALBEF/SNLI_VE" 39 | 40 | amp: False 41 | resume_ckpt_path: null 42 | 43 | evaluate: False 44 | train_splits: ["train"] 45 | valid_splits: ["val"] 46 | test_splits: ["test"] 47 | 48 | # distribution-specific 49 | device: "cuda" 50 | world_size: 1 51 | dist_url: "env://" 52 | distributed: True 53 | -------------------------------------------------------------------------------- /lavis/projects/alpro/eval/didemo_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | model_type: didemo 9 | 10 | max_txt_len: 50 11 | 12 | timesformer: 13 | n_frms: 8 14 | image_size: 224 15 | 16 | 17 | datasets: 18 | didemo_retrieval: # name of the dataset builder 19 | vis_processor: 20 | eval: 21 | name: "alpro_video_eval" 22 | n_frms: 8 23 | image_size: 224 24 | text_processor: 25 | eval: 26 | name: "blip_caption" 27 | 28 | run: 29 | task: retrieval 30 | # optimization-specific 31 | batch_size_train: 8 32 | batch_size_eval: 64 33 | num_workers: 4 34 | 35 | # k_test: 256 36 | k_test: 1000 37 | 38 | seed: 42 39 | output_dir: "output/ALPRO/didemo_retrieval" 40 | 41 | evaluate: True 42 | train_splits: ["train"] 43 | valid_splits: ["val", "test"] 44 | test_splits: ["test"] 45 | 46 | # distribution-specific 47 | device: "cuda" 48 | world_size: 1 49 | dist_url: "env://" 50 | distributed: True 51 | use_dist_eval_sampler: False 52 | -------------------------------------------------------------------------------- /lavis/projects/alpro/eval/msrvtt_qa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | model_type: msrvtt 9 | 10 | datasets: 11 | msrvtt_qa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "alpro_video_eval" 15 | n_frms: 16 16 | image_size: 224 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | 21 | run: 22 | task: multimodal_classification 23 | # optimization-specific 24 | batch_size_train: 32 25 | batch_size_eval: 64 26 | num_workers: 4 27 | 28 | seed: 42 29 | output_dir: "output/ALPRO/msrvtt_qa" 30 | 31 | evaluate: True 32 | valid_splits: ["val"] 33 | test_splits: ["test"] 34 | 35 | # distribution-specific 36 | device: "cuda" 37 | world_size: 1 38 | dist_url: "env://" 39 | distributed: True 40 | -------------------------------------------------------------------------------- /lavis/projects/alpro/eval/msrvtt_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | model_type: msrvtt 9 | 10 | datasets: 11 | msrvtt_retrieval: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "alpro_video_eval" 15 | n_frms: 8 16 | image_size: 224 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | 21 | run: 22 | task: retrieval 23 | # optimization-specific 24 | batch_size_train: 24 25 | batch_size_eval: 64 26 | num_workers: 4 27 | 28 | # k_test: 256 29 | k_test: 1000 30 | 31 | seed: 42 32 | output_dir: "output/ALPRO/msrvtt_retrieval" 33 | 34 | evaluate: True 35 | test_splits: ["test"] 36 | 37 | # distribution-specific 38 | device: "cuda" 39 | world_size: 1 40 | dist_url: "env://" 41 | distributed: True 42 | use_dist_eval_sampler: False 43 | -------------------------------------------------------------------------------- /lavis/projects/alpro/eval/msvd_qa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | model_type: msvd 9 | 10 | datasets: 11 | msvd_qa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "alpro_video_eval" 15 | n_frms: 16 16 | image_size: 224 17 | text_processor: 18 | train: 19 | name: "blip_caption" 20 | eval: 21 | name: "blip_caption" 22 | 23 | run: 24 | task: multimodal_classification 25 | # optimization-specific 26 | batch_size_train: 24 27 | batch_size_eval: 64 28 | num_workers: 4 29 | 30 | seed: 42 31 | output_dir: "output/ALPRO/msvd_qa" 32 | 33 | evaluate: True 34 | test_splits: ["test"] 35 | 36 | # distribution-specific 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | -------------------------------------------------------------------------------- /lavis/projects/blip/coco_cap_ft_iter.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | model_type: large 9 | 10 | datasets: 11 | coco_caption: # name of the dataset builder 12 | vis_processor: 13 | train: 14 | name: "blip_image_train" 15 | eval: 16 | name: "blip_image_eval" 17 | text_processor: 18 | train: 19 | name: "blip_caption" 20 | prompt: "a picture of " 21 | eval: 22 | name: "blip_caption" 23 | 24 | run: 25 | runner: runner_iter 26 | 27 | max_iters: 2e4 28 | iters_per_inner_epoch: 2e3 29 | 30 | # task: retrieval 31 | task: captioning 32 | # optimizer 33 | lr_sched: "linear_warmup_cosine_lr" 34 | init_lr: 2e-6 35 | min_lr: 0 36 | weight_decay: 0.05 37 | batch_size_train: 16 38 | batch_size_eval: 64 39 | num_workers: 4 40 | 41 | max_len: 20 42 | min_len: 5 43 | num_beams: 3 44 | 45 | seed: 42 46 | output_dir: "output/BLIP/Caption_coco" 47 | 48 | amp: False 49 | resume_ckpt_path: null 50 | 51 | evaluate: False 52 | train_splits: ["train"] 53 | valid_splits: ["val", "test"] 54 | 55 | device: "cuda" 56 | world_size: 1 57 | dist_url: "env://" 58 | distributed: True 59 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/aokvqa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | model_type: aokvqa 9 | image_size: 480 10 | 11 | datasets: 12 | aok_vqa: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 480 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: aok_vqa 23 | # optimization-specific 24 | batch_size_train: 64 25 | batch_size_eval: 64 26 | num_workers: 4 27 | 28 | # inference-specific 29 | max_len: 10 30 | min_len: 1 31 | num_beams: 3 32 | num_ans_candidates: 128 33 | inference_method: "rank" 34 | 35 | seed: 42 36 | output_dir: "output/BLIP/AOKVQA" 37 | 38 | evaluate: True 39 | test_splits: ["val", "test"] 40 | 41 | # distribution-specific 42 | device: "cuda" 43 | world_size: 1 44 | dist_url: "env://" 45 | distributed: True 46 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/caption_coco_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | model_type: base_coco 9 | 10 | datasets: 11 | coco_caption: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | text_processor: 16 | eval: 17 | name: "blip_caption" 18 | 19 | run: 20 | # task: retrieval 21 | task: captioning 22 | # optimizer 23 | batch_size_train: 32 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | max_len: 20 28 | min_len: 5 29 | num_beams: 3 30 | 31 | seed: 42 32 | output_dir: "output/BLIP/Caption_coco" 33 | 34 | evaluate: True 35 | test_splits: ["test"] 36 | 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/caption_coco_eval_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | model_type: large_coco 9 | 10 | datasets: 11 | coco_caption: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | text_processor: 16 | eval: 17 | name: "blip_caption" 18 | 19 | run: 20 | # task: retrieval 21 | task: captioning 22 | # optimizer 23 | batch_size_train: 32 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | max_len: 20 28 | min_len: 5 29 | num_beams: 3 30 | 31 | seed: 42 32 | output_dir: "output/BLIP/Caption_coco" 33 | 34 | evaluate: True 35 | test_splits: ["test"] 36 | 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/nlvr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_nlvr 8 | model_type: nlvr 9 | 10 | datasets: 11 | nlvr: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: multimodal_classification 22 | 23 | batch_size_train: 16 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | seed: 42 28 | output_dir: "output/BLIP/NLVR" 29 | 30 | evaluate: True 31 | test_splits: ["val", "test"] 32 | 33 | # distribution-specific 34 | device: "cuda" 35 | world_size: 1 36 | dist_url: "env://" 37 | distributed: True 38 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/nocaps_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | model_type: base_coco 9 | # pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth' 10 | 11 | datasets: 12 | nocaps: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 384 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | prompt: "a picture of " 21 | 22 | run: 23 | # task: retrieval 24 | task: captioning 25 | # optimizer 26 | batch_size_train: 32 27 | batch_size_eval: 64 28 | num_workers: 4 29 | 30 | max_len: 20 31 | min_len: 5 32 | num_beams: 3 33 | 34 | seed: 42 35 | output_dir: "output/BLIP/NoCaps" 36 | 37 | evaluate: True 38 | test_splits: ["val", "test"] 39 | 40 | device: "cuda" 41 | world_size: 1 42 | dist_url: "env://" 43 | distributed: True 44 | 45 | report_metric: False 46 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/okvqa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | model_type: okvqa 9 | image_size: 480 10 | 11 | datasets: 12 | ok_vqa: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 480 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: vqa 23 | # optimization-specific 24 | batch_size_train: 16 25 | batch_size_eval: 16 26 | num_workers: 4 27 | 28 | # inference-specific 29 | max_len: 10 30 | min_len: 1 31 | num_beams: 3 32 | num_ans_candidates: 128 33 | inference_method: "rank" 34 | 35 | seed: 42 36 | output_dir: "output/BLIP/OKVQA" 37 | 38 | evaluate: True 39 | test_splits: ["test"] 40 | 41 | # distribution-specific 42 | device: "cuda" 43 | world_size: 1 44 | dist_url: "env://" 45 | distributed: True 46 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/ret_coco_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | model_type: coco 9 | 10 | datasets: 11 | coco_retrieval: # name of the dataset builder 12 | vis_processor: 13 | train: 14 | name: "blip_image_train" 15 | image_size: 384 16 | eval: 17 | name: "blip_image_eval" 18 | image_size: 384 19 | text_processor: 20 | train: 21 | name: "blip_caption" 22 | eval: 23 | name: "blip_caption" 24 | 25 | run: 26 | task: retrieval 27 | 28 | # dataloading 29 | num_workers: 4 30 | batch_size_train: 32 31 | batch_size_eval: 128 32 | 33 | train_splits: ["train"] 34 | valid_splits: ["val"] 35 | test_splits: ["test"] 36 | 37 | # distribution 38 | device: "cuda" 39 | world_size: 1 40 | dist_url: "env://" 41 | distributed: True 42 | use_dist_eval_sampler: False 43 | 44 | # model specific 45 | k_test: 256 46 | 47 | # misc 48 | seed: 42 49 | output_dir: "output/BLIP/Retrieval_COCO" 50 | 51 | evaluate: True 52 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/ret_flickr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | model_type: flickr 9 | 10 | datasets: 11 | flickr30k: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: retrieval 22 | 23 | # dataloading 24 | num_workers: 4 25 | batch_size_train: 32 26 | batch_size_eval: 64 27 | 28 | test_splits: ["test"] 29 | 30 | # distribution 31 | device: "cuda" 32 | world_size: 1 33 | dist_url: "env://" 34 | distributed: True 35 | use_dist_eval_sampler: False 36 | 37 | # model specific 38 | k_test: 128 39 | 40 | # misc 41 | seed: 42 42 | output_dir: "output/Retrieval_Flickr30k" 43 | 44 | evaluate: True 45 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/vqav2_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | model_type: vqav2 9 | image_size: 480 10 | 11 | datasets: 12 | coco_vqa: # name of the dataset builder 13 | type: eval 14 | vis_processor: 15 | eval: 16 | name: "blip_image_eval" 17 | image_size: 480 18 | text_processor: 19 | eval: 20 | name: "blip_question" 21 | 22 | run: 23 | task: vqa 24 | # optimization-specific 25 | batch_size_train: 16 26 | batch_size_eval: 64 27 | num_workers: 4 28 | 29 | # inference-specific 30 | max_len: 10 31 | min_len: 1 32 | num_beams: 3 33 | num_ans_candidates: 128 34 | inference_method: "rank" 35 | 36 | seed: 42 37 | output_dir: "output/BLIP/VQA" 38 | 39 | evaluate: True 40 | test_splits: ["val"] 41 | 42 | # distribution-specific 43 | device: "cuda" 44 | world_size: 1 45 | dist_url: "env://" 46 | distributed: True 47 | -------------------------------------------------------------------------------- /lavis/projects/blip/train/caption_coco_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | 9 | model_type: base_coco 10 | load_finetuned: False 11 | 12 | datasets: 13 | coco_caption: # name of the dataset builder 14 | vis_processor: 15 | train: 16 | name: "blip_image_train" 17 | eval: 18 | name: "blip_image_eval" 19 | text_processor: 20 | train: 21 | name: "blip_caption" 22 | prompt: "a picture of " 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | # task: retrieval 28 | task: captioning 29 | # optimizer 30 | lr_sched: "linear_warmup_cosine_lr" 31 | init_lr: 1e-5 32 | min_lr: 0 33 | weight_decay: 0.05 34 | max_epoch: 5 35 | batch_size_train: 32 36 | batch_size_eval: 64 37 | num_workers: 4 38 | 39 | max_len: 20 40 | min_len: 5 41 | num_beams: 3 42 | 43 | seed: 42 44 | output_dir: "output/BLIP/Caption_coco" 45 | 46 | amp: False 47 | resume_ckpt_path: null 48 | 49 | evaluate: False 50 | train_splits: ["train"] 51 | valid_splits: ["val"] 52 | test_splits: ["test"] 53 | 54 | device: "cuda" 55 | world_size: 1 56 | dist_url: "env://" 57 | distributed: True 58 | -------------------------------------------------------------------------------- /lavis/projects/blip/train/caption_coco_large_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | 9 | model_type: large_coco 10 | load_finetuned: False 11 | 12 | datasets: 13 | coco_caption: # name of the dataset builder 14 | vis_processor: 15 | train: 16 | name: "blip_image_train" 17 | eval: 18 | name: "blip_image_eval" 19 | text_processor: 20 | train: 21 | name: "blip_caption" 22 | prompt: "a picture of " 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | task: captioning 28 | # optimizer 29 | lr_sched: "linear_warmup_cosine_lr" 30 | init_lr: 2e-6 31 | min_lr: 0 32 | weight_decay: 0.05 33 | max_epoch: 5 34 | batch_size_train: 16 35 | batch_size_eval: 64 36 | num_workers: 4 37 | 38 | max_len: 20 39 | min_len: 5 40 | num_beams: 3 41 | 42 | seed: 42 43 | output_dir: "output/BLIP/Caption_coco" 44 | 45 | amp: False 46 | resume_ckpt_path: null 47 | 48 | evaluate: False 49 | train_splits: ["train"] 50 | valid_splits: ["val"] 51 | test_splits: ["test"] 52 | 53 | device: "cuda" 54 | world_size: 1 55 | dist_url: "env://" 56 | distributed: True 57 | -------------------------------------------------------------------------------- /lavis/projects/blip/train/nlvr_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_nlvr 8 | 9 | model_type: nlvr 10 | load_finetuned: False 11 | 12 | datasets: 13 | nlvr: # name of the dataset builder 14 | vis_processor: 15 | train: 16 | name: "blip_image_train" 17 | image_size: 384 18 | eval: 19 | name: "blip_image_eval" 20 | image_size: 384 21 | text_processor: 22 | train: 23 | name: "blip_caption" 24 | eval: 25 | name: "blip_caption" 26 | 27 | run: 28 | task: multimodal_classification 29 | 30 | lr_sched: "linear_warmup_cosine_lr" 31 | init_lr: 2.5e-5 32 | min_lr: 0 33 | weight_decay: 0.05 34 | max_epoch: 15 35 | 36 | batch_size_train: 16 37 | batch_size_eval: 64 38 | num_workers: 4 39 | 40 | seed: 42 41 | output_dir: "output/BLIP/NLVR" 42 | 43 | amp: False 44 | resume_ckpt_path: null 45 | 46 | evaluate: False 47 | train_splits: ["train"] 48 | valid_splits: ["val", "test"] 49 | test_splits: ["test"] 50 | 51 | # distribution-specific 52 | device: "cuda" 53 | world_size: 1 54 | dist_url: "env://" 55 | distributed: True 56 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/caption_coco_flant5xl_eval.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/lavis/projects/blip2/eval/caption_coco_flant5xl_eval.yaml -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/caption_coco_opt2.7b_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | # Bleu_1: 0.832 7 | # Bleu_2: 0.691 8 | # Bleu_3: 0.556 9 | # Bleu_4: 0.438 10 | # METEOR: 0.317 11 | # ROUGE_L: 0.620 12 | # CIDEr: 1.461 13 | # SPICE: 0.252 14 | 15 | model: 16 | arch: blip2_opt 17 | model_type: caption_coco_opt2.7b 18 | use_grad_checkpoint: False 19 | 20 | datasets: 21 | coco_caption: # name of the dataset builder 22 | vis_processor: 23 | eval: 24 | name: "blip_image_eval" 25 | image_size: 364 26 | text_processor: 27 | eval: 28 | name: "blip_caption" 29 | # build_info: 30 | # images: 31 | # storage: '/export/share/datasets/vision/coco/images/' 32 | 33 | run: 34 | task: captioning 35 | # optimizer 36 | batch_size_train: 32 37 | batch_size_eval: 16 38 | num_workers: 4 39 | 40 | max_len: 30 41 | min_len: 8 42 | num_beams: 5 43 | 44 | seed: 42 45 | output_dir: "output/BLIP2/Caption_coco_opt2.7b" 46 | 47 | evaluate: True 48 | test_splits: ["test"] 49 | 50 | device: "cuda" 51 | world_size: 1 52 | dist_url: "env://" 53 | distributed: True 54 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/caption_coco_opt6.7b_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | # Bleu_1: 0.831 7 | # Bleu_2: 0.689 8 | # Bleu_3: 0.552 9 | # Bleu_4: 0.434 10 | # METEOR: 0.316 11 | # ROUGE_L: 0.618 12 | # CIDEr: 1.451 13 | # SPICE: 0.251 14 | 15 | model: 16 | arch: blip2_opt 17 | model_type: caption_coco_opt6.7b 18 | use_grad_checkpoint: False 19 | 20 | datasets: 21 | coco_caption: # name of the dataset builder 22 | vis_processor: 23 | eval: 24 | name: "blip_image_eval" 25 | image_size: 364 26 | text_processor: 27 | eval: 28 | name: "blip_caption" 29 | # build_info: 30 | # images: 31 | # storage: '/export/share/datasets/vision/coco/images/' 32 | 33 | run: 34 | task: captioning 35 | # optimizer 36 | batch_size_train: 32 37 | batch_size_eval: 16 38 | num_workers: 4 39 | 40 | max_len: 30 41 | min_len: 8 42 | num_beams: 5 43 | 44 | seed: 42 45 | output_dir: "output/BLIP2/Caption_coco_opt6.7b" 46 | 47 | evaluate: True 48 | test_splits: ["test"] 49 | 50 | device: "cuda" 51 | world_size: 1 52 | dist_url: "env://" 53 | distributed: True 54 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/gqa_zeroshot_flant5xl_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | # Overall Accuracy is: 43.98 7 | model: 8 | arch: blip2_t5 9 | model_type: pretrain_flant5xl 10 | use_grad_checkpoint: False 11 | 12 | datasets: 13 | gqa: # name of the dataset builder 14 | type: balanced_testdev 15 | vis_processor: 16 | eval: 17 | name: "blip_image_eval" 18 | image_size: 224 19 | text_processor: 20 | eval: 21 | name: "blip_question" 22 | build_info: 23 | images: 24 | storage: "/export/share/datasets/vision/GQA/images/" 25 | 26 | run: 27 | task: gqa 28 | # optimization-specific 29 | batch_size_train: 16 30 | batch_size_eval: 64 31 | num_workers: 4 32 | 33 | # inference-specific 34 | max_len: 10 35 | min_len: 1 36 | num_beams: 5 37 | inference_method: "generate" 38 | prompt: "Question: {} Short answer:" 39 | 40 | seed: 42 41 | output_dir: "output/BLIP2/GQA" 42 | 43 | evaluate: True 44 | test_splits: ["val"] 45 | 46 | # distribution-specific 47 | device: "cuda" 48 | world_size: 1 49 | dist_url: "env://" 50 | distributed: True 51 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/okvqa_zeroshot_flant5xl_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | # Overall Accuracy is: 41.22 7 | 8 | model: 9 | arch: blip2_t5 10 | model_type: pretrain_flant5xl 11 | use_grad_checkpoint: False 12 | 13 | # for OKVQA evaluation 14 | apply_lemmatizer: True 15 | 16 | datasets: 17 | ok_vqa: # name of the dataset builder 18 | vis_processor: 19 | eval: 20 | name: "blip_image_eval" 21 | image_size: 224 22 | text_processor: 23 | eval: 24 | name: "blip_question" 25 | # build_info: 26 | # images: 27 | # storage: '/export/share/datasets/vision/coco/images/' 28 | 29 | run: 30 | task: vqa 31 | # optimization-specific 32 | batch_size_train: 16 33 | batch_size_eval: 64 34 | num_workers: 4 35 | 36 | # inference-specific 37 | max_len: 10 38 | min_len: 1 39 | num_beams: 5 40 | inference_method: "generate" 41 | prompt: "Question: {} Short answer:" 42 | 43 | seed: 42 44 | output_dir: "output/BLIP2/OKVQA" 45 | 46 | evaluate: True 47 | test_splits: ["test"] 48 | 49 | # distribution-specific 50 | device: "cuda" 51 | world_size: 1 52 | dist_url: "env://" 53 | distributed: True 54 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/ret_coco_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip2 8 | model_type: coco 9 | use_grad_checkpoint: False 10 | 11 | datasets: 12 | coco_retrieval: # name of the dataset builder 13 | vis_processor: 14 | train: 15 | name: "blip_image_train" 16 | image_size: 364 17 | eval: 18 | name: "blip_image_eval" 19 | image_size: 364 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | eval: 24 | name: "blip_caption" 25 | # build_info: 26 | # images: 27 | # storage: '/export/share/datasets/vision/coco/images/' 28 | run: 29 | task: retrieval 30 | 31 | # dataloading 32 | num_workers: 4 33 | batch_size_train: 16 34 | batch_size_eval: 32 35 | 36 | train_splits: ["train"] 37 | valid_splits: ["val"] 38 | test_splits: ["test"] 39 | 40 | # distribution 41 | device: "cuda" 42 | world_size: 1 43 | dist_url: "env://" 44 | distributed: True 45 | use_dist_eval_sampler: False 46 | 47 | # model specific 48 | k_test: 128 49 | 50 | # misc 51 | seed: 42 52 | output_dir: "output/BLIP2/Retrieval_COCO" 53 | 54 | evaluate: True 55 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/ret_flickr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip2 8 | model_type: coco 9 | use_grad_checkpoint: False 10 | 11 | datasets: 12 | flickr30k: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 364 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | 21 | run: 22 | task: retrieval 23 | 24 | # dataloading 25 | num_workers: 4 26 | batch_size_train: 16 27 | batch_size_eval: 32 28 | 29 | test_splits: ["test"] 30 | 31 | # distribution 32 | device: "cuda" 33 | world_size: 1 34 | dist_url: "env://" 35 | distributed: True 36 | use_dist_eval_sampler: False 37 | 38 | # model specific 39 | k_test: 128 40 | 41 | # misc 42 | seed: 42 43 | output_dir: "output/BLIP2/Retrieval_Flickr30k" 44 | 45 | evaluate: True -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/vqav2_zeroshot_flant5xl_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | # Overall Accuracy is: 63.13 7 | # Per Answer Type Accuracy is the following: 8 | # other : 52.90 9 | # yes/no : 84.28 10 | # number : 41.01 11 | 12 | model: 13 | arch: blip2_t5 14 | model_type: pretrain_flant5xl 15 | use_grad_checkpoint: False 16 | 17 | datasets: 18 | coco_vqa: # name of the dataset builder 19 | type: eval 20 | vis_processor: 21 | eval: 22 | name: "blip_image_eval" 23 | image_size: 224 24 | text_processor: 25 | eval: 26 | name: "blip_question" 27 | # build_info: 28 | # images: 29 | # storage: '/export/share/datasets/vision/coco/images/' 30 | 31 | run: 32 | task: vqa 33 | # optimization-specific 34 | batch_size_train: 16 35 | batch_size_eval: 64 36 | num_workers: 4 37 | 38 | # inference-specific 39 | max_len: 10 40 | min_len: 1 41 | num_beams: 5 42 | inference_method: "generate" 43 | prompt: "Question: {} Short answer:" 44 | 45 | seed: 42 46 | output_dir: "output/BLIP2/VQA" 47 | 48 | evaluate: True 49 | test_splits: ["val"] 50 | 51 | # distribution-specific 52 | device: "cuda" 53 | world_size: 1 54 | dist_url: "env://" 55 | distributed: True 56 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/vqav2_zeroshot_opt_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | # Overall Accuracy is: 51.88 (result different from BLIP-2 paper due to different implementation and transformers version) 7 | 8 | model: 9 | arch: blip2_opt 10 | model_type: pretrain_opt2.7b 11 | use_grad_checkpoint: False 12 | 13 | datasets: 14 | coco_vqa: # name of the dataset builder 15 | type: eval 16 | vis_processor: 17 | eval: 18 | name: "blip_image_eval" 19 | image_size: 224 20 | text_processor: 21 | eval: 22 | name: "blip_question" 23 | # build_info: 24 | # images: 25 | # storage: '/export/share/datasets/vision/coco/images/' 26 | 27 | run: 28 | task: vqa 29 | # optimization-specific 30 | batch_size_train: 16 31 | batch_size_eval: 64 32 | num_workers: 4 33 | 34 | # inference-specific 35 | max_len: 10 36 | min_len: 1 37 | num_beams: 5 38 | inference_method: "generate" 39 | prompt: "Question: {} Short answer:" 40 | 41 | seed: 42 42 | output_dir: "output/BLIP2/VQA" 43 | 44 | evaluate: True 45 | test_splits: ["val"] 46 | 47 | # distribution-specific 48 | device: "cuda" 49 | world_size: 1 50 | dist_url: "env://" 51 | distributed: True 52 | -------------------------------------------------------------------------------- /lavis/projects/clip/exp_coco_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | 11 | datasets: 12 | coco_retrieval: # name of the dataset builder 13 | vis_processor: 14 | train: 15 | name: "clip_image_train" 16 | image_size: 336 17 | eval: 18 | name: "clip_image_eval" 19 | image_size: 336 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | task: retrieval 28 | 29 | # dataloading 30 | num_workers: 4 31 | batch_size_train: 32 32 | batch_size_eval: 128 33 | 34 | test_splits: ["test"] 35 | 36 | # distribution 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | use_dist_eval_sampler: True 42 | 43 | # misc 44 | seed: 42 45 | output_dir: "output/clip/Retrieval_COCO" 46 | 47 | evaluate: True 48 | -------------------------------------------------------------------------------- /lavis/projects/clip/exp_flickr_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | 11 | datasets: 12 | flickr30k: # name of the dataset builder 13 | vis_processor: 14 | train: 15 | name: "clip_image_train" 16 | image_size: 336 17 | eval: 18 | name: "clip_image_eval" 19 | image_size: 336 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | task: retrieval 28 | 29 | # dataloading 30 | num_workers: 4 31 | batch_size_train: 32 32 | batch_size_eval: 128 33 | 34 | test_splits: ["test"] 35 | 36 | # distribution 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | use_dist_eval_sampler: True 42 | 43 | # misc 44 | seed: 42 45 | output_dir: "output/clip/Retrieval_Flickr" 46 | 47 | evaluate: True 48 | -------------------------------------------------------------------------------- /lavis/projects/clip/exp_imnet_zs_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | 11 | datasets: 12 | imagenet: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "clip_image_eval" 16 | # image_size: 224 17 | image_size: 336 18 | 19 | run: 20 | task: multimodal_classification 21 | 22 | # dataloading 23 | num_workers: 4 24 | batch_size_train: 32 25 | batch_size_eval: 128 26 | 27 | test_splits: ["val"] 28 | 29 | # distribution 30 | device: "cuda" 31 | world_size: 1 32 | dist_url: "env://" 33 | distributed: True 34 | 35 | # misc 36 | seed: 42 37 | output_dir: "output/clip/zs_imnet" 38 | 39 | evaluate: True 40 | -------------------------------------------------------------------------------- /lavis/projects/gpt/eval/dialogue_avsd_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: gpt_dialogue 8 | model_type: base 9 | 10 | datasets: 11 | avsd_dialogue: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "gpt_video_ft" 15 | visual_ft: ["i3d_flow", "i3d_rgb"] 16 | audio_ft: ["vggish"] 17 | text_processor: 18 | eval: 19 | name: "gpt_dialogue" 20 | max_turns: 3 21 | use_caption: True 22 | 23 | run: 24 | task: dialogue 25 | # optimizer 26 | batch_size_train: 16 27 | batch_size_eval: 16 28 | num_workers: 0 29 | 30 | max_len: 20 31 | min_len: 5 32 | num_beams: 5 33 | 34 | seed: 42 35 | output_dir: "output/gpt2/dialogue_avsd" 36 | 37 | evaluate: True 38 | valid_splits: ["test"] 39 | 40 | device: "cuda" 41 | world_size: 1 42 | dist_url: "env://" 43 | distributed: True 44 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/gqa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: base 9 | 10 | datasets: 11 | gqa: # name of the dataset builder 12 | type: balanced_testdev 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 384 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: gqa_reading_comprehension 23 | 24 | # optimization-specific 25 | batch_size_train: 16 26 | batch_size_eval: 16 27 | num_workers: 4 28 | 29 | # image question matching specific 30 | block_num: 7 31 | 32 | # image captioning specific 33 | top_k: 50 34 | top_p: 1 35 | cap_min_length: 10 36 | cap_max_length: 20 37 | repetition_penalty: 1 38 | num_patches: 20 39 | num_captions: 100 40 | prompt: 'a picture of ' 41 | 42 | # question answering specific 43 | internal_bsz_fid: 1 44 | num_captions_fid: 5 45 | min_len: 0 46 | max_len: 20 47 | num_beams: 1 48 | inference_method: "generate" 49 | 50 | seed: 42 51 | output_dir: "output/PNP-VQA/GQA" 52 | 53 | evaluate: True 54 | test_splits: ["val"] 55 | 56 | # distribution-specific 57 | device: "cuda" 58 | world_size: 1 59 | dist_url: "env://" 60 | distributed: True 61 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/okvqa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: base 9 | 10 | datasets: 11 | ok_vqa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_question" 19 | 20 | run: 21 | task: vqa_reading_comprehension 22 | 23 | # optimization-specific 24 | batch_size_train: 16 25 | batch_size_eval: 16 26 | num_workers: 4 27 | 28 | # image question matching specific 29 | block_num: 7 30 | 31 | # image captioning specific 32 | top_k: 50 33 | top_p: 1 34 | cap_min_length: 10 35 | cap_max_length: 20 36 | repetition_penalty: 1 37 | num_patches: 20 38 | num_captions: 100 39 | prompt: 'a picture of ' 40 | 41 | # question answering specific 42 | internal_bsz_fid: 1 43 | num_captions_fid: 1 44 | min_len: 0 45 | max_len: 20 46 | num_beams: 1 47 | inference_method: "generate" 48 | 49 | seed: 42 50 | output_dir: "output/PNP-VQA/OKVQA" 51 | 52 | evaluate: True 53 | test_splits: ["test"] 54 | 55 | # distribution-specific 56 | device: "cuda" 57 | world_size: 1 58 | dist_url: "env://" 59 | distributed: True 60 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/okvqa_eval_3b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: 3b 9 | 10 | datasets: 11 | ok_vqa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_question" 19 | 20 | run: 21 | task: vqa_reading_comprehension 22 | 23 | # optimization-specific 24 | batch_size_train: 4 25 | batch_size_eval: 4 26 | num_workers: 4 27 | 28 | # image question matching specific 29 | block_num: 7 30 | 31 | # image captioning specific 32 | top_k: 50 33 | top_p: 1 34 | cap_min_length: 10 35 | cap_max_length: 20 36 | repetition_penalty: 1 37 | num_patches: 20 38 | num_captions: 100 39 | prompt: 'a picture of ' 40 | 41 | # question answering specific 42 | internal_bsz_fid: 1 43 | num_captions_fid: 1 44 | min_len: 0 45 | max_len: 20 46 | num_beams: 1 47 | inference_method: "generate" 48 | 49 | seed: 42 50 | output_dir: "output/PNP-VQA-3b/OKVQA" 51 | 52 | evaluate: True 53 | test_splits: ["test"] 54 | 55 | # distribution-specific 56 | device: "cuda" 57 | world_size: 1 58 | dist_url: "env://" 59 | distributed: True 60 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/okvqa_eval_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: large 9 | 10 | datasets: 11 | ok_vqa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_question" 19 | 20 | run: 21 | task: vqa_reading_comprehension 22 | 23 | # optimization-specific 24 | batch_size_train: 12 25 | batch_size_eval: 12 26 | num_workers: 4 27 | 28 | # image question matching specific 29 | block_num: 7 30 | 31 | # image captioning specific 32 | top_k: 50 33 | top_p: 1 34 | cap_min_length: 10 35 | cap_max_length: 20 36 | repetition_penalty: 1 37 | num_patches: 20 38 | num_captions: 100 39 | prompt: 'a picture of ' 40 | 41 | # question answering specific 42 | internal_bsz_fid: 1 43 | num_captions_fid: 1 44 | min_len: 0 45 | max_len: 20 46 | num_beams: 1 47 | inference_method: "generate" 48 | 49 | seed: 42 50 | output_dir: "output/PNP-VQA-large/OKVQA" 51 | 52 | evaluate: True 53 | test_splits: ["test"] 54 | 55 | # distribution-specific 56 | device: "cuda" 57 | world_size: 1 58 | dist_url: "env://" 59 | distributed: True 60 | -------------------------------------------------------------------------------- /lavis/runners/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.runners.runner_base import RunnerBase 9 | from lavis.runners.runner_iter import RunnerIter 10 | 11 | __all__ = ["RunnerBase", "RunnerIter"] 12 | -------------------------------------------------------------------------------- /lavis/tasks/image_text_pretrain.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.tasks.base_task import BaseTask 10 | 11 | 12 | @registry.register_task("image_text_pretrain") 13 | class ImageTextPretrainTask(BaseTask): 14 | def __init__(self): 15 | super().__init__() 16 | 17 | def evaluation(self, model, data_loader, cuda_enabled=True): 18 | pass 19 | -------------------------------------------------------------------------------- /llava_llama_2/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /llava_llama_2/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 
5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "<image>" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 11 | DEFAULT_IM_START_TOKEN = "<im_start>" 12 | DEFAULT_IM_END_TOKEN = "<im_end>" 13 | -------------------------------------------------------------------------------- /llava_llama_2/eval/table/model.jsonl: -------------------------------------------------------------------------------- 1 | {"model_id": "vicuna-13b:20230322-clean-lang", "model_name": "vicuna-13b", "model_version": "20230322-clean-lang", "model_metadata": "vicuna-13b-20230322-clean-lang"} 2 | {"model_id": "alpaca-13b:v1", "model_name": "alpaca-13b", "model_version": "v1", "model_metadata": "alpaca-13b"} 3 | {"model_id": "llama-13b:v1", "model_name": "llama-13b", "model_version": "v1", "model_metadata": "hf-llama-13b"} 4 | {"model_id": "bard:20230327", "model_name": "bard", "model_version": "20230327", "model_metadata": "Google Bard 20230327"} 5 | {"model_id": "gpt-3.5-turbo:20230327", "model_name": "gpt-3.5-turbo", "model_version": "20230327", "model_metadata": "OpenAI ChatGPT gpt-3.5-turbo Chat Completion"} 6 | -------------------------------------------------------------------------------- /llava_llama_2/eval/table/reviewer.jsonl: -------------------------------------------------------------------------------- 1 | {"reviewer_id": "gpt-4-0328-default", "prompt_id": 1, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for general questions"} 2 | {"reviewer_id": "gpt-4-0328-coding", "prompt_id": 2, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for coding questions"} 3 | {"reviewer_id": "gpt-4-0328-math", "prompt_id": 3, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 4 | {"reviewer_id": "gpt-4-0417-visual", "prompt_id": 4, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 5 | -------------------------------------------------------------------------------- /llava_llama_2/eval/webpage/figures/alpaca.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/llava_llama_2/eval/webpage/figures/alpaca.png -------------------------------------------------------------------------------- /llava_llama_2/eval/webpage/figures/bard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/llava_llama_2/eval/webpage/figures/bard.jpg -------------------------------------------------------------------------------- /llava_llama_2/eval/webpage/figures/llama.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/llava_llama_2/eval/webpage/figures/llama.jpg -------------------------------------------------------------------------------- /llava_llama_2/eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llava_llama_2/eval/webpage/figures/vicuna.jpeg:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/llava_llama_2/eval/webpage/figures/vicuna.jpeg -------------------------------------------------------------------------------- /llava_llama_2/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 2 | from .language_model.llava_mpt import LlavaMPTForCausalLM, LlavaMPTConfig 3 | -------------------------------------------------------------------------------- /llava_llama_2/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava_llama_2.model.consolidate --src ~/model_weights/llava_llama_2-7b --dst ~/model_weights/llava_llama_2-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava_llama_2.model import * 10 | from llava_llama_2.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /llava_llama_2/model/language_model/mpt/custom_embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch import Tensor 5 | 6 | class SharedEmbedding(nn.Embedding): 7 | 8 | def forward(self, input: Tensor, unembed: bool=False) -> Tensor: 9 | if unembed: 10 | return F.linear(input, self.weight) 11 | return super().forward(input) -------------------------------------------------------------------------------- /llava_llama_2/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | from .clip_encoder import CLIPVisionTower 2 | 3 | 4 | def build_vision_tower(vision_tower_cfg, **kwargs): 5 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 6 | if vision_tower.startswith("openai") or vision_tower.startswith("laion"): 7 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 8 | 9 | raise ValueError(f'Unknown vision tower: {vision_tower}') 10 | -------------------------------------------------------------------------------- /llava_llama_2/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava_llama_2' in config and 'llava_llama_2' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while 
the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava_llama_2") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /llava_llama_2/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/llava_llama_2/serve/__init__.py -------------------------------------------------------------------------------- /llava_llama_2/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/llava_llama_2/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /llava_llama_2/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/llava_llama_2/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /llava_llama_2/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /llava_llama_2/train/train_mem.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: 2 | # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: 3 | # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn. 4 | 5 | # Need to call this before importing transformers. 
6 | from llava_llama_2.train.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn 7 | 8 | replace_llama_attn_with_flash_attn() 9 | 10 | from llava_llama_2.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /metric/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/metric/__init__.py -------------------------------------------------------------------------------- /metric/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/metric/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /metric/__pycache__/perspective_api.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/metric/__pycache__/perspective_api.cpython-38.pyc -------------------------------------------------------------------------------- /metric/detoxify.py: -------------------------------------------------------------------------------- 1 | from detoxify import Detoxify 2 | from typing import Dict, Optional, List 3 | 4 | class DetoxifyScorer: 5 | def __init__(self): 6 | self.model = Detoxify('original', device='cuda') 7 | def get_scores(self, input_text: str) -> Dict[str, float]: 8 | return self.model.predict(input_text) 9 | 10 | -------------------------------------------------------------------------------- /minigpt4/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/.DS_Store -------------------------------------------------------------------------------- /minigpt4/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from minigpt4.common.registry import registry 14 | 15 | from minigpt4.datasets.builders import * 16 | from minigpt4.models import * 17 | from minigpt4.processors import * 18 | from minigpt4.tasks import * 19 | 20 | 21 | root_dir = os.path.dirname(os.path.abspath(__file__)) 22 | default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml")) 23 | 24 | registry.register_path("library_root", root_dir) 25 | repo_root = os.path.join(root_dir, "..") 26 | registry.register_path("repo_root", repo_root) 27 | cache_root = os.path.join(repo_root, default_cfg.env.cache_root) 28 | registry.register_path("cache_root", cache_root) 29 | 30 | registry.register("MAX_INT", sys.maxsize) 31 | registry.register("SPLIT_NAMES", ["train", "val", "test"]) 32 | -------------------------------------------------------------------------------- /minigpt4/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/common/__init__.py -------------------------------------------------------------------------------- /minigpt4/common/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/common/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/common/__pycache__/config.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/common/__pycache__/config.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/common/__pycache__/dist_utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/common/__pycache__/dist_utils.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/common/__pycache__/logger.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/common/__pycache__/logger.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/common/__pycache__/registry.cpython-39.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/common/__pycache__/registry.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/common/__pycache__/utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/common/__pycache__/utils.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/common/gradcam.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | from scipy.ndimage import filters 4 | from skimage import transform as skimage_transform 5 | 6 | 7 | def getAttMap(img, attMap, blur=True, overlap=True): 8 | attMap -= attMap.min() 9 | if attMap.max() > 0: 10 | attMap /= attMap.max() 11 | attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant") 12 | if blur: 13 | attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2])) 14 | attMap -= attMap.min() 15 | attMap /= attMap.max() 16 | cmap = plt.get_cmap("jet") 17 | attMapV = cmap(attMap) 18 | attMapV = np.delete(attMapV, 3, 2) 19 | if overlap: 20 | attMap = ( 21 | 1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img 22 | + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV 23 | ) 24 | return attMap 25 | -------------------------------------------------------------------------------- /minigpt4/configs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/configs/.DS_Store -------------------------------------------------------------------------------- /minigpt4/configs/datasets/cc_sbu/align.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | cc_sbu_align: 3 | data_type: images 4 | build_info: 5 | storage: /path/to/cc_sbu_align/ 6 | -------------------------------------------------------------------------------- /minigpt4/configs/datasets/cc_sbu/defaults.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | cc_sbu: 3 | data_type: images 4 | build_info: 5 | storage: /path/to/cc_sbu_dataset/{00000..01255}.tar 6 | -------------------------------------------------------------------------------- /minigpt4/configs/datasets/laion/defaults.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | laion: 3 | data_type: images 4 | build_info: 5 | storage: /path/to/laion_dataset/{00000..10488}.tar 6 | -------------------------------------------------------------------------------- /minigpt4/configs/default.yaml: -------------------------------------------------------------------------------- 1 | env: 2 | # For default users 3 | # cache_root: "cache" 4 | # For internal use with persistent storage 5 | cache_root: "/export/home/.cache/minigpt4" 6 | -------------------------------------------------------------------------------- /minigpt4/configs/models/llama.yaml: 
-------------------------------------------------------------------------------- 1 | model: 2 | arch: mini_gpt4 3 | 4 | # vit encoder 5 | image_size: 224 6 | drop_path_rate: 0 7 | use_grad_checkpoint: False 8 | vit_precision: "fp16" 9 | freeze_vit: True 10 | freeze_qformer: True 11 | 12 | # Q-Former 13 | num_query_token: 32 14 | 15 | # Vicuna 16 | llama_model: "./ckpts/llama-13b" 17 | 18 | # generation configs 19 | prompt: "" 20 | 21 | preprocess: 22 | vis_processor: 23 | train: 24 | name: "blip2_image_train" 25 | image_size: 224 26 | eval: 27 | name: "blip2_image_eval" 28 | image_size: 224 29 | text_processor: 30 | train: 31 | name: "blip_caption" 32 | eval: 33 | name: "blip_caption" 34 | -------------------------------------------------------------------------------- /minigpt4/configs/models/minigpt4.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: mini_gpt4 3 | 4 | # vit encoder 5 | image_size: 224 6 | drop_path_rate: 0 7 | use_grad_checkpoint: False 8 | vit_precision: "fp16" 9 | freeze_vit: True 10 | freeze_qformer: True 11 | 12 | # Q-Former 13 | num_query_token: 32 14 | 15 | # Vicuna 16 | llama_model: "./ckpts/vicuna-13b-weights" 17 | 18 | # generation configs 19 | prompt: "" 20 | 21 | preprocess: 22 | vis_processor: 23 | train: 24 | name: "blip2_image_train" 25 | image_size: 224 26 | eval: 27 | name: "blip2_image_eval" 28 | image_size: 224 29 | text_processor: 30 | train: 31 | name: "blip_caption" 32 | eval: 33 | name: "blip_caption" 34 | -------------------------------------------------------------------------------- /minigpt4/conversation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/conversation/__init__.py -------------------------------------------------------------------------------- /minigpt4/conversation/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/conversation/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/conversation/__pycache__/conversation.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/conversation/__pycache__/conversation.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/__init__.py -------------------------------------------------------------------------------- /minigpt4/datasets/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/__pycache__/__init__.cpython-39.pyc 
-------------------------------------------------------------------------------- /minigpt4/datasets/__pycache__/data_utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/__pycache__/data_utils.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/datasets/builders/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/builders/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/datasets/builders/__pycache__/base_dataset_builder.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/builders/__pycache__/base_dataset_builder.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/datasets/builders/__pycache__/custom_builder.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/builders/__pycache__/custom_builder.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/datasets/builders/__pycache__/image_text_pair_builder.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/builders/__pycache__/image_text_pair_builder.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/datasets/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/datasets/__init__.py -------------------------------------------------------------------------------- /minigpt4/datasets/datasets/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/datasets/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/datasets/datasets/__pycache__/base_dataset.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/datasets/__pycache__/base_dataset.cpython-39.pyc 
-------------------------------------------------------------------------------- /minigpt4/datasets/datasets/__pycache__/caption_datasets.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/datasets/__pycache__/caption_datasets.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/datasets/datasets/__pycache__/cc_sbu_dataset.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/datasets/__pycache__/cc_sbu_dataset.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/datasets/datasets/__pycache__/dataloader_utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/datasets/__pycache__/dataloader_utils.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/datasets/datasets/__pycache__/laion_dataset.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/datasets/__pycache__/laion_dataset.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/datasets/datasets/laion_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import webdataset as wds 9 | from minigpt4.datasets.datasets.base_dataset import BaseDataset 10 | 11 | 12 | class LaionDataset(BaseDataset): 13 | def __init__(self, vis_processor, text_processor, location): 14 | super().__init__(vis_processor=vis_processor, text_processor=text_processor) 15 | 16 | self.inner_dataset = wds.DataPipeline( 17 | wds.ResampledShards(location), 18 | wds.tarfile_to_samples(handler=wds.warn_and_continue), 19 | wds.shuffle(1000, handler=wds.warn_and_continue), 20 | wds.decode("pilrgb", handler=wds.warn_and_continue), 21 | wds.to_tuple("jpg", "json", handler=wds.warn_and_continue), 22 | wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue), 23 | wds.map(self.to_dict, handler=wds.warn_and_continue), 24 | ) 25 | 26 | def to_dict(self, sample): 27 | return { 28 | "image": sample[0], 29 | "text_input": self.text_processor(sample[1]["caption"]), 30 | } 31 | 32 | -------------------------------------------------------------------------------- /minigpt4/models/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/models/.DS_Store -------------------------------------------------------------------------------- /minigpt4/models/__pycache__/Qformer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/models/__pycache__/Qformer.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/models/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/models/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/models/__pycache__/base_model.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/models/__pycache__/base_model.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/models/__pycache__/blip2.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/models/__pycache__/blip2.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/models/__pycache__/eva_vit.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/models/__pycache__/eva_vit.cpython-39.pyc 
-------------------------------------------------------------------------------- /minigpt4/models/__pycache__/mini_gpt4.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/models/__pycache__/mini_gpt4.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/models/__pycache__/modeling_llama.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/models/__pycache__/modeling_llama.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/processors/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from minigpt4.processors.base_processor import BaseProcessor 9 | from minigpt4.processors.blip_processors import ( 10 | Blip2ImageTrainProcessor, 11 | Blip2ImageEvalProcessor, 12 | BlipCaptionProcessor, 13 | ) 14 | 15 | from minigpt4.common.registry import registry 16 | 17 | __all__ = [ 18 | "BaseProcessor", 19 | "Blip2ImageTrainProcessor", 20 | "Blip2ImageEvalProcessor", 21 | "BlipCaptionProcessor", 22 | ] 23 | 24 | 25 | def load_processor(name, cfg=None): 26 | """ 27 | Example 28 | 29 | >>> processor = load_processor("alpro_video_train", cfg=None) 30 | """ 31 | processor = registry.get_processor_class(name).from_config(cfg) 32 | 33 | return processor 34 | -------------------------------------------------------------------------------- /minigpt4/processors/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/processors/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/processors/__pycache__/base_processor.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/processors/__pycache__/base_processor.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/processors/__pycache__/blip_processors.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/processors/__pycache__/blip_processors.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/processors/__pycache__/randaugment.cpython-39.pyc: -------------------------------------------------------------------------------- 
--------------------------------------------------------------------------------
/minigpt4/processors/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/processors/__pycache__/__init__.cpython-39.pyc
--------------------------------------------------------------------------------
/minigpt4/processors/__pycache__/base_processor.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/processors/__pycache__/base_processor.cpython-39.pyc
--------------------------------------------------------------------------------
/minigpt4/processors/__pycache__/blip_processors.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/processors/__pycache__/blip_processors.cpython-39.pyc
--------------------------------------------------------------------------------
/minigpt4/processors/__pycache__/randaugment.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/processors/__pycache__/randaugment.cpython-39.pyc
--------------------------------------------------------------------------------
/minigpt4/processors/base_processor.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 | 
8 | from omegaconf import OmegaConf
9 | 
10 | 
11 | class BaseProcessor:
12 |     def __init__(self):
13 |         self.transform = lambda x: x
14 |         return
15 | 
16 |     def __call__(self, item):
17 |         return self.transform(item)
18 | 
19 |     @classmethod
20 |     def from_config(cls, cfg=None):
21 |         return cls()
22 | 
23 |     def build(self, **kwargs):
24 |         cfg = OmegaConf.create(kwargs)
25 | 
26 |         return self.from_config(cfg)
27 | 
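BaseProcessor is deliberately minimal: __call__ simply applies self.transform. A hypothetical subclass (illustrative only, not the repo's BlipCaptionProcessor) shows the intended extension pattern, where a subclass sets self.transform and overrides from_config:

from omegaconf import OmegaConf
from minigpt4.processors.base_processor import BaseProcessor

class ToyCaptionProcessor(BaseProcessor):
    """Illustrative example: lowercase a caption and cap its word count."""

    def __init__(self, max_words=50):
        super().__init__()
        self.max_words = max_words
        self.transform = self._process  # __call__ dispatches to self.transform

    def _process(self, caption):
        words = caption.lower().strip().split()
        return " ".join(words[: self.max_words])

    @classmethod
    def from_config(cls, cfg=None):
        cfg = cfg if cfg is not None else OmegaConf.create()
        return cls(max_words=cfg.get("max_words", 50))

proc = ToyCaptionProcessor(max_words=8)
print(proc("An adversarial image paired with a very long caption ..."))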
--------------------------------------------------------------------------------
/minigpt4/runners/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 | 
8 | from minigpt4.runners.runner_base import RunnerBase
9 | 
10 | __all__ = ["RunnerBase"]
11 | 
--------------------------------------------------------------------------------
/minigpt4/runners/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/runners/__pycache__/__init__.cpython-39.pyc
--------------------------------------------------------------------------------
/minigpt4/runners/__pycache__/runner_base.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/runners/__pycache__/runner_base.cpython-39.pyc
--------------------------------------------------------------------------------
/minigpt4/tasks/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 | 
8 | from minigpt4.common.registry import registry
9 | from minigpt4.tasks.base_task import BaseTask
10 | from minigpt4.tasks.image_text_pretrain import ImageTextPretrainTask
11 | 
12 | 
13 | def setup_task(cfg):
14 |     assert "task" in cfg.run_cfg, "Task name must be provided."
15 | 
16 |     task_name = cfg.run_cfg.task
17 |     task = registry.get_task_class(task_name).setup_task(cfg=cfg)
18 |     assert task is not None, "Task {} not properly registered.".format(task_name)
19 | 
20 |     return task
21 | 
22 | 
23 | __all__ = [
24 |     "BaseTask",
25 |     "ImageTextPretrainTask",
26 | ]
27 | 
--------------------------------------------------------------------------------
/minigpt4/tasks/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/tasks/__pycache__/__init__.cpython-39.pyc
--------------------------------------------------------------------------------
/minigpt4/tasks/__pycache__/base_task.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/tasks/__pycache__/base_task.cpython-39.pyc
--------------------------------------------------------------------------------
/minigpt4/tasks/__pycache__/image_text_pretrain.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/tasks/__pycache__/image_text_pretrain.cpython-39.pyc
--------------------------------------------------------------------------------
/minigpt4/tasks/image_text_pretrain.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 | 
8 | from minigpt4.common.registry import registry
9 | from minigpt4.tasks.base_task import BaseTask
10 | 
11 | 
12 | @registry.register_task("image_text_pretrain")
13 | class ImageTextPretrainTask(BaseTask):
14 |     def __init__(self):
15 |         super().__init__()
16 | 
17 |     def evaluation(self, model, data_loader, cuda_enabled=True):
18 |         pass
19 | 
--------------------------------------------------------------------------------
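To tie the two task files together, a minimal sketch of how setup_task resolves the task registered as "image_text_pretrain" from a run config. The config layout below is an assumption for illustration (real run configs live in the repo's YAML files), and it assumes BaseTask.setup_task builds the task instance from cfg:

from omegaconf import OmegaConf
from minigpt4.tasks import setup_task

# Hypothetical minimal config exposing run_cfg.task, which setup_task reads.
cfg = OmegaConf.create({"run_cfg": {"task": "image_text_pretrain"}})

task = setup_task(cfg)        # registry lookup, then the task class's setup_task(cfg=cfg)
print(type(task).__name__)    # expected: ImageTextPretrainTask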