├── README.md ├── adversarial_images ├── .DS_Store ├── clean.jpeg ├── prompt_constrained_16.bmp ├── prompt_constrained_32.bmp ├── prompt_constrained_64.bmp └── prompt_unconstrained.bmp ├── assets ├── .DS_Store ├── attack.png ├── crime.png ├── demo.png ├── gender.png ├── human_race.png ├── race.png ├── religious-1.png └── religious-2.png ├── blip_utils └── visual_attacker.py ├── cal_metrics.py ├── demo.py ├── environment.yml ├── eval_configs ├── llama.yaml └── minigpt4_eval.yaml ├── get_metric.py ├── harmful_corpus ├── README.md ├── derogatory_corpus.csv ├── manual_harmful_instructions.csv └── red_teaming_prompts.jsonl ├── instructblip_inference.py ├── instructblip_visual_attack.py ├── lavis ├── .DS_Store ├── LICENSE_Lavis.md ├── __init__.py ├── common │ ├── config.py │ ├── dist_utils.py │ ├── gradcam.py │ ├── logger.py │ ├── optims.py │ ├── registry.py │ ├── utils.py │ └── vqa_tools │ │ ├── __init__.py │ │ ├── vqa.py │ │ └── vqa_eval.py ├── configs │ ├── datasets │ │ ├── aokvqa │ │ │ └── defaults.yaml │ │ ├── avsd │ │ │ └── defaults_dial.yaml │ │ ├── coco │ │ │ ├── defaults_cap.yaml │ │ │ ├── defaults_ret.yaml │ │ │ ├── defaults_vqa.yaml │ │ │ └── eval_vqa.yaml │ │ ├── conceptual_caption │ │ │ ├── defaults_12m.yaml │ │ │ └── defaults_3m.yaml │ │ ├── didemo │ │ │ └── defaults_ret.yaml │ │ ├── flickr30k │ │ │ └── defaults.yaml │ │ ├── gqa │ │ │ ├── balanced_testdev.yaml │ │ │ ├── balanced_val.yaml │ │ │ └── defaults.yaml │ │ ├── imagenet │ │ │ └── defaults.yaml │ │ ├── laion │ │ │ └── defaults_2B_multi.yaml │ │ ├── msrvtt │ │ │ ├── defaults_cap.yaml │ │ │ ├── defaults_qa.yaml │ │ │ └── defaults_ret.yaml │ │ ├── msvd │ │ │ ├── defaults_cap.yaml │ │ │ └── defaults_qa.yaml │ │ ├── nlvr │ │ │ └── defaults.yaml │ │ ├── nocaps │ │ │ └── defaults.yaml │ │ ├── okvqa │ │ │ └── defaults.yaml │ │ ├── sbu_caption │ │ │ └── defaults.yaml │ │ ├── snli_ve │ │ │ └── defaults.yaml │ │ ├── vatex │ │ │ └── defaults_cap.yaml │ │ └── vg │ │ │ ├── defaults_caption.yaml │ │ │ └── defaults_vqa.yaml │ ├── default.yaml │ └── models │ │ ├── albef_classification_ve.yaml │ │ ├── albef_feature_extractor.yaml │ │ ├── albef_nlvr.yaml │ │ ├── albef_pretrain_base.yaml │ │ ├── albef_retrieval_coco.yaml │ │ ├── albef_retrieval_flickr.yaml │ │ ├── albef_vqav2.yaml │ │ ├── alpro_qa_msrvtt.yaml │ │ ├── alpro_qa_msvd.yaml │ │ ├── alpro_retrieval_didemo.yaml │ │ ├── alpro_retrieval_msrvtt.yaml │ │ ├── bert_config.json │ │ ├── bert_config_alpro.json │ │ ├── blip2 │ │ ├── blip2_caption_flant5xl.yaml │ │ ├── blip2_caption_opt2.7b.yaml │ │ ├── blip2_caption_opt6.7b.yaml │ │ ├── blip2_coco.yaml │ │ ├── blip2_instruct_flant5xl.yaml │ │ ├── blip2_instruct_flant5xxl.yaml │ │ ├── blip2_instruct_vicuna13b.yaml │ │ ├── blip2_instruct_vicuna7b.yaml │ │ ├── blip2_pretrain.yaml │ │ ├── blip2_pretrain_flant5xl.yaml │ │ ├── blip2_pretrain_flant5xl_vitL.yaml │ │ ├── blip2_pretrain_flant5xxl.yaml │ │ ├── blip2_pretrain_llama7b.yaml │ │ ├── blip2_pretrain_opt2.7b.yaml │ │ ├── blip2_pretrain_opt6.7b.yaml │ │ └── blip2_pretrain_vitL.yaml │ │ ├── blip_caption_base_coco.yaml │ │ ├── blip_caption_large_coco.yaml │ │ ├── blip_classification_base.yaml │ │ ├── blip_feature_extractor_base.yaml │ │ ├── blip_itm_base.yaml │ │ ├── blip_itm_large.yaml │ │ ├── blip_nlvr.yaml │ │ ├── blip_pretrain_base.yaml │ │ ├── blip_pretrain_large.yaml │ │ ├── blip_retrieval_coco.yaml │ │ ├── blip_retrieval_flickr.yaml │ │ ├── blip_vqa_aokvqa.yaml │ │ ├── blip_vqa_okvqa.yaml │ │ ├── blip_vqav2.yaml │ │ ├── clip │ │ ├── RN101-quickgelu.json │ │ ├── RN101.json │ │ ├── 
RN50-quickgelu.json │ │ ├── RN50.json │ │ ├── RN50x16.json │ │ ├── RN50x4.json │ │ ├── ViT-B-16-plus-240.json │ │ ├── ViT-B-16-plus.json │ │ ├── ViT-B-16.json │ │ ├── ViT-B-32-plus-256.json │ │ ├── ViT-B-32-quickgelu.json │ │ ├── ViT-B-32.json │ │ ├── ViT-H-14.json │ │ ├── ViT-H-16.json │ │ ├── ViT-L-14-280.json │ │ ├── ViT-L-14-336.json │ │ ├── ViT-L-14.json │ │ ├── ViT-L-16-320.json │ │ ├── ViT-L-16.json │ │ ├── ViT-g-14.json │ │ ├── timm-efficientnetv2_rw_s.json │ │ ├── timm-resnet50d.json │ │ ├── timm-resnetaa50d.json │ │ ├── timm-resnetblur50.json │ │ ├── timm-swin_base_patch4_window7_224.json │ │ ├── timm-vit_base_patch16_224.json │ │ ├── timm-vit_base_patch32_224.json │ │ └── timm-vit_small_patch16_224.json │ │ ├── clip_resnet50.yaml │ │ ├── clip_vit_base16.yaml │ │ ├── clip_vit_base32.yaml │ │ ├── clip_vit_large14.yaml │ │ ├── clip_vit_large14_336.yaml │ │ ├── gpt_dialogue_base.yaml │ │ ├── img2prompt-vqa │ │ └── img2prompt_vqa_base.yaml │ │ ├── med_config.json │ │ ├── med_config_albef.json │ │ ├── med_large_config.json │ │ └── pnp-vqa │ │ ├── pnp_vqa_3b.yaml │ │ ├── pnp_vqa_base.yaml │ │ ├── pnp_vqa_large.yaml │ │ ├── unifiedqav2_3b_config.json │ │ ├── unifiedqav2_base_config.json │ │ └── unifiedqav2_large_config.json ├── datasets │ ├── builders │ │ ├── __init__.py │ │ ├── base_dataset_builder.py │ │ ├── caption_builder.py │ │ ├── classification_builder.py │ │ ├── dialogue_builder.py │ │ ├── image_text_pair_builder.py │ │ ├── imagefolder_builder.py │ │ ├── retrieval_builder.py │ │ ├── video_qa_builder.py │ │ └── vqa_builder.py │ ├── data_utils.py │ ├── datasets │ │ ├── aok_vqa_datasets.py │ │ ├── avsd_dialogue_datasets.py │ │ ├── base_dataset.py │ │ ├── caption_datasets.py │ │ ├── coco_caption_datasets.py │ │ ├── coco_vqa_datasets.py │ │ ├── dataloader_utils.py │ │ ├── dialogue_datasets.py │ │ ├── gqa_datasets.py │ │ ├── image_text_pair_datasets.py │ │ ├── imagefolder_dataset.py │ │ ├── laion_dataset.py │ │ ├── multimodal_classification_datasets.py │ │ ├── nlvr_datasets.py │ │ ├── retrieval_datasets.py │ │ ├── snli_ve_datasets.py │ │ ├── vg_vqa_datasets.py │ │ ├── video_caption_datasets.py │ │ ├── video_vqa_datasets.py │ │ └── vqa_datasets.py │ └── download_scripts │ │ ├── DownloadConceptualCaptions │ │ ├── LICENSE │ │ ├── README.md │ │ ├── create_annotation_12m.ipynb │ │ ├── create_annotation_3m.ipynb │ │ ├── download_data_cc12m.py │ │ └── download_data_cc3m.py │ │ ├── download_coco.py │ │ ├── download_didemo.py │ │ ├── download_flickr.py │ │ ├── download_gqa.py │ │ ├── download_msrvtt.py │ │ ├── download_msvd.py │ │ ├── download_nocaps.py │ │ ├── download_sbu.py │ │ └── download_vg.py ├── models │ ├── __init__.py │ ├── albef_models │ │ ├── __init__.py │ │ ├── albef_classification.py │ │ ├── albef_feature_extractor.py │ │ ├── albef_nlvr.py │ │ ├── albef_outputs.py │ │ ├── albef_pretrain.py │ │ ├── albef_retrieval.py │ │ └── albef_vqa.py │ ├── alpro_models │ │ ├── __init__.py │ │ ├── alpro_outputs.py │ │ ├── alpro_qa.py │ │ └── alpro_retrieval.py │ ├── base_model.py │ ├── blip2_models │ │ ├── Qformer.py │ │ ├── __init__.py │ │ ├── blip2.py │ │ ├── blip2_image_text_matching.py │ │ ├── blip2_opt.py │ │ ├── blip2_qformer.py │ │ ├── blip2_t5.py │ │ ├── blip2_t5_instruct.py │ │ ├── blip2_vicuna_instruct.py │ │ ├── modeling_llama.py │ │ ├── modeling_opt.py │ │ └── modeling_t5.py │ ├── blip_models │ │ ├── __init__.py │ │ ├── blip.py │ │ ├── blip_caption.py │ │ ├── blip_classification.py │ │ ├── blip_feature_extractor.py │ │ ├── blip_image_text_matching.py │ │ ├── blip_nlvr.py │ │ ├── 
blip_outputs.py │ │ ├── blip_pretrain.py │ │ ├── blip_retrieval.py │ │ ├── blip_vqa.py │ │ └── nlvr_encoder.py │ ├── clip_models │ │ ├── __init__.py │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── clip_outputs.py │ │ ├── loss.py │ │ ├── model.py │ │ ├── pics │ │ │ └── CLIP.png │ │ ├── pretrained.py │ │ ├── timm_model.py │ │ ├── tokenizer.py │ │ ├── transform.py │ │ └── utils.py │ ├── clip_vit.py │ ├── eva_vit.py │ ├── gpt_models │ │ └── gpt_dialogue.py │ ├── img2prompt_models │ │ ├── __init__.py │ │ └── img2prompt_vqa.py │ ├── med.py │ ├── pnp_vqa_models │ │ ├── __init__.py │ │ ├── pnp_unifiedqav2_fid.py │ │ └── pnp_vqa.py │ ├── timesformer │ │ ├── __init__.py │ │ ├── conv2d_same.py │ │ ├── features.py │ │ ├── helpers.py │ │ ├── linear.py │ │ ├── vit.py │ │ └── vit_utils.py │ └── vit.py ├── processors │ ├── __init__.py │ ├── alpro_processors.py │ ├── base_processor.py │ ├── blip_processors.py │ ├── clip_processors.py │ ├── functional_video.py │ ├── gpt_processors.py │ ├── randaugment.py │ └── transforms_video.py ├── projects │ ├── albef │ │ ├── eval │ │ │ ├── nlvr_eval.yaml │ │ │ ├── ret_coco_eval.yaml │ │ │ ├── ret_flickr30k_eval.yaml │ │ │ ├── snli_ve_eval.yaml │ │ │ ├── vqa_test.yaml │ │ │ └── vqa_val.yaml │ │ └── train │ │ │ ├── aokvqa_ft.yaml │ │ │ ├── nlvr_ft.yaml │ │ │ ├── okvqa_ft.yaml │ │ │ ├── pretrain.yaml │ │ │ ├── ret_coco_ft.yaml │ │ │ ├── ret_flickr30k_ft.yaml │ │ │ ├── snli_ve_ft.yaml │ │ │ └── vqa_ft.yaml │ ├── alpro │ │ ├── eval │ │ │ ├── didemo_ret_eval.yaml │ │ │ ├── msrvtt_qa_eval.yaml │ │ │ ├── msrvtt_ret_eval.yaml │ │ │ └── msvd_qa_eval.yaml │ │ └── train │ │ │ ├── didemo_ret_ft.yaml │ │ │ ├── msrvtt_qa_ft.yaml │ │ │ ├── msrvtt_retrieval_ft.yaml │ │ │ └── msvd_qa_ft.yaml │ ├── blip │ │ ├── coco_cap_ft_iter.yaml │ │ ├── eval │ │ │ ├── aokvqa_eval.yaml │ │ │ ├── caption_coco_eval.yaml │ │ │ ├── caption_coco_eval_large.yaml │ │ │ ├── nlvr_eval.yaml │ │ │ ├── nocaps_eval.yaml │ │ │ ├── okvqa_eval.yaml │ │ │ ├── ret_coco_eval.yaml │ │ │ ├── ret_flickr_eval.yaml │ │ │ └── vqav2_eval.yaml │ │ └── train │ │ │ ├── aokvqa_ft.yaml │ │ │ ├── caption_coco_ft.yaml │ │ │ ├── caption_coco_large_ft.yaml │ │ │ ├── nlvr_ft.yaml │ │ │ ├── okvqa_ft.yaml │ │ │ ├── pretrain_14m.yaml │ │ │ ├── retrieval_coco_ft.yaml │ │ │ ├── retrieval_flickr_ft.yaml │ │ │ └── vqav2_ft.yaml │ ├── blip2 │ │ ├── eval │ │ │ ├── caption_coco_flant5xl_eval.yaml │ │ │ ├── caption_coco_opt2.7b_eval.yaml │ │ │ ├── caption_coco_opt6.7b_eval.yaml │ │ │ ├── gqa_zeroshot_flant5xl_eval.yaml │ │ │ ├── okvqa_zeroshot_flant5xl_eval.yaml │ │ │ ├── ret_coco_eval.yaml │ │ │ ├── ret_flickr_eval.yaml │ │ │ ├── vqav2_zeroshot_flant5xl_eval.yaml │ │ │ └── vqav2_zeroshot_opt_eval.yaml │ │ └── train │ │ │ ├── caption_coco_ft.yaml │ │ │ ├── pretrain_stage1.yaml │ │ │ ├── pretrain_stage2.yaml │ │ │ └── retrieval_coco_ft.yaml │ ├── clip │ │ ├── exp_coco_ret_eval.yaml │ │ ├── exp_flickr_ret_eval.yaml │ │ └── exp_imnet_zs_eval.yaml │ ├── gpt │ │ ├── eval │ │ │ └── dialogue_avsd_eval.yaml │ │ └── train │ │ │ └── dialogue_avsd_ft.yaml │ └── pnp-vqa │ │ └── eval │ │ ├── gqa_eval.yaml │ │ ├── gqa_eval_3b.yaml │ │ ├── gqa_eval_large.yaml │ │ ├── okvqa_eval.yaml │ │ ├── okvqa_eval_3b.yaml │ │ ├── okvqa_eval_large.yaml │ │ ├── vqav2_eval.yaml │ │ ├── vqav2_eval_3b.yaml │ │ ├── vqav2_eval_large.yaml │ │ ├── vqav2_test_eval.yaml │ │ ├── vqav2_test_eval_3b.yaml │ │ └── vqav2_test_eval_large.yaml ├── runners │ ├── __init__.py │ ├── runner_base.py │ └── runner_iter.py └── tasks │ ├── __init__.py │ ├── base_task.py │ ├── captioning.py │ ├── dialogue.py 
│ ├── image_text_pretrain.py │ ├── multimodal_classification.py │ ├── retrieval.py │ ├── vqa.py │ └── vqa_reading_comprehension.py ├── llava_llama_2 ├── LICENSE ├── __init__.py ├── constants.py ├── conversation.py ├── eval │ ├── README.md │ ├── eval_gpt_review.py │ ├── eval_gpt_review_bench.py │ ├── eval_gpt_review_visual.py │ ├── eval_science_qa.py │ ├── eval_science_qa_gpt4.py │ ├── eval_science_qa_gpt4_requery.py │ ├── generate_webpage_data_from_table.py │ ├── model_qa.py │ ├── model_vqa.py │ ├── model_vqa_ds.py │ ├── model_vqa_science.py │ ├── qa_baseline_gpt35.py │ ├── run_llava.py │ ├── summarize_gpt_review.py │ ├── table │ │ ├── answer │ │ │ ├── answer_alpaca-13b.jsonl │ │ │ ├── answer_bard.jsonl │ │ │ ├── answer_gpt35.jsonl │ │ │ ├── answer_llama-13b.jsonl │ │ │ └── answer_vicuna-13b.jsonl │ │ ├── caps_boxes_coco2014_val_80.jsonl │ │ ├── model.jsonl │ │ ├── prompt.jsonl │ │ ├── question.jsonl │ │ ├── results │ │ │ └── test_sqa_llava_13b_v0.json │ │ ├── review │ │ │ ├── review_alpaca-13b_vicuna-13b.jsonl │ │ │ ├── review_bard_vicuna-13b.jsonl │ │ │ ├── review_gpt35_vicuna-13b.jsonl │ │ │ └── review_llama-13b_vicuna-13b.jsonl │ │ ├── reviewer.jsonl │ │ └── rule.json │ └── webpage │ │ ├── figures │ │ ├── alpaca.png │ │ ├── bard.jpg │ │ ├── chatgpt.svg │ │ ├── llama.jpg │ │ ├── swords_FILL0_wght300_GRAD0_opsz48.svg │ │ └── vicuna.jpeg │ │ ├── index.html │ │ ├── script.js │ │ └── styles.css ├── mm_utils.py ├── model │ ├── __init__.py │ ├── apply_delta.py │ ├── builder.py │ ├── consolidate.py │ ├── language_model │ │ ├── llava_llama.py │ │ ├── llava_mpt.py │ │ └── mpt │ │ │ ├── adapt_tokenizer.py │ │ │ ├── attention.py │ │ │ ├── blocks.py │ │ │ ├── configuration_mpt.py │ │ │ ├── custom_embedding.py │ │ │ ├── flash_attn_triton.py │ │ │ ├── hf_prefixlm_converter.py │ │ │ ├── meta_init_context.py │ │ │ ├── modeling_mpt.py │ │ │ ├── norm.py │ │ │ └── param_init_fns.py │ ├── llava_arch.py │ ├── make_delta.py │ ├── multimodal_encoder │ │ ├── builder.py │ │ └── clip_encoder.py │ └── utils.py ├── serve │ ├── __init__.py │ ├── cli.py │ ├── controller.py │ ├── examples │ │ ├── extreme_ironing.jpg │ │ └── waterview.jpg │ ├── gradio_web_server.py │ ├── model_worker.py │ ├── register_worker.py │ └── test_message.py ├── train │ ├── llama_flash_attn_monkey_patch.py │ ├── llava_trainer.py │ ├── train.py │ └── train_mem.py └── utils.py ├── llava_llama_2_utils ├── generator.py ├── prompt_wrapper.py └── visual_attacker.py ├── llava_llama_v2_inference.py ├── llava_llama_v2_visual_attack.py ├── metric ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-38.pyc │ └── perspective_api.cpython-38.pyc ├── detoxify.py └── perspective_api.py ├── minigpt4 ├── .DS_Store ├── LICENSE_MiniGPT-4.md ├── __init__.py ├── __pycache__ │ └── __init__.cpython-39.pyc ├── common │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── config.cpython-39.pyc │ │ ├── dist_utils.cpython-39.pyc │ │ ├── logger.cpython-39.pyc │ │ ├── registry.cpython-39.pyc │ │ └── utils.cpython-39.pyc │ ├── config.py │ ├── dist_utils.py │ ├── gradcam.py │ ├── logger.py │ ├── optims.py │ ├── registry.py │ └── utils.py ├── configs │ ├── .DS_Store │ ├── datasets │ │ ├── cc_sbu │ │ │ ├── align.yaml │ │ │ └── defaults.yaml │ │ └── laion │ │ │ └── defaults.yaml │ ├── default.yaml │ └── models │ │ ├── llama.yaml │ │ └── minigpt4.yaml ├── conversation │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ └── conversation.cpython-39.pyc │ └── conversation.py ├── datasets │ ├── __init__.py │ ├── __pycache__ │ │ ├── 
__init__.cpython-39.pyc │ │ └── data_utils.cpython-39.pyc │ ├── builders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-39.pyc │ │ │ ├── base_dataset_builder.cpython-39.pyc │ │ │ ├── custom_builder.cpython-39.pyc │ │ │ └── image_text_pair_builder.cpython-39.pyc │ │ ├── base_dataset_builder.py │ │ └── image_text_pair_builder.py │ ├── data_utils.py │ └── datasets │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── base_dataset.cpython-39.pyc │ │ ├── caption_datasets.cpython-39.pyc │ │ ├── cc_sbu_dataset.cpython-39.pyc │ │ ├── dataloader_utils.cpython-39.pyc │ │ └── laion_dataset.cpython-39.pyc │ │ ├── base_dataset.py │ │ ├── caption_datasets.py │ │ ├── cc_sbu_dataset.py │ │ ├── dataloader_utils.py │ │ └── laion_dataset.py ├── models │ ├── .DS_Store │ ├── Qformer.py │ ├── __init__.py │ ├── __pycache__ │ │ ├── Qformer.cpython-39.pyc │ │ ├── __init__.cpython-39.pyc │ │ ├── base_model.cpython-39.pyc │ │ ├── blip2.cpython-39.pyc │ │ ├── eva_vit.cpython-39.pyc │ │ ├── mini_gpt4.cpython-39.pyc │ │ └── modeling_llama.cpython-39.pyc │ ├── base_model.py │ ├── blip2.py │ ├── blip2_outputs.py │ ├── eva_vit.py │ ├── mini_gpt4.py │ └── modeling_llama.py ├── processors │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── base_processor.cpython-39.pyc │ │ ├── blip_processors.cpython-39.pyc │ │ └── randaugment.cpython-39.pyc │ ├── base_processor.py │ ├── blip_processors.py │ └── randaugment.py ├── runners │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ └── runner_base.cpython-39.pyc │ └── runner_base.py └── tasks │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-39.pyc │ ├── base_task.cpython-39.pyc │ └── image_text_pretrain.cpython-39.pyc │ ├── base_task.py │ └── image_text_pretrain.py ├── minigpt_inference.py ├── minigpt_red_teaming.py ├── minigpt_test_manual_prompts_text_llm.py ├── minigpt_test_manual_prompts_visual_llm.py ├── minigpt_textual_attack.py ├── minigpt_utils ├── generator.py ├── prompt_wrapper.py ├── text_attacker.py └── visual_attacker.py └── minigpt_visual_attack.py /adversarial_images/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/adversarial_images/.DS_Store -------------------------------------------------------------------------------- /adversarial_images/clean.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/adversarial_images/clean.jpeg -------------------------------------------------------------------------------- /adversarial_images/prompt_constrained_16.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/adversarial_images/prompt_constrained_16.bmp -------------------------------------------------------------------------------- /adversarial_images/prompt_constrained_32.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/adversarial_images/prompt_constrained_32.bmp 
-------------------------------------------------------------------------------- /adversarial_images/prompt_constrained_64.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/adversarial_images/prompt_constrained_64.bmp -------------------------------------------------------------------------------- /adversarial_images/prompt_unconstrained.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/adversarial_images/prompt_unconstrained.bmp -------------------------------------------------------------------------------- /assets/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/assets/.DS_Store -------------------------------------------------------------------------------- /assets/attack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/assets/attack.png -------------------------------------------------------------------------------- /assets/crime.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/assets/crime.png -------------------------------------------------------------------------------- /assets/demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/assets/demo.png -------------------------------------------------------------------------------- /assets/gender.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/assets/gender.png -------------------------------------------------------------------------------- /assets/human_race.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/assets/human_race.png -------------------------------------------------------------------------------- /assets/race.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/assets/race.png -------------------------------------------------------------------------------- /assets/religious-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/assets/religious-1.png 
-------------------------------------------------------------------------------- /assets/religious-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/assets/religious-2.png -------------------------------------------------------------------------------- /eval_configs/llama.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: mini_gpt4 3 | model_type: pretrain_llama 4 | freeze_vit: True 5 | freeze_qformer: True 6 | max_txt_len: 160 7 | end_sym: "###" 8 | low_resource: True 9 | prompt_path: "" 10 | prompt_template: '###Human: {} ###Assistant: ' 11 | ckpt: 'ckpts/pretrained_minigpt4.pth' 12 | 13 | 14 | datasets: 15 | cc_sbu_align: 16 | vis_processor: 17 | train: 18 | name: "blip2_image_eval" 19 | image_size: 224 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | 24 | run: 25 | task: image_text_pretrain 26 | -------------------------------------------------------------------------------- /eval_configs/minigpt4_eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: mini_gpt4 3 | model_type: pretrain_vicuna 4 | freeze_vit: True 5 | freeze_qformer: True 6 | max_txt_len: 160 7 | end_sym: "###" 8 | low_resource: True 9 | prompt_path: "" 10 | prompt_template: '###Human: {} ###Assistant: ' 11 | ckpt: 'ckpts/pretrained_minigpt4.pth' 12 | 13 | 14 | datasets: 15 | cc_sbu_align: 16 | vis_processor: 17 | train: 18 | name: "blip2_image_eval" 19 | image_size: 224 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | 24 | run: 25 | task: image_text_pretrain 26 | -------------------------------------------------------------------------------- /harmful_corpus/README.md: -------------------------------------------------------------------------------- 1 | # Harmful Corpus for Research Purposes 2 | 3 | **Warning: This directory contains materials that are toxic and offensive in nature.** 4 | 5 | `derogatory_corpus.csv` contains a small corpus of derogatory content targeting women, African Americans, and the human race. This corpus is used for optimizing adversarial examples and is only intended for research purposes. 6 | 7 | `manual_harmful_instructions.csv` contains 40 examples of harmful instructions for evaluation purposes. 8 | 9 | **These materials do not, in any way, reflect the personal views or beliefs of the authors.** 10 | -------------------------------------------------------------------------------- /lavis/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/lavis/.DS_Store -------------------------------------------------------------------------------- /lavis/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from lavis.common.registry import registry 14 | 15 | from lavis.datasets.builders import * 16 | from lavis.models import * 17 | from lavis.processors import * 18 | from lavis.tasks import * 19 | 20 | 21 | root_dir = os.path.dirname(os.path.abspath(__file__)) 22 | default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml")) 23 | 24 | registry.register_path("library_root", root_dir) 25 | repo_root = os.path.join(root_dir, "..") 26 | registry.register_path("repo_root", repo_root) 27 | cache_root = os.path.join(repo_root, default_cfg.env.cache_root) 28 | registry.register_path("cache_root", cache_root) 29 | 30 | registry.register("MAX_INT", sys.maxsize) 31 | registry.register("SPLIT_NAMES", ["train", "val", "test"]) 32 | -------------------------------------------------------------------------------- /lavis/common/gradcam.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | from scipy.ndimage import filters 4 | from skimage import transform as skimage_transform 5 | 6 | 7 | def getAttMap(img, attMap, blur=True, overlap=True): 8 | attMap -= attMap.min() 9 | if attMap.max() > 0: 10 | attMap /= attMap.max() 11 | attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant") 12 | if blur: 13 | attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2])) 14 | attMap -= attMap.min() 15 | attMap /= attMap.max() 16 | cmap = plt.get_cmap("jet") 17 | attMapV = cmap(attMap) 18 | attMapV = np.delete(attMapV, 3, 2) 19 | if overlap: 20 | attMap = ( 21 | 1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img 22 | + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV 23 | ) 24 | return attMap 25 | -------------------------------------------------------------------------------- /lavis/common/vqa_tools/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | __author__ = "aagrawal" 9 | -------------------------------------------------------------------------------- /lavis/configs/datasets/avsd/defaults_dial.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | avsd_dialogue: # name of the dataset builder 8 | dataset_card: dataset_card/avsd_dialogue.md 9 | data_type: features #extracted features of videos (I3D, VGGish) # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_train.json 16 | storage: avsd/annotations/train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_val.json 19 | storage: avsd/annotations/val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_test.json 22 | storage: avsd/annotations/test.json 23 | features: 24 | storage: avsd/features/ 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/coco/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_caption: # name of the dataset builder 8 | dataset_card: dataset_card/coco_caption.md 9 | # data_dir: ${env.data_dir}/datasets 10 | data_type: images # [images|videos|features] 11 | 12 | build_info: 13 | # Be careful not to append minus sign (-) before split to avoid itemizing 14 | annotations: 15 | train: 16 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json 17 | md5: aa31ac474cf6250ebb81d18348a07ed8 18 | storage: coco/annotations/coco_karpathy_train.json 19 | val: 20 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json 21 | md5: b273847456ef5580e33713b1f7de52a0 22 | storage: coco/annotations/coco_karpathy_val.json 23 | test: 24 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json 25 | md5: 3ff34b0ef2db02d01c37399f6a2a6cd1 26 | storage: coco/annotations/coco_karpathy_test.json 27 | images: 28 | storage: coco/images/ 29 | -------------------------------------------------------------------------------- /lavis/configs/datasets/coco/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_retrieval: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json 16 | md5: aa31ac474cf6250ebb81d18348a07ed8 17 | storage: coco/annotations/coco_karpathy_train.json 18 | val: 19 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json 20 | md5: b273847456ef5580e33713b1f7de52a0 21 | storage: coco/annotations/coco_karpathy_val.json 22 | test: 23 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json 24 | md5: 3ff34b0ef2db02d01c37399f6a2a6cd1 25 | storage: coco/annotations/coco_karpathy_test.json 26 | images: 27 | storage: coco/images/ 28 | -------------------------------------------------------------------------------- /lavis/configs/datasets/coco/eval_vqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | val: 15 | url: 16 | # TODO make this order insensitive 17 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val_eval.json 18 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json 19 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_OpenEnded_mscoco_val2014_questions.json 20 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_mscoco_val2014_annotations.json 21 | storage: 22 | - coco/annotations/vqa_val_eval.json 23 | - coco/annotations/answer_list.json 24 | - coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json 25 | - coco/annotations/v2_mscoco_val2014_annotations.json 26 | images: 27 | storage: coco/images/ 28 | -------------------------------------------------------------------------------- /lavis/configs/datasets/conceptual_caption/defaults_12m.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | conceptual_caption_12m: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - /export/home/workspace/datasets/cc12m.json 17 | storage: 18 | - conceptual_caption/annotations/cc12m.json 19 | images: 20 | storage: conceptual_caption/images_12m 21 | -------------------------------------------------------------------------------- /lavis/configs/datasets/conceptual_caption/defaults_3m.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | conceptual_caption_3m: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - /export/home/workspace/datasets/cc3m.json 17 | storage: 18 | - conceptual_caption/annotations/cc3m.json 19 | images: 20 | storage: conceptual_caption/images 21 | -------------------------------------------------------------------------------- /lavis/configs/datasets/didemo/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | didemo_retrieval: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_train.json 16 | storage: didemo/annotations/retrieval_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_val.json 19 | storage: didemo/annotations/retrieval_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_test.json 22 | storage: didemo/annotations/retrieval_test.json 23 | videos: 24 | storage: didemo/videos 25 | # storage: /export/share/dongxuli/data/didemo_retrieval/videos 26 | -------------------------------------------------------------------------------- /lavis/configs/datasets/flickr30k/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | flickr30k: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images 10 | 11 | build_info: 12 | annotations: 13 | train: 14 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_train.json 15 | storage: flickr30k/annotations/train.json 16 | val: 17 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_val.json 18 | storage: flickr30k/annotations/val.json 19 | test: 20 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_test.json 21 | storage: flickr30k/annotations/test.json 22 | images: 23 | storage: flickr30k/images 24 | # storage: /export/share/datasets/vision/flickr30k 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/gqa/balanced_testdev.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | gqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json 17 | storage: 18 | - gqa/annotations/train_balanced_questions.json 19 | val: 20 | url: 21 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/testdev_balanced_questions.json 22 | storage: 23 | - gqa/annotations/testdev_balanced_questions.json 24 | test: 25 | url: 26 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json 27 | storage: 28 | - gqa/annotations/test_balanced_questions.json 29 | images: 30 | storage: gqa/images/ 31 | -------------------------------------------------------------------------------- /lavis/configs/datasets/gqa/balanced_val.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | gqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json 17 | storage: 18 | - gqa/annotations/train_balanced_questions.json 19 | val: 20 | url: 21 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/val_balanced_questions.json 22 | storage: 23 | - gqa/annotations/val_balanced_questions.json 24 | test: 25 | url: 26 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json 27 | storage: 28 | - gqa/annotations/test_balanced_questions.json 29 | images: 30 | storage: gqa/images/ 31 | -------------------------------------------------------------------------------- /lavis/configs/datasets/imagenet/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | imagenet: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | splits: ["val"] 14 | images: 15 | storage: /export/share/datasets/vision/imagenet 16 | -------------------------------------------------------------------------------- /lavis/configs/datasets/laion/defaults_2B_multi.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | laion2B_multi: 8 | 9 | data_type: images 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar 14 | -------------------------------------------------------------------------------- /lavis/configs/datasets/msrvtt/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msrvtt_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_train.json 16 | storage: msrvtt/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_val.json 19 | storage: msrvtt/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_test.json 22 | storage: msrvtt/annotations/cap_test.json 23 | videos: 24 | storage: msrvtt/videos 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/msrvtt/defaults_qa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msrvtt_qa: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json 16 | storage: msrvtt/annotations/qa_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json 19 | storage: msrvtt/annotations/qa_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json 22 | storage: msrvtt/annotations/qa_test.json 23 | ans2label: 24 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json 25 | storage: msrvtt/annotations/qa_ans2label.json 26 | videos: 27 | storage: msrvtt/videos 28 | -------------------------------------------------------------------------------- /lavis/configs/datasets/msrvtt/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msrvtt_retrieval: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_train.json 16 | storage: msrvtt/annotations/retrieval_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_val.json 19 | storage: msrvtt/annotations/retrieval_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_test.json 22 | storage: msrvtt/annotations/retrieval_test.json 23 | videos: 24 | storage: msrvtt/videos 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/msvd/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_train.json 16 | storage: msvd/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_val.json 19 | storage: msvd/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_test.json 22 | storage: msvd/annotations/cap_test.json 23 | videos: 24 | storage: msvd/videos 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/msvd/defaults_qa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_qa: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json 16 | storage: msvd/annotations/qa_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json 19 | storage: msvd/annotations/qa_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json 22 | storage: msvd/annotations/qa_test.json 23 | ans2label: 24 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json 25 | storage: msvd/annotations/qa_ans2label.json 26 | videos: 27 | storage: msvd/videos 28 | 29 | instance_id_key: question_id 30 | -------------------------------------------------------------------------------- /lavis/configs/datasets/nlvr/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nlvr: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_train.json 16 | storage: nlvr/annotations/train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json 19 | storage: nlvr/annotations/dev.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json 22 | storage: nlvr/annotations/test.json 23 | images: 24 | storage: /export/share/datasets/vision/NLVR2/ 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/nocaps/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nocaps: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | val: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_val.json 16 | storage: nocaps/annotations/nocaps_val.json 17 | test: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_test.json 19 | storage: nocaps/annotations/nocaps_test.json 20 | images: 21 | storage: nocaps/images 22 | # storage: /export/share/datasets/vision/nocaps/ 23 | -------------------------------------------------------------------------------- /lavis/configs/datasets/sbu_caption/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | sbu_caption: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/sbu/sbu.json 17 | # - /export/share/dongxuli/data/lavis/sbu/annotation/sbu.json 18 | storage: 19 | - sbu_captions/annotations/sbu.json 20 | images: 21 | storage: sbu_captions/images 22 | # storage: /export/share/datasets/vision_language/sbu_resize 23 | -------------------------------------------------------------------------------- /lavis/configs/datasets/snli_ve/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | snli_ve: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_train.json 16 | storage: snli/annotations/ve_train.json 17 | val: 18 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_dev.json 19 | storage: snli/annotations/ve_dev.json 20 | test: 21 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_test.json 22 | storage: snli/annotations/ve_test.json 23 | images: 24 | storage: flickr30k/images/flickr30k-images 25 | # storage: /export/share/datasets/vision/flickr30k/flickr30k-images 26 | -------------------------------------------------------------------------------- /lavis/configs/datasets/vatex/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json 16 | storage: vatex/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json 19 | storage: vatex/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json 22 | storage: vatex/annotations/cap_test.json 23 | videos: 24 | storage: /export/share/dongxuli/data/vatex 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/vg/defaults_caption.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | vg_caption: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_caption.json 16 | storage: vg/annotations/vg_caption.json 17 | images: 18 | storage: vg/images/ 19 | -------------------------------------------------------------------------------- /lavis/configs/datasets/vg/defaults_vqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | vg_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_qa.json 16 | storage: vg/annotations/vg_qa.json 17 | images: 18 | storage: vg/images/ 19 | -------------------------------------------------------------------------------- /lavis/configs/default.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | env: 7 | # For default users 8 | # cache_root: "cache" 9 | # For internal use with persistent storage 10 | cache_root: "/export/home/.cache/lavis" 11 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_classification_ve.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_classification 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_snli_ve_lavis.pt" 11 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 12 | 13 | num_classes: 3 14 | 15 | use_distill: True 16 | momentum: 0.995 17 | alpha: 0.4 18 | 19 | # vit encoder 20 | vit_type: "base" 21 | vit_grad_ckpt: False 22 | vit_ckpt_layer: 0 23 | vit_layer_norm_epsilon: 1e-6 24 | 25 | image_size: 384 26 | 27 | # bert config 28 | med_config_path: "configs/models/med_config_albef.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | eval: 35 | name: "blip_image_eval" 36 | text_processor: 37 | train: 38 | name: "blip_caption" 39 | eval: 40 | name: "blip_caption" 41 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_feature_extractor.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | image_size: 224 13 | vit_ckpt_layer: 0 14 | vit_drop_path_rate: 0 15 | vit_layer_norm_epsilon: 1e-6 16 | vit_grad_ckpt: False 17 | 18 | # bert config 19 | med_config_path: "configs/models/med_config_albef.json" 20 | 21 | embed_dim: 256 22 | 23 | preprocess: 24 | vis_processor: 25 | eval: 26 | name: "blip_image_eval" 27 | image_size: 224 28 | text_processor: 29 | eval: 30 | name: "blip_caption" 31 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_nlvr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_nlvr 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/pretrain_model_nlvr.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_nlvr_lavis.pt" 12 | 13 | num_classes: 2 14 | 15 | use_distill: True 16 | momentum: 0.995 17 | alpha: 0.4 18 | 19 | # vit encoder 20 | vit_type: "base" 21 | vit_grad_ckpt: False 22 | vit_ckpt_layer: 0 23 | vit_layer_norm_epsilon: 1e-6 24 | 25 | image_size: 384 26 | 27 | # bert config 28 | med_config_path: "configs/models/med_config_albef.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 384 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 384 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | image_size: 224 15 | vit_ckpt_layer: 0 16 | vit_drop_path_rate: 0 17 | vit_layer_norm_epsilon: 1e-6 18 | vit_grad_ckpt: False 19 | 20 | # bert config 21 | med_config_path: "configs/models/med_config_albef.json" 22 | mlm_mask_prob: 0.15 23 | 24 | embed_dim: 256 25 | momentum: 0.995 26 | alpha: 0.4 27 | temp: 0.07 28 | 29 | max_txt_len: 30 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 256 36 | text_processor: 37 | train: 38 | name: "blip_caption" 39 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_retrieval_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_coco_retrieval_lavis.pt" 12 | 13 | queue_size: 65536 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | image_size: 384 18 | vit_ckpt_layer: 0 19 | vit_drop_path_rate: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | vit_grad_ckpt: False 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_config_albef.json" 25 | 26 | embed_dim: 256 27 | momentum: 0.995 28 | alpha: 0.4 29 | temp: 0.07 30 | use_distill: True 31 | 32 | max_txt_len: 30 33 | 34 | preprocess: 35 | vis_processor: 36 | train: 37 | name: "blip_image_train" 38 | image_size: 384 39 | eval: 40 | name: "blip_image_eval" 41 | image_size: 384 42 | text_processor: 43 | train: 44 | name: "blip_caption" 45 | eval: 46 | name: "blip_caption" 47 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_retrieval_flickr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_flickr_retrieval_lavis.pt 12 | 13 | queue_size: 65536 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | image_size: 384 18 | vit_ckpt_layer: 0 19 | vit_drop_path_rate: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | vit_grad_ckpt: False 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_config_albef.json" 25 | 26 | embed_dim: 256 27 | momentum: 0.995 28 | alpha: 0.4 29 | temp: 0.07 30 | use_distill: True 31 | 32 | max_txt_len: 30 33 | 34 | preprocess: 35 | vis_processor: 36 | train: 37 | name: "blip_image_train" 38 | image_size: 384 39 | eval: 40 | name: "blip_image_eval" 41 | image_size: 384 42 | text_processor: 43 | train: 44 | name: "blip_caption" 45 | eval: 46 | name: "blip_caption" 47 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_vqav2_lavis.pt" 12 | 13 | use_distill: True 14 | momentum: 0.995 15 | alpha: 0.4 16 | 17 | # vit encoder 18 | vit_type: "base" 19 | vit_grad_ckpt: False 20 | vit_ckpt_layer: 0 21 | vit_layer_norm_epsilon: 1e-6 22 | 23 | image_size: 384 24 | 25 | # bert config 26 | med_config_path: "configs/models/med_config_albef.json" 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 384 33 | eval: 34 | name: "blip_image_eval" 35 | image_size: 384 36 | text_processor: 37 | train: 38 | name: "blip_question" 39 | eval: 40 | name: "blip_question" 41 | -------------------------------------------------------------------------------- /lavis/configs/models/alpro_qa_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | num_classes: 1500 9 | 10 | load_finetuned: True 11 | 12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_qa.pth" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 14 | 15 | timesformer: 16 | n_frms: 16 17 | image_size: 224 18 | 19 | patch_size: 16 20 | attn_drop_rate: 0. 21 | drop_rate: 0. 22 | drop_path_rate: 0.1 23 | 24 | use_grad_ckpt: True 25 | ckpt_layer: 12 26 | 27 | # bert config 28 | med_config_path: "configs/models/bert_config_alpro.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "alpro_video_train" 34 | n_frms: 16 35 | image_size: 224 36 | eval: 37 | name: "alpro_video_eval" 38 | n_frms: 16 39 | image_size: 224 40 | text_processor: 41 | train: 42 | name: "blip_caption" 43 | eval: 44 | name: "blip_caption" 45 | -------------------------------------------------------------------------------- /lavis/configs/models/alpro_qa_msvd.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | num_classes: 2423 9 | 10 | load_finetuned: True 11 | 12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msvd_qa.pth" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 14 | 15 | timesformer: 16 | n_frms: 16 17 | image_size: 224 18 | 19 | patch_size: 16 20 | attn_drop_rate: 0. 21 | drop_rate: 0. 
22 | drop_path_rate: 0.1 23 | use_grad_ckpt: True 24 | ckpt_layer: 12 25 | 26 | # bert config 27 | med_config_path: "configs/models/bert_config_alpro.json" 28 | 29 | preprocess: 30 | vis_processor: 31 | train: 32 | name: "alpro_video_train" 33 | n_frms: 16 34 | image_size: 224 35 | eval: 36 | name: "alpro_video_eval" 37 | n_frms: 16 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /lavis/configs/models/alpro_retrieval_didemo.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_didemo_retrieval.pt 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0. 21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | eval: 30 | name: "alpro_video_eval" 31 | n_frms: 8 32 | image_size: 224 33 | text_processor: 34 | eval: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /lavis/configs/models/alpro_retrieval_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_retrieval.pt" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0. 
21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "alpro_video_train" 31 | n_frms: 8 32 | image_size: 224 33 | eval: 34 | name: "alpro_video_eval" 35 | n_frms: 8 36 | image_size: 224 37 | text_processor: 38 | train: 39 | name: "blip_caption" 40 | eval: 41 | name: "blip_caption" 42 | -------------------------------------------------------------------------------- /lavis/configs/models/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /lavis/configs/models/bert_config_alpro.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": true, 18 | "type_vocab_size": 2, 19 | "vocab_size": 30522, 20 | "encoder_width": 768, 21 | "add_cross_attention": false, 22 | "fusion_layer": 6 23 | } -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_caption_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_flant5xl 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_flant5xl.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_opt2.7b 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt2.7b.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_opt6.7b 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt6.7b.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: coco 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_finetune_coco.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: True 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 364 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 364 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: flant5xl 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxl_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: flant5xxl 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxxl_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xxl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna13b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "./ckpts/vicuna-13b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna7b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "./llm/vicuna-7b" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 224 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 224 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl_vitL.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | vit_model: "clip_L" 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xxl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xxl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_llama7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip2_llama 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # LLM 24 | llm_model: "/export/home/project/stanford_alpaca/llama_7B" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip2_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt2.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt6.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_vitL.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_vitL.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | vit_model: "clip_L" 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | 25 | preprocess: 26 | vis_processor: 27 | train: 28 | name: "blip_image_train" 29 | image_size: 224 30 | eval: 31 | name: "blip_image_eval" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_caption_base_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_caption_base.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | 18 | image_size: 384 19 | 20 | # bert config 21 | med_config_path: "configs/models/med_config.json" 22 | 23 | # generation configs 24 | prompt: "a picture of " 25 | 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | eval: 32 | name: "blip_image_eval" 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | prompt: "a picture of " 37 | eval: 38 | name: "blip_caption" 39 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_caption_large_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth" 12 | 13 | vit_type: "large" 14 | vit_grad_ckpt: True 15 | vit_ckpt_layer: 5 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | # generation configs 23 | prompt: "a picture of " 24 | 25 | 26 | preprocess: 27 | vis_processor: 28 | train: 29 | name: "blip_image_train" 30 | eval: 31 | name: "blip_image_eval" 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | prompt: "a picture of " 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_classification_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_classification 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | 10 | use_distill: True 11 | momentum: 0.995 12 | alpha: 0.4 13 | 14 | # vit encoder 15 | vit_type: "base" 16 | vit_grad_ckpt: False 17 | vit_ckpt_layer: 0 18 | 19 | image_size: 384 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_feature_extractor_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | vit_grad_ckpt: False 13 | vit_ckpt_layer: 0 14 | 15 | image_size: 224 16 | 17 | # bert config 18 | med_config_path: "configs/models/med_config.json" 19 | 20 | embed_dim: 256 21 | 22 | preprocess: 23 | vis_processor: 24 | eval: 25 | name: "blip_image_eval" 26 | image_size: 224 27 | text_processor: 28 | eval: 29 | name: "blip_caption" 30 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_itm_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_itm_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "large" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_nlvr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_nlvr 8 | model_type: nlvr 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_nlvr.pth" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 13 | 14 | num_classes: 2 15 | 16 | # vit encoder 17 | vit_type: "base" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | 22 | image_size: 384 23 | 24 | # bert config 25 | med_config_path: "configs/models/med_config.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 224 18 | alpha: 0.4 19 | 20 | # bert config 21 | med_config_path: "configs/models/bert_config.json" 22 | 23 | embed_dim: 256 24 | 25 | # generation configs 26 | prompt: "a picture of " 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_pretrain_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | # vit encoder 10 | vit_type: "large" 11 | vit_grad_ckpt: True 12 | vit_ckpt_layer: 5 13 | 14 | image_size: 224 15 | 16 | # bert config 17 | med_config_path: "configs/models/med_large_config.json" 18 | 19 | embed_dim: 256 20 | 21 | # generation configs 22 | prompt: "a picture of " 23 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_retrieval_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | vit_grad_ckpt: True 18 | vit_ckpt_layer: 4 19 | 20 | image_size: 384 21 | 22 | # bert config 23 | med_config_path: "configs/models/med_config.json" 24 | 25 | embed_dim: 256 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_retrieval_flickr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_flickr_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | alpha: 0.4 15 | 16 | negative_all_rank: False 17 | 18 | # vit encoder 19 | vit_type: "base" 20 | vit_grad_ckpt: True 21 | vit_ckpt_layer: 4 22 | 23 | image_size: 384 24 | 25 | # bert config 26 | med_config_path: "configs/models/med_config.json" 27 | 28 | embed_dim: 256 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 384 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 384 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_vqa_aokvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_aokvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_vqa_okvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_okvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 
17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-16-plus-240.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 240, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-16-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-32-plus-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | 
-------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-H-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-14-280.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 280, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-16-320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 320, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- 
/lavis/configs/models/clip/ViT-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-efficientnetv2_rw_s.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "efficientnetv2_rw_s", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 288 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-resnet50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnet50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-resnetaa50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetaa50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-resnetblur50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetblur50", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-swin_base_patch4_window7_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "swin_base_patch4_window7_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-vit_base_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | 
"vision_cfg": { 4 | "timm_model_name": "vit_base_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-vit_base_patch32_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_base_patch32_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-vit_small_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_small_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip_resnet50.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: RN50 10 | 11 | pretrained: openai 12 | -------------------------------------------------------------------------------- /lavis/configs/models/clip_vit_base16.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-B-16 10 | 11 | pretrained: openai 12 | 13 | preprocess: 14 | vis_processor: 15 | eval: 16 | name: "clip_image_eval" 17 | image_size: 224 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip_vit_base32.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-B-32 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 224 53 | -------------------------------------------------------------------------------- /lavis/configs/models/clip_vit_large14.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 224 53 | -------------------------------------------------------------------------------- /lavis/configs/models/clip_vit_large14_336.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 336 53 | -------------------------------------------------------------------------------- /lavis/configs/models/gpt_dialogue_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: gpt_dialogue 8 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 10 | 11 | len_tokenizer: 50264 # 50257 tokens from gpt2 default tokenizer + additional special tokens 12 | 13 | len_video_ft: 4224 # i3d_rgb: 2048 i3d_flow: 2048 vggish: 128 14 | 15 | preprocess: 16 | vis_processor: 17 | train: 18 | name: "gpt_video_ft" 19 | eval: 20 | name: "gpt_video_ft" 21 | text_processor: 22 | train: 23 | name: "gpt_dialogue" 24 | eval: 25 | name: "gpt_dialogue" -------------------------------------------------------------------------------- /lavis/configs/models/med_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /lavis/configs/models/med_config_albef.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 
1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true, 21 | "fusion_layer": 6 22 | } -------------------------------------------------------------------------------- /lavis/configs/models/med_large_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 1024, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /lavis/datasets/builders/classification_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder 10 | from lavis.datasets.datasets.nlvr_datasets import NLVRDataset, NLVREvalDataset 11 | from lavis.datasets.datasets.snli_ve_datasets import SNLIVisualEntialmentDataset 12 | 13 | 14 | @registry.register_builder("nlvr") 15 | class NLVRBuilder(BaseDatasetBuilder): 16 | train_dataset_cls = NLVRDataset 17 | eval_dataset_cls = NLVREvalDataset 18 | 19 | DATASET_CONFIG_DICT = {"default": "configs/datasets/nlvr/defaults.yaml"} 20 | 21 | 22 | @registry.register_builder("snli_ve") 23 | class SNLIVisualEntailmentBuilder(BaseDatasetBuilder): 24 | train_dataset_cls = SNLIVisualEntialmentDataset 25 | eval_dataset_cls = SNLIVisualEntialmentDataset 26 | 27 | DATASET_CONFIG_DICT = {"default": "configs/datasets/snli_ve/defaults.yaml"} 28 | -------------------------------------------------------------------------------- /lavis/datasets/builders/dialogue_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder 10 | from lavis.datasets.datasets.avsd_dialogue_datasets import ( 11 | AVSDDialDataset, 12 | AVSDDialEvalDataset, 13 | ) 14 | 15 | 16 | @registry.register_builder("avsd_dialogue") 17 | class AVSDDialBuilder(BaseDatasetBuilder): 18 | train_dataset_cls = AVSDDialDataset 19 | eval_dataset_cls = AVSDDialEvalDataset 20 | 21 | DATASET_CONFIG_DICT = {"default": "configs/datasets/avsd/defaults_dial.yaml"} 22 | -------------------------------------------------------------------------------- /lavis/datasets/datasets/multimodal_classification_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from abc import abstractmethod 9 | from lavis.datasets.datasets.base_dataset import BaseDataset 10 | 11 | 12 | class MultimodalClassificationDataset(BaseDataset): 13 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 14 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 15 | 16 | self.class_labels = None 17 | 18 | @abstractmethod 19 | def _build_class_labels(self): 20 | pass 21 | -------------------------------------------------------------------------------- /lavis/datasets/datasets/vg_vqa_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | 10 | from PIL import Image 11 | 12 | from lavis.datasets.datasets.vqa_datasets import VQADataset 13 | 14 | 15 | class VGVQADataset(VQADataset): 16 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 17 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 18 | 19 | def __getitem__(self, index): 20 | ann = self.annotation[index] 21 | 22 | image_path = os.path.join(self.vis_root, ann["image"]) 23 | image = Image.open(image_path).convert("RGB") 24 | 25 | image = self.vis_processor(image) 26 | question = self.text_processor(ann["question"]) 27 | 28 | answers = [ann["answer"]] 29 | # TODO this should be configured better 30 | weights = [0.2] 31 | 32 | return { 33 | "image": image, 34 | "text_input": question, 35 | "answers": answers, 36 | "weights": weights, 37 | } 38 | -------------------------------------------------------------------------------- /lavis/datasets/download_scripts/DownloadConceptualCaptions/LICENSE: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Dongxu Li, Junnan Li, Hung Le, Guangsen Wang, Silvio Savarese, Steven Hoi. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | MIT License 6 | 7 | Copyright (c) 2019 Igor Brigadir 8 | 9 | Permission is hereby granted, free of charge, to any person obtaining a copy 10 | of this software and associated documentation files (the "Software"), to deal 11 | in the Software without restriction, including without limitation the rights 12 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | copies of the Software, and to permit persons to whom the Software is 14 | furnished to do so, subject to the following conditions: 15 | 16 | The above copyright notice and this permission notice shall be included in all 17 | copies or substantial portions of the Software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | SOFTWARE. 26 | -------------------------------------------------------------------------------- /lavis/datasets/download_scripts/DownloadConceptualCaptions/README.md: -------------------------------------------------------------------------------- 1 | 7 | 8 | # Download Conceptual Captions Data 9 | 10 | Place the data from https://ai.google.com/research/ConceptualCaptions/download in this folder. 11 | 12 | `Train_GCC-training.tsv / cc3m.tsv` Training Split (3,318,333) 13 | 14 | Run `download_data_cc3m.py` or `download_data_cc12m.py`. 15 | 16 | Images will be stored in the default LAVIS cache folders. You can stop and resume the download. The settings for splitting downloads into chunks / threads are not optimal, but they saturated the original author's connection and were therefore left as is. 17 | 18 | Note: a previous version of this script used a different file naming scheme. This has since changed, so resuming a download started with the old scheme will produce duplicates. 19 | 20 | Some downloads will fail and return web pages instead of images; these need to be cleaned up afterwards. See `downloaded_validation_report.tsv` after the download completes for HTTP errors. Based on validation-set results, around 8% of the images are no longer available. Setting a user agent might fix some of these errors, though it is unclear whether any sites actually reject requests based on it. 21 | 22 | Downloading the training data should take about a day or two; keep an eye on disk space. 23 | -------------------------------------------------------------------------------- /lavis/models/blip2_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/lavis/models/blip2_models/__init__.py -------------------------------------------------------------------------------- /lavis/models/clip_models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | 7 | Based on https://github.com/mlfoundations/open_clip 8 | """ 9 | 10 | """ OpenAI pretrained model functions 11 | Adapted from https://github.com/mlfoundations/open_clip and https://github.com/openai/CLIP. 12 | 13 | Originally MIT License, Copyright (c) 2021 OpenAI. 14 | """ 15 | -------------------------------------------------------------------------------- /lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /lavis/models/clip_models/pics/CLIP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/lavis/models/clip_models/pics/CLIP.png -------------------------------------------------------------------------------- /lavis/models/img2prompt_models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import torch 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /lavis/models/pnp_vqa_models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import torch 9 | 10 | 11 | def prepare_qa_input(sample, num_captions, num_captions_fid): 12 | sample_question_captions = [] 13 | 14 | for question, captions in zip(sample['text_input'], sample['captions']): 15 | assert isinstance(captions, list) 16 | question_captions = [] 17 | question_caption = '' 18 | for cap_id, cap_ in enumerate(captions[0:num_captions]): 19 | question_caption += (cap_.strip() + '. ') 20 | if (cap_id + 1) != num_captions and ((cap_id + 1) % num_captions_fid == 0): 21 | question_caption = question.lower().strip() + " \\n " + question_caption.lower().strip() 22 | question_captions.append(question_caption) 23 | question_caption = '' 24 | if (cap_id + 1) == num_captions: 25 | question_caption = question.lower().strip() + " \\n " + question_caption.lower().strip() 26 | question_captions.append(question_caption) 27 | sample_question_captions.append(question_captions) 28 | 29 | sample['question_captions'] = sample_question_captions 30 | -------------------------------------------------------------------------------- /lavis/models/timesformer/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | 7 | Based on https://github.com/facebookresearch/TimeSformer 8 | """ 9 | -------------------------------------------------------------------------------- /lavis/models/timesformer/linear.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | """ Linear layer (alternate definition) 9 | """ 10 | import torch 11 | import torch.nn.functional as F 12 | from torch import nn as nn 13 | 14 | 15 | class Linear(nn.Linear): 16 | def forward(self, input: torch.Tensor) -> torch.Tensor: 17 | if torch.jit.is_scripting(): 18 | bias = self.bias.to(dtype=input.dtype) if self.bias is not None else None 19 | return F.linear(input, self.weight.to(dtype=input.dtype), bias=bias) 20 | else: 21 | return F.linear(input, self.weight, self.bias) 22 | -------------------------------------------------------------------------------- /lavis/processors/base_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from omegaconf import OmegaConf 9 | 10 | 11 | class BaseProcessor: 12 | def __init__(self): 13 | self.transform = lambda x: x 14 | return 15 | 16 | def __call__(self, item): 17 | return self.transform(item) 18 | 19 | @classmethod 20 | def from_config(cls, cfg=None): 21 | return cls() 22 | 23 | def build(self, **kwargs): 24 | cfg = OmegaConf.create(kwargs) 25 | 26 | return self.from_config(cfg) 27 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/nlvr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_nlvr 8 | model_type: nlvr 9 | 10 | datasets: 11 | nlvr: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: multimodal_classification 22 | 23 | batch_size_train: 16 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | seed: 42 28 | output_dir: "output/ALBEF/NLVR" 29 | 30 | evaluate: True 31 | test_splits: ["val", "test"] 32 | 33 | # distribution-specific 34 | device: "cuda" 35 | world_size: 1 36 | dist_url: "env://" 37 | distributed: True 38 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/ret_coco_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | model_type: coco 9 | 10 | datasets: 11 | coco_retrieval: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: retrieval 22 | 23 | # dataloading 24 | num_workers: 4 25 | batch_size_train: 32 26 | batch_size_eval: 64 27 | 28 | test_splits: ["test"] 29 | 30 | # distribution 31 | device: "cuda" 32 | world_size: 1 33 | dist_url: "env://" 34 | distributed: True 35 | use_dist_eval_sampler: False 36 | 37 | # model specific 38 | k_test: 128 39 | 40 | # misc 41 | seed: 42 42 | output_dir: "output/ALBEF/Retrieval_COCO" 43 | 44 | evaluate: True 45 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/ret_flickr30k_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | model_type: flickr 9 | 10 | datasets: 11 | flickr30k: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: retrieval 22 | 23 | # dataloading 24 | num_workers: 4 25 | batch_size_train: 32 26 | batch_size_eval: 64 27 | 28 | test_splits: ["test"] 29 | 30 | # distribution 31 | device: "cuda" 32 | world_size: 1 33 | dist_url: "env://" 34 | distributed: True 35 | use_dist_eval_sampler: False 36 | 37 | # model specific 38 | k_test: 128 39 | 40 | # misc 41 | seed: 42 42 | output_dir: "output/ALBEF/Retrieval_Flickr30k" 43 | 44 | evaluate: True 45 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/snli_ve_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_classification 8 | model_type: ve 9 | 10 | datasets: 11 | snli_ve: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | text_processor: 16 | eval: 17 | name: "blip_caption" 18 | 19 | run: 20 | task: multimodal_classification 21 | # optimization-specific 22 | batch_size_train: 32 23 | batch_size_eval: 64 24 | num_workers: 4 25 | 26 | seed: 42 27 | output_dir: "output/ALBEF/SNLI_VE" 28 | 29 | evaluate: True 30 | test_splits: ["val", "test"] 31 | 32 | # distribution-specific 33 | device: "cuda" 34 | world_size: 1 35 | dist_url: "env://" 36 | distributed: True 37 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/vqa_test.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | model_type: vqav2 9 | 10 | image_size: 384 11 | 12 | 13 | datasets: 14 | coco_vqa: # name of the dataset builder 15 | vis_processor: 16 | eval: 17 | name: "blip_image_eval" 18 | image_size: 384 19 | text_processor: 20 | eval: 21 | name: "blip_question" 22 | 23 | run: 24 | task: vqa 25 | 26 | # optimization-specific 27 | batch_size_train: 16 28 | batch_size_eval: 64 29 | num_workers: 4 30 | 31 | # inference-specific 32 | max_len: 10 33 | min_len: 1 34 | num_beams: 3 35 | num_ans_candidates: 128 36 | inference_method: "rank" 37 | 38 | seed: 42 39 | output_dir: "output/ALBEF/VQA" 40 | 41 | evaluate: True 42 | train_splits: ["train"] 43 | test_splits: ["test"] 44 | 45 | # distribution-specific 46 | device: "cuda" 47 | world_size: 1 48 | dist_url: "env://" 49 | distributed: True 50 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/vqa_val.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | model_type: vqav2 9 | 10 | image_size: 384 11 | 12 | datasets: 13 | coco_vqa: # name of the dataset builder 14 | type: eval 15 | vis_processor: 16 | eval: 17 | name: "blip_image_eval" 18 | image_size: 384 19 | text_processor: 20 | eval: 21 | name: "blip_question" 22 | 23 | run: 24 | task: vqa 25 | 26 | # optimization-specific 27 | batch_size_train: 16 28 | batch_size_eval: 64 29 | num_workers: 4 30 | 31 | # inference-specific 32 | max_len: 10 33 | min_len: 1 34 | num_beams: 3 35 | num_ans_candidates: 128 36 | inference_method: "rank" 37 | 38 | seed: 42 39 | output_dir: "output/ALBEF/VQA" 40 | 41 | evaluate: True 42 | test_splits: ["val"] 43 | 44 | # distribution-specific 45 | device: "cuda" 46 | world_size: 1 47 | dist_url: "env://" 48 | distributed: True 49 | -------------------------------------------------------------------------------- /lavis/projects/albef/train/nlvr_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_nlvr 8 | model_type: nlvr 9 | load_finetuned: False 10 | 11 | datasets: 12 | nlvr: # name of the dataset builder 13 | vis_processor: 14 | train: 15 | name: "blip_image_train" 16 | image_size: 384 17 | eval: 18 | name: "blip_image_eval" 19 | image_size: 384 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | task: multimodal_classification 28 | # optimization-specific 29 | lr_sched: "linear_warmup_cosine_lr" 30 | init_lr: 2e-5 31 | min_lr: 1e-6 32 | weight_decay: 0.02 33 | warmup_lr: 1e-5 34 | warmup_steps: 650 35 | max_epoch: 10 36 | batch_size_train: 16 37 | batch_size_eval: 64 38 | num_workers: 4 39 | 40 | seed: 42 41 | output_dir: "output/ALBEF/NLVR" 42 | 43 | amp: False 44 | resume_ckpt_path: null 45 | 46 | evaluate: False 47 | train_splits: ["train"] 48 | valid_splits: ["val", "test"] 49 | test_splits: ["test"] 50 | 51 | # distribution-specific 52 | device: "cuda" 53 | world_size: 1 54 | dist_url: "env://" 55 | distributed: True 56 | -------------------------------------------------------------------------------- /lavis/projects/albef/train/snli_ve_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_classification 8 | model_type: ve 9 | load_finetuned: False 10 | num_classes: 3 11 | 12 | datasets: 13 | snli_ve: # name of the dataset builder 14 | vis_processor: 15 | train: 16 | name: "blip_image_train" 17 | eval: 18 | name: "blip_image_eval" 19 | text_processor: 20 | train: 21 | name: "blip_caption" 22 | eval: 23 | name: "blip_caption" 24 | 25 | run: 26 | task: multimodal_classification 27 | # optimization-specific 28 | lr_sched: "linear_warmup_cosine_lr" 29 | init_lr: 2e-5 30 | min_lr: 0 31 | weight_decay: 0.05 32 | max_epoch: 10 33 | batch_size_train: 32 34 | batch_size_eval: 64 35 | num_workers: 4 36 | 37 | seed: 42 38 | output_dir: "output/ALBEF/SNLI_VE" 39 | 40 | amp: False 41 | resume_ckpt_path: null 42 | 43 | evaluate: False 44 | train_splits: ["train"] 45 | valid_splits: ["val"] 46 | test_splits: ["test"] 47 | 48 | # distribution-specific 49 | device: "cuda" 50 | world_size: 1 51 | dist_url: "env://" 52 | distributed: True 53 | -------------------------------------------------------------------------------- /lavis/projects/alpro/eval/didemo_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | model_type: didemo 9 | 10 | max_txt_len: 50 11 | 12 | timesformer: 13 | n_frms: 8 14 | image_size: 224 15 | 16 | 17 | datasets: 18 | didemo_retrieval: # name of the dataset builder 19 | vis_processor: 20 | eval: 21 | name: "alpro_video_eval" 22 | n_frms: 8 23 | image_size: 224 24 | text_processor: 25 | eval: 26 | name: "blip_caption" 27 | 28 | run: 29 | task: retrieval 30 | # optimization-specific 31 | batch_size_train: 8 32 | batch_size_eval: 64 33 | num_workers: 4 34 | 35 | # k_test: 256 36 | k_test: 1000 37 | 38 | seed: 42 39 | output_dir: "output/ALPRO/didemo_retrieval" 40 | 41 | evaluate: True 42 | train_splits: ["train"] 43 | valid_splits: ["val", "test"] 44 | test_splits: ["test"] 45 | 46 | # distribution-specific 47 | device: "cuda" 48 | world_size: 1 49 | dist_url: "env://" 50 | distributed: True 51 | use_dist_eval_sampler: False 52 | -------------------------------------------------------------------------------- /lavis/projects/alpro/eval/msrvtt_qa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | model_type: msrvtt 9 | 10 | datasets: 11 | msrvtt_qa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "alpro_video_eval" 15 | n_frms: 16 16 | image_size: 224 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | 21 | run: 22 | task: multimodal_classification 23 | # optimization-specific 24 | batch_size_train: 32 25 | batch_size_eval: 64 26 | num_workers: 4 27 | 28 | seed: 42 29 | output_dir: "output/ALPRO/msrvtt_qa" 30 | 31 | evaluate: True 32 | valid_splits: ["val"] 33 | test_splits: ["test"] 34 | 35 | # distribution-specific 36 | device: "cuda" 37 | world_size: 1 38 | dist_url: "env://" 39 | distributed: True 40 | -------------------------------------------------------------------------------- /lavis/projects/alpro/eval/msrvtt_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | model_type: msrvtt 9 | 10 | datasets: 11 | msrvtt_retrieval: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "alpro_video_eval" 15 | n_frms: 8 16 | image_size: 224 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | 21 | run: 22 | task: retrieval 23 | # optimization-specific 24 | batch_size_train: 24 25 | batch_size_eval: 64 26 | num_workers: 4 27 | 28 | # k_test: 256 29 | k_test: 1000 30 | 31 | seed: 42 32 | output_dir: "output/ALPRO/msrvtt_retrieval" 33 | 34 | evaluate: True 35 | test_splits: ["test"] 36 | 37 | # distribution-specific 38 | device: "cuda" 39 | world_size: 1 40 | dist_url: "env://" 41 | distributed: True 42 | use_dist_eval_sampler: False 43 | -------------------------------------------------------------------------------- /lavis/projects/alpro/eval/msvd_qa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | model_type: msvd 9 | 10 | datasets: 11 | msvd_qa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "alpro_video_eval" 15 | n_frms: 16 16 | image_size: 224 17 | text_processor: 18 | train: 19 | name: "blip_caption" 20 | eval: 21 | name: "blip_caption" 22 | 23 | run: 24 | task: multimodal_classification 25 | # optimization-specific 26 | batch_size_train: 24 27 | batch_size_eval: 64 28 | num_workers: 4 29 | 30 | seed: 42 31 | output_dir: "output/ALPRO/msvd_qa" 32 | 33 | evaluate: True 34 | test_splits: ["test"] 35 | 36 | # distribution-specific 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | -------------------------------------------------------------------------------- /lavis/projects/blip/coco_cap_ft_iter.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | model_type: large 9 | 10 | datasets: 11 | coco_caption: # name of the dataset builder 12 | vis_processor: 13 | train: 14 | name: "blip_image_train" 15 | eval: 16 | name: "blip_image_eval" 17 | text_processor: 18 | train: 19 | name: "blip_caption" 20 | prompt: "a picture of " 21 | eval: 22 | name: "blip_caption" 23 | 24 | run: 25 | runner: runner_iter 26 | 27 | max_iters: 2e4 28 | iters_per_inner_epoch: 2e3 29 | 30 | # task: retrieval 31 | task: captioning 32 | # optimizer 33 | lr_sched: "linear_warmup_cosine_lr" 34 | init_lr: 2e-6 35 | min_lr: 0 36 | weight_decay: 0.05 37 | batch_size_train: 16 38 | batch_size_eval: 64 39 | num_workers: 4 40 | 41 | max_len: 20 42 | min_len: 5 43 | num_beams: 3 44 | 45 | seed: 42 46 | output_dir: "output/BLIP/Caption_coco" 47 | 48 | amp: False 49 | resume_ckpt_path: null 50 | 51 | evaluate: False 52 | train_splits: ["train"] 53 | valid_splits: ["val", "test"] 54 | 55 | device: "cuda" 56 | world_size: 1 57 | dist_url: "env://" 58 | distributed: True 59 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/aokvqa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | model_type: aokvqa 9 | image_size: 480 10 | 11 | datasets: 12 | aok_vqa: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 480 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: aok_vqa 23 | # optimization-specific 24 | batch_size_train: 64 25 | batch_size_eval: 64 26 | num_workers: 4 27 | 28 | # inference-specific 29 | max_len: 10 30 | min_len: 1 31 | num_beams: 3 32 | num_ans_candidates: 128 33 | inference_method: "rank" 34 | 35 | seed: 42 36 | output_dir: "output/BLIP/AOKVQA" 37 | 38 | evaluate: True 39 | test_splits: ["val", "test"] 40 | 41 | # distribution-specific 42 | device: "cuda" 43 | world_size: 1 44 | dist_url: "env://" 45 | distributed: True 46 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/caption_coco_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | model_type: base_coco 9 | 10 | datasets: 11 | coco_caption: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | text_processor: 16 | eval: 17 | name: "blip_caption" 18 | 19 | run: 20 | # task: retrieval 21 | task: captioning 22 | # optimizer 23 | batch_size_train: 32 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | max_len: 20 28 | min_len: 5 29 | num_beams: 3 30 | 31 | seed: 42 32 | output_dir: "output/BLIP/Caption_coco" 33 | 34 | evaluate: True 35 | test_splits: ["test"] 36 | 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/caption_coco_eval_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | model_type: large_coco 9 | 10 | datasets: 11 | coco_caption: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | text_processor: 16 | eval: 17 | name: "blip_caption" 18 | 19 | run: 20 | # task: retrieval 21 | task: captioning 22 | # optimizer 23 | batch_size_train: 32 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | max_len: 20 28 | min_len: 5 29 | num_beams: 3 30 | 31 | seed: 42 32 | output_dir: "output/BLIP/Caption_coco" 33 | 34 | evaluate: True 35 | test_splits: ["test"] 36 | 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/nlvr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_nlvr 8 | model_type: nlvr 9 | 10 | datasets: 11 | nlvr: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: multimodal_classification 22 | 23 | batch_size_train: 16 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | seed: 42 28 | output_dir: "output/BLIP/NLVR" 29 | 30 | evaluate: True 31 | test_splits: ["val", "test"] 32 | 33 | # distribution-specific 34 | device: "cuda" 35 | world_size: 1 36 | dist_url: "env://" 37 | distributed: True 38 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/nocaps_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | model_type: base_coco 9 | # pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth' 10 | 11 | datasets: 12 | nocaps: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 384 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | prompt: "a picture of " 21 | 22 | run: 23 | # task: retrieval 24 | task: captioning 25 | # optimizer 26 | batch_size_train: 32 27 | batch_size_eval: 64 28 | num_workers: 4 29 | 30 | max_len: 20 31 | min_len: 5 32 | num_beams: 3 33 | 34 | seed: 42 35 | output_dir: "output/BLIP/NoCaps" 36 | 37 | evaluate: True 38 | test_splits: ["val", "test"] 39 | 40 | device: "cuda" 41 | world_size: 1 42 | dist_url: "env://" 43 | distributed: True 44 | 45 | report_metric: False 46 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/okvqa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | model_type: okvqa 9 | image_size: 480 10 | 11 | datasets: 12 | ok_vqa: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 480 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: vqa 23 | # optimization-specific 24 | batch_size_train: 16 25 | batch_size_eval: 16 26 | num_workers: 4 27 | 28 | # inference-specific 29 | max_len: 10 30 | min_len: 1 31 | num_beams: 3 32 | num_ans_candidates: 128 33 | inference_method: "rank" 34 | 35 | seed: 42 36 | output_dir: "output/BLIP/OKVQA" 37 | 38 | evaluate: True 39 | test_splits: ["test"] 40 | 41 | # distribution-specific 42 | device: "cuda" 43 | world_size: 1 44 | dist_url: "env://" 45 | distributed: True 46 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/ret_coco_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | model_type: coco 9 | 10 | datasets: 11 | coco_retrieval: # name of the dataset builder 12 | vis_processor: 13 | train: 14 | name: "blip_image_train" 15 | image_size: 384 16 | eval: 17 | name: "blip_image_eval" 18 | image_size: 384 19 | text_processor: 20 | train: 21 | name: "blip_caption" 22 | eval: 23 | name: "blip_caption" 24 | 25 | run: 26 | task: retrieval 27 | 28 | # dataloading 29 | num_workers: 4 30 | batch_size_train: 32 31 | batch_size_eval: 128 32 | 33 | train_splits: ["train"] 34 | valid_splits: ["val"] 35 | test_splits: ["test"] 36 | 37 | # distribution 38 | device: "cuda" 39 | world_size: 1 40 | dist_url: "env://" 41 | distributed: True 42 | use_dist_eval_sampler: False 43 | 44 | # model specific 45 | k_test: 256 46 | 47 | # misc 48 | seed: 42 49 | output_dir: "output/BLIP/Retrieval_COCO" 50 | 51 | evaluate: True 52 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/ret_flickr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | model_type: flickr 9 | 10 | datasets: 11 | flickr30k: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: retrieval 22 | 23 | # dataloading 24 | num_workers: 4 25 | batch_size_train: 32 26 | batch_size_eval: 64 27 | 28 | test_splits: ["test"] 29 | 30 | # distribution 31 | device: "cuda" 32 | world_size: 1 33 | dist_url: "env://" 34 | distributed: True 35 | use_dist_eval_sampler: False 36 | 37 | # model specific 38 | k_test: 128 39 | 40 | # misc 41 | seed: 42 42 | output_dir: "output/Retrieval_Flickr30k" 43 | 44 | evaluate: True 45 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/vqav2_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | model_type: vqav2 9 | image_size: 480 10 | 11 | datasets: 12 | coco_vqa: # name of the dataset builder 13 | type: eval 14 | vis_processor: 15 | eval: 16 | name: "blip_image_eval" 17 | image_size: 480 18 | text_processor: 19 | eval: 20 | name: "blip_question" 21 | 22 | run: 23 | task: vqa 24 | # optimization-specific 25 | batch_size_train: 16 26 | batch_size_eval: 64 27 | num_workers: 4 28 | 29 | # inference-specific 30 | max_len: 10 31 | min_len: 1 32 | num_beams: 3 33 | num_ans_candidates: 128 34 | inference_method: "rank" 35 | 36 | seed: 42 37 | output_dir: "output/BLIP/VQA" 38 | 39 | evaluate: True 40 | test_splits: ["val"] 41 | 42 | # distribution-specific 43 | device: "cuda" 44 | world_size: 1 45 | dist_url: "env://" 46 | distributed: True 47 | -------------------------------------------------------------------------------- /lavis/projects/blip/train/caption_coco_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | 9 | model_type: base_coco 10 | load_finetuned: False 11 | 12 | datasets: 13 | coco_caption: # name of the dataset builder 14 | vis_processor: 15 | train: 16 | name: "blip_image_train" 17 | eval: 18 | name: "blip_image_eval" 19 | text_processor: 20 | train: 21 | name: "blip_caption" 22 | prompt: "a picture of " 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | # task: retrieval 28 | task: captioning 29 | # optimizer 30 | lr_sched: "linear_warmup_cosine_lr" 31 | init_lr: 1e-5 32 | min_lr: 0 33 | weight_decay: 0.05 34 | max_epoch: 5 35 | batch_size_train: 32 36 | batch_size_eval: 64 37 | num_workers: 4 38 | 39 | max_len: 20 40 | min_len: 5 41 | num_beams: 3 42 | 43 | seed: 42 44 | output_dir: "output/BLIP/Caption_coco" 45 | 46 | amp: False 47 | resume_ckpt_path: null 48 | 49 | evaluate: False 50 | train_splits: ["train"] 51 | valid_splits: ["val"] 52 | test_splits: ["test"] 53 | 54 | device: "cuda" 55 | world_size: 1 56 | dist_url: "env://" 57 | distributed: True 58 | -------------------------------------------------------------------------------- /lavis/projects/blip/train/caption_coco_large_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | 9 | model_type: large_coco 10 | load_finetuned: False 11 | 12 | datasets: 13 | coco_caption: # name of the dataset builder 14 | vis_processor: 15 | train: 16 | name: "blip_image_train" 17 | eval: 18 | name: "blip_image_eval" 19 | text_processor: 20 | train: 21 | name: "blip_caption" 22 | prompt: "a picture of " 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | task: captioning 28 | # optimizer 29 | lr_sched: "linear_warmup_cosine_lr" 30 | init_lr: 2e-6 31 | min_lr: 0 32 | weight_decay: 0.05 33 | max_epoch: 5 34 | batch_size_train: 16 35 | batch_size_eval: 64 36 | num_workers: 4 37 | 38 | max_len: 20 39 | min_len: 5 40 | num_beams: 3 41 | 42 | seed: 42 43 | output_dir: "output/BLIP/Caption_coco" 44 | 45 | amp: False 46 | resume_ckpt_path: null 47 | 48 | evaluate: False 49 | train_splits: ["train"] 50 | valid_splits: ["val"] 51 | test_splits: ["test"] 52 | 53 | device: "cuda" 54 | world_size: 1 55 | dist_url: "env://" 56 | distributed: True 57 | -------------------------------------------------------------------------------- /lavis/projects/blip/train/nlvr_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_nlvr 8 | 9 | model_type: nlvr 10 | load_finetuned: False 11 | 12 | datasets: 13 | nlvr: # name of the dataset builder 14 | vis_processor: 15 | train: 16 | name: "blip_image_train" 17 | image_size: 384 18 | eval: 19 | name: "blip_image_eval" 20 | image_size: 384 21 | text_processor: 22 | train: 23 | name: "blip_caption" 24 | eval: 25 | name: "blip_caption" 26 | 27 | run: 28 | task: multimodal_classification 29 | 30 | lr_sched: "linear_warmup_cosine_lr" 31 | init_lr: 2.5e-5 32 | min_lr: 0 33 | weight_decay: 0.05 34 | max_epoch: 15 35 | 36 | batch_size_train: 16 37 | batch_size_eval: 64 38 | num_workers: 4 39 | 40 | seed: 42 41 | output_dir: "output/BLIP/NLVR" 42 | 43 | amp: False 44 | resume_ckpt_path: null 45 | 46 | evaluate: False 47 | train_splits: ["train"] 48 | valid_splits: ["val", "test"] 49 | test_splits: ["test"] 50 | 51 | # distribution-specific 52 | device: "cuda" 53 | world_size: 1 54 | dist_url: "env://" 55 | distributed: True 56 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/caption_coco_flant5xl_eval.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/lavis/projects/blip2/eval/caption_coco_flant5xl_eval.yaml -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/caption_coco_opt2.7b_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | # Bleu_1: 0.832 7 | # Bleu_2: 0.691 8 | # Bleu_3: 0.556 9 | # Bleu_4: 0.438 10 | # METEOR: 0.317 11 | # ROUGE_L: 0.620 12 | # CIDEr: 1.461 13 | # SPICE: 0.252 14 | 15 | model: 16 | arch: blip2_opt 17 | model_type: caption_coco_opt2.7b 18 | use_grad_checkpoint: False 19 | 20 | datasets: 21 | coco_caption: # name of the dataset builder 22 | vis_processor: 23 | eval: 24 | name: "blip_image_eval" 25 | image_size: 364 26 | text_processor: 27 | eval: 28 | name: "blip_caption" 29 | # build_info: 30 | # images: 31 | # storage: '/export/share/datasets/vision/coco/images/' 32 | 33 | run: 34 | task: captioning 35 | # optimizer 36 | batch_size_train: 32 37 | batch_size_eval: 16 38 | num_workers: 4 39 | 40 | max_len: 30 41 | min_len: 8 42 | num_beams: 5 43 | 44 | seed: 42 45 | output_dir: "output/BLIP2/Caption_coco_opt2.7b" 46 | 47 | evaluate: True 48 | test_splits: ["test"] 49 | 50 | device: "cuda" 51 | world_size: 1 52 | dist_url: "env://" 53 | distributed: True 54 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/caption_coco_opt6.7b_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | # Bleu_1: 0.831 7 | # Bleu_2: 0.689 8 | # Bleu_3: 0.552 9 | # Bleu_4: 0.434 10 | # METEOR: 0.316 11 | # ROUGE_L: 0.618 12 | # CIDEr: 1.451 13 | # SPICE: 0.251 14 | 15 | model: 16 | arch: blip2_opt 17 | model_type: caption_coco_opt6.7b 18 | use_grad_checkpoint: False 19 | 20 | datasets: 21 | coco_caption: # name of the dataset builder 22 | vis_processor: 23 | eval: 24 | name: "blip_image_eval" 25 | image_size: 364 26 | text_processor: 27 | eval: 28 | name: "blip_caption" 29 | # build_info: 30 | # images: 31 | # storage: '/export/share/datasets/vision/coco/images/' 32 | 33 | run: 34 | task: captioning 35 | # optimizer 36 | batch_size_train: 32 37 | batch_size_eval: 16 38 | num_workers: 4 39 | 40 | max_len: 30 41 | min_len: 8 42 | num_beams: 5 43 | 44 | seed: 42 45 | output_dir: "output/BLIP2/Caption_coco_opt6.7b" 46 | 47 | evaluate: True 48 | test_splits: ["test"] 49 | 50 | device: "cuda" 51 | world_size: 1 52 | dist_url: "env://" 53 | distributed: True 54 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/gqa_zeroshot_flant5xl_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | # Overall Accuracy is: 43.98 7 | model: 8 | arch: blip2_t5 9 | model_type: pretrain_flant5xl 10 | use_grad_checkpoint: False 11 | 12 | datasets: 13 | gqa: # name of the dataset builder 14 | type: balanced_testdev 15 | vis_processor: 16 | eval: 17 | name: "blip_image_eval" 18 | image_size: 224 19 | text_processor: 20 | eval: 21 | name: "blip_question" 22 | build_info: 23 | images: 24 | storage: "/export/share/datasets/vision/GQA/images/" 25 | 26 | run: 27 | task: gqa 28 | # optimization-specific 29 | batch_size_train: 16 30 | batch_size_eval: 64 31 | num_workers: 4 32 | 33 | # inference-specific 34 | max_len: 10 35 | min_len: 1 36 | num_beams: 5 37 | inference_method: "generate" 38 | prompt: "Question: {} Short answer:" 39 | 40 | seed: 42 41 | output_dir: "output/BLIP2/GQA" 42 | 43 | evaluate: True 44 | test_splits: ["val"] 45 | 46 | # distribution-specific 47 | device: "cuda" 48 | world_size: 1 49 | dist_url: "env://" 50 | distributed: True 51 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/okvqa_zeroshot_flant5xl_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | # Overall Accuracy is: 41.22 7 | 8 | model: 9 | arch: blip2_t5 10 | model_type: pretrain_flant5xl 11 | use_grad_checkpoint: False 12 | 13 | # for OKVQA evaluation 14 | apply_lemmatizer: True 15 | 16 | datasets: 17 | ok_vqa: # name of the dataset builder 18 | vis_processor: 19 | eval: 20 | name: "blip_image_eval" 21 | image_size: 224 22 | text_processor: 23 | eval: 24 | name: "blip_question" 25 | # build_info: 26 | # images: 27 | # storage: '/export/share/datasets/vision/coco/images/' 28 | 29 | run: 30 | task: vqa 31 | # optimization-specific 32 | batch_size_train: 16 33 | batch_size_eval: 64 34 | num_workers: 4 35 | 36 | # inference-specific 37 | max_len: 10 38 | min_len: 1 39 | num_beams: 5 40 | inference_method: "generate" 41 | prompt: "Question: {} Short answer:" 42 | 43 | seed: 42 44 | output_dir: "output/BLIP2/OKVQA" 45 | 46 | evaluate: True 47 | test_splits: ["test"] 48 | 49 | # distribution-specific 50 | device: "cuda" 51 | world_size: 1 52 | dist_url: "env://" 53 | distributed: True 54 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/ret_coco_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip2 8 | model_type: coco 9 | use_grad_checkpoint: False 10 | 11 | datasets: 12 | coco_retrieval: # name of the dataset builder 13 | vis_processor: 14 | train: 15 | name: "blip_image_train" 16 | image_size: 364 17 | eval: 18 | name: "blip_image_eval" 19 | image_size: 364 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | eval: 24 | name: "blip_caption" 25 | # build_info: 26 | # images: 27 | # storage: '/export/share/datasets/vision/coco/images/' 28 | run: 29 | task: retrieval 30 | 31 | # dataloading 32 | num_workers: 4 33 | batch_size_train: 16 34 | batch_size_eval: 32 35 | 36 | train_splits: ["train"] 37 | valid_splits: ["val"] 38 | test_splits: ["test"] 39 | 40 | # distribution 41 | device: "cuda" 42 | world_size: 1 43 | dist_url: "env://" 44 | distributed: True 45 | use_dist_eval_sampler: False 46 | 47 | # model specific 48 | k_test: 128 49 | 50 | # misc 51 | seed: 42 52 | output_dir: "output/BLIP2/Retrieval_COCO" 53 | 54 | evaluate: True 55 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/ret_flickr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip2 8 | model_type: coco 9 | use_grad_checkpoint: False 10 | 11 | datasets: 12 | flickr30k: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 364 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | 21 | run: 22 | task: retrieval 23 | 24 | # dataloading 25 | num_workers: 4 26 | batch_size_train: 16 27 | batch_size_eval: 32 28 | 29 | test_splits: ["test"] 30 | 31 | # distribution 32 | device: "cuda" 33 | world_size: 1 34 | dist_url: "env://" 35 | distributed: True 36 | use_dist_eval_sampler: False 37 | 38 | # model specific 39 | k_test: 128 40 | 41 | # misc 42 | seed: 42 43 | output_dir: "output/BLIP2/Retrieval_Flickr30k" 44 | 45 | evaluate: True -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/vqav2_zeroshot_flant5xl_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | # Overall Accuracy is: 63.13 7 | # Per Answer Type Accuracy is the following: 8 | # other : 52.90 9 | # yes/no : 84.28 10 | # number : 41.01 11 | 12 | model: 13 | arch: blip2_t5 14 | model_type: pretrain_flant5xl 15 | use_grad_checkpoint: False 16 | 17 | datasets: 18 | coco_vqa: # name of the dataset builder 19 | type: eval 20 | vis_processor: 21 | eval: 22 | name: "blip_image_eval" 23 | image_size: 224 24 | text_processor: 25 | eval: 26 | name: "blip_question" 27 | # build_info: 28 | # images: 29 | # storage: '/export/share/datasets/vision/coco/images/' 30 | 31 | run: 32 | task: vqa 33 | # optimization-specific 34 | batch_size_train: 16 35 | batch_size_eval: 64 36 | num_workers: 4 37 | 38 | # inference-specific 39 | max_len: 10 40 | min_len: 1 41 | num_beams: 5 42 | inference_method: "generate" 43 | prompt: "Question: {} Short answer:" 44 | 45 | seed: 42 46 | output_dir: "output/BLIP2/VQA" 47 | 48 | evaluate: True 49 | test_splits: ["val"] 50 | 51 | # distribution-specific 52 | device: "cuda" 53 | world_size: 1 54 | dist_url: "env://" 55 | distributed: True 56 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/vqav2_zeroshot_opt_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | # Overall Accuracy is: 51.88 (result different from BLIP-2 paper due to different implementation and transformers version) 7 | 8 | model: 9 | arch: blip2_opt 10 | model_type: pretrain_opt2.7b 11 | use_grad_checkpoint: False 12 | 13 | datasets: 14 | coco_vqa: # name of the dataset builder 15 | type: eval 16 | vis_processor: 17 | eval: 18 | name: "blip_image_eval" 19 | image_size: 224 20 | text_processor: 21 | eval: 22 | name: "blip_question" 23 | # build_info: 24 | # images: 25 | # storage: '/export/share/datasets/vision/coco/images/' 26 | 27 | run: 28 | task: vqa 29 | # optimization-specific 30 | batch_size_train: 16 31 | batch_size_eval: 64 32 | num_workers: 4 33 | 34 | # inference-specific 35 | max_len: 10 36 | min_len: 1 37 | num_beams: 5 38 | inference_method: "generate" 39 | prompt: "Question: {} Short answer:" 40 | 41 | seed: 42 42 | output_dir: "output/BLIP2/VQA" 43 | 44 | evaluate: True 45 | test_splits: ["val"] 46 | 47 | # distribution-specific 48 | device: "cuda" 49 | world_size: 1 50 | dist_url: "env://" 51 | distributed: True 52 | -------------------------------------------------------------------------------- /lavis/projects/clip/exp_coco_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | 11 | datasets: 12 | coco_retrieval: # name of the dataset builder 13 | vis_processor: 14 | train: 15 | name: "clip_image_train" 16 | image_size: 336 17 | eval: 18 | name: "clip_image_eval" 19 | image_size: 336 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | task: retrieval 28 | 29 | # dataloading 30 | num_workers: 4 31 | batch_size_train: 32 32 | batch_size_eval: 128 33 | 34 | test_splits: ["test"] 35 | 36 | # distribution 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | use_dist_eval_sampler: True 42 | 43 | # misc 44 | seed: 42 45 | output_dir: "output/clip/Retrieval_COCO" 46 | 47 | evaluate: True 48 | -------------------------------------------------------------------------------- /lavis/projects/clip/exp_flickr_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | 11 | datasets: 12 | flickr30k: # name of the dataset builder 13 | vis_processor: 14 | train: 15 | name: "clip_image_train" 16 | image_size: 336 17 | eval: 18 | name: "clip_image_eval" 19 | image_size: 336 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | task: retrieval 28 | 29 | # dataloading 30 | num_workers: 4 31 | batch_size_train: 32 32 | batch_size_eval: 128 33 | 34 | test_splits: ["test"] 35 | 36 | # distribution 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | use_dist_eval_sampler: True 42 | 43 | # misc 44 | seed: 42 45 | output_dir: "output/clip/Retrieval_Flickr" 46 | 47 | evaluate: True 48 | -------------------------------------------------------------------------------- /lavis/projects/clip/exp_imnet_zs_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | 11 | datasets: 12 | imagenet: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "clip_image_eval" 16 | # image_size: 224 17 | image_size: 336 18 | 19 | run: 20 | task: multimodal_classification 21 | 22 | # dataloading 23 | num_workers: 4 24 | batch_size_train: 32 25 | batch_size_eval: 128 26 | 27 | test_splits: ["val"] 28 | 29 | # distribution 30 | device: "cuda" 31 | world_size: 1 32 | dist_url: "env://" 33 | distributed: True 34 | 35 | # misc 36 | seed: 42 37 | output_dir: "output/clip/zs_imnet" 38 | 39 | evaluate: True 40 | -------------------------------------------------------------------------------- /lavis/projects/gpt/eval/dialogue_avsd_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: gpt_dialogue 8 | model_type: base 9 | 10 | datasets: 11 | avsd_dialogue: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "gpt_video_ft" 15 | visual_ft: ["i3d_flow", "i3d_rgb"] 16 | audio_ft: ["vggish"] 17 | text_processor: 18 | eval: 19 | name: "gpt_dialogue" 20 | max_turns: 3 21 | use_caption: True 22 | 23 | run: 24 | task: dialogue 25 | # optimizer 26 | batch_size_train: 16 27 | batch_size_eval: 16 28 | num_workers: 0 29 | 30 | max_len: 20 31 | min_len: 5 32 | num_beams: 5 33 | 34 | seed: 42 35 | output_dir: "output/gpt2/dialogue_avsd" 36 | 37 | evaluate: True 38 | valid_splits: ["test"] 39 | 40 | device: "cuda" 41 | world_size: 1 42 | dist_url: "env://" 43 | distributed: True 44 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/gqa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: base 9 | 10 | datasets: 11 | gqa: # name of the dataset builder 12 | type: balanced_testdev 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 384 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: gqa_reading_comprehension 23 | 24 | # optimization-specific 25 | batch_size_train: 16 26 | batch_size_eval: 16 27 | num_workers: 4 28 | 29 | # image question matching specific 30 | block_num: 7 31 | 32 | # image captioning specific 33 | top_k: 50 34 | top_p: 1 35 | cap_min_length: 10 36 | cap_max_length: 20 37 | repetition_penalty: 1 38 | num_patches: 20 39 | num_captions: 100 40 | prompt: 'a picture of ' 41 | 42 | # question answering specific 43 | internal_bsz_fid: 1 44 | num_captions_fid: 5 45 | min_len: 0 46 | max_len: 20 47 | num_beams: 1 48 | inference_method: "generate" 49 | 50 | seed: 42 51 | output_dir: "output/PNP-VQA/GQA" 52 | 53 | evaluate: True 54 | test_splits: ["val"] 55 | 56 | # distribution-specific 57 | device: "cuda" 58 | world_size: 1 59 | dist_url: "env://" 60 | distributed: True 61 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/okvqa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: base 9 | 10 | datasets: 11 | ok_vqa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_question" 19 | 20 | run: 21 | task: vqa_reading_comprehension 22 | 23 | # optimization-specific 24 | batch_size_train: 16 25 | batch_size_eval: 16 26 | num_workers: 4 27 | 28 | # image question matching specific 29 | block_num: 7 30 | 31 | # image captioning specific 32 | top_k: 50 33 | top_p: 1 34 | cap_min_length: 10 35 | cap_max_length: 20 36 | repetition_penalty: 1 37 | num_patches: 20 38 | num_captions: 100 39 | prompt: 'a picture of ' 40 | 41 | # question answering specific 42 | internal_bsz_fid: 1 43 | num_captions_fid: 1 44 | min_len: 0 45 | max_len: 20 46 | num_beams: 1 47 | inference_method: "generate" 48 | 49 | seed: 42 50 | output_dir: "output/PNP-VQA/OKVQA" 51 | 52 | evaluate: True 53 | test_splits: ["test"] 54 | 55 | # distribution-specific 56 | device: "cuda" 57 | world_size: 1 58 | dist_url: "env://" 59 | distributed: True 60 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/okvqa_eval_3b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: 3b 9 | 10 | datasets: 11 | ok_vqa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_question" 19 | 20 | run: 21 | task: vqa_reading_comprehension 22 | 23 | # optimization-specific 24 | batch_size_train: 4 25 | batch_size_eval: 4 26 | num_workers: 4 27 | 28 | # image question matching specific 29 | block_num: 7 30 | 31 | # image captioning specific 32 | top_k: 50 33 | top_p: 1 34 | cap_min_length: 10 35 | cap_max_length: 20 36 | repetition_penalty: 1 37 | num_patches: 20 38 | num_captions: 100 39 | prompt: 'a picture of ' 40 | 41 | # question answering specific 42 | internal_bsz_fid: 1 43 | num_captions_fid: 1 44 | min_len: 0 45 | max_len: 20 46 | num_beams: 1 47 | inference_method: "generate" 48 | 49 | seed: 42 50 | output_dir: "output/PNP-VQA-3b/OKVQA" 51 | 52 | evaluate: True 53 | test_splits: ["test"] 54 | 55 | # distribution-specific 56 | device: "cuda" 57 | world_size: 1 58 | dist_url: "env://" 59 | distributed: True 60 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/okvqa_eval_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: large 9 | 10 | datasets: 11 | ok_vqa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_question" 19 | 20 | run: 21 | task: vqa_reading_comprehension 22 | 23 | # optimization-specific 24 | batch_size_train: 12 25 | batch_size_eval: 12 26 | num_workers: 4 27 | 28 | # image question matching specific 29 | block_num: 7 30 | 31 | # image captioning specific 32 | top_k: 50 33 | top_p: 1 34 | cap_min_length: 10 35 | cap_max_length: 20 36 | repetition_penalty: 1 37 | num_patches: 20 38 | num_captions: 100 39 | prompt: 'a picture of ' 40 | 41 | # question answering specific 42 | internal_bsz_fid: 1 43 | num_captions_fid: 1 44 | min_len: 0 45 | max_len: 20 46 | num_beams: 1 47 | inference_method: "generate" 48 | 49 | seed: 42 50 | output_dir: "output/PNP-VQA-large/OKVQA" 51 | 52 | evaluate: True 53 | test_splits: ["test"] 54 | 55 | # distribution-specific 56 | device: "cuda" 57 | world_size: 1 58 | dist_url: "env://" 59 | distributed: True 60 | -------------------------------------------------------------------------------- /lavis/runners/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.runners.runner_base import RunnerBase 9 | from lavis.runners.runner_iter import RunnerIter 10 | 11 | __all__ = ["RunnerBase", "RunnerIter"] 12 | -------------------------------------------------------------------------------- /lavis/tasks/image_text_pretrain.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.tasks.base_task import BaseTask 10 | 11 | 12 | @registry.register_task("image_text_pretrain") 13 | class ImageTextPretrainTask(BaseTask): 14 | def __init__(self): 15 | super().__init__() 16 | 17 | def evaluation(self, model, data_loader, cuda_enabled=True): 18 | pass 19 | -------------------------------------------------------------------------------- /llava_llama_2/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /llava_llama_2/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 
5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "<image>" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 11 | DEFAULT_IM_START_TOKEN = "<im_start>" 12 | DEFAULT_IM_END_TOKEN = "<im_end>" 13 | -------------------------------------------------------------------------------- /llava_llama_2/eval/table/model.jsonl: -------------------------------------------------------------------------------- 1 | {"model_id": "vicuna-13b:20230322-clean-lang", "model_name": "vicuna-13b", "model_version": "20230322-clean-lang", "model_metadata": "vicuna-13b-20230322-clean-lang"} 2 | {"model_id": "alpaca-13b:v1", "model_name": "alpaca-13b", "model_version": "v1", "model_metadata": "alpaca-13b"} 3 | {"model_id": "llama-13b:v1", "model_name": "llama-13b", "model_version": "v1", "model_metadata": "hf-llama-13b"} 4 | {"model_id": "bard:20230327", "model_name": "bard", "model_version": "20230327", "model_metadata": "Google Bard 20230327"} 5 | {"model_id": "gpt-3.5-turbo:20230327", "model_name": "gpt-3.5-turbo", "model_version": "20230327", "model_metadata": "OpenAI ChatGPT gpt-3.5-turbo Chat Completion"} 6 | -------------------------------------------------------------------------------- /llava_llama_2/eval/table/reviewer.jsonl: -------------------------------------------------------------------------------- 1 | {"reviewer_id": "gpt-4-0328-default", "prompt_id": 1, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for general questions"} 2 | {"reviewer_id": "gpt-4-0328-coding", "prompt_id": 2, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for coding questions"} 3 | {"reviewer_id": "gpt-4-0328-math", "prompt_id": 3, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 4 | {"reviewer_id": "gpt-4-0417-visual", "prompt_id": 4, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 5 | -------------------------------------------------------------------------------- /llava_llama_2/eval/webpage/figures/alpaca.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/llava_llama_2/eval/webpage/figures/alpaca.png -------------------------------------------------------------------------------- /llava_llama_2/eval/webpage/figures/bard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/llava_llama_2/eval/webpage/figures/bard.jpg -------------------------------------------------------------------------------- /llava_llama_2/eval/webpage/figures/llama.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/llava_llama_2/eval/webpage/figures/llama.jpg -------------------------------------------------------------------------------- /llava_llama_2/eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llava_llama_2/eval/webpage/figures/vicuna.jpeg:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/llava_llama_2/eval/webpage/figures/vicuna.jpeg -------------------------------------------------------------------------------- /llava_llama_2/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 2 | from .language_model.llava_mpt import LlavaMPTForCausalLM, LlavaMPTConfig 3 | -------------------------------------------------------------------------------- /llava_llama_2/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava_llama_2.model.consolidate --src ~/model_weights/llava_llama_2-7b --dst ~/model_weights/llava_llama_2-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava_llama_2.model import * 10 | from llava_llama_2.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /llava_llama_2/model/language_model/mpt/custom_embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch import Tensor 5 | 6 | class SharedEmbedding(nn.Embedding): 7 | 8 | def forward(self, input: Tensor, unembed: bool=False) -> Tensor: 9 | if unembed: 10 | return F.linear(input, self.weight) 11 | return super().forward(input) -------------------------------------------------------------------------------- /llava_llama_2/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | from .clip_encoder import CLIPVisionTower 2 | 3 | 4 | def build_vision_tower(vision_tower_cfg, **kwargs): 5 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 6 | if vision_tower.startswith("openai") or vision_tower.startswith("laion"): 7 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 8 | 9 | raise ValueError(f'Unknown vision tower: {vision_tower}') 10 | -------------------------------------------------------------------------------- /llava_llama_2/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava_llama_2' in config and 'llava_llama_2' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while 
the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava_llama_2") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /llava_llama_2/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/llava_llama_2/serve/__init__.py -------------------------------------------------------------------------------- /llava_llama_2/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/llava_llama_2/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /llava_llama_2/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/llava_llama_2/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /llava_llama_2/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /llava_llama_2/train/train_mem.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: 2 | # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: 3 | # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn. 4 | 5 | # Need to call this before importing transformers. 
6 | from llava_llama_2.train.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn 7 | 8 | replace_llama_attn_with_flash_attn() 9 | 10 | from llava_llama_2.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /metric/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/metric/__init__.py -------------------------------------------------------------------------------- /metric/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/metric/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /metric/__pycache__/perspective_api.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/metric/__pycache__/perspective_api.cpython-38.pyc -------------------------------------------------------------------------------- /metric/detoxify.py: -------------------------------------------------------------------------------- 1 | from detoxify import Detoxify 2 | from typing import Dict, Optional, List 3 | 4 | class DetoxifyScorer: 5 | def __init__(self): 6 | self.model = Detoxify('original', device='cuda') 7 | def get_scores(self, input_text: str) -> Dict[str, float]: 8 | return self.model.predict(input_text) 9 | 10 | -------------------------------------------------------------------------------- /minigpt4/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/.DS_Store -------------------------------------------------------------------------------- /minigpt4/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from minigpt4.common.registry import registry 14 | 15 | from minigpt4.datasets.builders import * 16 | from minigpt4.models import * 17 | from minigpt4.processors import * 18 | from minigpt4.tasks import * 19 | 20 | 21 | root_dir = os.path.dirname(os.path.abspath(__file__)) 22 | default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml")) 23 | 24 | registry.register_path("library_root", root_dir) 25 | repo_root = os.path.join(root_dir, "..") 26 | registry.register_path("repo_root", repo_root) 27 | cache_root = os.path.join(repo_root, default_cfg.env.cache_root) 28 | registry.register_path("cache_root", cache_root) 29 | 30 | registry.register("MAX_INT", sys.maxsize) 31 | registry.register("SPLIT_NAMES", ["train", "val", "test"]) 32 | -------------------------------------------------------------------------------- /minigpt4/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/common/__init__.py -------------------------------------------------------------------------------- /minigpt4/common/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/common/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/common/__pycache__/config.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/common/__pycache__/config.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/common/__pycache__/dist_utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/common/__pycache__/dist_utils.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/common/__pycache__/logger.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/common/__pycache__/logger.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/common/__pycache__/registry.cpython-39.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/common/__pycache__/registry.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/common/__pycache__/utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/common/__pycache__/utils.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/common/gradcam.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | from scipy.ndimage import filters 4 | from skimage import transform as skimage_transform 5 | 6 | 7 | def getAttMap(img, attMap, blur=True, overlap=True): 8 | attMap -= attMap.min() 9 | if attMap.max() > 0: 10 | attMap /= attMap.max() 11 | attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant") 12 | if blur: 13 | attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2])) 14 | attMap -= attMap.min() 15 | attMap /= attMap.max() 16 | cmap = plt.get_cmap("jet") 17 | attMapV = cmap(attMap) 18 | attMapV = np.delete(attMapV, 3, 2) 19 | if overlap: 20 | attMap = ( 21 | 1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img 22 | + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV 23 | ) 24 | return attMap 25 | -------------------------------------------------------------------------------- /minigpt4/configs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/configs/.DS_Store -------------------------------------------------------------------------------- /minigpt4/configs/datasets/cc_sbu/align.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | cc_sbu_align: 3 | data_type: images 4 | build_info: 5 | storage: /path/to/cc_sbu_align/ 6 | -------------------------------------------------------------------------------- /minigpt4/configs/datasets/cc_sbu/defaults.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | cc_sbu: 3 | data_type: images 4 | build_info: 5 | storage: /path/to/cc_sbu_dataset/{00000..01255}.tar 6 | -------------------------------------------------------------------------------- /minigpt4/configs/datasets/laion/defaults.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | laion: 3 | data_type: images 4 | build_info: 5 | storage: /path/to/laion_dataset/{00000..10488}.tar 6 | -------------------------------------------------------------------------------- /minigpt4/configs/default.yaml: -------------------------------------------------------------------------------- 1 | env: 2 | # For default users 3 | # cache_root: "cache" 4 | # For internal use with persistent storage 5 | cache_root: "/export/home/.cache/minigpt4" 6 | -------------------------------------------------------------------------------- /minigpt4/configs/models/llama.yaml: 
-------------------------------------------------------------------------------- 1 | model: 2 | arch: mini_gpt4 3 | 4 | # vit encoder 5 | image_size: 224 6 | drop_path_rate: 0 7 | use_grad_checkpoint: False 8 | vit_precision: "fp16" 9 | freeze_vit: True 10 | freeze_qformer: True 11 | 12 | # Q-Former 13 | num_query_token: 32 14 | 15 | # Vicuna 16 | llama_model: "./ckpts/llama-13b" 17 | 18 | # generation configs 19 | prompt: "" 20 | 21 | preprocess: 22 | vis_processor: 23 | train: 24 | name: "blip2_image_train" 25 | image_size: 224 26 | eval: 27 | name: "blip2_image_eval" 28 | image_size: 224 29 | text_processor: 30 | train: 31 | name: "blip_caption" 32 | eval: 33 | name: "blip_caption" 34 | -------------------------------------------------------------------------------- /minigpt4/configs/models/minigpt4.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: mini_gpt4 3 | 4 | # vit encoder 5 | image_size: 224 6 | drop_path_rate: 0 7 | use_grad_checkpoint: False 8 | vit_precision: "fp16" 9 | freeze_vit: True 10 | freeze_qformer: True 11 | 12 | # Q-Former 13 | num_query_token: 32 14 | 15 | # Vicuna 16 | llama_model: "./ckpts/vicuna-13b-weights" 17 | 18 | # generation configs 19 | prompt: "" 20 | 21 | preprocess: 22 | vis_processor: 23 | train: 24 | name: "blip2_image_train" 25 | image_size: 224 26 | eval: 27 | name: "blip2_image_eval" 28 | image_size: 224 29 | text_processor: 30 | train: 31 | name: "blip_caption" 32 | eval: 33 | name: "blip_caption" 34 | -------------------------------------------------------------------------------- /minigpt4/conversation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/conversation/__init__.py -------------------------------------------------------------------------------- /minigpt4/conversation/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/conversation/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/conversation/__pycache__/conversation.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/conversation/__pycache__/conversation.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/__init__.py -------------------------------------------------------------------------------- /minigpt4/datasets/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/__pycache__/__init__.cpython-39.pyc 
-------------------------------------------------------------------------------- /minigpt4/datasets/__pycache__/data_utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/__pycache__/data_utils.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/datasets/builders/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/builders/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/datasets/builders/__pycache__/base_dataset_builder.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/builders/__pycache__/base_dataset_builder.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/datasets/builders/__pycache__/custom_builder.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/builders/__pycache__/custom_builder.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/datasets/builders/__pycache__/image_text_pair_builder.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/builders/__pycache__/image_text_pair_builder.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/datasets/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/datasets/__init__.py -------------------------------------------------------------------------------- /minigpt4/datasets/datasets/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/datasets/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/datasets/datasets/__pycache__/base_dataset.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/datasets/__pycache__/base_dataset.cpython-39.pyc 
-------------------------------------------------------------------------------- /minigpt4/datasets/datasets/__pycache__/caption_datasets.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/datasets/__pycache__/caption_datasets.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/datasets/datasets/__pycache__/cc_sbu_dataset.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/datasets/__pycache__/cc_sbu_dataset.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/datasets/datasets/__pycache__/dataloader_utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/datasets/__pycache__/dataloader_utils.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/datasets/datasets/__pycache__/laion_dataset.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/datasets/datasets/__pycache__/laion_dataset.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/datasets/datasets/laion_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import webdataset as wds 9 | from minigpt4.datasets.datasets.base_dataset import BaseDataset 10 | 11 | 12 | class LaionDataset(BaseDataset): 13 | def __init__(self, vis_processor, text_processor, location): 14 | super().__init__(vis_processor=vis_processor, text_processor=text_processor) 15 | 16 | self.inner_dataset = wds.DataPipeline( 17 | wds.ResampledShards(location), 18 | wds.tarfile_to_samples(handler=wds.warn_and_continue), 19 | wds.shuffle(1000, handler=wds.warn_and_continue), 20 | wds.decode("pilrgb", handler=wds.warn_and_continue), 21 | wds.to_tuple("jpg", "json", handler=wds.warn_and_continue), 22 | wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue), 23 | wds.map(self.to_dict, handler=wds.warn_and_continue), 24 | ) 25 | 26 | def to_dict(self, sample): 27 | return { 28 | "image": sample[0], 29 | "text_input": self.text_processor(sample[1]["caption"]), 30 | } 31 | 32 | -------------------------------------------------------------------------------- /minigpt4/models/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/models/.DS_Store -------------------------------------------------------------------------------- /minigpt4/models/__pycache__/Qformer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/models/__pycache__/Qformer.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/models/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/models/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/models/__pycache__/base_model.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/models/__pycache__/base_model.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/models/__pycache__/blip2.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/models/__pycache__/blip2.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/models/__pycache__/eva_vit.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/models/__pycache__/eva_vit.cpython-39.pyc 
-------------------------------------------------------------------------------- /minigpt4/models/__pycache__/mini_gpt4.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/models/__pycache__/mini_gpt4.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/models/__pycache__/modeling_llama.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/models/__pycache__/modeling_llama.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/processors/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from minigpt4.processors.base_processor import BaseProcessor 9 | from minigpt4.processors.blip_processors import ( 10 | Blip2ImageTrainProcessor, 11 | Blip2ImageEvalProcessor, 12 | BlipCaptionProcessor, 13 | ) 14 | 15 | from minigpt4.common.registry import registry 16 | 17 | __all__ = [ 18 | "BaseProcessor", 19 | "Blip2ImageTrainProcessor", 20 | "Blip2ImageEvalProcessor", 21 | "BlipCaptionProcessor", 22 | ] 23 | 24 | 25 | def load_processor(name, cfg=None): 26 | """ 27 | Example 28 | 29 | >>> processor = load_processor("alpro_video_train", cfg=None) 30 | """ 31 | processor = registry.get_processor_class(name).from_config(cfg) 32 | 33 | return processor 34 | -------------------------------------------------------------------------------- /minigpt4/processors/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/processors/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/processors/__pycache__/base_processor.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/processors/__pycache__/base_processor.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/processors/__pycache__/blip_processors.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/processors/__pycache__/blip_processors.cpython-39.pyc -------------------------------------------------------------------------------- /minigpt4/processors/__pycache__/randaugment.cpython-39.pyc: -------------------------------------------------------------------------------- 
--------------------------------------------------------------------------------
/minigpt4/processors/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/processors/__pycache__/__init__.cpython-39.pyc
--------------------------------------------------------------------------------
/minigpt4/processors/__pycache__/base_processor.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/processors/__pycache__/base_processor.cpython-39.pyc
--------------------------------------------------------------------------------
/minigpt4/processors/__pycache__/blip_processors.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/processors/__pycache__/blip_processors.cpython-39.pyc
--------------------------------------------------------------------------------
/minigpt4/processors/__pycache__/randaugment.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/processors/__pycache__/randaugment.cpython-39.pyc
--------------------------------------------------------------------------------
/minigpt4/processors/base_processor.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 | 
8 | from omegaconf import OmegaConf
9 | 
10 | 
11 | class BaseProcessor:
12 |     def __init__(self):
13 |         self.transform = lambda x: x
14 |         return
15 | 
16 |     def __call__(self, item):
17 |         return self.transform(item)
18 | 
19 |     @classmethod
20 |     def from_config(cls, cfg=None):
21 |         return cls()
22 | 
23 |     def build(self, **kwargs):
24 |         cfg = OmegaConf.create(kwargs)
25 | 
26 |         return self.from_config(cfg)
27 | 
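BaseProcessor is deliberately minimal: __call__ simply applies self.transform. A hypothetical subclass (illustrative only, not the repo's BlipCaptionProcessor) shows the intended extension pattern, where a subclass sets self.transform and overrides from_config:

from omegaconf import OmegaConf
from minigpt4.processors.base_processor import BaseProcessor

class ToyCaptionProcessor(BaseProcessor):
    """Illustrative example: lowercase a caption and cap its word count."""

    def __init__(self, max_words=50):
        super().__init__()
        self.max_words = max_words
        self.transform = self._process  # __call__ dispatches to self.transform

    def _process(self, caption):
        words = caption.lower().strip().split()
        return " ".join(words[: self.max_words])

    @classmethod
    def from_config(cls, cfg=None):
        cfg = cfg if cfg is not None else OmegaConf.create()
        return cls(max_words=cfg.get("max_words", 50))

proc = ToyCaptionProcessor(max_words=8)
print(proc("An adversarial image paired with a very long caption ..."))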
--------------------------------------------------------------------------------
/minigpt4/runners/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 | 
8 | from minigpt4.runners.runner_base import RunnerBase
9 | 
10 | __all__ = ["RunnerBase"]
11 | 
--------------------------------------------------------------------------------
/minigpt4/runners/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/runners/__pycache__/__init__.cpython-39.pyc
--------------------------------------------------------------------------------
/minigpt4/runners/__pycache__/runner_base.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/runners/__pycache__/runner_base.cpython-39.pyc
--------------------------------------------------------------------------------
/minigpt4/tasks/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 | 
8 | from minigpt4.common.registry import registry
9 | from minigpt4.tasks.base_task import BaseTask
10 | from minigpt4.tasks.image_text_pretrain import ImageTextPretrainTask
11 | 
12 | 
13 | def setup_task(cfg):
14 |     assert "task" in cfg.run_cfg, "Task name must be provided."
15 | 
16 |     task_name = cfg.run_cfg.task
17 |     task = registry.get_task_class(task_name).setup_task(cfg=cfg)
18 |     assert task is not None, "Task {} not properly registered.".format(task_name)
19 | 
20 |     return task
21 | 
22 | 
23 | __all__ = [
24 |     "BaseTask",
25 |     "ImageTextPretrainTask",
26 | ]
27 | 
--------------------------------------------------------------------------------
/minigpt4/tasks/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/tasks/__pycache__/__init__.cpython-39.pyc
--------------------------------------------------------------------------------
/minigpt4/tasks/__pycache__/base_task.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/tasks/__pycache__/base_task.cpython-39.pyc
--------------------------------------------------------------------------------
/minigpt4/tasks/__pycache__/image_text_pretrain.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models/5fc9883cfa9b4183cf05c6eceef49e449e45e9c9/minigpt4/tasks/__pycache__/image_text_pretrain.cpython-39.pyc
--------------------------------------------------------------------------------
/minigpt4/tasks/image_text_pretrain.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 | 
8 | from minigpt4.common.registry import registry
9 | from minigpt4.tasks.base_task import BaseTask
10 | 
11 | 
12 | @registry.register_task("image_text_pretrain")
13 | class ImageTextPretrainTask(BaseTask):
14 |     def __init__(self):
15 |         super().__init__()
16 | 
17 |     def evaluation(self, model, data_loader, cuda_enabled=True):
18 |         pass
19 | 
--------------------------------------------------------------------------------
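To tie the two task files together, a minimal sketch of how setup_task resolves the task registered as "image_text_pretrain" from a run config. The config layout below is an assumption for illustration (real run configs live in the repo's YAML files), and it assumes BaseTask.setup_task builds the task instance from cfg:

from omegaconf import OmegaConf
from minigpt4.tasks import setup_task

# Hypothetical minimal config exposing run_cfg.task, which setup_task reads.
cfg = OmegaConf.create({"run_cfg": {"task": "image_text_pretrain"}})

task = setup_task(cfg)        # registry lookup, then the task class's setup_task(cfg=cfg)
print(type(task).__name__)    # expected: ImageTextPretrainTask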