├── .gitignore ├── README.md ├── ckpt └── .gitkeep ├── data └── .gitkeep ├── docs ├── ch3ef.md ├── chef.md ├── lamm.md └── octavius.md ├── images ├── Ch3Ef_intro.png ├── ChEF-benchmark.png ├── ChEF-logo.png ├── LAMM-Dataset.png ├── LAMM-Framework.png ├── LAMM-benchmark.png ├── LAMM_2d_demo.png ├── LAMM_Imagewall.png ├── Octavius_arch.png ├── ch3ef-logo.png ├── lamm-title.png └── lamm-video.png ├── requirements ├── ChEF.txt ├── default.txt └── optional.txt └── src ├── ChEF ├── __init__.py ├── data_process │ ├── Omnibenchmark.py │ └── mmbench.py ├── evaluator.py ├── inferencer │ ├── Multiturn.py │ ├── Singleturn.py │ ├── __init__.py │ └── utils.py ├── instruction │ ├── __init__.py │ ├── ice_retriever │ │ ├── __init__.py │ │ ├── base_retriever.py │ │ ├── fixed_retriever.py │ │ ├── random_retriever.py │ │ ├── topk_retriever.py │ │ ├── topk_retriever_img.py │ │ └── utils.py │ ├── prompt.py │ └── template.py ├── metric │ ├── Ch3Ef.py │ ├── __init__.py │ ├── caption.py │ ├── classification.py │ ├── counting.py │ ├── desiderata.py │ ├── detection.py │ ├── lamm_locating.py │ ├── mmmu.py │ ├── mmmu_utils.py │ ├── ocr.py │ ├── utils.py │ ├── vqa.py │ └── winoground.py ├── models │ ├── __init__.py │ ├── instruct_blip │ │ ├── __init__.py │ │ ├── common │ │ │ ├── config.py │ │ │ ├── dist_utils.py │ │ │ ├── gradcam.py │ │ │ ├── logger.py │ │ │ ├── optims.py │ │ │ ├── registry.py │ │ │ ├── utils.py │ │ │ └── vqa_tools │ │ │ │ ├── __init__.py │ │ │ │ ├── vqa.py │ │ │ │ └── vqa_eval.py │ │ ├── configs │ │ │ ├── default.yaml │ │ │ └── models │ │ │ │ ├── albef_classification_ve.yaml │ │ │ │ ├── albef_feature_extractor.yaml │ │ │ │ ├── albef_nlvr.yaml │ │ │ │ ├── albef_pretrain_base.yaml │ │ │ │ ├── albef_retrieval_coco.yaml │ │ │ │ ├── albef_retrieval_flickr.yaml │ │ │ │ ├── albef_vqav2.yaml │ │ │ │ ├── alpro_qa_msrvtt.yaml │ │ │ │ ├── alpro_qa_msvd.yaml │ │ │ │ ├── alpro_retrieval_didemo.yaml │ │ │ │ ├── alpro_retrieval_msrvtt.yaml │ │ │ │ ├── bert_config.json │ │ │ │ ├── bert_config_alpro.json │ │ │ │ ├── blip2 │ │ │ │ ├── blip2_caption_flant5xl.yaml │ │ │ │ ├── blip2_caption_opt2.7b.yaml │ │ │ │ ├── blip2_caption_opt6.7b.yaml │ │ │ │ ├── blip2_coco.yaml │ │ │ │ ├── blip2_instruct_flant5xl.yaml │ │ │ │ ├── blip2_instruct_flant5xxl.yaml │ │ │ │ ├── blip2_instruct_vicuna13b.yaml │ │ │ │ ├── blip2_instruct_vicuna7b.yaml │ │ │ │ ├── blip2_pretrain.yaml │ │ │ │ ├── blip2_pretrain_flant5xl.yaml │ │ │ │ ├── blip2_pretrain_flant5xl_vitL.yaml │ │ │ │ ├── blip2_pretrain_flant5xxl.yaml │ │ │ │ ├── blip2_pretrain_llama7b.yaml │ │ │ │ ├── blip2_pretrain_opt2.7b.yaml │ │ │ │ ├── blip2_pretrain_opt6.7b.yaml │ │ │ │ └── blip2_pretrain_vitL.yaml │ │ │ │ ├── blip_caption_base_coco.yaml │ │ │ │ ├── blip_caption_large_coco.yaml │ │ │ │ ├── blip_classification_base.yaml │ │ │ │ ├── blip_feature_extractor_base.yaml │ │ │ │ ├── blip_itm_base.yaml │ │ │ │ ├── blip_itm_large.yaml │ │ │ │ ├── blip_nlvr.yaml │ │ │ │ ├── blip_pretrain_base.yaml │ │ │ │ ├── blip_pretrain_large.yaml │ │ │ │ ├── blip_retrieval_coco.yaml │ │ │ │ ├── blip_retrieval_flickr.yaml │ │ │ │ ├── blip_vqa_aokvqa.yaml │ │ │ │ ├── blip_vqa_okvqa.yaml │ │ │ │ ├── blip_vqav2.yaml │ │ │ │ ├── clip │ │ │ │ ├── RN101-quickgelu.json │ │ │ │ ├── RN101.json │ │ │ │ ├── RN50-quickgelu.json │ │ │ │ ├── RN50.json │ │ │ │ ├── RN50x16.json │ │ │ │ ├── RN50x4.json │ │ │ │ ├── ViT-B-16-plus-240.json │ │ │ │ ├── ViT-B-16-plus.json │ │ │ │ ├── ViT-B-16.json │ │ │ │ ├── ViT-B-32-plus-256.json │ │ │ │ ├── ViT-B-32-quickgelu.json │ │ │ │ ├── ViT-B-32.json │ │ │ │ ├── ViT-H-14.json │ │ 
│ │ ├── ViT-H-16.json │ │ │ │ ├── ViT-L-14-280.json │ │ │ │ ├── ViT-L-14-336.json │ │ │ │ ├── ViT-L-14.json │ │ │ │ ├── ViT-L-16-320.json │ │ │ │ ├── ViT-L-16.json │ │ │ │ ├── ViT-g-14.json │ │ │ │ ├── timm-efficientnetv2_rw_s.json │ │ │ │ ├── timm-resnet50d.json │ │ │ │ ├── timm-resnetaa50d.json │ │ │ │ ├── timm-resnetblur50.json │ │ │ │ ├── timm-swin_base_patch4_window7_224.json │ │ │ │ ├── timm-vit_base_patch16_224.json │ │ │ │ ├── timm-vit_base_patch32_224.json │ │ │ │ └── timm-vit_small_patch16_224.json │ │ │ │ ├── clip_resnet50.yaml │ │ │ │ ├── clip_vit_base16.yaml │ │ │ │ ├── clip_vit_base32.yaml │ │ │ │ ├── clip_vit_large14.yaml │ │ │ │ ├── clip_vit_large14_336.yaml │ │ │ │ ├── gpt_dialogue_base.yaml │ │ │ │ ├── img2prompt-vqa │ │ │ │ └── img2prompt_vqa_base.yaml │ │ │ │ ├── med_config.json │ │ │ │ ├── med_config_albef.json │ │ │ │ ├── med_large_config.json │ │ │ │ └── pnp-vqa │ │ │ │ ├── pnp_vqa_3b.yaml │ │ │ │ ├── pnp_vqa_base.yaml │ │ │ │ ├── pnp_vqa_large.yaml │ │ │ │ ├── unifiedqav2_3b_config.json │ │ │ │ ├── unifiedqav2_base_config.json │ │ │ │ └── unifiedqav2_large_config.json │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── base_model.py │ │ │ ├── blip2_models │ │ │ │ ├── Qformer.py │ │ │ │ ├── __init__.py │ │ │ │ ├── blip2.py │ │ │ │ ├── blip2_image_text_matching.py │ │ │ │ ├── blip2_opt.py │ │ │ │ ├── blip2_qformer.py │ │ │ │ ├── blip2_t5.py │ │ │ │ ├── blip2_t5_instruct.py │ │ │ │ ├── blip2_vicuna_instruct.py │ │ │ │ ├── modeling_llama.py │ │ │ │ ├── modeling_opt.py │ │ │ │ └── modeling_t5.py │ │ │ ├── blip_models │ │ │ │ ├── __init__.py │ │ │ │ ├── blip.py │ │ │ │ ├── blip_caption.py │ │ │ │ ├── blip_classification.py │ │ │ │ ├── blip_feature_extractor.py │ │ │ │ ├── blip_image_text_matching.py │ │ │ │ ├── blip_nlvr.py │ │ │ │ ├── blip_outputs.py │ │ │ │ ├── blip_pretrain.py │ │ │ │ ├── blip_retrieval.py │ │ │ │ ├── blip_vqa.py │ │ │ │ └── nlvr_encoder.py │ │ │ ├── clip_vit.py │ │ │ ├── eva_vit.py │ │ │ ├── med.py │ │ │ ├── timesformer │ │ │ │ ├── __init__.py │ │ │ │ ├── conv2d_same.py │ │ │ │ ├── features.py │ │ │ │ ├── helpers.py │ │ │ │ ├── linear.py │ │ │ │ ├── vit.py │ │ │ │ └── vit_utils.py │ │ │ └── vit.py │ │ └── processors │ │ │ ├── __init__.py │ │ │ ├── base_processor.py │ │ │ ├── blip_processors.py │ │ │ ├── clip_processors.py │ │ │ ├── functional_video.py │ │ │ ├── gpt_processors.py │ │ │ └── randaugment.py │ ├── internlm │ │ ├── __init__.py │ │ ├── build_mlp.py │ │ ├── configuration_internlm_xcomposer2.py │ │ ├── modeling_internlm2.py │ │ ├── modeling_internlm_xcomposer2.py │ │ ├── rewrite_modeling_internlm_xcomposer2.py │ │ ├── tokenization_internlm_xcomposer2.py │ │ └── zero_to_fp32.py │ ├── kosmos2 │ │ ├── data │ │ │ ├── dict.txt │ │ │ └── sentencepiece.bpe.model │ │ ├── unilm │ │ │ ├── __init__.py │ │ │ ├── data │ │ │ │ └── utils.py │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ ├── connector.py │ │ │ │ ├── gpt.py │ │ │ │ ├── gpt_eval.py │ │ │ │ ├── unigpt.py │ │ │ │ └── vl │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── clip.py │ │ │ │ │ └── vlm_generator.py │ │ │ └── tasks │ │ │ │ ├── __init__.py │ │ │ │ └── generation_obj.py │ │ └── utils.py │ ├── llama_adapter_v2 │ │ ├── __init__.py │ │ ├── llama.py │ │ ├── llama_adapter.py │ │ ├── tokenizer.py │ │ └── utils.py │ ├── minigpt4 │ │ ├── __init__.py │ │ ├── common │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ ├── dist_utils.py │ │ │ ├── gradcam.py │ │ │ ├── logger.py │ │ │ ├── optims.py │ │ │ ├── registry.py │ │ │ └── utils.py │ │ ├── configs │ │ │ ├── default.yaml │ │ │ └── models │ │ │ │ └── minigpt4.yaml │ 
│ ├── conversation │ │ │ ├── __init__.py │ │ │ └── conversation.py │ │ ├── minigpt4_eval.yaml │ │ ├── models │ │ │ ├── Qformer.py │ │ │ ├── __init__.py │ │ │ ├── base_model.py │ │ │ ├── blip2.py │ │ │ ├── blip2_outputs.py │ │ │ ├── eva_vit.py │ │ │ ├── mini_gpt4.py │ │ │ └── modeling_llama.py │ │ └── processors │ │ │ ├── __init__.py │ │ │ ├── base_processor.py │ │ │ ├── blip_processors.py │ │ │ └── randaugment.py │ ├── mplug_owl │ │ ├── __init__.py │ │ ├── configuration_mplug_owl.py │ │ ├── modeling_mplug_owl.py │ │ ├── processing_mplug_owl.py │ │ └── tokenization_mplug_owl.py │ ├── otter │ │ ├── __init__.py │ │ ├── config.json │ │ ├── configuration_otter.py │ │ ├── flamingo_pt2otter_hf.py │ │ ├── modeling_otter.py │ │ └── otter_pt2otter_hf.py │ ├── qwen │ │ ├── __init__.py │ │ ├── configuration_qwen.py │ │ ├── modeling_qwen.py │ │ ├── qwen_generation_utils.py │ │ ├── tokenization_qwen.py │ │ └── visual.py │ ├── rlhfv │ │ ├── __init__.py │ │ ├── beit3.py │ │ ├── conversation.py │ │ ├── llava.py │ │ ├── muffin.py │ │ └── utils.py │ ├── shikra │ │ ├── __init__.py │ │ ├── builder │ │ │ ├── __init__.py │ │ │ ├── build_shikra.py │ │ │ └── builder.py │ │ ├── conversation │ │ │ ├── __init__.py │ │ │ └── base_conversation.py │ │ ├── dataset │ │ │ ├── __init__.py │ │ │ ├── builder.py │ │ │ ├── process_function │ │ │ │ ├── __init__.py │ │ │ │ ├── box_process_function.py │ │ │ │ └── shikra_process_function.py │ │ │ ├── root.py │ │ │ ├── single_image_convsation.py │ │ │ ├── single_image_interactive.py │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── compute_metrics.py │ │ │ │ ├── concatenate_dataset.py │ │ │ │ ├── flickr30k_entities_utils.py │ │ │ │ ├── io.py │ │ │ │ ├── mixin.py │ │ │ │ └── transform.py │ │ ├── shikra │ │ │ ├── __init__.py │ │ │ ├── apply_delta.py │ │ │ ├── make_delta.py │ │ │ └── shikra.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── common.py │ │ │ └── llama_flash_attn_monkey_patch.py │ ├── test_base.py │ ├── test_gemini.py │ ├── test_gpt.py │ ├── test_instructblip.py │ ├── test_internlmxcomposer.py │ ├── test_kosmos.py │ ├── test_lamm.py │ ├── test_lamm15.py │ ├── test_llamaadapterv2.py │ ├── test_llava15.py │ ├── test_llavarlhf.py │ ├── test_minigpt4.py │ ├── test_mplugowl.py │ ├── test_octavius.py │ ├── test_otter.py │ ├── test_qwenvl.py │ ├── test_rlhfv.py │ ├── test_shikra.py │ └── utils.py ├── resources │ └── ChEF-logo.png ├── scenario │ ├── Ch3Ef_dataset.py │ ├── LAMM_dataset.py │ ├── MMBench_dataset.py │ ├── MME_dataset.py │ ├── MMMU_dataset.py │ ├── POPE_dataset.py │ ├── SEED_Bench2_dataset.py │ ├── SEED_Bench_dataset.py │ ├── Winoground_dataset.py │ ├── __init__.py │ ├── caption_dataset.py │ ├── classification.py │ ├── counting_dataset.py │ ├── det_dataset.py │ ├── lamm_sysmsg.py │ ├── octavius_pcl_dataset.py │ ├── utils.py │ └── vqa_dataset.py ├── test │ ├── test_model.py │ ├── test_model.sh │ └── test_recipes.py └── tools │ └── eval_results.py ├── cli_demo.py ├── config ├── ChEF │ ├── desiderata_recipes │ │ ├── Calibration │ │ │ ├── MMBench.yaml │ │ │ └── ScienceQA.yaml │ │ ├── Hallucination │ │ │ ├── POPE_adversarial.yaml │ │ │ ├── POPE_popular.yaml │ │ │ └── POPE_random.yaml │ │ ├── ICL │ │ │ ├── MMBench.yaml │ │ │ └── ScienceQA.yaml │ │ ├── Insfollow │ │ │ ├── MMBench.yaml │ │ │ └── ScienceQA.yaml │ │ └── Robust │ │ │ ├── MMBench.yaml │ │ │ └── ScienceQA.yaml │ ├── models │ │ ├── gemini.yaml │ │ ├── gpt.yaml │ │ ├── instructblip_vicuna.yaml │ │ ├── internlm_xcomposer.yaml │ │ ├── kosmos2.yaml │ │ ├── lamm.yaml │ │ ├── lamm15.yaml │ │ ├── lamm_3d.yaml │ │ ├── 
llamaadapterv2.yaml │ │ ├── llava15.yaml │ │ ├── llavarlhf.yaml │ │ ├── minigpt4.yaml │ │ ├── mplug.yaml │ │ ├── octavius_2d+3d.yaml │ │ ├── octavius_2d.yaml │ │ ├── octavius_3d.yaml │ │ ├── otter.yaml │ │ ├── qwen_vl.yaml │ │ ├── rlhfv.yaml │ │ ├── shikra.yaml │ │ └── test.yaml │ └── scenario_recipes │ │ ├── CIFAR10 │ │ ├── direct.yaml │ │ └── ppl.yaml │ │ ├── Ch3Ef │ │ ├── Harmless.yaml │ │ ├── Harmless_ppl.yaml │ │ ├── Helpful.yaml │ │ ├── Helpful_ppl.yaml │ │ ├── Honest.yaml │ │ └── Honest_ppl.yaml │ │ ├── FSC147 │ │ ├── direct.yaml │ │ └── ppl.yaml │ │ ├── Flickr30k │ │ ├── direct.yaml │ │ ├── random_ppl.yaml │ │ └── topp_ppl.yaml │ │ ├── LAMM │ │ ├── AI2D.yaml │ │ ├── CIFAR10.yaml │ │ ├── CelebA_hair.yaml │ │ ├── CelebA_smile.yaml │ │ ├── FSC147.yaml │ │ ├── Flickr30k.yaml │ │ ├── SVT.yaml │ │ ├── ScanNet.yaml │ │ ├── ScanQA.yaml │ │ ├── ScanRefer.yaml │ │ ├── ScienceQA.yaml │ │ ├── UCMerced.yaml │ │ ├── VOC2012.yaml │ │ ├── locating_LSP.yaml │ │ └── locating_VOC2012.yaml │ │ ├── MMBench │ │ ├── direct.yaml │ │ └── ppl.yaml │ │ ├── MME │ │ ├── direct.yaml │ │ └── ppl.yaml │ │ ├── MMMU │ │ └── default.yaml │ │ ├── Octavius3D │ │ ├── nr3d_caption_direct3d.yaml │ │ ├── scan_caption_direct3d.yaml │ │ ├── scan_cls_direct3d.yaml │ │ ├── scan_vqa_direct3d.yaml │ │ └── shapenet_cls_direct3d.yaml │ │ ├── Omnibenchmark │ │ ├── multiturn_direct.yaml │ │ ├── multiturn_ppl.yaml │ │ ├── singleturn_direct.yaml │ │ └── singleturn_ppl.yaml │ │ ├── SEEDBench-2 │ │ └── ppl.yaml │ │ ├── SEEDBench │ │ └── default.yaml │ │ ├── ScienceQA │ │ ├── direct.yaml │ │ ├── direct_CoT.yaml │ │ ├── ppl.yaml │ │ └── ppl_CoT.yaml │ │ ├── VOC2012 │ │ ├── kosmos2_multiturn_direct.yaml │ │ ├── kosmos2_multiturn_ppl.yaml │ │ ├── kosmos2_singleturn_direct.yaml │ │ ├── multiturn_direct.yaml │ │ ├── multiturn_ppl.yaml │ │ ├── shikra_multiturn_direct.yaml │ │ ├── shikra_multiturn_ppl.yaml │ │ ├── shikra_singleturn_direct.yaml │ │ └── singleturn_direct.yaml │ │ └── Winoground │ │ └── default.yaml ├── LAMM │ ├── train.yaml │ ├── train_ds3.yaml │ └── train_sft.yaml └── Octavius │ ├── octavius_2d+3d_e6_bs64.yaml │ ├── octavius_2d_e4_bs64.yaml │ └── octavius_3d_e3_bs64.yaml ├── datasets ├── __init__.py ├── dataset.py ├── samplers.py ├── system_msg.py └── utils.py ├── dist.py ├── eval.py ├── model ├── LAMM │ ├── CLIP │ │ ├── __init__.py │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── clip.py │ │ ├── model.py │ │ └── simple_tokenizer.py │ ├── EPCL │ │ ├── __init__.py │ │ ├── epcl.py │ │ ├── helpers.py │ │ ├── model_3detr.py │ │ ├── position_embedding.py │ │ ├── third_party │ │ │ └── pointnet2 │ │ │ │ ├── _ext_src │ │ │ │ ├── include │ │ │ │ │ ├── ball_query.h │ │ │ │ │ ├── cuda_utils.h │ │ │ │ │ ├── group_points.h │ │ │ │ │ ├── interpolate.h │ │ │ │ │ ├── sampling.h │ │ │ │ │ └── utils.h │ │ │ │ └── src │ │ │ │ │ ├── ball_query.cpp │ │ │ │ │ ├── ball_query_gpu.cu │ │ │ │ │ ├── bindings.cpp │ │ │ │ │ ├── group_points.cpp │ │ │ │ │ ├── group_points_gpu.cu │ │ │ │ │ ├── interpolate.cpp │ │ │ │ │ ├── interpolate_gpu.cu │ │ │ │ │ ├── sampling.cpp │ │ │ │ │ └── sampling_gpu.cu │ │ │ │ ├── build │ │ │ │ └── temp.linux-x86_64-cpython-310 │ │ │ │ │ ├── .ninja_deps │ │ │ │ │ ├── .ninja_log │ │ │ │ │ └── build.ninja │ │ │ │ ├── pointnet2_modules.py │ │ │ │ ├── pointnet2_test.py │ │ │ │ ├── pointnet2_utils.py │ │ │ │ ├── pytorch_utils.py │ │ │ │ └── setup.py │ │ ├── transformer.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── ap_calculator.py │ │ │ ├── box_intersection.c │ │ │ ├── box_intersection.pyx │ │ │ ├── box_ops3d.py │ │ │ ├── box_util.py │ 
│ │ ├── cython_compile.py │ │ │ ├── cython_compile.sh │ │ │ ├── dist.py │ │ │ ├── download_weights.py │ │ │ ├── eval_det.py │ │ │ ├── io.py │ │ │ ├── logger.py │ │ │ ├── misc.py │ │ │ ├── nms.py │ │ │ ├── pc_util.py │ │ │ └── random_cuboid.py │ ├── README.md │ ├── __init__.py │ ├── conversations.py │ ├── flash_attn_patch.py │ ├── modeling_lightllm.py │ ├── modeling_llama.py │ ├── openlamm.py │ ├── utils │ │ ├── __init__.py │ │ ├── data.py │ │ ├── helpers.py │ │ ├── multimodal_preprocessors.py │ │ └── pcl_utils.py │ └── xformers_patch.py ├── Octavius │ ├── __init__.py │ ├── moe │ │ ├── __init__.py │ │ ├── layer.py │ │ └── moe_lora.py │ ├── octavius.py │ └── resampler3d.py ├── __init__.py ├── llava │ ├── __init__.py │ ├── constants.py │ ├── conversation.py │ ├── mm_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── apply_delta.py │ │ ├── builder.py │ │ ├── consolidate.py │ │ ├── language_model │ │ │ ├── llava_llama.py │ │ │ ├── llava_mpt.py │ │ │ └── mpt │ │ │ │ ├── adapt_tokenizer.py │ │ │ │ ├── attention.py │ │ │ │ ├── blocks.py │ │ │ │ ├── configuration_mpt.py │ │ │ │ ├── custom_embedding.py │ │ │ │ ├── flash_attn_triton.py │ │ │ │ ├── hf_prefixlm_converter.py │ │ │ │ ├── meta_init_context.py │ │ │ │ ├── modeling_mpt.py │ │ │ │ ├── norm.py │ │ │ │ └── param_init_fns.py │ │ ├── llava_arch.py │ │ ├── make_delta.py │ │ ├── multimodal_encoder │ │ │ ├── builder.py │ │ │ └── clip_encoder.py │ │ ├── multimodal_projector │ │ │ └── builder.py │ │ └── utils.py │ ├── train │ │ ├── llama_flash_attn_monkey_patch.py │ │ ├── llama_xformers_attn_monkey_patch.py │ │ ├── llava_trainer.py │ │ ├── train.py │ │ ├── train_mem.py │ │ └── train_xformers.py │ └── utils.py └── training_agent.py ├── slurm_eval.sh ├── slurm_eval_icl.sh ├── tools ├── ChEF │ ├── eval_calibration.py │ ├── eval_hallucination.py │ ├── eval_icl.py │ ├── eval_insfollow.py │ ├── eval_langperf.py │ └── eval_robust.py ├── LAMM │ ├── eval_lamm2d.sh │ ├── eval_lamm3d.sh │ ├── train_lamm2d.sh │ ├── train_lamm2d_sft_stg1_slurm.sh │ ├── train_lamm2d_sft_stg2_slurm.sh │ ├── train_lamm2d_slurm.sh │ ├── train_lamm3d.sh │ └── train_lamm3d_slurm.sh └── Octavius │ ├── ULIP │ ├── .DS_Store │ ├── data │ │ ├── .DS_Store │ │ ├── ScanRefer.yaml │ │ ├── ScanReferValid.yaml │ │ ├── dataset_3d.py │ │ ├── dataset_catalog.json │ │ ├── labels.json │ │ └── templates.json │ ├── main.py │ ├── models │ │ ├── .DS_Store │ │ ├── ULIP_models.py │ │ ├── losses.py │ │ ├── pointbert │ │ │ ├── .DS_Store │ │ │ ├── PointTransformer_8192point.yaml │ │ │ ├── checkpoint.py │ │ │ ├── dvae.py │ │ │ ├── logger.py │ │ │ ├── misc.py │ │ │ └── point_encoder.py │ │ └── pointnet2 │ │ │ ├── .DS_Store │ │ │ ├── pointnet2.py │ │ │ └── pointnet2_utils.py │ ├── scripts │ │ └── pretrain_pointbert.sh │ └── utils │ │ ├── .DS_Store │ │ ├── __init__.py │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── build.py │ │ ├── config.py │ │ ├── io.py │ │ ├── logger.py │ │ ├── registry.py │ │ ├── tokenizer.py │ │ └── utils.py │ ├── octavius_ChEF.sh │ └── train_octavius_slurm.sh └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | # source file related 2 | *.pyc 3 | *.o 4 | 5 | 6 | 7 | *.so 8 | *.egg 9 | *.egg-info 10 | 11 | 12 | 13 | # training related 14 | *.log 15 | *.pth 16 | *.pt 17 | *.model 18 | !*.bpe.model 19 | *.0 20 | 21 | # result related 22 | answers/ 23 | results/ 24 | ckpt/* 25 | !ckpt/.gitkeep 26 | 27 | *.jsonl 28 | 29 | 30 | # assets related 31 | data/* 32 | !data/.gitkeep 33 | model_zoo/* 34 | !model_zoo/.gitkeep 35 | 36 | # package related 
37 | src/run.sh 38 | 39 | *.ttf 40 | -------------------------------------------------------------------------------- /ckpt/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/ckpt/.gitkeep -------------------------------------------------------------------------------- /data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/data/.gitkeep -------------------------------------------------------------------------------- /images/Ch3Ef_intro.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/images/Ch3Ef_intro.png -------------------------------------------------------------------------------- /images/ChEF-benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/images/ChEF-benchmark.png -------------------------------------------------------------------------------- /images/ChEF-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/images/ChEF-logo.png -------------------------------------------------------------------------------- /images/LAMM-Dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/images/LAMM-Dataset.png -------------------------------------------------------------------------------- /images/LAMM-Framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/images/LAMM-Framework.png -------------------------------------------------------------------------------- /images/LAMM-benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/images/LAMM-benchmark.png -------------------------------------------------------------------------------- /images/LAMM_2d_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/images/LAMM_2d_demo.png -------------------------------------------------------------------------------- /images/LAMM_Imagewall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/images/LAMM_Imagewall.png -------------------------------------------------------------------------------- /images/Octavius_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/images/Octavius_arch.png -------------------------------------------------------------------------------- /images/ch3ef-logo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/images/ch3ef-logo.png
--------------------------------------------------------------------------------
/images/lamm-title.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/images/lamm-title.png
--------------------------------------------------------------------------------
/images/lamm-video.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/images/lamm-video.png
--------------------------------------------------------------------------------
/requirements/ChEF.txt:
--------------------------------------------------------------------------------
PyYAML==6.0.1
tqdm==4.64.1
pandas==2.0.3
transformers==4.31.0
sentence-transformers==2.2.2
--------------------------------------------------------------------------------
/requirements/default.txt:
--------------------------------------------------------------------------------
data
ninja
accelerate>=0.20.3
einops==0.6.1
ftfy==6.1.1
iopath==0.1.10
ipdb==0.13.13
numpy==1.24.3
Pillow==9.5.0
PyYAML==6.0.1
regex==2022.10.31
pytorchvideo
fvcore
decord==0.6.0
tqdm
setuptools==65.5.1
bigmodelvis
nltk
tensorboard
cython
plyfile
trimesh
sentencepiece
--------------------------------------------------------------------------------
/requirements/optional.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/requirements/optional.txt
--------------------------------------------------------------------------------
/src/ChEF/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/ChEF/__init__.py
--------------------------------------------------------------------------------
/src/ChEF/data_process/mmbench.py:
--------------------------------------------------------------------------------
import os
import io
import base64
import pandas as pd
from PIL import Image

def decode_base64_to_image(base64_string):
    image_data = base64.b64decode(base64_string)
    image = Image.open(io.BytesIO(image_data))
    return image

def main(split='dev'):
    base_path = '../../../data/MMBench'
    save_image_dir = os.path.join(base_path, 'images')
    os.makedirs(save_image_dir, exist_ok=True)
    df = pd.read_csv(os.path.join(base_path, f'mmbench_{split}_20230712.tsv'), sep='\t')
    for i in range(len(df)):
        image = df.iloc[i]['image']
        index = df.iloc[i]['index']
        image = decode_base64_to_image(image)
        image_name = f'mmbench_image_{index}.png'
        image.save(os.path.join(save_image_dir, image_name))


if __name__ == '__main__':
    main()
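A quick way to sanity-check the decoding step before exporting every image is to decode a single row; a minimal sketch reusing the helpers above (the TSV file name and the 'image'/'index' columns come from the script itself, everything else is illustrative):

    # Hypothetical spot-check: decode the first dev-split row and inspect it.
    base_path = '../../../data/MMBench'
    df = pd.read_csv(os.path.join(base_path, 'mmbench_dev_20230712.tsv'), sep='\t')
    sample = df.iloc[0]
    image = decode_base64_to_image(sample['image'])  # returns a PIL.Image.Image
    print(sample['index'], image.size)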
--------------------------------------------------------------------------------
/src/ChEF/inferencer/__init__.py:
--------------------------------------------------------------------------------
from .Singleturn import Direct3D_Inferencer, Direct_Inferencer, PPL_Inferencer
from .Multiturn import Multi_Turn_PPL_Inferencer, Multi_Direct_Inferencer

inferencer_dict = {
    'Direct': Direct_Inferencer,
    'Direct3D': Direct3D_Inferencer,
    'PPL': PPL_Inferencer,
    'Multi_PPL': Multi_Turn_PPL_Inferencer,
    'Multi_Direct': Multi_Direct_Inferencer,
}

def build_inferencer(inferencer_type, **kwargs):
    return inferencer_dict[inferencer_type](**kwargs)
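build_inferencer is a thin factory over inferencer_dict: the first argument selects the class by name and every remaining keyword argument is forwarded untouched to that class's constructor, presumably filled in from one of the recipe YAMLs under config/ChEF. A minimal sketch of selecting an inferencer (the keyword arguments shown are hypothetical placeholders, used only to show that **kwargs is passed through):

    # 'PPL' resolves to PPL_Inferencer via inferencer_dict above.
    # dataset_name / save_base_dir are illustrative kwargs, not a documented API.
    inferencer = build_inferencer(
        'PPL',
        dataset_name='ScienceQA',
        save_base_dir='results',
    )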
--------------------------------------------------------------------------------
/src/ChEF/inferencer/utils.py:
--------------------------------------------------------------------------------
from PIL import Image

def copy_batch_dict(batch, idx):
    answer_dict = {}
    for key in batch.keys():
        if not isinstance(batch[key], list):
            answer_dict[key] = batch[key]
        else:
            answer_dict[key] = batch[key][idx]
    if 'image_path' in answer_dict and isinstance(answer_dict['image_path'], Image.Image):  # image_path in MMBench is Image.Image
        del answer_dict['image_path']
    return answer_dict
--------------------------------------------------------------------------------
/src/ChEF/instruction/ice_retriever/__init__.py:
--------------------------------------------------------------------------------
from .random_retriever import RandomRetriever
from .topk_retriever import TopkRetriever
from .fixed_retriever import FixedRetriever
from .topk_retriever_img import ImageTopkRetriever


retriever_dict = {
    'random': RandomRetriever,
    'topk_text': TopkRetriever,
    'fixed': FixedRetriever,
    'topk_img': ImageTopkRetriever
}

def build_retriever(train_dataset, test_dataset, retriever_type, **kwargs):
    build_fuc = retriever_dict[retriever_type]
    return build_fuc(train_dataset, test_dataset, **kwargs)
--------------------------------------------------------------------------------
/src/ChEF/metric/Ch3Ef.py:
--------------------------------------------------------------------------------
from tqdm import tqdm
from .utils import Base_Metric

class Ch3Ef_Metric(Base_Metric):

    def __init__(self, dataset_name, ppl=False, **kwargs):
        super().__init__(dataset_name)
        self.ppl = ppl

    def ppl_metric(self, answers):
        score = 0.0
        for item in tqdm(answers, desc="Running Metric"):
            if "I don't know." in item['options']:
                gt = item['options'][:2]
            else:
                gt = item['options'][:1]
            pred_text = item['answer']
            result = pred_text in gt
            score += result
            item['metric_result'] = result
        score = score / len(answers) * 100
        return dict(
            ACC = score,
        ), answers

    def metric_func(self, answers):
        if self.ppl:
            return self.ppl_metric(answers)
        return dict(), answers
--------------------------------------------------------------------------------
/src/ChEF/metric/counting.py:
--------------------------------------------------------------------------------
from tqdm import tqdm
from .utils import Base_Metric

class Counting(Base_Metric):
    def __init__(self, dataset_name, inference_type='direct', **kwargs):
        super().__init__(dataset_name)
        self.inference_type = inference_type
        assert self.inference_type in ['direct', 'ppl']
        from .utils import ennum2numerical
        self.parse_num_func = ennum2numerical

    def mae_metric(self, answers):
        score = 0
        for item in tqdm(answers, desc="Running MAE Metric"):
            gt_num = item['gt_answers']
            text = item['answer']
            pred_num = self.parse_num_func(text)
            score += min(gt_num, abs(pred_num - gt_num))
        return score / len(answers)

    def acc_metric(self, answers):
        score = 0
        for item in tqdm(answers, desc="Running ACC Metric"):
            gt_num = item['gt_answers']
            text = item['answer']
            pred_num = self.parse_num_func(text)
            score += (pred_num == gt_num)
        return score / len(answers) * 100

    def metric_func(self, answers):
        res_dict = {}
        if self.inference_type == 'direct':
            res_dict['MAE'] = self.mae_metric(answers)
        res_dict['ACC'] = self.acc_metric(answers)
        return res_dict, answers
--------------------------------------------------------------------------------
/src/ChEF/metric/ocr.py:
--------------------------------------------------------------------------------
from tqdm import tqdm

from .utils import Base_Metric, parse_caption_sentence

class SVT_OCR(Base_Metric):

    def __init__(self, dataset_name, **kwargs):
        super().__init__(dataset_name)

    def metric_func(self, answers):
        score = 0.0
        for item in tqdm(answers, desc="Running Metric"):
            gt_word_list = item['gt_answers']
            pred_text = item['answer']
            pred_word_list = parse_caption_sentence(pred_text).lower().split()
            correct = 0
            for word in gt_word_list:
                if word.lower() in pred_word_list:
                    correct += 1
            tmp_score = correct / len(gt_word_list)
            score += tmp_score
            item['metric_result'] = tmp_score

        return dict(
            ACC = score / len(answers),
        ), answers
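The metric classes above share one contract: metric_func consumes a list of answer records (each carrying the dataset's gt_answers and the model's answer), returns a dict of scores together with the answer list, and writes metric_result back onto items where a per-item score is computed. A minimal sketch against Counting (the records are fabricated, the dataset name is arbitrary, and src/ is assumed to be on PYTHONPATH so that Base_Metric and ennum2numerical resolve from ChEF.metric.utils):

    from ChEF.metric.counting import Counting

    answers = [
        {'gt_answers': 3, 'answer': 'There are three apples in the image.'},
        {'gt_answers': 5, 'answer': 'I can count 4 birds.'},
    ]
    metric = Counting('FSC147', inference_type='direct')
    scores, answers = metric.metric_func(answers)
    print(scores)  # a dict with 'MAE' and 'ACC' in the direct setting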
--------------------------------------------------------------------------------
/src/ChEF/models/instruct_blip/__init__.py:
--------------------------------------------------------------------------------
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import os
import sys

from omegaconf import OmegaConf

from .common.registry import registry
from .models import *
from .processors import *


root_dir = os.path.dirname(os.path.abspath(__file__))
default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml"))

registry.register_path("library_root", root_dir)
repo_root = os.path.join(root_dir, "..")
registry.register_path("repo_root", repo_root)
cache_root = os.path.join(repo_root, default_cfg.env.cache_root)
registry.register_path("cache_root", cache_root)

registry.register("MAX_INT", sys.maxsize)
registry.register("SPLIT_NAMES", ["train", "val", "test"])
--------------------------------------------------------------------------------
/src/ChEF/models/instruct_blip/common/gradcam.py:
--------------------------------------------------------------------------------
import numpy as np
from matplotlib import pyplot as plt
from scipy.ndimage import filters
from skimage import transform as skimage_transform


def getAttMap(img, attMap, blur=True, overlap=True):
    attMap -= attMap.min()
    if attMap.max() > 0:
        attMap /= attMap.max()
    attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant")
    if blur:
        attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2]))
        attMap -= attMap.min()
        attMap /= attMap.max()
    cmap = plt.get_cmap("jet")
    attMapV = cmap(attMap)
    attMapV = np.delete(attMapV, 3, 2)
    if overlap:
        attMap = (
            1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img
            + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV
        )
    return attMap
--------------------------------------------------------------------------------
/src/ChEF/models/instruct_blip/common/vqa_tools/__init__.py:
--------------------------------------------------------------------------------
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

__author__ = "aagrawal"
--------------------------------------------------------------------------------
/src/ChEF/models/instruct_blip/configs/default.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

env:
  # For default users
  # cache_root: "cache"
  # For internal use with persistent storage
  cache_root: "/export/home/.cache/lavis"
-------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/albef_classification_ve.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_classification 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_snli_ve_lavis.pt" 11 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 12 | 13 | num_classes: 3 14 | 15 | use_distill: True 16 | momentum: 0.995 17 | alpha: 0.4 18 | 19 | # vit encoder 20 | vit_type: "base" 21 | vit_grad_ckpt: False 22 | vit_ckpt_layer: 0 23 | vit_layer_norm_epsilon: 1e-6 24 | 25 | image_size: 384 26 | 27 | # bert config 28 | med_config_path: "configs/models/med_config_albef.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | eval: 35 | name: "blip_image_eval" 36 | text_processor: 37 | train: 38 | name: "blip_caption" 39 | eval: 40 | name: "blip_caption" 41 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/albef_feature_extractor.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | image_size: 224 13 | vit_ckpt_layer: 0 14 | vit_drop_path_rate: 0 15 | vit_layer_norm_epsilon: 1e-6 16 | vit_grad_ckpt: False 17 | 18 | # bert config 19 | med_config_path: "configs/models/med_config_albef.json" 20 | 21 | embed_dim: 256 22 | 23 | preprocess: 24 | vis_processor: 25 | eval: 26 | name: "blip_image_eval" 27 | image_size: 224 28 | text_processor: 29 | eval: 30 | name: "blip_caption" 31 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/albef_nlvr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_nlvr 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/pretrain_model_nlvr.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_nlvr_lavis.pt" 12 | 13 | num_classes: 2 14 | 15 | use_distill: True 16 | momentum: 0.995 17 | alpha: 0.4 18 | 19 | # vit encoder 20 | vit_type: "base" 21 | vit_grad_ckpt: False 22 | vit_ckpt_layer: 0 23 | vit_layer_norm_epsilon: 1e-6 24 | 25 | image_size: 384 26 | 27 | # bert config 28 | med_config_path: "configs/models/med_config_albef.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 384 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 384 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/albef_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | image_size: 224 15 | vit_ckpt_layer: 0 16 | vit_drop_path_rate: 0 17 | vit_layer_norm_epsilon: 1e-6 18 | vit_grad_ckpt: False 19 | 20 | # bert config 21 | med_config_path: "configs/models/med_config_albef.json" 22 | mlm_mask_prob: 0.15 23 | 24 | embed_dim: 256 25 | momentum: 0.995 26 | alpha: 0.4 27 | temp: 0.07 28 | 29 | max_txt_len: 30 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 256 36 | text_processor: 37 | train: 38 | name: "blip_caption" 39 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/albef_retrieval_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_coco_retrieval_lavis.pt" 12 | 13 | queue_size: 65536 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | image_size: 384 18 | vit_ckpt_layer: 0 19 | vit_drop_path_rate: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | vit_grad_ckpt: False 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_config_albef.json" 25 | 26 | embed_dim: 256 27 | momentum: 0.995 28 | alpha: 0.4 29 | temp: 0.07 30 | use_distill: True 31 | 32 | max_txt_len: 30 33 | 34 | preprocess: 35 | vis_processor: 36 | train: 37 | name: "blip_image_train" 38 | image_size: 384 39 | eval: 40 | name: "blip_image_eval" 41 | image_size: 384 42 | text_processor: 43 | train: 44 | name: "blip_caption" 45 | eval: 46 | name: "blip_caption" 47 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/albef_retrieval_flickr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_flickr_retrieval_lavis.pt 12 | 13 | queue_size: 65536 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | image_size: 384 18 | vit_ckpt_layer: 0 19 | vit_drop_path_rate: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | vit_grad_ckpt: False 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_config_albef.json" 25 | 26 | embed_dim: 256 27 | momentum: 0.995 28 | alpha: 0.4 29 | temp: 0.07 30 | use_distill: True 31 | 32 | max_txt_len: 30 33 | 34 | preprocess: 35 | vis_processor: 36 | train: 37 | name: "blip_image_train" 38 | image_size: 384 39 | eval: 40 | name: "blip_image_eval" 41 | image_size: 384 42 | text_processor: 43 | train: 44 | name: "blip_caption" 45 | eval: 46 | name: "blip_caption" 47 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/albef_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_vqav2_lavis.pt" 12 | 13 | use_distill: True 14 | momentum: 0.995 15 | alpha: 0.4 16 | 17 | # vit encoder 18 | vit_type: "base" 19 | vit_grad_ckpt: False 20 | vit_ckpt_layer: 0 21 | vit_layer_norm_epsilon: 1e-6 22 | 23 | image_size: 384 24 | 25 | # bert config 26 | med_config_path: "configs/models/med_config_albef.json" 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 384 33 | eval: 34 | name: "blip_image_eval" 35 | image_size: 384 36 | text_processor: 37 | train: 38 | name: "blip_question" 39 | eval: 40 | name: "blip_question" 41 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/alpro_qa_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | num_classes: 1500 9 | 10 | load_finetuned: True 11 | 12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_qa.pth" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 14 | 15 | timesformer: 16 | n_frms: 16 17 | image_size: 224 18 | 19 | patch_size: 16 20 | attn_drop_rate: 0. 21 | drop_rate: 0. 22 | drop_path_rate: 0.1 23 | 24 | use_grad_ckpt: True 25 | ckpt_layer: 12 26 | 27 | # bert config 28 | med_config_path: "configs/models/bert_config_alpro.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "alpro_video_train" 34 | n_frms: 16 35 | image_size: 224 36 | eval: 37 | name: "alpro_video_eval" 38 | n_frms: 16 39 | image_size: 224 40 | text_processor: 41 | train: 42 | name: "blip_caption" 43 | eval: 44 | name: "blip_caption" 45 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/alpro_qa_msvd.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | num_classes: 2423 9 | 10 | load_finetuned: True 11 | 12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msvd_qa.pth" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 14 | 15 | timesformer: 16 | n_frms: 16 17 | image_size: 224 18 | 19 | patch_size: 16 20 | attn_drop_rate: 0. 21 | drop_rate: 0. 
22 | drop_path_rate: 0.1 23 | use_grad_ckpt: True 24 | ckpt_layer: 12 25 | 26 | # bert config 27 | med_config_path: "configs/models/bert_config_alpro.json" 28 | 29 | preprocess: 30 | vis_processor: 31 | train: 32 | name: "alpro_video_train" 33 | n_frms: 16 34 | image_size: 224 35 | eval: 36 | name: "alpro_video_eval" 37 | n_frms: 16 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/alpro_retrieval_didemo.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_didemo_retrieval.pt 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0. 21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | eval: 30 | name: "alpro_video_eval" 31 | n_frms: 8 32 | image_size: 224 33 | text_processor: 34 | eval: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/alpro_retrieval_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_retrieval.pt" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0. 
21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "alpro_video_train" 31 | n_frms: 8 32 | image_size: 224 33 | eval: 34 | name: "alpro_video_eval" 35 | n_frms: 8 36 | image_size: 224 37 | text_processor: 38 | train: 39 | name: "blip_caption" 40 | eval: 41 | name: "blip_caption" 42 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/bert_config_alpro.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": true, 18 | "type_vocab_size": 2, 19 | "vocab_size": 30522, 20 | "encoder_width": 768, 21 | "add_cross_attention": false, 22 | "fusion_layer": 6 23 | } -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_caption_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_flant5xl 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_flant5xl.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_caption_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_opt2.7b 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt2.7b.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_caption_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_opt6.7b 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt6.7b.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: coco 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_finetune_coco.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: True 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 364 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 364 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_instruct_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: flant5xl 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxl_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_instruct_flant5xxl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: flant5xxl 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxxl_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xxl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_instruct_vicuna13b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna13b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "./llm/vicuna-13b" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_instruct_vicuna7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna7b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "eachadea/vicuna-7b-1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_pretrain.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 224 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 224 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_pretrain_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl_vitL.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | vit_model: "clip_L" 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_pretrain_flant5xxl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xxl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xxl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_pretrain_llama7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip2_llama 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # LLM 24 | llm_model: "/export/home/project/stanford_alpaca/llama_7B" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip2_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_pretrain_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt2.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_pretrain_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt6.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_pretrain_vitL.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_vitL.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | vit_model: "clip_L" 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | 25 | preprocess: 26 | vis_processor: 27 | train: 28 | name: "blip_image_train" 29 | image_size: 224 30 | eval: 31 | name: "blip_image_eval" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_caption_base_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_caption_base.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | 18 | image_size: 384 19 | 20 | # bert config 21 | med_config_path: "configs/models/med_config.json" 22 | 23 | # generation configs 24 | prompt: "a picture of " 25 | 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | eval: 32 | name: "blip_image_eval" 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | prompt: "a picture of " 37 | eval: 38 | name: "blip_caption" 39 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_caption_large_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth" 12 | 13 | vit_type: "large" 14 | vit_grad_ckpt: True 15 | vit_ckpt_layer: 5 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | # generation configs 23 | prompt: "a picture of " 24 | 25 | 26 | preprocess: 27 | vis_processor: 28 | train: 29 | name: "blip_image_train" 30 | eval: 31 | name: "blip_image_eval" 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | prompt: "a picture of " 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_classification_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_classification 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | 10 | use_distill: True 11 | momentum: 0.995 12 | alpha: 0.4 13 | 14 | # vit encoder 15 | vit_type: "base" 16 | vit_grad_ckpt: False 17 | vit_ckpt_layer: 0 18 | 19 | image_size: 384 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_feature_extractor_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | vit_grad_ckpt: False 13 | vit_ckpt_layer: 0 14 | 15 | image_size: 224 16 | 17 | # bert config 18 | med_config_path: "configs/models/med_config.json" 19 | 20 | embed_dim: 256 21 | 22 | preprocess: 23 | vis_processor: 24 | eval: 25 | name: "blip_image_eval" 26 | image_size: 224 27 | text_processor: 28 | eval: 29 | name: "blip_caption" 30 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_itm_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_itm_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "large" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_nlvr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_nlvr 8 | model_type: nlvr 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_nlvr.pth" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 13 | 14 | num_classes: 2 15 | 16 | # vit encoder 17 | vit_type: "base" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | 22 | image_size: 384 23 | 24 | # bert config 25 | med_config_path: "configs/models/med_config.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 224 18 | alpha: 0.4 19 | 20 | # bert config 21 | med_config_path: "configs/models/bert_config.json" 22 | 23 | embed_dim: 256 24 | 25 | # generation configs 26 | prompt: "a picture of " 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_pretrain_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | # vit encoder 10 | vit_type: "large" 11 | vit_grad_ckpt: True 12 | vit_ckpt_layer: 5 13 | 14 | image_size: 224 15 | 16 | # bert config 17 | med_config_path: "configs/models/med_large_config.json" 18 | 19 | embed_dim: 256 20 | 21 | # generation configs 22 | prompt: "a picture of " 23 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_retrieval_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | vit_grad_ckpt: True 18 | vit_ckpt_layer: 4 19 | 20 | image_size: 384 21 | 22 | # bert config 23 | med_config_path: "configs/models/med_config.json" 24 | 25 | embed_dim: 256 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_retrieval_flickr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_flickr_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | alpha: 0.4 15 | 16 | negative_all_rank: False 17 | 18 | # vit encoder 19 | vit_type: "base" 20 | vit_grad_ckpt: True 21 | vit_ckpt_layer: 4 22 | 23 | image_size: 384 24 | 25 | # bert config 26 | med_config_path: "configs/models/med_config.json" 27 | 28 | embed_dim: 256 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 384 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 384 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_vqa_aokvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_aokvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_vqa_okvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_okvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 
11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-B-16-plus-240.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 240, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-B-16-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-B-32-plus-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-B-32.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-H-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-L-14-280.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 280, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-L-16-320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 320, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | 
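The CLIP model definitions in this directory all share one small JSON schema: a top-level "embed_dim" plus a "vision_cfg" block and a "text_cfg" block. Below is a minimal illustrative sketch, not part of the repository itself, of reading one of these files with Python's standard json module; the checkout-relative path is an assumption chosen for the example.

import json

# Assumed path relative to a local checkout of the repository; adjust as needed.
cfg_path = "src/ChEF/models/instruct_blip/configs/models/clip/ViT-L-14.json"

with open(cfg_path) as f:
    cfg = json.load(f)

# Shared top-level keys across every config file in this directory.
vision_cfg = cfg["vision_cfg"]  # image_size, layers, width, patch_size, ...
text_cfg = cfg["text_cfg"]      # context_length, vocab_size, width, heads, layers
print(cfg["embed_dim"], vision_cfg["image_size"], text_cfg["context_length"])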
-------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-L-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/timm-efficientnetv2_rw_s.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "efficientnetv2_rw_s", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 288 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/timm-resnet50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnet50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/timm-resnetaa50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetaa50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/timm-resnetblur50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetblur50", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- 
/src/ChEF/models/instruct_blip/configs/models/clip/timm-swin_base_patch4_window7_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "swin_base_patch4_window7_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/timm-vit_base_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_base_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/timm-vit_base_patch32_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_base_patch32_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/timm-vit_small_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_small_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip_resnet50.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: RN50 10 | 11 | pretrained: openai 12 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip_vit_base16.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-B-16 10 | 11 | pretrained: openai 12 | 13 | preprocess: 14 | vis_processor: 15 | eval: 16 | name: "clip_image_eval" 17 | image_size: 224 18 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip_vit_base32.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-B-32 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 224 53 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip_vit_large14.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 224 53 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip_vit_large14_336.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 336 53 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/gpt_dialogue_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: gpt_dialogue 8 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 10 | 11 | len_tokenizer: 50264 # 50257 tokens from gpt2 default tokenizer + additional special tokens 12 | 13 | len_video_ft: 4224 # i3d_rgb: 2048 i3d_flow: 2048 vggish: 128 14 | 15 | preprocess: 16 | vis_processor: 17 | train: 18 | name: "gpt_video_ft" 19 | eval: 20 | name: "gpt_video_ft" 21 | text_processor: 22 | train: 23 | name: "gpt_dialogue" 24 | eval: 25 | name: "gpt_dialogue" -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/img2prompt-vqa/img2prompt_vqa_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: img2prompt_vqa 8 | model_type: base 9 | 10 | image_question_matching_model: 11 | arch: blip_image_text_matching 12 | load_finetuned: True 13 | 14 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" 15 | 16 | # vit encoder 17 | vit_type: "large" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | 21 | image_size: 384 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_large_config.json" 25 | 26 | embed_dim: 256 27 | 28 | image_captioning_model: 29 | arch: blip_caption 30 | load_finetuned: True 31 | 32 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" 33 | 34 | vit_type: "large" 35 | vit_grad_ckpt: True 36 | vit_ckpt_layer: 5 37 | 38 | image_size: 384 39 | 40 | # bert config 41 | med_config_path: "configs/models/med_large_config.json" 42 | 43 | # generation configs 44 | prompt: "a picture of " 45 | 46 | question_generation_moodel: 47 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/projects/img2prompt/T5_large_QG.pth" 48 | 49 | 50 | 51 | preprocess: 52 | vis_processor: 53 | eval: 54 | name: "blip_image_eval" 55 | image_size: 384 56 | text_processor: 57 | eval: 58 | name: "blip_caption" 59 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/med_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- 
/src/ChEF/models/instruct_blip/configs/models/med_config_albef.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true, 21 | "fusion_layer": 6 22 | } -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/med_large_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 1024, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/pnp-vqa/pnp_vqa_3b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: 3b 9 | 10 | image_question_matching_model: 11 | arch: blip_image_text_matching 12 | load_finetuned: True 13 | 14 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" 15 | 16 | # vit encoder 17 | vit_type: "large" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | 21 | image_size: 384 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_large_config.json" 25 | 26 | embed_dim: 256 27 | 28 | image_captioning_model: 29 | arch: blip_caption 30 | load_finetuned: True 31 | 32 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" 33 | 34 | vit_type: "large" 35 | vit_grad_ckpt: True 36 | vit_ckpt_layer: 5 37 | 38 | image_size: 384 39 | 40 | # bert config 41 | med_config_path: "configs/models/med_large_config.json" 42 | 43 | # generation configs 44 | prompt: "a picture of " 45 | 46 | question_answering_model: 47 | arch: pnp_unifiedqav2_fid 48 | 49 | pretrained: "allenai/unifiedqa-v2-t5-3b-1363200" 50 | 51 | t5_config_path: "configs/models/pnp-vqa/unifiedqav2_3b_config.json" 52 | 53 | preprocess: 54 | vis_processor: 55 | eval: 56 | name: "blip_image_eval" 57 | image_size: 384 58 | text_processor: 59 | eval: 60 | name: "blip_caption" 61 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/pnp-vqa/unifiedqav2_base_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "T5ForConditionalGeneration" 4 | ], 5 | "d_ff": 3072, 6 | "d_kv": 64, 7 | "d_model": 768, 8 | "decoder_start_token_id": 0, 9 | "dense_act_fn": "relu", 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 1, 12 | "feed_forward_proj": "relu", 13 | "gradient_checkpointing": false, 14 | "initializer_factor": 1.0, 15 | "is_encoder_decoder": true, 16 | "is_gated_act": false, 17 | "layer_norm_epsilon": 1e-06, 18 | "model_type": "t5", 19 | "n_positions": 512, 20 | "num_decoder_layers": 12, 21 | "num_heads": 12, 22 | "num_layers": 12, 23 | "output_past": true, 24 | "pad_token_id": 0, 25 | "relative_attention_max_distance": 128, 26 | "relative_attention_num_buckets": 32, 27 | "task_specific_params": { 28 | "summarization": { 29 | "early_stopping": true, 30 | "length_penalty": 2.0, 31 | "max_length": 200, 32 | "min_length": 30, 33 | "no_repeat_ngram_size": 3, 34 | "num_beams": 4, 35 | "prefix": "summarize: " 36 | }, 37 | "translation_en_to_de": { 38 | "early_stopping": true, 39 | "max_length": 300, 40 | "num_beams": 4, 41 | "prefix": "translate English to German: " 42 | }, 43 | "translation_en_to_fr": { 44 | "early_stopping": true, 45 | "max_length": 300, 46 | "num_beams": 4, 47 | "prefix": "translate English to French: " 48 | }, 49 | "translation_en_to_ro": { 50 | "early_stopping": true, 51 | "max_length": 300, 52 | "num_beams": 4, 53 | "prefix": "translate English to Romanian: " 54 | } 55 | }, 56 | "transformers_version": "4.21.3", 57 | "use_cache": true, 58 | "vocab_size": 32128 59 | } -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/pnp-vqa/unifiedqav2_large_config.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"architectures": [ 3 | "T5ForConditionalGeneration" 4 | ], 5 | "d_ff": 4096, 6 | "d_kv": 64, 7 | "d_model": 1024, 8 | "decoder_start_token_id": 0, 9 | "dense_act_fn": "relu", 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 1, 12 | "feed_forward_proj": "relu", 13 | "gradient_checkpointing": false, 14 | "initializer_factor": 1.0, 15 | "is_encoder_decoder": true, 16 | "is_gated_act": false, 17 | "layer_norm_epsilon": 1e-06, 18 | "model_type": "t5", 19 | "n_positions": 512, 20 | "num_decoder_layers": 24, 21 | "num_heads": 16, 22 | "num_layers": 24, 23 | "output_past": true, 24 | "pad_token_id": 0, 25 | "relative_attention_max_distance": 128, 26 | "relative_attention_num_buckets": 32, 27 | "task_specific_params": { 28 | "summarization": { 29 | "early_stopping": true, 30 | "length_penalty": 2.0, 31 | "max_length": 200, 32 | "min_length": 30, 33 | "no_repeat_ngram_size": 3, 34 | "num_beams": 4, 35 | "prefix": "summarize: " 36 | }, 37 | "translation_en_to_de": { 38 | "early_stopping": true, 39 | "max_length": 300, 40 | "num_beams": 4, 41 | "prefix": "translate English to German: " 42 | }, 43 | "translation_en_to_fr": { 44 | "early_stopping": true, 45 | "max_length": 300, 46 | "num_beams": 4, 47 | "prefix": "translate English to French: " 48 | }, 49 | "translation_en_to_ro": { 50 | "early_stopping": true, 51 | "max_length": 300, 52 | "num_beams": 4, 53 | "prefix": "translate English to Romanian: " 54 | } 55 | }, 56 | "transformers_version": "4.21.3", 57 | "use_cache": true, 58 | "vocab_size": 32128 59 | } -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/models/blip2_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/ChEF/models/instruct_blip/models/blip2_models/__init__.py -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/models/timesformer/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | 7 | Based on https://github.com/facebookresearch/TimeSformer 8 | """ 9 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/models/timesformer/linear.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | """ Linear layer (alternate definition) 9 | """ 10 | import torch 11 | import torch.nn.functional as F 12 | from torch import nn as nn 13 | 14 | 15 | class Linear(nn.Linear): 16 | def forward(self, input: torch.Tensor) -> torch.Tensor: 17 | if torch.jit.is_scripting(): 18 | bias = self.bias.to(dtype=input.dtype) if self.bias is not None else None 19 | return F.linear(input, self.weight.to(dtype=input.dtype), bias=bias) 20 | else: 21 | return F.linear(input, self.weight, self.bias) 22 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/processors/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from .base_processor import BaseProcessor 9 | 10 | from .blip_processors import ( 11 | BlipImageTrainProcessor, 12 | Blip2ImageTrainProcessor, 13 | BlipImageEvalProcessor, 14 | BlipCaptionProcessor, 15 | ) 16 | from .gpt_processors import ( 17 | GPTVideoFeatureProcessor, 18 | GPTDialogueProcessor, 19 | ) 20 | from.clip_processors import ClipImageTrainProcessor 21 | 22 | from ..common.registry import registry 23 | 24 | __all__ = [ 25 | "BaseProcessor", 26 | # BLIP 27 | "BlipImageTrainProcessor", 28 | "Blip2ImageTrainProcessor", 29 | "BlipImageEvalProcessor", 30 | "BlipCaptionProcessor", 31 | "ClipImageTrainProcessor", 32 | # GPT 33 | "GPTVideoFeatureProcessor", 34 | "GPTDialogueProcessor", 35 | ] 36 | 37 | 38 | def load_processor(name, cfg=None): 39 | """ 40 | Example 41 | 42 | >>> processor = load_processor("alpro_video_train", cfg=None) 43 | """ 44 | processor = registry.get_processor_class(name).from_config(cfg) 45 | 46 | return processor 47 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/processors/base_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from omegaconf import OmegaConf 9 | 10 | 11 | class BaseProcessor: 12 | def __init__(self): 13 | self.transform = lambda x: x 14 | return 15 | 16 | def __call__(self, item): 17 | return self.transform(item) 18 | 19 | @classmethod 20 | def from_config(cls, cfg=None): 21 | return cls() 22 | 23 | def build(self, **kwargs): 24 | cfg = OmegaConf.create(kwargs) 25 | 26 | return self.from_config(cfg) 27 | -------------------------------------------------------------------------------- /src/ChEF/models/internlm/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration_internlm_xcomposer2 import InternLMXcomposer2Config 2 | from .rewrite_modeling_internlm_xcomposer2 import RewriteInternLMXComposer2ForCausalLM as InternLMXComposer2ForCausalLM 3 | from .tokenization_internlm_xcomposer2 import InternLMXComposer2Tokenizer -------------------------------------------------------------------------------- /src/ChEF/models/kosmos2/data/sentencepiece.bpe.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/ChEF/models/kosmos2/data/sentencepiece.bpe.model -------------------------------------------------------------------------------- /src/ChEF/models/kosmos2/unilm/__init__.py: -------------------------------------------------------------------------------- 1 | from ChEF.models.kosmos2.unilm import models 2 | from ChEF.models.kosmos2.unilm import tasks -------------------------------------------------------------------------------- /src/ChEF/models/kosmos2/unilm/models/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | from fairseq.models import import_models 4 | 5 | models_dir = os.path.dirname(__file__) 6 | import_models(models_dir, "ChEF.models.kosmos2.unilm.models") -------------------------------------------------------------------------------- /src/ChEF/models/kosmos2/unilm/models/vl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/ChEF/models/kosmos2/unilm/models/vl/__init__.py -------------------------------------------------------------------------------- /src/ChEF/models/kosmos2/unilm/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import importlib 3 | import os 4 | from fairseq.tasks import import_tasks 5 | 6 | tasks_dir = os.path.dirname(__file__) 7 | import_tasks(tasks_dir, "ChEF.models.kosmos2.unilm.tasks") 8 | -------------------------------------------------------------------------------- /src/ChEF/models/llama_adapter_v2/__init__.py: -------------------------------------------------------------------------------- 1 | from .llama import ModelArgs, Transformer 2 | from .tokenizer import Tokenizer 3 | from .llama_adapter import * 4 | from .utils import format_prompt, format_prompt_icl -------------------------------------------------------------------------------- /src/ChEF/models/llama_adapter_v2/tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 
2 | # This software may be used and distributed according to the terms of the GNU General Public License version 3. 3 | 4 | from sentencepiece import SentencePieceProcessor 5 | from logging import getLogger 6 | from typing import List 7 | import os 8 | 9 | 10 | logger = getLogger() 11 | 12 | 13 | class Tokenizer: 14 | def __init__(self, model_path: str): 15 | # reload tokenizer 16 | assert os.path.isfile(model_path), model_path 17 | self.sp_model = SentencePieceProcessor(model_file=model_path) 18 | logger.info(f"Reloaded SentencePiece model from {model_path}") 19 | 20 | # BOS / EOS token IDs 21 | self.n_words: int = self.sp_model.vocab_size() 22 | self.bos_id: int = self.sp_model.bos_id() 23 | self.eos_id: int = self.sp_model.eos_id() 24 | self.pad_id: int = self.sp_model.pad_id() 25 | logger.info( 26 | f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}" 27 | ) 28 | assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() 29 | 30 | def encode(self, s: str, bos: bool, eos: bool) -> List[int]: 31 | assert type(s) is str 32 | t = self.sp_model.encode(s) 33 | if bos: 34 | t = [self.bos_id] + t 35 | if eos: 36 | t = t + [self.eos_id] 37 | return t 38 | 39 | def decode(self, t: List[int]) -> str: 40 | return self.sp_model.decode(t) 41 | -------------------------------------------------------------------------------- /src/ChEF/models/minigpt4/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from .common.registry import registry 14 | from .models import * 15 | from .processors import * 16 | 17 | 18 | root_dir = os.path.dirname(os.path.abspath(__file__)) 19 | default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml")) 20 | 21 | registry.register_path("library_root", root_dir) 22 | repo_root = os.path.join(root_dir, "..") 23 | registry.register_path("repo_root", repo_root) 24 | cache_root = os.path.join(repo_root, default_cfg.env.cache_root) 25 | registry.register_path("cache_root", cache_root) 26 | 27 | registry.register("MAX_INT", sys.maxsize) 28 | registry.register("SPLIT_NAMES", ["train", "val", "test"]) 29 | -------------------------------------------------------------------------------- /src/ChEF/models/minigpt4/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/ChEF/models/minigpt4/common/__init__.py -------------------------------------------------------------------------------- /src/ChEF/models/minigpt4/common/gradcam.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | from scipy.ndimage import filters 4 | from skimage import transform as skimage_transform 5 | 6 | 7 | def getAttMap(img, attMap, blur=True, overlap=True): 8 | attMap -= attMap.min() 9 | if attMap.max() > 0: 10 | attMap /= attMap.max() 11 | attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant") 12 | if blur: 13 | attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2])) 14 | attMap -= attMap.min() 15 | attMap /= attMap.max() 16 | 
cmap = plt.get_cmap("jet") 17 | attMapV = cmap(attMap) 18 | attMapV = np.delete(attMapV, 3, 2) 19 | if overlap: 20 | attMap = ( 21 | 1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img 22 | + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV 23 | ) 24 | return attMap 25 | -------------------------------------------------------------------------------- /src/ChEF/models/minigpt4/configs/default.yaml: -------------------------------------------------------------------------------- 1 | env: 2 | # For default users 3 | # cache_root: "cache" 4 | # For internal use with persistent storage 5 | cache_root: "/export/home/.cache/minigpt4" 6 | -------------------------------------------------------------------------------- /src/ChEF/models/minigpt4/configs/models/minigpt4.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: mini_gpt4 3 | 4 | # vit encoder 5 | image_size: 224 6 | drop_path_rate: 0 7 | use_grad_checkpoint: False 8 | vit_precision: "fp32" 9 | freeze_vit: True 10 | freeze_qformer: True 11 | 12 | # Q-Former 13 | num_query_token: 32 14 | 15 | # Vicuna 16 | llama_model: ../model_zoo/Vicuna/7b_v0 17 | 18 | # generation configs 19 | prompt: "" 20 | 21 | preprocess: 22 | vis_processor: 23 | train: 24 | name: "blip2_image_train" 25 | image_size: 224 26 | eval: 27 | name: "blip2_image_eval" 28 | image_size: 224 29 | text_processor: 30 | train: 31 | name: "blip_caption" 32 | eval: 33 | name: "blip_caption" 34 | -------------------------------------------------------------------------------- /src/ChEF/models/minigpt4/conversation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/ChEF/models/minigpt4/conversation/__init__.py -------------------------------------------------------------------------------- /src/ChEF/models/minigpt4/minigpt4_eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: mini_gpt4 3 | model_type: pretrain_vicuna 4 | freeze_vit: True 5 | freeze_qformer: True 6 | max_txt_len: 160 7 | end_sym: "###" 8 | low_resource: False 9 | prompt_path: "alignment.txt" 10 | prompt_template: '###Human: {} ###Assistant: ' 11 | ckpt: 'pretrained_minigpt4_7b.pth' 12 | 13 | 14 | preprocess: 15 | vis_processor: 16 | train: 17 | name: "blip2_image_train" 18 | image_size: 224 19 | eval: 20 | name: "blip2_image_eval" 21 | image_size: 224 22 | text_processor: 23 | train: 24 | name: "blip_caption" 25 | eval: 26 | name: "blip_caption" 27 | 28 | 29 | run: 30 | task: image_text_pretrain 31 | -------------------------------------------------------------------------------- /src/ChEF/models/minigpt4/processors/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from .base_processor import BaseProcessor 9 | from .blip_processors import ( 10 | Blip2ImageTrainProcessor, 11 | Blip2ImageEvalProcessor, 12 | BlipCaptionProcessor, 13 | ) 14 | 15 | from ..common.registry import registry 16 | 17 | __all__ = [ 18 | "BaseProcessor", 19 | "Blip2ImageTrainProcessor", 20 | "Blip2ImageEvalProcessor", 21 | "BlipCaptionProcessor", 22 | ] 23 | 24 | 25 | def load_processor(name, cfg=None): 26 | """ 27 | Example 28 | 29 | >>> processor = load_processor("alpro_video_train", cfg=None) 30 | """ 31 | processor = registry.get_processor_class(name).from_config(cfg) 32 | 33 | return processor 34 | -------------------------------------------------------------------------------- /src/ChEF/models/minigpt4/processors/base_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from omegaconf import OmegaConf 9 | 10 | 11 | class BaseProcessor: 12 | def __init__(self): 13 | self.transform = lambda x: x 14 | return 15 | 16 | def __call__(self, item): 17 | return self.transform(item) 18 | 19 | @classmethod 20 | def from_config(cls, cfg=None): 21 | return cls() 22 | 23 | def build(self, **kwargs): 24 | cfg = OmegaConf.create(kwargs) 25 | 26 | return self.from_config(cfg) 27 | -------------------------------------------------------------------------------- /src/ChEF/models/otter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/ChEF/models/otter/__init__.py -------------------------------------------------------------------------------- /src/ChEF/models/otter/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "otter", 3 | "cross_attn_every_n_layers": 4, 4 | "tie_word_embeddings": false, 5 | "use_media_placement_augmentation": true, 6 | "only_attend_previous": true, 7 | "text_config": { 8 | "_name_or_path": "luodian/llama-7b-hf", 9 | "model_type": "llama" 10 | }, 11 | "vision_config": { 12 | "_name_or_path": "openai/clip-vit-large-patch14", 13 | "model_type": "clip_vision_model", 14 | "hidden_size": 1024, 15 | "intermediate_size": 4096, 16 | "num_attention_heads": 16, 17 | "num_hidden_layers": 24, 18 | "image_size": 224, 19 | "patch_size": 14 20 | } 21 | } -------------------------------------------------------------------------------- /src/ChEF/models/qwen/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration_qwen import QWenConfig 2 | from .modeling_qwen import QWenLMHeadModel 3 | from .tokenization_qwen import QWenTokenizer 4 | from .qwen_generation_utils import make_context, get_stop_words_ids, decode_tokens -------------------------------------------------------------------------------- /src/ChEF/models/shikra/builder/__init__.py: -------------------------------------------------------------------------------- 1 | from .builder import load_pretrained 2 | -------------------------------------------------------------------------------- 
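A minimal usage sketch for the registry-backed load_processor helper defined in the MiniGPT-4 processors package above. The import path, the processor name "blip2_image_eval" (taken from the MiniGPT-4 preprocessing configs in this dump), and the image path are illustrative assumptions; this snippet is not part of the repository.

# Sketch only: resolve a registered processor by name and apply it to an image.
# Assumes src/ is on PYTHONPATH and that "blip2_image_eval" is registered by blip_processors.
from PIL import Image
from ChEF.models.minigpt4.processors import load_processor

vis_processor = load_processor("blip2_image_eval", cfg=None)   # registry lookup + from_config
image = Image.open("example.jpg").convert("RGB")               # placeholder image path
pixel_values = vis_processor(image)                            # tensor ready for the vision encoder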
/src/ChEF/models/shikra/builder/builder.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any, Tuple 2 | 3 | from torch import nn 4 | 5 | from .build_shikra import load_pretrained_shikra 6 | 7 | PREPROCESSOR = Dict[str, Any] 8 | 9 | 10 | # TODO: Registry 11 | def load_pretrained(model_args, training_args) -> Tuple[nn.Module, PREPROCESSOR]: 12 | type_ = model_args.type 13 | if type_ == 'shikra': 14 | return load_pretrained_shikra(model_args, training_args) 15 | else: 16 | assert False 17 | -------------------------------------------------------------------------------- /src/ChEF/models/shikra/conversation/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_conversation import SeparatorStyle, Conversation, register_conv_template, get_conv_template -------------------------------------------------------------------------------- /src/ChEF/models/shikra/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .root import * 2 | from .utils import * 3 | from .process_function import * 4 | from .single_image_convsation import * 5 | 6 | from .builder import prepare_data 7 | -------------------------------------------------------------------------------- /src/ChEF/models/shikra/dataset/process_function/__init__.py: -------------------------------------------------------------------------------- 1 | from .shikra_process_function import ( 2 | ShikraConvProcess, 3 | ShikraImageProcessor, 4 | ShikraTextProcess, 5 | ) 6 | 7 | from .box_process_function import ( 8 | BoxFormatProcess, 9 | BoxFormatter, 10 | PlainBoxFormatter, 11 | TokenFormatter, 12 | prepare_target_processor, 13 | ) 14 | -------------------------------------------------------------------------------- /src/ChEF/models/shikra/dataset/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .io import read_img_general, init_ceph_client_if_needed 2 | from .transform import Expand2square, de_norm_box_xyxy, norm_box_xyxy, expand2square, box_xywh_to_xyxy 3 | from .compute_metrics import BaseComputeMetrics 4 | from .mixin import QuestionTemplateMixin, MInstrDataset 5 | from .concatenate_dataset import ConcatDataset, InterleaveDateset, SubSet, ConcatDatasetWithShuffle 6 | -------------------------------------------------------------------------------- /src/ChEF/models/shikra/dataset/utils/io.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import logging 4 | 5 | import cv2 6 | import numpy as np 7 | from PIL import Image 8 | 9 | logger = logging.getLogger(__name__) 10 | logger.setLevel(logging.INFO) 11 | logging.basicConfig( 12 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 13 | datefmt="%m/%d/%Y %H:%M:%S", 14 | handlers=[logging.StreamHandler(sys.stdout), ], 15 | ) 16 | 17 | 18 | def read_img_general(img_path): 19 | if "s3://" in img_path: 20 | cv_img = read_img_ceph(img_path) 21 | # noinspection PyUnresolvedReferences 22 | return Image.fromarray(cv2.cvtColor(cv_img, cv2.COLOR_BGR2RGB)) 23 | else: 24 | return Image.open(img_path).convert('RGB') 25 | 26 | 27 | client = None 28 | 29 | 30 | def read_img_ceph(img_path): 31 | init_ceph_client_if_needed() 32 | img_bytes = client.get(img_path) 33 | assert img_bytes is not None, f"Please check image at {img_path}" 34 | img_mem_view = memoryview(img_bytes) 35 | img_array = 
np.frombuffer(img_mem_view, np.uint8) 36 | # noinspection PyUnresolvedReferences 37 | img = cv2.imdecode(img_array, cv2.IMREAD_COLOR) 38 | return img 39 | 40 | 41 | def init_ceph_client_if_needed(): 42 | global client 43 | if client is None: 44 | logger.info(f"initializing ceph client ...") 45 | st = time.time() 46 | from petrel_client.client import Client # noqa 47 | client = Client(enable_mc=True) 48 | ed = time.time() 49 | logger.info(f"initialize client cost {ed - st:.2f} s") -------------------------------------------------------------------------------- /src/ChEF/models/shikra/shikra/__init__.py: -------------------------------------------------------------------------------- 1 | from .shikra import ShikraLlamaForCausalLM, ShikraConfig 2 | -------------------------------------------------------------------------------- /src/ChEF/models/shikra/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .common import ( 2 | print_trainable_params, 3 | show, 4 | draw_bounding_boxes, 5 | post_process_generate_ids, 6 | decode_generate_ids, 7 | smart_tokenizer_and_embedding_resize, 8 | ) 9 | -------------------------------------------------------------------------------- /src/ChEF/models/test_lamm15.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .test_lamm import TestLAMM 3 | from model.LAMM import LAMMSFTModel 4 | 5 | class TestLAMM15(TestLAMM): 6 | def __init__(self, model_path, device=None, task_type='normal', **kwargs): 7 | self.conv_mode = 'simple' 8 | self.model = LAMMSFTModel(**kwargs) 9 | ckpt = torch.load(model_path, map_location=torch.device('cpu')) 10 | self.model.load_state_dict(ckpt, strict=False) # TODO: load delta_ckpt from model_path in lamm_3d.yaml 11 | self.model = self.model.eval().half() 12 | self.task_type = task_type 13 | self.move_to_device(device) 14 | self.model.device = device -------------------------------------------------------------------------------- /src/ChEF/resources/ChEF-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/ChEF/resources/ChEF-logo.png -------------------------------------------------------------------------------- /src/ChEF/scenario/Ch3Ef_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from torch.utils.data import Dataset 4 | 5 | class Ch3EfDataset(Dataset): 6 | task_name = 'Ch3Ef' 7 | dataset_name = 'Ch3Ef' 8 | def __init__(self, base_data_path, dimension, ppl=False, **kwargs): 9 | self.base_data_path = base_data_path 10 | super().__init__() 11 | meta_base_dir = os.path.join(self.base_data_path, 'meta_file') 12 | self.data = json.load(open(os.path.join(meta_base_dir, f'{dimension}.json'))) 13 | self.data = self.data['items'] 14 | self.ppl = ppl 15 | 16 | 17 | def __len__(self): 18 | return len(self.data) 19 | 20 | def __getitem__(self, index): 21 | item = self.data[index] 22 | id = str(item['id']) if 'id' in item else str(index) 23 | 24 | res_dict = { 25 | 'id': id, 26 | 'image_path': [os.path.join(self.base_data_path,img_path) for img_path in self.data[index]['image']], 27 | 'question': self.data[index]['query'], 28 | 'source': self.data[index]['source'] 29 | } 30 | if self.ppl: 31 | res_dict['gt_answers'] = self.data[index]['options'][0] 32 | res_dict['options'] = self.data[index]['options'] 33 | else: 34 | 
res_dict['gt_answers'] = self.data[index]['options'][0] 35 | return res_dict -------------------------------------------------------------------------------- /src/ChEF/scenario/utils.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer 2 | from sentence_transformers import util 3 | from torch.utils.data import Dataset 4 | 5 | class Bert_Similarity: 6 | def __init__(self, model_path = 'sentence-transformers/all-MiniLM-L6-v2') -> None: 7 | self.model = SentenceTransformer(model_path).cuda() 8 | self.cos_func = util.pytorch_cos_sim 9 | 10 | def similarity_score(self, str1, str2): 11 | embedding_1 = self.model.encode(str1, convert_to_tensor=True) 12 | embedding_2 = self.model.encode(str2, convert_to_tensor=True) 13 | score = self.cos_func(embedding_1, embedding_2).item() 14 | return score 15 | 16 | def bert_embedding(self, str): 17 | return self.model.encode(str, convert_to_tensor=True) 18 | 19 | def embedding_similarity_score(self, emb1, emb2): 20 | score_metric = self.cos_func(emb1, emb2) 21 | return score_metric 22 | 23 | 24 | mmbench_rand_acc = {'circular': 2.55, 25 | 'vanilla': 27.57} 26 | 27 | sqa_rand_acc = {'circular': 35.8, 28 | 'vanilla': 35.8} 29 | 30 | rand_acc = {'MMBench': mmbench_rand_acc, 31 | 'ScienceQA': sqa_rand_acc} 32 | -------------------------------------------------------------------------------- /src/ChEF/test/test_model.sh: -------------------------------------------------------------------------------- 1 | model_cfg=$1 2 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 3 | python test/test_model.py ${model_cfg} --debug -------------------------------------------------------------------------------- /src/ChEF/test/test_recipes.py: -------------------------------------------------------------------------------- 1 | scenario_recipes = [ 2 | 'configs/scenario_recipes/CIFAR10/default.yaml', 3 | # 'configs/scenario_recipes/CIFAR10/direct.yaml', 4 | 'configs/scenario_recipes/Flickr30k/default.yaml', 5 | # 'configs/scenario_recipes/Flickr30k/direct.yaml', 6 | 'configs/scenario_recipes/FSC147/default.yaml', 7 | # 'configs/scenario_recipes/FSC147/direct.yaml', 8 | 'configs/scenario_recipes/MMBench/default.yaml', 9 | # 'configs/scenario_recipes/MMBench/direct.yaml', 10 | 'configs/scenario_recipes/MME/default.yaml', 11 | # 'configs/scenario_recipes/MME/direct.yaml', 12 | 'configs/scenario_recipes/Omnibenchmark/default.yaml', 13 | # 'configs/scenario_recipes/Omnibenchmark/direct.yaml', 14 | # 'configs/scenario_recipes/Omnibenchmark/single_ppl.yaml', 15 | 'configs/scenario_recipes/ScienceQA/default.yaml', 16 | # 'configs/scenario_recipes/ScienceQA/direct_CoT.yaml', 17 | # 'configs/scenario_recipes/ScienceQA/direct.yaml', 18 | 'configs/scenario_recipes/SEEDBench/default.yaml', 19 | 'configs/scenario_recipes/VOC2012/default.yaml', 20 | # 'configs/scenario_recipes/VOC2012/direct.yaml', 21 | ] 22 | 23 | desiderata_recipes = [ 24 | 25 | ] -------------------------------------------------------------------------------- /src/config/ChEF/desiderata_recipes/Calibration/MMBench.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: MMBench 3 | base_data_path: ../data/MMBench 4 | ppl_cfg: 5 | content_only: False 6 | split: dev 7 | hint: True 8 | 9 | eval_cfg: 10 | instruction_cfg: 11 | query_type: query_pool 12 | prompt_assigned_ids: 1 # (kosmos: 5) (default: 1) 13 | template_assigned_ids: 1 # (kosmos: 0) (default: 1) 14 | inferencer_cfg: 
15 | inferencer_type: Calibration 16 | batch_size: 6 17 | CoT: True 18 | max_new_tokens: 256 19 | metric_cfg: 20 | metric_type: Calibration -------------------------------------------------------------------------------- /src/config/ChEF/desiderata_recipes/Calibration/ScienceQA.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: ScienceQA 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | ppl: True 5 | option_content: False 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | query_type: query_pool 10 | query_assigned_ids: 1 # otter 1 11 | template_assigned_ids: 1 # otter 1 12 | inferencer_cfg: 13 | inferencer_type: Calibration 14 | batch_size: 8 15 | CoT: True 16 | max_new_tokens: 256 17 | metric_cfg: 18 | metric_type: Calibration -------------------------------------------------------------------------------- /src/config/ChEF/desiderata_recipes/Hallucination/POPE_adversarial.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: POPE_COCO_adversarial 3 | base_data_path: ../data/coco_pope 4 | ppl: True 5 | option_content: False 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | query_type: standard_query 10 | inferencer_cfg: 11 | inferencer_type: PPL 12 | batch_size: 8 13 | CoT: True 14 | max_new_tokens: 256 15 | metric_cfg: 16 | metric_type: Hallucination 17 | -------------------------------------------------------------------------------- /src/config/ChEF/desiderata_recipes/Hallucination/POPE_popular.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: POPE_COCO_popular 3 | base_data_path: ../data/coco_pope 4 | ppl: True 5 | option_content: False 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | query_type: standard_query 10 | inferencer_cfg: 11 | inferencer_type: PPL 12 | batch_size: 8 13 | CoT: True 14 | max_new_tokens: 256 15 | metric_cfg: 16 | metric_type: Hallucination 17 | -------------------------------------------------------------------------------- /src/config/ChEF/desiderata_recipes/Hallucination/POPE_random.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: POPE_COCO_random 3 | base_data_path: ../data/coco_pope 4 | ppl: True 5 | option_content: False 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | query_type: standard_query 10 | inferencer_cfg: 11 | inferencer_type: PPL 12 | batch_size: 16 13 | max_new_tokens: 256 14 | metric_cfg: 15 | metric_type: Hallucination 16 | -------------------------------------------------------------------------------- /src/config/ChEF/desiderata_recipes/ICL/MMBench.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: MMBench 3 | base_data_path: ../data/MMBench 4 | ppl_cfg: 5 | content_only: False 6 | split: dev 7 | hint: True 8 | 9 | eval_cfg: 10 | instruction_cfg: 11 | prompt_type: singleturn 12 | prompt_assigned_ids: 1 # (kosmos: 5) (default: 1) 13 | template_assigned_ids: 1 # (kosmos: 0) (default: 1) 14 | incontext_cfg: 15 | retriever_type: random 16 | ice_num: 1 17 | random_seed: 1 18 | inferencer_cfg: 19 | inferencer_type: PPL 20 | batch_size: 1 21 | CoT: False 22 | max_new_tokens: 256 23 | metric_cfg: 24 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/desiderata_recipes/ICL/ScienceQA.yaml: 
-------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: ScienceQA 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | ppl: True 5 | option_content: False 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | query_type: standard_query 10 | incontext_cfg: 11 | ice_with_image: False 12 | retriever_type: random 13 | ice_num: 3 14 | random_seed: 1 15 | inferencer_cfg: 16 | inferencer_type: ICL_PPL 17 | batch_size: 4 18 | CoT: True 19 | max_new_tokens: 256 20 | metric_cfg: 21 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/desiderata_recipes/Insfollow/MMBench.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: MMBench 3 | base_data_path: ../data/MMBench 4 | ppl_cfg: 5 | content_only: False 6 | split: dev 7 | hint: True 8 | option_map: '' 9 | 10 | eval_cfg: 11 | instruction_cfg: 12 | query_type: query_pool 13 | prompt_assigned_ids: 1 # (kosmos: 5) (default: 1) 14 | template_assigned_ids: 1 # (kosmos: 0) (default: 1) 15 | inferencer_cfg: 16 | inferencer_type: PPL 17 | batch_size: 6 18 | CoT: True 19 | max_new_tokens: 256 20 | metric_cfg: 21 | metric_type: Instruct_Follow -------------------------------------------------------------------------------- /src/config/ChEF/desiderata_recipes/Insfollow/ScienceQA.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: ScienceQA 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | ppl: True 5 | option_content: False 6 | option_map: '' 7 | 8 | 9 | eval_cfg: 10 | instruction_cfg: 11 | query_type: query_pool 12 | query_assigned_ids: 0 # otter 1 13 | template_assigned_ids: 0 # otter 1 14 | inferencer_cfg: 15 | inferencer_type: PPL 16 | batch_size: 8 17 | CoT: True 18 | max_new_tokens: 256 19 | metric_cfg: 20 | metric_type: Instruct_Follow 21 | 22 | 23 | -------------------------------------------------------------------------------- /src/config/ChEF/desiderata_recipes/Robust/MMBench.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: MMBench 3 | base_data_path: ../data/MMBench 4 | ppl_cfg: 5 | content_only: False 6 | split: dev 7 | hint: True 8 | img_crp: True 9 | text_crp: True 10 | data_c_path: ../data/ChEF/MMBench_C 11 | 12 | eval_cfg: 13 | instruction_cfg: 14 | query_type: query_pool 15 | query_assigned_ids: 1 # (kosmos: 5) (default: 1) 16 | template_assigned_ids: 1 # (kosmos: 0) (default: 1) 17 | inferencer_cfg: 18 | inferencer_type: PPL 19 | batch_size: 4 20 | CoT: True 21 | max_new_tokens: 256 22 | metric_cfg: 23 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/desiderata_recipes/Robust/ScienceQA.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: ScienceQA 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | ppl: True 5 | option_content: False 6 | img_crp: True 7 | text_crp: True 8 | data_c_path: ../data/ChEF/ScienceQA_C 9 | 10 | eval_cfg: 11 | instruction_cfg: 12 | query_type: query_pool 13 | query_assigned_ids: 0 # otter 1 14 | template_assigned_ids: 0 # otter 1 15 | inferencer_cfg: 16 | inferencer_type: PPL 17 | batch_size: 8 18 | CoT: True 19 | max_new_tokens: 256 20 | metric_cfg: 21 | metric_type: basic -------------------------------------------------------------------------------- 
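For orientation between the recipe files above and the dataset code earlier in this dump, here is a hedged sketch of how a recipe YAML decomposes. The file path, data paths, and the choice of Ch3EfDataset as the example target are assumptions for illustration; the actual wiring is done by the ChEF evaluator, not by this snippet.

# Sketch only: a ChEF recipe splits into scenario_cfg (dataset arguments) and eval_cfg
# (instruction_cfg / inferencer_cfg / metric_cfg). Paths assume running from src/.
import yaml
from ChEF.scenario.Ch3Ef_dataset import Ch3EfDataset   # defined earlier in this dump

with open('config/ChEF/desiderata_recipes/Robust/ScienceQA.yaml') as f:
    recipe = yaml.safe_load(f)
scenario_cfg = recipe['scenario_cfg']   # dataset_name, base_data_path, ppl, corruption paths, ...
eval_cfg = recipe['eval_cfg']           # instruction_cfg / inferencer_cfg / metric_cfg

# A dataset class consumes scenario_cfg-style keyword arguments, e.g.:
dataset = Ch3EfDataset(base_data_path='../data/Ch3Ef', dimension='Harmless', ppl=True)
sample = dataset[0]   # {'id', 'image_path', 'question', 'options', 'gt_answers', 'source'}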
/src/config/ChEF/models/gemini.yaml: -------------------------------------------------------------------------------- 1 | model_name: Gemini 2 | api_key: 3 | gpt_name: gemini-pro-vision 4 | safety_block_none: True -------------------------------------------------------------------------------- /src/config/ChEF/models/gpt.yaml: -------------------------------------------------------------------------------- 1 | model_name: GPT 2 | api_key: 3 | gpt_name: gpt-4-vision-preview -------------------------------------------------------------------------------- /src/config/ChEF/models/instructblip_vicuna.yaml: -------------------------------------------------------------------------------- 1 | model_name: InstructBLIP -------------------------------------------------------------------------------- /src/config/ChEF/models/internlm_xcomposer.yaml: -------------------------------------------------------------------------------- 1 | model_name: InternLMXComposer 2 | model_path: ../model_zoo/InternLM/internlm-xcomposer2-vl-7b -------------------------------------------------------------------------------- /src/config/ChEF/models/kosmos2.yaml: -------------------------------------------------------------------------------- 1 | model_name: Kosmos2 2 | model_path: ../model_zoo/Kosmos/kosmos-2.pt 3 | if_grounding: False # set True for detection and grounding evaluation -------------------------------------------------------------------------------- /src/config/ChEF/models/lamm.yaml: -------------------------------------------------------------------------------- 1 | model_name: LAMM 2 | model_path: ../model_zoo/LAMM/LAMM_v1.0/vicuna13b_v0_lamm186k_ep2_clip_system/pytorch_model.pt 3 | llm_ckpt_path: ../model_zoo/Vicuna/13b_v0 4 | encoder_ckpt_path: ../model_zoo/clip-vit-large-patch14 5 | task_type: normal 6 | encoder_pretrain: clip 7 | vision_type: image 8 | vision_feature_type: local 9 | vision_output_layer: -2 10 | num_vision_token: 256 11 | lora_r: 32 12 | lora_alpha: 32 13 | lora_dropout: 0.1 14 | lora_target_modules: ['q_proj', 'k_proj', 'v_proj', 'o_proj'] 15 | max_tgt_len: 1024 16 | stage: 2 -------------------------------------------------------------------------------- /src/config/ChEF/models/lamm15.yaml: -------------------------------------------------------------------------------- 1 | model_name: LAMM_SFT 2 | model_path: ../model_zoo/LAMM/LAMM_v1.5/lamm_sft_finetune/pytorch_model.pt 3 | llm_ckpt_path: ../model_zoo/Vicuna/13b_v0 4 | encoder_ckpt_path: ../model_zoo/clip-vit-large-patch14 5 | task_type: normal 6 | encoder_pretrain: clip 7 | vision_type: image 8 | vision_feature_type: local 9 | vision_output_layer: -2 10 | num_vision_token: 256 11 | max_tgt_len: 1024 12 | stage: 3 -------------------------------------------------------------------------------- /src/config/ChEF/models/lamm_3d.yaml: -------------------------------------------------------------------------------- 1 | model_name: LAMM 2 | model_path: ../model_zoo/LAMM/vicuna13b_v0_lamm10k_ep2_epcl_system/pytorch_model.pt 3 | llm_ckpt_path: ../model_zoo/vicuna/13b_v0 4 | encoder_ckpt_path: ../model_zoo/clip_vit-L-14_scannet_ddp_ep1080_vit256token/checkpoint_best.pth 5 | task_type: normal 6 | encoder_pretrain: epcl 7 | vision_type: pcl 8 | vision_feature_type: local 9 | vision_output_layer: -2 10 | num_vision_token: 256 11 | lora_r: 32 12 | lora_alpha: 32 13 | lora_dropout: 0.1 14 | lora_target_modules: ['q_proj', 'k_proj', 'v_proj', 'o_proj'] 15 | max_tgt_len: 1024 16 | stage: 2
-------------------------------------------------------------------------------- /src/config/ChEF/models/llamaadapterv2.yaml: -------------------------------------------------------------------------------- 1 | model_name: LLaMA-Adapter-v2 2 | model_path: ../model_zoo/LLaMAAdapter/LLaMA_model_weights 3 | max_seq_len: 1024 4 | max_batch_size: 20 -------------------------------------------------------------------------------- /src/config/ChEF/models/llava15.yaml: -------------------------------------------------------------------------------- 1 | model_name: LLaVA1.5 2 | model_path: ../model_zoo/LLaVA/LLaVA_v1.5/llava-v1.5-13b 3 | vis_processor_path: ../model_zoo/clip-vit-large-patch14 -------------------------------------------------------------------------------- /src/config/ChEF/models/llavarlhf.yaml: -------------------------------------------------------------------------------- 1 | model_name: LLaVARLHF 2 | model_path: ../model_zoo/LLaVA-RLHF-13b-v1.5-336 3 | vis_processor_path: ../model_zoo/clip-vit-large-patch14 -------------------------------------------------------------------------------- /src/config/ChEF/models/minigpt4.yaml: -------------------------------------------------------------------------------- 1 | model_name: MiniGPT-4 2 | model_path: ../model_zoo/MiniGPT4 3 | cfg_path: ChEF/models/minigpt4/minigpt4_eval.yaml -------------------------------------------------------------------------------- /src/config/ChEF/models/mplug.yaml: -------------------------------------------------------------------------------- 1 | model_name: mPLUG-Owl 2 | model_path: ../model_zoo/mPLUG_Owl/mplug-owl-llama-7b -------------------------------------------------------------------------------- /src/config/ChEF/models/octavius_2d+3d.yaml: -------------------------------------------------------------------------------- 1 | model_name: Octavius 2 | stage: 2 3 | octavius_modality: ['image', 'pcl'] 4 | 5 | llm_ckpt_path: ../model_zoo/vicuna_ckpt/13b_v0 6 | delta_ckpt_path: ../ckpt/octavius_2d+3d_e6_bs64_raw/pytorch_model.pt 7 | 8 | encoder_pretrain: clip 9 | vision_feature_type: local 10 | vision_output_layer: -2 11 | num_vision_token: 256 12 | 13 | # peft and lora 14 | peft_type: moe_lora 15 | moe_lora_num_experts: 6 16 | moe_gate_mode: top2_gate 17 | lora_r: 32 18 | lora_alpha: 32 19 | lora_dropout: 0.1 20 | lora_target_modules: ['q_proj', 'k_proj', 'v_proj', 'o_proj'] 21 | 22 | # pcl modality 23 | num_query_rsp_3d: 16 24 | hidden_size_rsp_3d: 768 25 | num_layers_rsp_3d: 1 26 | num_heads_rsp_3d: 8 27 | 28 | max_tgt_len: 400 29 | conv_mode: simple 30 | -------------------------------------------------------------------------------- /src/config/ChEF/models/octavius_2d.yaml: -------------------------------------------------------------------------------- 1 | model_name: Octavius_2d 2 | stage: 2 3 | octavius_modality: ['image'] 4 | 5 | llm_ckpt_path: ../model_zoo/vicuna_ckpt/13b_v0 6 | delta_ckpt_path: ../ckpt/octavius_2d_e4_bs64_raw/pytorch_model.pt 7 | 8 | encoder_pretrain: clip 9 | vision_feature_type: local 10 | vision_output_layer: -2 11 | num_vision_token: 256 12 | 13 | # peft and lora 14 | peft_type: moe_lora 15 | moe_lora_num_experts: 4 16 | moe_gate_mode: top2_gate 17 | lora_r: 32 18 | lora_alpha: 32 19 | lora_dropout: 0.1 20 | lora_target_modules: ['q_proj', 'k_proj', 'v_proj', 'o_proj'] 21 | 22 | # pcl modality 23 | num_query_rsp_3d: 16 24 | hidden_size_rsp_3d: 768 25 | num_layers_rsp_3d: 1 26 | num_heads_rsp_3d: 8 27 | 28 | max_tgt_len: 400 29 | conv_mode: simple 30 | 
-------------------------------------------------------------------------------- /src/config/ChEF/models/octavius_3d.yaml: -------------------------------------------------------------------------------- 1 | model_name: Octavius_3d 2 | stage: 2 3 | octavius_modality: ['pcl'] 4 | 5 | llm_ckpt_path: ../model_zoo/vicuna_ckpt/13b_v0 6 | delta_ckpt_path: ../ckpt/octavius_3d_e3_bs64/pytorch_model.pt 7 | 8 | encoder_pretrain: clip 9 | vision_feature_type: local 10 | vision_output_layer: -2 11 | num_vision_token: 256 12 | 13 | # peft and lora 14 | peft_type: moe_lora 15 | moe_lora_num_experts: 3 16 | moe_gate_mode: top2_gate 17 | lora_r: 32 18 | lora_alpha: 32 19 | lora_dropout: 0.1 20 | lora_target_modules: ['q_proj', 'k_proj', 'v_proj', 'o_proj'] 21 | 22 | # pcl modality 23 | num_query_rsp_3d: 16 24 | hidden_size_rsp_3d: 768 25 | num_layers_rsp_3d: 1 26 | num_heads_rsp_3d: 8 27 | 28 | max_tgt_len: 400 29 | conv_mode: simple 30 | -------------------------------------------------------------------------------- /src/config/ChEF/models/otter.yaml: -------------------------------------------------------------------------------- 1 | model_name: Otter 2 | model_path: ../model_zoo/Otter/otter-9b-hf -------------------------------------------------------------------------------- /src/config/ChEF/models/qwen_vl.yaml: -------------------------------------------------------------------------------- 1 | model_name: QwenVL 2 | model_path: ../model_zoo/Qwen/Qwen-VL-Chat -------------------------------------------------------------------------------- /src/config/ChEF/models/rlhfv.yaml: -------------------------------------------------------------------------------- 1 | model_name: RLHFV 2 | model_path: ../model_zoo/RLHF-V -------------------------------------------------------------------------------- /src/config/ChEF/models/shikra.yaml: -------------------------------------------------------------------------------- 1 | model_name: Shikra 2 | model_path: ../model_zoo/Shikra/shikra-7b 3 | encoder_ckpt_path: ../model_zoo/clip-vit-large-patch14 4 | -------------------------------------------------------------------------------- /src/config/ChEF/models/test.yaml: -------------------------------------------------------------------------------- 1 | model_name: Test -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/CIFAR10/direct.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: CIFAR10 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | eval_cfg: 5 | instruction_cfg: 6 | prompt_type: singleturn 7 | prompt_assigned_ids: 0 # (mplug: 6, llava15, lamm15: 1) (default:0) 8 | template_assigned_ids: 0 # (otter, lamm, minigpt4 :1, llava15, lamm15: 3) (default:0) 9 | inferencer_cfg: 10 | inferencer_type: Direct 11 | max_new_tokens: 16 12 | batch_size: 32 13 | metric_cfg: 14 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/CIFAR10/ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: CIFAR10 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | ppl: True 5 | eval_cfg: 6 | instruction_cfg: 7 | prompt_type: singleturn 8 | prompt_assigned_ids: 0 # (mplug: 6, llava15, lamm15: 1) (default:0) 9 | template_assigned_ids: 0 # (lamm, minigpt4 :1, otter: 2, llava15, lamm15: 3) (default:0) 10 | inferencer_cfg: 11 | inferencer_type: PPL 12 | batch_size: 
4 13 | metric_cfg: 14 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Ch3Ef/Harmless.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Ch3Ef 3 | base_data_path: ../data/Ch3Ef 4 | dimension: Harmless 5 | 6 | eval_cfg: 7 | instruction_cfg: 8 | prompt_type: singleturn 9 | inferencer_cfg: 10 | inferencer_type: Direct 11 | batch_size: 8 12 | max_new_tokens: 256 13 | metric_cfg: 14 | metric_type: Ch3Ef -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Ch3Ef/Harmless_ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Ch3Ef 3 | base_data_path: ../data/Ch3Ef 4 | dimension: Harmless 5 | ppl: True 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | prompt_type: singleturn 10 | inferencer_cfg: 11 | inferencer_type: PPL 12 | batch_size: 1 13 | max_new_tokens: 256 14 | metric_cfg: 15 | metric_type: Ch3Ef 16 | ppl: True -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Ch3Ef/Helpful.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Ch3Ef 3 | base_data_path: ../data/Ch3Ef 4 | dimension: Helpful 5 | 6 | eval_cfg: 7 | instruction_cfg: 8 | prompt_type: singleturn 9 | inferencer_cfg: 10 | inferencer_type: Direct 11 | batch_size: 8 12 | max_new_tokens: 256 13 | metric_cfg: 14 | metric_type: Ch3Ef -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Ch3Ef/Helpful_ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Ch3Ef 3 | base_data_path: ../data/Ch3Ef 4 | dimension: Helpful 5 | ppl: True 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | prompt_type: singleturn 10 | inferencer_cfg: 11 | inferencer_type: PPL 12 | batch_size: 1 13 | max_new_tokens: 256 14 | metric_cfg: 15 | metric_type: Ch3Ef 16 | ppl: True -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Ch3Ef/Honest.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Ch3Ef 3 | base_data_path: ../data/Ch3Ef 4 | dimension: Honest 5 | 6 | eval_cfg: 7 | instruction_cfg: 8 | prompt_type: singleturn 9 | inferencer_cfg: 10 | inferencer_type: Direct 11 | batch_size: 8 12 | max_new_tokens: 256 13 | metric_cfg: 14 | metric_type: Ch3Ef -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Ch3Ef/Honest_ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Ch3Ef 3 | base_data_path: ../data/Ch3Ef 4 | dimension: Honest 5 | ppl: True 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | prompt_type: singleturn 10 | inferencer_cfg: 11 | inferencer_type: PPL 12 | batch_size: 2 13 | max_new_tokens: 256 14 | metric_cfg: 15 | metric_type: Ch3Ef 16 | ppl: True -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/FSC147/direct.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: FSC147 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 
6 | instruction_cfg: 7 | prompt_type: singleturn 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | max_new_tokens: 256 11 | batch_size: 32 12 | metric_cfg: 13 | metric_type: basic 14 | inference_type: direct -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/FSC147/ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: FSC147 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | ppl_cfg: 5 | heatmap_width: 2 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | prompt_type: singleturn 10 | inferencer_cfg: 11 | inferencer_type: PPL 12 | batch_size: 8 13 | metric_cfg: 14 | metric_type: basic 15 | inference_type: ppl -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Flickr30k/direct.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Flickr30k 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | prompt_type: singleturn 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | max_new_tokens: 64 11 | batch_size: 16 12 | metric_cfg: 13 | metric_type: basic 14 | strategy: direct -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Flickr30k/random_ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Flickr30k 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | ppl_cfg: 5 | negative_opt_num: 3 6 | random_seed: 0 7 | strategy: random 8 | 9 | eval_cfg: 10 | instruction_cfg: 11 | prompt_type: singleturn 12 | inferencer_cfg: 13 | inferencer_type: PPL 14 | batch_size: 4 15 | metric_cfg: 16 | metric_type: basic 17 | strategy: acc -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Flickr30k/topp_ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Flickr30k 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | ppl_cfg: 5 | negative_opt_num: 3 6 | random_seed: 0 7 | strategy: top_similarity 8 | model_path: ../model_zoo/all-MiniLM-L6-v2 9 | 10 | eval_cfg: 11 | instruction_cfg: 12 | prompt_type: singleturn 13 | inferencer_cfg: 14 | inferencer_type: PPL 15 | batch_size: 4 16 | metric_cfg: 17 | metric_type: basic 18 | strategy: acc -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/AI2D.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: AI2D 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | batch_size: 8 11 | CoT: True 12 | max_new_tokens: 256 13 | metric_cfg: 14 | metric_type: LAMM -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/CIFAR10.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: CIFAR10_LAMM 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | batch_size: 32 11 | metric_cfg: 12 | metric_type: LAMM 
-------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/CelebA_hair.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: CelebA(Hair) 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | batch_size: 32 11 | metric_cfg: 12 | metric_type: LAMM -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/CelebA_smile.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: CelebA(Smile) 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | batch_size: 32 11 | metric_cfg: 12 | metric_type: LAMM -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/FSC147.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: FSC147_LAMM 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | max_new_tokens: 256 11 | batch_size: 32 12 | metric_cfg: 13 | metric_type: basic 14 | inference_type: direct -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/Flickr30k.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Flickr30k_LAMM 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | batch_size: 32 11 | metric_cfg: 12 | metric_type: LAMM -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/SVT.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: SVT 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | batch_size: 16 11 | metric_cfg: 12 | metric_type: LAMM -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/ScanNet.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: ScanNet_LAMM 3 | base_data_path: ../data/LAMM/3D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct3D 10 | batch_size: 1 11 | metric_cfg: 12 | metric_type: LAMM 13 | -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/ScanQA.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: ScanQA_LAMM 3 | base_data_path: ../data/LAMM/3D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct3D 10 | batch_size: 1 11 | metric_cfg: 12 | metric_type: LAMM -------------------------------------------------------------------------------- 
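The strategy: top_similarity setting in the Flickr30k topp_ppl recipe above pairs naturally with the Bert_Similarity helper from ChEF/scenario/utils.py. The sketch below only illustrates that idea: the captions are made up, the model path mirrors the recipe, and the helper moves the encoder to CUDA, so a GPU is assumed; the real option construction lives in the Flickr30k dataset code, not here.

# Sketch only: rank candidate negative captions by embedding similarity to the
# ground-truth caption, so the most similar ones become hard PPL distractors.
from ChEF.scenario.utils import Bert_Similarity

sim = Bert_Similarity(model_path='../model_zoo/all-MiniLM-L6-v2')
gt_caption = "A man rides a horse along the beach."
candidates = [
    "A person rides an animal near the sea.",
    "A dog runs across a grassy field.",
    "Two children play chess indoors.",
]
ranked = sorted(candidates, key=lambda c: sim.similarity_score(gt_caption, c), reverse=True)
negative_options = ranked[:3]   # negative_opt_num: 3 in the recipe above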
/src/config/ChEF/scenario_recipes/LAMM/ScanRefer.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: ScanRefer_LAMM 3 | base_data_path: ../data/LAMM/3D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct3D 10 | batch_size: 1 11 | metric_cfg: 12 | metric_type: LAMM 13 | -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/ScienceQA.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: ScienceQA_LAMM 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | batch_size: 1 11 | CoT: True 12 | max_new_tokens: 256 13 | metric_cfg: 14 | metric_type: LAMM -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/UCMerced.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: UCMerced 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | batch_size: 16 11 | metric_cfg: 12 | metric_type: LAMM -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/VOC2012.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: VOC2012_LAMM 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | batch_size: 32 11 | metric_cfg: 12 | metric_type: LAMM -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/locating_LSP.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Locating_LSP 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | max_new_tokens: 256 11 | batch_size: 32 12 | metric_cfg: 13 | metric_type: LAMM -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/locating_VOC2012.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Locating_VOC2012 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | max_new_tokens: 256 11 | batch_size: 32 12 | metric_cfg: 13 | metric_type: LAMM -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/MMBench/direct.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: MMBench 3 | base_data_path: ../data/MMBench 4 | split: dev 5 | hint: True 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | prompt_type: singleturn 10 | prompt_assigned_ids: 1 11 | template_assigned_ids: 1 12 | inferencer_cfg: 13 | inferencer_type: Direct 14 | batch_size: 2 15 | CoT: False 16 | max_new_tokens: 256 17 | metric_cfg: 18 | 
metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/MMBench/ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: MMBench 3 | base_data_path: ../data/MMBench 4 | ppl_cfg: 5 | content_only: False 6 | split: dev 7 | hint: True 8 | 9 | eval_cfg: 10 | instruction_cfg: 11 | prompt_type: singleturn 12 | prompt_assigned_ids: 1 13 | template_assigned_ids: 1 14 | inferencer_cfg: 15 | inferencer_type: PPL 16 | batch_size: 1 17 | CoT: False 18 | max_new_tokens: 256 19 | metric_cfg: 20 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/MME/direct.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: MME 3 | base_data_path: ../data/MME 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | prompt_type: singleturn 8 | prompt_assigned_ids: 2 9 | template_assigned_ids: 0 10 | inferencer_cfg: 11 | inferencer_type: Direct 12 | batch_size: 8 13 | metric_cfg: 14 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/MME/ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: MME 3 | base_data_path: ../data/MME 4 | ppl_cfg: True 5 | option_lower: True 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | prompt_type: singleturn 10 | prompt_assigned_ids: 2 11 | template_assigned_ids: 0 12 | inferencer_cfg: 13 | inferencer_type: PPL 14 | batch_size: 8 15 | metric_cfg: 16 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/MMMU/default.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: MMMU 3 | base_data_path: ../data/MMMU 4 | img_folder: ../data/MMMU_images 5 | 6 | eval_cfg: 7 | instruction_cfg: 8 | prompt_type: singleturn 9 | inferencer_cfg: 10 | inferencer_type: Direct 11 | batch_size: 1 12 | max_new_tokens: 256 13 | metric_cfg: 14 | metric_type: MMMU -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Octavius3D/nr3d_caption_direct3d.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: OctaviusPCLDataset 3 | base_data_path: ../data/Octavius/3D_Benchmark/meta_file 4 | task_name: Caption 5 | inference_dataset_name: nr3d 6 | vision_root_path: ../data/Octavius/3D_Instruct 7 | 8 | eval_cfg: 9 | instruction_cfg: 10 | query_type: standard_query 11 | inferencer_cfg: 12 | inferencer_type: Direct3D 13 | batch_size: 1 14 | metric_cfg: 15 | metric_type: Octavius3D -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Octavius3D/scan_caption_direct3d.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: OctaviusPCLDataset 3 | base_data_path: ../data/Octavius/3D_Benchmark/meta_file 4 | task_name: Caption 5 | inference_dataset_name: scannet 6 | vision_root_path: ../data/Octavius/3D_Instruct 7 | 8 | eval_cfg: 9 | instruction_cfg: 10 | query_type: standard_query 11 | inferencer_cfg: 12 | inferencer_type: Direct3D 13 | batch_size: 1 14 | metric_cfg: 15 | metric_type: Octavius3D 
-------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Octavius3D/scan_cls_direct3d.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: OctaviusPCLDataset 3 | base_data_path: ../data/Octavius/3D_Benchmark/meta_file 4 | task_name: Classification 5 | inference_dataset_name: scannet 6 | vision_root_path: ../data/Octavius/3D_Instruct 7 | 8 | eval_cfg: 9 | instruction_cfg: 10 | query_type: standard_query 11 | inferencer_cfg: 12 | inferencer_type: Direct3D 13 | batch_size: 1 14 | metric_cfg: 15 | metric_type: Octavius3D 16 | -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Octavius3D/scan_vqa_direct3d.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: OctaviusPCLDataset 3 | base_data_path: ../data/Octavius/3D_Benchmark/meta_file 4 | task_name: VQA 5 | inference_dataset_name: scannet 6 | vision_root_path: ../data/Octavius/3D_Instruct 7 | 8 | eval_cfg: 9 | instruction_cfg: 10 | query_type: standard_query 11 | inferencer_cfg: 12 | inferencer_type: Direct3D 13 | batch_size: 1 14 | metric_cfg: 15 | metric_type: Octavius3D -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Octavius3D/shapenet_cls_direct3d.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: OctaviusPCLDataset 3 | base_data_path: ../data/Octavius/3D_Benchmark/meta_file 4 | task_name: Classification 5 | inference_dataset_name: shapenet 6 | vision_root_path: ../data/Octavius/3D_Instruct 7 | 8 | eval_cfg: 9 | instruction_cfg: 10 | query_type: standard_query 11 | inferencer_cfg: 12 | inferencer_type: Direct3D 13 | batch_size: 1 14 | metric_cfg: 15 | metric_type: Octavius3D 16 | -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Omnibenchmark/multiturn_direct.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Omnibenchmark 3 | base_data_path: ../data/ChEF/OmniBenchmark_Bamboo 4 | bamboo_tree_path: ../data/Bamboo/sensexo_visual_add_academic_add_state_V4.visual.json 5 | ppl_cfg: 6 | negative_opt_num: 3 7 | random_seed: 0 8 | 9 | eval_cfg: 10 | instruction_cfg: 11 | prompt_type: multiturn 12 | inferencer_cfg: 13 | inferencer_type: Multi_Direct 14 | batch_size: 8 15 | metric_cfg: 16 | metric_type: basic 17 | inference_type: multiturn 18 | ppl: False 19 | bamboo_tree_path: ../data/Bamboo/sensexo_visual_add_academic_add_state_V4.visual.json 20 | -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Omnibenchmark/multiturn_ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Omnibenchmark 3 | base_data_path: ../data/ChEF/OmniBenchmark_Bamboo 4 | bamboo_tree_path: ../data/Bamboo/sensexo_visual_add_academic_add_state_V4.visual.json 5 | ppl_cfg: 6 | negative_opt_num: 3 7 | random_seed: 0 8 | 9 | eval_cfg: 10 | instruction_cfg: 11 | prompt_type: multiturn 12 | inferencer_cfg: 13 | inferencer_type: Multi_PPL 14 | batch_size: 8 15 | metric_cfg: 16 | metric_type: basic 17 | inference_type: multiturn 18 | ppl: True 19 | bamboo_tree_path: 
../data/Bamboo/sensexo_visual_add_academic_add_state_V4.visual.json 20 | -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Omnibenchmark/singleturn_direct.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Omnibenchmark 3 | base_data_path: ../data/ChEF/OmniBenchmark_Bamboo 4 | bamboo_tree_path: ../data/Bamboo/sensexo_visual_add_academic_add_state_V4.visual.json 5 | multi_turn: False 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | prompt_type: singleturn 10 | inferencer_cfg: 11 | inferencer_type: Direct 12 | batch_size: 8 13 | metric_cfg: 14 | metric_type: basic 15 | inference_type: singleturn 16 | ppl: False 17 | bamboo_tree_path: ../data/Bamboo/sensexo_visual_add_academic_add_state_V4.visual.json 18 | -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Omnibenchmark/singleturn_ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Omnibenchmark 3 | base_data_path: ../data/ChEF/OmniBenchmark_Bamboo 4 | bamboo_tree_path: ../data/Bamboo/sensexo_visual_add_academic_add_state_V4.visual.json 5 | ppl_cfg: 6 | negative_opt_num: 3 7 | random_seed: 0 8 | multi_turn: False 9 | 10 | eval_cfg: 11 | instruction_cfg: 12 | prompt_type: singleturn 13 | inferencer_cfg: 14 | inferencer_type: PPL 15 | batch_size: 8 16 | metric_cfg: 17 | metric_type: basic 18 | inference_type: singleturn 19 | ppl: True 20 | bamboo_tree_path: ../data/Bamboo/sensexo_visual_add_academic_add_state_V4.visual.json 21 | -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/SEEDBench-2/ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: SEEDBench2 3 | base_data_path: ../data/SEEDBench2 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | prompt_type: singleturn 8 | prompt_assigned_ids: 2 9 | template_assigned_ids: 0 10 | inferencer_cfg: 11 | inferencer_type: PPL 12 | batch_size: 1 13 | metric_cfg: 14 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/SEEDBench/default.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: SEEDBench 3 | base_data_path: ../data/SEED-Bench 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | prompt_type: singleturn 8 | prompt_assigned_ids: 4 9 | template_assigned_ids: 0 10 | inferencer_cfg: 11 | inferencer_type: PPL 12 | batch_size: 8 13 | metric_cfg: 14 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/ScienceQA/direct.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: ScienceQA 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | prompt_type: singleturn 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | batch_size: 8 11 | CoT: False 12 | max_new_tokens: 256 13 | metric_cfg: 14 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/ScienceQA/direct_CoT.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: ScienceQA 3 | base_data_path: 
../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | prompt_type: singleturn 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | batch_size: 8 11 | CoT: True 12 | max_new_tokens: 256 13 | metric_cfg: 14 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/ScienceQA/ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: ScienceQA 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | ppl: True 5 | option_content: False 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | prompt_type: singleturn 10 | prompt_assigned_ids: 0 11 | template_assigned_ids: 0 12 | inferencer_cfg: 13 | inferencer_type: PPL 14 | batch_size: 8 15 | CoT: False 16 | max_new_tokens: 256 17 | metric_cfg: 18 | metric_type: basic 19 | -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/ScienceQA/ppl_CoT.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: ScienceQA 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | ppl: True 5 | option_content: False 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | prompt_type: singleturn 10 | prompt_assigned_ids: 0 11 | template_assigned_ids: 0 12 | inferencer_cfg: 13 | inferencer_type: PPL 14 | batch_size: 2 15 | CoT: True 16 | max_new_tokens: 128 17 | metric_cfg: 18 | metric_type: basic 19 | -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/VOC2012/kosmos2_multiturn_direct.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: VOC2012 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | prompt_type: multiturn 8 | prompt_assigned_ids: 2 9 | template_assigned_ids: 2 10 | inferencer_cfg: 11 | inferencer_type: Multi_Direct 12 | max_new_tokens: 512 13 | batch_size: 8 14 | metric_cfg: 15 | metric_type: KOSMOS -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/VOC2012/kosmos2_multiturn_ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: VOC2012 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | option_template: kosmos 5 | ppl_cfg: 6 | negative_opt_num: 3 7 | random_seed: 0 8 | 9 | eval_cfg: 10 | instruction_cfg: 11 | prompt_type: multiturn 12 | prompt_assigned_ids: 2 13 | template_assigned_ids: 2 14 | inferencer_cfg: 15 | inferencer_type: Multi_PPL 16 | batch_size: 8 17 | metric_cfg: 18 | metric_type: KOSMOS 19 | inference_type: ppl -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/VOC2012/kosmos2_singleturn_direct.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: VOC2012 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | prompt_type: singleturn 8 | prompt_assigned_ids: 1 9 | inferencer_cfg: 10 | inferencer_type: Direct 11 | max_new_tokens: 512 12 | batch_size: 8 13 | metric_cfg: 14 | metric_type: KOSMOS -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/VOC2012/multiturn_direct.yaml: -------------------------------------------------------------------------------- 1 | 
scenario_cfg: 2 | dataset_name: VOC2012 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | prompt_type: multiturn 8 | inferencer_cfg: 9 | inferencer_type: Multi_Direct 10 | max_new_tokens: 64 11 | batch_size: 16 12 | metric_cfg: 13 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/VOC2012/multiturn_ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: VOC2012 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | ppl_cfg: 5 | negative_opt_num: 3 6 | random_seed: 0 7 | 8 | eval_cfg: 9 | instruction_cfg: 10 | prompt_type: multiturn 11 | inferencer_cfg: 12 | inferencer_type: Multi_PPL 13 | batch_size: 8 14 | metric_cfg: 15 | metric_type: basic 16 | inference_type: ppl -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/VOC2012/shikra_multiturn_direct.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: VOC2012 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | prompt_type: multiturn 8 | prompt_assigned_ids: 0 9 | template_assigned_ids: 0 10 | inferencer_cfg: 11 | inferencer_type: Multi_Direct 12 | max_new_tokens: 64 13 | batch_size: 16 14 | metric_cfg: 15 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/VOC2012/shikra_multiturn_ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: VOC2012 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | option_template: shikra 5 | ppl_cfg: 6 | negative_opt_num: 3 7 | random_seed: 0 8 | 9 | eval_cfg: 10 | instruction_cfg: 11 | prompt_type: multiturn 12 | prompt_assigned_ids: 1 13 | template_assigned_ids: 1 14 | inferencer_cfg: 15 | inferencer_type: Multi_PPL 16 | batch_size: 8 17 | metric_cfg: 18 | metric_type: basic 19 | inference_type: ppl -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/VOC2012/shikra_singleturn_direct.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: VOC2012 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | multi_turn: False 5 | 6 | eval_cfg: 7 | instruction_cfg: 8 | prompt_type: singleturn 9 | prompt_assigned_ids: 1 10 | inferencer_cfg: 11 | inferencer_type: Direct 12 | max_new_tokens: 512 13 | batch_size: 8 14 | metric_cfg: 15 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/VOC2012/singleturn_direct.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: VOC2012 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | multi_turn: False 5 | 6 | eval_cfg: 7 | instruction_cfg: 8 | prompt_type: singleturn 9 | inferencer_cfg: 10 | inferencer_type: Direct 11 | max_new_tokens: 64 12 | batch_size: 16 13 | metric_cfg: 14 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Winoground/default.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Winoground 3 | base_data_path: ../data/Winoground 4 | ppl: 
True 5 | 6 | eval_cfg: 7 | instruction_cfg: 8 | query_type: standard_query 9 | inferencer_cfg: 10 | inferencer_type: PPL 11 | batch_size: 16 12 | max_new_tokens: 32 13 | multi_img: True 14 | metric_cfg: 15 | metric_type: Winoground 16 | -------------------------------------------------------------------------------- /src/dist.py: -------------------------------------------------------------------------------- 1 | import os 2 | import socket 3 | import time 4 | import torch 5 | 6 | def init_distributed_mode(): 7 | if 'SLURM_PROCID' in os.environ: 8 | global_rank = int(os.environ['SLURM_PROCID']) 9 | world_size = int(os.environ['SLURM_NPROCS']) 10 | local_rank = global_rank % torch.cuda.device_count() 11 | else: 12 | print('Not using distributed mode') 13 | return dict( 14 | local_rank=0, 15 | global_rank=0, 16 | world_size=1) 17 | 18 | print(f"Start inference, world_size: {world_size}, global_rank: {global_rank}, local_rank:{local_rank}") 19 | os.environ['LOCAL_RANK'] = str(local_rank) 20 | 21 | return dict( 22 | local_rank=local_rank, 23 | global_rank=global_rank, 24 | world_size=world_size) -------------------------------------------------------------------------------- /src/model/LAMM/CLIP/__init__.py: -------------------------------------------------------------------------------- 1 | # remove fp32 LN & return intermediate features 2 | from .clip import * 3 | -------------------------------------------------------------------------------- /src/model/LAMM/CLIP/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/model/LAMM/CLIP/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/__init__.py: -------------------------------------------------------------------------------- 1 | from .model_3detr import build_3detr, build_epcl_encoder 2 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/third_party/pointnet2/_ext_src/include/ball_query.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | #pragma once 4 | #include 5 | 6 | at::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float radius, 7 | const int nsample); 8 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/third_party/pointnet2/_ext_src/include/cuda_utils.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. 
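// Shared CUDA launch helpers for the pointnet2 extension kernels: opt_n_threads()
// picks the largest power-of-two block size (capped at TOTAL_THREADS, at least 1)
// for a given workload, opt_block_config() derives a 2-D block within the same
// budget, and CUDA_CHECK_ERRORS() aborts with a diagnostic if the last kernel failed.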
2 | 3 | #ifndef _CUDA_UTILS_H 4 | #define _CUDA_UTILS_H 5 | 6 | #include <ATen/ATen.h> 7 | #include <ATen/cuda/CUDAContext.h> 8 | #include <cmath> 9 | 10 | #include <cuda.h> 11 | #include <cuda_runtime.h> 12 | 13 | #include <vector> 14 | 15 | #define TOTAL_THREADS 512 16 | 17 | inline int opt_n_threads(int work_size) { 18 | const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0); 19 | 20 | return max(min(1 << pow_2, TOTAL_THREADS), 1); 21 | } 22 | 23 | inline dim3 opt_block_config(int x, int y) { 24 | const int x_threads = opt_n_threads(x); 25 | const int y_threads = 26 | max(min(opt_n_threads(y), TOTAL_THREADS / x_threads), 1); 27 | dim3 block_config(x_threads, y_threads, 1); 28 | 29 | return block_config; 30 | } 31 | 32 | #define CUDA_CHECK_ERRORS() \ 33 | do { \ 34 | cudaError_t err = cudaGetLastError(); \ 35 | if (cudaSuccess != err) { \ 36 | fprintf(stderr, "CUDA kernel failed : %s\n%s at L:%d in %s\n", \ 37 | cudaGetErrorString(err), __PRETTY_FUNCTION__, __LINE__, \ 38 | __FILE__); \ 39 | exit(-1); \ 40 | } \ 41 | } while (0) 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/third_party/pointnet2/_ext_src/include/group_points.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | 4 | #pragma once 5 | #include <torch/extension.h> 6 | 7 | at::Tensor group_points(at::Tensor points, at::Tensor idx); 8 | at::Tensor group_points_grad(at::Tensor grad_out, at::Tensor idx, const int n); 9 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/third_party/pointnet2/_ext_src/include/interpolate.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | #pragma once 4 | 5 | #include <torch/extension.h> 6 | #include <vector> 7 | 8 | std::vector<at::Tensor> three_nn(at::Tensor unknowns, at::Tensor knows); 9 | at::Tensor three_interpolate(at::Tensor points, at::Tensor idx, 10 | at::Tensor weight); 11 | at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx, 12 | at::Tensor weight, const int m); 13 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/third_party/pointnet2/_ext_src/include/sampling.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | 4 | #pragma once 5 | #include <torch/extension.h> 6 | 7 | at::Tensor gather_points(at::Tensor points, at::Tensor idx); 8 | at::Tensor gather_points_grad(at::Tensor grad_out, at::Tensor idx, const int n); 9 | at::Tensor furthest_point_sampling(at::Tensor points, const int nsamples); 10 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/third_party/pointnet2/_ext_src/include/utils.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates.
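// Argument-validation macros used by the extension sources (ball_query.cpp and friends):
// CHECK_CUDA, CHECK_CONTIGUOUS, CHECK_IS_INT and CHECK_IS_FLOAT assert that a tensor is
// on the GPU, contiguous, or of the expected scalar type, failing with a descriptive
// message otherwise.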
2 | 3 | 4 | #pragma once 5 | #include <ATen/cuda/CUDAContext.h> 6 | #include <torch/extension.h> 7 | 8 | #define CHECK_CUDA(x) \ 9 | do { \ 10 | AT_ASSERT(x.is_cuda(), #x " must be a CUDA tensor"); \ 11 | } while (0) 12 | 13 | #define CHECK_CONTIGUOUS(x) \ 14 | do { \ 15 | AT_ASSERT(x.is_contiguous(), #x " must be a contiguous tensor"); \ 16 | } while (0) 17 | 18 | #define CHECK_IS_INT(x) \ 19 | do { \ 20 | AT_ASSERT(x.scalar_type() == at::ScalarType::Int, \ 21 | #x " must be an int tensor"); \ 22 | } while (0) 23 | 24 | #define CHECK_IS_FLOAT(x) \ 25 | do { \ 26 | AT_ASSERT(x.scalar_type() == at::ScalarType::Float, \ 27 | #x " must be a float tensor"); \ 28 | } while (0) 29 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/third_party/pointnet2/_ext_src/src/ball_query.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | 4 | #include "ball_query.h" 5 | #include "utils.h" 6 | 7 | void query_ball_point_kernel_wrapper(int b, int n, int m, float radius, 8 | int nsample, const float *new_xyz, 9 | const float *xyz, int *idx); 10 | 11 | at::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float radius, 12 | const int nsample) { 13 | CHECK_CONTIGUOUS(new_xyz); 14 | CHECK_CONTIGUOUS(xyz); 15 | CHECK_IS_FLOAT(new_xyz); 16 | CHECK_IS_FLOAT(xyz); 17 | 18 | if (new_xyz.is_cuda()) { 19 | CHECK_CUDA(xyz); 20 | } 21 | 22 | at::Tensor idx = 23 | torch::zeros({new_xyz.size(0), new_xyz.size(1), nsample}, 24 | at::device(new_xyz.device()).dtype(at::ScalarType::Int)); 25 | 26 | if (new_xyz.is_cuda()) { 27 | query_ball_point_kernel_wrapper(xyz.size(0), xyz.size(1), new_xyz.size(1), 28 | radius, nsample, new_xyz.data<float>(), 29 | xyz.data<float>(), idx.data<int>()); 30 | } else { 31 | AT_ASSERT(false, "CPU not supported"); 32 | } 33 | 34 | return idx; 35 | } 36 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/third_party/pointnet2/_ext_src/src/bindings.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | 4 | #include "ball_query.h" 5 | #include "group_points.h" 6 | #include "interpolate.h" 7 | #include "sampling.h" 8 | 9 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 10 | m.def("gather_points", &gather_points); 11 | m.def("gather_points_grad", &gather_points_grad); 12 | m.def("furthest_point_sampling", &furthest_point_sampling); 13 | 14 | m.def("three_nn", &three_nn); 15 | m.def("three_interpolate", &three_interpolate); 16 | m.def("three_interpolate_grad", &three_interpolate_grad); 17 | 18 | m.def("ball_query", &ball_query); 19 | 20 | m.def("group_points", &group_points); 21 | m.def("group_points_grad", &group_points_grad); 22 | } 23 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/.ninja_deps: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/model/LAMM/EPCL/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/.ninja_deps -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/third_party/pointnet2/pointnet2_test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates.
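# Gradient check for the custom three_interpolate op: inputs are float32 on CUDA,
# hence the loose atol/rtol of 1e-1; a CUDA-capable device is required to run this.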
2 | 3 | """ Testing customized ops. """ 4 | 5 | import torch 6 | from torch.autograd import gradcheck 7 | import numpy as np 8 | 9 | import os 10 | import sys 11 | 12 | BASE_DIR = os.path.dirname(os.path.abspath(__file__)) 13 | sys.path.append(BASE_DIR) 14 | import pointnet2_utils 15 | 16 | 17 | def test_interpolation_grad(): 18 | batch_size = 1 19 | feat_dim = 2 20 | m = 4 21 | feats = torch.randn(batch_size, feat_dim, m, requires_grad=True).float().cuda() 22 | 23 | def interpolate_func(inputs): 24 | idx = torch.from_numpy(np.array([[[0, 1, 2], [1, 2, 3]]])).int().cuda() 25 | weight = torch.from_numpy(np.array([[[1, 1, 1], [2, 2, 2]]])).float().cuda() 26 | interpolated_feats = pointnet2_utils.three_interpolate(inputs, idx, weight) 27 | return interpolated_feats 28 | 29 | assert gradcheck(interpolate_func, feats, atol=1e-1, rtol=1e-1) 30 | 31 | 32 | if __name__ == "__main__": 33 | test_interpolation_grad() 34 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/third_party/pointnet2/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | from setuptools import setup 7 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 8 | import glob 9 | import os.path as osp 10 | 11 | this_dir = osp.dirname(osp.abspath(__file__)) 12 | 13 | _ext_src_root = "_ext_src" 14 | _ext_sources = glob.glob("{}/src/*.cpp".format(_ext_src_root)) + glob.glob( 15 | "{}/src/*.cu".format(_ext_src_root) 16 | ) 17 | _ext_headers = glob.glob("{}/include/*".format(_ext_src_root)) 18 | 19 | setup( 20 | name="pointnet2", 21 | ext_modules=[ 22 | CUDAExtension( 23 | name="pointnet2._ext", 24 | sources=_ext_sources, 25 | extra_compile_args={ 26 | "cxx": ["-O2", "-I{}".format("{}/include".format(_ext_src_root))], 27 | "nvcc": ["-O2", "-I{}".format("{}/include".format(_ext_src_root))], 28 | }, 29 | include_dirs=[osp.join(this_dir, _ext_src_root, "include")], 30 | ) 31 | ], 32 | cmdclass={"build_ext": BuildExtension}, 33 | ) 34 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/model/LAMM/EPCL/utils/__init__.py -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/utils/cython_compile.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | 3 | from setuptools import setup, Extension 4 | from Cython.Build import cythonize 5 | import numpy as np 6 | 7 | 8 | # hacky way to find numpy include path 9 | # replace with actual path if this does not work 10 | np_include_path = np.__file__.replace("__init__.py", "core/include/") 11 | INCLUDE_PATH = [np_include_path] 12 | 13 | setup( 14 | ext_modules=cythonize( 15 | Extension( 16 | "box_intersection", 17 | sources=["box_intersection.pyx"], 18 | include_dirs=INCLUDE_PATH, 19 | ) 20 | ), 21 | ) 22 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/utils/cython_compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | python cython_compile.py build_ext --inplace 4 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/utils/download_weights.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | 4 | import os 5 | from urllib import request 6 | import torch 7 | import pickle 8 | 9 | ## Define the weights you want and where to store them 10 | dataset = "scannet" 11 | encoder = "_masked" # or "" 12 | epoch = 1080 13 | base_url = "https://dl.fbaipublicfiles.com/3detr/checkpoints" 14 | local_dir = "/tmp/" 15 | 16 | ### Downloading the weights 17 | weights_file = f"{dataset}{encoder}_ep{epoch}.pth" 18 | metrics_file = f"{dataset}{encoder}_ep{epoch}_metrics.pkl" 19 | local_weights = os.path.join(local_dir, weights_file) 20 | local_metrics = os.path.join(local_dir, metrics_file) 21 | 22 | url = os.path.join(base_url, weights_file) 23 | request.urlretrieve(url, local_weights) 24 | print(f"Downloaded weights from {url} to {local_weights}") 25 | 26 | url = os.path.join(base_url, metrics_file) 27 | request.urlretrieve(url, local_metrics) 28 | print(f"Downloaded metrics from {url} to {local_metrics}") 29 | 30 | # weights can be simply loaded with pytorch 31 | weights = torch.load(local_weights, map_location=torch.device("cpu")) 32 | print("Weights loaded successfully.") 33 | 34 | # metrics can be loaded with pickle 35 | with open(local_metrics, "rb") as fh: 36 | metrics = pickle.load(fh) 37 | print("Metrics loaded successfully.") 38 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/utils/logger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | import torch 4 | 5 | try: 6 | from tensorboardX import SummaryWriter 7 | except ImportError: 8 | print("Cannot import tensorboard. 
Will log to txt files only.") 9 | SummaryWriter = None 10 | 11 | from utils.dist import is_primary 12 | 13 | 14 | class Logger(object): 15 | def __init__(self, log_dir=None) -> None: 16 | self.log_dir = log_dir 17 | if SummaryWriter is not None and is_primary(): 18 | self.writer = SummaryWriter(self.log_dir) 19 | else: 20 | self.writer = None 21 | 22 | def log_scalars(self, scalar_dict, step, prefix=None): 23 | if self.writer is None: 24 | return 25 | for k in scalar_dict: 26 | v = scalar_dict[k] 27 | if isinstance(v, torch.Tensor): 28 | v = v.detach().cpu().item() 29 | if prefix is not None: 30 | k = prefix + k 31 | self.writer.add_scalar(k, v, step) 32 | -------------------------------------------------------------------------------- /src/model/LAMM/__init__.py: -------------------------------------------------------------------------------- 1 | from .openlamm import LAMMPEFTModel 2 | from .openlamm import LAMMSFTModel -------------------------------------------------------------------------------- /src/model/LAMM/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .pcl_utils import * 2 | -------------------------------------------------------------------------------- /src/model/Octavius/__init__.py: -------------------------------------------------------------------------------- 1 | from .octavius import Octavius -------------------------------------------------------------------------------- /src/model/Octavius/moe/__init__.py: -------------------------------------------------------------------------------- 1 | import enum 2 | import peft 3 | from peft import PEFT_TYPE_TO_CONFIG_MAPPING 4 | from peft.peft_model import PEFT_TYPE_TO_MODEL_MAPPING 5 | 6 | 7 | # register MoE LoRA 8 | class PeftType(str, enum.Enum): 9 | PROMPT_TUNING = "PROMPT_TUNING" 10 | P_TUNING = "P_TUNING" 11 | PREFIX_TUNING = "PREFIX_TUNING" 12 | LORA = "LORA" 13 | ADALORA = "ADALORA" 14 | ADAPTION_PROMPT = "ADAPTION_PROMPT" 15 | IA3 = "IA3" 16 | MOE_LORA = 'MOE_LORA' 17 | 18 | peft.PeftType = PeftType 19 | 20 | from .moe_lora import MoeLoraConfig, MoeLoraModel 21 | PEFT_TYPE_TO_CONFIG_MAPPING[peft.PeftType.MOE_LORA] = MoeLoraConfig 22 | PEFT_TYPE_TO_MODEL_MAPPING[peft.PeftType.MOE_LORA] = MoeLoraModel 23 | 24 | 25 | __all__ = [ 26 | 'MoeLoraConfig', 27 | ] 28 | -------------------------------------------------------------------------------- /src/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .training_agent import DeepSpeedAgent 2 | 3 | from .LAMM import LAMMPEFTModel 4 | from .LAMM import LAMMSFTModel 5 | from .Octavius import Octavius 6 | 7 | 8 | def load_model(args): 9 | agent_name = args["models"][args["model"]]["agent_name"] 10 | model_name = args["models"][args["model"]]["model_name"] 11 | model = globals()[model_name](**args) 12 | agent = globals()[agent_name](model, args) 13 | return agent 14 | -------------------------------------------------------------------------------- /src/model/llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /src/model/llava/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 
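# The constants below are LLaVA's special multimodal token strings and sentinel
# indices used when splicing image features into a text prompt.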
5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "<image>" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 11 | DEFAULT_IM_START_TOKEN = "<im_start>" 12 | DEFAULT_IM_END_TOKEN = "<im_end>" 13 | IMAGE_PLACEHOLDER = "<image-placeholder>" 14 | -------------------------------------------------------------------------------- /src/model/llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 2 | from .language_model.llava_mpt import LlavaMPTForCausalLM, LlavaMPTConfig 3 | -------------------------------------------------------------------------------- /src/model/llava/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava.model import * 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /src/model/llava/model/language_model/mpt/custom_embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch import Tensor 5 | 6 | class SharedEmbedding(nn.Embedding): 7 | 8 | def forward(self, input: Tensor, unembed: bool=False) -> Tensor: 9 | if unembed: 10 | return F.linear(input, self.weight) 11 | return super().forward(input) -------------------------------------------------------------------------------- /src/model/llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion"): 9 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 10 | 11 | raise ValueError(f'Unknown vision tower: {vision_tower}') 12 | -------------------------------------------------------------------------------- /src/model/llava/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 |
@property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- /src/model/llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /src/model/llava/train/train_mem.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: 2 | # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: 3 | # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn. 4 | 5 | # Need to call this before importing transformers. 6 | from llava.train.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn 7 | 8 | replace_llama_attn_with_flash_attn() 9 | 10 | from llava.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /src/model/llava/train/train_xformers.py: -------------------------------------------------------------------------------- 1 | # Make it more memory efficient by monkey patching the LLaMA model with xformers attention. 2 | 3 | # Need to call this before importing transformers. 
4 | from llava.train.llama_xformers_attn_monkey_patch import ( 5 | replace_llama_attn_with_xformers_attn, 6 | ) 7 | 8 | replace_llama_attn_with_xformers_attn() 9 | 10 | from llava.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /src/slurm_eval.sh: -------------------------------------------------------------------------------- 1 | START_TIME=`date +%Y%m%d-%H:%M:%S` 2 | 3 | parition=$1 4 | GPUS=$2 5 | model_cfg=$3 6 | recipe_cfg=$4 7 | EXTRA_ARGS=${@:5} 8 | GPUS_PER_NODE=$(($GPUS<8?$GPUS:8)) 9 | 10 | LOG_FILE=../logs/evaluation_${START_TIME}.log 11 | 12 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 13 | srun -p ${parition} -J ChEF_eval --gres=gpu:${GPUS_PER_NODE} --ntasks=${GPUS} \ 14 | --ntasks-per-node=${GPUS_PER_NODE} --kill-on-bad-exit \ 15 | python eval.py \ 16 | --time ${START_TIME} \ 17 | --model_cfg=${model_cfg} \ 18 | --recipe_cfg=${recipe_cfg} \ 19 | ${EXTRA_ARGS} \ 20 | 2>&1 | tee -a $LOG_FILE > /dev/null & 21 | 22 | sleep 0.5s; 23 | tail -f ${LOG_FILE} -------------------------------------------------------------------------------- /src/slurm_eval_icl.sh: -------------------------------------------------------------------------------- 1 | START_TIME=`date +%Y%m%d-%H:%M:%S` 2 | 3 | parition=$1 4 | GPUS=$2 5 | model_cfg=$3 6 | dataset_name=$4 7 | EXTRA_ARGS=${@:5} 8 | GPUS_PER_NODE=$(($GPUS<8?$GPUS:8)) 9 | 10 | LOG_FILE=../logs/evaluation_${START_TIME}.log 11 | 12 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 13 | srun -p ${parition} -J ChEF_eval --gres=gpu:${GPUS_PER_NODE} --ntasks=${GPUS} \ 14 | --ntasks-per-node=${GPUS_PER_NODE} --kill-on-bad-exit \ 15 | python tools/ChEF/eval_icl.py \ 16 | --time ${START_TIME} \ 17 | --model_cfg=${model_cfg} \ 18 | --recipe_cfg=config/ChEF/desiderata_recipes/ICL/${dataset_name}.yaml \ 19 | ${EXTRA_ARGS} \ 20 | 2>&1 | tee -a $LOG_FILE > /dev/null & 21 | 22 | sleep 0.5s; 23 | tail -f ${LOG_FILE} -------------------------------------------------------------------------------- /src/tools/ChEF/eval_calibration.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | import datetime 4 | import sys 5 | script_path = os.path.abspath(__file__) 6 | script_dir = os.path.dirname(script_path) 7 | chef_dir = os.path.join(script_dir, '../../../src') 8 | sys.path.append(chef_dir) 9 | 10 | from ChEF.evaluator import Evaluator, load_config, sample_dataset 11 | from ChEF.models import get_model 12 | from ChEF.scenario import dataset_dict 13 | 14 | def main(): 15 | model_cfg, recipe_cfg, save_dir, sample_len = load_config() 16 | # model 17 | model = get_model(model_cfg) 18 | 19 | # dataset 20 | scenario_cfg = recipe_cfg['scenario_cfg'] 21 | dataset_name = scenario_cfg['dataset_name'] 22 | dataset = dataset_dict[dataset_name](**scenario_cfg) 23 | # sample dataset 24 | dataset = sample_dataset(dataset, sample_len=sample_len, sample_seed=0) 25 | 26 | # save_cfg 27 | time = datetime.datetime.now().strftime("%Y%m%d%H%M%S") 28 | save_base_dir = os.path.join(save_dir, model_cfg['model_name'], 'Calibration', dataset_name, time) 29 | os.makedirs(save_base_dir, exist_ok=True) 30 | with open(os.path.join(save_base_dir, 'config.yaml'), 'w', encoding='utf-8') as f: 31 | yaml.dump(data=dict(model_cfg=model_cfg, recipe_cfg=recipe_cfg), stream=f, allow_unicode=True) 32 | print(f'Save results in {save_base_dir}!') 33 | 34 | # evaluate 35 | eval_cfg = recipe_cfg['eval_cfg'] 36 | evaluater = Evaluator(dataset, 
save_base_dir, eval_cfg) 37 | evaluater.evaluate(model) 38 | 39 | if __name__ == '__main__': 40 | main() -------------------------------------------------------------------------------- /src/tools/ChEF/eval_hallucination.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | import datetime 4 | import sys 5 | script_path = os.path.abspath(__file__) 6 | script_dir = os.path.dirname(script_path) 7 | chef_dir = os.path.join(script_dir, '../../../src') 8 | sys.path.append(chef_dir) 9 | 10 | from ChEF.evaluator import Evaluator, load_config, sample_dataset 11 | from ChEF.models import get_model 12 | from ChEF.scenario import dataset_dict 13 | 14 | def main(): 15 | model_cfg, recipe_cfg, save_dir, sample_len = load_config() 16 | 17 | # model 18 | model = get_model(model_cfg) 19 | 20 | # dataset 21 | scenario_cfg = recipe_cfg['scenario_cfg'] 22 | 23 | settings = ['POPE_COCO_random','POPE_COCO_popular','POPE_COCO_adversarial'] 24 | for setting in settings: 25 | scenario_cfg['dataset_name'] = setting 26 | dataset_name = scenario_cfg['dataset_name'] 27 | dataset = dataset_dict[dataset_name](**scenario_cfg) 28 | dataset = sample_dataset(dataset, sample_len=sample_len, sample_seed=0) 29 | # save_cfg 30 | time = datetime.datetime.now().strftime("%Y%m%d%H%M%S") 31 | save_base_dir = os.path.join(save_dir, model_cfg['model_name'], 'Hallucination',dataset_name, time) 32 | os.makedirs(save_base_dir, exist_ok=True) 33 | with open(os.path.join(save_base_dir, 'config.yaml'), 'w', encoding='utf-8') as f: 34 | yaml.dump(data=dict(model_cfg=model_cfg, recipe_cfg=recipe_cfg), stream=f, allow_unicode=True) 35 | print(f'Save results in {save_base_dir}!') 36 | 37 | # evaluate 38 | eval_cfg = recipe_cfg['eval_cfg'] 39 | evaluater = Evaluator(dataset, save_base_dir, eval_cfg) 40 | evaluater.evaluate(model) 41 | 42 | 43 | if __name__ == '__main__': 44 | main() -------------------------------------------------------------------------------- /src/tools/LAMM/eval_lamm2d.sh: -------------------------------------------------------------------------------- 1 | yaml_dict=(ScienceQA FSC147 VOC2012 SVT Flickr30k UCMerced CelebA_hair CelebA_smile CIFAR10 AI2D locating_LSP locating_VOC2012) 2 | 3 | for dataset in ${yaml_dict[*]}; do 4 | 5 | python eval.py --model_cfg config/ChEF/models/lamm.yaml --recipe_cfg config/ChEF/scenario_recipes/LAMM/${dataset}.yaml 6 | 7 | done -------------------------------------------------------------------------------- /src/tools/LAMM/eval_lamm3d.sh: -------------------------------------------------------------------------------- 1 | yaml_dict=(ScanNet ScanRefer ScanQA) 2 | 3 | for dataset in ${yaml_dict[*]}; do 4 | 5 | python eval.py --model_cfg config/ChEF/models/lamm_3d.yaml --recipe_cfg config/ChEF/scenario_recipes/LAMM/${dataset}.yaml 6 | 7 | done -------------------------------------------------------------------------------- /src/tools/LAMM/train_lamm2d.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | numgpu=4 3 | 4 | exp=$1 5 | visfeat_type=local 6 | now=$(date +"%Y%m%d_%H%M%S") 7 | 8 | ckpt_dir=../ckpt 9 | mkdir -p ${ckpt_dir}/${exp}/log_rest/ 10 | 11 | deepspeed --include localhost:0,1,2,3 --master_addr 127.0.0.1 --master_port 28457 train.py \ 12 | --stage 1 \ 13 | --cfg ./config/LAMM/train.yaml \ 14 | --data_path ../data/LAMM/2D_Instruct/meta_file/LAMM_instruct_186k.json \ 15 | --vision_root_path ../data/LAMM/2D_Instruct/ \ 16 | --conv_template default \ 17 | 
--max_tgt_len 400 \ 18 | --vision_type image \ 19 | --use_system \ 20 | --model lamm_peft \ 21 | --encoder_pretrain clip \ 22 | --llm_ckpt_path ../model_zoo/vicuna_ckpt/13b_v0/ \ 23 | --vision_feature_type ${visfeat_type} \ 24 | --num_vision_token 256 \ 25 | --save_path ${ckpt_dir}/${exp} \ 26 | --log_path ${ckpt_dir}/${exp}/log_rest/ \ 27 | 2>&1 | tee ${ckpt_dir}/${exp}/log_rest/train_${now}.log 28 | -------------------------------------------------------------------------------- /src/tools/LAMM/train_lamm2d_sft_stg1_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | numgpu=8 3 | 4 | partition=$1 5 | exp=$2 6 | visfeat_type=local 7 | 8 | now=$(date +"%Y%m%d_%H%M%S") 9 | ckpt_dir=../ckpt 10 | mkdir -p ${ckpt_dir}/${exp}/log_rest/ 11 | 12 | srun -p ${partition} -J ${exp} --gres=gpu:${numgpu} --ntasks-per-node 1 --kill-on-bad-exit \ 13 | torchrun --nnodes=1 --nproc_per_node=${numgpu} --master_port=25440 train.py \ 14 | --stage 1 \ 15 | --cfg ./config/LAMM/train_sft.yaml \ 16 | --data_path ../data/LAMM/2D_pretrain/meta_file/blip_laion_cc_sbu_558k.json \ 17 | --vision_root_path ../data/LAMM/2D_pretrain/images \ 18 | --conv_template default \ 19 | --max_tgt_len 2048 \ 20 | --vision_type image \ 21 | --use_system \ 22 | --model lamm_sft \ 23 | --encoder_pretrain clip \ 24 | --gradient_checkpointing \ 25 | --llm_ckpt_path ../model_zoo/vicuna_ckpt/13b_v0/ \ 26 | --vision_feature_type ${visfeat_type} \ 27 | --num_vision_token 256 \ 28 | --save_path ${ckpt_dir}/${exp} \ 29 | --log_path ${ckpt_dir}/${exp}/log_rest/ \ 30 | 2>&1 | tee ${ckpt_dir}/${exp}/log_rest/train_${now}.log 31 | 32 | -------------------------------------------------------------------------------- /src/tools/LAMM/train_lamm2d_sft_stg2_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | numgpu=8 3 | 4 | partition=$1 5 | exp=$2 6 | visfeat_type=local 7 | 8 | now=$(date +"%Y%m%d_%H%M%S") 9 | ckpt_dir=../ckpt 10 | mkdir -p ${ckpt_dir}/${exp}/log_rest/ 11 | 12 | srun -p ${partition} -J ${exp} -x "SH-IDC1-10-140-1-164" --gres=gpu:${numgpu} --ntasks-per-node 1 --kill-on-bad-exit \ 13 | torchrun --nnodes=1 --nproc_per_node=${numgpu} --master_port=25440 train.py \ 14 | --stage 2 \ 15 | --cfg ./config/LAMM/train_sft.yaml \ 16 | --data_path ../data/LAMM/2D_Finetune/meta_file/sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.json \ 17 | --vision_root_path ../data/LAMM/2D_Finetune/images \ 18 | --llm_proj_path ../ckpt/LAMM15_stage1/pytorch_model.pt \ 19 | --conv_template default \ 20 | --max_tgt_len 2048 \ 21 | --vision_type image \ 22 | --use_system \ 23 | --model lamm_sft \ 24 | --encoder_pretrain clip \ 25 | --gradient_checkpointing \ 26 | --llm_ckpt_path ../model_zoo/vicuna_ckpt/13b_v0/ \ 27 | --vision_feature_type ${visfeat_type} \ 28 | --num_vision_token 256 \ 29 | --save_path ${ckpt_dir}/${exp} \ 30 | --log_path ${ckpt_dir}/${exp}/log_rest/ \ 31 | 2>&1 | tee ${ckpt_dir}/${exp}/log_rest/train_${now}.log 32 | 33 | -------------------------------------------------------------------------------- /src/tools/LAMM/train_lamm2d_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | numgpu=4 3 | 4 | partition=$1 5 | exp=$2 6 | visfeat_type=local 7 | 8 | now=$(date +"%Y%m%d_%H%M%S") 9 | ckpt_dir=../ckpt 10 | mkdir -p ${ckpt_dir}/${exp}/log_rest/ 11 | 12 | srun -p ${partition} -J ${exp} --gres=gpu:${numgpu} --ntasks-per-node 1 --kill-on-bad-exit \ 13 | torchrun --nnodes=1 
--nproc_per_node=${numgpu} --master_port=25440 train.py \ 14 | --stage 1 \ 15 | --cfg ./config/LAMM/train.yaml \ 16 | --data_path ../data/LAMM/2D_Instruct/meta_file/LAMM_instruct_186k.json \ 17 | --vision_root_path ../data/LAMM/2D_Instruct/ \ 18 | --conv_template default \ 19 | --max_tgt_len 400 \ 20 | --vision_type image \ 21 | --use_system \ 22 | --model lamm_peft \ 23 | --encoder_pretrain clip \ 24 | --llm_ckpt_path ../model_zoo/vicuna_ckpt/13b_v0/ \ 25 | --vision_feature_type ${visfeat_type} \ 26 | --num_vision_token 256 \ 27 | --save_path ${ckpt_dir}/${exp} \ 28 | --log_path ${ckpt_dir}/${exp}/log_rest/ \ 29 | 2>&1 | tee ${ckpt_dir}/${exp}/log_rest/train_${now}.log 30 | 31 | -------------------------------------------------------------------------------- /src/tools/LAMM/train_lamm3d.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | numgpu=4 3 | 4 | exp=$1 5 | visfeat_type=local 6 | now=$(date +"%Y%m%d_%H%M%S") 7 | 8 | ckpt_dir=../ckpt 9 | mkdir -p ${ckpt_dir}/${exp}/log_rest/ 10 | deepspeed --include localhost:0,1,2,3 --master_addr 127.0.0.1 --master_port 28457 train.py \ 11 | --stage 1 \ 12 | --cfg ./config/train.yaml \ 13 | --data_path ../data/LAMM/3D_Instruct/meta_file/LAMM_3dinstruct_10k.json \ 14 | --vision_root_path ../data/LAMM/3D_Instruct/ \ 15 | --max_tgt_len 400 \ 16 | --vision_type pcl \ 17 | --use_system \ 18 | --model lamm_peft \ 19 | --encoder_pretrain epcl \ 20 | --encoder_ckpt_path ../model_zoo/epcl_ckpt/epcl_scannet_vit-L-14_256tokens_latest.pth \ 21 | --llm_ckpt_path ../model_zoo/vicuna_ckpt/13b_v0/ \ 22 | --vision_feature_type ${visfeat_type} \ 23 | --num_vision_token 256 \ 24 | --save_path ${ckpt_dir}/${exp} \ 25 | --log_path ${ckpt_dir}/${exp}/log_rest/ \ 26 | 2>&1 | tee ${ckpt_dir}/${exp}/log_rest/train_${now}.log 27 | -------------------------------------------------------------------------------- /src/tools/LAMM/train_lamm3d_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | numgpu=4 3 | 4 | partition=$1 5 | exp=$2 6 | visfeat_type=local 7 | now=$(date +"%Y%m%d_%H%M%S") 8 | 9 | ckpt_dir=../ckpt 10 | mkdir -p ${ckpt_dir}/${exp}/log_rest/ 11 | 12 | srun -p ${partition} -J ${exp} --gres=gpu:${numgpu} --ntasks-per-node 1 --kill-on-bad-exit \ 13 | torchrun --nnodes=1 --nproc_per_node=${numgpu} --master_port=25441 train.py \ 14 | --stage 1 \ 15 | --cfg ./config/train.yaml \ 16 | --data_path ../data/LAMM/3D_Instruct/meta_file/LAMM_3dinstruct_10k.json \ 17 | --vision_root_path ../data/LAMM/3D_Instruct/ \ 18 | --max_tgt_len 400 \ 19 | --vision_type pcl \ 20 | --use_system \ 21 | --model lamm_peft \ 22 | --encoder_pretrain epcl \ 23 | --encoder_ckpt_path ../model_zoo/epcl_ckpt/epcl_scannet_vit-L-14_256tokens_latest.pth \ 24 | --llm_ckpt_path ../model_zoo/vicuna_ckpt/13b_v0/ \ 25 | --vision_feature_type ${visfeat_type} \ 26 | --num_vision_token 256 \ 27 | --save_path ${ckpt_dir}/${exp} \ 28 | --log_path ${ckpt_dir}/${exp}/log_rest/ \ 29 | 2>&1 | tee ${ckpt_dir}/${exp}/log_rest/train_${now}.log 30 | -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/tools/Octavius/ULIP/.DS_Store -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/data/.DS_Store: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/tools/Octavius/ULIP/data/.DS_Store -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/data/ScanRefer.yaml: -------------------------------------------------------------------------------- 1 | NAME: ScanRefer 2 | DATA_PATH: data/scanrefer 3 | PC_PATH: data/scanrefer/point_data 4 | IMAGE_PATH: data/scanrefer/image_feature 5 | TEXT_PATH: data/scanrefer/text_feature 6 | scannet_object_clip_root: data/scanrefer/image_memory_bank 7 | scannet_text_clip_root: data/scanrefer/text_memory_bank -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/data/ScanReferValid.yaml: -------------------------------------------------------------------------------- 1 | NAME: ScanReferValid 2 | DATA_PATH: data/scanrefer 3 | NUM_CATEGORY: 261 4 | USE_NORMALS: FALSE -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/data/dataset_catalog.json: -------------------------------------------------------------------------------- 1 | { 2 | "scanrefer": { 3 | "config": "./data/ScanRefer.yaml", 4 | "train": "train", 5 | "test": "test", 6 | "usage": "train" 7 | }, 8 | "scanrefer_valid": { 9 | "config": "./data/ScanReferValid.yaml", 10 | "train": "train", 11 | "test": "test", 12 | "usage": "test" 13 | } 14 | } -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/models/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/tools/Octavius/ULIP/models/.DS_Store -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/models/pointbert/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/tools/Octavius/ULIP/models/pointbert/.DS_Store -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/models/pointbert/PointTransformer_8192point.yaml: -------------------------------------------------------------------------------- 1 | optimizer : { 2 | type: AdamW, 3 | kwargs: { 4 | lr : 0.0005, 5 | weight_decay : 0.05 6 | }} 7 | 8 | scheduler: { 9 | type: CosLR, 10 | kwargs: { 11 | epochs: 300, 12 | initial_epochs : 10 13 | }} 14 | 15 | model : { 16 | NAME: PointTransformer, 17 | trans_dim: 384, 18 | depth: 12, 19 | drop_path_rate: 0.1, 20 | cls_dim: 40, 21 | num_heads: 6, 22 | group_size: 32, 23 | num_group: 512, # 512 24 | encoder_dims: 256, 25 | } 26 | npoints: 8192 27 | total_bs : 32 28 | step_per_update : 1 29 | max_epoch : 300 30 | grad_norm_clip : 10 31 | 32 | consider_metric: CDL1 -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/models/pointnet2/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/tools/Octavius/ULIP/models/pointnet2/.DS_Store -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/scripts/pretrain_pointbert.sh: 
-------------------------------------------------------------------------------- 1 | current_time=`date "+%Y_%m_%d_%H_%M"` 2 | 3 | CUDA_VISIBLE_DEVICES=0,1 \ 4 | python -m torch.distributed.launch \ 5 | --nproc_per_node=2 \ 6 | --master_port 61234 \ 7 | main.py \ 8 | --model ULIP_PointBERT \ 9 | --npoints 2048 \ 10 | --lr 1e-4 \ 11 | --epochs 40 \ 12 | --batch_size 16 \ 13 | --lr_end 1e-5 \ 14 | --output_dir ./outputs/pointbert_2kpts_xyz_$current_time \ 15 | # --use_scanrefer \ -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/utils/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/tools/Octavius/ULIP/utils/.DS_Store -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/utils/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | * Copyright (c) 2023, salesforce.com, inc. 3 | * All rights reserved. 4 | * SPDX-License-Identifier: BSD-3-Clause 5 | * For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | * By Le Xue 7 | ''' 8 | -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/utils/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/tools/Octavius/ULIP/utils/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/utils/build.py: -------------------------------------------------------------------------------- 1 | from utils import registry 2 | 3 | 4 | DATASETS = registry.Registry('dataset') 5 | 6 | 7 | def build_dataset_from_cfg(cfg, default_args = None): 8 | """ 9 | Build a dataset, defined by `dataset_name`. 10 | Args: 11 | cfg (eDICT): 12 | Returns: 13 | Dataset: a constructed dataset specified by dataset_name. 14 | """ 15 | return DATASETS.build(cfg, default_args = default_args) 16 | 17 | 18 | -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/utils/io.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import numpy as np 3 | import open3d 4 | import os 5 | 6 | class IO: 7 | @classmethod 8 | def get(cls, file_path): 9 | _, file_extension = os.path.splitext(file_path) 10 | 11 | if file_extension in ['.npy']: 12 | return cls._read_npy(file_path) 13 | elif file_extension in ['.pcd']: 14 | return cls._read_pcd(file_path) 15 | elif file_extension in ['.h5']: 16 | return cls._read_h5(file_path) 17 | elif file_extension in ['.txt']: 18 | return cls._read_txt(file_path) 19 | else: 20 | raise Exception('Unsupported file extension: %s' % file_extension) 21 | 22 | # References: https://github.com/numpy/numpy/blob/master/numpy/lib/format.py 23 | @classmethod 24 | def _read_npy(cls, file_path): 25 | return np.load(file_path) 26 | 27 | # References: https://github.com/dimatura/pypcd/blob/master/pypcd/pypcd.py#L275 28 | # Support PCD files without compression ONLY! 
29 | @classmethod 30 | def _read_pcd(cls, file_path): 31 | pc = open3d.io.read_point_cloud(file_path) 32 | ptcloud = np.array(pc.points) 33 | return ptcloud 34 | 35 | @classmethod 36 | def _read_txt(cls, file_path): 37 | return np.loadtxt(file_path) 38 | 39 | @classmethod 40 | def _read_h5(cls, file_path): 41 | with h5py.File(file_path, 'r') as f: 42 | return f['data'][()] -------------------------------------------------------------------------------- /src/tools/Octavius/octavius_ChEF.sh: -------------------------------------------------------------------------------- 1 | # choose one model_cfg from 2d, 3d, 2d+3d 2 | model_cfg=config/ChEF/models/octavius_2d.yaml 3 | recipe_cfg_list=(CIFAR10 Flickr30k CelebA_hair CelebA_smile VOC2012 ScienceQA) 4 | partition=$1  # Slurm partition, passed as the first argument 5 | for dataset in ${recipe_cfg_list[*]}; do 6 | srun -p ${partition} --gres=gpu:1 --ntasks-per-node=1 --kill-on-bad-exit \ 7 | python eval.py \ 8 | --model_cfg ${model_cfg} \ 9 | --recipe_cfg config/ChEF/scenario_recipes/LAMM/${dataset}.yaml 10 | done -------------------------------------------------------------------------------- /src/tools/Octavius/train_octavius_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | partition=$1 4 | numgpu=$2 5 | config=$3 6 | exp=$4 7 | visfeat_type=local 8 | 9 | now=$(date +"%Y%m%d_%H%M%S") 10 | ckpt_dir=../ckpt 11 | mkdir -p ${ckpt_dir}/${exp}/log_rest/ 12 | 13 | srun -p ${partition} -J ${exp} --gres=gpu:${numgpu} --ntasks-per-node 1 --kill-on-bad-exit \ 14 | torchrun --nnodes=1 --nproc_per_node=${numgpu} --master_port=25440 train.py \ 15 | --stage 1 \ 16 | --cfg ${config} \ 17 | --conv_template default \ 18 | --max_tgt_len 400 \ 19 | --use_system \ 20 | --model octavius \ 21 | --encoder_pretrain clip \ 22 | --llm_ckpt_path ../model_zoo/vicuna_ckpt/13b_v0/ \ 23 | --vision_feature_type ${visfeat_type} \ 24 | --num_vision_token 256 \ 25 | --save_path ${ckpt_dir}/${exp} \ 26 | --log_path ${ckpt_dir}/${exp}/log_rest/ \ 27 | 2>&1 | tee ${ckpt_dir}/${exp}/log_rest/train_${now}.log 28 | --------------------------------------------------------------------------------
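The Octavius launchers above take all cluster settings as positional arguments and use relative paths (../ckpt, ../model_zoo, ./config/...), which suggests they are invoked from src/. A minimal launch sketch under that assumption follows; the partition name (my_partition), GPU count (8), experiment tag (octavius_exp), and the Octavius training config path are hypothetical placeholders to substitute with your own values, not paths defined in this listing:

# run from the repository's src/ directory so the scripts' relative paths resolve
cd src
# stage-1 Octavius training: <partition> <num_gpus> <config> <exp_name>; config path is a placeholder
bash tools/Octavius/train_octavius_slurm.sh my_partition 8 config/Octavius/octavius_2d.yaml octavius_exp
# ChEF evaluation over the recipes listed in octavius_ChEF.sh, on the same (placeholder) partition
bash tools/Octavius/octavius_ChEF.sh my_partition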