├── .gitignore ├── README.md ├── ckpt └── .gitkeep ├── data └── .gitkeep ├── docs ├── ch3ef.md ├── chef.md ├── lamm.md └── octavius.md ├── images ├── Ch3Ef_intro.png ├── ChEF-benchmark.png ├── ChEF-logo.png ├── LAMM-Dataset.png ├── LAMM-Framework.png ├── LAMM-benchmark.png ├── LAMM_2d_demo.png ├── LAMM_Imagewall.png ├── Octavius_arch.png ├── ch3ef-logo.png ├── lamm-title.png └── lamm-video.png ├── requirements ├── ChEF.txt ├── default.txt └── optional.txt └── src ├── ChEF ├── __init__.py ├── data_process │ ├── Omnibenchmark.py │ └── mmbench.py ├── evaluator.py ├── inferencer │ ├── Multiturn.py │ ├── Singleturn.py │ ├── __init__.py │ └── utils.py ├── instruction │ ├── __init__.py │ ├── ice_retriever │ │ ├── __init__.py │ │ ├── base_retriever.py │ │ ├── fixed_retriever.py │ │ ├── random_retriever.py │ │ ├── topk_retriever.py │ │ ├── topk_retriever_img.py │ │ └── utils.py │ ├── prompt.py │ └── template.py ├── metric │ ├── Ch3Ef.py │ ├── __init__.py │ ├── caption.py │ ├── classification.py │ ├── counting.py │ ├── desiderata.py │ ├── detection.py │ ├── lamm_locating.py │ ├── mmmu.py │ ├── mmmu_utils.py │ ├── ocr.py │ ├── utils.py │ ├── vqa.py │ └── winoground.py ├── models │ ├── __init__.py │ ├── instruct_blip │ │ ├── __init__.py │ │ ├── common │ │ │ ├── config.py │ │ │ ├── dist_utils.py │ │ │ ├── gradcam.py │ │ │ ├── logger.py │ │ │ ├── optims.py │ │ │ ├── registry.py │ │ │ ├── utils.py │ │ │ └── vqa_tools │ │ │ │ ├── __init__.py │ │ │ │ ├── vqa.py │ │ │ │ └── vqa_eval.py │ │ ├── configs │ │ │ ├── default.yaml │ │ │ └── models │ │ │ │ ├── albef_classification_ve.yaml │ │ │ │ ├── albef_feature_extractor.yaml │ │ │ │ ├── albef_nlvr.yaml │ │ │ │ ├── albef_pretrain_base.yaml │ │ │ │ ├── albef_retrieval_coco.yaml │ │ │ │ ├── albef_retrieval_flickr.yaml │ │ │ │ ├── albef_vqav2.yaml │ │ │ │ ├── alpro_qa_msrvtt.yaml │ │ │ │ ├── alpro_qa_msvd.yaml │ │ │ │ ├── alpro_retrieval_didemo.yaml │ │ │ │ ├── alpro_retrieval_msrvtt.yaml │ │ │ │ ├── bert_config.json │ │ │ │ ├── bert_config_alpro.json │ │ │ │ ├── blip2 │ │ │ │ ├── blip2_caption_flant5xl.yaml │ │ │ │ ├── blip2_caption_opt2.7b.yaml │ │ │ │ ├── blip2_caption_opt6.7b.yaml │ │ │ │ ├── blip2_coco.yaml │ │ │ │ ├── blip2_instruct_flant5xl.yaml │ │ │ │ ├── blip2_instruct_flant5xxl.yaml │ │ │ │ ├── blip2_instruct_vicuna13b.yaml │ │ │ │ ├── blip2_instruct_vicuna7b.yaml │ │ │ │ ├── blip2_pretrain.yaml │ │ │ │ ├── blip2_pretrain_flant5xl.yaml │ │ │ │ ├── blip2_pretrain_flant5xl_vitL.yaml │ │ │ │ ├── blip2_pretrain_flant5xxl.yaml │ │ │ │ ├── blip2_pretrain_llama7b.yaml │ │ │ │ ├── blip2_pretrain_opt2.7b.yaml │ │ │ │ ├── blip2_pretrain_opt6.7b.yaml │ │ │ │ └── blip2_pretrain_vitL.yaml │ │ │ │ ├── blip_caption_base_coco.yaml │ │ │ │ ├── blip_caption_large_coco.yaml │ │ │ │ ├── blip_classification_base.yaml │ │ │ │ ├── blip_feature_extractor_base.yaml │ │ │ │ ├── blip_itm_base.yaml │ │ │ │ ├── blip_itm_large.yaml │ │ │ │ ├── blip_nlvr.yaml │ │ │ │ ├── blip_pretrain_base.yaml │ │ │ │ ├── blip_pretrain_large.yaml │ │ │ │ ├── blip_retrieval_coco.yaml │ │ │ │ ├── blip_retrieval_flickr.yaml │ │ │ │ ├── blip_vqa_aokvqa.yaml │ │ │ │ ├── blip_vqa_okvqa.yaml │ │ │ │ ├── blip_vqav2.yaml │ │ │ │ ├── clip │ │ │ │ ├── RN101-quickgelu.json │ │ │ │ ├── RN101.json │ │ │ │ ├── RN50-quickgelu.json │ │ │ │ ├── RN50.json │ │ │ │ ├── RN50x16.json │ │ │ │ ├── RN50x4.json │ │ │ │ ├── ViT-B-16-plus-240.json │ │ │ │ ├── ViT-B-16-plus.json │ │ │ │ ├── ViT-B-16.json │ │ │ │ ├── ViT-B-32-plus-256.json │ │ │ │ ├── ViT-B-32-quickgelu.json │ │ │ │ ├── ViT-B-32.json │ │ │ │ ├── ViT-H-14.json │ │ 
│ │ ├── ViT-H-16.json │ │ │ │ ├── ViT-L-14-280.json │ │ │ │ ├── ViT-L-14-336.json │ │ │ │ ├── ViT-L-14.json │ │ │ │ ├── ViT-L-16-320.json │ │ │ │ ├── ViT-L-16.json │ │ │ │ ├── ViT-g-14.json │ │ │ │ ├── timm-efficientnetv2_rw_s.json │ │ │ │ ├── timm-resnet50d.json │ │ │ │ ├── timm-resnetaa50d.json │ │ │ │ ├── timm-resnetblur50.json │ │ │ │ ├── timm-swin_base_patch4_window7_224.json │ │ │ │ ├── timm-vit_base_patch16_224.json │ │ │ │ ├── timm-vit_base_patch32_224.json │ │ │ │ └── timm-vit_small_patch16_224.json │ │ │ │ ├── clip_resnet50.yaml │ │ │ │ ├── clip_vit_base16.yaml │ │ │ │ ├── clip_vit_base32.yaml │ │ │ │ ├── clip_vit_large14.yaml │ │ │ │ ├── clip_vit_large14_336.yaml │ │ │ │ ├── gpt_dialogue_base.yaml │ │ │ │ ├── img2prompt-vqa │ │ │ │ └── img2prompt_vqa_base.yaml │ │ │ │ ├── med_config.json │ │ │ │ ├── med_config_albef.json │ │ │ │ ├── med_large_config.json │ │ │ │ └── pnp-vqa │ │ │ │ ├── pnp_vqa_3b.yaml │ │ │ │ ├── pnp_vqa_base.yaml │ │ │ │ ├── pnp_vqa_large.yaml │ │ │ │ ├── unifiedqav2_3b_config.json │ │ │ │ ├── unifiedqav2_base_config.json │ │ │ │ └── unifiedqav2_large_config.json │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── base_model.py │ │ │ ├── blip2_models │ │ │ │ ├── Qformer.py │ │ │ │ ├── __init__.py │ │ │ │ ├── blip2.py │ │ │ │ ├── blip2_image_text_matching.py │ │ │ │ ├── blip2_opt.py │ │ │ │ ├── blip2_qformer.py │ │ │ │ ├── blip2_t5.py │ │ │ │ ├── blip2_t5_instruct.py │ │ │ │ ├── blip2_vicuna_instruct.py │ │ │ │ ├── modeling_llama.py │ │ │ │ ├── modeling_opt.py │ │ │ │ └── modeling_t5.py │ │ │ ├── blip_models │ │ │ │ ├── __init__.py │ │ │ │ ├── blip.py │ │ │ │ ├── blip_caption.py │ │ │ │ ├── blip_classification.py │ │ │ │ ├── blip_feature_extractor.py │ │ │ │ ├── blip_image_text_matching.py │ │ │ │ ├── blip_nlvr.py │ │ │ │ ├── blip_outputs.py │ │ │ │ ├── blip_pretrain.py │ │ │ │ ├── blip_retrieval.py │ │ │ │ ├── blip_vqa.py │ │ │ │ └── nlvr_encoder.py │ │ │ ├── clip_vit.py │ │ │ ├── eva_vit.py │ │ │ ├── med.py │ │ │ ├── timesformer │ │ │ │ ├── __init__.py │ │ │ │ ├── conv2d_same.py │ │ │ │ ├── features.py │ │ │ │ ├── helpers.py │ │ │ │ ├── linear.py │ │ │ │ ├── vit.py │ │ │ │ └── vit_utils.py │ │ │ └── vit.py │ │ └── processors │ │ │ ├── __init__.py │ │ │ ├── base_processor.py │ │ │ ├── blip_processors.py │ │ │ ├── clip_processors.py │ │ │ ├── functional_video.py │ │ │ ├── gpt_processors.py │ │ │ └── randaugment.py │ ├── internlm │ │ ├── __init__.py │ │ ├── build_mlp.py │ │ ├── configuration_internlm_xcomposer2.py │ │ ├── modeling_internlm2.py │ │ ├── modeling_internlm_xcomposer2.py │ │ ├── rewrite_modeling_internlm_xcomposer2.py │ │ ├── tokenization_internlm_xcomposer2.py │ │ └── zero_to_fp32.py │ ├── kosmos2 │ │ ├── data │ │ │ ├── dict.txt │ │ │ └── sentencepiece.bpe.model │ │ ├── unilm │ │ │ ├── __init__.py │ │ │ ├── data │ │ │ │ └── utils.py │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ ├── connector.py │ │ │ │ ├── gpt.py │ │ │ │ ├── gpt_eval.py │ │ │ │ ├── unigpt.py │ │ │ │ └── vl │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── clip.py │ │ │ │ │ └── vlm_generator.py │ │ │ └── tasks │ │ │ │ ├── __init__.py │ │ │ │ └── generation_obj.py │ │ └── utils.py │ ├── llama_adapter_v2 │ │ ├── __init__.py │ │ ├── llama.py │ │ ├── llama_adapter.py │ │ ├── tokenizer.py │ │ └── utils.py │ ├── minigpt4 │ │ ├── __init__.py │ │ ├── common │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ ├── dist_utils.py │ │ │ ├── gradcam.py │ │ │ ├── logger.py │ │ │ ├── optims.py │ │ │ ├── registry.py │ │ │ └── utils.py │ │ ├── configs │ │ │ ├── default.yaml │ │ │ └── models │ │ │ │ └── minigpt4.yaml │ 
│ ├── conversation │ │ │ ├── __init__.py │ │ │ └── conversation.py │ │ ├── minigpt4_eval.yaml │ │ ├── models │ │ │ ├── Qformer.py │ │ │ ├── __init__.py │ │ │ ├── base_model.py │ │ │ ├── blip2.py │ │ │ ├── blip2_outputs.py │ │ │ ├── eva_vit.py │ │ │ ├── mini_gpt4.py │ │ │ └── modeling_llama.py │ │ └── processors │ │ │ ├── __init__.py │ │ │ ├── base_processor.py │ │ │ ├── blip_processors.py │ │ │ └── randaugment.py │ ├── mplug_owl │ │ ├── __init__.py │ │ ├── configuration_mplug_owl.py │ │ ├── modeling_mplug_owl.py │ │ ├── processing_mplug_owl.py │ │ └── tokenization_mplug_owl.py │ ├── otter │ │ ├── __init__.py │ │ ├── config.json │ │ ├── configuration_otter.py │ │ ├── flamingo_pt2otter_hf.py │ │ ├── modeling_otter.py │ │ └── otter_pt2otter_hf.py │ ├── qwen │ │ ├── __init__.py │ │ ├── configuration_qwen.py │ │ ├── modeling_qwen.py │ │ ├── qwen_generation_utils.py │ │ ├── tokenization_qwen.py │ │ └── visual.py │ ├── rlhfv │ │ ├── __init__.py │ │ ├── beit3.py │ │ ├── conversation.py │ │ ├── llava.py │ │ ├── muffin.py │ │ └── utils.py │ ├── shikra │ │ ├── __init__.py │ │ ├── builder │ │ │ ├── __init__.py │ │ │ ├── build_shikra.py │ │ │ └── builder.py │ │ ├── conversation │ │ │ ├── __init__.py │ │ │ └── base_conversation.py │ │ ├── dataset │ │ │ ├── __init__.py │ │ │ ├── builder.py │ │ │ ├── process_function │ │ │ │ ├── __init__.py │ │ │ │ ├── box_process_function.py │ │ │ │ └── shikra_process_function.py │ │ │ ├── root.py │ │ │ ├── single_image_convsation.py │ │ │ ├── single_image_interactive.py │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── compute_metrics.py │ │ │ │ ├── concatenate_dataset.py │ │ │ │ ├── flickr30k_entities_utils.py │ │ │ │ ├── io.py │ │ │ │ ├── mixin.py │ │ │ │ └── transform.py │ │ ├── shikra │ │ │ ├── __init__.py │ │ │ ├── apply_delta.py │ │ │ ├── make_delta.py │ │ │ └── shikra.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── common.py │ │ │ └── llama_flash_attn_monkey_patch.py │ ├── test_base.py │ ├── test_gemini.py │ ├── test_gpt.py │ ├── test_instructblip.py │ ├── test_internlmxcomposer.py │ ├── test_kosmos.py │ ├── test_lamm.py │ ├── test_lamm15.py │ ├── test_llamaadapterv2.py │ ├── test_llava15.py │ ├── test_llavarlhf.py │ ├── test_minigpt4.py │ ├── test_mplugowl.py │ ├── test_octavius.py │ ├── test_otter.py │ ├── test_qwenvl.py │ ├── test_rlhfv.py │ ├── test_shikra.py │ └── utils.py ├── resources │ └── ChEF-logo.png ├── scenario │ ├── Ch3Ef_dataset.py │ ├── LAMM_dataset.py │ ├── MMBench_dataset.py │ ├── MME_dataset.py │ ├── MMMU_dataset.py │ ├── POPE_dataset.py │ ├── SEED_Bench2_dataset.py │ ├── SEED_Bench_dataset.py │ ├── Winoground_dataset.py │ ├── __init__.py │ ├── caption_dataset.py │ ├── classification.py │ ├── counting_dataset.py │ ├── det_dataset.py │ ├── lamm_sysmsg.py │ ├── octavius_pcl_dataset.py │ ├── utils.py │ └── vqa_dataset.py ├── test │ ├── test_model.py │ ├── test_model.sh │ └── test_recipes.py └── tools │ └── eval_results.py ├── cli_demo.py ├── config ├── ChEF │ ├── desiderata_recipes │ │ ├── Calibration │ │ │ ├── MMBench.yaml │ │ │ └── ScienceQA.yaml │ │ ├── Hallucination │ │ │ ├── POPE_adversarial.yaml │ │ │ ├── POPE_popular.yaml │ │ │ └── POPE_random.yaml │ │ ├── ICL │ │ │ ├── MMBench.yaml │ │ │ └── ScienceQA.yaml │ │ ├── Insfollow │ │ │ ├── MMBench.yaml │ │ │ └── ScienceQA.yaml │ │ └── Robust │ │ │ ├── MMBench.yaml │ │ │ └── ScienceQA.yaml │ ├── models │ │ ├── gemini.yaml │ │ ├── gpt.yaml │ │ ├── instructblip_vicuna.yaml │ │ ├── internlm_xcomposer.yaml │ │ ├── kosmos2.yaml │ │ ├── lamm.yaml │ │ ├── lamm15.yaml │ │ ├── lamm_3d.yaml │ │ ├── 
llamaadapterv2.yaml │ │ ├── llava15.yaml │ │ ├── llavarlhf.yaml │ │ ├── minigpt4.yaml │ │ ├── mplug.yaml │ │ ├── octavius_2d+3d.yaml │ │ ├── octavius_2d.yaml │ │ ├── octavius_3d.yaml │ │ ├── otter.yaml │ │ ├── qwen_vl.yaml │ │ ├── rlhfv.yaml │ │ ├── shikra.yaml │ │ └── test.yaml │ └── scenario_recipes │ │ ├── CIFAR10 │ │ ├── direct.yaml │ │ └── ppl.yaml │ │ ├── Ch3Ef │ │ ├── Harmless.yaml │ │ ├── Harmless_ppl.yaml │ │ ├── Helpful.yaml │ │ ├── Helpful_ppl.yaml │ │ ├── Honest.yaml │ │ └── Honest_ppl.yaml │ │ ├── FSC147 │ │ ├── direct.yaml │ │ └── ppl.yaml │ │ ├── Flickr30k │ │ ├── direct.yaml │ │ ├── random_ppl.yaml │ │ └── topp_ppl.yaml │ │ ├── LAMM │ │ ├── AI2D.yaml │ │ ├── CIFAR10.yaml │ │ ├── CelebA_hair.yaml │ │ ├── CelebA_smile.yaml │ │ ├── FSC147.yaml │ │ ├── Flickr30k.yaml │ │ ├── SVT.yaml │ │ ├── ScanNet.yaml │ │ ├── ScanQA.yaml │ │ ├── ScanRefer.yaml │ │ ├── ScienceQA.yaml │ │ ├── UCMerced.yaml │ │ ├── VOC2012.yaml │ │ ├── locating_LSP.yaml │ │ └── locating_VOC2012.yaml │ │ ├── MMBench │ │ ├── direct.yaml │ │ └── ppl.yaml │ │ ├── MME │ │ ├── direct.yaml │ │ └── ppl.yaml │ │ ├── MMMU │ │ └── default.yaml │ │ ├── Octavius3D │ │ ├── nr3d_caption_direct3d.yaml │ │ ├── scan_caption_direct3d.yaml │ │ ├── scan_cls_direct3d.yaml │ │ ├── scan_vqa_direct3d.yaml │ │ └── shapenet_cls_direct3d.yaml │ │ ├── Omnibenchmark │ │ ├── multiturn_direct.yaml │ │ ├── multiturn_ppl.yaml │ │ ├── singleturn_direct.yaml │ │ └── singleturn_ppl.yaml │ │ ├── SEEDBench-2 │ │ └── ppl.yaml │ │ ├── SEEDBench │ │ └── default.yaml │ │ ├── ScienceQA │ │ ├── direct.yaml │ │ ├── direct_CoT.yaml │ │ ├── ppl.yaml │ │ └── ppl_CoT.yaml │ │ ├── VOC2012 │ │ ├── kosmos2_multiturn_direct.yaml │ │ ├── kosmos2_multiturn_ppl.yaml │ │ ├── kosmos2_singleturn_direct.yaml │ │ ├── multiturn_direct.yaml │ │ ├── multiturn_ppl.yaml │ │ ├── shikra_multiturn_direct.yaml │ │ ├── shikra_multiturn_ppl.yaml │ │ ├── shikra_singleturn_direct.yaml │ │ └── singleturn_direct.yaml │ │ └── Winoground │ │ └── default.yaml ├── LAMM │ ├── train.yaml │ ├── train_ds3.yaml │ └── train_sft.yaml └── Octavius │ ├── octavius_2d+3d_e6_bs64.yaml │ ├── octavius_2d_e4_bs64.yaml │ └── octavius_3d_e3_bs64.yaml ├── datasets ├── __init__.py ├── dataset.py ├── samplers.py ├── system_msg.py └── utils.py ├── dist.py ├── eval.py ├── model ├── LAMM │ ├── CLIP │ │ ├── __init__.py │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── clip.py │ │ ├── model.py │ │ └── simple_tokenizer.py │ ├── EPCL │ │ ├── __init__.py │ │ ├── epcl.py │ │ ├── helpers.py │ │ ├── model_3detr.py │ │ ├── position_embedding.py │ │ ├── third_party │ │ │ └── pointnet2 │ │ │ │ ├── _ext_src │ │ │ │ ├── include │ │ │ │ │ ├── ball_query.h │ │ │ │ │ ├── cuda_utils.h │ │ │ │ │ ├── group_points.h │ │ │ │ │ ├── interpolate.h │ │ │ │ │ ├── sampling.h │ │ │ │ │ └── utils.h │ │ │ │ └── src │ │ │ │ │ ├── ball_query.cpp │ │ │ │ │ ├── ball_query_gpu.cu │ │ │ │ │ ├── bindings.cpp │ │ │ │ │ ├── group_points.cpp │ │ │ │ │ ├── group_points_gpu.cu │ │ │ │ │ ├── interpolate.cpp │ │ │ │ │ ├── interpolate_gpu.cu │ │ │ │ │ ├── sampling.cpp │ │ │ │ │ └── sampling_gpu.cu │ │ │ │ ├── build │ │ │ │ └── temp.linux-x86_64-cpython-310 │ │ │ │ │ ├── .ninja_deps │ │ │ │ │ ├── .ninja_log │ │ │ │ │ └── build.ninja │ │ │ │ ├── pointnet2_modules.py │ │ │ │ ├── pointnet2_test.py │ │ │ │ ├── pointnet2_utils.py │ │ │ │ ├── pytorch_utils.py │ │ │ │ └── setup.py │ │ ├── transformer.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── ap_calculator.py │ │ │ ├── box_intersection.c │ │ │ ├── box_intersection.pyx │ │ │ ├── box_ops3d.py │ │ │ ├── box_util.py │ 
│ │ ├── cython_compile.py │ │ │ ├── cython_compile.sh │ │ │ ├── dist.py │ │ │ ├── download_weights.py │ │ │ ├── eval_det.py │ │ │ ├── io.py │ │ │ ├── logger.py │ │ │ ├── misc.py │ │ │ ├── nms.py │ │ │ ├── pc_util.py │ │ │ └── random_cuboid.py │ ├── README.md │ ├── __init__.py │ ├── conversations.py │ ├── flash_attn_patch.py │ ├── modeling_lightllm.py │ ├── modeling_llama.py │ ├── openlamm.py │ ├── utils │ │ ├── __init__.py │ │ ├── data.py │ │ ├── helpers.py │ │ ├── multimodal_preprocessors.py │ │ └── pcl_utils.py │ └── xformers_patch.py ├── Octavius │ ├── __init__.py │ ├── moe │ │ ├── __init__.py │ │ ├── layer.py │ │ └── moe_lora.py │ ├── octavius.py │ └── resampler3d.py ├── __init__.py ├── llava │ ├── __init__.py │ ├── constants.py │ ├── conversation.py │ ├── mm_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── apply_delta.py │ │ ├── builder.py │ │ ├── consolidate.py │ │ ├── language_model │ │ │ ├── llava_llama.py │ │ │ ├── llava_mpt.py │ │ │ └── mpt │ │ │ │ ├── adapt_tokenizer.py │ │ │ │ ├── attention.py │ │ │ │ ├── blocks.py │ │ │ │ ├── configuration_mpt.py │ │ │ │ ├── custom_embedding.py │ │ │ │ ├── flash_attn_triton.py │ │ │ │ ├── hf_prefixlm_converter.py │ │ │ │ ├── meta_init_context.py │ │ │ │ ├── modeling_mpt.py │ │ │ │ ├── norm.py │ │ │ │ └── param_init_fns.py │ │ ├── llava_arch.py │ │ ├── make_delta.py │ │ ├── multimodal_encoder │ │ │ ├── builder.py │ │ │ └── clip_encoder.py │ │ ├── multimodal_projector │ │ │ └── builder.py │ │ └── utils.py │ ├── train │ │ ├── llama_flash_attn_monkey_patch.py │ │ ├── llama_xformers_attn_monkey_patch.py │ │ ├── llava_trainer.py │ │ ├── train.py │ │ ├── train_mem.py │ │ └── train_xformers.py │ └── utils.py └── training_agent.py ├── slurm_eval.sh ├── slurm_eval_icl.sh ├── tools ├── ChEF │ ├── eval_calibration.py │ ├── eval_hallucination.py │ ├── eval_icl.py │ ├── eval_insfollow.py │ ├── eval_langperf.py │ └── eval_robust.py ├── LAMM │ ├── eval_lamm2d.sh │ ├── eval_lamm3d.sh │ ├── train_lamm2d.sh │ ├── train_lamm2d_sft_stg1_slurm.sh │ ├── train_lamm2d_sft_stg2_slurm.sh │ ├── train_lamm2d_slurm.sh │ ├── train_lamm3d.sh │ └── train_lamm3d_slurm.sh └── Octavius │ ├── ULIP │ ├── .DS_Store │ ├── data │ │ ├── .DS_Store │ │ ├── ScanRefer.yaml │ │ ├── ScanReferValid.yaml │ │ ├── dataset_3d.py │ │ ├── dataset_catalog.json │ │ ├── labels.json │ │ └── templates.json │ ├── main.py │ ├── models │ │ ├── .DS_Store │ │ ├── ULIP_models.py │ │ ├── losses.py │ │ ├── pointbert │ │ │ ├── .DS_Store │ │ │ ├── PointTransformer_8192point.yaml │ │ │ ├── checkpoint.py │ │ │ ├── dvae.py │ │ │ ├── logger.py │ │ │ ├── misc.py │ │ │ └── point_encoder.py │ │ └── pointnet2 │ │ │ ├── .DS_Store │ │ │ ├── pointnet2.py │ │ │ └── pointnet2_utils.py │ ├── scripts │ │ └── pretrain_pointbert.sh │ └── utils │ │ ├── .DS_Store │ │ ├── __init__.py │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── build.py │ │ ├── config.py │ │ ├── io.py │ │ ├── logger.py │ │ ├── registry.py │ │ ├── tokenizer.py │ │ └── utils.py │ ├── octavius_ChEF.sh │ └── train_octavius_slurm.sh └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | # source file related 2 | *.pyc 3 | *.o 4 | 5 | 6 | 7 | *.so 8 | *.egg 9 | *.egg-info 10 | 11 | 12 | 13 | # training related 14 | *.log 15 | *.pth 16 | *.pt 17 | *.model 18 | !*.bpe.model 19 | *.0 20 | 21 | # result related 22 | answers/ 23 | results/ 24 | ckpt/* 25 | !ckpt/.gitkeep 26 | 27 | *.jsonl 28 | 29 | 30 | # assets related 31 | data/* 32 | !data/.gitkeep 33 | model_zoo/* 34 | !model_zoo/.gitkeep 35 | 36 | # package related 
37 | src/run.sh 38 | 39 | *.ttf 40 | -------------------------------------------------------------------------------- /ckpt/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/ckpt/.gitkeep -------------------------------------------------------------------------------- /data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/data/.gitkeep -------------------------------------------------------------------------------- /images/Ch3Ef_intro.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/images/Ch3Ef_intro.png -------------------------------------------------------------------------------- /images/ChEF-benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/images/ChEF-benchmark.png -------------------------------------------------------------------------------- /images/ChEF-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/images/ChEF-logo.png -------------------------------------------------------------------------------- /images/LAMM-Dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/images/LAMM-Dataset.png -------------------------------------------------------------------------------- /images/LAMM-Framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/images/LAMM-Framework.png -------------------------------------------------------------------------------- /images/LAMM-benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/images/LAMM-benchmark.png -------------------------------------------------------------------------------- /images/LAMM_2d_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/images/LAMM_2d_demo.png -------------------------------------------------------------------------------- /images/LAMM_Imagewall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/images/LAMM_Imagewall.png -------------------------------------------------------------------------------- /images/Octavius_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/images/Octavius_arch.png -------------------------------------------------------------------------------- /images/ch3ef-logo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/images/ch3ef-logo.png
--------------------------------------------------------------------------------
/images/lamm-title.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/images/lamm-title.png
--------------------------------------------------------------------------------
/images/lamm-video.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/images/lamm-video.png
--------------------------------------------------------------------------------
/requirements/ChEF.txt:
--------------------------------------------------------------------------------
PyYAML==6.0.1
tqdm==4.64.1
pandas==2.0.3
transformers==4.31.0
sentence-transformers==2.2.2
--------------------------------------------------------------------------------
/requirements/default.txt:
--------------------------------------------------------------------------------
data
ninja
accelerate>=0.20.3
einops==0.6.1
ftfy==6.1.1
iopath==0.1.10
ipdb==0.13.13
numpy==1.24.3
Pillow==9.5.0
PyYAML==6.0.1
regex==2022.10.31
pytorchvideo
fvcore
decord==0.6.0
tqdm
setuptools==65.5.1
bigmodelvis
nltk
tensorboard
cython
plyfile
trimesh
sentencepiece
--------------------------------------------------------------------------------
/requirements/optional.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/requirements/optional.txt
--------------------------------------------------------------------------------
/src/ChEF/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/ChEF/__init__.py
--------------------------------------------------------------------------------
/src/ChEF/data_process/mmbench.py:
--------------------------------------------------------------------------------
import os
import io
import base64
import pandas as pd
from PIL import Image

def decode_base64_to_image(base64_string):
    image_data = base64.b64decode(base64_string)
    image = Image.open(io.BytesIO(image_data))
    return image

def main(split='dev'):
    base_path = '../../../data/MMBench'
    save_image_dir = os.path.join(base_path, 'images')
    os.makedirs(save_image_dir, exist_ok=True)
    df = pd.read_csv(os.path.join(base_path, f'mmbench_{split}_20230712.tsv'), sep='\t')
    for i in range(len(df)):
        image = df.iloc[i]['image']
        index = df.iloc[i]['index']
        image = decode_base64_to_image(image)
        image_name = f'mmbench_image_{index}.png'
        image.save(os.path.join(save_image_dir, image_name))


if __name__ == '__main__':
    main()
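A quick way to sanity-check the decoding step before exporting every image is to decode a single row; a minimal sketch reusing the helpers above (the TSV file name and the 'image'/'index' columns come from the script itself, everything else is illustrative):

    # Hypothetical spot-check: decode the first dev-split row and inspect it.
    base_path = '../../../data/MMBench'
    df = pd.read_csv(os.path.join(base_path, 'mmbench_dev_20230712.tsv'), sep='\t')
    sample = df.iloc[0]
    image = decode_base64_to_image(sample['image'])  # returns a PIL.Image.Image
    print(sample['index'], image.size)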
--------------------------------------------------------------------------------
/src/ChEF/inferencer/__init__.py:
--------------------------------------------------------------------------------
from .Singleturn import Direct3D_Inferencer, Direct_Inferencer, PPL_Inferencer
from .Multiturn import Multi_Turn_PPL_Inferencer, Multi_Direct_Inferencer

inferencer_dict = {
    'Direct': Direct_Inferencer,
    'Direct3D': Direct3D_Inferencer,
    'PPL': PPL_Inferencer,
    'Multi_PPL': Multi_Turn_PPL_Inferencer,
    'Multi_Direct': Multi_Direct_Inferencer,
}

def build_inferencer(inferencer_type, **kwargs):
    return inferencer_dict[inferencer_type](**kwargs)
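build_inferencer is a thin factory over inferencer_dict: the first argument selects the class by name and every remaining keyword argument is forwarded untouched to that class's constructor, presumably filled in from one of the recipe YAMLs under config/ChEF. A minimal sketch of selecting an inferencer (the keyword arguments shown are hypothetical placeholders, used only to show that **kwargs is passed through):

    # 'PPL' resolves to PPL_Inferencer via inferencer_dict above.
    # dataset_name / save_base_dir are illustrative kwargs, not a documented API.
    inferencer = build_inferencer(
        'PPL',
        dataset_name='ScienceQA',
        save_base_dir='results',
    )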
--------------------------------------------------------------------------------
/src/ChEF/inferencer/utils.py:
--------------------------------------------------------------------------------
from PIL import Image

def copy_batch_dict(batch, idx):
    answer_dict = {}
    for key in batch.keys():
        if not isinstance(batch[key], list):
            answer_dict[key] = batch[key]
        else:
            answer_dict[key] = batch[key][idx]
    if 'image_path' in answer_dict and isinstance(answer_dict['image_path'], Image.Image):  # image_path in MMBench is Image.Image
        del answer_dict['image_path']
    return answer_dict
--------------------------------------------------------------------------------
/src/ChEF/instruction/ice_retriever/__init__.py:
--------------------------------------------------------------------------------
from .random_retriever import RandomRetriever
from .topk_retriever import TopkRetriever
from .fixed_retriever import FixedRetriever
from .topk_retriever_img import ImageTopkRetriever


retriever_dict = {
    'random': RandomRetriever,
    'topk_text': TopkRetriever,
    'fixed': FixedRetriever,
    'topk_img': ImageTopkRetriever
}

def build_retriever(train_dataset, test_dataset, retriever_type, **kwargs):
    build_fuc = retriever_dict[retriever_type]
    return build_fuc(train_dataset, test_dataset, **kwargs)
--------------------------------------------------------------------------------
/src/ChEF/metric/Ch3Ef.py:
--------------------------------------------------------------------------------
from tqdm import tqdm
from .utils import Base_Metric

class Ch3Ef_Metric(Base_Metric):

    def __init__(self, dataset_name, ppl=False, **kwargs):
        super().__init__(dataset_name)
        self.ppl = ppl

    def ppl_metric(self, answers):
        score = 0.0
        for item in tqdm(answers, desc="Running Metric"):
            if "I don't know." in item['options']:
                gt = item['options'][:2]
            else:
                gt = item['options'][:1]
            pred_text = item['answer']
            result = pred_text in gt
            score += result
            item['metric_result'] = result
        score = score / len(answers) * 100
        return dict(
            ACC = score,
        ), answers

    def metric_func(self, answers):
        if self.ppl:
            return self.ppl_metric(answers)
        return dict(), answers
--------------------------------------------------------------------------------
/src/ChEF/metric/counting.py:
--------------------------------------------------------------------------------
from tqdm import tqdm
from .utils import Base_Metric

class Counting(Base_Metric):
    def __init__(self, dataset_name, inference_type='direct', **kwargs):
        super().__init__(dataset_name)
        self.inference_type = inference_type
        assert self.inference_type in ['direct', 'ppl']
        from .utils import ennum2numerical
        self.parse_num_func = ennum2numerical

    def mae_metric(self, answers):
        score = 0
        for item in tqdm(answers, desc="Running MAE Metric"):
            gt_num = item['gt_answers']
            text = item['answer']
            pred_num = self.parse_num_func(text)
            score += min(gt_num, abs(pred_num - gt_num))
        return score / len(answers)

    def acc_metric(self, answers):
        score = 0
        for item in tqdm(answers, desc="Running ACC Metric"):
            gt_num = item['gt_answers']
            text = item['answer']
            pred_num = self.parse_num_func(text)
            score += (pred_num == gt_num)
        return score / len(answers) * 100

    def metric_func(self, answers):
        res_dict = {}
        if self.inference_type == 'direct':
            res_dict['MAE'] = self.mae_metric(answers)
        res_dict['ACC'] = self.acc_metric(answers)
        return res_dict, answers
--------------------------------------------------------------------------------
/src/ChEF/metric/ocr.py:
--------------------------------------------------------------------------------
from tqdm import tqdm

from .utils import Base_Metric, parse_caption_sentence

class SVT_OCR(Base_Metric):

    def __init__(self, dataset_name, **kwargs):
        super().__init__(dataset_name)

    def metric_func(self, answers):
        score = 0.0
        for item in tqdm(answers, desc="Running Metric"):
            gt_word_list = item['gt_answers']
            pred_text = item['answer']
            pred_word_list = parse_caption_sentence(pred_text).lower().split()
            correct = 0
            for word in gt_word_list:
                if word.lower() in pred_word_list:
                    correct += 1
            tmp_score = correct / len(gt_word_list)
            score += tmp_score
            item['metric_result'] = tmp_score

        return dict(
            ACC = score / len(answers),
        ), answers
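The metric classes above share one contract: metric_func consumes a list of answer records (each carrying the dataset's gt_answers and the model's answer), returns a dict of scores together with the answer list, and writes metric_result back onto items where a per-item score is computed. A minimal sketch against Counting (the records are fabricated, the dataset name is arbitrary, and src/ is assumed to be on PYTHONPATH so that Base_Metric and ennum2numerical resolve from ChEF.metric.utils):

    from ChEF.metric.counting import Counting

    answers = [
        {'gt_answers': 3, 'answer': 'There are three apples in the image.'},
        {'gt_answers': 5, 'answer': 'I can count 4 birds.'},
    ]
    metric = Counting('FSC147', inference_type='direct')
    scores, answers = metric.metric_func(answers)
    print(scores)  # a dict with 'MAE' and 'ACC' in the direct setting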
--------------------------------------------------------------------------------
/src/ChEF/models/instruct_blip/__init__.py:
--------------------------------------------------------------------------------
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import os
import sys

from omegaconf import OmegaConf

from .common.registry import registry
from .models import *
from .processors import *


root_dir = os.path.dirname(os.path.abspath(__file__))
default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml"))

registry.register_path("library_root", root_dir)
repo_root = os.path.join(root_dir, "..")
registry.register_path("repo_root", repo_root)
cache_root = os.path.join(repo_root, default_cfg.env.cache_root)
registry.register_path("cache_root", cache_root)

registry.register("MAX_INT", sys.maxsize)
registry.register("SPLIT_NAMES", ["train", "val", "test"])
--------------------------------------------------------------------------------
/src/ChEF/models/instruct_blip/common/gradcam.py:
--------------------------------------------------------------------------------
import numpy as np
from matplotlib import pyplot as plt
from scipy.ndimage import filters
from skimage import transform as skimage_transform


def getAttMap(img, attMap, blur=True, overlap=True):
    attMap -= attMap.min()
    if attMap.max() > 0:
        attMap /= attMap.max()
    attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant")
    if blur:
        attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2]))
        attMap -= attMap.min()
        attMap /= attMap.max()
    cmap = plt.get_cmap("jet")
    attMapV = cmap(attMap)
    attMapV = np.delete(attMapV, 3, 2)
    if overlap:
        attMap = (
            1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img
            + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV
        )
    return attMap
--------------------------------------------------------------------------------
/src/ChEF/models/instruct_blip/common/vqa_tools/__init__.py:
--------------------------------------------------------------------------------
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

__author__ = "aagrawal"
--------------------------------------------------------------------------------
/src/ChEF/models/instruct_blip/configs/default.yaml:
--------------------------------------------------------------------------------
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

env:
  # For default users
  # cache_root: "cache"
  # For internal use with persistent storage
  cache_root: "/export/home/.cache/lavis"
-------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/albef_classification_ve.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_classification 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_snli_ve_lavis.pt" 11 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 12 | 13 | num_classes: 3 14 | 15 | use_distill: True 16 | momentum: 0.995 17 | alpha: 0.4 18 | 19 | # vit encoder 20 | vit_type: "base" 21 | vit_grad_ckpt: False 22 | vit_ckpt_layer: 0 23 | vit_layer_norm_epsilon: 1e-6 24 | 25 | image_size: 384 26 | 27 | # bert config 28 | med_config_path: "configs/models/med_config_albef.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | eval: 35 | name: "blip_image_eval" 36 | text_processor: 37 | train: 38 | name: "blip_caption" 39 | eval: 40 | name: "blip_caption" 41 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/albef_feature_extractor.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | image_size: 224 13 | vit_ckpt_layer: 0 14 | vit_drop_path_rate: 0 15 | vit_layer_norm_epsilon: 1e-6 16 | vit_grad_ckpt: False 17 | 18 | # bert config 19 | med_config_path: "configs/models/med_config_albef.json" 20 | 21 | embed_dim: 256 22 | 23 | preprocess: 24 | vis_processor: 25 | eval: 26 | name: "blip_image_eval" 27 | image_size: 224 28 | text_processor: 29 | eval: 30 | name: "blip_caption" 31 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/albef_nlvr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_nlvr 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/pretrain_model_nlvr.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_nlvr_lavis.pt" 12 | 13 | num_classes: 2 14 | 15 | use_distill: True 16 | momentum: 0.995 17 | alpha: 0.4 18 | 19 | # vit encoder 20 | vit_type: "base" 21 | vit_grad_ckpt: False 22 | vit_ckpt_layer: 0 23 | vit_layer_norm_epsilon: 1e-6 24 | 25 | image_size: 384 26 | 27 | # bert config 28 | med_config_path: "configs/models/med_config_albef.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 384 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 384 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/albef_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | image_size: 224 15 | vit_ckpt_layer: 0 16 | vit_drop_path_rate: 0 17 | vit_layer_norm_epsilon: 1e-6 18 | vit_grad_ckpt: False 19 | 20 | # bert config 21 | med_config_path: "configs/models/med_config_albef.json" 22 | mlm_mask_prob: 0.15 23 | 24 | embed_dim: 256 25 | momentum: 0.995 26 | alpha: 0.4 27 | temp: 0.07 28 | 29 | max_txt_len: 30 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 256 36 | text_processor: 37 | train: 38 | name: "blip_caption" 39 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/albef_retrieval_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_coco_retrieval_lavis.pt" 12 | 13 | queue_size: 65536 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | image_size: 384 18 | vit_ckpt_layer: 0 19 | vit_drop_path_rate: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | vit_grad_ckpt: False 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_config_albef.json" 25 | 26 | embed_dim: 256 27 | momentum: 0.995 28 | alpha: 0.4 29 | temp: 0.07 30 | use_distill: True 31 | 32 | max_txt_len: 30 33 | 34 | preprocess: 35 | vis_processor: 36 | train: 37 | name: "blip_image_train" 38 | image_size: 384 39 | eval: 40 | name: "blip_image_eval" 41 | image_size: 384 42 | text_processor: 43 | train: 44 | name: "blip_caption" 45 | eval: 46 | name: "blip_caption" 47 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/albef_retrieval_flickr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_flickr_retrieval_lavis.pt 12 | 13 | queue_size: 65536 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | image_size: 384 18 | vit_ckpt_layer: 0 19 | vit_drop_path_rate: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | vit_grad_ckpt: False 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_config_albef.json" 25 | 26 | embed_dim: 256 27 | momentum: 0.995 28 | alpha: 0.4 29 | temp: 0.07 30 | use_distill: True 31 | 32 | max_txt_len: 30 33 | 34 | preprocess: 35 | vis_processor: 36 | train: 37 | name: "blip_image_train" 38 | image_size: 384 39 | eval: 40 | name: "blip_image_eval" 41 | image_size: 384 42 | text_processor: 43 | train: 44 | name: "blip_caption" 45 | eval: 46 | name: "blip_caption" 47 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/albef_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_vqav2_lavis.pt" 12 | 13 | use_distill: True 14 | momentum: 0.995 15 | alpha: 0.4 16 | 17 | # vit encoder 18 | vit_type: "base" 19 | vit_grad_ckpt: False 20 | vit_ckpt_layer: 0 21 | vit_layer_norm_epsilon: 1e-6 22 | 23 | image_size: 384 24 | 25 | # bert config 26 | med_config_path: "configs/models/med_config_albef.json" 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 384 33 | eval: 34 | name: "blip_image_eval" 35 | image_size: 384 36 | text_processor: 37 | train: 38 | name: "blip_question" 39 | eval: 40 | name: "blip_question" 41 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/alpro_qa_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | num_classes: 1500 9 | 10 | load_finetuned: True 11 | 12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_qa.pth" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 14 | 15 | timesformer: 16 | n_frms: 16 17 | image_size: 224 18 | 19 | patch_size: 16 20 | attn_drop_rate: 0. 21 | drop_rate: 0. 22 | drop_path_rate: 0.1 23 | 24 | use_grad_ckpt: True 25 | ckpt_layer: 12 26 | 27 | # bert config 28 | med_config_path: "configs/models/bert_config_alpro.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "alpro_video_train" 34 | n_frms: 16 35 | image_size: 224 36 | eval: 37 | name: "alpro_video_eval" 38 | n_frms: 16 39 | image_size: 224 40 | text_processor: 41 | train: 42 | name: "blip_caption" 43 | eval: 44 | name: "blip_caption" 45 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/alpro_qa_msvd.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | num_classes: 2423 9 | 10 | load_finetuned: True 11 | 12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msvd_qa.pth" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 14 | 15 | timesformer: 16 | n_frms: 16 17 | image_size: 224 18 | 19 | patch_size: 16 20 | attn_drop_rate: 0. 21 | drop_rate: 0. 
22 | drop_path_rate: 0.1 23 | use_grad_ckpt: True 24 | ckpt_layer: 12 25 | 26 | # bert config 27 | med_config_path: "configs/models/bert_config_alpro.json" 28 | 29 | preprocess: 30 | vis_processor: 31 | train: 32 | name: "alpro_video_train" 33 | n_frms: 16 34 | image_size: 224 35 | eval: 36 | name: "alpro_video_eval" 37 | n_frms: 16 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/alpro_retrieval_didemo.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_didemo_retrieval.pt 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0. 21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | eval: 30 | name: "alpro_video_eval" 31 | n_frms: 8 32 | image_size: 224 33 | text_processor: 34 | eval: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/alpro_retrieval_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_retrieval.pt" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0. 
21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "alpro_video_train" 31 | n_frms: 8 32 | image_size: 224 33 | eval: 34 | name: "alpro_video_eval" 35 | n_frms: 8 36 | image_size: 224 37 | text_processor: 38 | train: 39 | name: "blip_caption" 40 | eval: 41 | name: "blip_caption" 42 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/bert_config_alpro.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": true, 18 | "type_vocab_size": 2, 19 | "vocab_size": 30522, 20 | "encoder_width": 768, 21 | "add_cross_attention": false, 22 | "fusion_layer": 6 23 | } -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_caption_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_flant5xl 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_flant5xl.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_caption_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_opt2.7b 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt2.7b.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_caption_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_opt6.7b 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt6.7b.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: coco 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_finetune_coco.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: True 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 364 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 364 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_instruct_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: flant5xl 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxl_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_instruct_flant5xxl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: flant5xxl 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxxl_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xxl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_instruct_vicuna13b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna13b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "./llm/vicuna-13b" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_instruct_vicuna7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna7b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "eachadea/vicuna-7b-1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_pretrain.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 224 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 224 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_pretrain_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl_vitL.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | vit_model: "clip_L" 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_pretrain_flant5xxl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xxl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xxl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_pretrain_llama7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip2_llama 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # LLM 24 | llm_model: "/export/home/project/stanford_alpaca/llama_7B" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip2_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_pretrain_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt2.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_pretrain_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt6.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip2/blip2_pretrain_vitL.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_vitL.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | vit_model: "clip_L" 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | 25 | preprocess: 26 | vis_processor: 27 | train: 28 | name: "blip_image_train" 29 | image_size: 224 30 | eval: 31 | name: "blip_image_eval" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_caption_base_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_caption_base.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | 18 | image_size: 384 19 | 20 | # bert config 21 | med_config_path: "configs/models/med_config.json" 22 | 23 | # generation configs 24 | prompt: "a picture of " 25 | 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | eval: 32 | name: "blip_image_eval" 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | prompt: "a picture of " 37 | eval: 38 | name: "blip_caption" 39 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_caption_large_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth" 12 | 13 | vit_type: "large" 14 | vit_grad_ckpt: True 15 | vit_ckpt_layer: 5 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | # generation configs 23 | prompt: "a picture of " 24 | 25 | 26 | preprocess: 27 | vis_processor: 28 | train: 29 | name: "blip_image_train" 30 | eval: 31 | name: "blip_image_eval" 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | prompt: "a picture of " 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_classification_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_classification 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | 10 | use_distill: True 11 | momentum: 0.995 12 | alpha: 0.4 13 | 14 | # vit encoder 15 | vit_type: "base" 16 | vit_grad_ckpt: False 17 | vit_ckpt_layer: 0 18 | 19 | image_size: 384 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_feature_extractor_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | vit_grad_ckpt: False 13 | vit_ckpt_layer: 0 14 | 15 | image_size: 224 16 | 17 | # bert config 18 | med_config_path: "configs/models/med_config.json" 19 | 20 | embed_dim: 256 21 | 22 | preprocess: 23 | vis_processor: 24 | eval: 25 | name: "blip_image_eval" 26 | image_size: 224 27 | text_processor: 28 | eval: 29 | name: "blip_caption" 30 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_itm_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_itm_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "large" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_nlvr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_nlvr 8 | model_type: nlvr 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_nlvr.pth" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 13 | 14 | num_classes: 2 15 | 16 | # vit encoder 17 | vit_type: "base" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | 22 | image_size: 384 23 | 24 | # bert config 25 | med_config_path: "configs/models/med_config.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 224 18 | alpha: 0.4 19 | 20 | # bert config 21 | med_config_path: "configs/models/bert_config.json" 22 | 23 | embed_dim: 256 24 | 25 | # generation configs 26 | prompt: "a picture of " 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_pretrain_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | # vit encoder 10 | vit_type: "large" 11 | vit_grad_ckpt: True 12 | vit_ckpt_layer: 5 13 | 14 | image_size: 224 15 | 16 | # bert config 17 | med_config_path: "configs/models/med_large_config.json" 18 | 19 | embed_dim: 256 20 | 21 | # generation configs 22 | prompt: "a picture of " 23 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_retrieval_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | vit_grad_ckpt: True 18 | vit_ckpt_layer: 4 19 | 20 | image_size: 384 21 | 22 | # bert config 23 | med_config_path: "configs/models/med_config.json" 24 | 25 | embed_dim: 256 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_retrieval_flickr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_flickr_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | alpha: 0.4 15 | 16 | negative_all_rank: False 17 | 18 | # vit encoder 19 | vit_type: "base" 20 | vit_grad_ckpt: True 21 | vit_ckpt_layer: 4 22 | 23 | image_size: 384 24 | 25 | # bert config 26 | med_config_path: "configs/models/med_config.json" 27 | 28 | embed_dim: 256 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 384 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 384 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_vqa_aokvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_aokvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_vqa_okvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_okvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/blip_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 
11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-B-16-plus-240.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 240, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-B-16-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-B-32-plus-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-B-32.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-H-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-L-14-280.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 280, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-L-16-320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 320, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | 
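The CLIP model definitions in this directory all share one small JSON schema: a top-level "embed_dim" plus a "vision_cfg" block and a "text_cfg" block. Below is a minimal illustrative sketch, not part of the repository itself, of reading one of these files with Python's standard json module; the checkout-relative path is an assumption chosen for the example.

import json

# Assumed path relative to a local checkout of the repository; adjust as needed.
cfg_path = "src/ChEF/models/instruct_blip/configs/models/clip/ViT-L-14.json"

with open(cfg_path) as f:
    cfg = json.load(f)

# Shared top-level keys across every config file in this directory.
vision_cfg = cfg["vision_cfg"]  # image_size, layers, width, patch_size, ...
text_cfg = cfg["text_cfg"]      # context_length, vocab_size, width, heads, layers
print(cfg["embed_dim"], vision_cfg["image_size"], text_cfg["context_length"])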
-------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-L-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/ViT-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/timm-efficientnetv2_rw_s.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "efficientnetv2_rw_s", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 288 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/timm-resnet50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnet50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/timm-resnetaa50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetaa50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/timm-resnetblur50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetblur50", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- 
/src/ChEF/models/instruct_blip/configs/models/clip/timm-swin_base_patch4_window7_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "swin_base_patch4_window7_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/timm-vit_base_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_base_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/timm-vit_base_patch32_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_base_patch32_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip/timm-vit_small_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_small_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip_resnet50.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: RN50 10 | 11 | pretrained: openai 12 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip_vit_base16.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-B-16 10 | 11 | pretrained: openai 12 | 13 | preprocess: 14 | vis_processor: 15 | eval: 16 | name: "clip_image_eval" 17 | image_size: 224 18 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip_vit_base32.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-B-32 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 224 53 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip_vit_large14.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 224 53 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/clip_vit_large14_336.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 336 53 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/gpt_dialogue_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: gpt_dialogue 8 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 10 | 11 | len_tokenizer: 50264 # 50257 tokens from gpt2 default tokenizer + additional special tokens 12 | 13 | len_video_ft: 4224 # i3d_rgb: 2048 i3d_flow: 2048 vggish: 128 14 | 15 | preprocess: 16 | vis_processor: 17 | train: 18 | name: "gpt_video_ft" 19 | eval: 20 | name: "gpt_video_ft" 21 | text_processor: 22 | train: 23 | name: "gpt_dialogue" 24 | eval: 25 | name: "gpt_dialogue" -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/img2prompt-vqa/img2prompt_vqa_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: img2prompt_vqa 8 | model_type: base 9 | 10 | image_question_matching_model: 11 | arch: blip_image_text_matching 12 | load_finetuned: True 13 | 14 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" 15 | 16 | # vit encoder 17 | vit_type: "large" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | 21 | image_size: 384 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_large_config.json" 25 | 26 | embed_dim: 256 27 | 28 | image_captioning_model: 29 | arch: blip_caption 30 | load_finetuned: True 31 | 32 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" 33 | 34 | vit_type: "large" 35 | vit_grad_ckpt: True 36 | vit_ckpt_layer: 5 37 | 38 | image_size: 384 39 | 40 | # bert config 41 | med_config_path: "configs/models/med_large_config.json" 42 | 43 | # generation configs 44 | prompt: "a picture of " 45 | 46 | question_generation_moodel: 47 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/projects/img2prompt/T5_large_QG.pth" 48 | 49 | 50 | 51 | preprocess: 52 | vis_processor: 53 | eval: 54 | name: "blip_image_eval" 55 | image_size: 384 56 | text_processor: 57 | eval: 58 | name: "blip_caption" 59 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/med_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- 
/src/ChEF/models/instruct_blip/configs/models/med_config_albef.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true, 21 | "fusion_layer": 6 22 | } -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/med_large_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 1024, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/pnp-vqa/pnp_vqa_3b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: 3b 9 | 10 | image_question_matching_model: 11 | arch: blip_image_text_matching 12 | load_finetuned: True 13 | 14 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" 15 | 16 | # vit encoder 17 | vit_type: "large" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | 21 | image_size: 384 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_large_config.json" 25 | 26 | embed_dim: 256 27 | 28 | image_captioning_model: 29 | arch: blip_caption 30 | load_finetuned: True 31 | 32 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" 33 | 34 | vit_type: "large" 35 | vit_grad_ckpt: True 36 | vit_ckpt_layer: 5 37 | 38 | image_size: 384 39 | 40 | # bert config 41 | med_config_path: "configs/models/med_large_config.json" 42 | 43 | # generation configs 44 | prompt: "a picture of " 45 | 46 | question_answering_model: 47 | arch: pnp_unifiedqav2_fid 48 | 49 | pretrained: "allenai/unifiedqa-v2-t5-3b-1363200" 50 | 51 | t5_config_path: "configs/models/pnp-vqa/unifiedqav2_3b_config.json" 52 | 53 | preprocess: 54 | vis_processor: 55 | eval: 56 | name: "blip_image_eval" 57 | image_size: 384 58 | text_processor: 59 | eval: 60 | name: "blip_caption" 61 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/pnp-vqa/unifiedqav2_base_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "T5ForConditionalGeneration" 4 | ], 5 | "d_ff": 3072, 6 | "d_kv": 64, 7 | "d_model": 768, 8 | "decoder_start_token_id": 0, 9 | "dense_act_fn": "relu", 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 1, 12 | "feed_forward_proj": "relu", 13 | "gradient_checkpointing": false, 14 | "initializer_factor": 1.0, 15 | "is_encoder_decoder": true, 16 | "is_gated_act": false, 17 | "layer_norm_epsilon": 1e-06, 18 | "model_type": "t5", 19 | "n_positions": 512, 20 | "num_decoder_layers": 12, 21 | "num_heads": 12, 22 | "num_layers": 12, 23 | "output_past": true, 24 | "pad_token_id": 0, 25 | "relative_attention_max_distance": 128, 26 | "relative_attention_num_buckets": 32, 27 | "task_specific_params": { 28 | "summarization": { 29 | "early_stopping": true, 30 | "length_penalty": 2.0, 31 | "max_length": 200, 32 | "min_length": 30, 33 | "no_repeat_ngram_size": 3, 34 | "num_beams": 4, 35 | "prefix": "summarize: " 36 | }, 37 | "translation_en_to_de": { 38 | "early_stopping": true, 39 | "max_length": 300, 40 | "num_beams": 4, 41 | "prefix": "translate English to German: " 42 | }, 43 | "translation_en_to_fr": { 44 | "early_stopping": true, 45 | "max_length": 300, 46 | "num_beams": 4, 47 | "prefix": "translate English to French: " 48 | }, 49 | "translation_en_to_ro": { 50 | "early_stopping": true, 51 | "max_length": 300, 52 | "num_beams": 4, 53 | "prefix": "translate English to Romanian: " 54 | } 55 | }, 56 | "transformers_version": "4.21.3", 57 | "use_cache": true, 58 | "vocab_size": 32128 59 | } -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/configs/models/pnp-vqa/unifiedqav2_large_config.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"architectures": [ 3 | "T5ForConditionalGeneration" 4 | ], 5 | "d_ff": 4096, 6 | "d_kv": 64, 7 | "d_model": 1024, 8 | "decoder_start_token_id": 0, 9 | "dense_act_fn": "relu", 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 1, 12 | "feed_forward_proj": "relu", 13 | "gradient_checkpointing": false, 14 | "initializer_factor": 1.0, 15 | "is_encoder_decoder": true, 16 | "is_gated_act": false, 17 | "layer_norm_epsilon": 1e-06, 18 | "model_type": "t5", 19 | "n_positions": 512, 20 | "num_decoder_layers": 24, 21 | "num_heads": 16, 22 | "num_layers": 24, 23 | "output_past": true, 24 | "pad_token_id": 0, 25 | "relative_attention_max_distance": 128, 26 | "relative_attention_num_buckets": 32, 27 | "task_specific_params": { 28 | "summarization": { 29 | "early_stopping": true, 30 | "length_penalty": 2.0, 31 | "max_length": 200, 32 | "min_length": 30, 33 | "no_repeat_ngram_size": 3, 34 | "num_beams": 4, 35 | "prefix": "summarize: " 36 | }, 37 | "translation_en_to_de": { 38 | "early_stopping": true, 39 | "max_length": 300, 40 | "num_beams": 4, 41 | "prefix": "translate English to German: " 42 | }, 43 | "translation_en_to_fr": { 44 | "early_stopping": true, 45 | "max_length": 300, 46 | "num_beams": 4, 47 | "prefix": "translate English to French: " 48 | }, 49 | "translation_en_to_ro": { 50 | "early_stopping": true, 51 | "max_length": 300, 52 | "num_beams": 4, 53 | "prefix": "translate English to Romanian: " 54 | } 55 | }, 56 | "transformers_version": "4.21.3", 57 | "use_cache": true, 58 | "vocab_size": 32128 59 | } -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/models/blip2_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/ChEF/models/instruct_blip/models/blip2_models/__init__.py -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/models/timesformer/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | 7 | Based on https://github.com/facebookresearch/TimeSformer 8 | """ 9 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/models/timesformer/linear.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | """ Linear layer (alternate definition) 9 | """ 10 | import torch 11 | import torch.nn.functional as F 12 | from torch import nn as nn 13 | 14 | 15 | class Linear(nn.Linear): 16 | def forward(self, input: torch.Tensor) -> torch.Tensor: 17 | if torch.jit.is_scripting(): 18 | bias = self.bias.to(dtype=input.dtype) if self.bias is not None else None 19 | return F.linear(input, self.weight.to(dtype=input.dtype), bias=bias) 20 | else: 21 | return F.linear(input, self.weight, self.bias) 22 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/processors/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from .base_processor import BaseProcessor 9 | 10 | from .blip_processors import ( 11 | BlipImageTrainProcessor, 12 | Blip2ImageTrainProcessor, 13 | BlipImageEvalProcessor, 14 | BlipCaptionProcessor, 15 | ) 16 | from .gpt_processors import ( 17 | GPTVideoFeatureProcessor, 18 | GPTDialogueProcessor, 19 | ) 20 | from.clip_processors import ClipImageTrainProcessor 21 | 22 | from ..common.registry import registry 23 | 24 | __all__ = [ 25 | "BaseProcessor", 26 | # BLIP 27 | "BlipImageTrainProcessor", 28 | "Blip2ImageTrainProcessor", 29 | "BlipImageEvalProcessor", 30 | "BlipCaptionProcessor", 31 | "ClipImageTrainProcessor", 32 | # GPT 33 | "GPTVideoFeatureProcessor", 34 | "GPTDialogueProcessor", 35 | ] 36 | 37 | 38 | def load_processor(name, cfg=None): 39 | """ 40 | Example 41 | 42 | >>> processor = load_processor("alpro_video_train", cfg=None) 43 | """ 44 | processor = registry.get_processor_class(name).from_config(cfg) 45 | 46 | return processor 47 | -------------------------------------------------------------------------------- /src/ChEF/models/instruct_blip/processors/base_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from omegaconf import OmegaConf 9 | 10 | 11 | class BaseProcessor: 12 | def __init__(self): 13 | self.transform = lambda x: x 14 | return 15 | 16 | def __call__(self, item): 17 | return self.transform(item) 18 | 19 | @classmethod 20 | def from_config(cls, cfg=None): 21 | return cls() 22 | 23 | def build(self, **kwargs): 24 | cfg = OmegaConf.create(kwargs) 25 | 26 | return self.from_config(cfg) 27 | -------------------------------------------------------------------------------- /src/ChEF/models/internlm/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration_internlm_xcomposer2 import InternLMXcomposer2Config 2 | from .rewrite_modeling_internlm_xcomposer2 import RewriteInternLMXComposer2ForCausalLM as InternLMXComposer2ForCausalLM 3 | from .tokenization_internlm_xcomposer2 import InternLMXComposer2Tokenizer -------------------------------------------------------------------------------- /src/ChEF/models/kosmos2/data/sentencepiece.bpe.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/ChEF/models/kosmos2/data/sentencepiece.bpe.model -------------------------------------------------------------------------------- /src/ChEF/models/kosmos2/unilm/__init__.py: -------------------------------------------------------------------------------- 1 | from ChEF.models.kosmos2.unilm import models 2 | from ChEF.models.kosmos2.unilm import tasks -------------------------------------------------------------------------------- /src/ChEF/models/kosmos2/unilm/models/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | from fairseq.models import import_models 4 | 5 | models_dir = os.path.dirname(__file__) 6 | import_models(models_dir, "ChEF.models.kosmos2.unilm.models") -------------------------------------------------------------------------------- /src/ChEF/models/kosmos2/unilm/models/vl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/ChEF/models/kosmos2/unilm/models/vl/__init__.py -------------------------------------------------------------------------------- /src/ChEF/models/kosmos2/unilm/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import importlib 3 | import os 4 | from fairseq.tasks import import_tasks 5 | 6 | tasks_dir = os.path.dirname(__file__) 7 | import_tasks(tasks_dir, "ChEF.models.kosmos2.unilm.tasks") 8 | -------------------------------------------------------------------------------- /src/ChEF/models/llama_adapter_v2/__init__.py: -------------------------------------------------------------------------------- 1 | from .llama import ModelArgs, Transformer 2 | from .tokenizer import Tokenizer 3 | from .llama_adapter import * 4 | from .utils import format_prompt, format_prompt_icl -------------------------------------------------------------------------------- /src/ChEF/models/llama_adapter_v2/tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 
2 | # This software may be used and distributed according to the terms of the GNU General Public License version 3. 3 | 4 | from sentencepiece import SentencePieceProcessor 5 | from logging import getLogger 6 | from typing import List 7 | import os 8 | 9 | 10 | logger = getLogger() 11 | 12 | 13 | class Tokenizer: 14 | def __init__(self, model_path: str): 15 | # reload tokenizer 16 | assert os.path.isfile(model_path), model_path 17 | self.sp_model = SentencePieceProcessor(model_file=model_path) 18 | logger.info(f"Reloaded SentencePiece model from {model_path}") 19 | 20 | # BOS / EOS token IDs 21 | self.n_words: int = self.sp_model.vocab_size() 22 | self.bos_id: int = self.sp_model.bos_id() 23 | self.eos_id: int = self.sp_model.eos_id() 24 | self.pad_id: int = self.sp_model.pad_id() 25 | logger.info( 26 | f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}" 27 | ) 28 | assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() 29 | 30 | def encode(self, s: str, bos: bool, eos: bool) -> List[int]: 31 | assert type(s) is str 32 | t = self.sp_model.encode(s) 33 | if bos: 34 | t = [self.bos_id] + t 35 | if eos: 36 | t = t + [self.eos_id] 37 | return t 38 | 39 | def decode(self, t: List[int]) -> str: 40 | return self.sp_model.decode(t) 41 | -------------------------------------------------------------------------------- /src/ChEF/models/minigpt4/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from .common.registry import registry 14 | from .models import * 15 | from .processors import * 16 | 17 | 18 | root_dir = os.path.dirname(os.path.abspath(__file__)) 19 | default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml")) 20 | 21 | registry.register_path("library_root", root_dir) 22 | repo_root = os.path.join(root_dir, "..") 23 | registry.register_path("repo_root", repo_root) 24 | cache_root = os.path.join(repo_root, default_cfg.env.cache_root) 25 | registry.register_path("cache_root", cache_root) 26 | 27 | registry.register("MAX_INT", sys.maxsize) 28 | registry.register("SPLIT_NAMES", ["train", "val", "test"]) 29 | -------------------------------------------------------------------------------- /src/ChEF/models/minigpt4/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/ChEF/models/minigpt4/common/__init__.py -------------------------------------------------------------------------------- /src/ChEF/models/minigpt4/common/gradcam.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | from scipy.ndimage import filters 4 | from skimage import transform as skimage_transform 5 | 6 | 7 | def getAttMap(img, attMap, blur=True, overlap=True): 8 | attMap -= attMap.min() 9 | if attMap.max() > 0: 10 | attMap /= attMap.max() 11 | attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant") 12 | if blur: 13 | attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2])) 14 | attMap -= attMap.min() 15 | attMap /= attMap.max() 16 | 
cmap = plt.get_cmap("jet") 17 | attMapV = cmap(attMap) 18 | attMapV = np.delete(attMapV, 3, 2) 19 | if overlap: 20 | attMap = ( 21 | 1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img 22 | + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV 23 | ) 24 | return attMap 25 | -------------------------------------------------------------------------------- /src/ChEF/models/minigpt4/configs/default.yaml: -------------------------------------------------------------------------------- 1 | env: 2 | # For default users 3 | # cache_root: "cache" 4 | # For internal use with persistent storage 5 | cache_root: "/export/home/.cache/minigpt4" 6 | -------------------------------------------------------------------------------- /src/ChEF/models/minigpt4/configs/models/minigpt4.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: mini_gpt4 3 | 4 | # vit encoder 5 | image_size: 224 6 | drop_path_rate: 0 7 | use_grad_checkpoint: False 8 | vit_precision: "fp32" 9 | freeze_vit: True 10 | freeze_qformer: True 11 | 12 | # Q-Former 13 | num_query_token: 32 14 | 15 | # Vicuna 16 | llama_model: ../model_zoo/Vicuna/7b_v0 17 | 18 | # generation configs 19 | prompt: "" 20 | 21 | preprocess: 22 | vis_processor: 23 | train: 24 | name: "blip2_image_train" 25 | image_size: 224 26 | eval: 27 | name: "blip2_image_eval" 28 | image_size: 224 29 | text_processor: 30 | train: 31 | name: "blip_caption" 32 | eval: 33 | name: "blip_caption" 34 | -------------------------------------------------------------------------------- /src/ChEF/models/minigpt4/conversation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/ChEF/models/minigpt4/conversation/__init__.py -------------------------------------------------------------------------------- /src/ChEF/models/minigpt4/minigpt4_eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: mini_gpt4 3 | model_type: pretrain_vicuna 4 | freeze_vit: True 5 | freeze_qformer: True 6 | max_txt_len: 160 7 | end_sym: "###" 8 | low_resource: False 9 | prompt_path: "alignment.txt" 10 | prompt_template: '###Human: {} ###Assistant: ' 11 | ckpt: 'pretrained_minigpt4_7b.pth' 12 | 13 | 14 | preprocess: 15 | vis_processor: 16 | train: 17 | name: "blip2_image_train" 18 | image_size: 224 19 | eval: 20 | name: "blip2_image_eval" 21 | image_size: 224 22 | text_processor: 23 | train: 24 | name: "blip_caption" 25 | eval: 26 | name: "blip_caption" 27 | 28 | 29 | run: 30 | task: image_text_pretrain 31 | -------------------------------------------------------------------------------- /src/ChEF/models/minigpt4/processors/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from .base_processor import BaseProcessor 9 | from .blip_processors import ( 10 | Blip2ImageTrainProcessor, 11 | Blip2ImageEvalProcessor, 12 | BlipCaptionProcessor, 13 | ) 14 | 15 | from ..common.registry import registry 16 | 17 | __all__ = [ 18 | "BaseProcessor", 19 | "Blip2ImageTrainProcessor", 20 | "Blip2ImageEvalProcessor", 21 | "BlipCaptionProcessor", 22 | ] 23 | 24 | 25 | def load_processor(name, cfg=None): 26 | """ 27 | Example 28 | 29 | >>> processor = load_processor("alpro_video_train", cfg=None) 30 | """ 31 | processor = registry.get_processor_class(name).from_config(cfg) 32 | 33 | return processor 34 | -------------------------------------------------------------------------------- /src/ChEF/models/minigpt4/processors/base_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from omegaconf import OmegaConf 9 | 10 | 11 | class BaseProcessor: 12 | def __init__(self): 13 | self.transform = lambda x: x 14 | return 15 | 16 | def __call__(self, item): 17 | return self.transform(item) 18 | 19 | @classmethod 20 | def from_config(cls, cfg=None): 21 | return cls() 22 | 23 | def build(self, **kwargs): 24 | cfg = OmegaConf.create(kwargs) 25 | 26 | return self.from_config(cfg) 27 | -------------------------------------------------------------------------------- /src/ChEF/models/otter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/ChEF/models/otter/__init__.py -------------------------------------------------------------------------------- /src/ChEF/models/otter/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "otter", 3 | "cross_attn_every_n_layers": 4, 4 | "tie_word_embeddings": false, 5 | "use_media_placement_augmentation": true, 6 | "only_attend_previous": true, 7 | "text_config": { 8 | "_name_or_path": "luodian/llama-7b-hf", 9 | "model_type": "llama" 10 | }, 11 | "vision_config": { 12 | "_name_or_path": "openai/clip-vit-large-patch14", 13 | "model_type": "clip_vision_model", 14 | "hidden_size": 1024, 15 | "intermediate_size": 4096, 16 | "num_attention_heads": 16, 17 | "num_hidden_layers": 24, 18 | "image_size": 224, 19 | "patch_size": 14 20 | } 21 | } -------------------------------------------------------------------------------- /src/ChEF/models/qwen/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration_qwen import QWenConfig 2 | from .modeling_qwen import QWenLMHeadModel 3 | from .tokenization_qwen import QWenTokenizer 4 | from .qwen_generation_utils import make_context, get_stop_words_ids, decode_tokens -------------------------------------------------------------------------------- /src/ChEF/models/shikra/builder/__init__.py: -------------------------------------------------------------------------------- 1 | from .builder import load_pretrained 2 | -------------------------------------------------------------------------------- 
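A minimal usage sketch for the registry-backed load_processor helper defined in the MiniGPT-4 processors package above. The import path, the processor name "blip2_image_eval" (taken from the MiniGPT-4 preprocessing configs in this dump), and the image path are illustrative assumptions; this snippet is not part of the repository.

# Sketch only: resolve a registered processor by name and apply it to an image.
# Assumes src/ is on PYTHONPATH and that "blip2_image_eval" is registered by blip_processors.
from PIL import Image
from ChEF.models.minigpt4.processors import load_processor

vis_processor = load_processor("blip2_image_eval", cfg=None)   # registry lookup + from_config
image = Image.open("example.jpg").convert("RGB")               # placeholder image path
pixel_values = vis_processor(image)                            # tensor ready for the vision encoder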
/src/ChEF/models/shikra/builder/builder.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any, Tuple 2 | 3 | from torch import nn 4 | 5 | from .build_shikra import load_pretrained_shikra 6 | 7 | PREPROCESSOR = Dict[str, Any] 8 | 9 | 10 | # TODO: Registry 11 | def load_pretrained(model_args, training_args) -> Tuple[nn.Module, PREPROCESSOR]: 12 | type_ = model_args.type 13 | if type_ == 'shikra': 14 | return load_pretrained_shikra(model_args, training_args) 15 | else: 16 | assert False 17 | -------------------------------------------------------------------------------- /src/ChEF/models/shikra/conversation/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_conversation import SeparatorStyle, Conversation, register_conv_template, get_conv_template -------------------------------------------------------------------------------- /src/ChEF/models/shikra/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .root import * 2 | from .utils import * 3 | from .process_function import * 4 | from .single_image_convsation import * 5 | 6 | from .builder import prepare_data 7 | -------------------------------------------------------------------------------- /src/ChEF/models/shikra/dataset/process_function/__init__.py: -------------------------------------------------------------------------------- 1 | from .shikra_process_function import ( 2 | ShikraConvProcess, 3 | ShikraImageProcessor, 4 | ShikraTextProcess, 5 | ) 6 | 7 | from .box_process_function import ( 8 | BoxFormatProcess, 9 | BoxFormatter, 10 | PlainBoxFormatter, 11 | TokenFormatter, 12 | prepare_target_processor, 13 | ) 14 | -------------------------------------------------------------------------------- /src/ChEF/models/shikra/dataset/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .io import read_img_general, init_ceph_client_if_needed 2 | from .transform import Expand2square, de_norm_box_xyxy, norm_box_xyxy, expand2square, box_xywh_to_xyxy 3 | from .compute_metrics import BaseComputeMetrics 4 | from .mixin import QuestionTemplateMixin, MInstrDataset 5 | from .concatenate_dataset import ConcatDataset, InterleaveDateset, SubSet, ConcatDatasetWithShuffle 6 | -------------------------------------------------------------------------------- /src/ChEF/models/shikra/dataset/utils/io.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import logging 4 | 5 | import cv2 6 | import numpy as np 7 | from PIL import Image 8 | 9 | logger = logging.getLogger(__name__) 10 | logger.setLevel(logging.INFO) 11 | logging.basicConfig( 12 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 13 | datefmt="%m/%d/%Y %H:%M:%S", 14 | handlers=[logging.StreamHandler(sys.stdout), ], 15 | ) 16 | 17 | 18 | def read_img_general(img_path): 19 | if "s3://" in img_path: 20 | cv_img = read_img_ceph(img_path) 21 | # noinspection PyUnresolvedReferences 22 | return Image.fromarray(cv2.cvtColor(cv_img, cv2.COLOR_BGR2RGB)) 23 | else: 24 | return Image.open(img_path).convert('RGB') 25 | 26 | 27 | client = None 28 | 29 | 30 | def read_img_ceph(img_path): 31 | init_ceph_client_if_needed() 32 | img_bytes = client.get(img_path) 33 | assert img_bytes is not None, f"Please check image at {img_path}" 34 | img_mem_view = memoryview(img_bytes) 35 | img_array = 
np.frombuffer(img_mem_view, np.uint8) 36 | # noinspection PyUnresolvedReferences 37 | img = cv2.imdecode(img_array, cv2.IMREAD_COLOR) 38 | return img 39 | 40 | 41 | def init_ceph_client_if_needed(): 42 | global client 43 | if client is None: 44 | logger.info(f"initializing ceph client ...") 45 | st = time.time() 46 | from petrel_client.client import Client # noqa 47 | client = Client(enable_mc=True) 48 | ed = time.time() 49 | logger.info(f"initialize client cost {ed - st:.2f} s") -------------------------------------------------------------------------------- /src/ChEF/models/shikra/shikra/__init__.py: -------------------------------------------------------------------------------- 1 | from .shikra import ShikraLlamaForCausalLM, ShikraConfig 2 | -------------------------------------------------------------------------------- /src/ChEF/models/shikra/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .common import ( 2 | print_trainable_params, 3 | show, 4 | draw_bounding_boxes, 5 | post_process_generate_ids, 6 | decode_generate_ids, 7 | smart_tokenizer_and_embedding_resize, 8 | ) 9 | -------------------------------------------------------------------------------- /src/ChEF/models/test_lamm15.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .test_lamm import TestLAMM 3 | from model.LAMM import LAMMSFTModel 4 | 5 | class TestLAMM15(TestLAMM): 6 | def __init__(self, model_path, device=None, task_type='normal', **kwargs): 7 | self.conv_mode = 'simple' 8 | self.model = LAMMSFTModel(**kwargs) 9 | ckpt = torch.load(model_path, map_location=torch.device('cpu')) 10 | self.model.load_state_dict(ckpt, strict=False) # TODO: load delta_ckpt from model_path in lamm_3d.yaml 11 | self.model = self.model.eval().half() 12 | self.task_type = task_type 13 | self.move_to_device(device) 14 | self.model.device = device -------------------------------------------------------------------------------- /src/ChEF/resources/ChEF-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/ChEF/resources/ChEF-logo.png -------------------------------------------------------------------------------- /src/ChEF/scenario/Ch3Ef_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from torch.utils.data import Dataset 4 | 5 | class Ch3EfDataset(Dataset): 6 | task_name = 'Ch3Ef' 7 | dataset_name = 'Ch3Ef' 8 | def __init__(self, base_data_path, dimension, ppl=False, **kwargs): 9 | self.base_data_path = base_data_path 10 | super().__init__() 11 | meta_base_dir = os.path.join(self.base_data_path, 'meta_file') 12 | self.data = json.load(open(os.path.join(meta_base_dir, f'{dimension}.json'))) 13 | self.data = self.data['items'] 14 | self.ppl = ppl 15 | 16 | 17 | def __len__(self): 18 | return len(self.data) 19 | 20 | def __getitem__(self, index): 21 | item = self.data[index] 22 | id = str(item['id']) if 'id' in item else str(index) 23 | 24 | res_dict = { 25 | 'id': id, 26 | 'image_path': [os.path.join(self.base_data_path,img_path) for img_path in self.data[index]['image']], 27 | 'question': self.data[index]['query'], 28 | 'source': self.data[index]['source'] 29 | } 30 | if self.ppl: 31 | res_dict['gt_answers'] = self.data[index]['options'][0] 32 | res_dict['options'] = self.data[index]['options'] 33 | else: 34 | 
res_dict['gt_answers'] = self.data[index]['options'][0] 35 | return res_dict -------------------------------------------------------------------------------- /src/ChEF/scenario/utils.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer 2 | from sentence_transformers import util 3 | from torch.utils.data import Dataset 4 | 5 | class Bert_Similarity: 6 | def __init__(self, model_path = 'sentence-transformers/all-MiniLM-L6-v2') -> None: 7 | self.model = SentenceTransformer(model_path).cuda() 8 | self.cos_func = util.pytorch_cos_sim 9 | 10 | def similarity_score(self, str1, str2): 11 | embedding_1 = self.model.encode(str1, convert_to_tensor=True) 12 | embedding_2 = self.model.encode(str2, convert_to_tensor=True) 13 | score = self.cos_func(embedding_1, embedding_2).item() 14 | return score 15 | 16 | def bert_embedding(self, str): 17 | return self.model.encode(str, convert_to_tensor=True) 18 | 19 | def embedding_similarity_score(self, emb1, emb2): 20 | score_metric = self.cos_func(emb1, emb2) 21 | return score_metric 22 | 23 | 24 | mmbench_rand_acc = {'circular': 2.55, 25 | 'vanilla': 27.57} 26 | 27 | sqa_rand_acc = {'circular': 35.8, 28 | 'vanilla': 35.8} 29 | 30 | rand_acc = {'MMBench': mmbench_rand_acc, 31 | 'ScienceQA': sqa_rand_acc} 32 | -------------------------------------------------------------------------------- /src/ChEF/test/test_model.sh: -------------------------------------------------------------------------------- 1 | model_cfg=$1 2 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 3 | python test/test_model.py ${model_cfg} --debug -------------------------------------------------------------------------------- /src/ChEF/test/test_recipes.py: -------------------------------------------------------------------------------- 1 | scenario_recipes = [ 2 | 'configs/scenario_recipes/CIFAR10/default.yaml', 3 | # 'configs/scenario_recipes/CIFAR10/direct.yaml', 4 | 'configs/scenario_recipes/Flickr30k/default.yaml', 5 | # 'configs/scenario_recipes/Flickr30k/direct.yaml', 6 | 'configs/scenario_recipes/FSC147/default.yaml', 7 | # 'configs/scenario_recipes/FSC147/direct.yaml', 8 | 'configs/scenario_recipes/MMBench/default.yaml', 9 | # 'configs/scenario_recipes/MMBench/direct.yaml', 10 | 'configs/scenario_recipes/MME/default.yaml', 11 | # 'configs/scenario_recipes/MME/direct.yaml', 12 | 'configs/scenario_recipes/Omnibenchmark/default.yaml', 13 | # 'configs/scenario_recipes/Omnibenchmark/direct.yaml', 14 | # 'configs/scenario_recipes/Omnibenchmark/single_ppl.yaml', 15 | 'configs/scenario_recipes/ScienceQA/default.yaml', 16 | # 'configs/scenario_recipes/ScienceQA/direct_CoT.yaml', 17 | # 'configs/scenario_recipes/ScienceQA/direct.yaml', 18 | 'configs/scenario_recipes/SEEDBench/default.yaml', 19 | 'configs/scenario_recipes/VOC2012/default.yaml', 20 | # 'configs/scenario_recipes/VOC2012/direct.yaml', 21 | ] 22 | 23 | desiderata_recipes = [ 24 | 25 | ] -------------------------------------------------------------------------------- /src/config/ChEF/desiderata_recipes/Calibration/MMBench.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: MMBench 3 | base_data_path: ../data/MMBench 4 | ppl_cfg: 5 | content_only: False 6 | split: dev 7 | hint: True 8 | 9 | eval_cfg: 10 | instruction_cfg: 11 | query_type: query_pool 12 | prompt_assigned_ids: 1 # (kosmos: 5) (default: 1) 13 | template_assigned_ids: 1 # (kosmos: 0) (default: 1) 14 | inferencer_cfg: 
15 | inferencer_type: Calibration 16 | batch_size: 6 17 | CoT: True 18 | max_new_tokens: 256 19 | metric_cfg: 20 | metric_type: Calibration -------------------------------------------------------------------------------- /src/config/ChEF/desiderata_recipes/Calibration/ScienceQA.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: ScienceQA 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | ppl: True 5 | option_content: False 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | query_type: query_pool 10 | query_assigned_ids: 1 # otter 1 11 | template_assigned_ids: 1 # otter 1 12 | inferencer_cfg: 13 | inferencer_type: Calibration 14 | batch_size: 8 15 | CoT: True 16 | max_new_tokens: 256 17 | metric_cfg: 18 | metric_type: Calibration -------------------------------------------------------------------------------- /src/config/ChEF/desiderata_recipes/Hallucination/POPE_adversarial.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: POPE_COCO_adversarial 3 | base_data_path: ../data/coco_pope 4 | ppl: True 5 | option_content: False 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | query_type: standard_query 10 | inferencer_cfg: 11 | inferencer_type: PPL 12 | batch_size: 8 13 | CoT: True 14 | max_new_tokens: 256 15 | metric_cfg: 16 | metric_type: Hallucination 17 | -------------------------------------------------------------------------------- /src/config/ChEF/desiderata_recipes/Hallucination/POPE_popular.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: POPE_COCO_popular 3 | base_data_path: ../data/coco_pope 4 | ppl: True 5 | option_content: False 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | query_type: standard_query 10 | inferencer_cfg: 11 | inferencer_type: PPL 12 | batch_size: 8 13 | CoT: True 14 | max_new_tokens: 256 15 | metric_cfg: 16 | metric_type: Hallucination 17 | -------------------------------------------------------------------------------- /src/config/ChEF/desiderata_recipes/Hallucination/POPE_random.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: POPE_COCO_random 3 | base_data_path: ../data/coco_pope 4 | ppl: True 5 | option_content: False 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | query_type: standard_query 10 | inferencer_cfg: 11 | inferencer_type: PPL 12 | batch_size: 16 13 | max_new_tokens: 256 14 | metric_cfg: 15 | metric_type: Hallucination 16 | -------------------------------------------------------------------------------- /src/config/ChEF/desiderata_recipes/ICL/MMBench.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: MMBench 3 | base_data_path: ../data/MMBench 4 | ppl_cfg: 5 | content_only: False 6 | split: dev 7 | hint: True 8 | 9 | eval_cfg: 10 | instruction_cfg: 11 | prompt_type: singleturn 12 | prompt_assigned_ids: 1 # (kosmos: 5) (default: 1) 13 | template_assigned_ids: 1 # (kosmos: 0) (default: 1) 14 | incontext_cfg: 15 | retriever_type: random 16 | ice_num: 1 17 | random_seed: 1 18 | inferencer_cfg: 19 | inferencer_type: PPL 20 | batch_size: 1 21 | CoT: False 22 | max_new_tokens: 256 23 | metric_cfg: 24 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/desiderata_recipes/ICL/ScienceQA.yaml: 
-------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: ScienceQA 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | ppl: True 5 | option_content: False 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | query_type: standard_query 10 | incontext_cfg: 11 | ice_with_image: False 12 | retriever_type: random 13 | ice_num: 3 14 | random_seed: 1 15 | inferencer_cfg: 16 | inferencer_type: ICL_PPL 17 | batch_size: 4 18 | CoT: True 19 | max_new_tokens: 256 20 | metric_cfg: 21 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/desiderata_recipes/Insfollow/MMBench.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: MMBench 3 | base_data_path: ../data/MMBench 4 | ppl_cfg: 5 | content_only: False 6 | split: dev 7 | hint: True 8 | option_map: '' 9 | 10 | eval_cfg: 11 | instruction_cfg: 12 | query_type: query_pool 13 | prompt_assigned_ids: 1 # (kosmos: 5) (default: 1) 14 | template_assigned_ids: 1 # (kosmos: 0) (default: 1) 15 | inferencer_cfg: 16 | inferencer_type: PPL 17 | batch_size: 6 18 | CoT: True 19 | max_new_tokens: 256 20 | metric_cfg: 21 | metric_type: Instruct_Follow -------------------------------------------------------------------------------- /src/config/ChEF/desiderata_recipes/Insfollow/ScienceQA.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: ScienceQA 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | ppl: True 5 | option_content: False 6 | option_map: '' 7 | 8 | 9 | eval_cfg: 10 | instruction_cfg: 11 | query_type: query_pool 12 | query_assigned_ids: 0 # otter 1 13 | template_assigned_ids: 0 # otter 1 14 | inferencer_cfg: 15 | inferencer_type: PPL 16 | batch_size: 8 17 | CoT: True 18 | max_new_tokens: 256 19 | metric_cfg: 20 | metric_type: Instruct_Follow 21 | 22 | 23 | -------------------------------------------------------------------------------- /src/config/ChEF/desiderata_recipes/Robust/MMBench.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: MMBench 3 | base_data_path: ../data/MMBench 4 | ppl_cfg: 5 | content_only: False 6 | split: dev 7 | hint: True 8 | img_crp: True 9 | text_crp: True 10 | data_c_path: ../data/ChEF/MMBench_C 11 | 12 | eval_cfg: 13 | instruction_cfg: 14 | query_type: query_pool 15 | query_assigned_ids: 1 # (kosmos: 5) (default: 1) 16 | template_assigned_ids: 1 # (kosmos: 0) (default: 1) 17 | inferencer_cfg: 18 | inferencer_type: PPL 19 | batch_size: 4 20 | CoT: True 21 | max_new_tokens: 256 22 | metric_cfg: 23 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/desiderata_recipes/Robust/ScienceQA.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: ScienceQA 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | ppl: True 5 | option_content: False 6 | img_crp: True 7 | text_crp: True 8 | data_c_path: ../data/ChEF/ScienceQA_C 9 | 10 | eval_cfg: 11 | instruction_cfg: 12 | query_type: query_pool 13 | query_assigned_ids: 0 # otter 1 14 | template_assigned_ids: 0 # otter 1 15 | inferencer_cfg: 16 | inferencer_type: PPL 17 | batch_size: 8 18 | CoT: True 19 | max_new_tokens: 256 20 | metric_cfg: 21 | metric_type: basic -------------------------------------------------------------------------------- 
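For orientation between the recipe files above and the dataset code earlier in this dump, here is a hedged sketch of how a recipe YAML decomposes. The file path, data paths, and the choice of Ch3EfDataset as the example target are assumptions for illustration; the actual wiring is done by the ChEF evaluator, not by this snippet.

# Sketch only: a ChEF recipe splits into scenario_cfg (dataset arguments) and eval_cfg
# (instruction_cfg / inferencer_cfg / metric_cfg). Paths assume running from src/.
import yaml
from ChEF.scenario.Ch3Ef_dataset import Ch3EfDataset   # defined earlier in this dump

with open('config/ChEF/desiderata_recipes/Robust/ScienceQA.yaml') as f:
    recipe = yaml.safe_load(f)
scenario_cfg = recipe['scenario_cfg']   # dataset_name, base_data_path, ppl, corruption paths, ...
eval_cfg = recipe['eval_cfg']           # instruction_cfg / inferencer_cfg / metric_cfg

# A dataset class consumes scenario_cfg-style keyword arguments, e.g.:
dataset = Ch3EfDataset(base_data_path='../data/Ch3Ef', dimension='Harmless', ppl=True)
sample = dataset[0]   # {'id', 'image_path', 'question', 'options', 'gt_answers', 'source'}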
/src/config/ChEF/models/gemini.yaml: -------------------------------------------------------------------------------- 1 | model_name: Gemini 2 | api_key: 3 | gpt_name: gemini-pro-vision 4 | safety_block_none: True -------------------------------------------------------------------------------- /src/config/ChEF/models/gpt.yaml: -------------------------------------------------------------------------------- 1 | model_name: GPT 2 | api_key: 3 | gpt_name: gpt-4-vision-preview -------------------------------------------------------------------------------- /src/config/ChEF/models/instructblip_vicuna.yaml: -------------------------------------------------------------------------------- 1 | model_name: InstructBLIP -------------------------------------------------------------------------------- /src/config/ChEF/models/internlm_xcomposer.yaml: -------------------------------------------------------------------------------- 1 | model_name: InternLMXComposer 2 | model_path: ../model_zoo/InternLM/internlm-xcomposer2-vl-7b -------------------------------------------------------------------------------- /src/config/ChEF/models/kosmos2.yaml: -------------------------------------------------------------------------------- 1 | model_name: Kosmos2 2 | model_path: ../model_zoo/Kosmos/kosmos-2.pt 3 | if_grounding: False # set True for detection and grounding evaluation -------------------------------------------------------------------------------- /src/config/ChEF/models/lamm.yaml: -------------------------------------------------------------------------------- 1 | model_name: LAMM 2 | model_path: ../model_zoo/LAMM/LAMM_v1.0/vicuna13b_v0_lamm186k_ep2_clip_system/pytorch_model.pt 3 | llm_ckpt_path: ../model_zoo/Vicuna/13b_v0 4 | encoder_ckpt_path: ../model_zoo/clip-vit-large-patch14 5 | task_type: normal 6 | encoder_pretrain: clip 7 | vision_type: image 8 | vision_feature_type: local 9 | vision_output_layer: -2 10 | num_vision_token: 256 11 | lora_r: 32 12 | lora_alpha: 32 13 | lora_dropout: 0.1 14 | lora_target_modules: ['q_proj', 'k_proj', 'v_proj', 'o_proj'] 15 | max_tgt_len: 1024 16 | stage: 2 -------------------------------------------------------------------------------- /src/config/ChEF/models/lamm15.yaml: -------------------------------------------------------------------------------- 1 | model_name: LAMM_SFT 2 | model_path: ../model_zoo/LAMM/LAMM_v1.5/lamm_sft_finetune/pytorch_model.pt 3 | llm_ckpt_path: ../model_zoo/Vicuna/13b_v0 4 | encoder_ckpt_path: ../model_zoo/clip-vit-large-patch14 5 | task_type: normal 6 | encoder_pretrain: clip 7 | vision_type: image 8 | vision_feature_type: local 9 | vision_output_layer: -2 10 | num_vision_token: 256 11 | max_tgt_len: 1024 12 | stage: 3 -------------------------------------------------------------------------------- /src/config/ChEF/models/lamm_3d.yaml: -------------------------------------------------------------------------------- 1 | model_name: LAMM 2 | model_path: ../model_zoo/LAMM/vicuna13b_v0_lamm10k_ep2_epcl_system/pytorch_model.pt 3 | llm_ckpt_path: ../model_zoo/vicuna/13b_v0 4 | encoder_ckpt_path: ../model_zoo/clip_vit-L-14_scannet_ddp_ep1080_vit256token/checkpoint_best.pth 5 | task_type: normal 6 | encoder_pretrain: epcl 7 | vision_type: pcl 8 | vision_feature_type: local 9 | vision_output_layer: -2 10 | num_vision_token: 256 11 | lora_r: 32 12 | lora_alpha: 32 13 | lora_dropout: 0.1 14 | lora_target_modules: ['q_proj', 'k_proj', 'v_proj', 'o_proj'] 15 | max_tgt_len: 1024 16 | stage: 2
-------------------------------------------------------------------------------- /src/config/ChEF/models/llamaadapterv2.yaml: -------------------------------------------------------------------------------- 1 | model_name: LLaMA-Adapter-v2 2 | model_path: ../model_zoo/LLaMAAdapter/LLaMA_model_weights 3 | max_seq_len: 1024 4 | max_batch_size: 20 -------------------------------------------------------------------------------- /src/config/ChEF/models/llava15.yaml: -------------------------------------------------------------------------------- 1 | model_name: LLaVA1.5 2 | model_path: ../model_zoo/LLaVA/LLaVA_v1.5/llava-v1.5-13b 3 | vis_processor_path: ../model_zoo/clip-vit-large-patch14 -------------------------------------------------------------------------------- /src/config/ChEF/models/llavarlhf.yaml: -------------------------------------------------------------------------------- 1 | model_name: LLaVARLHF 2 | model_path: ../model_zoo/LLaVA-RLHF-13b-v1.5-336 3 | vis_processor_path: ../model_zoo/clip-vit-large-patch14 -------------------------------------------------------------------------------- /src/config/ChEF/models/minigpt4.yaml: -------------------------------------------------------------------------------- 1 | model_name: MiniGPT-4 2 | model_path: ../model_zoo/MiniGPT4 3 | cfg_path: ChEF/models/minigpt4/minigpt4_eval.yaml -------------------------------------------------------------------------------- /src/config/ChEF/models/mplug.yaml: -------------------------------------------------------------------------------- 1 | model_name: mPLUG-Owl 2 | model_path: ../model_zoo/mPLUG_Owl/mplug-owl-llama-7b -------------------------------------------------------------------------------- /src/config/ChEF/models/octavius_2d+3d.yaml: -------------------------------------------------------------------------------- 1 | model_name: Octavius 2 | stage: 2 3 | octavius_modality: ['image', 'pcl'] 4 | 5 | llm_ckpt_path: ../model_zoo/vicuna_ckpt/13b_v0 6 | delta_ckpt_path: ../ckpt/octavius_2d+3d_e6_bs64_raw/pytorch_model.pt 7 | 8 | encoder_pretrain: clip 9 | vision_feature_type: local 10 | vision_output_layer: -2 11 | num_vision_token: 256 12 | 13 | # peft and lora 14 | peft_type: moe_lora 15 | moe_lora_num_experts: 6 16 | moe_gate_mode: top2_gate 17 | lora_r: 32 18 | lora_alpha: 32 19 | lora_dropout: 0.1 20 | lora_target_modules: ['q_proj', 'k_proj', 'v_proj', 'o_proj'] 21 | 22 | # pcl modality 23 | num_query_rsp_3d: 16 24 | hidden_size_rsp_3d: 768 25 | num_layers_rsp_3d: 1 26 | num_heads_rsp_3d: 8 27 | 28 | max_tgt_len: 400 29 | conv_mode: simple 30 | -------------------------------------------------------------------------------- /src/config/ChEF/models/octavius_2d.yaml: -------------------------------------------------------------------------------- 1 | model_name: Octavius_2d 2 | stage: 2 3 | octavius_modality: ['image'] 4 | 5 | llm_ckpt_path: ../model_zoo/vicuna_ckpt/13b_v0 6 | delta_ckpt_path: ../ckpt/octavius_2d_e4_bs64_raw/pytorch_model.pt 7 | 8 | encoder_pretrain: clip 9 | vision_feature_type: local 10 | vision_output_layer: -2 11 | num_vision_token: 256 12 | 13 | # peft and lora 14 | peft_type: moe_lora 15 | moe_lora_num_experts: 4 16 | moe_gate_mode: top2_gate 17 | lora_r: 32 18 | lora_alpha: 32 19 | lora_dropout: 0.1 20 | lora_target_modules: ['q_proj', 'k_proj', 'v_proj', 'o_proj'] 21 | 22 | # pcl modality 23 | num_query_rsp_3d: 16 24 | hidden_size_rsp_3d: 768 25 | num_layers_rsp_3d: 1 26 | num_heads_rsp_3d: 8 27 | 28 | max_tgt_len: 400 29 | conv_mode: simple 30 | 
-------------------------------------------------------------------------------- /src/config/ChEF/models/octavius_3d.yaml: -------------------------------------------------------------------------------- 1 | model_name: Octavius_3d 2 | stage: 2 3 | octavius_modality: ['pcl'] 4 | 5 | llm_ckpt_path: ../model_zoo/vicuna_ckpt/13b_v0 6 | delta_ckpt_path: ../ckpt/octavius_3d_e3_bs64/pytorch_model.pt 7 | 8 | encoder_pretrain: clip 9 | vision_feature_type: local 10 | vision_output_layer: -2 11 | num_vision_token: 256 12 | 13 | # peft and lora 14 | peft_type: moe_lora 15 | moe_lora_num_experts: 3 16 | moe_gate_mode: top2_gate 17 | lora_r: 32 18 | lora_alpha: 32 19 | lora_dropout: 0.1 20 | lora_target_modules: ['q_proj', 'k_proj', 'v_proj', 'o_proj'] 21 | 22 | # pcl modality 23 | num_query_rsp_3d: 16 24 | hidden_size_rsp_3d: 768 25 | num_layers_rsp_3d: 1 26 | num_heads_rsp_3d: 8 27 | 28 | max_tgt_len: 400 29 | conv_mode: simple 30 | -------------------------------------------------------------------------------- /src/config/ChEF/models/otter.yaml: -------------------------------------------------------------------------------- 1 | model_name: Otter 2 | model_path: ../model_zoo/Otter/otter-9b-hf -------------------------------------------------------------------------------- /src/config/ChEF/models/qwen_vl.yaml: -------------------------------------------------------------------------------- 1 | model_name: QwenVL 2 | model_path: ../model_zoo/Qwen/Qwen-VL-Chat -------------------------------------------------------------------------------- /src/config/ChEF/models/rlhfv.yaml: -------------------------------------------------------------------------------- 1 | model_name: RLHFV 2 | model_path: ../model_zoo/RLHF-V -------------------------------------------------------------------------------- /src/config/ChEF/models/shikra.yaml: -------------------------------------------------------------------------------- 1 | model_name: Shikra 2 | model_path: ../model_zoo/Shikra/shikra-7b 3 | encoder_ckpt_path: ../model_zoo/clip-vit-large-patch14 4 | -------------------------------------------------------------------------------- /src/config/ChEF/models/test.yaml: -------------------------------------------------------------------------------- 1 | model_name: Test -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/CIFAR10/direct.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: CIFAR10 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | eval_cfg: 5 | instruction_cfg: 6 | prompt_type: singleturn 7 | prompt_assigned_ids: 0 # (mplug: 6, llava15, lamm15: 1) (default:0) 8 | template_assigned_ids: 0 # (otter, lamm, minigpt4 :1, llava15, lamm15: 3) (default:0) 9 | inferencer_cfg: 10 | inferencer_type: Direct 11 | max_new_tokens: 16 12 | batch_size: 32 13 | metric_cfg: 14 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/CIFAR10/ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: CIFAR10 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | ppl: True 5 | eval_cfg: 6 | instruction_cfg: 7 | prompt_type: singleturn 8 | prompt_assigned_ids: 0 # (mplug: 6, llava15, lamm15: 1) (default:0) 9 | template_assigned_ids: 0 # (lamm, minigpt4 :1, otter: 2, llava15, lamm15: 3) (default:0) 10 | inferencer_cfg: 11 | inferencer_type: PPL 12 | batch_size: 
4 13 | metric_cfg: 14 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Ch3Ef/Harmless.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Ch3Ef 3 | base_data_path: ../data/Ch3Ef 4 | dimension: Harmless 5 | 6 | eval_cfg: 7 | instruction_cfg: 8 | prompt_type: singleturn 9 | inferencer_cfg: 10 | inferencer_type: Direct 11 | batch_size: 8 12 | max_new_tokens: 256 13 | metric_cfg: 14 | metric_type: Ch3Ef -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Ch3Ef/Harmless_ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Ch3Ef 3 | base_data_path: ../data/Ch3Ef 4 | dimension: Harmless 5 | ppl: True 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | prompt_type: singleturn 10 | inferencer_cfg: 11 | inferencer_type: PPL 12 | batch_size: 1 13 | max_new_tokens: 256 14 | metric_cfg: 15 | metric_type: Ch3Ef 16 | ppl: True -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Ch3Ef/Helpful.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Ch3Ef 3 | base_data_path: ../data/Ch3Ef 4 | dimension: Helpful 5 | 6 | eval_cfg: 7 | instruction_cfg: 8 | prompt_type: singleturn 9 | inferencer_cfg: 10 | inferencer_type: Direct 11 | batch_size: 8 12 | max_new_tokens: 256 13 | metric_cfg: 14 | metric_type: Ch3Ef -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Ch3Ef/Helpful_ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Ch3Ef 3 | base_data_path: ../data/Ch3Ef 4 | dimension: Helpful 5 | ppl: True 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | prompt_type: singleturn 10 | inferencer_cfg: 11 | inferencer_type: PPL 12 | batch_size: 1 13 | max_new_tokens: 256 14 | metric_cfg: 15 | metric_type: Ch3Ef 16 | ppl: True -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Ch3Ef/Honest.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Ch3Ef 3 | base_data_path: ../data/Ch3Ef 4 | dimension: Honest 5 | 6 | eval_cfg: 7 | instruction_cfg: 8 | prompt_type: singleturn 9 | inferencer_cfg: 10 | inferencer_type: Direct 11 | batch_size: 8 12 | max_new_tokens: 256 13 | metric_cfg: 14 | metric_type: Ch3Ef -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Ch3Ef/Honest_ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Ch3Ef 3 | base_data_path: ../data/Ch3Ef 4 | dimension: Honest 5 | ppl: True 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | prompt_type: singleturn 10 | inferencer_cfg: 11 | inferencer_type: PPL 12 | batch_size: 2 13 | max_new_tokens: 256 14 | metric_cfg: 15 | metric_type: Ch3Ef 16 | ppl: True -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/FSC147/direct.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: FSC147 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 
6 | instruction_cfg: 7 | prompt_type: singleturn 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | max_new_tokens: 256 11 | batch_size: 32 12 | metric_cfg: 13 | metric_type: basic 14 | inference_type: direct -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/FSC147/ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: FSC147 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | ppl_cfg: 5 | heatmap_width: 2 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | prompt_type: singleturn 10 | inferencer_cfg: 11 | inferencer_type: PPL 12 | batch_size: 8 13 | metric_cfg: 14 | metric_type: basic 15 | inference_type: ppl -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Flickr30k/direct.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Flickr30k 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | prompt_type: singleturn 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | max_new_tokens: 64 11 | batch_size: 16 12 | metric_cfg: 13 | metric_type: basic 14 | strategy: direct -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Flickr30k/random_ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Flickr30k 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | ppl_cfg: 5 | negative_opt_num: 3 6 | random_seed: 0 7 | strategy: random 8 | 9 | eval_cfg: 10 | instruction_cfg: 11 | prompt_type: singleturn 12 | inferencer_cfg: 13 | inferencer_type: PPL 14 | batch_size: 4 15 | metric_cfg: 16 | metric_type: basic 17 | strategy: acc -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Flickr30k/topp_ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Flickr30k 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | ppl_cfg: 5 | negative_opt_num: 3 6 | random_seed: 0 7 | strategy: top_similarity 8 | model_path: ../model_zoo/all-MiniLM-L6-v2 9 | 10 | eval_cfg: 11 | instruction_cfg: 12 | prompt_type: singleturn 13 | inferencer_cfg: 14 | inferencer_type: PPL 15 | batch_size: 4 16 | metric_cfg: 17 | metric_type: basic 18 | strategy: acc -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/AI2D.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: AI2D 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | batch_size: 8 11 | CoT: True 12 | max_new_tokens: 256 13 | metric_cfg: 14 | metric_type: LAMM -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/CIFAR10.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: CIFAR10_LAMM 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | batch_size: 32 11 | metric_cfg: 12 | metric_type: LAMM 
-------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/CelebA_hair.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: CelebA(Hair) 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | batch_size: 32 11 | metric_cfg: 12 | metric_type: LAMM -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/CelebA_smile.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: CelebA(Smile) 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | batch_size: 32 11 | metric_cfg: 12 | metric_type: LAMM -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/FSC147.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: FSC147_LAMM 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | max_new_tokens: 256 11 | batch_size: 32 12 | metric_cfg: 13 | metric_type: basic 14 | inference_type: direct -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/Flickr30k.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Flickr30k_LAMM 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | batch_size: 32 11 | metric_cfg: 12 | metric_type: LAMM -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/SVT.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: SVT 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | batch_size: 16 11 | metric_cfg: 12 | metric_type: LAMM -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/ScanNet.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: ScanNet_LAMM 3 | base_data_path: ../data/LAMM/3D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct3D 10 | batch_size: 1 11 | metric_cfg: 12 | metric_type: LAMM 13 | -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/ScanQA.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: ScanQA_LAMM 3 | base_data_path: ../data/LAMM/3D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct3D 10 | batch_size: 1 11 | metric_cfg: 12 | metric_type: LAMM -------------------------------------------------------------------------------- 
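The strategy: top_similarity setting in the Flickr30k topp_ppl recipe above pairs naturally with the Bert_Similarity helper from ChEF/scenario/utils.py. The sketch below only illustrates that idea: the captions are made up, the model path mirrors the recipe, and the helper moves the encoder to CUDA, so a GPU is assumed; the real option construction lives in the Flickr30k dataset code, not here.

# Sketch only: rank candidate negative captions by embedding similarity to the
# ground-truth caption, so the most similar ones become hard PPL distractors.
from ChEF.scenario.utils import Bert_Similarity

sim = Bert_Similarity(model_path='../model_zoo/all-MiniLM-L6-v2')
gt_caption = "A man rides a horse along the beach."
candidates = [
    "A person rides an animal near the sea.",
    "A dog runs across a grassy field.",
    "Two children play chess indoors.",
]
ranked = sorted(candidates, key=lambda c: sim.similarity_score(gt_caption, c), reverse=True)
negative_options = ranked[:3]   # negative_opt_num: 3 in the recipe above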
/src/config/ChEF/scenario_recipes/LAMM/ScanRefer.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: ScanRefer_LAMM 3 | base_data_path: ../data/LAMM/3D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct3D 10 | batch_size: 1 11 | metric_cfg: 12 | metric_type: LAMM 13 | -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/ScienceQA.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: ScienceQA_LAMM 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | batch_size: 1 11 | CoT: True 12 | max_new_tokens: 256 13 | metric_cfg: 14 | metric_type: LAMM -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/UCMerced.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: UCMerced 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | batch_size: 16 11 | metric_cfg: 12 | metric_type: LAMM -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/VOC2012.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: VOC2012_LAMM 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | batch_size: 32 11 | metric_cfg: 12 | metric_type: LAMM -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/locating_LSP.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Locating_LSP 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | max_new_tokens: 256 11 | batch_size: 32 12 | metric_cfg: 13 | metric_type: LAMM -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/LAMM/locating_VOC2012.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Locating_VOC2012 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | query_type: standard_query 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | max_new_tokens: 256 11 | batch_size: 32 12 | metric_cfg: 13 | metric_type: LAMM -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/MMBench/direct.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: MMBench 3 | base_data_path: ../data/MMBench 4 | split: dev 5 | hint: True 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | prompt_type: singleturn 10 | prompt_assigned_ids: 1 11 | template_assigned_ids: 1 12 | inferencer_cfg: 13 | inferencer_type: Direct 14 | batch_size: 2 15 | CoT: False 16 | max_new_tokens: 256 17 | metric_cfg: 18 | 
metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/MMBench/ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: MMBench 3 | base_data_path: ../data/MMBench 4 | ppl_cfg: 5 | content_only: False 6 | split: dev 7 | hint: True 8 | 9 | eval_cfg: 10 | instruction_cfg: 11 | prompt_type: singleturn 12 | prompt_assigned_ids: 1 13 | template_assigned_ids: 1 14 | inferencer_cfg: 15 | inferencer_type: PPL 16 | batch_size: 1 17 | CoT: False 18 | max_new_tokens: 256 19 | metric_cfg: 20 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/MME/direct.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: MME 3 | base_data_path: ../data/MME 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | prompt_type: singleturn 8 | prompt_assigned_ids: 2 9 | template_assigned_ids: 0 10 | inferencer_cfg: 11 | inferencer_type: Direct 12 | batch_size: 8 13 | metric_cfg: 14 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/MME/ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: MME 3 | base_data_path: ../data/MME 4 | ppl_cfg: True 5 | option_lower: True 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | prompt_type: singleturn 10 | prompt_assigned_ids: 2 11 | template_assigned_ids: 0 12 | inferencer_cfg: 13 | inferencer_type: PPL 14 | batch_size: 8 15 | metric_cfg: 16 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/MMMU/default.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: MMMU 3 | base_data_path: ../data/MMMU 4 | img_folder: ../data/MMMU_images 5 | 6 | eval_cfg: 7 | instruction_cfg: 8 | prompt_type: singleturn 9 | inferencer_cfg: 10 | inferencer_type: Direct 11 | batch_size: 1 12 | max_new_tokens: 256 13 | metric_cfg: 14 | metric_type: MMMU -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Octavius3D/nr3d_caption_direct3d.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: OctaviusPCLDataset 3 | base_data_path: ../data/Octavius/3D_Benchmark/meta_file 4 | task_name: Caption 5 | inference_dataset_name: nr3d 6 | vision_root_path: ../data/Octavius/3D_Instruct 7 | 8 | eval_cfg: 9 | instruction_cfg: 10 | query_type: standard_query 11 | inferencer_cfg: 12 | inferencer_type: Direct3D 13 | batch_size: 1 14 | metric_cfg: 15 | metric_type: Octavius3D -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Octavius3D/scan_caption_direct3d.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: OctaviusPCLDataset 3 | base_data_path: ../data/Octavius/3D_Benchmark/meta_file 4 | task_name: Caption 5 | inference_dataset_name: scannet 6 | vision_root_path: ../data/Octavius/3D_Instruct 7 | 8 | eval_cfg: 9 | instruction_cfg: 10 | query_type: standard_query 11 | inferencer_cfg: 12 | inferencer_type: Direct3D 13 | batch_size: 1 14 | metric_cfg: 15 | metric_type: Octavius3D 
-------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Octavius3D/scan_cls_direct3d.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: OctaviusPCLDataset 3 | base_data_path: ../data/Octavius/3D_Benchmark/meta_file 4 | task_name: Classification 5 | inference_dataset_name: scannet 6 | vision_root_path: ../data/Octavius/3D_Instruct 7 | 8 | eval_cfg: 9 | instruction_cfg: 10 | query_type: standard_query 11 | inferencer_cfg: 12 | inferencer_type: Direct3D 13 | batch_size: 1 14 | metric_cfg: 15 | metric_type: Octavius3D 16 | -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Octavius3D/scan_vqa_direct3d.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: OctaviusPCLDataset 3 | base_data_path: ../data/Octavius/3D_Benchmark/meta_file 4 | task_name: VQA 5 | inference_dataset_name: scannet 6 | vision_root_path: ../data/Octavius/3D_Instruct 7 | 8 | eval_cfg: 9 | instruction_cfg: 10 | query_type: standard_query 11 | inferencer_cfg: 12 | inferencer_type: Direct3D 13 | batch_size: 1 14 | metric_cfg: 15 | metric_type: Octavius3D -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Octavius3D/shapenet_cls_direct3d.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: OctaviusPCLDataset 3 | base_data_path: ../data/Octavius/3D_Benchmark/meta_file 4 | task_name: Classification 5 | inference_dataset_name: shapenet 6 | vision_root_path: ../data/Octavius/3D_Instruct 7 | 8 | eval_cfg: 9 | instruction_cfg: 10 | query_type: standard_query 11 | inferencer_cfg: 12 | inferencer_type: Direct3D 13 | batch_size: 1 14 | metric_cfg: 15 | metric_type: Octavius3D 16 | -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Omnibenchmark/multiturn_direct.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Omnibenchmark 3 | base_data_path: ../data/ChEF/OmniBenchmark_Bamboo 4 | bamboo_tree_path: ../data/Bamboo/sensexo_visual_add_academic_add_state_V4.visual.json 5 | ppl_cfg: 6 | negative_opt_num: 3 7 | random_seed: 0 8 | 9 | eval_cfg: 10 | instruction_cfg: 11 | prompt_type: multiturn 12 | inferencer_cfg: 13 | inferencer_type: Multi_Direct 14 | batch_size: 8 15 | metric_cfg: 16 | metric_type: basic 17 | inference_type: multiturn 18 | ppl: False 19 | bamboo_tree_path: ../data/Bamboo/sensexo_visual_add_academic_add_state_V4.visual.json 20 | -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Omnibenchmark/multiturn_ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Omnibenchmark 3 | base_data_path: ../data/ChEF/OmniBenchmark_Bamboo 4 | bamboo_tree_path: ../data/Bamboo/sensexo_visual_add_academic_add_state_V4.visual.json 5 | ppl_cfg: 6 | negative_opt_num: 3 7 | random_seed: 0 8 | 9 | eval_cfg: 10 | instruction_cfg: 11 | prompt_type: multiturn 12 | inferencer_cfg: 13 | inferencer_type: Multi_PPL 14 | batch_size: 8 15 | metric_cfg: 16 | metric_type: basic 17 | inference_type: multiturn 18 | ppl: True 19 | bamboo_tree_path: 
../data/Bamboo/sensexo_visual_add_academic_add_state_V4.visual.json 20 | -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Omnibenchmark/singleturn_direct.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Omnibenchmark 3 | base_data_path: ../data/ChEF/OmniBenchmark_Bamboo 4 | bamboo_tree_path: ../data/Bamboo/sensexo_visual_add_academic_add_state_V4.visual.json 5 | multi_turn: False 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | prompt_type: singleturn 10 | inferencer_cfg: 11 | inferencer_type: Direct 12 | batch_size: 8 13 | metric_cfg: 14 | metric_type: basic 15 | inference_type: singleturn 16 | ppl: False 17 | bamboo_tree_path: ../data/Bamboo/sensexo_visual_add_academic_add_state_V4.visual.json 18 | -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Omnibenchmark/singleturn_ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Omnibenchmark 3 | base_data_path: ../data/ChEF/OmniBenchmark_Bamboo 4 | bamboo_tree_path: ../data/Bamboo/sensexo_visual_add_academic_add_state_V4.visual.json 5 | ppl_cfg: 6 | negative_opt_num: 3 7 | random_seed: 0 8 | multi_turn: False 9 | 10 | eval_cfg: 11 | instruction_cfg: 12 | prompt_type: singleturn 13 | inferencer_cfg: 14 | inferencer_type: PPL 15 | batch_size: 8 16 | metric_cfg: 17 | metric_type: basic 18 | inference_type: singleturn 19 | ppl: True 20 | bamboo_tree_path: ../data/Bamboo/sensexo_visual_add_academic_add_state_V4.visual.json 21 | -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/SEEDBench-2/ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: SEEDBench2 3 | base_data_path: ../data/SEEDBench2 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | prompt_type: singleturn 8 | prompt_assigned_ids: 2 9 | template_assigned_ids: 0 10 | inferencer_cfg: 11 | inferencer_type: PPL 12 | batch_size: 1 13 | metric_cfg: 14 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/SEEDBench/default.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: SEEDBench 3 | base_data_path: ../data/SEED-Bench 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | prompt_type: singleturn 8 | prompt_assigned_ids: 4 9 | template_assigned_ids: 0 10 | inferencer_cfg: 11 | inferencer_type: PPL 12 | batch_size: 8 13 | metric_cfg: 14 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/ScienceQA/direct.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: ScienceQA 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | prompt_type: singleturn 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | batch_size: 8 11 | CoT: False 12 | max_new_tokens: 256 13 | metric_cfg: 14 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/ScienceQA/direct_CoT.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: ScienceQA 3 | base_data_path: 
../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | prompt_type: singleturn 8 | inferencer_cfg: 9 | inferencer_type: Direct 10 | batch_size: 8 11 | CoT: True 12 | max_new_tokens: 256 13 | metric_cfg: 14 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/ScienceQA/ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: ScienceQA 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | ppl: True 5 | option_content: False 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | prompt_type: singleturn 10 | prompt_assigned_ids: 0 11 | template_assigned_ids: 0 12 | inferencer_cfg: 13 | inferencer_type: PPL 14 | batch_size: 8 15 | CoT: False 16 | max_new_tokens: 256 17 | metric_cfg: 18 | metric_type: basic 19 | -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/ScienceQA/ppl_CoT.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: ScienceQA 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | ppl: True 5 | option_content: False 6 | 7 | eval_cfg: 8 | instruction_cfg: 9 | prompt_type: singleturn 10 | prompt_assigned_ids: 0 11 | template_assigned_ids: 0 12 | inferencer_cfg: 13 | inferencer_type: PPL 14 | batch_size: 2 15 | CoT: True 16 | max_new_tokens: 128 17 | metric_cfg: 18 | metric_type: basic 19 | -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/VOC2012/kosmos2_multiturn_direct.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: VOC2012 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | prompt_type: multiturn 8 | prompt_assigned_ids: 2 9 | template_assigned_ids: 2 10 | inferencer_cfg: 11 | inferencer_type: Multi_Direct 12 | max_new_tokens: 512 13 | batch_size: 8 14 | metric_cfg: 15 | metric_type: KOSMOS -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/VOC2012/kosmos2_multiturn_ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: VOC2012 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | option_template: kosmos 5 | ppl_cfg: 6 | negative_opt_num: 3 7 | random_seed: 0 8 | 9 | eval_cfg: 10 | instruction_cfg: 11 | prompt_type: multiturn 12 | prompt_assigned_ids: 2 13 | template_assigned_ids: 2 14 | inferencer_cfg: 15 | inferencer_type: Multi_PPL 16 | batch_size: 8 17 | metric_cfg: 18 | metric_type: KOSMOS 19 | inference_type: ppl -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/VOC2012/kosmos2_singleturn_direct.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: VOC2012 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | prompt_type: singleturn 8 | prompt_assigned_ids: 1 9 | inferencer_cfg: 10 | inferencer_type: Direct 11 | max_new_tokens: 512 12 | batch_size: 8 13 | metric_cfg: 14 | metric_type: KOSMOS -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/VOC2012/multiturn_direct.yaml: -------------------------------------------------------------------------------- 1 | 
scenario_cfg: 2 | dataset_name: VOC2012 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | prompt_type: multiturn 8 | inferencer_cfg: 9 | inferencer_type: Multi_Direct 10 | max_new_tokens: 64 11 | batch_size: 16 12 | metric_cfg: 13 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/VOC2012/multiturn_ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: VOC2012 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | ppl_cfg: 5 | negative_opt_num: 3 6 | random_seed: 0 7 | 8 | eval_cfg: 9 | instruction_cfg: 10 | prompt_type: multiturn 11 | inferencer_cfg: 12 | inferencer_type: Multi_PPL 13 | batch_size: 8 14 | metric_cfg: 15 | metric_type: basic 16 | inference_type: ppl -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/VOC2012/shikra_multiturn_direct.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: VOC2012 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | 5 | eval_cfg: 6 | instruction_cfg: 7 | prompt_type: multiturn 8 | prompt_assigned_ids: 0 9 | template_assigned_ids: 0 10 | inferencer_cfg: 11 | inferencer_type: Multi_Direct 12 | max_new_tokens: 64 13 | batch_size: 16 14 | metric_cfg: 15 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/VOC2012/shikra_multiturn_ppl.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: VOC2012 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | option_template: shikra 5 | ppl_cfg: 6 | negative_opt_num: 3 7 | random_seed: 0 8 | 9 | eval_cfg: 10 | instruction_cfg: 11 | prompt_type: multiturn 12 | prompt_assigned_ids: 1 13 | template_assigned_ids: 1 14 | inferencer_cfg: 15 | inferencer_type: Multi_PPL 16 | batch_size: 8 17 | metric_cfg: 18 | metric_type: basic 19 | inference_type: ppl -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/VOC2012/shikra_singleturn_direct.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: VOC2012 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | multi_turn: False 5 | 6 | eval_cfg: 7 | instruction_cfg: 8 | prompt_type: singleturn 9 | prompt_assigned_ids: 1 10 | inferencer_cfg: 11 | inferencer_type: Direct 12 | max_new_tokens: 512 13 | batch_size: 8 14 | metric_cfg: 15 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/VOC2012/singleturn_direct.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: VOC2012 3 | base_data_path: ../data/LAMM/2D_Benchmark 4 | multi_turn: False 5 | 6 | eval_cfg: 7 | instruction_cfg: 8 | prompt_type: singleturn 9 | inferencer_cfg: 10 | inferencer_type: Direct 11 | max_new_tokens: 64 12 | batch_size: 16 13 | metric_cfg: 14 | metric_type: basic -------------------------------------------------------------------------------- /src/config/ChEF/scenario_recipes/Winoground/default.yaml: -------------------------------------------------------------------------------- 1 | scenario_cfg: 2 | dataset_name: Winoground 3 | base_data_path: ../data/Winoground 4 | ppl: 
True 5 | 6 | eval_cfg: 7 | instruction_cfg: 8 | query_type: standard_query 9 | inferencer_cfg: 10 | inferencer_type: PPL 11 | batch_size: 16 12 | max_new_tokens: 32 13 | multi_img: True 14 | metric_cfg: 15 | metric_type: Winoground 16 | -------------------------------------------------------------------------------- /src/dist.py: -------------------------------------------------------------------------------- 1 | import os 2 | import socket 3 | import time 4 | import torch 5 | 6 | def init_distributed_mode(): 7 | if 'SLURM_PROCID' in os.environ: 8 | global_rank = int(os.environ['SLURM_PROCID']) 9 | world_size = int(os.environ['SLURM_NPROCS']) 10 | local_rank = global_rank % torch.cuda.device_count() 11 | else: 12 | print('Not using distributed mode') 13 | return dict( 14 | local_rank=0, 15 | global_rank=0, 16 | world_size=1) 17 | 18 | print(f"Start inference, world_size: {world_size}, global_rank: {global_rank}, local_rank:{local_rank}") 19 | os.environ['LOCAL_RANK'] = str(local_rank) 20 | 21 | return dict( 22 | local_rank=local_rank, 23 | global_rank=global_rank, 24 | world_size=world_size) -------------------------------------------------------------------------------- /src/model/LAMM/CLIP/__init__.py: -------------------------------------------------------------------------------- 1 | # remove fp32 LN & return intermediate features 2 | from .clip import * 3 | -------------------------------------------------------------------------------- /src/model/LAMM/CLIP/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/model/LAMM/CLIP/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/__init__.py: -------------------------------------------------------------------------------- 1 | from .model_3detr import build_3detr, build_epcl_encoder 2 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/third_party/pointnet2/_ext_src/include/ball_query.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | #pragma once 4 | #include 5 | 6 | at::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float radius, 7 | const int nsample); 8 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/third_party/pointnet2/_ext_src/include/cuda_utils.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. 
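// Shared CUDA launch helpers for the pointnet2 extension kernels: opt_n_threads()
// picks the largest power-of-two block size (capped at TOTAL_THREADS, at least 1)
// for a given workload, opt_block_config() derives a 2-D block within the same
// budget, and CUDA_CHECK_ERRORS() aborts with a diagnostic if the last kernel failed.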
2 | 3 | #ifndef _CUDA_UTILS_H 4 | #define _CUDA_UTILS_H 5 | 6 | #include <ATen/ATen.h> 7 | #include <ATen/cuda/CUDAContext.h> 8 | #include <cmath> 9 | 10 | #include <cuda.h> 11 | #include <cuda_runtime.h> 12 | 13 | #include <vector> 14 | 15 | #define TOTAL_THREADS 512 16 | 17 | inline int opt_n_threads(int work_size) { 18 | const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0); 19 | 20 | return max(min(1 << pow_2, TOTAL_THREADS), 1); 21 | } 22 | 23 | inline dim3 opt_block_config(int x, int y) { 24 | const int x_threads = opt_n_threads(x); 25 | const int y_threads = 26 | max(min(opt_n_threads(y), TOTAL_THREADS / x_threads), 1); 27 | dim3 block_config(x_threads, y_threads, 1); 28 | 29 | return block_config; 30 | } 31 | 32 | #define CUDA_CHECK_ERRORS() \ 33 | do { \ 34 | cudaError_t err = cudaGetLastError(); \ 35 | if (cudaSuccess != err) { \ 36 | fprintf(stderr, "CUDA kernel failed : %s\n%s at L:%d in %s\n", \ 37 | cudaGetErrorString(err), __PRETTY_FUNCTION__, __LINE__, \ 38 | __FILE__); \ 39 | exit(-1); \ 40 | } \ 41 | } while (0) 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/third_party/pointnet2/_ext_src/include/group_points.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | 4 | #pragma once 5 | #include <torch/extension.h> 6 | 7 | at::Tensor group_points(at::Tensor points, at::Tensor idx); 8 | at::Tensor group_points_grad(at::Tensor grad_out, at::Tensor idx, const int n); 9 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/third_party/pointnet2/_ext_src/include/interpolate.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | #pragma once 4 | 5 | #include <torch/extension.h> 6 | #include <vector> 7 | 8 | std::vector<at::Tensor> three_nn(at::Tensor unknowns, at::Tensor knows); 9 | at::Tensor three_interpolate(at::Tensor points, at::Tensor idx, 10 | at::Tensor weight); 11 | at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx, 12 | at::Tensor weight, const int m); 13 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/third_party/pointnet2/_ext_src/include/sampling.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | 4 | #pragma once 5 | #include <torch/extension.h> 6 | 7 | at::Tensor gather_points(at::Tensor points, at::Tensor idx); 8 | at::Tensor gather_points_grad(at::Tensor grad_out, at::Tensor idx, const int n); 9 | at::Tensor furthest_point_sampling(at::Tensor points, const int nsamples); 10 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/third_party/pointnet2/_ext_src/include/utils.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates.
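// Argument-validation macros used by the extension sources (ball_query.cpp and friends):
// CHECK_CUDA, CHECK_CONTIGUOUS, CHECK_IS_INT and CHECK_IS_FLOAT assert that a tensor is
// on the GPU, contiguous, or of the expected scalar type, failing with a descriptive
// message otherwise.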
2 | 3 | 4 | #pragma once 5 | #include <ATen/cuda/CUDAContext.h> 6 | #include <torch/extension.h> 7 | 8 | #define CHECK_CUDA(x) \ 9 | do { \ 10 | AT_ASSERT(x.is_cuda(), #x " must be a CUDA tensor"); \ 11 | } while (0) 12 | 13 | #define CHECK_CONTIGUOUS(x) \ 14 | do { \ 15 | AT_ASSERT(x.is_contiguous(), #x " must be a contiguous tensor"); \ 16 | } while (0) 17 | 18 | #define CHECK_IS_INT(x) \ 19 | do { \ 20 | AT_ASSERT(x.scalar_type() == at::ScalarType::Int, \ 21 | #x " must be an int tensor"); \ 22 | } while (0) 23 | 24 | #define CHECK_IS_FLOAT(x) \ 25 | do { \ 26 | AT_ASSERT(x.scalar_type() == at::ScalarType::Float, \ 27 | #x " must be a float tensor"); \ 28 | } while (0) 29 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/third_party/pointnet2/_ext_src/src/ball_query.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | 4 | #include "ball_query.h" 5 | #include "utils.h" 6 | 7 | void query_ball_point_kernel_wrapper(int b, int n, int m, float radius, 8 | int nsample, const float *new_xyz, 9 | const float *xyz, int *idx); 10 | 11 | at::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float radius, 12 | const int nsample) { 13 | CHECK_CONTIGUOUS(new_xyz); 14 | CHECK_CONTIGUOUS(xyz); 15 | CHECK_IS_FLOAT(new_xyz); 16 | CHECK_IS_FLOAT(xyz); 17 | 18 | if (new_xyz.is_cuda()) { 19 | CHECK_CUDA(xyz); 20 | } 21 | 22 | at::Tensor idx = 23 | torch::zeros({new_xyz.size(0), new_xyz.size(1), nsample}, 24 | at::device(new_xyz.device()).dtype(at::ScalarType::Int)); 25 | 26 | if (new_xyz.is_cuda()) { 27 | query_ball_point_kernel_wrapper(xyz.size(0), xyz.size(1), new_xyz.size(1), 28 | radius, nsample, new_xyz.data<float>(), 29 | xyz.data<float>(), idx.data<int>()); 30 | } else { 31 | AT_ASSERT(false, "CPU not supported"); 32 | } 33 | 34 | return idx; 35 | } 36 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/third_party/pointnet2/_ext_src/src/bindings.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | 4 | #include "ball_query.h" 5 | #include "group_points.h" 6 | #include "interpolate.h" 7 | #include "sampling.h" 8 | 9 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 10 | m.def("gather_points", &gather_points); 11 | m.def("gather_points_grad", &gather_points_grad); 12 | m.def("furthest_point_sampling", &furthest_point_sampling); 13 | 14 | m.def("three_nn", &three_nn); 15 | m.def("three_interpolate", &three_interpolate); 16 | m.def("three_interpolate_grad", &three_interpolate_grad); 17 | 18 | m.def("ball_query", &ball_query); 19 | 20 | m.def("group_points", &group_points); 21 | m.def("group_points_grad", &group_points_grad); 22 | } 23 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/.ninja_deps: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/model/LAMM/EPCL/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/.ninja_deps -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/third_party/pointnet2/pointnet2_test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates.
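# Gradient check for the custom three_interpolate op: inputs are float32 on CUDA,
# hence the loose atol/rtol of 1e-1; a CUDA-capable device is required to run this.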
2 | 3 | """ Testing customized ops. """ 4 | 5 | import torch 6 | from torch.autograd import gradcheck 7 | import numpy as np 8 | 9 | import os 10 | import sys 11 | 12 | BASE_DIR = os.path.dirname(os.path.abspath(__file__)) 13 | sys.path.append(BASE_DIR) 14 | import pointnet2_utils 15 | 16 | 17 | def test_interpolation_grad(): 18 | batch_size = 1 19 | feat_dim = 2 20 | m = 4 21 | feats = torch.randn(batch_size, feat_dim, m, requires_grad=True).float().cuda() 22 | 23 | def interpolate_func(inputs): 24 | idx = torch.from_numpy(np.array([[[0, 1, 2], [1, 2, 3]]])).int().cuda() 25 | weight = torch.from_numpy(np.array([[[1, 1, 1], [2, 2, 2]]])).float().cuda() 26 | interpolated_feats = pointnet2_utils.three_interpolate(inputs, idx, weight) 27 | return interpolated_feats 28 | 29 | assert gradcheck(interpolate_func, feats, atol=1e-1, rtol=1e-1) 30 | 31 | 32 | if __name__ == "__main__": 33 | test_interpolation_grad() 34 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/third_party/pointnet2/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | from setuptools import setup 7 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 8 | import glob 9 | import os.path as osp 10 | 11 | this_dir = osp.dirname(osp.abspath(__file__)) 12 | 13 | _ext_src_root = "_ext_src" 14 | _ext_sources = glob.glob("{}/src/*.cpp".format(_ext_src_root)) + glob.glob( 15 | "{}/src/*.cu".format(_ext_src_root) 16 | ) 17 | _ext_headers = glob.glob("{}/include/*".format(_ext_src_root)) 18 | 19 | setup( 20 | name="pointnet2", 21 | ext_modules=[ 22 | CUDAExtension( 23 | name="pointnet2._ext", 24 | sources=_ext_sources, 25 | extra_compile_args={ 26 | "cxx": ["-O2", "-I{}".format("{}/include".format(_ext_src_root))], 27 | "nvcc": ["-O2", "-I{}".format("{}/include".format(_ext_src_root))], 28 | }, 29 | include_dirs=[osp.join(this_dir, _ext_src_root, "include")], 30 | ) 31 | ], 32 | cmdclass={"build_ext": BuildExtension}, 33 | ) 34 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/model/LAMM/EPCL/utils/__init__.py -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/utils/cython_compile.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | 3 | from setuptools import setup, Extension 4 | from Cython.Build import cythonize 5 | import numpy as np 6 | 7 | 8 | # hacky way to find numpy include path 9 | # replace with actual path if this does not work 10 | np_include_path = np.__file__.replace("__init__.py", "core/include/") 11 | INCLUDE_PATH = [np_include_path] 12 | 13 | setup( 14 | ext_modules=cythonize( 15 | Extension( 16 | "box_intersection", 17 | sources=["box_intersection.pyx"], 18 | include_dirs=INCLUDE_PATH, 19 | ) 20 | ), 21 | ) 22 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/utils/cython_compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | python cython_compile.py build_ext --inplace 4 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/utils/download_weights.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | 4 | import os 5 | from urllib import request 6 | import torch 7 | import pickle 8 | 9 | ## Define the weights you want and where to store them 10 | dataset = "scannet" 11 | encoder = "_masked" # or "" 12 | epoch = 1080 13 | base_url = "https://dl.fbaipublicfiles.com/3detr/checkpoints" 14 | local_dir = "/tmp/" 15 | 16 | ### Downloading the weights 17 | weights_file = f"{dataset}{encoder}_ep{epoch}.pth" 18 | metrics_file = f"{dataset}{encoder}_ep{epoch}_metrics.pkl" 19 | local_weights = os.path.join(local_dir, weights_file) 20 | local_metrics = os.path.join(local_dir, metrics_file) 21 | 22 | url = os.path.join(base_url, weights_file) 23 | request.urlretrieve(url, local_weights) 24 | print(f"Downloaded weights from {url} to {local_weights}") 25 | 26 | url = os.path.join(base_url, metrics_file) 27 | request.urlretrieve(url, local_metrics) 28 | print(f"Downloaded metrics from {url} to {local_metrics}") 29 | 30 | # weights can be simply loaded with pytorch 31 | weights = torch.load(local_weights, map_location=torch.device("cpu")) 32 | print("Weights loaded successfully.") 33 | 34 | # metrics can be loaded with pickle 35 | with open(local_metrics, "rb") as fh: 36 | metrics = pickle.load(fh) 37 | print("Metrics loaded successfully.") 38 | -------------------------------------------------------------------------------- /src/model/LAMM/EPCL/utils/logger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | import torch 4 | 5 | try: 6 | from tensorboardX import SummaryWriter 7 | except ImportError: 8 | print("Cannot import tensorboard. 
Will log to txt files only.") 9 | SummaryWriter = None 10 | 11 | from utils.dist import is_primary 12 | 13 | 14 | class Logger(object): 15 | def __init__(self, log_dir=None) -> None: 16 | self.log_dir = log_dir 17 | if SummaryWriter is not None and is_primary(): 18 | self.writer = SummaryWriter(self.log_dir) 19 | else: 20 | self.writer = None 21 | 22 | def log_scalars(self, scalar_dict, step, prefix=None): 23 | if self.writer is None: 24 | return 25 | for k in scalar_dict: 26 | v = scalar_dict[k] 27 | if isinstance(v, torch.Tensor): 28 | v = v.detach().cpu().item() 29 | if prefix is not None: 30 | k = prefix + k 31 | self.writer.add_scalar(k, v, step) 32 | -------------------------------------------------------------------------------- /src/model/LAMM/__init__.py: -------------------------------------------------------------------------------- 1 | from .openlamm import LAMMPEFTModel 2 | from .openlamm import LAMMSFTModel -------------------------------------------------------------------------------- /src/model/LAMM/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .pcl_utils import * 2 | -------------------------------------------------------------------------------- /src/model/Octavius/__init__.py: -------------------------------------------------------------------------------- 1 | from .octavius import Octavius -------------------------------------------------------------------------------- /src/model/Octavius/moe/__init__.py: -------------------------------------------------------------------------------- 1 | import enum 2 | import peft 3 | from peft import PEFT_TYPE_TO_CONFIG_MAPPING 4 | from peft.peft_model import PEFT_TYPE_TO_MODEL_MAPPING 5 | 6 | 7 | # register MoE LoRA 8 | class PeftType(str, enum.Enum): 9 | PROMPT_TUNING = "PROMPT_TUNING" 10 | P_TUNING = "P_TUNING" 11 | PREFIX_TUNING = "PREFIX_TUNING" 12 | LORA = "LORA" 13 | ADALORA = "ADALORA" 14 | ADAPTION_PROMPT = "ADAPTION_PROMPT" 15 | IA3 = "IA3" 16 | MOE_LORA = 'MOE_LORA' 17 | 18 | peft.PeftType = PeftType 19 | 20 | from .moe_lora import MoeLoraConfig, MoeLoraModel 21 | PEFT_TYPE_TO_CONFIG_MAPPING[peft.PeftType.MOE_LORA] = MoeLoraConfig 22 | PEFT_TYPE_TO_MODEL_MAPPING[peft.PeftType.MOE_LORA] = MoeLoraModel 23 | 24 | 25 | __all__ = [ 26 | 'MoeLoraConfig', 27 | ] 28 | -------------------------------------------------------------------------------- /src/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .training_agent import DeepSpeedAgent 2 | 3 | from .LAMM import LAMMPEFTModel 4 | from .LAMM import LAMMSFTModel 5 | from .Octavius import Octavius 6 | 7 | 8 | def load_model(args): 9 | agent_name = args["models"][args["model"]]["agent_name"] 10 | model_name = args["models"][args["model"]]["model_name"] 11 | model = globals()[model_name](**args) 12 | agent = globals()[agent_name](model, args) 13 | return agent 14 | -------------------------------------------------------------------------------- /src/model/llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /src/model/llava/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 
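# The constants below are LLaVA's special multimodal token strings and sentinel
# indices used when splicing image features into a text prompt.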
5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "<image>" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 11 | DEFAULT_IM_START_TOKEN = "<im_start>" 12 | DEFAULT_IM_END_TOKEN = "<im_end>" 13 | IMAGE_PLACEHOLDER = "<image-placeholder>" 14 | -------------------------------------------------------------------------------- /src/model/llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 2 | from .language_model.llava_mpt import LlavaMPTForCausalLM, LlavaMPTConfig 3 | -------------------------------------------------------------------------------- /src/model/llava/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava.model import * 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /src/model/llava/model/language_model/mpt/custom_embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch import Tensor 5 | 6 | class SharedEmbedding(nn.Embedding): 7 | 8 | def forward(self, input: Tensor, unembed: bool=False) -> Tensor: 9 | if unembed: 10 | return F.linear(input, self.weight) 11 | return super().forward(input) -------------------------------------------------------------------------------- /src/model/llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion"): 9 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 10 | 11 | raise ValueError(f'Unknown vision tower: {vision_tower}') 12 | -------------------------------------------------------------------------------- /src/model/llava/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 |
@property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- /src/model/llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /src/model/llava/train/train_mem.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: 2 | # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: 3 | # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn. 4 | 5 | # Need to call this before importing transformers. 6 | from llava.train.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn 7 | 8 | replace_llama_attn_with_flash_attn() 9 | 10 | from llava.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /src/model/llava/train/train_xformers.py: -------------------------------------------------------------------------------- 1 | # Make it more memory efficient by monkey patching the LLaMA model with xformers attention. 2 | 3 | # Need to call this before importing transformers. 
4 | from llava.train.llama_xformers_attn_monkey_patch import ( 5 | replace_llama_attn_with_xformers_attn, 6 | ) 7 | 8 | replace_llama_attn_with_xformers_attn() 9 | 10 | from llava.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /src/slurm_eval.sh: -------------------------------------------------------------------------------- 1 | START_TIME=`date +%Y%m%d-%H:%M:%S` 2 | 3 | parition=$1 4 | GPUS=$2 5 | model_cfg=$3 6 | recipe_cfg=$4 7 | EXTRA_ARGS=${@:5} 8 | GPUS_PER_NODE=$(($GPUS<8?$GPUS:8)) 9 | 10 | LOG_FILE=../logs/evaluation_${START_TIME}.log 11 | 12 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 13 | srun -p ${parition} -J ChEF_eval --gres=gpu:${GPUS_PER_NODE} --ntasks=${GPUS} \ 14 | --ntasks-per-node=${GPUS_PER_NODE} --kill-on-bad-exit \ 15 | python eval.py \ 16 | --time ${START_TIME} \ 17 | --model_cfg=${model_cfg} \ 18 | --recipe_cfg=${recipe_cfg} \ 19 | ${EXTRA_ARGS} \ 20 | 2>&1 | tee -a $LOG_FILE > /dev/null & 21 | 22 | sleep 0.5s; 23 | tail -f ${LOG_FILE} -------------------------------------------------------------------------------- /src/slurm_eval_icl.sh: -------------------------------------------------------------------------------- 1 | START_TIME=`date +%Y%m%d-%H:%M:%S` 2 | 3 | parition=$1 4 | GPUS=$2 5 | model_cfg=$3 6 | dataset_name=$4 7 | EXTRA_ARGS=${@:5} 8 | GPUS_PER_NODE=$(($GPUS<8?$GPUS:8)) 9 | 10 | LOG_FILE=../logs/evaluation_${START_TIME}.log 11 | 12 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 13 | srun -p ${parition} -J ChEF_eval --gres=gpu:${GPUS_PER_NODE} --ntasks=${GPUS} \ 14 | --ntasks-per-node=${GPUS_PER_NODE} --kill-on-bad-exit \ 15 | python tools/ChEF/eval_icl.py \ 16 | --time ${START_TIME} \ 17 | --model_cfg=${model_cfg} \ 18 | --recipe_cfg=config/ChEF/desiderata_recipes/ICL/${dataset_name}.yaml \ 19 | ${EXTRA_ARGS} \ 20 | 2>&1 | tee -a $LOG_FILE > /dev/null & 21 | 22 | sleep 0.5s; 23 | tail -f ${LOG_FILE} -------------------------------------------------------------------------------- /src/tools/ChEF/eval_calibration.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | import datetime 4 | import sys 5 | script_path = os.path.abspath(__file__) 6 | script_dir = os.path.dirname(script_path) 7 | chef_dir = os.path.join(script_dir, '../../../src') 8 | sys.path.append(chef_dir) 9 | 10 | from ChEF.evaluator import Evaluator, load_config, sample_dataset 11 | from ChEF.models import get_model 12 | from ChEF.scenario import dataset_dict 13 | 14 | def main(): 15 | model_cfg, recipe_cfg, save_dir, sample_len = load_config() 16 | # model 17 | model = get_model(model_cfg) 18 | 19 | # dataset 20 | scenario_cfg = recipe_cfg['scenario_cfg'] 21 | dataset_name = scenario_cfg['dataset_name'] 22 | dataset = dataset_dict[dataset_name](**scenario_cfg) 23 | # sample dataset 24 | dataset = sample_dataset(dataset, sample_len=sample_len, sample_seed=0) 25 | 26 | # save_cfg 27 | time = datetime.datetime.now().strftime("%Y%m%d%H%M%S") 28 | save_base_dir = os.path.join(save_dir, model_cfg['model_name'], 'Calibration', dataset_name, time) 29 | os.makedirs(save_base_dir, exist_ok=True) 30 | with open(os.path.join(save_base_dir, 'config.yaml'), 'w', encoding='utf-8') as f: 31 | yaml.dump(data=dict(model_cfg=model_cfg, recipe_cfg=recipe_cfg), stream=f, allow_unicode=True) 32 | print(f'Save results in {save_base_dir}!') 33 | 34 | # evaluate 35 | eval_cfg = recipe_cfg['eval_cfg'] 36 | evaluater = Evaluator(dataset, 
save_base_dir, eval_cfg) 37 | evaluater.evaluate(model) 38 | 39 | if __name__ == '__main__': 40 | main() -------------------------------------------------------------------------------- /src/tools/ChEF/eval_hallucination.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | import datetime 4 | import sys 5 | script_path = os.path.abspath(__file__) 6 | script_dir = os.path.dirname(script_path) 7 | chef_dir = os.path.join(script_dir, '../../../src') 8 | sys.path.append(chef_dir) 9 | 10 | from ChEF.evaluator import Evaluator, load_config, sample_dataset 11 | from ChEF.models import get_model 12 | from ChEF.scenario import dataset_dict 13 | 14 | def main(): 15 | model_cfg, recipe_cfg, save_dir, sample_len = load_config() 16 | 17 | # model 18 | model = get_model(model_cfg) 19 | 20 | # dataset 21 | scenario_cfg = recipe_cfg['scenario_cfg'] 22 | 23 | settings = ['POPE_COCO_random','POPE_COCO_popular','POPE_COCO_adversarial'] 24 | for setting in settings: 25 | scenario_cfg['dataset_name'] = setting 26 | dataset_name = scenario_cfg['dataset_name'] 27 | dataset = dataset_dict[dataset_name](**scenario_cfg) 28 | dataset = sample_dataset(dataset, sample_len=sample_len, sample_seed=0) 29 | # save_cfg 30 | time = datetime.datetime.now().strftime("%Y%m%d%H%M%S") 31 | save_base_dir = os.path.join(save_dir, model_cfg['model_name'], 'Hallucination',dataset_name, time) 32 | os.makedirs(save_base_dir, exist_ok=True) 33 | with open(os.path.join(save_base_dir, 'config.yaml'), 'w', encoding='utf-8') as f: 34 | yaml.dump(data=dict(model_cfg=model_cfg, recipe_cfg=recipe_cfg), stream=f, allow_unicode=True) 35 | print(f'Save results in {save_base_dir}!') 36 | 37 | # evaluate 38 | eval_cfg = recipe_cfg['eval_cfg'] 39 | evaluater = Evaluator(dataset, save_base_dir, eval_cfg) 40 | evaluater.evaluate(model) 41 | 42 | 43 | if __name__ == '__main__': 44 | main() -------------------------------------------------------------------------------- /src/tools/LAMM/eval_lamm2d.sh: -------------------------------------------------------------------------------- 1 | yaml_dict=(ScienceQA FSC147 VOC2012 SVT Flickr30k UCMerced CelebA_hair CelebA_smile CIFAR10 AI2D locating_LSP locating_VOC2012) 2 | 3 | for dataset in ${yaml_dict[*]}; do 4 | 5 | python eval.py --model_cfg config/ChEF/models/lamm.yaml --recipe_cfg config/ChEF/scenario_recipes/LAMM/${dataset}.yaml 6 | 7 | done -------------------------------------------------------------------------------- /src/tools/LAMM/eval_lamm3d.sh: -------------------------------------------------------------------------------- 1 | yaml_dict=(ScanNet ScanRefer ScanQA) 2 | 3 | for dataset in ${yaml_dict[*]}; do 4 | 5 | python eval.py --model_cfg config/ChEF/models/lamm_3d.yaml --recipe_cfg config/ChEF/scenario_recipes/LAMM/${dataset}.yaml 6 | 7 | done -------------------------------------------------------------------------------- /src/tools/LAMM/train_lamm2d.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | numgpu=4 3 | 4 | exp=$1 5 | visfeat_type=local 6 | now=$(date +"%Y%m%d_%H%M%S") 7 | 8 | ckpt_dir=../ckpt 9 | mkdir -p ${ckpt_dir}/${exp}/log_rest/ 10 | 11 | deepspeed --include localhost:0,1,2,3 --master_addr 127.0.0.1 --master_port 28457 train.py \ 12 | --stage 1 \ 13 | --cfg ./config/LAMM/train.yaml \ 14 | --data_path ../data/LAMM/2D_Instruct/meta_file/LAMM_instruct_186k.json \ 15 | --vision_root_path ../data/LAMM/2D_Instruct/ \ 16 | --conv_template default \ 17 | 
--max_tgt_len 400 \ 18 | --vision_type image \ 19 | --use_system \ 20 | --model lamm_peft \ 21 | --encoder_pretrain clip \ 22 | --llm_ckpt_path ../model_zoo/vicuna_ckpt/13b_v0/ \ 23 | --vision_feature_type ${visfeat_type} \ 24 | --num_vision_token 256 \ 25 | --save_path ${ckpt_dir}/${exp} \ 26 | --log_path ${ckpt_dir}/${exp}/log_rest/ \ 27 | 2>&1 | tee ${ckpt_dir}/${exp}/log_rest/train_${now}.log 28 | -------------------------------------------------------------------------------- /src/tools/LAMM/train_lamm2d_sft_stg1_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | numgpu=8 3 | 4 | partition=$1 5 | exp=$2 6 | visfeat_type=local 7 | 8 | now=$(date +"%Y%m%d_%H%M%S") 9 | ckpt_dir=../ckpt 10 | mkdir -p ${ckpt_dir}/${exp}/log_rest/ 11 | 12 | srun -p ${partition} -J ${exp} --gres=gpu:${numgpu} --ntasks-per-node 1 --kill-on-bad-exit \ 13 | torchrun --nnodes=1 --nproc_per_node=${numgpu} --master_port=25440 train.py \ 14 | --stage 1 \ 15 | --cfg ./config/LAMM/train_sft.yaml \ 16 | --data_path ../data/LAMM/2D_pretrain/meta_file/blip_laion_cc_sbu_558k.json \ 17 | --vision_root_path ../data/LAMM/2D_pretrain/images \ 18 | --conv_template default \ 19 | --max_tgt_len 2048 \ 20 | --vision_type image \ 21 | --use_system \ 22 | --model lamm_sft \ 23 | --encoder_pretrain clip \ 24 | --gradient_checkpointing \ 25 | --llm_ckpt_path ../model_zoo/vicuna_ckpt/13b_v0/ \ 26 | --vision_feature_type ${visfeat_type} \ 27 | --num_vision_token 256 \ 28 | --save_path ${ckpt_dir}/${exp} \ 29 | --log_path ${ckpt_dir}/${exp}/log_rest/ \ 30 | 2>&1 | tee ${ckpt_dir}/${exp}/log_rest/train_${now}.log 31 | 32 | -------------------------------------------------------------------------------- /src/tools/LAMM/train_lamm2d_sft_stg2_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | numgpu=8 3 | 4 | partition=$1 5 | exp=$2 6 | visfeat_type=local 7 | 8 | now=$(date +"%Y%m%d_%H%M%S") 9 | ckpt_dir=../ckpt 10 | mkdir -p ${ckpt_dir}/${exp}/log_rest/ 11 | 12 | srun -p ${partition} -J ${exp} -x "SH-IDC1-10-140-1-164" --gres=gpu:${numgpu} --ntasks-per-node 1 --kill-on-bad-exit \ 13 | torchrun --nnodes=1 --nproc_per_node=${numgpu} --master_port=25440 train.py \ 14 | --stage 2 \ 15 | --cfg ./config/LAMM/train_sft.yaml \ 16 | --data_path ../data/LAMM/2D_Finetune/meta_file/sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.json \ 17 | --vision_root_path ../data/LAMM/2D_Finetune/images \ 18 | --llm_proj_path ../ckpt/LAMM15_stage1/pytorch_model.pt \ 19 | --conv_template default \ 20 | --max_tgt_len 2048 \ 21 | --vision_type image \ 22 | --use_system \ 23 | --model lamm_sft \ 24 | --encoder_pretrain clip \ 25 | --gradient_checkpointing \ 26 | --llm_ckpt_path ../model_zoo/vicuna_ckpt/13b_v0/ \ 27 | --vision_feature_type ${visfeat_type} \ 28 | --num_vision_token 256 \ 29 | --save_path ${ckpt_dir}/${exp} \ 30 | --log_path ${ckpt_dir}/${exp}/log_rest/ \ 31 | 2>&1 | tee ${ckpt_dir}/${exp}/log_rest/train_${now}.log 32 | 33 | -------------------------------------------------------------------------------- /src/tools/LAMM/train_lamm2d_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | numgpu=4 3 | 4 | partition=$1 5 | exp=$2 6 | visfeat_type=local 7 | 8 | now=$(date +"%Y%m%d_%H%M%S") 9 | ckpt_dir=../ckpt 10 | mkdir -p ${ckpt_dir}/${exp}/log_rest/ 11 | 12 | srun -p ${partition} -J ${exp} --gres=gpu:${numgpu} --ntasks-per-node 1 --kill-on-bad-exit \ 13 | torchrun --nnodes=1 
--nproc_per_node=${numgpu} --master_port=25440 train.py \ 14 | --stage 1 \ 15 | --cfg ./config/LAMM/train.yaml \ 16 | --data_path ../data/LAMM/2D_Instruct/meta_file/LAMM_instruct_186k.json \ 17 | --vision_root_path ../data/LAMM/2D_Instruct/ \ 18 | --conv_template default \ 19 | --max_tgt_len 400 \ 20 | --vision_type image \ 21 | --use_system \ 22 | --model lamm_peft \ 23 | --encoder_pretrain clip \ 24 | --llm_ckpt_path ../model_zoo/vicuna_ckpt/13b_v0/ \ 25 | --vision_feature_type ${visfeat_type} \ 26 | --num_vision_token 256 \ 27 | --save_path ${ckpt_dir}/${exp} \ 28 | --log_path ${ckpt_dir}/${exp}/log_rest/ \ 29 | 2>&1 | tee ${ckpt_dir}/${exp}/log_rest/train_${now}.log 30 | 31 | -------------------------------------------------------------------------------- /src/tools/LAMM/train_lamm3d.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | numgpu=4 3 | 4 | exp=$1 5 | visfeat_type=local 6 | now=$(date +"%Y%m%d_%H%M%S") 7 | 8 | ckpt_dir=../ckpt 9 | mkdir -p ${ckpt_dir}/${exp}/log_rest/ 10 | deepspeed --include localhost:0,1,2,3 --master_addr 127.0.0.1 --master_port 28457 train.py \ 11 | --stage 1 \ 12 | --cfg ./config/train.yaml \ 13 | --data_path ../data/LAMM/3D_Instruct/meta_file/LAMM_3dinstruct_10k.json \ 14 | --vision_root_path ../data/LAMM/3D_Instruct/ \ 15 | --max_tgt_len 400 \ 16 | --vision_type pcl \ 17 | --use_system \ 18 | --model lamm_peft \ 19 | --encoder_pretrain epcl \ 20 | --encoder_ckpt_path ../model_zoo/epcl_ckpt/epcl_scannet_vit-L-14_256tokens_latest.pth \ 21 | --llm_ckpt_path ../model_zoo/vicuna_ckpt/13b_v0/ \ 22 | --vision_feature_type ${visfeat_type} \ 23 | --num_vision_token 256 \ 24 | --save_path ${ckpt_dir}/${exp} \ 25 | --log_path ${ckpt_dir}/${exp}/log_rest/ \ 26 | 2>&1 | tee ${ckpt_dir}/${exp}/log_rest/train_${now}.log 27 | -------------------------------------------------------------------------------- /src/tools/LAMM/train_lamm3d_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | numgpu=4 3 | 4 | partition=$1 5 | exp=$2 6 | visfeat_type=local 7 | now=$(date +"%Y%m%d_%H%M%S") 8 | 9 | ckpt_dir=../ckpt 10 | mkdir -p ${ckpt_dir}/${exp}/log_rest/ 11 | 12 | srun -p ${partition} -J ${exp} --gres=gpu:${numgpu} --ntasks-per-node 1 --kill-on-bad-exit \ 13 | torchrun --nnodes=1 --nproc_per_node=${numgpu} --master_port=25441 train.py \ 14 | --stage 1 \ 15 | --cfg ./config/train.yaml \ 16 | --data_path ../data/LAMM/3D_Instruct/meta_file/LAMM_3dinstruct_10k.json \ 17 | --vision_root_path ../data/LAMM/3D_Instruct/ \ 18 | --max_tgt_len 400 \ 19 | --vision_type pcl \ 20 | --use_system \ 21 | --model lamm_peft \ 22 | --encoder_pretrain epcl \ 23 | --encoder_ckpt_path ../model_zoo/epcl_ckpt/epcl_scannet_vit-L-14_256tokens_latest.pth \ 24 | --llm_ckpt_path ../model_zoo/vicuna_ckpt/13b_v0/ \ 25 | --vision_feature_type ${visfeat_type} \ 26 | --num_vision_token 256 \ 27 | --save_path ${ckpt_dir}/${exp} \ 28 | --log_path ${ckpt_dir}/${exp}/log_rest/ \ 29 | 2>&1 | tee ${ckpt_dir}/${exp}/log_rest/train_${now}.log 30 | -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/tools/Octavius/ULIP/.DS_Store -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/data/.DS_Store: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/tools/Octavius/ULIP/data/.DS_Store -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/data/ScanRefer.yaml: -------------------------------------------------------------------------------- 1 | NAME: ScanRefer 2 | DATA_PATH: data/scanrefer 3 | PC_PATH: data/scanrefer/point_data 4 | IMAGE_PATH: data/scanrefer/image_feature 5 | TEXT_PATH: data/scanrefer/text_feature 6 | scannet_object_clip_root: data/scanrefer/image_memory_bank 7 | scannet_text_clip_root: data/scanrefer/text_memory_bank -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/data/ScanReferValid.yaml: -------------------------------------------------------------------------------- 1 | NAME: ScanReferValid 2 | DATA_PATH: data/scanrefer 3 | NUM_CATEGORY: 261 4 | USE_NORMALS: FALSE -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/data/dataset_catalog.json: -------------------------------------------------------------------------------- 1 | { 2 | "scanrefer": { 3 | "config": "./data/ScanRefer.yaml", 4 | "train": "train", 5 | "test": "test", 6 | "usage": "train" 7 | }, 8 | "scanrefer_valid": { 9 | "config": "./data/ScanReferValid.yaml", 10 | "train": "train", 11 | "test": "test", 12 | "usage": "test" 13 | } 14 | } -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/models/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/tools/Octavius/ULIP/models/.DS_Store -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/models/pointbert/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/tools/Octavius/ULIP/models/pointbert/.DS_Store -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/models/pointbert/PointTransformer_8192point.yaml: -------------------------------------------------------------------------------- 1 | optimizer : { 2 | type: AdamW, 3 | kwargs: { 4 | lr : 0.0005, 5 | weight_decay : 0.05 6 | }} 7 | 8 | scheduler: { 9 | type: CosLR, 10 | kwargs: { 11 | epochs: 300, 12 | initial_epochs : 10 13 | }} 14 | 15 | model : { 16 | NAME: PointTransformer, 17 | trans_dim: 384, 18 | depth: 12, 19 | drop_path_rate: 0.1, 20 | cls_dim: 40, 21 | num_heads: 6, 22 | group_size: 32, 23 | num_group: 512, # 512 24 | encoder_dims: 256, 25 | } 26 | npoints: 8192 27 | total_bs : 32 28 | step_per_update : 1 29 | max_epoch : 300 30 | grad_norm_clip : 10 31 | 32 | consider_metric: CDL1 -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/models/pointnet2/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/tools/Octavius/ULIP/models/pointnet2/.DS_Store -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/scripts/pretrain_pointbert.sh: 
-------------------------------------------------------------------------------- 1 | current_time=`date "+%Y_%m_%d_%H_%M"` 2 | 3 | CUDA_VISIBLE_DEVICES=0,1 \ 4 | python -m torch.distributed.launch \ 5 | --nproc_per_node=2 \ 6 | --master_port 61234 \ 7 | main.py \ 8 | --model ULIP_PointBERT \ 9 | --npoints 2048 \ 10 | --lr 1e-4 \ 11 | --epochs 40 \ 12 | --batch_size 16 \ 13 | --lr_end 1e-5 \ 14 | --output_dir ./outputs/pointbert_2kpts_xyz_$current_time \ 15 | # --use_scanrefer \ -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/utils/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/tools/Octavius/ULIP/utils/.DS_Store -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/utils/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | * Copyright (c) 2023, salesforce.com, inc. 3 | * All rights reserved. 4 | * SPDX-License-Identifier: BSD-3-Clause 5 | * For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | * By Le Xue 7 | ''' 8 | -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/utils/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/LAMM/ea571363883ceba58a0f724ef197ed7205e07465/src/tools/Octavius/ULIP/utils/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/utils/build.py: -------------------------------------------------------------------------------- 1 | from utils import registry 2 | 3 | 4 | DATASETS = registry.Registry('dataset') 5 | 6 | 7 | def build_dataset_from_cfg(cfg, default_args = None): 8 | """ 9 | Build a dataset, defined by `dataset_name`. 10 | Args: 11 | cfg (eDICT): 12 | Returns: 13 | Dataset: a constructed dataset specified by dataset_name. 14 | """ 15 | return DATASETS.build(cfg, default_args = default_args) 16 | 17 | 18 | -------------------------------------------------------------------------------- /src/tools/Octavius/ULIP/utils/io.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import numpy as np 3 | import open3d 4 | import os 5 | 6 | class IO: 7 | @classmethod 8 | def get(cls, file_path): 9 | _, file_extension = os.path.splitext(file_path) 10 | 11 | if file_extension in ['.npy']: 12 | return cls._read_npy(file_path) 13 | elif file_extension in ['.pcd']: 14 | return cls._read_pcd(file_path) 15 | elif file_extension in ['.h5']: 16 | return cls._read_h5(file_path) 17 | elif file_extension in ['.txt']: 18 | return cls._read_txt(file_path) 19 | else: 20 | raise Exception('Unsupported file extension: %s' % file_extension) 21 | 22 | # References: https://github.com/numpy/numpy/blob/master/numpy/lib/format.py 23 | @classmethod 24 | def _read_npy(cls, file_path): 25 | return np.load(file_path) 26 | 27 | # References: https://github.com/dimatura/pypcd/blob/master/pypcd/pypcd.py#L275 28 | # Support PCD files without compression ONLY! 
29 | @classmethod 30 | def _read_pcd(cls, file_path): 31 | pc = open3d.io.read_point_cloud(file_path) 32 | ptcloud = np.array(pc.points) 33 | return ptcloud 34 | 35 | @classmethod 36 | def _read_txt(cls, file_path): 37 | return np.loadtxt(file_path) 38 | 39 | @classmethod 40 | def _read_h5(cls, file_path): 41 | with h5py.File(file_path, 'r') as f: 42 | return f['data'][()] -------------------------------------------------------------------------------- /src/tools/Octavius/octavius_ChEF.sh: -------------------------------------------------------------------------------- 1 | # choose one model_cfg from 2d, 3d, 2d+3d 2 | model_cfg=config/ChEF/models/octavius_2d.yaml 3 | recipe_cfg_list=(CIFAR10 Flickr30k CelebA_hair CelebA_smile VOC2012 ScienceQA) 4 | partition=$1  # Slurm partition, passed as the first argument 5 | for dataset in ${recipe_cfg_list[*]}; do 6 | srun -p ${partition} --gres=gpu:1 --ntasks-per-node=1 --kill-on-bad-exit \ 7 | python eval.py \ 8 | --model_cfg ${model_cfg} \ 9 | --recipe_cfg config/ChEF/scenario_recipes/LAMM/${dataset}.yaml 10 | done -------------------------------------------------------------------------------- /src/tools/Octavius/train_octavius_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | partition=$1 4 | numgpu=$2 5 | config=$3 6 | exp=$4 7 | visfeat_type=local 8 | 9 | now=$(date +"%Y%m%d_%H%M%S") 10 | ckpt_dir=../ckpt 11 | mkdir -p ${ckpt_dir}/${exp}/log_rest/ 12 | 13 | srun -p ${partition} -J ${exp} --gres=gpu:${numgpu} --ntasks-per-node 1 --kill-on-bad-exit \ 14 | torchrun --nnodes=1 --nproc_per_node=${numgpu} --master_port=25440 train.py \ 15 | --stage 1 \ 16 | --cfg ${config} \ 17 | --conv_template default \ 18 | --max_tgt_len 400 \ 19 | --use_system \ 20 | --model octavius \ 21 | --encoder_pretrain clip \ 22 | --llm_ckpt_path ../model_zoo/vicuna_ckpt/13b_v0/ \ 23 | --vision_feature_type ${visfeat_type} \ 24 | --num_vision_token 256 \ 25 | --save_path ${ckpt_dir}/${exp} \ 26 | --log_path ${ckpt_dir}/${exp}/log_rest/ \ 27 | 2>&1 | tee ${ckpt_dir}/${exp}/log_rest/train_${now}.log 28 | --------------------------------------------------------------------------------
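The Octavius launchers above take all cluster settings as positional arguments and use relative paths (../ckpt, ../model_zoo, ./config/...), which suggests they are invoked from src/. A minimal launch sketch under that assumption follows; the partition name (my_partition), GPU count (8), experiment tag (octavius_exp), and the Octavius training config path are hypothetical placeholders to substitute with your own values, not paths defined in this listing:

# run from the repository's src/ directory so the scripts' relative paths resolve
cd src
# stage-1 Octavius training: <partition> <num_gpus> <config> <exp_name>; config path is a placeholder
bash tools/Octavius/train_octavius_slurm.sh my_partition 8 config/Octavius/octavius_2d.yaml octavius_exp
# ChEF evaluation over the recipes listed in octavius_ChEF.sh, on the same (placeholder) partition
bash tools/Octavius/octavius_ChEF.sh my_partition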