├── lmms-eval ├── lmms_eval │ ├── __init__.py │ ├── api │ │ ├── __init__.py │ │ ├── instance.py │ │ └── filter.py │ ├── models │ │ ├── model_utils │ │ │ └── __init__.py │ │ └── __init__.py │ ├── tasks │ │ ├── _task_utils │ │ │ ├── gpt_eval_utils.py │ │ │ └── file_utils.py │ │ ├── multilingual-llava-bench-in-the-wild │ │ │ ├── README.md │ │ │ ├── urdu_llava_in_the_wild.yaml │ │ │ ├── arabic_llava_in_the_wild.yaml │ │ │ ├── french_llava_in_the_wild.yaml │ │ │ ├── hindi_llava_in_the_wild.yaml │ │ │ ├── spanish_llava_in_the_wild.yaml │ │ │ ├── bengali_llava_in_the_wild.yaml │ │ │ ├── chinese_llava_in_the_wild.yaml │ │ │ ├── russian_llava_in_the_wild.yaml │ │ │ ├── japanese_llava_in_the_wild.yaml │ │ │ └── _default_template.yaml │ │ ├── ok_vqa │ │ │ ├── _ok_vqa.yaml │ │ │ ├── ok_vqa_val2014.yaml │ │ │ ├── _default_template_vqa_yaml │ │ │ ├── _generate_config.py │ │ │ └── utils.py │ │ ├── flickr30k │ │ │ ├── flickr30k.yaml │ │ │ └── flickr30k_test.yaml │ │ ├── mmmu │ │ │ ├── mmmu.yaml │ │ │ ├── mmmu_test.yaml │ │ │ └── mmmu_val.yaml │ │ ├── vqav2 │ │ │ ├── _vqav2.yaml │ │ │ ├── vqav2_test.yaml │ │ │ ├── vqav2_val.yaml │ │ │ └── _default_template_vqav2_yaml │ │ ├── cmmmu │ │ │ ├── _cmmmu.yaml │ │ │ ├── _default_template_cmmmu_yaml │ │ │ ├── cmmmu_test.yaml │ │ │ └── cmmmu_val.yaml │ │ ├── docvqa │ │ │ ├── docvqa.yaml │ │ │ ├── docvqa_val.yaml │ │ │ ├── docvqa_test.yaml │ │ │ ├── _default_template_docvqa_yaml │ │ │ └── utils.py │ │ ├── iconqa │ │ │ ├── iconqa.yaml │ │ │ ├── iconqa_test.yaml │ │ │ ├── iconqa_val.yaml │ │ │ ├── _default_template_docvqa_yaml │ │ │ └── utils.py │ │ ├── nocaps │ │ │ ├── nocaps.yaml │ │ │ ├── _default_template_nocaps_yaml │ │ │ ├── nocaps_test.yaml │ │ │ └── nocaps_val.yaml │ │ ├── textvqa │ │ │ ├── _textvqa.yaml │ │ │ ├── textvqa_test.yaml │ │ │ ├── textvqa_val.yaml │ │ │ └── _default_template_textvqa_yaml │ │ ├── websrc │ │ │ ├── websrc.yaml │ │ │ ├── websrc_val.yaml │ │ │ ├── websrc_test.yaml │ │ │ └── README.md │ │ ├── infovqa │ │ │ ├── infovqa.yaml │ │ │ ├── infovqa_val.yaml │ │ │ ├── infovqa_test.yaml │ │ │ ├── _default_template_infovqa_yaml │ │ │ └── utils.py │ │ ├── textcaps │ │ │ ├── textcaps.yaml │ │ │ ├── _default_template_textcaps_yaml │ │ │ ├── textcaps_test.yaml │ │ │ ├── textcaps_val.yaml │ │ │ └── textcaps_train.yaml │ │ ├── vizwiz_vqa │ │ │ ├── _vizwiz_vqa.yaml │ │ │ ├── vizwiz_vqa_val.yaml │ │ │ ├── vizwiz_vqa_test.yaml │ │ │ ├── _default_template_vqa_yaml │ │ │ ├── _generate_config.py │ │ │ └── utils.py │ │ ├── pope │ │ │ ├── pope_full.yaml │ │ │ ├── pope.yaml │ │ │ ├── pope_adv.yaml │ │ │ ├── pope_pop.yaml │ │ │ └── pope_random.yaml │ │ ├── scienceqa │ │ │ ├── scienceqa_full.yaml │ │ │ ├── scienceqa.yaml │ │ │ ├── scienceqa_img.yaml │ │ │ └── utils.py │ │ ├── coco_cap │ │ │ ├── coco2014_cap.yaml │ │ │ ├── coco2017_cap.yaml │ │ │ ├── coco_cap.yaml │ │ │ ├── coco2017_cap_test.yaml │ │ │ ├── coco2014_cap_test.yaml │ │ │ ├── coco2017_cap_val.yaml │ │ │ └── coco2014_cap_val.yaml │ │ ├── multidocvqa │ │ │ ├── multidocvqa.yaml │ │ │ ├── multidocvqa_test.yaml │ │ │ └── multidocvqa_val.yaml │ │ ├── screenspot │ │ │ ├── _screenspot.yaml │ │ │ ├── screenspot_rec_test.yaml │ │ │ ├── screenspot_reg_test.yaml │ │ │ ├── _default_template_reg_yaml │ │ │ └── _default_template_rec_yaml │ │ ├── refcoco │ │ │ ├── refcoco_seg_val.yaml │ │ │ ├── refcoco_bbox_val.yaml │ │ │ ├── refcoco_seg_test.yaml │ │ │ ├── refcoco_bbox_test.yaml │ │ │ ├── refcoco_bbox_testA.yaml │ │ │ ├── refcoco_bbox_testB.yaml │ │ │ ├── refcoco_seg_testA.yaml │ │ │ ├── refcoco_seg_testB.yaml │ │ │ ├── 
refcoco_bbox_rec_val.yaml │ │ │ ├── refcoco_bbox_rec_test.yaml │ │ │ ├── refcoco_bbox_rec_testA.yaml │ │ │ ├── refcoco_bbox_rec_testB.yaml │ │ │ ├── _refcoco.yaml │ │ │ ├── _generate_config.py │ │ │ ├── _default_template_bbox_yaml │ │ │ ├── _default_template_seg_yaml │ │ │ └── _default_template_bbox_rec_yaml │ │ ├── refcoco+ │ │ │ ├── refcoco+_seg_val.yaml │ │ │ ├── refcoco+_bbox_val.yaml │ │ │ ├── refcoco+_seg_testA.yaml │ │ │ ├── refcoco+_seg_testB.yaml │ │ │ ├── refcoco+_bbox_testA.yaml │ │ │ ├── refcoco+_bbox_testB.yaml │ │ │ ├── refcoco+_bbox_rec_val.yaml │ │ │ ├── refcoco+_bbox_rec_testA.yaml │ │ │ ├── refcoco+_bbox_rec_testB.yaml │ │ │ ├── _refcoco.yaml │ │ │ ├── _generate_config.py │ │ │ ├── _default_template_bbox_yaml │ │ │ ├── _default_template_seg_yaml │ │ │ └── _default_template_bbox_rec_yaml │ │ ├── refcocog │ │ │ ├── refcocog_seg_val.yaml │ │ │ ├── refcocog_bbox_val.yaml │ │ │ ├── refcocog_seg_test.yaml │ │ │ ├── _refcoco.yaml │ │ │ ├── refcocog_bbox_test.yaml │ │ │ ├── refcocog_bbox_rec_val.yaml │ │ │ ├── refcocog_bbox_rec_test.yaml │ │ │ ├── _generate_config.py │ │ │ ├── _default_template_seg_yaml │ │ │ ├── _default_template_bbox_yaml │ │ │ └── _default_template_bbox_rec_yaml │ │ ├── olympiadbench │ │ │ ├── olympiadbench.yaml │ │ │ ├── olympiadbench_test_cn.yaml │ │ │ ├── olympiadbench_test_en.yaml │ │ │ └── cn_utils.py │ │ ├── mathvista │ │ │ ├── mathvista.yaml │ │ │ ├── mathvista_test.yaml │ │ │ └── mathvista_testmini.yaml │ │ ├── mmbench │ │ │ ├── mmbench_cn.yaml │ │ │ ├── mmbench_en.yaml │ │ │ ├── mmbench_cn_test.yaml │ │ │ ├── mmbench_en_test.yaml │ │ │ ├── mmbench.yaml │ │ │ ├── mmbench_cn_dev.yaml │ │ │ ├── mmbench_en_dev.yaml │ │ │ ├── _default_template_mmbench_cn_yaml │ │ │ ├── _default_template_mmbench_en_yaml │ │ │ └── mmbench_cc.yaml │ │ ├── mathverse │ │ │ ├── mathverse.yaml │ │ │ ├── mathverse_testmini.yaml │ │ │ ├── mathverse_testmini_text_lite.yaml │ │ │ ├── mathverse_testmini_text_only.yaml │ │ │ ├── mathverse_testmini_vision_only.yaml │ │ │ ├── mathverse_testmini_text_dominant.yaml │ │ │ ├── mathverse_testmini_vision_dominant.yaml │ │ │ └── mathverse_testmini_vision_intensive.yaml │ │ ├── seedbench │ │ │ ├── seedbench_ppl.yaml │ │ │ ├── seedbench.yaml │ │ │ └── utils.py │ │ ├── stvqa │ │ │ ├── stvqa.yaml │ │ │ └── utils.py │ │ ├── ocrbench │ │ │ └── ocrbench.yaml │ │ ├── gqa │ │ │ ├── gqa.yaml │ │ │ └── utils.py │ │ ├── mmvet │ │ │ └── mmvet.yaml │ │ ├── chartqa │ │ │ ├── chartqa.yaml │ │ │ └── utils.py │ │ ├── ai2d │ │ │ ├── ai2d.yaml │ │ │ └── utils.py │ │ ├── realworldqa │ │ │ └── realworldqa.yaml │ │ ├── mme │ │ │ └── mme.yaml │ │ ├── llava-bench-coco │ │ │ └── llava-bench-coco.yaml │ │ ├── ferret │ │ │ └── ferret.yaml │ │ ├── llava-in-the-wild │ │ │ └── llava-in-the-wild.yaml │ │ ├── hallusion_bench │ │ │ └── hallusion_bench_image.yaml │ │ ├── seedbench_2 │ │ │ ├── seedbench_2.yaml │ │ │ └── utils.py │ │ └── naturalbench │ │ │ └── naturalbench.yaml │ └── filters │ │ ├── decontamination.py │ │ ├── transformation.py │ │ ├── __init__.py │ │ └── selection.py ├── miscs │ ├── llava_result_check.md │ ├── test_scienceqa.py │ ├── repr_scripts.sh │ ├── script.sh │ └── test_llava.py ├── setup.py ├── docs │ └── README.md ├── llava_repr_requirements.txt └── pyproject.toml ├── ola_vlm ├── eval │ ├── mmstar │ │ ├── evaluate │ │ │ └── __init__.py │ │ └── smp │ │ │ ├── __init__.py │ │ │ └── log.py │ ├── eval_mmstar.py │ ├── merge_json.py │ ├── get_probe_dsg_scores.py │ └── eval_cv_bench.py ├── __init__.py ├── model │ ├── aux_heads │ │ ├── __init__.py │ │ ├── 
depth_anything_v2 │ │ │ └── dinov2_layers │ │ │ │ ├── __init__.py │ │ │ │ ├── layer_scale.py │ │ │ │ ├── drop_path.py │ │ │ │ ├── mlp.py │ │ │ │ └── swiglu_ffn.py │ │ └── gen_head.py │ ├── __init__.py │ ├── multimodal_encoder │ │ └── builder.py │ ├── utils.py │ ├── consolidate.py │ ├── apply_delta.py │ ├── multimodal_projector │ │ └── builder.py │ └── make_delta.py ├── train │ ├── train_mem.py │ ├── ola_vlm_train_mem.py │ └── probe_dsg_train_mem.py └── constants.py ├── assets ├── pb.jpg ├── arch.png ├── cars.jpg ├── teaser.png └── probe_plots.png ├── datasets └── ocr_vqa │ └── dataset.json ├── .gitignore ├── scripts ├── probe │ ├── eval_probe_task.sh │ ├── eval_probe_cos_sim.sh │ └── probe.sh ├── zero2.json ├── zero2_offload.json ├── zero3.json ├── eval │ ├── mmstar.sh │ └── cv-bench.sh ├── zero3_offload.json └── train │ ├── finetune.sh │ ├── vpt_ift.sh │ ├── vpt.sh │ └── pretrain.sh ├── .gitattributes ├── docs └── Evaluation.md └── setup.py /lmms-eval/lmms_eval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lmms-eval/miscs/llava_result_check.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/models/model_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/_task_utils/gpt_eval_utils.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ola_vlm/eval/mmstar/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | from .mmstar import MMStar_eval -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /assets/pb.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SHI-Labs/VisPer-LM/HEAD/assets/pb.jpg -------------------------------------------------------------------------------- /assets/arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SHI-Labs/VisPer-LM/HEAD/assets/arch.png -------------------------------------------------------------------------------- /assets/cars.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SHI-Labs/VisPer-LM/HEAD/assets/cars.jpg -------------------------------------------------------------------------------- /assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SHI-Labs/VisPer-LM/HEAD/assets/teaser.png -------------------------------------------------------------------------------- 
/lmms-eval/lmms_eval/tasks/ok_vqa/_ok_vqa.yaml: -------------------------------------------------------------------------------- 1 | group: ok_vqa 2 | task: 3 | - ok_vqa_val2014 -------------------------------------------------------------------------------- /assets/probe_plots.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SHI-Labs/VisPer-LM/HEAD/assets/probe_plots.png -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/flickr30k/flickr30k.yaml: -------------------------------------------------------------------------------- 1 | group: flickr30k 2 | task: 3 | - flickr30k_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmmu/mmmu.yaml: -------------------------------------------------------------------------------- 1 | group: mmmu 2 | task: 3 | - mmmu_val 4 | - mmmu_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vqav2/_vqav2.yaml: -------------------------------------------------------------------------------- 1 | group: vqav2 2 | task: 3 | - vqav2_val 4 | - vqav2_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/cmmmu/_cmmmu.yaml: -------------------------------------------------------------------------------- 1 | group: cmmmu 2 | task: 3 | - cmmmu_val 4 | - cmmmu_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/docvqa/docvqa.yaml: -------------------------------------------------------------------------------- 1 | group: docvqa 2 | task: 3 | - docvqa_val 4 | - docvqa_test -------------------------------------------------------------------------------- /ola_vlm/eval/mmstar/smp/__init__.py: -------------------------------------------------------------------------------- 1 | from .file import * 2 | from .misc import * 3 | from .log import * -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/iconqa/iconqa.yaml: -------------------------------------------------------------------------------- 1 | group: iconqa 2 | task: 3 | - iconqa_val 4 | - iconqa_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/nocaps/nocaps.yaml: -------------------------------------------------------------------------------- 1 | group : nocaps 2 | task: 3 | - nocaps_test 4 | - nocaps_val -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textvqa/_textvqa.yaml: -------------------------------------------------------------------------------- 1 | group: textvqa 2 | task: 3 | - textvqa_val 4 | - textvqa_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/websrc/websrc.yaml: -------------------------------------------------------------------------------- 1 | group: websrc 2 | task: 3 | - websrc_val 4 | - websrc_test 5 | -------------------------------------------------------------------------------- /ola_vlm/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM 2 | from .model import LlavaPhi3ForCausalLM 3 | -------------------------------------------------------------------------------- 
/lmms-eval/lmms_eval/tasks/infovqa/infovqa.yaml: -------------------------------------------------------------------------------- 1 | group: infovqa 2 | task: 3 | - infovqa_val 4 | - infovqa_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textcaps/textcaps.yaml: -------------------------------------------------------------------------------- 1 | group : textcaps 2 | task: 3 | - textcaps_val 4 | - textcaps_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/_vizwiz_vqa.yaml: -------------------------------------------------------------------------------- 1 | group: vizwiz_vqa 2 | task: 3 | - vizwiz_vqa_val 4 | - vizwiz_vqa_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/pope/pope_full.yaml: -------------------------------------------------------------------------------- 1 | group : pope_full 2 | task: 3 | - pope_adv 4 | - pope_pop 5 | - pope_random -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/scienceqa/scienceqa_full.yaml: -------------------------------------------------------------------------------- 1 | group: scienceqa_full 2 | task: 3 | - scienceqa 4 | - scienceqa_img -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2014_cap.yaml: -------------------------------------------------------------------------------- 1 | group : coco2014_cap 2 | task: 3 | - coco2014_cap_val 4 | - coco2014_cap_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2017_cap.yaml: -------------------------------------------------------------------------------- 1 | group : coco2017_cap 2 | task: 3 | - coco2017_cap_val 4 | - coco2017_cap_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/iconqa/iconqa_test.yaml: -------------------------------------------------------------------------------- 1 | task: "iconqa_test" 2 | test_split: test 3 | include: _default_template_docvqa_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/iconqa/iconqa_val.yaml: -------------------------------------------------------------------------------- 1 | task: "iconqa_val" 2 | test_split: val 3 | include: _default_template_docvqa_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multidocvqa/multidocvqa.yaml: -------------------------------------------------------------------------------- 1 | group: multidocvqa 2 | task: 3 | - multidocvqa_val 4 | - multidocvqa_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/screenspot/_screenspot.yaml: -------------------------------------------------------------------------------- 1 | group: screenspot 2 | task: 3 | - screenspot_reg_test 4 | - screenspot_rec_test -------------------------------------------------------------------------------- /lmms-eval/setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | # This is to make sure that the package supports editable installs 4 | setuptools.setup() 5 | -------------------------------------------------------------------------------- 
/lmms-eval/lmms_eval/tasks/ok_vqa/ok_vqa_val2014.yaml: -------------------------------------------------------------------------------- 1 | group: ok_vqa 2 | task: ok_vqa_val2014 3 | test_split: val2014 4 | include: _default_template_vqa_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_seg 2 | task: refcoco_seg_val 3 | test_split: val 4 | include: _default_template_seg_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_seg_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_seg 2 | task: refcoco+_seg_val 3 | include: _default_template_seg_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox 2 | task: refcoco_bbox_val 3 | test_split: val 4 | include: _default_template_bbox_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_seg 2 | task: refcoco_seg_test 3 | test_split: test 4 | include: _default_template_seg_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_seg_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_seg 2 | task: refcocog_seg_val 3 | include: _default_template_seg_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox 2 | task: refcoco+_bbox_val 3 | include: _default_template_bbox_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_seg_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_seg 2 | task: refcoco+_seg_testA 3 | include: _default_template_seg_yaml 4 | test_split: testA 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_seg_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_seg 2 | task: refcoco+_seg_testB 3 | include: _default_template_seg_yaml 4 | test_split: testB 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox 2 | task: refcoco_bbox_test 3 | test_split: test 4 | include: _default_template_bbox_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox 2 | task: 
refcoco_bbox_testA 3 | test_split: testA 4 | include: _default_template_bbox_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox 2 | task: refcoco_bbox_testB 3 | test_split: testB 4 | include: _default_template_bbox_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_seg 2 | task: refcoco_seg_testA 3 | test_split: testA 4 | include: _default_template_seg_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_seg 2 | task: refcoco_seg_testB 3 | test_split: testB 4 | include: _default_template_seg_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_bbox_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_bbox 2 | task: refcocog_bbox_val 3 | include: _default_template_bbox_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_seg_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_seg 2 | task: refcocog_seg_test 3 | include: _default_template_seg_yaml 4 | test_split: test 5 | -------------------------------------------------------------------------------- /datasets/ocr_vqa/dataset.json: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c9d2bb4c67462e2649be5099a3b790c95ad073fe46243310b79a1d4c8bee75ed 3 | size 112962519 4 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox 2 | task: refcoco+_bbox_testA 3 | include: _default_template_bbox_yaml 4 | test_split: testA 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox 2 | task: refcoco+_bbox_testB 3 | include: _default_template_bbox_yaml 4 | test_split: testB 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/_refcoco.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog 2 | task: 3 | - refcocog_seg_test 4 | - refcocog_seg_val 5 | - refcocog_bbox_test 6 | - refcocog_bbox_val 7 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_bbox_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_bbox 2 | task: refcocog_bbox_test 3 | include: _default_template_bbox_yaml 4 | test_split: test 5 | 
-------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/screenspot/screenspot_rec_test.yaml: -------------------------------------------------------------------------------- 1 | group: screenspot_rec 2 | task: screenspot_rec_test 3 | include: _default_template_rec_yaml 4 | test_split: test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/screenspot/screenspot_reg_test.yaml: -------------------------------------------------------------------------------- 1 | group: screenspot_reg 2 | task: screenspot_reg_test 3 | include: _default_template_reg_yaml 4 | test_split: test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco_cap.yaml: -------------------------------------------------------------------------------- 1 | group : coco_cap 2 | task: 3 | - coco2014_cap_val 4 | - coco2014_cap_test 5 | - coco2017_cap_val 6 | - coco2017_cap_test 7 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/nocaps/_default_template_nocaps_yaml: -------------------------------------------------------------------------------- 1 | model_specific_prompt_kwargs: 2 | default: 3 | prompt: "Provide a one-sentence caption for the provided image." -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_rec_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox_rec 2 | task: refcoco_bbox_rec_val 3 | test_split: val 4 | include: _default_template_bbox_rec_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textcaps/_default_template_textcaps_yaml: -------------------------------------------------------------------------------- 1 | model_specific_prompt_kwargs: 2 | default: 3 | prompt: Provide a one-sentence caption for the provided image. 
-------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/olympiadbench/olympiadbench.yaml: -------------------------------------------------------------------------------- 1 | group: olympiadbench 2 | task: 3 | - olympiadbench_test_en 4 | - olympiadbench_test_cn 5 | metadata: 6 | - version: 0.0 7 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_rec_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox_rec 2 | task: refcoco+_bbox_rec_val 3 | include: _default_template_bbox_rec_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_rec_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox_rec 2 | task: refcoco_bbox_rec_test 3 | test_split: test 4 | include: _default_template_bbox_rec_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_bbox_rec_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_bbox_rec 2 | task: refcocog_bbox_rec_val 3 | include: _default_template_bbox_rec_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_rec_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox_rec 2 | task: refcoco+_bbox_rec_testA 3 | include: _default_template_bbox_rec_yaml 4 | test_split: testA 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_rec_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox_rec 2 | task: refcoco+_bbox_rec_testB 3 | include: _default_template_bbox_rec_yaml 4 | test_split: testB 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_rec_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox_rec 2 | task: refcoco_bbox_rec_testA 3 | test_split: testA 4 | include: _default_template_bbox_rec_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_rec_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox_rec 2 | task: refcoco_bbox_rec_testB 3 | test_split: testB 4 | include: _default_template_bbox_rec_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_bbox_rec_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_bbox_rec 2 | task: refcocog_bbox_rec_test 3 | include: _default_template_bbox_rec_yaml 4 | test_split: test 5 | -------------------------------------------------------------------------------- /lmms-eval/miscs/test_scienceqa.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | dataset = load_dataset("Otter-AI/ScienceQA", trust_remote_code=True)["test"] 4 | for doc in 
dataset: 5 | print(doc["id"]) 6 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathvista/mathvista.yaml: -------------------------------------------------------------------------------- 1 | group: mathvista 2 | task: 3 | - mathvista_testmini 4 | - mathvista_test 5 | metadata: 6 | version: 0.0 7 | gpt_eval_model_name: "gpt-4-0613" 8 | quick_extract: false -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/_refcoco.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+ 2 | task: 3 | - refcoco+_seg_val 4 | - refcoco+_seg_testA 5 | - refcoco+_seg_testB 6 | - refcoco+_bbox_val 7 | - refcoco+_bbox_testA 8 | - refcoco+_bbox_testB 9 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/docvqa/docvqa_val.yaml: -------------------------------------------------------------------------------- 1 | task: "docvqa_val" 2 | test_split: validation 3 | metric_list: 4 | - metric: anls 5 | aggregation: mean 6 | higher_is_better: true 7 | include: _default_template_docvqa_yaml 8 | -------------------------------------------------------------------------------- /ola_vlm/model/aux_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .da_v2_head import DepthHead, DAv2_Head, DepthProbeHead, TaskTokenDepthHead 2 | from .oneformer_head import OneFormerSegHead, OneFormerTaskTokenSegHead 3 | from .gen_head import GenHead, TaskTokenGenHead -------------------------------------------------------------------------------- /ola_vlm/train/train_mem.py: -------------------------------------------------------------------------------- 1 | from ola_vlm.train.train import train 2 | 3 | if __name__ == "__main__": 4 | try: 5 | train(attn_implementation="flash_attention_2") 6 | except: 7 | train(attn_implementation="eager") 8 | -------------------------------------------------------------------------------- /ola_vlm/train/ola_vlm_train_mem.py: -------------------------------------------------------------------------------- 1 | from ola_vlm.train.ola_vlm_train import train 2 | 3 | if __name__ == "__main__": 4 | try: 5 | train(attn_implementation="flash_attention_2") 6 | except: 7 | train(attn_implementation="eager") -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_cn.yaml: -------------------------------------------------------------------------------- 1 | group: mmbench_cn 2 | task: 3 | - mmbench_cn_dev 4 | - mmbench_cn_test 5 | - mmbench_cn_cc 6 | metadata: 7 | version: 0.0 8 | gpt_eval_model_name: "gpt-3.5-turbo-0613" 9 | sys_prompt: "有如下几个选项:" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_en.yaml: -------------------------------------------------------------------------------- 1 | group: mmbench_en 2 | task: 3 | - mmbench_en_dev 4 | - mmbench_en_test 5 | metadata: 6 | version: 0.0 7 | sys_prompt: "There are several options:" 8 | gpt_eval_model_name: "gpt-3.5-turbo-0613" 9 | -------------------------------------------------------------------------------- /ola_vlm/train/probe_dsg_train_mem.py: -------------------------------------------------------------------------------- 1 | from ola_vlm.train.probe_dsg_train import train 2 | 3 | if __name__ == "__main__": 4 | try: 5 | 
train(attn_implementation="flash_attention_2") 6 | except: 7 | train(attn_implementation="eager") -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/infovqa/infovqa_val.yaml: -------------------------------------------------------------------------------- 1 | task: "infovqa_val" 2 | test_split: validation 3 | output_type: generate_until 4 | metric_list: 5 | - metric: anls 6 | aggregation: mean 7 | higher_is_better: true 8 | include: _default_template_infovqa_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/urdu_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: urdu 4 | token: True 5 | task: "llava_in_the_wild_urdu" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/arabic_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: arabic 4 | token: True 5 | task: "llava_in_the_wild_arabic" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/french_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: french 4 | token: True 5 | task: "llava_in_the_wild_french" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/hindi_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: hindi 4 | token: True 5 | task: "llava_in_the_wild_hindi" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/spanish_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: spanish 4 | token: True 5 | task: "llava_in_the_wild_spanish" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/_refcoco.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco 2 | task: 3 | - refcoco_seg_test 4 | - refcoco_seg_val 5 | - refcoco_seg_testA 6 | - refcoco_seg_testB 7 | - refcoco_bbox_test 8 | - refcoco_bbox_val 9 | - refcoco_bbox_testA 10 | - refcoco_bbox_testB 11 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/bengali_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: bengali 4 | token: True 5 | task: 
"llava_in_the_wild_bengali" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/chinese_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: chinese 4 | token: True 5 | task: "llava_in_the_wild_chinese" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/russian_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: russian 4 | token: True 5 | task: "llava_in_the_wild_russian" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textvqa/textvqa_test.yaml: -------------------------------------------------------------------------------- 1 | task: textvqa_test 2 | test_split: test 3 | metric_list: 4 | - metric: submission 5 | aggregation: !function utils.textvqa_aggreate_submissions 6 | higher_is_better: true 7 | include: _default_template_textvqa_yaml 8 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/japanese_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: japanese 4 | token: True 5 | task: "llava_in_the_wild_japanese" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_cn_test.yaml: -------------------------------------------------------------------------------- 1 | task: mmbench_cn_test 2 | test_split: test 3 | metric_list: 4 | - metric: submission 5 | aggregation: !function cn_utils.mmbench_aggregate_test_results 6 | higher_is_better: true 7 | include: _default_template_mmbench_cn_yaml 8 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_en_test.yaml: -------------------------------------------------------------------------------- 1 | task: "mmbench_en_test" 2 | test_split: test 3 | include: _default_template_mmbench_en_yaml 4 | metric_list: 5 | - metric: submission 6 | aggregation: !function en_utils.mmbench_aggregate_test_results 7 | higher_is_better: true 8 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/_task_utils/file_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def generate_submission_file(file_name, args, subpath="submissions"): 5 | path = os.path.join(args.output_path, subpath) 6 | os.makedirs(path, exist_ok=True) 7 | path = os.path.join(path, file_name) 8 | return os.path.abspath(path) 9 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench.yaml: -------------------------------------------------------------------------------- 1 | group: mmbench 2 | task: 3 | - mmbench_en_dev 4 | - mmbench_en_test 5 | - mmbench_cn_dev 6 | - mmbench_cn_test 7 | - 
mmbench_cn_cc 8 | metadata: 9 | version: 0.0 10 | sys_prompt: "There are several options:" 11 | gpt_eval_model_name: "gpt-3.5-turbo-0613" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/cmmmu/_default_template_cmmmu_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/CMMMU 2 | output_type: generate_until 3 | doc_to_visual: !function utils.cmmmu_doc_to_visual 4 | doc_to_text: !function utils.cmmmu_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | max_new_tokens: 16 8 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vqav2/vqav2_test.yaml: -------------------------------------------------------------------------------- 1 | task: "vqav2_test" 2 | include: _default_template_vqav2_yaml 3 | test_split: test 4 | metric_list: 5 | - metric: submission 6 | aggregation: !function utils.vqav2_aggreate_submissions 7 | higher_is_better: true 8 | process_results: !function utils.vqav2_process_results_test 9 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/docvqa/docvqa_test.yaml: -------------------------------------------------------------------------------- 1 | task: "docvqa_test" 2 | test_split: test 3 | process_results: !function utils.docvqa_test_process_results 4 | metric_list: 5 | - metric: submission 6 | aggregation: !function utils.docvqa_test_aggregate_results 7 | higher_is_better: true 8 | include: _default_template_docvqa_yaml 9 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vqav2/vqav2_val.yaml: -------------------------------------------------------------------------------- 1 | task: "vqav2_val" 2 | include: _default_template_vqav2_yaml 3 | test_split: validation 4 | metric_list: 5 | - metric: exact_match 6 | aggregation: mean 7 | higher_is_better: true 8 | ignore_case: true 9 | ignore_punctuation: true 10 | process_results: !function utils.vqav2_process_results_val 11 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/infovqa/infovqa_test.yaml: -------------------------------------------------------------------------------- 1 | task: "infovqa_test" 2 | test_split: test 3 | output_type: generate_until 4 | process_results: !function utils.infovqa_test_process_results 5 | metric_list: 6 | - metric: submission 7 | aggregation: !function utils.infovqa_test_aggregate_results 8 | higher_is_better: true 9 | include: _default_template_infovqa_yaml 10 | -------------------------------------------------------------------------------- /ola_vlm/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 
5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "<image>" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 11 | DEFAULT_IM_START_TOKEN = "<im_start>" 12 | DEFAULT_IM_END_TOKEN = "<im_end>" 13 | IMAGE_PLACEHOLDER = "<image-placeholder>" 14 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml: -------------------------------------------------------------------------------- 1 | task: "mmbench_cn_dev" 2 | test_split: "dev" 3 | metric_list: 4 | - metric: gpt_eval_score 5 | aggregation: !function cn_utils.mmbench_aggregate_dev_results_eval 6 | higher_is_better: true 7 | - metric: submission 8 | higher_is_better: true 9 | aggregation: !function cn_utils.mmbench_aggregate_dev_results 10 | include: _default_template_mmbench_cn_yaml 11 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textvqa/textvqa_val.yaml: -------------------------------------------------------------------------------- 1 | task: textvqa_val 2 | test_split: validation 3 | metric_list: 4 | - metric: exact_match 5 | aggregation: mean 6 | higher_is_better: true 7 | ignore_case: true 8 | ignore_punctuation: true 9 | - metric: submission 10 | aggregation: !function utils.textvqa_aggreate_submissions 11 | higher_is_better: true 12 | include: _default_template_textvqa_yaml 13 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_en_dev.yaml: -------------------------------------------------------------------------------- 1 | task: "mmbench_en_dev" 2 | test_split: dev 3 | include: _default_template_mmbench_en_yaml 4 | metric_list: 5 | - metric: gpt_eval_score 6 | aggregation: !function en_utils.mmbench_aggregate_dev_results_eval 7 | higher_is_better: true 8 | - metric: submission 9 | aggregation: !function en_utils.mmbench_aggregate_dev_results_submission 10 | higher_is_better: true -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__ 3 | *.pyc 4 | *.egg-info 5 | dist 6 | 7 | # Log 8 | *.log 9 | *.log.* 10 | 11 | # Data 12 | !**/alpaca-data-conversation.json 13 | 14 | # Editor 15 | .idea 16 | *.swp 17 | 18 | # Other 19 | .DS_Store 20 | wandb 21 | output 22 | 23 | checkpoints 24 | ckpts* 25 | *.pth 26 | 27 | .ipynb_checkpoints 28 | *.ipynb 29 | 30 | # DevContainer 31 | !.devcontainer/* 32 | 33 | # Demo 34 | serve_images/ 35 | -------------------------------------------------------------------------------- /ola_vlm/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 2 | from .language_model.llava_phi3 import LlavaPhi3ForCausalLM, LlavaPhi3Config 3 | from .language_model.ola_llama import OlaLlavaLlamaForCausalLM, OlaLlavaLlamaConfig 4 | from .language_model.ola_phi3 import OlaLlavaPhi3ForCausalLM, OlaLlavaPhi3Config 5 | from .language_model.probe_llava_llama import ProbeDSGLlavaLlamaForCausalLM, ProbeDSGLlavaLlamaConfig -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_val.yaml: -------------------------------------------------------------------------------- 1 | group: vizwiz_vqa 2 | task: vizwiz_vqa_val 3 | test_split: val 4 | include: _default_template_vqa_yaml 5 |
metric_list: 6 | - metric: exact_match 7 | aggregation: mean 8 | higher_is_better: true 9 | ignore_case: true 10 | ignore_punctuation: true 11 | # - metric: submission 12 | # aggregation: !function utils.vizwiz_vqa_aggreate_submissions 13 | # higher_is_better: true -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse.yaml: -------------------------------------------------------------------------------- 1 | group: mathverse 2 | task: 3 | - mathverse_testmini 4 | - mathverse_testmini_text_only 5 | - mathverse_testmini_text_lite 6 | - mathverse_testmini_text_dominant 7 | - mathverse_testmini_vision_intensive 8 | - mathverse_testmini_vision_dominant 9 | - mathverse_testmini_vision_only 10 | metadata: 11 | version: 0.0 12 | gpt_eval_model_name: "gpt-3.5-turbo" 13 | trunk_response: 30 14 | quick_match: false -------------------------------------------------------------------------------- /ola_vlm/eval/eval_mmstar.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from ola_vlm.eval.mmstar.evaluate import MMStar_eval 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--results_file', type=str, default="./playground/data/eval/mmstar_results.jsonl") 11 | return parser.parse_args() 12 | 13 | 14 | if __name__ == '__main__': 15 | 16 | args = parse_args() 17 | MMStar_eval(args.results_file) 18 | -------------------------------------------------------------------------------- /ola_vlm/model/aux_heads/depth_anything_v2/dinov2_layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .mlp import Mlp 8 | from .patch_embed import PatchEmbed 9 | from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused 10 | from .block import NestedTensorBlock 11 | from .attention import MemEffAttention 12 | -------------------------------------------------------------------------------- /lmms-eval/docs/README.md: -------------------------------------------------------------------------------- 1 | # LMMs Eval Documentation 2 | 3 | Welcome to the docs for `lmms-eval`! 4 | 5 | The majority of this documentation is adapted from [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness/). 6 | 7 | ## Table of Contents 8 | 9 | * To learn about the command line flags, see the [commands](commands.md). 10 | * To learn how to add a new model, see the [Model Guide](model_guide.md). 11 | * For a crash course on adding new tasks to the library, see our [Task Guide](task_guide.md). -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vqav2/_default_template_vqav2_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/VQAv2 2 | dataset_kwargs: 3 | token: True 4 | output_type: generate_until 5 | doc_to_visual: !function utils.vqav2_doc_to_visual 6 | doc_to_text: !function utils.vqav2_doc_to_text 7 | doc_to_target: "answer" 8 | generation_kwargs: 9 | max_new_tokens: 16 10 | metadata: 11 | - version: 0.0 12 | model_specific_prompt_kwargs: 13 | default: 14 | pre_prompt: "" 15 | post_prompt: "\nAnswer the question using a single word or phrase."
-------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_test.yaml: -------------------------------------------------------------------------------- 1 | group: vizwiz_vqa 2 | task: vizwiz_vqa_test 3 | test_split: test 4 | include: _default_template_vqa_yaml 5 | process_results: !function utils.vizwiz_vqa_process_results 6 | metric_list: 7 | # - metric: exact_match 8 | # aggregation: mean 9 | # higher_is_better: true 10 | # ignore_case: true 11 | # ignore_punctuation: true 12 | - metric: submission 13 | aggregation: !function utils.vizwiz_vqa_aggreate_submissions 14 | higher_is_better: true 15 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/infovqa/_default_template_infovqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/DocVQA 2 | dataset_name: InfographicVQA 3 | dataset_kwargs: 4 | token: True 5 | doc_to_target: "answers" 6 | doc_to_visual: !function utils.infovqa_doc_to_visual 7 | doc_to_text: !function utils.infovqa_doc_to_text 8 | generation_kwargs: 9 | max_new_tokens: 32 10 | temperature: 0 11 | do_sample: False 12 | model_specific_prompt_kwargs: 13 | default: 14 | pre_prompt: "" 15 | post_prompt: "\nAnswer the question using a single word or phrase." -------------------------------------------------------------------------------- /scripts/probe/eval_probe_task.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | for IDX in $(seq 0 $((CHUNKS-1))); do 9 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m ola_vlm.eval.eval_probe_task \ 10 | --model-path $1 --json-file datasets/coco/annotations/captions_val2017.json \ 11 | --mode $2 --num-chunks $CHUNKS --chunk-idx $IDX & 12 | done 13 | 14 | wait 15 | 16 | python -m ola_vlm.eval.get_probe_task_scores --ckpt $1 --mode $2 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/screenspot/_default_template_reg_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: rootsautomation/ScreenSpot 2 | output_type: generate_until 3 | doc_to_visual: !function utils.screenspot_bbox_doc_to_visual 4 | doc_to_text: !function utils.screenspot_doc_to_text 5 | doc_to_target: "instruction" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.screenspot_process_result 10 | metric_list: 11 | - metric: screenspot_CIDEr 12 | aggregation : !function utils.screenspot_cider 13 | higher_is_better : true 14 | metadata: 15 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/cmmmu/cmmmu_test.yaml: -------------------------------------------------------------------------------- 1 | task: "cmmmu_test" 2 | test_split: test 3 | # The return value of process_results will be used by metrics 4 | process_results: !function utils.cmmmu_process_test_results_for_submission 5 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 6 | metric_list: 7 | - metric: submission 8 | aggregation: !function utils.cmmmu_test_aggregate_results_for_submission 9 | higher_is_better: false 10 | metadata: 11 | - version: 0.0 12 | include: 
_default_template_cmmmu_yaml 13 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textvqa/_default_template_textvqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/textvqa 2 | output_type: generate_until 3 | doc_to_visual: !function utils.textvqa_doc_to_visual 4 | doc_to_text: !function utils.textvqa_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.textvqa_process_results 10 | model_specific_prompt_kwargs: 11 | default: 12 | pre_prompt: "" 13 | post_prompt: "\nAnswer the question using a single word or phrase." 14 | ocr: false 15 | qwen_vl: 16 | pre_prompt: "" 17 | post_prompt: " Answer:" 18 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/seedbench/seedbench_ppl.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/SEED-Bench 2 | dataset_kwargs: 3 | token: True 4 | task: "seedbench_ppl" 5 | test_split: test 6 | output_type: multiple_choice 7 | doc_to_visual: !function utils.seed_doc_to_visual 8 | doc_to_text: !function utils.seed_doc_to_text_mc 9 | doc_to_choice : !function utils.seed_doc_to_choice 10 | doc_to_target: !function utils.seed_doc_to_mc_target 11 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 12 | metric_list: 13 | - metric: acc 14 | metadata: 15 | - version: 0.0 -------------------------------------------------------------------------------- /scripts/probe/eval_probe_cos_sim.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | for IDX in $(seq 0 $((CHUNKS-1))); do 9 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m ola_vlm.eval.eval_probe_dsg \ 10 | --model-path $1 --json-file datasets/coco/annotations/captions_val2017.json \ 11 | --mode $2 --num-chunks $CHUNKS --chunk-idx $IDX & 12 | done 13 | 14 | wait 15 | 16 | python -m ola_vlm.eval.merge_json --ckpt $1 --mode $2 --num-chunks $CHUNKS 17 | 18 | python -m ola_vlm.eval.get_probe_dsg_scores --ckpt $1 --mode $2 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/docvqa/_default_template_docvqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/DocVQA 2 | dataset_name: DocVQA 3 | dataset_kwargs: 4 | token: True 5 | output_type: generate_until 6 | doc_to_visual: !function utils.docvqa_doc_to_visual 7 | doc_to_text: !function utils.docvqa_doc_to_text 8 | doc_to_target: "answers" 9 | generation_kwargs: 10 | max_new_tokens: 32 11 | temperature: 0 12 | do_sample: False 13 | model_specific_prompt_kwargs: 14 | default: 15 | pre_prompt: "" 16 | post_prompt: "\nAnswer the question using a single word or phrase." 
17 | qwen_vl: 18 | pre_prompt: "" 19 | post_prompt: " Answer:" 20 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/cmmmu/cmmmu_val.yaml: -------------------------------------------------------------------------------- 1 | task: "cmmmu_val" 2 | test_split: val 3 | # The return value of process_results will be used by metrics 4 | process_results: !function utils.cmmmu_process_results 5 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 6 | generation_kwargs: 7 | max_new_tokens: 16 8 | image_aspect_ratio: original 9 | metric_list: 10 | - metric: cmmmu_acc 11 | aggregation: !function utils.cmmmu_aggregate_results 12 | higher_is_better: true 13 | metadata: 14 | - version: 0.0 15 | include: _default_template_cmmmu_yaml 16 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/_default_template_vqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/VizWiz-VQA 2 | output_type: generate_until 3 | doc_to_visual: !function utils.vizwiz_vqa_doc_to_visual 4 | doc_to_text: !function utils.vizwiz_vqa_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | metadata: 10 | - version: 0.0 11 | model_specific_prompt_kwargs: 12 | default: 13 | pre_prompt: "" 14 | post_prompt: "\nWhen the provided information is insufficient, respond with 'Unanswerable'.\nAnswer the question using a single word or phrase." 15 | process_results: !function utils.vizwiz_vqa_process_results 16 | -------------------------------------------------------------------------------- /scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": false, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/stvqa/stvqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ST-VQA 2 | task: "stvqa" 3 | test_split: test 4 | output_type: generate_until 5 | doc_to_visual: !function utils.stvqa_doc_to_visual 6 | doc_to_text: !function utils.stvqa_doc_to_text 7 | doc_to_target: "answers" 8 | generation_kwargs: 9 | max_new_tokens: 32 10 | temperature: 0 11 | do_sample: False 12 | process_results: !function utils.stvqa_process_results 13 | metric_list: 14 | - metric: submission 15 | aggregation: !function utils.stvqa_aggregate_submissions 16 | model_specific_prompt_kwargs: 17 | default: 18 | pre_prompt: "" 19 | post_prompt: "\nAnswer the question using a single word or phrase." 
20 | -------------------------------------------------------------------------------- /lmms-eval/miscs/repr_scripts.sh: -------------------------------------------------------------------------------- 1 | # install lmms_eval without building dependencies 2 | cd lmms_eval; 3 | pip install --no-deps -U -e . 4 | 5 | # install LLaVA without building dependencies 6 | cd LLaVA 7 | pip install --no-deps -U -e . 8 | 9 | # install all the requirements that require for reproduce llava results 10 | pip install -r llava_repr_requirements.txt 11 | 12 | # Run and exactly reproduce llava_v1.5 results! 13 | # mme as an example 14 | accelerate launch --num_processes=1 -m lmms_eval --model llava --model_args pretrained="liuhaotian/llava-v1.5-7b,use_flash_attention_2=False,device_map=auto" --tasks mme --batch_size 1 --log_samples --log_samples_suffix reproduce --output_path ./logs/ -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ocrbench/ocrbench.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: echo840/OCRBench 2 | dataset_kwargs: 3 | token: True 4 | task: "ocrbench" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.ocrbench_doc_to_visual 8 | doc_to_text: !function utils.ocrbench_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | max_new_tokens: 128 12 | temperature: 0 13 | top_p: 0 14 | num_beams: 1 15 | do_sample: false 16 | process_results: !function utils.ocrbench_process_results 17 | metric_list: 18 | - metric: ocrbench_accuracy 19 | aggregation: !function utils.ocrbench_aggregate_accuracy 20 | higher_is_better: true 21 | metadata: 22 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/models/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | AVAILABLE_MODELS = { 4 | "llava": "Llava", 5 | "llava_hf": "LlavaHf", 6 | "llava_sglang": "LlavaSglang", 7 | "qwen_vl": "Qwen_VL", 8 | "fuyu": "Fuyu", 9 | "gpt4v": "GPT4V", 10 | "instructblip": "InstructBLIP", 11 | "minicpm_v": "MiniCPM_V", 12 | "idefics2": "Idefics2", 13 | "qwen_vl_api": "Qwen_VL_API", 14 | "phi3v": "Phi3v", 15 | } 16 | 17 | for model_name, model_class in AVAILABLE_MODELS.items(): 18 | try: 19 | exec(f"from .{model_name} import {model_class}") 20 | except ImportError: 21 | pass 22 | 23 | 24 | import hf_transfer 25 | 26 | os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" 27 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/_default_template_mmbench_cn_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMBench 2 | dataset_kwargs: 3 | token: True 4 | doc_to_target: "answer" 5 | dataset_name: "cn" 6 | output_type: generate_until 7 | doc_to_visual: !function cn_utils.mmbench_doc_to_visual 8 | doc_to_text: !function cn_utils.mmbench_doc_to_text 9 | generation_kwargs: 10 | max_new_tokens: 256 11 | temperature: 0 12 | top_p: 0 13 | num_beams: 1 14 | do_sample: false 15 | process_results: !function cn_utils.mmbench_process_results 16 | model_specific_prompt_kwargs: 17 | default: 18 | pre_prompt: "" 19 | post_prompt: "\n请直接使用所提供的选项字母作为答案回答。" 20 | model_specific_generation_kwargs: 21 | llava: 22 | image_aspect_ratio: original 23 | -------------------------------------------------------------------------------- 
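The task configs in this dump all follow the same wiring: dataset columns are bound to Python hooks through the !function utils.<name> tag, and per-model prompt pieces are injected via model_specific_prompt_kwargs. As a rough sketch of that contract (the mytask_* names below are hypothetical placeholders, modelled on the gqa, stvqa, and docvqa utils that appear later in this dump, not on any file in the repository):

def mytask_doc_to_visual(doc):
    # Return the list of images the model should see for this document.
    return [doc["image"].convert("RGB")]

def mytask_doc_to_text(doc, model_specific_prompt_kwargs):
    # Wrap the raw question with the per-model pre/post prompts from the YAML.
    pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
    post_prompt = model_specific_prompt_kwargs["post_prompt"]
    return f"{pre_prompt}{doc['question']}{post_prompt}"

def mytask_process_results(doc, results):
    # Keys returned here must match the metric names declared in metric_list.
    pred = results[0].strip().lower()
    return {"exact_match": float(pred == doc["answer"].strip().lower())}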
/lmms-eval/lmms_eval/filters/decontamination.py: -------------------------------------------------------------------------------- 1 | from lmms_eval.api.filter import Filter 2 | 3 | 4 | class DecontaminationFilter(Filter): 5 | """ 6 | A filter which evaluates 7 | """ 8 | 9 | name = "track_decontamination" 10 | 11 | def __init__(self, path) -> None: 12 | """ 13 | 14 | TODO: make sure only ever run one time on the train set (should this be cached as a class var? keyed by value for "path"). 15 | should further cache result on a given (task_name, doc_id) 16 | """ 17 | self._decontam_results = None 18 | 19 | def apply(self, resps, docs) -> None: 20 | """ 21 | Return {"no_contamination", "only_contamination"} keys for the 2 different subsets 22 | """ 23 | pass 24 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multidocvqa/multidocvqa_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MP-DocVQA 2 | task: "multidocvqa_test" 3 | test_split: test 4 | output_type: generate_until 5 | doc_to_visual: !function utils.multidocvqa_doc_to_visual 6 | doc_to_text: !function utils.multidocvqa_doc_to_text 7 | doc_to_target: "answers" 8 | generation_kwargs: 9 | max_new_tokens: 32 10 | temperature: 0 11 | do_sample: False 12 | process_results: !function utils.multidocvqa_process_test_results_for_submission 13 | metric_list: 14 | - metric: submission 15 | aggregation: !function utils.multidocvqa_test_aggregate_results_for_submission 16 | model_specific_prompt_kwargs: 17 | default: 18 | pre_prompt: "" 19 | post_prompt: "\nAnswer the question using a single word or phrase." 20 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmmu/mmmu_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMMU 2 | task: "mmmu_test" 3 | test_split: test 4 | output_type: generate_until 5 | doc_to_visual: !function utils.mmmu_doc_to_visual 6 | doc_to_text: !function utils.mmmu_doc_to_text 7 | doc_to_target: "answer" 8 | # The return value of process_results will be used by metrics 9 | process_results: !function utils.mmmu_process_results 10 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | image_aspect_ratio: original 14 | metric_list: 15 | - metric: submission 16 | aggregation: !function utils.mmmu_test_aggregate_results_for_submission 17 | higher_is_better: true 18 | metadata: 19 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/websrc/websrc_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: rootsautomation/websrc 2 | task: "websrc_val" 3 | test_split: dev 4 | output_type: generate_until 5 | doc_to_visual: !function utils.websrc_doc_to_visual 6 | doc_to_text: !function utils.websrc_doc_to_text 7 | doc_to_target: "answer" 8 | # The return value of process_results will be used by metrics 9 | process_results: !function utils.websrc_process_results 10 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | image_aspect_ratio: pad 14 | metric_list: 15 | - metric: websrc_squad_f1 16 
| aggregation: !function utils.websrc_aggregate_results 17 | higher_is_better: true 18 | metadata: 19 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/websrc/websrc_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: rootsautomation/websrc-test 2 | task: "websrc_test" 3 | test_split: test 4 | output_type: generate_until 5 | doc_to_visual: !function utils.websrc_doc_to_visual 6 | doc_to_text: !function utils.websrc_doc_to_text 7 | doc_to_target: "answer" 8 | # The return value of process_results will be used by metrics 9 | process_results: !function utils.websrc_process_results 10 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | image_aspect_ratio: pad 14 | metric_list: 15 | - metric: submission 16 | aggregation: !function utils.websrc_test_aggregate_results_for_submission 17 | higher_is_better: true 18 | metadata: 19 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/_default_template_mmbench_en_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMBench 2 | dataset_kwargs: 3 | token: True 4 | doc_to_target: "answer" 5 | model_specific_prompt_kwargs: 6 | default: 7 | pre_prompt: "" 8 | post_prompt: "\nAnswer with the option's letter from the given choices directly." 9 | doc_to_visual: !function en_utils.mmbench_doc_to_visual 10 | doc_to_text: !function en_utils.mmbench_doc_to_text 11 | doc_to_target: "answer" 12 | process_results: !function en_utils.mmbench_process_results 13 | model_specific_generation_kwargs: 14 | llava: 15 | image_aspect_ratio: original 16 | output_type: generate_until 17 | dataset_name: "en" 18 | generation_kwargs: 19 | until: 20 | - "ASSISTANT:" 21 | max_new_tokens: 1024 22 | temperature: 0 23 | top_p: 0 24 | num_beams: 1 25 | do_sample: false 26 | -------------------------------------------------------------------------------- /ola_vlm/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2 3 | from .clip_convnext_encoder import CLIPConvNextVisionTower 4 | 5 | 6 | def build_vision_tower(vision_tower_cfg, **kwargs): 7 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 8 | if "clip" in vision_tower and "convnext" not in vision_tower: 9 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 10 | elif "convnext" in vision_tower.lower(): 11 | return CLIPConvNextVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 12 | elif "sam" in vision_tower.lower(): 13 | return SAMVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 14 | 15 | raise ValueError(f'Unknown vision tower: {vision_tower}') 16 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmmu/mmmu_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMMU 2 | task: "mmmu_val" 3 | test_split: validation 4 | output_type: generate_until 5 | doc_to_visual: !function utils.mmmu_doc_to_visual 6 | doc_to_text: !function utils.mmmu_doc_to_text 7 | doc_to_target: 
"answer" 8 | # The return value of process_results will be used by metrics 9 | process_results: !function utils.mmmu_process_results 10 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | model_specific_generation_kwargs: 14 | llava: 15 | image_aspect_ratio: original 16 | metric_list: 17 | - metric: mmmu_acc 18 | aggregation: !function utils.mmmu_aggregate_results 19 | higher_is_better: true 20 | metadata: 21 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/olympiadbench/olympiadbench_test_cn.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/OlympiadBench 2 | dataset_kwargs: 3 | token: True 4 | task : "olympiadbench_test_cn" 5 | test_split: test_cn 6 | output_type: generate_until 7 | doc_to_visual: !function cn_utils.olympiadbench_doc_to_visual 8 | doc_to_text: !function cn_utils.olympiadbench_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 1024 14 | temperature: 0 15 | top_p: 0 16 | num_beams: 1 17 | do_sample: false 18 | process_results: !function cn_utils.olympiadbench_process_results 19 | metric_list: 20 | - metric: submission 21 | aggregation: !function cn_utils.olympiadbench_aggregate_results 22 | higher_is_better: true 23 | - metric: exact_match 24 | aggregation: mean 25 | higher_is_better: true -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/olympiadbench/olympiadbench_test_en.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/OlympiadBench 2 | dataset_kwargs: 3 | token: True 4 | task : "olympiadbench_test_en" 5 | test_split: test_en 6 | output_type: generate_until 7 | doc_to_visual: !function en_utils.olympiadbench_doc_to_visual 8 | doc_to_text: !function en_utils.olympiadbench_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 1024 14 | temperature: 0 15 | top_p: 0 16 | num_beams: 1 17 | do_sample: false 18 | process_results: !function en_utils.olympiadbench_process_results 19 | metric_list: 20 | - metric: submission 21 | aggregation: !function en_utils.olympiadbench_aggregate_results 22 | higher_is_better: true 23 | - metric: exact_match 24 | aggregation: mean 25 | higher_is_better: true -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2017_cap_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/COCO-Caption2017 2 | dataset_kwargs: 3 | token: True 4 | task : "coco2017_cap_test" 5 | group : "coco_caption2017" 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.coco_doc_to_visual 9 | doc_to_text: !function utils.coco_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 128 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.coco_test_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: coco_passthrough 21 | aggregation : !function 
utils.coco_test_aggregation_result 22 | higher_is_better : true 23 | metadata: 24 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multidocvqa/multidocvqa_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MP-DocVQA 2 | task: "multidocvqa_val" 3 | test_split: val 4 | output_type: generate_until 5 | doc_to_visual: !function utils.multidocvqa_doc_to_visual 6 | doc_to_text: !function utils.multidocvqa_doc_to_text 7 | doc_to_target: "answers" 8 | generation_kwargs: 9 | max_new_tokens: 32 10 | temperature: 0 11 | do_sample: False 12 | process_results: !function utils.multidocvqa_process_results 13 | metric_list: 14 | - metric: anls 15 | aggregation: !function utils.multidocvqa_aggregate_results_anls 16 | higher_is_better: true 17 | - metric: accuracy 18 | aggregation: !function utils.multidocvqa_aggregate_results_accuracy 19 | higher_is_better: true 20 | model_specific_prompt_kwargs: 21 | default: 22 | pre_prompt: "" 23 | post_prompt: "\nAnswer the question using a single word or phrase." 24 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/gqa/gqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/GQA 2 | dataset_name: testdev_balanced_instructions 3 | dataset_kwargs: 4 | token: True 5 | task: "gqa" 6 | test_split: testdev 7 | output_type: generate_until 8 | doc_to_visual: !function utils.gqa_doc_to_visual 9 | doc_to_text: !function utils.gqa_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | metric_list: 18 | - metric: exact_match 19 | aggregation: mean 20 | higher_is_better: true 21 | ignore_case: true 22 | ignore_punctuation: true 23 | metadata: 24 | - version: 0.0 25 | 26 | model_specific_prompt_kwargs: 27 | default: 28 | pre_prompt: "" 29 | post_prompt: "\nAnswer the question using a single word or phrase." 30 | qwen_vl: 31 | pre_prompt: "" 32 | post_prompt: " Answer:" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2014_cap_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/COCO-Caption 2 | dataset_kwargs: 3 | token: True 4 | task : "coco2014_cap_test" 5 | group : "coco_caption" 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.coco_doc_to_visual 9 | doc_to_text: "Provide a one-sentence caption for the provided image." 
10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 128 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.coco_test_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: coco_passthrough 21 | aggregation : !function utils.coco_test_aggregation_result 22 | higher_is_better : true 23 | metadata: 24 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/gqa/utils.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | GQA_RAW_IMAGE_DATASET = None 4 | GQA_ID2IMAGE = None 5 | 6 | 7 | def gqa_doc_to_visual(doc): 8 | global GQA_RAW_IMAGE_DATASET 9 | global GQA_ID2IMAGE 10 | if GQA_RAW_IMAGE_DATASET is None: 11 | GQA_RAW_IMAGE_DATASET = load_dataset("lmms-lab/GQA", "testdev_balanced_images", split="testdev", token=True) 12 | GQA_ID2IMAGE = {} 13 | for row in GQA_RAW_IMAGE_DATASET: 14 | GQA_ID2IMAGE[row["id"]] = row["image"].convert("RGB") 15 | image = GQA_ID2IMAGE[doc["imageId"]] 16 | return [image] 17 | 18 | 19 | def gqa_doc_to_text(doc, model_specific_prompt_kwargs): 20 | question = doc["question"] 21 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 22 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 23 | return f"{pre_prompt}{question}{post_prompt}" 24 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ok_vqa/_default_template_vqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/OK-VQA 2 | output_type: generate_until 3 | doc_to_visual: !function utils.ok_vqa_doc_to_visual 4 | doc_to_text: !function utils.ok_vqa_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | metric_list: 10 | - metric: exact_match 11 | aggregation: mean 12 | higher_is_better: true 13 | ignore_case: true 14 | ignore_punctuation: true 15 | - metric: submission 16 | aggregation: !function utils.ok_vqa_aggreate_submissions 17 | higher_is_better: true 18 | process_results: !function utils.ok_vqa_process_results 19 | model_specific_prompt_kwargs: 20 | default: 21 | pre_prompt: "" 22 | post_prompt: "\nWhen the provided information is insufficient, respond with 'Unanswerable'.\nAnswer the question using a single word or phrase." 
23 | metadata: 24 | - version: 0.0 -------------------------------------------------------------------------------- /scripts/zero2_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "offload_optimizer": { 19 | "device": "cpu", 20 | "pin_memory": true 21 | }, 22 | "offload_param": { 23 | "device": "cpu", 24 | "pin_memory": true 25 | }, 26 | "overlap_comm": false, 27 | "contiguous_gradients": true, 28 | "sub_group_size": 1e9, 29 | "reduce_bucket_size": "auto" 30 | } 31 | } -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmvet/mmvet.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMVet 2 | dataset_kwargs: 3 | token: True 4 | task: "mmvet" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.mmvet_doc_to_visual 8 | doc_to_text: !function utils.doc_to_text # Such that {{question}} will be replaced by doc["question"] 9 | doc_to_target: "{{answer}}" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 1024 14 | temperature: 0 15 | top_p: 0 16 | num_beams: 1 17 | do_sample: false 18 | process_results: !function utils.mmvet_process_results # apply gpt eval here 19 | metric_list: 20 | - metric: gpt_eval_score 21 | aggregation: !function utils.mmvet_aggregate_results 22 | higher_is_better: true 23 | metadata: 24 | version: 0.0 25 | gpt_eval_model_name: "gpt-4" 26 | model_specific_prompt_kwargs: 27 | default: 28 | pre_prompt: "" 29 | post_prompt: "" 30 | -------------------------------------------------------------------------------- /scripts/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/nocaps/nocaps_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/NoCaps 2 | dataset_kwargs: 3 | token: True 4 | task : "nocaps_test" 5 | group : "nocaps_caption" 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.nocaps_doc_to_visual 9 | doc_to_text: !function utils.nocaps_doc_to_text 10 | doc_to_target: "annotations_captions" 11 | generation_kwargs: 12 | max_new_tokens: 64 13 | temperature: 0 14 | 
top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.nocaps_test_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: nocaps_passthrough 21 | aggregation : !function utils.nocaps_test_aggregation_result 22 | higher_is_better : true 23 | metadata: 24 | - version: 0.0 25 | include: _default_template_nocaps_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathvista/mathvista_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: AI4Math/MathVista 2 | dataset_kwargs: 3 | token: True 4 | task: "mathvista_test" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.mathvista_doc_to_visual 8 | doc_to_text: !function utils.mathvista_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 1024 14 | temperature: 0 15 | top_p: 0 16 | num_beams: 1 17 | do_sample: false 18 | process_results: !function utils.mathvista_process_results 19 | metric_list: 20 | - metric: submission 21 | aggregation: !function utils.mathvista_aggregate_results 22 | higher_is_better: true 23 | 24 | model_specific_prompt_kwargs: 25 | default: 26 | shot_type: "format-prompt" # can be "reason-first", "solution", "step-by-step" 27 | model_specific_generation_kwargs: 28 | llava: 29 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textcaps/textcaps_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/TextCaps 2 | dataset_kwargs: 3 | token: True 4 | task : "textcaps_test" 5 | group : "textcaps_caption" 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.textcaps_doc_to_visual 9 | doc_to_text: !function utils.textcaps_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 64 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.textcaps_test_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: textcaps_passthrough 21 | aggregation : !function utils.textcaps_test_aggregation_result 22 | higher_is_better : true 23 | metadata: 24 | - version: 0.0 25 | include: _default_template_textcaps_yaml -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # https://git-scm.com/docs/gitattributes 2 | # Set the default behavior, in case people don't have core.autocrlf set. 
3 | # https://git-scm.com/docs/gitattributes#_end_of_line_conversion 4 | * text=auto 5 | # common python attributes, taken from https://github.com/alexkaratarakis/gitattributes/blob/710900479a2bedeec7003d381719521ffbb18bf8/Python.gitattributes 6 | # Source files 7 | # ============ 8 | *.pxd text diff=python 9 | *.py text diff=python 10 | *.py3 text diff=python 11 | *.pyw text diff=python 12 | *.pyx text diff=python 13 | *.pyz text diff=python 14 | *.pyi text diff=python 15 | # Binary files 16 | # ============ 17 | *.db binary 18 | *.p binary 19 | *.pkl binary 20 | *.pickle binary 21 | *.pyc binary export-ignore 22 | *.pyo binary export-ignore 23 | *.pyd binary 24 | # Jupyter notebook 25 | *.ipynb text eol=lf 26 | datasets/ocr_vqa/dataset.json filter=lfs diff=lfs merge=lfs -text 27 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/iconqa/_default_template_docvqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ICON-QA 2 | dataset_kwargs: 3 | token: True 4 | output_type: generate_until 5 | doc_to_visual: !function utils.doc_to_visual 6 | doc_to_text: !function utils.doc_to_text 7 | doc_to_target: "answers" 8 | # process_results: !function utils.test_process_results 9 | generation_kwargs: 10 | max_new_tokens: 32 11 | temperature: 0 12 | do_sample: False 13 | model_specific_prompt_kwargs: 14 | default: 15 | pre_prompt: "" 16 | statement: "Given a set of images and a question, please provide the answer to the question.\n" 17 | options_statement: "Question: {question}.\nOptions:\n{options}\nPlease answer with the option letter from the given choices directly." 18 | freeform_statement: "Question: {question}.\nPlease answer the question using a single word or phrase." 19 | metric_list: 20 | - metric: anls 21 | aggregation: mean 22 | higher_is_better: true -------------------------------------------------------------------------------- /scripts/eval/mmstar.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | for IDX in $(seq 0 $((CHUNKS-1))); do 9 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m ola_vlm.eval.model_mmstar_loader \ 10 | --model-path $1 --path datasets/eval/MMStar \ 11 | --answers-file datasets/eval/results/$2/mmstar/${CHUNKS}_${IDX}.jsonl \ 12 | --num-chunks $CHUNKS --chunk-idx $IDX --temperature 0 --conv-mode $3 & 13 | done 14 | 15 | wait 16 | 17 | output_file=datasets/eval/results/$2/mmstar/merge.jsonl 18 | 19 | # Clear out the output file if it exists. 20 | > "$output_file" 21 | 22 | # Loop through the indices and concatenate each file. 
23 | for IDX in $(seq 0 $((CHUNKS-1))); do 24 | cat datasets/eval/results/$2/mmstar/${CHUNKS}_${IDX}.jsonl >> "$output_file" 25 | done 26 | 27 | python ola_vlm/eval/eval_mmstar.py --results_file $output_file 28 | 29 | -------------------------------------------------------------------------------- /lmms-eval/miscs/script.sh: -------------------------------------------------------------------------------- 1 | accelerate launch --num_processes=1 -m lmms_eval --model llava --model_args pretrained="liuhaotian/llava-v1.5-7b" --tasks mme_llava_prompt --batch_size 1 --log_samples --log_samples_sufix debug --output_path ./logs/ 2 | 3 | 4 | gpu = 8 bs 1: 5 | 6 | llava (pretrained=llava-hf/llava-1.5-7b-hf), gen_kwargs: (), limit: None, num_fewshot: None, batch_size: 1 7 | | Tasks |Version|Filter|n-shot| Metric |Value| |Stderr | 8 | |----------------|-------|------|-----:|-----------|----:|---|------:| 9 | |mme_llava_prompt|Yaml |none | 0|exact_match| 1873|± |38.4331| 10 | 11 | gpu = 8 bs 1 use_flash_attention_2=True: 12 | 13 | 14 | 15 | 16 | 17 | gpu = 4 bs 1 use_flash_attention_2=True: 18 | 19 | 20 | 21 | accelerate launch --num_processes=8 -m lmms_eval --model llava --model_args pretrained="liuhaotian/llava-v1.5-13b" --tasks scienceqa --batch_size 1 --log_samples --log_samples_sufix debug --output_path ./logs/ 22 | -------------------------------------------------------------------------------- /scripts/eval/cv-bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | for IDX in $(seq 0 $((CHUNKS-1))); do 9 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m ola_vlm.eval.model_cvbench_loader \ 10 | --model-path $1 --path datasets/eval/CV-Bench \ 11 | --answers-file datasets/eval/results/$2/cv-bench/${CHUNKS}_${IDX}.jsonl \ 12 | --num-chunks $CHUNKS --chunk-idx $IDX --temperature 0 --conv-mode $3 & 13 | done 14 | 15 | wait 16 | 17 | output_file=datasets/eval/results/$2/cv-bench/merge.jsonl 18 | 19 | # Clear out the output file if it exists. 20 | > "$output_file" 21 | 22 | # Loop through the indices and concatenate each file. 
23 | for IDX in $(seq 0 $((CHUNKS-1))); do 24 | cat datasets/eval/results/$2/cv-bench/${CHUNKS}_${IDX}.jsonl >> "$output_file" 25 | done 26 | 27 | python ola_vlm/eval/eval_cv_bench.py --results_file $output_file 28 | 29 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ok_vqa/_generate_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | splits = ["val2014"] 5 | tasks = ["vqa"] 6 | 7 | if __name__ == "__main__": 8 | dump_tasks = [] 9 | for task in tasks: 10 | for split in splits: 11 | yaml_dict = {"group": f"ok_vqa", "task": f"ok_vqa_{split}", "include": f"_default_template_{task}_yaml", "test_split": split} 12 | if split == "train": 13 | yaml_dict.pop("group") 14 | else: 15 | dump_tasks.append(f"ok_vqa_{split}") 16 | 17 | save_path = f"./ok_vqa_{split}.yaml" 18 | print(f"Saving to {save_path}") 19 | with open(save_path, "w") as f: 20 | yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) 21 | 22 | group_dict = {"group": "ok_vqa", "task": dump_tasks} 23 | 24 | with open("./_ok_vqa.yaml", "w") as f: 25 | yaml.dump(group_dict, f, default_flow_style=False, indent=4) 26 | -------------------------------------------------------------------------------- /lmms-eval/llava_repr_requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.21.0 2 | datasets==2.16.1 3 | evaluate==0.4.1 4 | hf_transfer==0.1.6 5 | Jinja2==3.1.3 6 | numpy==1.26.4 7 | openai==1.13.3 8 | packaging==23.2 9 | pandas==2.2.1 10 | Pillow==10.2.0 11 | protobuf==4.25.3 12 | pycocoevalcap==1.2 13 | pycocotools==2.0.7 14 | pytablewriter==1.2.0 15 | pytest==8.0.2 16 | python_Levenshtein==0.25.0 17 | pytz==2024.1 18 | PyYAML==6.0.1 19 | PyYAML==6.0.1 20 | Requests==2.31.0 21 | sacrebleu==2.4.0 22 | scikit_learn==1.2.2 23 | sentencepiece==0.1.99 24 | setuptools==68.2.2 25 | sglang==0.1.12 26 | shortuuid==1.0.12 27 | sqlitedict==2.1.0 28 | tenacity==8.2.3 29 | torch==2.0.1 30 | openai>=1.0.0 31 | pycocoevalcap 32 | tokenizers==0.15.2 33 | tqdm==4.66.2 34 | tqdm-multiprocess 35 | transformers==4.37.2 36 | zstandard 37 | pillow 38 | pyyaml 39 | sympy 40 | mpmath 41 | Jinja2 42 | openpyxl 43 | Levenshtein 44 | hf_transfer 45 | tenacity 46 | wandb>=0.16.0 47 | transformers-stream-generator 48 | tiktoken 49 | pre-commit -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathvista/mathvista_testmini.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: AI4Math/MathVista 2 | dataset_kwargs: 3 | token: True 4 | task: "mathvista_testmini" 5 | test_split: testmini 6 | output_type: generate_until 7 | doc_to_visual: !function utils.mathvista_doc_to_visual 8 | doc_to_text: !function utils.mathvista_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 1024 14 | temperature: 0 15 | top_p: 0 16 | num_beams: 1 17 | do_sample: false 18 | process_results: !function utils.mathvista_process_results 19 | metric_list: 20 | - metric: gpt_eval_score 21 | aggregation: !function utils.mathvista_aggregate_results 22 | higher_is_better: true 23 | 24 | model_specific_prompt_kwargs: 25 | default: 26 | shot_type: "format-prompt" # can be "reason-first", "solution", "step-by-step" 27 | phi3v: 28 | shot_type: "solution" 29 | model_specific_generation_kwargs: 30 | llava: 31 | image_aspect_ratio: 
original -------------------------------------------------------------------------------- /ola_vlm/model/aux_heads/depth_anything_v2/dinov2_layers/layer_scale.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 8 | 9 | from typing import Union 10 | 11 | import torch 12 | from torch import Tensor 13 | from torch import nn 14 | 15 | 16 | class LayerScale(nn.Module): 17 | def __init__( 18 | self, 19 | dim: int, 20 | init_values: Union[float, Tensor] = 1e-5, 21 | inplace: bool = False, 22 | ) -> None: 23 | super().__init__() 24 | self.inplace = inplace 25 | self.gamma = nn.Parameter(init_values * torch.ones(dim)) 26 | 27 | def forward(self, x: Tensor) -> Tensor: 28 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 29 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/stvqa/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | 5 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file 6 | 7 | 8 | def stvqa_doc_to_text(doc, model_specific_prompt_kwargs): 9 | question = doc["question"] 10 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 11 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 12 | return f"{pre_prompt}{question}{post_prompt}" 13 | 14 | 15 | def stvqa_doc_to_visual(doc): 16 | return [doc["image"].convert("RGB")] 17 | 18 | 19 | def stvqa_process_results(doc, results): 20 | answer = results[0] 21 | return {"submission": {"question_id": int(doc["question_id"]), "answer": answer}} 22 | 23 | 24 | def stvqa_aggregate_submissions(results, args): 25 | file = generate_submission_file("stvqa_test_for_submission.json", args) 26 | with open(file, "w") as f: 27 | json.dump(results, f) 28 | logging.getLogger("lmms-eval").info(f"Results saved to {file}") 29 | -------------------------------------------------------------------------------- /ola_vlm/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. 
[Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/_generate_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | splits = ["val", "test"] 5 | tasks = ["vqa"] 6 | 7 | if __name__ == "__main__": 8 | dump_tasks = [] 9 | for task in tasks: 10 | for split in splits: 11 | yaml_dict = {"group": f"vizwiz_{task}", "task": f"vizwiz_{task}_{split}", "include": f"_default_template_{task}_yaml", "test_split": split} 12 | if split == "train": 13 | yaml_dict.pop("group") 14 | else: 15 | dump_tasks.append(f"vizwiz_{task}_{split}") 16 | 17 | save_path = f"./vizwiz_{task}_{split}.yaml" 18 | print(f"Saving to {save_path}") 19 | with open(save_path, "w") as f: 20 | yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) 21 | 22 | group_dict = {"group": "vizwiz_vqa", "task": dump_tasks} 23 | 24 | with open("./_vizwiz_vqa.yaml", "w") as f: 25 | yaml.dump(group_dict, f, default_flow_style=False, indent=4) 26 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/chartqa/chartqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ChartQA 2 | dataset_kwargs: 3 | token: True 4 | task: "chartqa" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.chartqa_doc_to_visual 8 | doc_to_text: !function utils.chartqa_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | max_new_tokens: 16 12 | temperature: 0 13 | do_sample: False 14 | process_results: !function utils.chartqa_process_results 15 | metric_list: 16 | - metric: relaxed_overall 17 | aggregation: mean 18 | higher_is_better: true 19 | - metric: relaxed_human_split 20 | aggregation: mean 21 | higher_is_better: true 22 | - metric: relaxed_augmented_split 23 | aggregation: mean 24 | higher_is_better: true 25 | metadata: 26 | - version: 0.0 27 | model_specific_prompt_kwargs: 28 | default: 29 | pre_prompt: "" 30 | post_prompt: "\nAnswer the question with a single word." 
31 | qwen_vl: 32 | pre_prompt: "" 33 | post_prompt: " Answer:" 34 | 35 | -------------------------------------------------------------------------------- /ola_vlm/eval/merge_json.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser( 6 | description='Probe eval') 7 | parser.add_argument('--ckpt', 8 | help='ckpt', 9 | default='probe_llava-1.5-vicuna-7b-lr-1e-3') 10 | parser.add_argument('--mode', 11 | help='mode', 12 | default='gen') 13 | parser.add_argument("--num-chunks", type=int, default=1) 14 | 15 | 16 | def save_merged_json(data, output_file): 17 | with open(output_file, 'w') as file: 18 | json.dump(data, file, indent=4) 19 | 20 | if __name__ == "__main__": 21 | args = parser.parse_args() 22 | merge_data = {} 23 | name = args.ckpt.split("/")[-1] 24 | 25 | for i in range(args.num_chunks): 26 | with open(f'plots/probe_scores/{name}/{args.mode}/{args.num_chunks}_{i}.json', 'r') as file: 27 | data = json.load(file) 28 | merge_data.update(data) 29 | 30 | save_merged_json(merge_data, f'plots/probe_scores/{name}/{args.mode}.json') -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ai2d/ai2d.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ai2d 2 | task: "ai2d" 3 | dataset_kwargs: 4 | token: True 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.ai2d_doc_to_visual 8 | doc_to_text: !function utils.ai2d_doc_to_text 9 | doc_to_target: !function utils.ai2d_doc_to_target 10 | generation_kwargs: 11 | max_new_tokens: 16 12 | temperature: 0 13 | do_sample: False 14 | metric_list: 15 | - metric: exact_match 16 | aggregation: mean 17 | higher_is_better: true 18 | ignore_case: true 19 | ignore_punctuation: true 20 | metadata: 21 | - version: 0.0 22 | 23 | model_specific_prompt_kwargs: 24 | default: 25 | prompt_format: mcq 26 | pre_prompt: "" 27 | post_prompt: "\nAnswer with the option's letter from the given choices directly." 
28 | # qwen formulate ai2d as question answering instead of mcq 29 | qwen_vl: 30 | prompt_format: qa 31 | pre_prompt: "" 32 | post_prompt: " Answer:" 33 | 34 | model_specific_target_kwargs: 35 | default: "mcq" 36 | qwen_vl: "qa" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/api/instance.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Literal, Tuple 3 | 4 | 5 | @dataclass 6 | class Instance: 7 | request_type: Literal["loglikelihood", "generate_until"] 8 | arguments: tuple 9 | idx: int 10 | metadata: Tuple[str, int, int] = field(default_factory=lambda: (None, None, None)) # TODO: better typehints here 11 | resps: list = field(default_factory=list) 12 | filtered_resps: dict = field(default_factory=dict) 13 | 14 | # initialized after init 15 | task_name: str = None 16 | doc_id: str = None 17 | repeats: str = None 18 | doc: dict = None 19 | 20 | def __post_init__(self) -> None: 21 | # unpack metadata field 22 | self.task_name, self.doc_id, self.repeats = self.metadata 23 | 24 | @property 25 | def args(self): 26 | """ 27 | Returns (string,) where `string` is the string to calculate loglikelihood over 28 | """ 29 | return self.arguments if isinstance(self.arguments, tuple) else (self.arguments,) 30 | -------------------------------------------------------------------------------- /ola_vlm/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m ola_vlm.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from ola_vlm.model import * 10 | from ola_vlm.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/scienceqa/scienceqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ScienceQA 2 | dataset_name: ScienceQA-FULL 3 | task: "scienceqa" 4 | dataset_kwargs: 5 | token: True 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.sqa_doc_to_visual 9 | doc_to_text: !function utils.sqa_doc_to_text 10 | doc_to_target: !function utils.sqa_doc_to_target 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | temperature: 0 14 | do_sample: False 15 | metric_list: 16 | - metric: exact_match 17 | aggregation: mean 18 | higher_is_better: true 19 | ignore_case: true 20 | ignore_punctuation: true 21 | process_results: !function utils.sqa_process_results 22 | metadata: 23 | - version: 0.0 24 | 25 | model_specific_prompt_kwargs: 26 | default: 27 | format: default 28 | pre_prompt: 
"" 29 | post_prompt: "\nAnswer with the option's letter from the given choices directly." 30 | qwen_vl: 31 | format: qwen_vl 32 | 33 | model_specific_generation_kwargs: 34 | llava: 35 | image_aspect_ratio: original 36 | 37 | -------------------------------------------------------------------------------- /lmms-eval/miscs/test_llava.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from PIL import Image 3 | 4 | import torch 5 | from transformers import AutoProcessor, LlavaForConditionalGeneration 6 | 7 | model_id = "llava-hf/llava-1.5-7b-hf" 8 | 9 | prompt_1 = "USER: \nWhat does this image show?\nASSISTANT:" 10 | prompt_2 = "USER: \nWhat is the difference between these two images?\nASSISTANT:" 11 | image_file_1 = "image1.png" 12 | image_file_2 = "image2.png" 13 | model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_flash_attention_2=True).to(0) 14 | processor = AutoProcessor.from_pretrained(model_id) 15 | raw_image_1 = Image.open(image_file_1) 16 | raw_image_2 = Image.open(image_file_2) 17 | inputs = processor([prompt_1, prompt_2], [raw_image_1, raw_image_1, raw_image_2], padding=True, return_tensors="pt").to(0, torch.float16) 18 | import pdb 19 | 20 | pdb.set_trace() 21 | output = model.generate(**inputs, max_new_tokens=200, do_sample=False) 22 | print(processor.batch_decode(output, skip_special_tokens=True)) 23 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/_generate_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | # splits = ["train", "test", "val"] 5 | splits = ["test", "val"] 6 | tasks = ["seg", "bbox"] 7 | 8 | if __name__ == "__main__": 9 | dump_tasks = [] 10 | for task in tasks: 11 | for split in splits: 12 | yaml_dict = {"group": f"refcocog_{task}", "task": f"refcocog_{task}_{split}", "include": f"_default_template_{task}_yaml", "test_split": split} 13 | if split == "train": 14 | yaml_dict.pop("group") 15 | else: 16 | dump_tasks.append(f"refcoco_{task}_{split}") 17 | 18 | save_path = f"./refcocog_{task}_{split}.yaml" 19 | print(f"Saving to {save_path}") 20 | with open(save_path, "w") as f: 21 | yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) 22 | 23 | group_dict = {"group": "refcocog", "task": dump_tasks} 24 | 25 | with open("./_refcoco.yaml", "w") as f: 26 | yaml.dump(group_dict, f, default_flow_style=False, indent=4) 27 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/seedbench/seedbench.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/SEED-Bench 2 | dataset_kwargs: 3 | token: True 4 | task: "seedbench" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.seed_doc_to_visual 8 | doc_to_text: !function utils.seed_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | image_aspect_ratio: original 14 | # The return value of process_results will be used by metrics 15 | process_results: !function utils.seed_process_result 16 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 17 | metric_list: 18 | - metric: seed_image 19 | aggregation: !function utils.seed_aggregation_result 20 | higher_is_better: true 21 
| - metric: seed_video 22 | aggregation: !function utils.seed_aggregation_result 23 | higher_is_better: true 24 | - metric: seed_all 25 | aggregation: !function utils.seed_aggregation_result 26 | higher_is_better: true 27 | metadata: 28 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/_generate_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | # splits = ["train", "val", "testA", "testB"] 5 | splits = ["val", "testA", "testB"] 6 | tasks = ["seg", "bbox"] 7 | 8 | if __name__ == "__main__": 9 | dump_tasks = [] 10 | for task in tasks: 11 | for split in splits: 12 | yaml_dict = {"group": f"refcoco+_{task}", "task": f"refcoco+_{task}_{split}", "include": f"_default_template_{task}_yaml", "test_split": split} 13 | if split == "train": 14 | yaml_dict.pop("group") 15 | else: 16 | dump_tasks.append(f"refcoco_{task}_{split}") 17 | 18 | save_path = f"./refcoco+_{task}_{split}.yaml" 19 | print(f"Saving to {save_path}") 20 | with open(save_path, "w") as f: 21 | yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) 22 | 23 | group_dict = {"group": "refcoco+", "task": dump_tasks} 24 | 25 | with open("./_refcoco.yaml", "w") as f: 26 | yaml.dump(group_dict, f, default_flow_style=False, indent=4) 27 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/_generate_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | # splits = ["train", "test", "val", "testA", "testB"] 5 | splits = ["test", "val", "testA", "testB"] 6 | tasks = ["seg", "bbox"] 7 | 8 | if __name__ == "__main__": 9 | dump_tasks = [] 10 | for task in tasks: 11 | for split in splits: 12 | yaml_dict = {"group": f"refcoco_{task}", "task": f"refcoco_{task}_{split}", "test_split": split, "include": f"_default_template_{task}_yaml"} 13 | if split == "train": 14 | yaml_dict.pop("group") 15 | else: 16 | dump_tasks.append(f"refcoco_{task}_{split}") 17 | 18 | save_path = f"./refcoco_{task}_{split}.yaml" 19 | print(f"Saving to {save_path}") 20 | with open(save_path, "w") as f: 21 | yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) 22 | 23 | group_dict = {"group": "refcoco", "task": dump_tasks} 24 | 25 | with open("./_refcoco.yaml", "w") as f: 26 | yaml.dump(group_dict, f, default_flow_style=False, indent=4) 27 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_cc.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMBench 2 | dataset_name: cc 3 | dataset_kwargs: 4 | token: True 5 | task: "mmbench_cn_cc" 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function cc_utils.mmbench_doc_to_visual 9 | doc_to_text: !function cc_utils.mmbench_cn_cc_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 256 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function cc_utils.mmbench_cn_cc_process_results 18 | metric_list: 19 | - metric: gpt_eval_score 20 | aggregation: !function cc_utils.mmbench_cn_cc_aggregate_dev_results_eval 21 | higher_is_better: true 22 | - metric: submission 23 | aggregation: !function cc_utils.mmbench_cn_cc_aggregate_results 24 | metadata: 25 | version: 0.0 26 | 
gpt_eval_model_name: "gpt-3.5-turbo-0613" 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | pre_prompt: "" 31 | post_prompt: "\n请直接使用所提供的选项字母作为答案回答。" 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/infovqa/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import logging 4 | 5 | 6 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file 7 | 8 | lmms_logger = logging.getLogger("lmms-eval") 9 | 10 | 11 | def infovqa_doc_to_visual(doc): 12 | return [doc["image"].convert("RGB")] 13 | 14 | 15 | def infovqa_doc_to_text(doc, model_specific_prompt_kwargs): 16 | question = doc["question"] 17 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 18 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 19 | return f"{pre_prompt}{question}{post_prompt}" 20 | 21 | 22 | def infovqa_test_process_results(doc, results): 23 | pred = results[0] 24 | questionId = doc["questionId"] 25 | return {"submission": {"questionId": int(questionId), "answer": pred}} 26 | 27 | 28 | def infovqa_test_aggregate_results(results, args): 29 | # save results as json 30 | file = generate_submission_file("infovqa_test_for_submission.json", args) 31 | with open(file, "w") as f: 32 | json.dump(results, f) 33 | lmms_logger.info(f"Results saved to {file}") 34 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/pope/pope.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/POPE 2 | dataset_kwargs: 3 | token: True 4 | task: "pope" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.pope_doc_to_visual 8 | doc_to_text: !function utils.pope_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | max_new_tokens: 128 12 | temperature: 0 13 | top_p: 0 14 | num_beams: 1 15 | do_sample: false 16 | process_results: !function utils.pope_process_results 17 | metric_list: 18 | - metric: pope_accuracy 19 | aggregation: !function utils.pope_aggregate_accuracy 20 | higher_is_better: true 21 | - metric: pope_precision 22 | aggregation: !function utils.pope_aggregate_precision 23 | higher_is_better: true 24 | - metric: pope_recall 25 | aggregation: !function utils.pope_aggregate_recall 26 | higher_is_better: true 27 | - metric: pope_f1_score 28 | aggregation: !function utils.pope_aggregate_f1_score 29 | higher_is_better: true 30 | - metric: pope_yes_ratio 31 | aggregation: !function utils.pope_aggregate_yes_ratio 32 | higher_is_better: true 33 | metadata: 34 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/docvqa/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import logging 4 | 5 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file 6 | 7 | logger = logging.getLogger("lmms-eval") 8 | 9 | 10 | def docvqa_doc_to_visual(doc): 11 | return [doc["image"].convert("RGB")] 12 | 13 | 14 | def docvqa_doc_to_text(doc, model_specific_prompt_kwargs): 15 | question = doc["question"] 16 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 17 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 18 | return f"{pre_prompt}{question}{post_prompt}" 19 | 20 | 21 | def 
docvqa_test_process_results(doc, results): 22 | pred = results[0] 23 | questionId = doc["questionId"] 24 | return {"anls": {"questionId": int(questionId), "answer": pred}, "submission": {"questionId": int(questionId), "answer": pred}} 25 | 26 | 27 | def docvqa_test_aggregate_results(results, args): 28 | # save results as json 29 | path = generate_submission_file("docvqa_test_for_submission.json", args) 30 | with open(path, "w") as f: 31 | json.dump(results, f) 32 | logger.info(f"Results saved to {path}") 33 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/scienceqa/scienceqa_img.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ScienceQA 2 | dataset_name: ScienceQA-IMG 3 | task: "scienceqa_img" 4 | dataset_kwargs: 5 | token: True 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.sqa_doc_to_visual 9 | doc_to_text: !function utils.sqa_doc_to_text 10 | doc_to_target: !function utils.sqa_doc_to_target 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | temperature: 0 14 | do_sample: False 15 | metric_list: 16 | - metric: exact_match 17 | aggregation: mean 18 | higher_is_better: true 19 | ignore_case: true 20 | ignore_punctuation: true 21 | process_results: !function utils.sqa_process_results 22 | metadata: 23 | - version: 0.0 24 | 25 | model_specific_prompt_kwargs: 26 | default: 27 | format: default 28 | pre_prompt: "" 29 | post_prompt: "\nAnswer with the option's letter from the given choices directly." 30 | qwen_vl: 31 | format: qwen_vl 32 | idefics2: 33 | format: default 34 | pre_prompt: "" 35 | post_prompt: "\nAnswer:" 36 | model_specific_generation_kwargs: 37 | llava: 38 | image_aspect_ratio: original 39 | 40 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini" 6 | test_split: testmini 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/pope/pope_adv.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/POPE 2 | dataset_name: Full 3 | dataset_kwargs: 4 | token: True 5 | task: "pope_adv" 6 | test_split: adversarial 7 | output_type: generate_until 8 | doc_to_visual: !function utils.pope_doc_to_visual 9 | 
doc_to_text: !function utils.pope_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 128 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.pope_process_results 18 | metric_list: 19 | - metric: pope_accuracy 20 | aggregation: !function utils.pope_aggregate_accuracy 21 | higher_is_better: true 22 | - metric: pope_precision 23 | aggregation: !function utils.pope_aggregate_precision 24 | higher_is_better: true 25 | - metric: pope_recall 26 | aggregation: !function utils.pope_aggregate_recall 27 | higher_is_better: true 28 | - metric: pope_f1_score 29 | aggregation: !function utils.pope_aggregate_f1_score 30 | higher_is_better: true 31 | - metric: pope_yes_ratio 32 | aggregation: !function utils.pope_aggregate_yes_ratio 33 | higher_is_better: true 34 | metadata: 35 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/pope/pope_pop.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/POPE 2 | dataset_name: Full 3 | dataset_kwargs: 4 | token: True 5 | task: "pope_pop" 6 | test_split: popular 7 | output_type: generate_until 8 | doc_to_visual: !function utils.pope_doc_to_visual 9 | doc_to_text: !function utils.pope_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 128 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.pope_process_results 18 | metric_list: 19 | - metric: pope_accuracy 20 | aggregation: !function utils.pope_aggregate_accuracy 21 | higher_is_better: true 22 | - metric: pope_precision 23 | aggregation: !function utils.pope_aggregate_precision 24 | higher_is_better: true 25 | - metric: pope_recall 26 | aggregation: !function utils.pope_aggregate_recall 27 | higher_is_better: true 28 | - metric: pope_f1_score 29 | aggregation: !function utils.pope_aggregate_f1_score 30 | higher_is_better: true 31 | - metric: pope_yes_ratio 32 | aggregation: !function utils.pope_aggregate_yes_ratio 33 | higher_is_better: true 34 | metadata: 35 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/pope/pope_random.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/POPE 2 | dataset_name: Full 3 | dataset_kwargs: 4 | token: True 5 | task: "pope_random" 6 | test_split: random 7 | output_type: generate_until 8 | doc_to_visual: !function utils.pope_doc_to_visual 9 | doc_to_text: !function utils.pope_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 128 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.pope_process_results 18 | metric_list: 19 | - metric: pope_accuracy 20 | aggregation: !function utils.pope_aggregate_accuracy 21 | higher_is_better: true 22 | - metric: pope_precision 23 | aggregation: !function utils.pope_aggregate_precision 24 | higher_is_better: true 25 | - metric: pope_recall 26 | aggregation: !function utils.pope_aggregate_recall 27 | higher_is_better: true 28 | - metric: pope_f1_score 29 | aggregation: !function utils.pope_aggregate_f1_score 30 | higher_is_better: true 31 | - metric: pope_yes_ratio 32 | aggregation: !function utils.pope_aggregate_yes_ratio 33 | higher_is_better: true 34 | metadata: 35 | - version: 0.0 
-------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/realworldqa/realworldqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RealWorldQA 2 | dataset_kwargs: 3 | token: True 4 | task: "realworldqa" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.realworldqa_doc_to_visual 8 | doc_to_text: !function utils.realworldqa_doc_to_text 9 | doc_to_target: "answer" 10 | 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | 18 | filter_list: 19 | - name: "flexible-extract" 20 | filter: 21 | - function: !function utils.NumberWordsToDigitsFilter 22 | - function: !function utils.MultiChoiceRegexFilter 23 | group_select: 0 24 | ignore_case: true 25 | ignore_punctuation: true 26 | regex_pattern: "(\\([A-Z]\\))" 27 | 28 | metric_list: 29 | - metric: exact_match 30 | aggregation: mean 31 | higher_is_better: true 32 | ignore_case: true 33 | ignore_punctuation: true 34 | 35 | model_specific_prompt_kwargs: 36 | default: 37 | pre_prompt: "" 38 | post_prompt: "" 39 | gpt4v: 40 | pre_prompt: "" 41 | post_prompt: "" 42 | metadata: 43 | - version: 0.0 44 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_text_lite.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_version_split 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_text_lite" 6 | test_split: text_lite 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_text_only.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_text_only 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_text_only" 6 | test_split: text_only 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 
| higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_vision_only.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_version_split 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_vision_only" 6 | test_split: vision_only 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_text_dominant.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_version_split 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_text_dominant" 6 | test_split: text_dominant 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_vision_dominant.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_version_split 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_vision_dominant" 6 | test_split: vision_dominant 
7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_vision_intensive.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_version_split 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_vision_intensive" 6 | test_split: vision_intensive 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/_default_template.yaml: -------------------------------------------------------------------------------- 1 | test_split: train 2 | output_type: generate_until 3 | doc_to_visual: !function utils.llava_doc_to_visual 4 | doc_to_text: !function utils.llava_doc_to_text 5 | doc_to_target: "gpt_answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | image_aspect_ratio: original 10 | max_new_tokens: 1024 11 | temperature: 0 12 | top_p: 0 13 | num_beams: 1 14 | do_sample: false 15 | process_results: !function utils.llava_process_results 16 | metric_list: 17 | - metric: gpt_eval_llava_all 18 | aggregation: !function utils.llava_all_aggregation 19 | higher_is_better: true 20 | - metric: gpt_eval_llava_conv 21 | aggregation: !function utils.llava_conv_aggregation 22 | higher_is_better: true 23 | - metric: gpt_eval_llava_detail 24 | aggregation: !function utils.llava_detail_aggregation 25 | higher_is_better: true 26 | - metric: gpt_eval_llava_complex 27 | aggregation: !function utils.llava_complex_aggregation 28 | higher_is_better: true 29 | metadata: 30 | version: 0.0 31 | gpt_eval_model_name: "gpt-4-0613" 
32 | model_specific_prompt_kwargs: 33 | default: 34 | pre_prompt: "" 35 | post_prompt: "" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mme/mme.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MME 2 | dataset_kwargs: 3 | token: True 4 | task: "mme" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.mme_doc_to_visual 8 | doc_to_text: !function utils.mme_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | max_new_tokens: 16 12 | temperature: 0 13 | top_p: 0 14 | num_beams: 1 15 | do_sample: false 16 | # The return value of process_results will be used by metrics 17 | process_results: !function utils.mme_process_results 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: mme_percetion_score 21 | aggregation: !function utils.mme_aggregate_results 22 | higher_is_better: true 23 | - metric: mme_cognition_score 24 | aggregation: !function utils.mme_aggregate_results 25 | higher_is_better: true 26 | model_specific_prompt_kwargs: 27 | default: 28 | pre_prompt: "" 29 | post_prompt: "\nAnswer the question using a single word or phrase." 30 | qwen_vl: 31 | pre_prompt: "" 32 | post_prompt: " Answer:" 33 | otterhd: 34 | pre_prompt: "" 35 | post_prompt: " Answer:" 36 | metadata: 37 | - version: 0.0 38 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/llava-bench-coco/llava-bench-coco.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/llava-bench-coco 2 | dataset_kwargs: 3 | token: True 4 | task: "llava_bench_coco" 5 | test_split: train 6 | output_type: generate_until 7 | doc_to_visual: !function utils.llava_doc_to_visual 8 | doc_to_text: !function utils.llava_doc_to_text 9 | doc_to_target: "gpt_answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | image_aspect_ratio: original 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | process_results: !function utils.llava_process_results 19 | metric_list: 20 | - metric: gpt_eval_llava_all 21 | aggregation: !function utils.llava_all_aggregation 22 | higher_is_better: true 23 | - metric: gpt_eval_llava_conv 24 | aggregation: !function utils.llava_conv_aggregation 25 | higher_is_better: true 26 | - metric: gpt_eval_llava_detail 27 | aggregation: !function utils.llava_detail_aggregation 28 | higher_is_better: true 29 | - metric: gpt_eval_llava_complex 30 | aggregation: !function utils.llava_complex_aggregation 31 | higher_is_better: true 32 | metadata: 33 | version: 0.0 34 | gpt_eval_model_name: "gpt-4-0314" 35 | model_specific_prompt_kwargs: 36 | default: 37 | pre_prompt: "" 38 | post_prompt: "" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ferret/ferret.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/Ferret-Bench 2 | dataset_kwargs: 3 | token: True 4 | task: "ferret" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.ferret_doc_to_visual 8 | doc_to_text: !function utils.ferret_doc_to_text 9 | doc_to_target: "gpt_answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | image_aspect_ratio: original 14 | max_new_tokens: 1024 15 
| temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.ferret_process_results 20 | metric_list: 21 | - metric: gpt_eval_ferret_all 22 | aggregation: !function utils.ferret_all_aggregation 23 | higher_is_better: true 24 | - metric: gpt_eval_ferret_refer_desc 25 | aggregation: !function utils.ferret_refer_desc_aggregation 26 | higher_is_better: true 27 | - metric: gpt_eval_ferret_refer_reason 28 | aggregation: !function utils.ferret_refer_reason_aggregation 29 | higher_is_better: true 30 | - metric: gpt_eval_ferret_ground_conv 31 | aggregation: !function utils.ferret_ground_conv_aggregation 32 | higher_is_better: true 33 | metadata: 34 | version: 0.0 35 | gpt_eval_model_name: "gpt-4-0314" 36 | model_specific_prompt_kwargs: 37 | default: 38 | pre_prompt: "" 39 | post_prompt: "" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/llava-in-the-wild/llava-in-the-wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/llava-bench-in-the-wild 2 | dataset_kwargs: 3 | token: True 4 | task: "llava_in_the_wild" 5 | test_split: train 6 | output_type: generate_until 7 | doc_to_visual: !function utils.llava_doc_to_visual 8 | doc_to_text: !function utils.llava_doc_to_text 9 | doc_to_target: "gpt_answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | image_aspect_ratio: original 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.llava_process_results 20 | metric_list: 21 | - metric: gpt_eval_llava_all 22 | aggregation: !function utils.llava_all_aggregation 23 | higher_is_better: true 24 | - metric: gpt_eval_llava_conv 25 | aggregation: !function utils.llava_conv_aggregation 26 | higher_is_better: true 27 | - metric: gpt_eval_llava_detail 28 | aggregation: !function utils.llava_detail_aggregation 29 | higher_is_better: true 30 | - metric: gpt_eval_llava_complex 31 | aggregation: !function utils.llava_complex_aggregation 32 | higher_is_better: true 33 | metadata: 34 | version: 0.0 35 | gpt_eval_model_name: "magma" 36 | model_specific_prompt_kwargs: 37 | default: 38 | pre_prompt: "" 39 | post_prompt: "" 40 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/_default_template_bbox_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCO 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_bbox_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | 
aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/_default_template_seg_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCO 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_seg_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/_default_template_seg_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOg 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_seg_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/_default_template_bbox_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOplus 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_bbox_doc_to_visual 4 | doc_to_text: !function 
utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/_default_template_seg_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOplus 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_seg_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/_default_template_bbox_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOg 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_bbox_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | 
higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /ola_vlm/eval/mmstar/smp/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger_initialized = {} 4 | 5 | def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'): 6 | logger = logging.getLogger(name) 7 | if name in logger_initialized: 8 | return logger 9 | 10 | for logger_name in logger_initialized: 11 | if name.startswith(logger_name): 12 | return logger 13 | 14 | stream_handler = logging.StreamHandler() 15 | handlers = [stream_handler] 16 | 17 | try: 18 | import torch.distributed as dist 19 | if dist.is_available() and dist.is_initialized(): 20 | rank = dist.get_rank() 21 | else: 22 | rank = 0 23 | except ImportError: 24 | rank = 0 25 | 26 | if rank == 0 and log_file is not None: 27 | file_handler = logging.FileHandler(log_file, file_mode) 28 | handlers.append(file_handler) 29 | 30 | formatter = logging.Formatter( 31 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s') 32 | for handler in handlers: 33 | handler.setFormatter(formatter) 34 | handler.setLevel(log_level) 35 | logger.addHandler(handler) 36 | 37 | if rank == 0: 38 | logger.setLevel(log_level) 39 | else: 40 | logger.setLevel(logging.ERROR) 41 | 42 | logger_initialized[name] = True 43 | return logger -------------------------------------------------------------------------------- /ola_vlm/model/aux_heads/depth_anything_v2/dinov2_layers/drop_path.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py 10 | 11 | 12 | from torch import nn 13 | 14 | 15 | def drop_path(x, drop_prob: float = 0.0, training: bool = False): 16 | if drop_prob == 0.0 or not training: 17 | return x 18 | keep_prob = 1 - drop_prob 19 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 20 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 21 | if keep_prob > 0.0: 22 | random_tensor.div_(keep_prob) 23 | output = x * random_tensor 24 | return output 25 | 26 | 27 | class DropPath(nn.Module): 28 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" 29 | 30 | def __init__(self, drop_prob=None): 31 | super(DropPath, self).__init__() 32 | self.drop_prob = drop_prob 33 | 34 | def forward(self, x): 35 | return drop_path(x, self.drop_prob, self.training) 36 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/screenspot/_default_template_rec_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: rootsautomation/ScreenSpot 2 | output_type: generate_until 3 | doc_to_visual: !function utils_rec.screenspot_rec_doc_to_visual 4 | doc_to_text: !function utils_rec.screenspot_rec_doc_to_text 5 | doc_to_target: "bbox" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils_rec.screenspot_rec_process_result 10 | metric_list: 11 | - metric: screenspot_IoU 12 | aggregation : !function utils_rec.screenspot_rec_iou 13 | higher_is_better : true 14 | - metric: screenspot_ACC@0.1 15 | aggregation : !function utils_rec.screenspot_rec_acc01 16 | higher_is_better : true 17 | - metric: screenspot_ACC@0.3 18 | aggregation : !function utils_rec.screenspot_rec_acc03 19 | higher_is_better : true 20 | - metric: screenspot_ACC@0.5 21 | aggregation : !function utils_rec.screenspot_rec_acc05 22 | higher_is_better : true 23 | - metric: screenspot_ACC@0.7 24 | aggregation : !function utils_rec.screenspot_rec_acc07 25 | higher_is_better : true 26 | - metric: screenspot_ACC@0.9 27 | aggregation : !function utils_rec.screenspot_rec_acc09 28 | higher_is_better : true 29 | - metric: screenspot_Center_ACC 30 | aggregation : !function utils_rec.screenspot_rec_center_acc 31 | higher_is_better : true 32 | metadata: 33 | version: '0.0' -------------------------------------------------------------------------------- /docs/Evaluation.md: -------------------------------------------------------------------------------- 1 | ## Evaluation 2 | 3 | We evaluate our models on the CV-Bench, MMStar, RealWorldQA, and OK-VQA benchmarks. 4 | 5 | ```bash 6 | # install evaluation specific dependencies 7 | pip install -e .["eval"] 8 | pip install -e lmms-eval/ 9 | ``` 10 | 11 | ### CV-Bench 12 | 13 | ```bash 14 | # prepare benchmark 15 | git lfs install 16 | cd datasets/eval && git clone https://huggingface.co/datasets/nyu-visionx/CV-Bench && cd ../.. 17 | 18 | # run eval on 4 GPUs 19 | CUDA_VISIBLE_DEVICES=0,1,2,3 bash scripts/eval/cv_bench.sh shi-labs/OLA-VLM-CLIP-ViT-Llama3-8b ola_vlm_clip_llama3 llava_llama_3 20 | ``` 21 | 22 | ### MMStar 23 | 24 | ```bash 25 | # prepare benchmark 26 | git lfs install 27 | cd datasets/eval && git clone https://huggingface.co/datasets/Lin-Chen/MMStar && cd ../..
28 | 29 | # run eval on 4 GPUs 30 | CUDA_VISIBLE_DEVICES=0,1,2,3 bash scripts/eval/mmstar.sh shi-labs/OLA-VLM-CLIP-ViT-Llama3-8b ola_vlm_clip_llama3 llava_llama_3 31 | ``` 32 | 33 | ### RealWorldQA (RWQA) and OK-VQA 34 | 35 | ```bash 36 | # run on 4 GPUs 37 | accelerate launch --num_processes=4 -m lmms_eval --model llava --model_args pretrained=shi-labs/OLA-VLM-CLIP-ViT-Llama3-8b,conv_template=llava_llama_3,attn_implementation="eager",device_map="" --tasks realworldqa,ok_vqa --batch_size 1 --log_samples --log_samples_suffix ola_vlm_clip_llama3 --output_path datasets/eval/results/ola_vlm_clip_llama3 38 | ``` 39 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ai2d/utils.py: -------------------------------------------------------------------------------- 1 | def ai2d_doc_to_text(doc, model_specific_prompt_kwargs=None): 2 | question, choices = doc["question"], doc["options"] 3 | len_choices = len(choices) 4 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 5 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 6 | if model_specific_prompt_kwargs["prompt_format"] == "mcq": 7 | options = [chr(ord("A") + i) for i in range(len_choices)] 8 | choices_str = "\n".join([f"{option}. {choice}" for option, choice in zip(options, choices)]) 9 | return f"{pre_prompt}{question}\n{choices_str}{post_prompt}" 10 | elif model_specific_prompt_kwargs["prompt_format"] == "qa": 11 | options = "\n".join(choices) 12 | return f"{pre_prompt}{question}{options}{post_prompt}" 13 | else: 14 | raise ValueError(f"Unknown prompt format: {model_specific_prompt_kwargs['prompt_format']}") 15 | 16 | 17 | def ai2d_doc_to_visual(doc): 18 | return [doc["image"].convert("RGB")] 19 | 20 | 21 | def ai2d_doc_to_target(doc, model_specific_target_kwargs): 22 | if model_specific_target_kwargs == "mcq": 23 | len_choices = len(doc["options"]) 24 | options = [chr(ord("A") + i) for i in range(len_choices)] 25 | return options[int(doc["answer"])] 26 | elif model_specific_target_kwargs == "qa": 27 | return doc["options"][int(doc["answer"])] 28 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/_default_template_bbox_rec_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCO 2 | output_type: generate_until 3 | process_docs: !function utils_rec.refcoco_bbox_rec_preprocess_dataset 4 | doc_to_visual: !function utils_rec.refcoco_bbox_rec_doc_to_visual 5 | doc_to_text: !function utils_rec.refcoco_bbox_rec_doc_to_text 6 | doc_to_target: "bbox" 7 | generation_kwargs: 8 | until: 9 | - "ASSISTANT:" 10 | process_results: !function utils_rec.refcoco_bbox_rec_process_result 11 | metric_list: 12 | - metric: refcoco_IoU 13 | aggregation : !function utils_rec.refcoco_bbox_rec_iou 14 | higher_is_better : true 15 | - metric: refcoco_ACC@0.1 16 | aggregation : !function utils_rec.refcoco_bbox_rec_acc01 17 | higher_is_better : true 18 | - metric: refcoco_ACC@0.3 19 | aggregation : !function utils_rec.refcoco_bbox_rec_acc03 20 | higher_is_better : true 21 | - metric: refcoco_ACC@0.5 22 | aggregation : !function utils_rec.refcoco_bbox_rec_acc05 23 | higher_is_better : true 24 | - metric: refcoco_ACC@0.7 25 | aggregation : !function utils_rec.refcoco_bbox_rec_acc07 26 | higher_is_better : true 27 | - metric: refcoco_ACC@0.9 28 | aggregation : !function utils_rec.refcoco_bbox_rec_acc09 29 | higher_is_better : true 30 | - metric: refcoco_Center_ACC 31 |
aggregation : !function utils_rec.refcoco_bbox_rec_center_acc 32 | higher_is_better : true 33 | metadata: 34 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/_default_template_bbox_rec_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOg 2 | output_type: generate_until 3 | process_docs: !function utils_rec.refcoco_bbox_rec_preprocess_dataset 4 | doc_to_visual: !function utils_rec.refcoco_bbox_rec_doc_to_visual 5 | doc_to_text: !function utils_rec.refcoco_bbox_rec_doc_to_text 6 | doc_to_target: "bbox" 7 | generation_kwargs: 8 | until: 9 | - "ASSISTANT:" 10 | process_results: !function utils_rec.refcoco_bbox_rec_process_result 11 | metric_list: 12 | - metric: refcoco_IoU 13 | aggregation : !function utils_rec.refcoco_bbox_rec_iou 14 | higher_is_better : true 15 | - metric: refcoco_ACC@0.1 16 | aggregation : !function utils_rec.refcoco_bbox_rec_acc01 17 | higher_is_better : true 18 | - metric: refcoco_ACC@0.3 19 | aggregation : !function utils_rec.refcoco_bbox_rec_acc03 20 | higher_is_better : true 21 | - metric: refcoco_ACC@0.5 22 | aggregation : !function utils_rec.refcoco_bbox_rec_acc05 23 | higher_is_better : true 24 | - metric: refcoco_ACC@0.7 25 | aggregation : !function utils_rec.refcoco_bbox_rec_acc07 26 | higher_is_better : true 27 | - metric: refcoco_ACC@0.9 28 | aggregation : !function utils_rec.refcoco_bbox_rec_acc09 29 | higher_is_better : true 30 | - metric: refcoco_Center_ACC 31 | aggregation : !function utils_rec.refcoco_bbox_rec_center_acc 32 | higher_is_better : true 33 | metadata: 34 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/_default_template_bbox_rec_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOPlus 2 | output_type: generate_until 3 | process_docs: !function utils_rec.refcoco_bbox_rec_preprocess_dataset 4 | doc_to_visual: !function utils_rec.refcoco_bbox_rec_doc_to_visual 5 | doc_to_text: !function utils_rec.refcoco_bbox_rec_doc_to_text 6 | doc_to_target: "bbox" 7 | generation_kwargs: 8 | until: 9 | - "ASSISTANT:" 10 | process_results: !function utils_rec.refcoco_bbox_rec_process_result 11 | metric_list: 12 | - metric: refcoco_IoU 13 | aggregation : !function utils_rec.refcoco_bbox_rec_iou 14 | higher_is_better : true 15 | - metric: refcoco_ACC@0.1 16 | aggregation : !function utils_rec.refcoco_bbox_rec_acc01 17 | higher_is_better : true 18 | - metric: refcoco_ACC@0.3 19 | aggregation : !function utils_rec.refcoco_bbox_rec_acc03 20 | higher_is_better : true 21 | - metric: refcoco_ACC@0.5 22 | aggregation : !function utils_rec.refcoco_bbox_rec_acc05 23 | higher_is_better : true 24 | - metric: refcoco_ACC@0.7 25 | aggregation : !function utils_rec.refcoco_bbox_rec_acc07 26 | higher_is_better : true 27 | - metric: refcoco_ACC@0.9 28 | aggregation : !function utils_rec.refcoco_bbox_rec_acc09 29 | higher_is_better : true 30 | - metric: refcoco_Center_ACC 31 | aggregation : !function utils_rec.refcoco_bbox_rec_center_acc 32 | higher_is_better : true 33 | metadata: 34 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/hallusion_bench/hallusion_bench_image.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/HallusionBench 2 | 
dataset_kwargs: 3 | token: True 4 | task: "hallusion_bench_image" 5 | test_split: image 6 | output_type: generate_until 7 | doc_to_visual: !function evaluate_hb.hb_doc_to_visual 8 | doc_to_text: !function evaluate_hb.hb_doc_to_text 9 | doc_to_target: "gt_answer_details" 10 | process_results: !function evaluate_hb.hb_process_results 11 | model_specific_prompt_kwargs: 12 | default: 13 | pre_prompt: "" 14 | post_prompt: "" 15 | generation_kwargs: 16 | max_new_tokens: 128 17 | temperature: 0 18 | top_p: 0 19 | num_beams: 1 20 | do_sample: false 21 | metric_list: 22 | - metric: aAcc 23 | aggregation: !function evaluate_hb.hb_aggregation_result_aAcc 24 | higher_is_better: true 25 | - metric: qAcc 26 | aggregation: !function evaluate_hb.hb_aggregation_result_qAcc 27 | higher_is_better: true 28 | - metric: fAcc 29 | aggregation: !function evaluate_hb.hb_aggregation_result_fAcc 30 | higher_is_better: true 31 | # - metric: aAcc 32 | # aggregation: !function evaluate_hb.hb_aggregation_result_aAcc_intern 33 | # higher_is_better: true 34 | # - metric: qAcc 35 | # aggregation: !function evaluate_hb.hb_aggregation_result_qAcc_intern 36 | # higher_is_better: true 37 | # - metric: fAcc 38 | # aggregation: !function evaluate_hb.hb_aggregation_result_fAcc_intern 39 | # higher_is_better: true 40 | metadata: 41 | - version: 0.0 42 | -------------------------------------------------------------------------------- /ola_vlm/model/aux_heads/depth_anything_v2/dinov2_layers/mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py 10 | 11 | 12 | from typing import Callable, Optional 13 | 14 | from torch import Tensor, nn 15 | 16 | 17 | class Mlp(nn.Module): 18 | def __init__( 19 | self, 20 | in_features: int, 21 | hidden_features: Optional[int] = None, 22 | out_features: Optional[int] = None, 23 | act_layer: Callable[..., nn.Module] = nn.GELU, 24 | drop: float = 0.0, 25 | bias: bool = True, 26 | ) -> None: 27 | super().__init__() 28 | out_features = out_features or in_features 29 | hidden_features = hidden_features or in_features 30 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) 31 | self.act = act_layer() 32 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) 33 | self.drop = nn.Dropout(drop) 34 | 35 | def forward(self, x: Tensor) -> Tensor: 36 | x = self.fc1(x) 37 | x = self.act(x) 38 | x = self.drop(x) 39 | x = self.fc2(x) 40 | x = self.drop(x) 41 | return x 42 | -------------------------------------------------------------------------------- /ola_vlm/eval/get_probe_dsg_scores.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | 4 | import json 5 | import os 6 | from tqdm import tqdm 7 | from icecream import ic 8 | import warnings 9 | warnings.filterwarnings("ignore") 10 | import random 11 | import numpy as np 12 | 13 | 14 | def set_seed(seed): 15 | random.seed(seed) 16 | np.random.seed(seed) 17 | torch.manual_seed(seed) 18 | torch.cuda.manual_seed_all(seed) 19 | 20 | if __name__ == "__main__": 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument("--ckpt", type=str, default="llava-1.5-7b") 23 | parser.add_argument("--mode", type=str, default="gen") 24 | args = parser.parse_args() 25 | 26 | mode = args.mode 27 | name = args.ckpt.split("/")[-1] 28 | 29 | with open(f'plots/probe_scores/{name}/{args.mode}.json') as file: 30 | scores = json.load(file) 31 | 32 | layer_scores = {} 33 | 34 | for img, v in tqdm(scores.items()): 35 | for layer, score in v.items(): 36 | if layer not in layer_scores: 37 | layer_scores[layer] = [] 38 | layer_scores[layer].append(score) 39 | 40 | for layer, scores in layer_scores.items(): 41 | layer_scores[layer] = np.mean(scores) 42 | 43 | with open(f"plots/probe_scores/{name}/{mode}_scores.json", "w") as f: 44 | json.dump(layer_scores, f, indent=2) 45 | 46 | print(f"================Scores: {mode}===============") 47 | for layer, score in layer_scores.items(): 48 | print(f"Layer: {layer}, Score: {score}") 49 | print("===========================================") -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2017_cap_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/COCO-Caption2017 2 | dataset_kwargs: 3 | token: True 4 | task: "coco2017_cap_val" 5 | group : "coco_caption2017" 6 | test_split: val 7 | output_type: generate_until 8 | doc_to_visual: !function utils.coco_doc_to_visual 9 | doc_to_text: !function utils.coco_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 64 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.coco_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name 
returned by process_results 19 | metric_list: 20 | - metric: coco_Bleu_4 21 | aggregation : !function utils.coco_bleu4 22 | higher_is_better : true 23 | - metric: coco_Bleu_3 24 | aggregation : !function utils.coco_bleu3 25 | higher_is_better : true 26 | - metric: coco_Bleu_2 27 | aggregation : !function utils.coco_bleu2 28 | higher_is_better : true 29 | - metric: coco_Bleu_1 30 | aggregation : !function utils.coco_bleu1 31 | higher_is_better : true 32 | - metric: coco_METEOR 33 | aggregation : !function utils.coco_meteor 34 | higher_is_better : true 35 | - metric: coco_ROUGE_L 36 | aggregation : !function utils.coco_rougel 37 | higher_is_better : true 38 | - metric: coco_CIDEr 39 | aggregation : !function utils.coco_cider 40 | higher_is_better : true 41 | #- metric: coco_SPICE 42 | # aggregation : !function utils.coco_spice 43 | # higher_is_better : true 44 | metadata: 45 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/flickr30k/flickr30k_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/flickr30k 2 | dataset_kwargs: 3 | token: True 4 | task : "flickr30k_test" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.flickr_doc_to_visual 8 | doc_to_text: !function utils.flickr_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | max_new_tokens: 64 12 | temperature: 0 13 | top_p: 0 14 | num_beams: 1 15 | do_sample: false 16 | process_results: !function utils.flickr_process_result 17 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 18 | metric_list: 19 | - metric: flickr_Bleu_4 20 | aggregation : !function utils.flickr_bleu4 21 | higher_is_better : true 22 | - metric: flickr_Bleu_3 23 | aggregation : !function utils.flickr_bleu3 24 | higher_is_better : true 25 | - metric: flickr_Bleu_2 26 | aggregation : !function utils.flickr_bleu2 27 | higher_is_better : true 28 | - metric: flickr_Bleu_1 29 | aggregation : !function utils.flickr_bleu1 30 | higher_is_better : true 31 | - metric: flickr_METEOR 32 | aggregation : !function utils.flickr_meteor 33 | higher_is_better : true 34 | - metric: flickr_ROUGE_L 35 | aggregation : !function utils.flickr_rougel 36 | higher_is_better : true 37 | - metric: flickr_CIDEr 38 | aggregation : !function utils.flickr_cider 39 | higher_is_better : true 40 | #- metric: flickr_SPICE 41 | # aggregation : !function utils.flickr_spice 42 | # higher_is_better : true 43 | metadata: 44 | - version: 0.0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="ola_vlm", 5 | version="1.0.0", 6 | long_description=open("README.md").read(), 7 | long_description_content_type="text/markdown", 8 | classifiers=[ 9 | "Programming Language :: Python :: 3", 10 | "License :: OSI Approved :: Apache Software License", 11 | ], 12 | python_requires=">=3.8", 13 | install_requires=[ 14 | "torch==2.2.0", "torchvision==0.17.0", 15 | "tokenizers==0.19.1", "sentencepiece==0.1.99", "shortuuid", 16 | "peft", "bitsandbytes", "open_clip_torch", "diffdist", 17 | "pydantic", "markdown2[all]", "numpy==1.26.2", 18 | "gradio==4.16.0", "gradio_client==0.8.1", "huggingface_hub", 19 | "requests", "httpx==0.24.0", "uvicorn", "fastapi", 20 | 
"einops==0.6.1", "einops-exts==0.0.4", "timm==1.0.8", 21 | "diffusers===0.27.2", "protobuf", "accelerate==0.27.2" 22 | ], 23 | extras_require={ 24 | "train": ["deepspeed==0.12.6", "ninja", "wandb", "huggingface-hub==0.24.2", "peft==0.12.0"], 25 | "eval": ["seaborn", "sty", "tabulate", "spacy", "word2number", "inflect"], 26 | "demo": ["pydantic==2.8.2", "pydantic-core==2.20.1", "fastapi==0.111.0"], 27 | "build": ["build", "twine"] 28 | }, 29 | url="https://praeclarumjj3.github.io/ola_vlm", 30 | project_urls={ 31 | "Bug Tracker": "https://github.com/SHI-Labs/VisPer-LM/issues" 32 | }, 33 | packages=find_packages(exclude=["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]), 34 | include_package_data=True, 35 | ) 36 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2014_cap_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/COCO-Caption 2 | dataset_kwargs: 3 | token: True 4 | task: "coco2014_cap_val" 5 | group : "coco_caption" 6 | test_split: val 7 | output_type: generate_until 8 | doc_to_visual: !function utils.coco_doc_to_visual 9 | doc_to_text: "Provide a one-sentence caption for the provided image." 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 64 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.coco_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: coco_Bleu_4 21 | aggregation : !function utils.coco_bleu4 22 | higher_is_better : true 23 | - metric: coco_Bleu_3 24 | aggregation : !function utils.coco_bleu3 25 | higher_is_better : true 26 | - metric: coco_Bleu_2 27 | aggregation : !function utils.coco_bleu2 28 | higher_is_better : true 29 | - metric: coco_Bleu_1 30 | aggregation : !function utils.coco_bleu1 31 | higher_is_better : true 32 | - metric: coco_METEOR 33 | aggregation : !function utils.coco_meteor 34 | higher_is_better : true 35 | - metric: coco_ROUGE_L 36 | aggregation : !function utils.coco_rougel 37 | higher_is_better : true 38 | - metric: coco_CIDEr 39 | aggregation : !function utils.coco_cider 40 | higher_is_better : true 41 | #- metric: coco_SPICE 42 | # aggregation : !function utils.coco_spice 43 | # higher_is_better : true 44 | metadata: 45 | - version: 0.0 -------------------------------------------------------------------------------- /scripts/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 
43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "gather_16bit_weights_on_model_save": true 49 | }, 50 | "gradient_accumulation_steps": "auto", 51 | "gradient_clipping": "auto", 52 | "train_batch_size": "auto", 53 | "train_micro_batch_size_per_gpu": "auto", 54 | "steps_per_print": 1e5, 55 | "wall_clock_breakdown": false 56 | } -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/nocaps/nocaps_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/NoCaps 2 | dataset_kwargs: 3 | token: True 4 | task: "nocaps_val" 5 | group : "nocaps_caption" 6 | test_split: validation 7 | output_type: generate_until 8 | doc_to_visual: !function utils.nocaps_doc_to_visual 9 | doc_to_text: !function utils.nocaps_doc_to_text 10 | doc_to_target: "annotations_captions" 11 | generation_kwargs: 12 | max_new_tokens: 64 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.nocaps_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: nocaps_Bleu_4 21 | aggregation : !function utils.nocaps_bleu4 22 | higher_is_better : true 23 | - metric: nocaps_Bleu_3 24 | aggregation : !function utils.nocaps_bleu3 25 | higher_is_better : true 26 | - metric: nocaps_Bleu_2 27 | aggregation : !function utils.nocaps_bleu2 28 | higher_is_better : true 29 | - metric: nocaps_Bleu_1 30 | aggregation : !function utils.nocaps_bleu1 31 | higher_is_better : true 32 | - metric: nocaps_METEOR 33 | aggregation : !function utils.nocaps_meteor 34 | higher_is_better : true 35 | - metric: nocaps_ROUGE_L 36 | aggregation : !function utils.nocaps_rougel 37 | higher_is_better : true 38 | - metric: nocaps_CIDEr 39 | aggregation : !function utils.nocaps_cider 40 | higher_is_better : true 41 | #- metric: nocaps_SPICE 42 | # aggregation : !function utils.nocaps_spice 43 | # higher_is_better : true 44 | metadata: 45 | - version: 0.0 46 | include: _default_template_nocaps_yaml -------------------------------------------------------------------------------- /scripts/probe/probe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export WANDB_PROJECT= "VisPer-LM" 4 | export WANDB_NAME="probe_depth_ola-vlm-pt-ift" 5 | 6 | # 8 GPUs 7 | deepspeed ola_vlm/train/probe_dsg_train_mem.py \ 8 | --deepspeed ./scripts/zero2.json \ 9 | --mode $1 \ 10 | --model_name_or_path shi-labs/OLA-VLM-CLIP-ConvNeXT-Llama3-8b \ 11 | --image_generator stabilityai/stable-diffusion-2-1-unclip \ 12 | --image_segmentor shi-labs/oneformer_coco_swin_large \ 13 | --depth_estimator depth_anything_v2_vitl.pth \ 14 | --version llava_llama_3 \ 15 | --data_path /mnt/vlpdatasets/sherlock/coco/annotations/captions_train2017.json \ 16 | --image_folder datasets/coco \ 17 | --vision_tower laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup-res768 \ 18 | --mm_projector_type mlp2x_gelu \ 19 | --mm_vision_select_layer -2 \ 20 | --mm_use_im_start_end False \ 21 | --mm_use_im_patch_token False \ 22 | --image_aspect_ratio pad \ 23 | --group_by_modality_length True \ 24 | --bf16 True \ 25 | --tf32 True \ 26 | --output_dir outputs/probe_${1}_ola-vlm-pt-ift \ 27 | --num_train_epochs 
1 \ 28 | --per_device_train_batch_size 32 \ 29 | --per_device_eval_batch_size 4 \ 30 | --gradient_accumulation_steps 1 \ 31 | --evaluation_strategy "no" \ 32 | --save_strategy "steps" \ 33 | --save_steps 200 \ 34 | --save_total_limit 3 \ 35 | --learning_rate 1e-3 \ 36 | --weight_decay 0. \ 37 | --warmup_ratio 0.03 \ 38 | --lr_scheduler_type "cosine" \ 39 | --logging_steps 1 \ 40 | --model_max_length 4096 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to wandb -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textcaps/textcaps_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/TextCaps 2 | dataset_kwargs: 3 | token: True 4 | task: "textcaps_val" 5 | group : "textcaps_caption" 6 | test_split: val 7 | output_type: generate_until 8 | doc_to_visual: !function utils.textcaps_doc_to_visual 9 | doc_to_text: !function utils.textcaps_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 64 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.textcaps_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: textcaps_Bleu_4 21 | aggregation : !function utils.textcaps_bleu4 22 | higher_is_better : true 23 | - metric: textcaps_Bleu_3 24 | aggregation : !function utils.textcaps_bleu3 25 | higher_is_better : true 26 | - metric: textcaps_Bleu_2 27 | aggregation : !function utils.textcaps_bleu2 28 | higher_is_better : true 29 | - metric: textcaps_Bleu_1 30 | aggregation : !function utils.textcaps_bleu1 31 | higher_is_better : true 32 | - metric: textcaps_METEOR 33 | aggregation : !function utils.textcaps_meteor 34 | higher_is_better : true 35 | - metric: textcaps_ROUGE_L 36 | aggregation : !function utils.textcaps_rougel 37 | higher_is_better : true 38 | - metric: textcaps_CIDEr 39 | aggregation : !function utils.textcaps_cider 40 | higher_is_better : true 41 | #- metric: textcaps_SPICE 42 | # aggregation : !function utils.textcaps_spice 43 | # higher_is_better : true 44 | metadata: 45 | - version: 0.0 46 | include: _default_template_textcaps_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/filters/transformation.py: -------------------------------------------------------------------------------- 1 | from lmms_eval.api.filter import Filter 2 | 3 | 4 | class LowercaseFilter(Filter): 5 | def __init__(self) -> None: 6 | pass 7 | 8 | def apply(self, resps, docs): 9 | def filter_set(inst): 10 | return [resp.lower() for resp in inst] 11 | 12 | return [filter_set(resp) for resp in resps] 13 | 14 | 15 | class UppercaseFilter(Filter): 16 | def __init__(self) -> None: 17 | pass 18 | 19 | def apply(self, resps, docs): 20 | def filter_set(inst): 21 | return [resp.upper() for resp in inst] 22 | 23 | return [filter_set(resp) for resp in resps] 24 | 25 | 26 | class MapFilter(Filter): 27 | def __init__(self, mapping_dict: dict = {}, default_value=None) -> None: 28 | """ 29 | Initializes the MapFilter with a given mapping dictionary and default value. 30 | 31 | Args: 32 | - mapping_dict (dict): A dictionary containing the key-value mappings. 33 | Default is an empty dictionary. 
34 | - default_value (Any): The value to be returned when a key is not found in the mapping_dict. 35 | Default is None. 36 | 37 | Example: 38 | mapper = MapFilter({'A': 1, 'B': 2}, default_value=0) 39 | """ 40 | assert isinstance(mapping_dict, dict), "Provided mapping_dict is not a dictionary" 41 | self.mapping_dict = mapping_dict 42 | self.default_value = default_value 43 | 44 | def apply(self, resps, docs): 45 | def filter_set(inst): 46 | return [self.mapping_dict.get(resp, self.default_value) for resp in inst] 47 | 48 | return [filter_set(resp) for resp in resps] 49 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textcaps/textcaps_train.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/TextCaps 2 | dataset_kwargs: 3 | token: True 4 | task : "textcaps_train" 5 | group : "textcaps_caption" 6 | test_split: train 7 | output_type: generate_until 8 | doc_to_visual: !function utils.textcaps_doc_to_visual 9 | doc_to_text: !function utils.textcaps_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.textcaps_process_result 20 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results 21 | metric_list: 22 | - metric: textcaps_Bleu_4 23 | aggregation : !function utils.textcaps_bleu4 24 | higher_is_better : true 25 | - metric: textcaps_Bleu_3 26 | aggregation : !function utils.textcaps_bleu3 27 | higher_is_better : true 28 | - metric: textcaps_Bleu_2 29 | aggregation : !function utils.textcaps_bleu2 30 | higher_is_better : true 31 | - metric: textcaps_Bleu_1 32 | aggregation : !function utils.textcaps_bleu1 33 | higher_is_better : true 34 | - metric: textcaps_METEOR 35 | aggregation : !function utils.textcaps_meteor 36 | higher_is_better : true 37 | - metric: textcaps_ROUGE_L 38 | aggregation : !function utils.textcaps_rougel 39 | higher_is_better : true 40 | - metric: textcaps_CIDEr 41 | aggregation : !function utils.textcaps_cider 42 | higher_is_better : true 43 | #- metric: textcaps_SPICE 44 | # aggregation : !function utils.textcaps_spice 45 | # higher_is_better : true 46 | metadata: 47 | - version: 0.0 48 | include: _default_template_textcaps_yaml -------------------------------------------------------------------------------- /scripts/train/finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export WANDB_PROJECT="VisPer-LM" 4 | export WANDB_NAME="VisPer-LM-CLIP-ViT-Llama3-8b" 5 | 6 | # Base LLM choices: 7 | # Llama3-8b: meta-llama/Meta-Llama-3-8B-Instruct (llava_llama_3) 8 | # Phi3-4k-mini: microsoft/Phi-3-mini-4k-instruct (llava_phi_3) 9 | 10 | # Base encoder choices: 11 | # CLIP-ViT-L: openai/clip-vit-large-patch14-336 12 | # CLIP-ConvNeXT-XXL: laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup-res768 13 | 14 | # 8 GPUs 15 | deepspeed ola_vlm/train/train_mem.py \ 16 | --deepspeed ./scripts/zero2.json \ 17 | --model_name_or_path outputs/pretrain_dsg_VisPer-LM-CLIP-ViT-Llama3-8b \ 18 | --version llava_llama_3 \ 19 | --data_path datasets/llava_v1_5_mix665k.json \ 20 | --image_folder datasets/ \ 21 | --vision_tower openai/clip-vit-large-patch14-336 \ 22 | --mm_projector_type mlp2x_gelu \ 23 | --mm_vision_select_layer -2 \ 24 |
--mm_use_im_start_end False \ 25 | --mm_use_im_patch_token False \ 26 | --image_aspect_ratio pad \ 27 | --group_by_modality_length True \ 28 | --bf16 True \ 29 | --output_dir outputs/VisPer-LM-CLIP-ViT-Llama3-8b \ 30 | --num_train_epochs 1 \ 31 | --per_device_train_batch_size 16 \ 32 | --per_device_eval_batch_size 4 \ 33 | --gradient_accumulation_steps 1 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 200 \ 37 | --save_total_limit 3 \ 38 | --learning_rate 2e-5 \ 39 | --weight_decay 0. \ 40 | --warmup_ratio 0.03 \ 41 | --lr_scheduler_type "cosine" \ 42 | --logging_steps 1 \ 43 | --tf32 True \ 44 | --model_max_length 4096 \ 45 | --gradient_checkpointing True \ 46 | --dataloader_num_workers 4 \ 47 | --lazy_preprocess True \ 48 | --report_to wandb 49 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/filters/__init__.py: -------------------------------------------------------------------------------- 1 | from lmms_eval.api.filter import FilterEnsemble, Filter 2 | from . import selection 3 | from . import extraction 4 | from . import transformation 5 | 6 | 7 | FILTER_REGISTRY = { 8 | "take_first": selection.TakeFirstFilter, 9 | "regex": extraction.RegexFilter, 10 | "majority_vote": selection.MajorityVoteFilter, 11 | "take_first_k": selection.TakeKFilter, 12 | "remove_whitespace": extraction.WhitespaceFilter, 13 | "lowercase": transformation.LowercaseFilter, 14 | "uppercase": transformation.UppercaseFilter, 15 | "map": transformation.MapFilter, 16 | "multi_choice_regex": extraction.MultiChoiceRegexFilter, 17 | # TODO: implement this filter. either it should take in an arbitrary "scoring"/reward function 18 | # that takes an input and returns a scalar and then should select the max reward, 19 | # or should implement different filters for different ways of handling a reward model's inference. 20 | # "arg_max": selection.ArgMaxFilter, 21 | } 22 | 23 | 24 | def get_filter(filter_name): 25 | if filter_name in FILTER_REGISTRY: 26 | return FILTER_REGISTRY[filter_name] 27 | else: 28 | return filter_name 29 | 30 | 31 | def build_filter_ensemble(filter_name, components): 32 | """ 33 | Create a filtering pipeline. 
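`components` is a list of (filter_name, kwargs) pairs: each name is resolved through FILTER_REGISTRY by get_filter, instantiated with its kwargs (if any), and the resulting filters are applied in order by the returned FilterEnsemble.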
34 | """ 35 | filters = [] 36 | for function, kwargs in components: 37 | if kwargs is None: 38 | f = get_filter(function)() 39 | else: 40 | # create a filter given its name in the registry 41 | f = get_filter(function)(**kwargs) # TODO: pass kwargs to filters properly 42 | # add the filter as a pipeline step 43 | filters.append(f) 44 | 45 | return FilterEnsemble(name=filter_name, filters=filters) 46 | -------------------------------------------------------------------------------- /scripts/train/vpt_ift.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export WANDB_PROJECT= "VisPer-LM" 4 | export WANDB_NAME="vpt_VisPer-LM-CLIP-ViT-Llama3-8b" 5 | 6 | # Base LLM choices: 7 | # Llama3-8b: meta-llama/Meta-Llama-3-8B-Instruct (llava_llama_3) 8 | # Phi3-4k-mini: microsoft/Phi-3-mini-4k-instruct (llava_phi_3) 9 | 10 | # Base encoder choices: 11 | # CLIP-ViT-L: openai/clip-vit-large-patch14-336 12 | # CLIP-ConvNeXT-XXL: laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup-res768 13 | 14 | # 8 GPUs 15 | deepspeed ola_vlm/train/train_mem.py \ 16 | --deepspeed ./scripts/zero2.json \ 17 | --model_name_or_path outputs/v-pretrain_VisPer-LM-CLIP-ViT-Llama3-8b \ 18 | --version llava_llama_3 \ 19 | --data_path datasets/llava_v1_5_mix665k.json \ 20 | --image_folder datasets/ \ 21 | --vision_tower openai/clip-vit-large-patch14-336 \ 22 | --mm_projector_type mlp2x_gelu \ 23 | --mm_vision_select_layer -2 \ 24 | --mm_use_im_start_end False \ 25 | --mm_use_im_patch_token False \ 26 | --image_aspect_ratio pad \ 27 | --group_by_modality_length True \ 28 | --bf16 True \ 29 | --output_dir outputs/vpt_VisPer-LM-CLIP-ViT-Llama3-8b \ 30 | --num_train_epochs 1 \ 31 | --per_device_train_batch_size 16 \ 32 | --per_device_eval_batch_size 4 \ 33 | --gradient_accumulation_steps 1 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 200 \ 37 | --save_total_limit 3 \ 38 | --learning_rate 2e-5 \ 39 | --weight_decay 0. \ 40 | --warmup_ratio 0.03 \ 41 | --lr_scheduler_type "cosine" \ 42 | --logging_steps 1 \ 43 | --tf32 True \ 44 | --model_max_length 4096 \ 45 | --gradient_checkpointing True \ 46 | --dataloader_num_workers 4 \ 47 | --lazy_preprocess True \ 48 | --report_to wandb 49 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/filters/selection.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | from lmms_eval.api.filter import Filter 4 | 5 | 6 | class TakeFirstFilter(Filter): 7 | def __init__(self) -> None: 8 | """ 9 | Can define custom behavior here, if an individual instantiation of a Filter class should have state. 10 | """ 11 | 12 | def apply(self, resps, docs): 13 | """ 14 | Assuming each entry of `resps` is a list of model responses, we discard all but the first response. 15 | """ 16 | return map(lambda r: r[0], resps) 17 | 18 | 19 | class TakeKFilter(Filter): 20 | def __init__(self, *args, **kwargs) -> None: 21 | self.k = kwargs.pop("k") 22 | 23 | super().__init__(*args, **kwargs) 24 | 25 | def apply(self, resps, docs): 26 | # check we have at least k responses per doc, else we can't take the first k 27 | assert len(resps[0]) >= self.k, f"Need at least {self.k} responses per doc to take first {self.k}, but got {len(resps[0])} only! Please increase TaskConfig.repeats ." 
28 | return map(lambda r: r[: self.k], resps) 29 | 30 | 31 | class MajorityVoteFilter(Filter): 32 | def __init__(self) -> None: 33 | """ 34 | Can define custom behavior here, if an individual instantiation of a Filter class should have state. 35 | """ 36 | 37 | def apply(self, resps, docs): 38 | """ 39 | Each entry of `resps` is a list of model responses. 40 | We select the response that occurs most frequently in each entry of `resps`. 41 | """ 42 | 43 | def select_majority(resp): 44 | counts = Counter(resp) 45 | vote = counts.most_common(1)[0][0] 46 | return vote 47 | 48 | return map(lambda r: [select_majority(r)], resps) 49 | -------------------------------------------------------------------------------- /scripts/train/vpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export WANDB_PROJECT="VisPer-LM" 4 | export WANDB_NAME="v-pretrain_VisPer-LM-CLIP-ViT-Llama3-8b" 5 | 6 | # Base LLM choices: 7 | # Llama3-8b: meta-llama/Meta-Llama-3-8B-Instruct (llava_llama_3) 8 | # Phi3-4k-mini: microsoft/Phi-3-mini-4k-instruct (llava_phi_3) 9 | 10 | # Base encoder choices: 11 | # CLIP-ViT-L: openai/clip-vit-large-patch14-336 12 | # CLIP-ConvNeXT-XXL: laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup-res768 13 | 14 | # 8 GPUs 15 | deepspeed ola_vlm/train/train_mem.py \ 16 | --deepspeed ./scripts/zero2.json \ 17 | --model_name_or_path outputs/pretrain_dsg_VisPer-LM-CLIP-ViT-Llama3-8b \ 18 | --version llava_llama_3 \ 19 | --data_path datasets/allava/ALLaVA-Caption.json \ 20 | --image_folder datasets/allava \ 21 | --vision_tower openai/clip-vit-large-patch14-336 \ 22 | --mm_projector_type mlp2x_gelu \ 23 | --mm_vision_select_layer -2 \ 24 | --mm_use_im_start_end False \ 25 | --mm_use_im_patch_token False \ 26 | --image_aspect_ratio pad \ 27 | --group_by_modality_length True \ 28 | --bf16 True \ 29 | --output_dir outputs/v-pretrain_VisPer-LM-CLIP-ViT-Llama3-8b \ 30 | --num_train_epochs 1 \ 31 | --per_device_train_batch_size 16 \ 32 | --per_device_eval_batch_size 4 \ 33 | --gradient_accumulation_steps 1 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 200 \ 37 | --save_total_limit 3 \ 38 | --learning_rate 2e-5 \ 39 | --weight_decay 0. \ 40 | --warmup_ratio 0.03 \ 41 | --lr_scheduler_type "cosine" \ 42 | --logging_steps 1 \ 43 | --tf32 True \ 44 | --model_max_length 4096 \ 45 | --gradient_checkpointing True \ 46 | --dataloader_num_workers 4 \ 47 | --lazy_preprocess True \ 48 | --report_to wandb 49 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/scienceqa/utils.py: -------------------------------------------------------------------------------- 1 | def sqa_doc_to_text(doc, model_specific_prompt_kwargs=None): 2 | context, question, choices = doc["hint"], doc["question"], doc["choices"] 3 | len_choices = len(choices) 4 | options = [chr(ord("A") + i) for i in range(len_choices)] 5 | choices_str = "\n".join([f"{option}.
{choice}" for option, choice in zip(options, choices)]) 6 | if model_specific_prompt_kwargs["format"] == "default": 7 | if context: 8 | context = f"Context: {context}\n" 9 | 10 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 11 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 12 | return f"{pre_prompt}{context}{question}\n{choices_str}{post_prompt}" 13 | elif model_specific_prompt_kwargs["format"] == "qwen_vl": 14 | prompt = "Context: {}\nQuestion: {}\nOptions: {}\nAnswer:" 15 | context = context if context else "N/A" 16 | prompt = prompt.format(context, question, choices_str) 17 | return prompt 18 | else: 19 | raise ValueError(f"Unknown prompt format: {model_specific_prompt_kwargs}") 20 | 21 | 22 | def sqa_doc_to_visual(doc): 23 | if doc["image"] is None: 24 | return [] 25 | return [doc["image"].convert("RGB")] 26 | 27 | 28 | def sqa_doc_to_target(doc): 29 | len_choices = len(doc["choices"]) 30 | options = [chr(ord("A") + i) for i in range(len_choices)] 31 | return options[doc["answer"]] 32 | 33 | 34 | def sqa_process_results(doc, results): 35 | # I know this is weird, but it's how llava parse it. 36 | target = sqa_doc_to_target(doc) 37 | pred = results[0] 38 | if pred == target: 39 | return {"exact_match": 1.0} 40 | # pattern: ^[A-Z]\. .* 41 | if len(pred) >= 2 and pred[0].isupper() and pred[1] == ".": 42 | result = 1.0 if pred[0] == target else 0.0 43 | return {"exact_match": result} 44 | return {"exact_match": 0.0} 45 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/seedbench_2/seedbench_2.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/SEED-Bench-2 2 | dataset_kwargs: 3 | token: True 4 | task: "seedbench-2" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.seed_doc_to_visual 8 | doc_to_text: !function utils.seed_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 16 14 | image_aspect_ratio: original 15 | # The return value of process_results will be used by metrics 16 | process_results: !function utils.seed_process_result 17 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 18 | metric_list: 19 | - metric: seed_Video 20 | aggregation: !function utils.seed_aggregation_result 21 | higher_is_better: true 22 | - metric: seed_Multiple_Images 23 | aggregation: !function utils.seed_aggregation_result 24 | higher_is_better: true 25 | - metric: seed_Image_&_Text_Generation 26 | aggregation: !function utils.seed_aggregation_result 27 | higher_is_better: true 28 | - metric: seed_Single_Image 29 | aggregation: !function utils.seed_aggregation_result 30 | higher_is_better: true 31 | - metric: seed_Image_Generation 32 | aggregation: !function utils.seed_aggregation_result 33 | higher_is_better: true 34 | - metric: seed_Interleaved_Image 35 | aggregation: !function utils.seed_aggregation_result 36 | higher_is_better: true 37 | - metric: seed_all 38 | aggregation: !function utils.seed_aggregation_result 39 | higher_is_better: true 40 | metadata: 41 | - version: 0.0 42 | 43 | model_specific_prompt_kwargs: 44 | llava : 45 | img_token : 46 | post_prompt : "Answer with the option's letter from the given choices directly." 47 | gpt4V : 48 | img_token : 49 | post_prompt : "Answer with the option's letter from the given choices directly." 
-------------------------------------------------------------------------------- /lmms-eval/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 240 3 | 4 | [build-system] 5 | requires = ["setuptools>=42", "wheel", "setuptools_scm[tomli]>=6.3"] 6 | build-backend = "setuptools.build_meta" 7 | 8 | [project] 9 | name = "lmms_eval" 10 | version = "0.1.2" 11 | authors = [ 12 | { name = "LMMMs-Lab Evaluation Team", email = "lmms_eval@outlook.com" }, 13 | ] 14 | description = "A framework for evaluating large multi-modality language models" 15 | readme = "README.md" 16 | classifiers = [ 17 | "Programming Language :: Python :: 3", 18 | "License :: OSI Approved :: MIT License", 19 | "Operating System :: OS Independent", 20 | ] 21 | requires-python = ">=3.8" 22 | license = { text = "MIT" } 23 | dependencies = [ 24 | "accelerate>=0.21.0", 25 | "black==24.1.0", 26 | "datasets==2.16.1", 27 | "evaluate>=0.4.0", 28 | "loguru", 29 | "jsonlines", 30 | "numexpr", 31 | "peft>=0.2.0", 32 | "pybind11>=2.6.2", 33 | "pytablewriter", 34 | "rouge-score>=0.0.4", 35 | "sacrebleu>=1.5.0", 36 | "scikit-learn>=0.24.1", 37 | "sqlitedict", 38 | "torch>=1.8", 39 | "openai>=1.0.0", 40 | "pycocoevalcap", 41 | "tqdm-multiprocess", 42 | "transformers", 43 | "zstandard", 44 | "pillow", 45 | "pyyaml", 46 | "sympy", 47 | "mpmath", 48 | "Jinja2", 49 | "openpyxl", 50 | "Levenshtein", 51 | "hf_transfer", 52 | "tenacity", 53 | "wandb>=0.16.0", 54 | "transformers-stream-generator", 55 | "tiktoken", 56 | "pre-commit", 57 | "pydantic", 58 | ] 59 | 60 | [tool.setuptools.packages.find] 61 | include = ["lmms_eval*"] 62 | 63 | [tool.setuptools.package-data] 64 | lmms_eval = ["**/*.yaml", "tasks/**/*"] 65 | 66 | [project.scripts] 67 | lmms-eval = "lmms_eval.__main__:cli_evaluate" 68 | lmms_eval = "lmms_eval.__main__:cli_evaluate" 69 | 70 | [project.urls] 71 | Homepage = "https://lmms-lab.github.io/lmms-eval-blog/" 72 | Repository = "https://github.com/EvolvingLMMs-Lab/lmms-eval" 73 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/seedbench/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | def seed_doc_to_visual(doc): 5 | return [image.convert("RGB") for image in doc["image"]] 6 | 7 | 8 | def seed_doc_to_text(doc): 9 | question = doc["question"] 10 | question += "\n" + f"A. {doc['choice_a']}\n" 11 | question += f"B. {doc['choice_b']}\n" 12 | question += f"C. {doc['choice_c']}\n" 13 | question += f"D. {doc['choice_d']}" 14 | return f"{question}\nAnswer with the option's letter from the given choices directly." 
15 | 16 | 17 | def seed_process_result(doc, result): 18 | pred = result[0].strip() 19 | if len(pred) > 1: 20 | pred = pred[0] 21 | answer = doc["answer"] 22 | data_type = doc["data_type"] 23 | 24 | return {f"seed_{data_type}": {"pred": pred, "answer": answer, "question_id": doc["question_id"]}, f"seed_all": {"pred": pred, "answer": answer, "question_id": doc["question_id"]}} 25 | 26 | 27 | def seed_aggregation_result(results): 28 | total_count = 0 29 | total_correct = 0 30 | for result in results: 31 | if result["pred"] == result["answer"]: 32 | total_correct += 1 33 | total_count += 1 34 | return total_correct / total_count 35 | 36 | 37 | def seed_aggregation_result_all(results): 38 | score = seed_aggregation_result(results) 39 | stored_results = [] 40 | for result in results: 41 | stored_results.append({"question_id": result["question_id"], "prediction": result["pred"]}) 42 | with open("./seed_submission.json", "w") as f: 43 | json.dump(stored_results, f, indent=4) 44 | print("Storing files for seed_submission ...") 45 | 46 | return score 47 | 48 | 49 | def seed_doc_to_text_mc(doc): 50 | question = doc["question"] 51 | return f"{question} Answer :" 52 | 53 | 54 | def seed_doc_to_choice(doc): 55 | return [doc["choice_a"], doc["choice_b"], doc["choice_c"], doc["choice_d"]] 56 | 57 | 58 | def seed_doc_to_mc_target(doc): 59 | answer2choice = {"A": "choice_a", "B": "choice_b", "C": "choice_c", "D": "choice_d"} 60 | return doc[answer2choice[doc["answer"]]] 61 | -------------------------------------------------------------------------------- /ola_vlm/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava import LlavaLlamaForCausalLM 11 | 12 | 13 | def apply_delta(base_model_path, target_model_path, delta_path): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading delta") 19 | delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 21 | 22 | print("Applying delta") 23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data += base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \ 31 | f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 32 | bparam = base.state_dict()[name] 33 | param.data[:bparam.shape[0], :bparam.shape[1]] += bparam 34 | 35 | print("Saving target model") 36 | delta.save_pretrained(target_model_path) 37 | delta_tokenizer.save_pretrained(target_model_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--base-model-path", type=str, required=True) 43 | parser.add_argument("--target-model-path", type=str, required=True) 44 | parser.add_argument("--delta-path", type=str, required=True) 45 | 46 
| args = parser.parse_args() 47 | 48 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 49 | -------------------------------------------------------------------------------- /scripts/train/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export WANDB_PROJECT="VisPer-LM" 4 | export WANDB_NAME="pretrain_dsg_VisPer-LM-CLIP-ViT-Llama3-8b" 5 | 6 | # Base LLM choices: 7 | # Llama3-8b: meta-llama/Meta-Llama-3-8B-Instruct (llava_llama_3) 8 | # Phi3-4k-mini: microsoft/Phi-3-mini-4k-instruct (llava_phi_3) 9 | 10 | # Base encoder choices: 11 | # CLIP-ViT-L: openai/clip-vit-large-patch14-336 12 | # CLIP-ConvNeXT-XXL: laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup-res768 13 | 14 | # 8 GPUs 15 | deepspeed ola_vlm/train/ola_vlm_train_mem.py \ 16 | --deepspeed ./scripts/zero2.json \ 17 | --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \ 18 | --version llava_llama_3 \ 19 | --mode gen-depth-seg \ 20 | --layer_indices d18-20_s10-18_g12-20 \ 21 | --num_task_tokens 8 \ 22 | --loss_weights d0.5_s0.5_g0.5 \ 23 | --contrastive_loss_weight 0.3 \ 24 | --image_generator stabilityai/stable-diffusion-2-1-unclip \ 25 | --image_segmentor shi-labs/oneformer_coco_swin_large \ 26 | --depth_estimator depth_anything_v2_vitl.pth \ 27 | --data_path datasets/LLaVA-Pretrain/blip_laion_cc_sbu_558k.json \ 28 | --image_folder datasets/LLaVA-Pretrain/images \ 29 | --vision_tower openai/clip-vit-large-patch14-336 \ 30 | --mm_projector_type mlp2x_gelu \ 31 | --tune_mm_mlp_adapter True \ 32 | --mm_vision_select_layer -2 \ 33 | --mm_use_im_start_end False \ 34 | --mm_use_im_patch_token False \ 35 | --bf16 True \ 36 | --output_dir outputs/pretrain_dsg_VisPer-LM-CLIP-ViT-Llama3-8b \ 37 | --num_train_epochs 1 \ 38 | --per_device_train_batch_size 32 \ 39 | --per_device_eval_batch_size 4 \ 40 | --gradient_accumulation_steps 1 \ 41 | --evaluation_strategy "no" \ 42 | --save_strategy "steps" \ 43 | --save_steps 200 \ 44 | --save_total_limit 3 \ 45 | --learning_rate 1e-3 \ 46 | --weight_decay 0. \ 47 | --warmup_ratio 0.03 \ 48 | --lr_scheduler_type "cosine" \ 49 | --logging_steps 1 \ 50 | --tf32 True \ 51 | --model_max_length 4096 \ 52 | --gradient_checkpointing True \ 53 | --dataloader_num_workers 4 \ 54 | --lazy_preprocess True \ 55 | --report_to wandb -------------------------------------------------------------------------------- /ola_vlm/model/aux_heads/depth_anything_v2/dinov2_layers/swiglu_ffn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree.
6 | 7 | from typing import Callable, Optional 8 | 9 | from torch import Tensor, nn 10 | import torch.nn.functional as F 11 | 12 | 13 | class SwiGLUFFN(nn.Module): 14 | def __init__( 15 | self, 16 | in_features: int, 17 | hidden_features: Optional[int] = None, 18 | out_features: Optional[int] = None, 19 | act_layer: Callable[..., nn.Module] = None, 20 | drop: float = 0.0, 21 | bias: bool = True, 22 | ) -> None: 23 | super().__init__() 24 | out_features = out_features or in_features 25 | hidden_features = hidden_features or in_features 26 | self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) 27 | self.w3 = nn.Linear(hidden_features, out_features, bias=bias) 28 | 29 | def forward(self, x: Tensor) -> Tensor: 30 | x12 = self.w12(x) 31 | x1, x2 = x12.chunk(2, dim=-1) 32 | hidden = F.silu(x1) * x2 33 | return self.w3(hidden) 34 | 35 | 36 | try: 37 | from xformers.ops import SwiGLU 38 | 39 | XFORMERS_AVAILABLE = True 40 | except ImportError: 41 | SwiGLU = SwiGLUFFN 42 | XFORMERS_AVAILABLE = False 43 | 44 | 45 | class SwiGLUFFNFused(SwiGLU): 46 | def __init__( 47 | self, 48 | in_features: int, 49 | hidden_features: Optional[int] = None, 50 | out_features: Optional[int] = None, 51 | act_layer: Callable[..., nn.Module] = None, 52 | drop: float = 0.0, 53 | bias: bool = True, 54 | ) -> None: 55 | out_features = out_features or in_features 56 | hidden_features = hidden_features or in_features 57 | hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 58 | super().__init__( 59 | in_features=in_features, 60 | hidden_features=hidden_features, 61 | out_features=out_features, 62 | bias=bias, 63 | ) 64 | -------------------------------------------------------------------------------- /ola_vlm/model/aux_heads/gen_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import torch 8 | from torch import nn 9 | from ola_vlm.model.multimodal_projector.resampler import Resampler, TaskTokenResampler 10 | 11 | 12 | class GenHead(nn.Module): 13 | 14 | def __init__( 15 | self, 16 | proj_config: dict = None, 17 | llm_hidden_size: int = 4096, 18 | ) -> None: 19 | super().__init__() 20 | 21 | self.projector = Resampler( 22 | dim=proj_config["output_dim"], 23 | depth=proj_config["depth"], 24 | dim_head=proj_config["dim_head"], 25 | heads=proj_config["num_heads"], 26 | num_queries=proj_config["num_tokens"], 27 | embedding_dim=llm_hidden_size, 28 | output_dim=proj_config["output_dim"], 29 | ff_mult=proj_config["ff_mult"], 30 | ) 31 | 32 | def forward( 33 | self, 34 | llm_feats: torch.Tensor, 35 | ): 36 | gen_feats = self.projector(llm_feats) 37 | return gen_feats 38 | 39 | class TaskTokenGenHead(nn.Module): 40 | 41 | def __init__( 42 | self, 43 | proj_config: dict = None, 44 | llm_hidden_size: int = 4096, 45 | ) -> None: 46 | super().__init__() 47 | 48 | self.projector = TaskTokenResampler( 49 | dim=proj_config["output_dim"], 50 | depth=proj_config["depth"], 51 | dim_head=proj_config["dim_head"], 52 | heads=proj_config["num_heads"], 53 | num_queries=proj_config["num_tokens"], 54 | embedding_dim=llm_hidden_size, 55 | output_dim=proj_config["output_dim"], 56 | ff_mult=proj_config["ff_mult"], 57 | ) 58 | 59 | def forward( 60 | self, 61 | llm_feats: torch.Tensor, 62 | latents: torch.Tensor 63 | ): 64 | gen_feats = self.projector(llm_feats, latents) 65 | return gen_feats -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/websrc/README.md: -------------------------------------------------------------------------------- 1 | # WebSRC 2 | 3 | ## Paper 4 | 5 | Title: WebSRC: A Dataset for Web-Based Structural Reading Comprehension 6 | 7 | Abstract: https://arxiv.org/abs/2101.09465 8 | 9 | Homepage: https://x-lance.github.io/WebSRC/# 10 | 11 | WebSRC is a dataset for web-based structural reading comprehension. 12 | Its full train/dev/test split contains over 400k questions across 6.4k webpages. 13 | This version of the dataset does not contain OCR or original HTML, it simply treats WebSRC as a image-and-text-based multimodal Q&A benchmark on webpage screenshots. 14 | 15 | ## Citation 16 | 17 | ```bibtex 18 | @inproceedings{chen2021websrc, 19 | title={WebSRC: A Dataset for Web-Based Structural Reading Comprehension}, 20 | author={Chen, Xingyu and Zhao, Zihan and Chen, Lu and Ji, Jiabao and Zhang, Danyang and Luo, Ao and Xiong, Yuxuan and Yu, Kai}, 21 | booktitle={Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing}, 22 | pages={4173--4185}, 23 | year={2021} 24 | } 25 | ``` 26 | 27 | ## Groups & Tasks 28 | 29 | ### Groups 30 | 31 | - `websrc`: Evaluates `websrc-val` and generates a submission file for `websrc-test`. 32 | 33 | ### Tasks 34 | 35 | - `websrc-val`: Given a question and a web page, predict the answer. 36 | - `websrc-test`: Given a question and a web page, predict the answer. Ground truth is not provided for this task. 37 | 38 | ## Metrics 39 | 40 | This task uses SQUAD-style evaluation metrics, of which F1 score over tokens is used. 41 | The orignal paper also uses Exact Match (EM) score, but this is not implemented here as that metric is more conducive for Encoder-only extraction models. 42 | 43 | ### F1 Score 44 | 45 | F1 Score is the harmonic mean of precision and recall. 
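As an illustration of this token-level scoring (a hedged sketch, not the evaluation code bundled with this task), F1 for a single prediction/ground-truth pair can be computed like so:

```python
# Sketch only: SQuAD-style token-level F1 for one prediction/answer pair.
from collections import Counter


def token_f1(prediction: str, ground_truth: str) -> float:
    pred_tokens = prediction.lower().split()
    gt_tokens = ground_truth.lower().split()
    common = Counter(pred_tokens) & Counter(gt_tokens)  # per-token overlap counts
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)


print(token_f1("the red button", "a red button"))  # 0.666...
```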
46 | We calculate precision and recall at the token level, then compute the F1 score as normal using these values. 47 | 48 | ### Test Submission 49 | 50 | When evaluating on the test split, a prediction JSON will be compiled instead of metrics computed. 51 | Instructions for submission are available on the [WebSRC homepage](https://x-lance.github.io/WebSRC/#) and in their [Original GitHub Repo](https://github.com/X-LANCE/WebSRC-Baseline#obtain-test-result). -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/api/filter.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List 3 | 4 | from lmms_eval.api.instance import Instance 5 | from datasets import Dataset 6 | 7 | 8 | class Filter: 9 | """ 10 | Filter classes operate on a per-task level. 11 | They take all model outputs (`instance.resps` for all `task.instances`) 12 | across all instances of a task, and perform operations. 13 | In a single run, one can configure any number of separate filters or lists of filters. 14 | 15 | """ 16 | 17 | def __init__(self, *args, **kwargs) -> None: 18 | """ 19 | Can define custom behavior here, if an individual instantiation of a Filter class should have state. 20 | """ 21 | 22 | def apply(self, resps, docs): 23 | """ 24 | Defines the operation to perform on a list of the `inst.resps` properties of `Instance` objects. 25 | Should return the list of (filtered) response lists *in the same order as they were input*, e.g. 26 | if passed in [<inst.resps for instance 0>, <inst.resps for instance 1>] should return 27 | [<filtered resps for instance 0>, <filtered resps for instance 1>] 28 | """ 29 | return resps 30 | 31 | 32 | @dataclass 33 | class FilterEnsemble: 34 | """ 35 | FilterEnsemble creates a pipeline applying multiple filters. 36 | Its intended usage is to stack multiple post-processing steps in order. 37 | `task.apply_filters` should use a list of FilterEnsemble classes that it stores, to apply each 38 | pipeline separately. 39 | """ 40 | 41 | name: str 42 | filters: List[Filter] 43 | 44 | def apply(self, instances: List[Instance], docs: List[Dataset]) -> None: 45 | resps = [inst.resps for inst in instances] # operate just on the model responses 46 | for f in self.filters: 47 | # apply filters in sequence 48 | resps = f.apply(resps, docs) 49 | 50 | # add the end results after filtering to filtered_resps of their respective source instances. 51 | # has key `self.name`: each FilterEnsemble applied in a given run should use a different name.
52 | for inst, resp in zip(instances, resps): 53 | inst.filtered_resps[self.name] = resp 54 | -------------------------------------------------------------------------------- /ola_vlm/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | from ola_vlm.model.multimodal_projector.resampler import Resampler 5 | 6 | 7 | class IdentityMap(nn.Module): 8 | def __init__(self): 9 | super().__init__() 10 | 11 | def forward(self, x, *args, **kwargs): 12 | return x 13 | 14 | @property 15 | def config(self): 16 | return {"mm_projector_type": 'identity'} 17 | 18 | 19 | class SimpleResBlock(nn.Module): 20 | def __init__(self, channels): 21 | super().__init__() 22 | self.pre_norm = nn.LayerNorm(channels) 23 | 24 | self.proj = nn.Sequential( 25 | nn.Linear(channels, channels), 26 | nn.GELU(), 27 | nn.Linear(channels, channels) 28 | ) 29 | def forward(self, x): 30 | x = self.pre_norm(x) 31 | return x + self.proj(x) 32 | 33 | 34 | def build_resampler(config, num_queries=None): 35 | return Resampler( 36 | dim=config["probe_output_dim"], 37 | depth=config["probe_depth"], 38 | dim_head=config["probe_dim_head"], 39 | heads=config["probe_num_heads"], 40 | num_queries=config["num_queries"] if num_queries is None else num_queries, 41 | embedding_dim=config.hidden_size, 42 | output_dim=config["probe_output_dim"], 43 | ff_mult=config["probe_ff_mult"], 44 | ) 45 | 46 | 47 | def build_vision_projector(config, delay_load=False, **kwargs): 48 | projector_type = getattr(config, 'mm_projector_type', 'linear') 49 | 50 | if projector_type == 'linear': 51 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 52 | 53 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 54 | if mlp_gelu_match: 55 | mlp_depth = int(mlp_gelu_match.group(1)) 56 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 57 | for _ in range(1, mlp_depth): 58 | modules.append(nn.GELU()) 59 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 60 | return nn.Sequential(*modules) 61 | 62 | if projector_type == 'identity': 63 | return IdentityMap() 64 | 65 | raise ValueError(f'Unknown projector type: {projector_type}') 66 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/iconqa/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | 5 | def options_to_str(options_prompt): 6 | option_prompt_str = "" 7 | for i, option in enumerate(options_prompt): 8 | option_choice = chr(ord("A") + i) 9 | option_prompt_str += f"{option_choice}. 
{option}\n" 10 | 11 | option_prompt_str = option_prompt_str.rstrip("\n") 12 | return option_prompt_str 13 | 14 | 15 | def doc_to_visual(doc): 16 | image_list = [] 17 | if "query_image" in doc: 18 | image_list.append(doc["query_image"].convert("RGB")) 19 | for i in range(5): 20 | id = f"choice_image_{i}" 21 | if id in doc and doc[id] is not None: 22 | image_list.append(doc[id].convert("RGB")) 23 | assert len(image_list) < 6, "Maximum 5 images allowed for ICON-QA" 24 | return image_list 25 | 26 | 27 | def doc_to_text(doc, model_specific_prompt_kwargs): 28 | question = doc["question"] 29 | ques_type = doc["ques_type"] 30 | options_prompt = [] 31 | 32 | if ques_type == "choose_img": 33 | options_prompt.append("The first image.") 34 | options_prompt.append("The second image.") 35 | 36 | options_str = options_to_str(options_prompt) 37 | full_prompt = f"{model_specific_prompt_kwargs['pre_prompt']}{model_specific_prompt_kwargs['statement']}{model_specific_prompt_kwargs['options_statement'].format(question=question, options=options_str)}" 38 | 39 | elif ques_type == "choose_txt": 40 | choices = doc["choices"].split(",") 41 | for i, choice in enumerate(choices): 42 | options_prompt.append(f"{choice}") 43 | 44 | options_str = options_to_str(options_prompt) 45 | full_prompt = f"{model_specific_prompt_kwargs['pre_prompt']}{model_specific_prompt_kwargs['statement']}{model_specific_prompt_kwargs['options_statement'].format(question=question, options=options_str)}" 46 | 47 | elif ques_type == "fill_in_blank": 48 | full_prompt = f"{model_specific_prompt_kwargs['pre_prompt']}{model_specific_prompt_kwargs['statement']}{model_specific_prompt_kwargs['freeform_statement'].format(question=question)}" 49 | 50 | return full_prompt 51 | 52 | 53 | def test_process_results(doc, results): 54 | pred = results[0] 55 | questionId = doc["question_id"] 56 | answer = doc["answer"] 57 | return {"anls": {"questionId": int(questionId), "answer": answer, "pred_answer": pred}} 58 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/seedbench_2/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | def seed_doc_to_visual(doc): 5 | return [image.convert("RGB") for image in doc["image"]] 6 | 7 | 8 | def parse_choice_img(choice: str, img_token: str): 9 | if "jpg" in choice or "png" in choice: 10 | return img_token 11 | return choice 12 | 13 | 14 | def seed_doc_to_text(doc, model_specific_kwargs=None): 15 | question = doc["question"] 16 | question.replace("", model_specific_kwargs["img_token"]) 17 | question += "\n" + f"A. {parse_choice_img(doc['choice_a'], model_specific_kwargs['img_token'])}\n" 18 | question += f"B. {parse_choice_img(doc['choice_b'], model_specific_kwargs['img_token'])}\n" 19 | question += f"C. {parse_choice_img(doc['choice_c'], model_specific_kwargs['img_token'])}\n" 20 | question += f"D. 
{parse_choice_img(doc['choice_d'], model_specific_kwargs['img_token'])}" 21 | if doc["data_type"] == "Image Generation": 22 | num_img_in_question = len(doc["data_id"]) - 4 23 | prepend_tokens = [model_specific_kwargs["img_token"]] * num_img_in_question 24 | question = " ".join(prepend_tokens) + "\n" + question 25 | return f"{question}\n{model_specific_kwargs['post_prompt']}" 26 | 27 | 28 | def seed_process_result(doc, result): 29 | pred = result[0].strip() 30 | if len(pred) > 1: 31 | pred = pred[0] 32 | answer = doc["answer"] 33 | data_type = doc["data_type"].split(" ") 34 | data_type = "_".join(data_type) 35 | 36 | return {f"seed_{data_type}": {"pred": pred, "answer": answer, "question_id": doc["question_id"]}, f"seed_all": {"pred": pred, "answer": answer, "question_id": doc["question_id"]}} 37 | 38 | 39 | def seed_aggregation_result(results): 40 | total_count = 0 41 | total_correct = 0 42 | for result in results: 43 | if result["pred"] == result["answer"]: 44 | total_correct += 1 45 | total_count += 1 46 | return total_correct / total_count if total_count != 0 else 0 47 | 48 | 49 | def seed_aggregation_result_all(results): 50 | score = seed_aggregation_result(results) 51 | stored_results = [] 52 | for result in results: 53 | stored_results.append({"question_id": result["question_id"], "prediction": result["pred"]}) 54 | with open("./seed_submission.json", "w") as f: 55 | json.dump(stored_results, f, indent=4) 56 | print("Storing files for seed_submission ...") 57 | 58 | return score 59 | -------------------------------------------------------------------------------- /ola_vlm/model/make_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m ola_vlm.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from ola_vlm.model.utils import auto_upgrade 11 | 12 | 13 | def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading target model") 19 | auto_upgrade(target_model_path) 20 | target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 21 | 22 | print("Calculating delta") 23 | for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data -= base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 31 | bparam = base.state_dict()[name] 32 | param.data[:bparam.shape[0], :bparam.shape[1]] -= bparam 33 | 34 | print("Saving delta") 35 | if hub_repo_id: 36 | kwargs = {"push_to_hub": True, "repo_id": hub_repo_id} 37 | else: 38 | kwargs = {} 39 | target.save_pretrained(delta_path, **kwargs) 40 | target_tokenizer = AutoTokenizer.from_pretrained(target_model_path) 41 | target_tokenizer.save_pretrained(delta_path, **kwargs) 42 | 43 | 44 | if __name__ == 
"__main__": 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument("--base-model-path", type=str, required=True) 47 | parser.add_argument("--target-model-path", type=str, required=True) 48 | parser.add_argument("--delta-path", type=str, required=True) 49 | parser.add_argument("--hub-repo-id", type=str, default=None) 50 | args = parser.parse_args() 51 | 52 | make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id) 53 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/naturalbench/naturalbench.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: BaiqiL/NaturalBench-lmms-eval # The name of the dataset as listed by HF in the datasets Hub. 2 | dataset_kwargs: 3 | token: True # Auxiliary arguments that `datasets.load_dataset` accepts. This can be used to specify arguments such as `data_files` or `data_dir` if you want to use local datafiles such as json or csv. 4 | task: "naturalbench" # The name of the task, this should be registered in the task manager. If successful, you can call lmms_eval with this task name by setting `--tasks mme`. 5 | test_split: test # The split of the dataset to use as the test split. 6 | output_type: generate_until # The type of model output for the given task. Options are `generate_until`, `loglikelihood`, and `multiple_choice`. 7 | doc_to_visual: !function utils.naturalbench_doc_to_visual # The function to process a sample into the appropriate input for the model. 8 | doc_to_text: !function utils.naturalbench_doc_to_text # The function to process a sample into the appropriate target output for the model. 9 | doc_to_target: "answer" # The function to process a sample into a list of possible string choices for `multiple_choice` tasks. 10 | generation_kwargs: # Auxiliary arguments for the `generate` function from HF transformers library. This would be used in different models files. 11 | max_new_tokens: 16 12 | temperature: 0 13 | top_p: 1.0 14 | num_beams: 1 15 | do_sample: false 16 | # The return value of process_results will be used by metrics 17 | process_results: !function utils.naturalbench_process_results 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | # e.g. Following metrics `mme_perception_score` is custom defined. 20 | # So `mme_process_results` function should return the dict `{"mme_perception_score": {sub_k:sub_v, ..., } }` 21 | # And the `mme_aggregate_results` function could get the dict `{sub_k:sub_v, ..., }`, and use the information to gather the final accuracy. 22 | metric_list: 23 | - metric: naturalbench_score # The name of the metric to use for evaluation. The process_results function should return the metric name and the metric value, in format of `{metric_name: results}`. And the aggregation function will use the results to get the final score. 24 | aggregation: !function utils.naturalbench_aggregate_results # The name of the aggregation function to use for evaluation. 25 | higher_is_better: true # Whether the metric is better when the value is higher. 
-------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ok_vqa/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import json 4 | import yaml 5 | import pathlib 6 | import logging 7 | import datetime 8 | import statistics 9 | 10 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file 11 | from lmms_eval.tasks._task_utils.vqa_eval_metric import EvalAIAnswerProcessor 12 | 13 | eval_logger = logging.getLogger("lmms-eval") 14 | 15 | 16 | def ok_vqa_doc_to_visual(doc): 17 | return [doc["image"].convert("RGB")] 18 | 19 | 20 | def ok_vqa_process_results(doc, result): 21 | eval_ai_processor = EvalAIAnswerProcessor() 22 | assert len(result) == 1, f"The result should be a list of length 1, but got {len(result)}." 23 | resAns = eval_ai_processor(result[0]) 24 | accuracy = 0 25 | 26 | if "answers" in doc and doc["answers"] is not None: 27 | gtAcc = [] 28 | 29 | for i in range(len(doc["answers"])): 30 | doc["answers"][i] = eval_ai_processor(doc["answers"][i]) 31 | 32 | for i in range(len(doc["answers"])): 33 | otherGTAns = [doc["answers"][j] for j in range(len(doc["answers"])) if i != j] 34 | matchingAns = [item for item in otherGTAns if item == resAns] 35 | acc = min(1, float(len(matchingAns)) / 3) 36 | gtAcc.append(acc) 37 | if gtAcc: 38 | accuracy = statistics.mean(gtAcc) 39 | else: 40 | accuracy = 0 41 | 42 | return { 43 | "exact_match": accuracy, 44 | "submission": { 45 | "image": f"{doc['question_id']}.jpg", 46 | "answer": resAns, 47 | }, 48 | } 49 | 50 | 51 | def ok_vqa_doc_to_text(doc, model_specific_prompt_kwargs=None): 52 | question = doc["question"] 53 | if model_specific_prompt_kwargs is None: 54 | model_specific_prompt_kwargs = {} 55 | pre_prompt = "" 56 | post_prompt = "" 57 | if "pre_prompt" in model_specific_prompt_kwargs: 58 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 59 | if "post_prompt" in model_specific_prompt_kwargs: 60 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 61 | return f"{pre_prompt}{question}{post_prompt}" 62 | 63 | 64 | def ok_vqa_aggreate_submissions(results, args): 65 | now_date_time = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S") 66 | file = f"ok_vqa-test-submission-{now_date_time}.json" 67 | path = generate_submission_file(file, args) 68 | with open(path, "w") as f: 69 | json.dump(results, f) 70 | print(f"Submission file saved to {path}") 71 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import json 4 | import yaml 5 | import pathlib 6 | import logging 7 | import datetime 8 | import statistics 9 | 10 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file 11 | from lmms_eval.tasks._task_utils.vqa_eval_metric import EvalAIAnswerProcessor 12 | 13 | eval_logger = logging.getLogger("lmms-eval") 14 | 15 | 16 | def vizwiz_vqa_doc_to_visual(doc): 17 | return [doc["image"].convert("RGB")] 18 | 19 | 20 | def vizwiz_vqa_process_results(doc, result): 21 | eval_ai_processor = EvalAIAnswerProcessor() 22 | assert len(result) == 1, f"The result should be a list of length 1, but got {len(result)}." 
23 | resAns = eval_ai_processor(result[0]) 24 | accuracy = 0 25 | 26 | if "answers" in doc and doc["answers"] is not None: 27 | gtAcc = [] 28 | 29 | for i in range(len(doc["answers"])): 30 | doc["answers"][i] = eval_ai_processor(doc["answers"][i]) 31 | 32 | for i in range(len(doc["answers"])): 33 | otherGTAns = [doc["answers"][j] for j in range(len(doc["answers"])) if i != j] 34 | matchingAns = [item for item in otherGTAns if item == resAns] 35 | acc = min(1, float(len(matchingAns)) / 3) 36 | gtAcc.append(acc) 37 | if gtAcc: 38 | accuracy = statistics.mean(gtAcc) 39 | else: 40 | accuracy = 0 41 | 42 | return { 43 | "exact_match": accuracy, 44 | "submission": { 45 | "image": f"{doc['question_id']}.jpg", 46 | "answer": resAns, 47 | }, 48 | } 49 | 50 | 51 | def vizwiz_vqa_doc_to_text(doc, model_specific_prompt_kwargs=None): 52 | if model_specific_prompt_kwargs is None: 53 | model_specific_prompt_kwargs = {} 54 | pre_prompt = "" 55 | post_prompt = "" 56 | if "pre_prompt" in model_specific_prompt_kwargs: 57 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 58 | if "post_prompt" in model_specific_prompt_kwargs: 59 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 60 | text = f"{pre_prompt}{doc['question'].capitalize()}{post_prompt}" 61 | return text 62 | 63 | 64 | def vizwiz_vqa_aggreate_submissions(results, args): 65 | now_date_time = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S") 66 | submission_file_name = f"vizwiz_vqa-test-submission-{now_date_time}.json" 67 | path = generate_submission_file(submission_file_name, args) 68 | with open(path, "w") as f: 69 | json.dump(results, f) 70 | print(f"Submission file saved to {path}") 71 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/olympiadbench/cn_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import datetime 4 | from lmms_eval.tasks.olympiadbench.olympiadbench_evals import OlympiadBenchEvaluator 5 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file 6 | 7 | import logging 8 | 9 | eval_logger = logging.getLogger("lmms-eval") 10 | dir_name = os.path.dirname(os.path.abspath(__file__)) 11 | 12 | try: 13 | olympiadbench_evaluator = OlympiadBenchEvaluator() 14 | except: 15 | pass 16 | 17 | 18 | def olympiadbench_doc_to_visual(doc): 19 | return [image.convert("RGB") for image in doc["images"]] 20 | 21 | 22 | def olympiadbench_doc_to_text(doc): 23 | question = doc["question"] 24 | subject = doc["subfield"] 25 | mul_ans = doc["is_multiple_answer"] 26 | if mul_ans is None: 27 | mul_ans = False 28 | ans_type = doc["answer_type"] 29 | if ans_type == "Need_human_evaluate": 30 | ans_type = "proof based" 31 | 32 | pre_prompt = f"以下是中国{subject}竞赛中的解答题。\n" 33 | 34 | post_prompt = "" 35 | if not mul_ans: 36 | post_prompt += f"答案类型为{ans_type}。\n" 37 | else: 38 | post_prompt += f"题目有多个答案,答案类型均为{ans_type}。\n" 39 | post_prompt += "请根据题目的要求和所提供的信息计算得出答案。解答过程和结果中使用的变量和公式请使用LaTeX格式表示。请在最后以" 40 | if not mul_ans: 41 | post_prompt += '"所以最终答案是\\boxed{答案}。"\n' 42 | else: 43 | post_prompt += '"所以最终答案是\\boxed{用英⽂逗号连接的多个答案}。"\n' 44 | 45 | final_question = pre_prompt + question + "\n" + post_prompt 46 | return final_question 47 | 48 | 49 | def olympiadbench_process_results(doc, results): 50 | precision = doc["error"] 51 | is_proving = "TP" in doc["source"] 52 | if precision is None: 53 | precision = 0 54 | prediction = results[0].strip() 55 | 56 | if is_proving: 57 | return {"submission": 
prediction}
58 |     else:
59 |         prediction = prediction.split("所以最终答案是")[-1]
60 |         prediction = prediction.replace('"', "").replace("\n", "").replace(" ", "").strip(".").strip("。")
61 |         accuracy = olympiadbench_evaluator.judge(prediction, doc["final_answer"][0], precision)
62 |         accuracy = int(accuracy)
63 |         return {"exact_match": accuracy}
64 | 
65 | 
66 | def olympiadbench_aggregate_results(results, args):
67 |     now_date_time = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S")
68 |     submission_file_name = f"olympiadbench-test-cn-submission-{now_date_time}.json"
69 |     path = generate_submission_file(submission_file_name, args)
70 |     with open(path, "w") as f:
71 |         json.dump(results, f, ensure_ascii=False)
72 |     print(f"Submission file saved to {path}")
--------------------------------------------------------------------------------
/ola_vlm/eval/eval_cv_bench.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import json
3 | import argparse
4 | import re
5 | 
6 | def load_jsonl(f):
7 |     lines = open(f, encoding='utf-8').readlines()
8 |     lines = [x.strip() for x in lines]
9 |     if lines[-1] == '':
10 |         lines = lines[:-1]
11 |     data = [json.loads(x) for x in lines]
12 |     return data
13 | 
14 | if __name__ == '__main__':
15 | 
16 |     parser = argparse.ArgumentParser()
17 |     parser.add_argument("--results_file", type=str, default="cv-bench_answer.jsonl")
18 |     args = parser.parse_args()
19 | 
20 |     answers = load_jsonl(args.results_file)
21 | 
22 |     data = {
23 |         "source": [],
24 |         "result": [],
25 |         "task": [],
26 |     }
27 |     for a in answers:
28 |         data["source"].append(a["source"][0])
29 |         if "(" in a["prediction"]:
30 |             match = re.search(r'\(([A-Z])\)', a["prediction"])
31 |             if match:
32 |                 pred = "(" + match.group(1) + ")"
33 |             else:
34 |                 pred = "(" + a["prediction"][0] + ")"
35 |         else:
36 |             # No "(X)" style option in the prediction: fall back to its first
37 |             # character so that `pred` is always defined before the comparison.
38 |             pred = "(" + a["prediction"][0] + ")"
39 |         data["result"].append(pred == a["answer"][0])
40 |         data["task"].append(a["task"][0])
41 | 
42 |     df = pd.DataFrame(data)
43 | 
44 |     def calculate_accuracy(df, source):
45 |         source_df = df[df['source'] == source]
46 |         accuracy = source_df['result'].mean()
47 |         return accuracy
48 | 
49 |     def calculate_task_accuracy(df, task):
50 |         task_df = df[df['task'] == task]
51 |         accuracy = task_df['result'].mean()
52 |         return accuracy
53 | 
54 |     accuracy_2d_ade = calculate_accuracy(df, 'ADE20K')
55 |     accuracy_2d_coco = calculate_accuracy(df, 'COCO')
56 |     accuracy_3d_omni = calculate_accuracy(df, 'Omni3D')
57 | 
58 |     tasks = ["Count", "Depth", "Relation", "Distance"]
59 | 
60 |     scores = {}
61 | 
62 |     # 2D accuracy is the mean of the ADE20K and COCO splits; the overall score is
63 |     # the mean of the 2D and 3D accuracies.
64 |     accuracy_2d = (accuracy_2d_ade + accuracy_2d_coco) / 2
65 |     accuracy_3d = accuracy_3d_omni
66 | 
67 |     combined_accuracy = (accuracy_2d + accuracy_3d) / 2
68 | 
69 |     scores["Overall"] = combined_accuracy
70 | 
71 |     scores["3D"] = accuracy_3d
72 |     scores["2D"] = accuracy_2d
73 | 
74 |     for t in tasks:
75 |         accuracy = calculate_task_accuracy(df, t)
76 |         scores[t] = accuracy
77 | 
78 |     print("\n=========================CV-Bench Scores===============================")
79 |     for key, value in scores.items():
80 |         print(f"{key} -> {value}")
81 |     print("================================================================")
82 | 
83 |     with open(args.results_file.replace('.jsonl', '_score.json'), "w") as f:
84 |         json.dump(scores, f, indent=2)
--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/chartqa/utils.py:
--------------------------------------------------------------------------------
1 | def chartqa_doc_to_visual(doc):
2 |     return [doc["image"].convert("RGB")]
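# Note: unlike the ok_vqa/vizwiz variants, `chartqa_doc_to_text` below indexes
# "pre_prompt" and "post_prompt" directly, so the task config must supply both keys.
# A hypothetical kwargs dict would look like:
#     {"pre_prompt": "", "post_prompt": "\nAnswer the question with a single word."}
# (the exact strings come from the task YAML, not from this module).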
3 | 
4 | 
5 | def chartqa_doc_to_text(doc, model_specific_prompt_kwargs):
6 |     question = doc["question"]
7 |     pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
8 |     post_prompt = model_specific_prompt_kwargs["post_prompt"]
9 |     return f"{pre_prompt}{question}{post_prompt}"
10 | 
11 | 
12 | def chartqa_process_results(doc, results):
13 |     pred = results[0]
14 |     question_type = doc["type"]
15 |     score = relaxed_correctness(pred, doc["answer"])
16 |     score = 1.0 if score else 0.0
17 |     return_dict = {"relaxed_overall": score}
18 |     if question_type == "human_test":
19 |         return_dict["relaxed_human_split"] = score
20 |     else:
21 |         return_dict["relaxed_augmented_split"] = score
22 |     return return_dict
23 | 
24 | 
25 | def relaxed_correctness(prediction, target, max_relative_change: float = 0.05) -> bool:
26 |     """Calculates relaxed correctness.
27 | 
28 |     The correctness tolerates a certain error ratio defined by max_relative_change.
29 |     See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1:
30 |     “Following Methani et al. (2020), we use a relaxed accuracy measure for the
31 |     numeric answers to allow a minor inaccuracy that may result from the automatic
32 |     data extraction process. We consider an answer to be correct if it is within
33 |     5% of the gold answer. For non-numeric answers, we still need an exact match
34 |     to consider an answer to be correct.”
35 | 
36 |     This function is taken from https://github.com/QwenLM/Qwen-VL/blob/34b4c0ee7b07726371b960911f249fe61b362ca3/eval_mm/evaluate_vqa.py#L113
37 |     Args:
38 |         prediction: Predicted answer string.
39 |         target: Ground-truth answer string.
40 |         max_relative_change: Maximum relative change allowed for numeric answers.
41 | 
42 |     Returns:
43 |         Whether the prediction was correct given the specified tolerance.
44 |     """
45 | 
46 |     def _to_float(text: str):
47 |         try:
48 |             if text.endswith("%"):
49 |                 # Convert percentages to floats.
50 |                 return float(text.rstrip("%")) / 100.0
51 |             else:
52 |                 return float(text)
53 |         except ValueError:
54 |             return None
55 | 
56 |     prediction_float = _to_float(prediction)
57 |     target_float = _to_float(target)
58 |     # A target of exactly 0 is treated as non-numeric here and falls back to the
59 |     # exact string match below, mirroring the upstream implementation.
60 |     if prediction_float is not None and target_float:
61 |         relative_change = abs(prediction_float - target_float) / abs(target_float)
62 |         return relative_change <= max_relative_change
63 |     else:
64 |         return prediction.lower() == target.lower()
65 | 
--------------------------------------------------------------------------------
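A quick, self-contained sanity check of the relaxed-correctness rule above (an illustrative sketch, not part of the repository; it assumes the `lmms_eval` package is installed so the import path below resolves):

# Illustrative check of relaxed_correctness from lmms_eval/tasks/chartqa/utils.py.
from lmms_eval.tasks.chartqa.utils import relaxed_correctness

# Numeric answers tolerate up to 5% relative error.
assert relaxed_correctness("0.305", "0.31")        # |0.305 - 0.31| / 0.31 ≈ 1.6%
assert not relaxed_correctness("12", "15")         # 20% relative error, rejected
# Percentages are normalised to fractions before comparison.
assert relaxed_correctness("30%", "0.3")
# Non-numeric answers fall back to a case-insensitive exact match.
assert relaxed_correctness("Two", "two")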