├── lmms-eval ├── lmms_eval │ ├── __init__.py │ ├── api │ │ ├── __init__.py │ │ ├── instance.py │ │ └── filter.py │ ├── models │ │ ├── model_utils │ │ │ └── __init__.py │ │ └── __init__.py │ ├── tasks │ │ ├── _task_utils │ │ │ ├── gpt_eval_utils.py │ │ │ └── file_utils.py │ │ ├── multilingual-llava-bench-in-the-wild │ │ │ ├── README.md │ │ │ ├── urdu_llava_in_the_wild.yaml │ │ │ ├── arabic_llava_in_the_wild.yaml │ │ │ ├── french_llava_in_the_wild.yaml │ │ │ ├── hindi_llava_in_the_wild.yaml │ │ │ ├── spanish_llava_in_the_wild.yaml │ │ │ ├── bengali_llava_in_the_wild.yaml │ │ │ ├── chinese_llava_in_the_wild.yaml │ │ │ ├── russian_llava_in_the_wild.yaml │ │ │ ├── japanese_llava_in_the_wild.yaml │ │ │ └── _default_template.yaml │ │ ├── ok_vqa │ │ │ ├── _ok_vqa.yaml │ │ │ ├── ok_vqa_val2014.yaml │ │ │ ├── _default_template_vqa_yaml │ │ │ ├── _generate_config.py │ │ │ └── utils.py │ │ ├── flickr30k │ │ │ ├── flickr30k.yaml │ │ │ └── flickr30k_test.yaml │ │ ├── mmmu │ │ │ ├── mmmu.yaml │ │ │ ├── mmmu_test.yaml │ │ │ └── mmmu_val.yaml │ │ ├── vqav2 │ │ │ ├── _vqav2.yaml │ │ │ ├── vqav2_test.yaml │ │ │ ├── vqav2_val.yaml │ │ │ └── _default_template_vqav2_yaml │ │ ├── cmmmu │ │ │ ├── _cmmmu.yaml │ │ │ ├── _default_template_cmmmu_yaml │ │ │ ├── cmmmu_test.yaml │ │ │ └── cmmmu_val.yaml │ │ ├── docvqa │ │ │ ├── docvqa.yaml │ │ │ ├── docvqa_val.yaml │ │ │ ├── docvqa_test.yaml │ │ │ ├── _default_template_docvqa_yaml │ │ │ └── utils.py │ │ ├── iconqa │ │ │ ├── iconqa.yaml │ │ │ ├── iconqa_test.yaml │ │ │ ├── iconqa_val.yaml │ │ │ ├── _default_template_docvqa_yaml │ │ │ └── utils.py │ │ ├── nocaps │ │ │ ├── nocaps.yaml │ │ │ ├── _default_template_nocaps_yaml │ │ │ ├── nocaps_test.yaml │ │ │ └── nocaps_val.yaml │ │ ├── textvqa │ │ │ ├── _textvqa.yaml │ │ │ ├── textvqa_test.yaml │ │ │ ├── textvqa_val.yaml │ │ │ └── _default_template_textvqa_yaml │ │ ├── websrc │ │ │ ├── websrc.yaml │ │ │ ├── websrc_val.yaml │ │ │ ├── websrc_test.yaml │ │ │ └── README.md │ │ ├── infovqa │ │ │ ├── infovqa.yaml │ │ │ ├── infovqa_val.yaml │ │ │ ├── infovqa_test.yaml │ │ │ ├── _default_template_infovqa_yaml │ │ │ └── utils.py │ │ ├── textcaps │ │ │ ├── textcaps.yaml │ │ │ ├── _default_template_textcaps_yaml │ │ │ ├── textcaps_test.yaml │ │ │ ├── textcaps_val.yaml │ │ │ └── textcaps_train.yaml │ │ ├── vizwiz_vqa │ │ │ ├── _vizwiz_vqa.yaml │ │ │ ├── vizwiz_vqa_val.yaml │ │ │ ├── vizwiz_vqa_test.yaml │ │ │ ├── _default_template_vqa_yaml │ │ │ ├── _generate_config.py │ │ │ └── utils.py │ │ ├── pope │ │ │ ├── pope_full.yaml │ │ │ ├── pope.yaml │ │ │ ├── pope_adv.yaml │ │ │ ├── pope_pop.yaml │ │ │ └── pope_random.yaml │ │ ├── scienceqa │ │ │ ├── scienceqa_full.yaml │ │ │ ├── scienceqa.yaml │ │ │ ├── scienceqa_img.yaml │ │ │ └── utils.py │ │ ├── coco_cap │ │ │ ├── coco2014_cap.yaml │ │ │ ├── coco2017_cap.yaml │ │ │ ├── coco_cap.yaml │ │ │ ├── coco2017_cap_test.yaml │ │ │ ├── coco2014_cap_test.yaml │ │ │ ├── coco2017_cap_val.yaml │ │ │ └── coco2014_cap_val.yaml │ │ ├── multidocvqa │ │ │ ├── multidocvqa.yaml │ │ │ ├── multidocvqa_test.yaml │ │ │ └── multidocvqa_val.yaml │ │ ├── screenspot │ │ │ ├── _screenspot.yaml │ │ │ ├── screenspot_rec_test.yaml │ │ │ ├── screenspot_reg_test.yaml │ │ │ ├── _default_template_reg_yaml │ │ │ └── _default_template_rec_yaml │ │ ├── refcoco │ │ │ ├── refcoco_seg_val.yaml │ │ │ ├── refcoco_bbox_val.yaml │ │ │ ├── refcoco_seg_test.yaml │ │ │ ├── refcoco_bbox_test.yaml │ │ │ ├── refcoco_bbox_testA.yaml │ │ │ ├── refcoco_bbox_testB.yaml │ │ │ ├── refcoco_seg_testA.yaml │ │ │ ├── refcoco_seg_testB.yaml │ │ │ ├── 
refcoco_bbox_rec_val.yaml │ │ │ ├── refcoco_bbox_rec_test.yaml │ │ │ ├── refcoco_bbox_rec_testA.yaml │ │ │ ├── refcoco_bbox_rec_testB.yaml │ │ │ ├── _refcoco.yaml │ │ │ ├── _generate_config.py │ │ │ ├── _default_template_bbox_yaml │ │ │ ├── _default_template_seg_yaml │ │ │ └── _default_template_bbox_rec_yaml │ │ ├── refcoco+ │ │ │ ├── refcoco+_seg_val.yaml │ │ │ ├── refcoco+_bbox_val.yaml │ │ │ ├── refcoco+_seg_testA.yaml │ │ │ ├── refcoco+_seg_testB.yaml │ │ │ ├── refcoco+_bbox_testA.yaml │ │ │ ├── refcoco+_bbox_testB.yaml │ │ │ ├── refcoco+_bbox_rec_val.yaml │ │ │ ├── refcoco+_bbox_rec_testA.yaml │ │ │ ├── refcoco+_bbox_rec_testB.yaml │ │ │ ├── _refcoco.yaml │ │ │ ├── _generate_config.py │ │ │ ├── _default_template_bbox_yaml │ │ │ ├── _default_template_seg_yaml │ │ │ └── _default_template_bbox_rec_yaml │ │ ├── refcocog │ │ │ ├── refcocog_seg_val.yaml │ │ │ ├── refcocog_bbox_val.yaml │ │ │ ├── refcocog_seg_test.yaml │ │ │ ├── _refcoco.yaml │ │ │ ├── refcocog_bbox_test.yaml │ │ │ ├── refcocog_bbox_rec_val.yaml │ │ │ ├── refcocog_bbox_rec_test.yaml │ │ │ ├── _generate_config.py │ │ │ ├── _default_template_seg_yaml │ │ │ ├── _default_template_bbox_yaml │ │ │ └── _default_template_bbox_rec_yaml │ │ ├── olympiadbench │ │ │ ├── olympiadbench.yaml │ │ │ ├── olympiadbench_test_cn.yaml │ │ │ ├── olympiadbench_test_en.yaml │ │ │ └── cn_utils.py │ │ ├── mathvista │ │ │ ├── mathvista.yaml │ │ │ ├── mathvista_test.yaml │ │ │ └── mathvista_testmini.yaml │ │ ├── mmbench │ │ │ ├── mmbench_cn.yaml │ │ │ ├── mmbench_en.yaml │ │ │ ├── mmbench_cn_test.yaml │ │ │ ├── mmbench_en_test.yaml │ │ │ ├── mmbench.yaml │ │ │ ├── mmbench_cn_dev.yaml │ │ │ ├── mmbench_en_dev.yaml │ │ │ ├── _default_template_mmbench_cn_yaml │ │ │ ├── _default_template_mmbench_en_yaml │ │ │ └── mmbench_cc.yaml │ │ ├── mathverse │ │ │ ├── mathverse.yaml │ │ │ ├── mathverse_testmini.yaml │ │ │ ├── mathverse_testmini_text_lite.yaml │ │ │ ├── mathverse_testmini_text_only.yaml │ │ │ ├── mathverse_testmini_vision_only.yaml │ │ │ ├── mathverse_testmini_text_dominant.yaml │ │ │ ├── mathverse_testmini_vision_dominant.yaml │ │ │ └── mathverse_testmini_vision_intensive.yaml │ │ ├── seedbench │ │ │ ├── seedbench_ppl.yaml │ │ │ ├── seedbench.yaml │ │ │ └── utils.py │ │ ├── stvqa │ │ │ ├── stvqa.yaml │ │ │ └── utils.py │ │ ├── ocrbench │ │ │ └── ocrbench.yaml │ │ ├── gqa │ │ │ ├── gqa.yaml │ │ │ └── utils.py │ │ ├── mmvet │ │ │ └── mmvet.yaml │ │ ├── chartqa │ │ │ ├── chartqa.yaml │ │ │ └── utils.py │ │ ├── ai2d │ │ │ ├── ai2d.yaml │ │ │ └── utils.py │ │ ├── realworldqa │ │ │ └── realworldqa.yaml │ │ ├── mme │ │ │ └── mme.yaml │ │ ├── llava-bench-coco │ │ │ └── llava-bench-coco.yaml │ │ ├── ferret │ │ │ └── ferret.yaml │ │ ├── llava-in-the-wild │ │ │ └── llava-in-the-wild.yaml │ │ ├── hallusion_bench │ │ │ └── hallusion_bench_image.yaml │ │ ├── seedbench_2 │ │ │ ├── seedbench_2.yaml │ │ │ └── utils.py │ │ └── naturalbench │ │ │ └── naturalbench.yaml │ └── filters │ │ ├── decontamination.py │ │ ├── transformation.py │ │ ├── __init__.py │ │ └── selection.py ├── miscs │ ├── llava_result_check.md │ ├── test_scienceqa.py │ ├── repr_scripts.sh │ ├── script.sh │ └── test_llava.py ├── setup.py ├── docs │ └── README.md ├── llava_repr_requirements.txt └── pyproject.toml ├── ola_vlm ├── eval │ ├── mmstar │ │ ├── evaluate │ │ │ └── __init__.py │ │ └── smp │ │ │ ├── __init__.py │ │ │ └── log.py │ ├── eval_mmstar.py │ ├── merge_json.py │ ├── get_probe_dsg_scores.py │ └── eval_cv_bench.py ├── __init__.py ├── model │ ├── aux_heads │ │ ├── __init__.py │ │ ├── 
depth_anything_v2 │ │ │ └── dinov2_layers │ │ │ │ ├── __init__.py │ │ │ │ ├── layer_scale.py │ │ │ │ ├── drop_path.py │ │ │ │ ├── mlp.py │ │ │ │ └── swiglu_ffn.py │ │ └── gen_head.py │ ├── __init__.py │ ├── multimodal_encoder │ │ └── builder.py │ ├── utils.py │ ├── consolidate.py │ ├── apply_delta.py │ ├── multimodal_projector │ │ └── builder.py │ └── make_delta.py ├── train │ ├── train_mem.py │ ├── ola_vlm_train_mem.py │ └── probe_dsg_train_mem.py └── constants.py ├── assets ├── pb.jpg ├── arch.png ├── cars.jpg ├── teaser.png └── probe_plots.png ├── datasets └── ocr_vqa │ └── dataset.json ├── .gitignore ├── scripts ├── probe │ ├── eval_probe_task.sh │ ├── eval_probe_cos_sim.sh │ └── probe.sh ├── zero2.json ├── zero2_offload.json ├── zero3.json ├── eval │ ├── mmstar.sh │ └── cv-bench.sh ├── zero3_offload.json └── train │ ├── finetune.sh │ ├── vpt_ift.sh │ ├── vpt.sh │ └── pretrain.sh ├── .gitattributes ├── docs └── Evaluation.md └── setup.py /lmms-eval/lmms_eval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lmms-eval/miscs/llava_result_check.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/models/model_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/_task_utils/gpt_eval_utils.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ola_vlm/eval/mmstar/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | from .mmstar import MMStar_eval -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /assets/pb.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SHI-Labs/VisPer-LM/HEAD/assets/pb.jpg -------------------------------------------------------------------------------- /assets/arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SHI-Labs/VisPer-LM/HEAD/assets/arch.png -------------------------------------------------------------------------------- /assets/cars.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SHI-Labs/VisPer-LM/HEAD/assets/cars.jpg -------------------------------------------------------------------------------- /assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SHI-Labs/VisPer-LM/HEAD/assets/teaser.png -------------------------------------------------------------------------------- 
/lmms-eval/lmms_eval/tasks/ok_vqa/_ok_vqa.yaml: -------------------------------------------------------------------------------- 1 | group: ok_vqa 2 | task: 3 | - ok_vqa_val2014 -------------------------------------------------------------------------------- /assets/probe_plots.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SHI-Labs/VisPer-LM/HEAD/assets/probe_plots.png -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/flickr30k/flickr30k.yaml: -------------------------------------------------------------------------------- 1 | group: flickr30k 2 | task: 3 | - flickr30k_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmmu/mmmu.yaml: -------------------------------------------------------------------------------- 1 | group: mmmu 2 | task: 3 | - mmmu_val 4 | - mmmu_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vqav2/_vqav2.yaml: -------------------------------------------------------------------------------- 1 | group: vqav2 2 | task: 3 | - vqav2_val 4 | - vqav2_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/cmmmu/_cmmmu.yaml: -------------------------------------------------------------------------------- 1 | group: cmmmu 2 | task: 3 | - cmmmu_val 4 | - cmmmu_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/docvqa/docvqa.yaml: -------------------------------------------------------------------------------- 1 | group: docvqa 2 | task: 3 | - docvqa_val 4 | - docvqa_test -------------------------------------------------------------------------------- /ola_vlm/eval/mmstar/smp/__init__.py: -------------------------------------------------------------------------------- 1 | from .file import * 2 | from .misc import * 3 | from .log import * -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/iconqa/iconqa.yaml: -------------------------------------------------------------------------------- 1 | group: iconqa 2 | task: 3 | - iconqa_val 4 | - iconqa_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/nocaps/nocaps.yaml: -------------------------------------------------------------------------------- 1 | group : nocaps 2 | task: 3 | - nocaps_test 4 | - nocaps_val -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textvqa/_textvqa.yaml: -------------------------------------------------------------------------------- 1 | group: textvqa 2 | task: 3 | - textvqa_val 4 | - textvqa_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/websrc/websrc.yaml: -------------------------------------------------------------------------------- 1 | group: websrc 2 | task: 3 | - websrc_val 4 | - websrc_test 5 | -------------------------------------------------------------------------------- /ola_vlm/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM 2 | from .model import LlavaPhi3ForCausalLM 3 | -------------------------------------------------------------------------------- 
/lmms-eval/lmms_eval/tasks/infovqa/infovqa.yaml: -------------------------------------------------------------------------------- 1 | group: infovqa 2 | task: 3 | - infovqa_val 4 | - infovqa_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textcaps/textcaps.yaml: -------------------------------------------------------------------------------- 1 | group : textcaps 2 | task: 3 | - textcaps_val 4 | - textcaps_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/_vizwiz_vqa.yaml: -------------------------------------------------------------------------------- 1 | group: vizwiz_vqa 2 | task: 3 | - vizwiz_vqa_val 4 | - vizwiz_vqa_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/pope/pope_full.yaml: -------------------------------------------------------------------------------- 1 | group : pope_full 2 | task: 3 | - pope_adv 4 | - pope_pop 5 | - pope_random -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/scienceqa/scienceqa_full.yaml: -------------------------------------------------------------------------------- 1 | group: scienceqa_full 2 | task: 3 | - scienceqa 4 | - scienceqa_img -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2014_cap.yaml: -------------------------------------------------------------------------------- 1 | group : coco2014_cap 2 | task: 3 | - coco2014_cap_val 4 | - coco2014_cap_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2017_cap.yaml: -------------------------------------------------------------------------------- 1 | group : coco2017_cap 2 | task: 3 | - coco2017_cap_val 4 | - coco2017_cap_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/iconqa/iconqa_test.yaml: -------------------------------------------------------------------------------- 1 | task: "iconqa_test" 2 | test_split: test 3 | include: _default_template_docvqa_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/iconqa/iconqa_val.yaml: -------------------------------------------------------------------------------- 1 | task: "iconqa_val" 2 | test_split: val 3 | include: _default_template_docvqa_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multidocvqa/multidocvqa.yaml: -------------------------------------------------------------------------------- 1 | group: multidocvqa 2 | task: 3 | - multidocvqa_val 4 | - multidocvqa_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/screenspot/_screenspot.yaml: -------------------------------------------------------------------------------- 1 | group: screenspot 2 | task: 3 | - screenspot_reg_test 4 | - screenspot_rec_test -------------------------------------------------------------------------------- /lmms-eval/setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | # This is to make sure that the package supports editable installs 4 | setuptools.setup() 5 | -------------------------------------------------------------------------------- 
/lmms-eval/lmms_eval/tasks/ok_vqa/ok_vqa_val2014.yaml: -------------------------------------------------------------------------------- 1 | group: ok_vqa 2 | task: ok_vqa_val2014 3 | test_split: val2014 4 | include: _default_template_vqa_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_seg 2 | task: refcoco_seg_val 3 | test_split: val 4 | include: _default_template_seg_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_seg_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_seg 2 | task: refcoco+_seg_val 3 | include: _default_template_seg_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox 2 | task: refcoco_bbox_val 3 | test_split: val 4 | include: _default_template_bbox_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_seg 2 | task: refcoco_seg_test 3 | test_split: test 4 | include: _default_template_seg_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_seg_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_seg 2 | task: refcocog_seg_val 3 | include: _default_template_seg_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox 2 | task: refcoco+_bbox_val 3 | include: _default_template_bbox_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_seg_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_seg 2 | task: refcoco+_seg_testA 3 | include: _default_template_seg_yaml 4 | test_split: testA 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_seg_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_seg 2 | task: refcoco+_seg_testB 3 | include: _default_template_seg_yaml 4 | test_split: testB 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox 2 | task: refcoco_bbox_test 3 | test_split: test 4 | include: _default_template_bbox_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox 2 | task: 
refcoco_bbox_testA 3 | test_split: testA 4 | include: _default_template_bbox_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox 2 | task: refcoco_bbox_testB 3 | test_split: testB 4 | include: _default_template_bbox_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_seg 2 | task: refcoco_seg_testA 3 | test_split: testA 4 | include: _default_template_seg_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_seg 2 | task: refcoco_seg_testB 3 | test_split: testB 4 | include: _default_template_seg_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_bbox_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_bbox 2 | task: refcocog_bbox_val 3 | include: _default_template_bbox_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_seg_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_seg 2 | task: refcocog_seg_test 3 | include: _default_template_seg_yaml 4 | test_split: test 5 | -------------------------------------------------------------------------------- /datasets/ocr_vqa/dataset.json: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c9d2bb4c67462e2649be5099a3b790c95ad073fe46243310b79a1d4c8bee75ed 3 | size 112962519 4 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox 2 | task: refcoco+_bbox_testA 3 | include: _default_template_bbox_yaml 4 | test_split: testA 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox 2 | task: refcoco+_bbox_testB 3 | include: _default_template_bbox_yaml 4 | test_split: testB 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/_refcoco.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog 2 | task: 3 | - refcocog_seg_test 4 | - refcocog_seg_val 5 | - refcocog_bbox_test 6 | - refcocog_bbox_val 7 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_bbox_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_bbox 2 | task: refcocog_bbox_test 3 | include: _default_template_bbox_yaml 4 | test_split: test 5 | 
-------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/screenspot/screenspot_rec_test.yaml: -------------------------------------------------------------------------------- 1 | group: screenspot_rec 2 | task: screenspot_rec_test 3 | include: _default_template_rec_yaml 4 | test_split: test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/screenspot/screenspot_reg_test.yaml: -------------------------------------------------------------------------------- 1 | group: screenspot_reg 2 | task: screenspot_reg_test 3 | include: _default_template_reg_yaml 4 | test_split: test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco_cap.yaml: -------------------------------------------------------------------------------- 1 | group : coco_cap 2 | task: 3 | - coco2014_cap_val 4 | - coco2014_cap_test 5 | - coco2017_cap_val 6 | - coco2017_cap_test 7 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/nocaps/_default_template_nocaps_yaml: -------------------------------------------------------------------------------- 1 | model_specific_prompt_kwargs: 2 | default: 3 | prompt: "Provide a one-sentence caption for the provided image." -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_rec_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox_rec 2 | task: refcoco_bbox_rec_val 3 | test_split: val 4 | include: _default_template_bbox_rec_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textcaps/_default_template_textcaps_yaml: -------------------------------------------------------------------------------- 1 | model_specific_prompt_kwargs: 2 | default: 3 | prompt: Provide a one-sentence caption for the provided image. 
-------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/olympiadbench/olympiadbench.yaml: -------------------------------------------------------------------------------- 1 | group: olympiadbench 2 | task: 3 | - olympiadbench_test_en 4 | - olympiadbench_test_cn 5 | metadata: 6 | - version: 0.0 7 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_rec_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox_rec 2 | task: refcoco+_bbox_rec_val 3 | include: _default_template_bbox_rec_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_rec_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox_rec 2 | task: refcoco_bbox_rec_test 3 | test_split: test 4 | include: _default_template_bbox_rec_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_bbox_rec_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_bbox_rec 2 | task: refcocog_bbox_rec_val 3 | include: _default_template_bbox_rec_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_rec_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox_rec 2 | task: refcoco+_bbox_rec_testA 3 | include: _default_template_bbox_rec_yaml 4 | test_split: testA 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_rec_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox_rec 2 | task: refcoco+_bbox_rec_testB 3 | include: _default_template_bbox_rec_yaml 4 | test_split: testB 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_rec_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox_rec 2 | task: refcoco_bbox_rec_testA 3 | test_split: testA 4 | include: _default_template_bbox_rec_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_rec_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox_rec 2 | task: refcoco_bbox_rec_testB 3 | test_split: testB 4 | include: _default_template_bbox_rec_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_bbox_rec_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_bbox_rec 2 | task: refcocog_bbox_rec_test 3 | include: _default_template_bbox_rec_yaml 4 | test_split: test 5 | -------------------------------------------------------------------------------- /lmms-eval/miscs/test_scienceqa.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | dataset = load_dataset("Otter-AI/ScienceQA", trust_remote_code=True)["test"] 4 | for doc in 
dataset: 5 | print(doc["id"]) 6 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathvista/mathvista.yaml: -------------------------------------------------------------------------------- 1 | group: mathvista 2 | task: 3 | - mathvista_testmini 4 | - mathvista_test 5 | metadata: 6 | version: 0.0 7 | gpt_eval_model_name: "gpt-4-0613" 8 | quick_extract: false -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/_refcoco.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+ 2 | task: 3 | - refcoco+_seg_val 4 | - refcoco+_seg_testA 5 | - refcoco+_seg_testB 6 | - refcoco+_bbox_val 7 | - refcoco+_bbox_testA 8 | - refcoco+_bbox_testB 9 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/docvqa/docvqa_val.yaml: -------------------------------------------------------------------------------- 1 | task: "docvqa_val" 2 | test_split: validation 3 | metric_list: 4 | - metric: anls 5 | aggregation: mean 6 | higher_is_better: true 7 | include: _default_template_docvqa_yaml 8 | -------------------------------------------------------------------------------- /ola_vlm/model/aux_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .da_v2_head import DepthHead, DAv2_Head, DepthProbeHead, TaskTokenDepthHead 2 | from .oneformer_head import OneFormerSegHead, OneFormerTaskTokenSegHead 3 | from .gen_head import GenHead, TaskTokenGenHead -------------------------------------------------------------------------------- /ola_vlm/train/train_mem.py: -------------------------------------------------------------------------------- 1 | from ola_vlm.train.train import train 2 | 3 | if __name__ == "__main__": 4 | try: 5 | train(attn_implementation="flash_attention_2") 6 | except: 7 | train(attn_implementation="eager") 8 | -------------------------------------------------------------------------------- /ola_vlm/train/ola_vlm_train_mem.py: -------------------------------------------------------------------------------- 1 | from ola_vlm.train.ola_vlm_train import train 2 | 3 | if __name__ == "__main__": 4 | try: 5 | train(attn_implementation="flash_attention_2") 6 | except: 7 | train(attn_implementation="eager") -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_cn.yaml: -------------------------------------------------------------------------------- 1 | group: mmbench_cn 2 | task: 3 | - mmbench_cn_dev 4 | - mmbench_cn_test 5 | - mmbench_cn_cc 6 | metadata: 7 | version: 0.0 8 | gpt_eval_model_name: "gpt-3.5-turbo-0613" 9 | sys_prompt: "有如下几个选项:" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_en.yaml: -------------------------------------------------------------------------------- 1 | group: mmbench_en 2 | task: 3 | - mmbench_en_dev 4 | - mmbench_en_test 5 | metadata: 6 | version: 0.0 7 | sys_prompt: "There are several options:" 8 | gpt_eval_model_name: "gpt-3.5-turbo-0613" 9 | -------------------------------------------------------------------------------- /ola_vlm/train/probe_dsg_train_mem.py: -------------------------------------------------------------------------------- 1 | from ola_vlm.train.probe_dsg_train import train 2 | 3 | if __name__ == "__main__": 4 | try: 5 | 
train(attn_implementation="flash_attention_2") 6 | except: 7 | train(attn_implementation="eager") -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/infovqa/infovqa_val.yaml: -------------------------------------------------------------------------------- 1 | task: "infovqa_val" 2 | test_split: validation 3 | output_type: generate_until 4 | metric_list: 5 | - metric: anls 6 | aggregation: mean 7 | higher_is_better: true 8 | include: _default_template_infovqa_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/urdu_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: urdu 4 | token: True 5 | task: "llava_in_the_wild_urdu" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/arabic_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: arabic 4 | token: True 5 | task: "llava_in_the_wild_arabic" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/french_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: french 4 | token: True 5 | task: "llava_in_the_wild_french" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/hindi_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: hindi 4 | token: True 5 | task: "llava_in_the_wild_hindi" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/spanish_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: spanish 4 | token: True 5 | task: "llava_in_the_wild_spanish" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/_refcoco.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco 2 | task: 3 | - refcoco_seg_test 4 | - refcoco_seg_val 5 | - refcoco_seg_testA 6 | - refcoco_seg_testB 7 | - refcoco_bbox_test 8 | - refcoco_bbox_val 9 | - refcoco_bbox_testA 10 | - refcoco_bbox_testB 11 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/bengali_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: bengali 4 | token: True 5 | task: 
"llava_in_the_wild_bengali" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/chinese_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: chinese 4 | token: True 5 | task: "llava_in_the_wild_chinese" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/russian_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: russian 4 | token: True 5 | task: "llava_in_the_wild_russian" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textvqa/textvqa_test.yaml: -------------------------------------------------------------------------------- 1 | task: textvqa_test 2 | test_split: test 3 | metric_list: 4 | - metric: submission 5 | aggregation: !function utils.textvqa_aggreate_submissions 6 | higher_is_better: true 7 | include: _default_template_textvqa_yaml 8 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/japanese_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: japanese 4 | token: True 5 | task: "llava_in_the_wild_japanese" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_cn_test.yaml: -------------------------------------------------------------------------------- 1 | task: mmbench_cn_test 2 | test_split: test 3 | metric_list: 4 | - metric: submission 5 | aggregation: !function cn_utils.mmbench_aggregate_test_results 6 | higher_is_better: true 7 | include: _default_template_mmbench_cn_yaml 8 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_en_test.yaml: -------------------------------------------------------------------------------- 1 | task: "mmbench_en_test" 2 | test_split: test 3 | include: _default_template_mmbench_en_yaml 4 | metric_list: 5 | - metric: submission 6 | aggregation: !function en_utils.mmbench_aggregate_test_results 7 | higher_is_better: true 8 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/_task_utils/file_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def generate_submission_file(file_name, args, subpath="submissions"): 5 | path = os.path.join(args.output_path, subpath) 6 | os.makedirs(path, exist_ok=True) 7 | path = os.path.join(path, file_name) 8 | return os.path.abspath(path) 9 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench.yaml: -------------------------------------------------------------------------------- 1 | group: mmbench 2 | task: 3 | - mmbench_en_dev 4 | - mmbench_en_test 5 | - mmbench_cn_dev 6 | - mmbench_cn_test 7 | - 
mmbench_cn_cc 8 | metadata: 9 | version: 0.0 10 | sys_prompt: "There are several options:" 11 | gpt_eval_model_name: "gpt-3.5-turbo-0613" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/cmmmu/_default_template_cmmmu_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/CMMMU 2 | output_type: generate_until 3 | doc_to_visual: !function utils.cmmmu_doc_to_visual 4 | doc_to_text: !function utils.cmmmu_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | max_new_tokens: 16 8 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vqav2/vqav2_test.yaml: -------------------------------------------------------------------------------- 1 | task: "vqav2_test" 2 | include: _default_template_vqav2_yaml 3 | test_split: test 4 | metric_list: 5 | - metric: submission 6 | aggregation: !function utils.vqav2_aggreate_submissions 7 | higher_is_better: true 8 | process_results: !function utils.vqav2_process_results_test 9 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/docvqa/docvqa_test.yaml: -------------------------------------------------------------------------------- 1 | task: "docvqa_test" 2 | test_split: test 3 | process_results: !function utils.docvqa_test_process_results 4 | metric_list: 5 | - metric: submission 6 | aggregation: !function utils.docvqa_test_aggregate_results 7 | higher_is_better: true 8 | include: _default_template_docvqa_yaml 9 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vqav2/vqav2_val.yaml: -------------------------------------------------------------------------------- 1 | task: "vqav2_val" 2 | include: _default_template_vqav2_yaml 3 | test_split: validation 4 | metric_list: 5 | - metric: exact_match 6 | aggregation: mean 7 | higher_is_better: true 8 | ignore_case: true 9 | ignore_punctuation: true 10 | process_results: !function utils.vqav2_process_results_val 11 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/infovqa/infovqa_test.yaml: -------------------------------------------------------------------------------- 1 | task: "infovqa_test" 2 | test_split: test 3 | output_type: generate_until 4 | process_results: !function utils.infovqa_test_process_results 5 | metric_list: 6 | - metric: submission 7 | aggregation: !function utils.infovqa_test_aggregate_results 8 | higher_is_better: true 9 | include: _default_template_infovqa_yaml 10 | -------------------------------------------------------------------------------- /ola_vlm/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 
5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "<image>" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 11 | DEFAULT_IM_START_TOKEN = "<im_start>" 12 | DEFAULT_IM_END_TOKEN = "<im_end>" 13 | IMAGE_PLACEHOLDER = "<image-placeholder>" 14 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml: -------------------------------------------------------------------------------- 1 | task: "mmbench_cn_dev" 2 | test_split: "dev" 3 | metric_list: 4 | - metric: gpt_eval_score 5 | aggregation: !function cn_utils.mmbench_aggregate_dev_results_eval 6 | higher_is_better: true 7 | - metric: submission 8 | higher_is_better: true 9 | aggregation: !function cn_utils.mmbench_aggregate_dev_results 10 | include: _default_template_mmbench_cn_yaml 11 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textvqa/textvqa_val.yaml: -------------------------------------------------------------------------------- 1 | task: textvqa_val 2 | test_split: validation 3 | metric_list: 4 | - metric: exact_match 5 | aggregation: mean 6 | higher_is_better: true 7 | ignore_case: true 8 | ignore_punctuation: true 9 | - metric: submission 10 | aggregation: !function utils.textvqa_aggreate_submissions 11 | higher_is_better: true 12 | include: _default_template_textvqa_yaml 13 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_en_dev.yaml: -------------------------------------------------------------------------------- 1 | task: "mmbench_en_dev" 2 | test_split: dev 3 | include: _default_template_mmbench_en_yaml 4 | metric_list: 5 | - metric: gpt_eval_score 6 | aggregation: !function en_utils.mmbench_aggregate_dev_results_eval 7 | higher_is_better: true 8 | - metric: submission 9 | aggregation: !function en_utils.mmbench_aggregate_dev_results_submission 10 | higher_is_better: true -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__ 3 | *.pyc 4 | *.egg-info 5 | dist 6 | 7 | # Log 8 | *.log 9 | *.log.* 10 | 11 | # Data 12 | !**/alpaca-data-conversation.json 13 | 14 | # Editor 15 | .idea 16 | *.swp 17 | 18 | # Other 19 | .DS_Store 20 | wandb 21 | output 22 | 23 | checkpoints 24 | ckpts* 25 | *.pth 26 | 27 | .ipynb_checkpoints 28 | *.ipynb 29 | 30 | # DevContainer 31 | !.devcontainer/* 32 | 33 | # Demo 34 | serve_images/ 35 | -------------------------------------------------------------------------------- /ola_vlm/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 2 | from .language_model.llava_phi3 import LlavaPhi3ForCausalLM, LlavaPhi3Config 3 | from .language_model.ola_llama import OlaLlavaLlamaForCausalLM, OlaLlavaLlamaConfig 4 | from .language_model.ola_phi3 import OlaLlavaPhi3ForCausalLM, OlaLlavaPhi3Config 5 | from .language_model.probe_llava_llama import ProbeDSGLlavaLlamaForCausalLM, ProbeDSGLlavaLlamaConfig -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_val.yaml: -------------------------------------------------------------------------------- 1 | group: vizwiz_vqa 2 | task: vizwiz_vqa_val 3 | test_split: val 4 | include: _default_template_vqa_yaml 5 |
metric_list: 6 | - metric: exact_match 7 | aggregation: mean 8 | higher_is_better: true 9 | ignore_case: true 10 | ignore_punctuation: true 11 | # - metric: submission 12 | # aggregation: !function utils.vizwiz_vqa_aggreate_submissions 13 | # higher_is_better: true -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse.yaml: -------------------------------------------------------------------------------- 1 | group: mathverse 2 | task: 3 | - mathverse_testmini 4 | - mathverse_testmini_text_only 5 | - mathverse_testmini_text_lite 6 | - mathverse_testmini_text_dominant 7 | - mathverse_testmini_vision_intensive 8 | - mathverse_testmini_vision_dominant 9 | - mathverse_testmini_vision_only 10 | metadata: 11 | version: 0.0 12 | gpt_eval_model_name: "gpt-3.5-turbo" 13 | trunk_response: 30 14 | quick_match: false -------------------------------------------------------------------------------- /ola_vlm/eval/eval_mmstar.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from ola_vlm.eval.mmstar.evaluate import MMStar_eval 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--results_file', type=str, default="./playground/data/eval/mmstar_results.jsonl") 11 | return parser.parse_args() 12 | 13 | 14 | if __name__ == '__main__': 15 | 16 | args = parse_args() 17 | MMStar_eval(args.results_file) 18 | -------------------------------------------------------------------------------- /ola_vlm/model/aux_heads/depth_anything_v2/dinov2_layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .mlp import Mlp 8 | from .patch_embed import PatchEmbed 9 | from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused 10 | from .block import NestedTensorBlock 11 | from .attention import MemEffAttention 12 | -------------------------------------------------------------------------------- /lmms-eval/docs/README.md: -------------------------------------------------------------------------------- 1 | # LMMs Eval Documentation 2 | 3 | Welcome to the docs for `lmms-eval`! 4 | 5 | The majority of this documentation is adapted from [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness/). 6 | 7 | ## Table of Contents 8 | 9 | * To learn about the command line flags, see the [commands](commands.md). 10 | * To learn how to add a new model, see the [Model Guide](model_guide.md). 11 | * For a crash course on adding new tasks to the library, see our [Task Guide](task_guide.md). -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vqav2/_default_template_vqav2_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/VQAv2 2 | dataset_kwargs: 3 | token: True 4 | output_type: generate_until 5 | doc_to_visual: !function utils.vqav2_doc_to_visual 6 | doc_to_text: !function utils.vqav2_doc_to_text 7 | doc_to_target: "answer" 8 | generation_kwargs: 9 | max_new_tokens: 16 10 | metadata: 11 | - version: 0.0 12 | model_specific_prompt_kwargs: 13 | default: 14 | pre_prompt: "" 15 | post_prompt: "\nAnswer the question using a single word or phrase."
-------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_test.yaml: -------------------------------------------------------------------------------- 1 | group: vizwiz_vqa 2 | task: vizwiz_vqa_test 3 | test_split: test 4 | include: _default_template_vqa_yaml 5 | process_results: !function utils.vizwiz_vqa_process_results 6 | metric_list: 7 | # - metric: exact_match 8 | # aggregation: mean 9 | # higher_is_better: true 10 | # ignore_case: true 11 | # ignore_punctuation: true 12 | - metric: submission 13 | aggregation: !function utils.vizwiz_vqa_aggreate_submissions 14 | higher_is_better: true 15 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/infovqa/_default_template_infovqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/DocVQA 2 | dataset_name: InfographicVQA 3 | dataset_kwargs: 4 | token: True 5 | doc_to_target: "answers" 6 | doc_to_visual: !function utils.infovqa_doc_to_visual 7 | doc_to_text: !function utils.infovqa_doc_to_text 8 | generation_kwargs: 9 | max_new_tokens: 32 10 | temperature: 0 11 | do_sample: False 12 | model_specific_prompt_kwargs: 13 | default: 14 | pre_prompt: "" 15 | post_prompt: "\nAnswer the question using a single word or phrase." -------------------------------------------------------------------------------- /scripts/probe/eval_probe_task.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | for IDX in $(seq 0 $((CHUNKS-1))); do 9 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m ola_vlm.eval.eval_probe_task \ 10 | --model-path $1 --json-file datasets/coco/annotations/captions_val2017.json \ 11 | --mode $2 --num-chunks $CHUNKS --chunk-idx $IDX & 12 | done 13 | 14 | wait 15 | 16 | python -m ola_vlm.eval.get_probe_task_scores --ckpt $1 --mode $2 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/screenspot/_default_template_reg_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: rootsautomation/ScreenSpot 2 | output_type: generate_until 3 | doc_to_visual: !function utils.screenspot_bbox_doc_to_visual 4 | doc_to_text: !function utils.screenspot_doc_to_text 5 | doc_to_target: "instruction" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.screenspot_process_result 10 | metric_list: 11 | - metric: screenspot_CIDEr 12 | aggregation : !function utils.screenspot_cider 13 | higher_is_better : true 14 | metadata: 15 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/cmmmu/cmmmu_test.yaml: -------------------------------------------------------------------------------- 1 | task: "cmmmu_test" 2 | test_split: test 3 | # The return value of process_results will be used by metrics 4 | process_results: !function utils.cmmmu_process_test_results_for_submission 5 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 6 | metric_list: 7 | - metric: submission 8 | aggregation: !function utils.cmmmu_test_aggregate_results_for_submission 9 | higher_is_better: false 10 | metadata: 11 | - version: 0.0 12 | include: 
_default_template_cmmmu_yaml 13 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textvqa/_default_template_textvqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/textvqa 2 | output_type: generate_until 3 | doc_to_visual: !function utils.textvqa_doc_to_visual 4 | doc_to_text: !function utils.textvqa_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.textvqa_process_results 10 | model_specific_prompt_kwargs: 11 | default: 12 | pre_prompt: "" 13 | post_prompt: "\nAnswer the question using a single word or phrase." 14 | ocr: false 15 | qwen_vl: 16 | pre_prompt: "" 17 | post_prompt: " Answer:" 18 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/seedbench/seedbench_ppl.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/SEED-Bench 2 | dataset_kwargs: 3 | token: True 4 | task: "seedbench_ppl" 5 | test_split: test 6 | output_type: multiple_choice 7 | doc_to_visual: !function utils.seed_doc_to_visual 8 | doc_to_text: !function utils.seed_doc_to_text_mc 9 | doc_to_choice : !function utils.seed_doc_to_choice 10 | doc_to_target: !function utils.seed_doc_to_mc_target 11 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 12 | metric_list: 13 | - metric: acc 14 | metadata: 15 | - version: 0.0 -------------------------------------------------------------------------------- /scripts/probe/eval_probe_cos_sim.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | for IDX in $(seq 0 $((CHUNKS-1))); do 9 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m ola_vlm.eval.eval_probe_dsg \ 10 | --model-path $1 --json-file datasets/coco/annotations/captions_val2017.json \ 11 | --mode $2 --num-chunks $CHUNKS --chunk-idx $IDX & 12 | done 13 | 14 | wait 15 | 16 | python -m ola_vlm.eval.merge_json --ckpt $1 --mode $2 --num-chunks $CHUNKS 17 | 18 | python -m ola_vlm.eval.get_probe_dsg_scores --ckpt $1 --mode $2 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/docvqa/_default_template_docvqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/DocVQA 2 | dataset_name: DocVQA 3 | dataset_kwargs: 4 | token: True 5 | output_type: generate_until 6 | doc_to_visual: !function utils.docvqa_doc_to_visual 7 | doc_to_text: !function utils.docvqa_doc_to_text 8 | doc_to_target: "answers" 9 | generation_kwargs: 10 | max_new_tokens: 32 11 | temperature: 0 12 | do_sample: False 13 | model_specific_prompt_kwargs: 14 | default: 15 | pre_prompt: "" 16 | post_prompt: "\nAnswer the question using a single word or phrase." 
17 | qwen_vl: 18 | pre_prompt: "" 19 | post_prompt: " Answer:" 20 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/cmmmu/cmmmu_val.yaml: -------------------------------------------------------------------------------- 1 | task: "cmmmu_val" 2 | test_split: val 3 | # The return value of process_results will be used by metrics 4 | process_results: !function utils.cmmmu_process_results 5 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 6 | generation_kwargs: 7 | max_new_tokens: 16 8 | image_aspect_ratio: original 9 | metric_list: 10 | - metric: cmmmu_acc 11 | aggregation: !function utils.cmmmu_aggregate_results 12 | higher_is_better: true 13 | metadata: 14 | - version: 0.0 15 | include: _default_template_cmmmu_yaml 16 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/_default_template_vqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/VizWiz-VQA 2 | output_type: generate_until 3 | doc_to_visual: !function utils.vizwiz_vqa_doc_to_visual 4 | doc_to_text: !function utils.vizwiz_vqa_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | metadata: 10 | - version: 0.0 11 | model_specific_prompt_kwargs: 12 | default: 13 | pre_prompt: "" 14 | post_prompt: "\nWhen the provided information is insufficient, respond with 'Unanswerable'.\nAnswer the question using a single word or phrase." 15 | process_results: !function utils.vizwiz_vqa_process_results 16 | -------------------------------------------------------------------------------- /scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": false, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/stvqa/stvqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ST-VQA 2 | task: "stvqa" 3 | test_split: test 4 | output_type: generate_until 5 | doc_to_visual: !function utils.stvqa_doc_to_visual 6 | doc_to_text: !function utils.stvqa_doc_to_text 7 | doc_to_target: "answers" 8 | generation_kwargs: 9 | max_new_tokens: 32 10 | temperature: 0 11 | do_sample: False 12 | process_results: !function utils.stvqa_process_results 13 | metric_list: 14 | - metric: submission 15 | aggregation: !function utils.stvqa_aggregate_submissions 16 | model_specific_prompt_kwargs: 17 | default: 18 | pre_prompt: "" 19 | post_prompt: "\nAnswer the question using a single word or phrase." 
20 | -------------------------------------------------------------------------------- /lmms-eval/miscs/repr_scripts.sh: -------------------------------------------------------------------------------- 1 | # install lmms_eval without building dependencies 2 | cd lmms_eval; 3 | pip install --no-deps -U -e . 4 | 5 | # install LLaVA without building dependencies 6 | cd LLaVA 7 | pip install --no-deps -U -e . 8 | 9 | # install all the requirements that require for reproduce llava results 10 | pip install -r llava_repr_requirements.txt 11 | 12 | # Run and exactly reproduce llava_v1.5 results! 13 | # mme as an example 14 | accelerate launch --num_processes=1 -m lmms_eval --model llava --model_args pretrained="liuhaotian/llava-v1.5-7b,use_flash_attention_2=False,device_map=auto" --tasks mme --batch_size 1 --log_samples --log_samples_suffix reproduce --output_path ./logs/ -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ocrbench/ocrbench.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: echo840/OCRBench 2 | dataset_kwargs: 3 | token: True 4 | task: "ocrbench" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.ocrbench_doc_to_visual 8 | doc_to_text: !function utils.ocrbench_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | max_new_tokens: 128 12 | temperature: 0 13 | top_p: 0 14 | num_beams: 1 15 | do_sample: false 16 | process_results: !function utils.ocrbench_process_results 17 | metric_list: 18 | - metric: ocrbench_accuracy 19 | aggregation: !function utils.ocrbench_aggregate_accuracy 20 | higher_is_better: true 21 | metadata: 22 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/models/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | AVAILABLE_MODELS = { 4 | "llava": "Llava", 5 | "llava_hf": "LlavaHf", 6 | "llava_sglang": "LlavaSglang", 7 | "qwen_vl": "Qwen_VL", 8 | "fuyu": "Fuyu", 9 | "gpt4v": "GPT4V", 10 | "instructblip": "InstructBLIP", 11 | "minicpm_v": "MiniCPM_V", 12 | "idefics2": "Idefics2", 13 | "qwen_vl_api": "Qwen_VL_API", 14 | "phi3v": "Phi3v", 15 | } 16 | 17 | for model_name, model_class in AVAILABLE_MODELS.items(): 18 | try: 19 | exec(f"from .{model_name} import {model_class}") 20 | except ImportError: 21 | pass 22 | 23 | 24 | import hf_transfer 25 | 26 | os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" 27 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/_default_template_mmbench_cn_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMBench 2 | dataset_kwargs: 3 | token: True 4 | doc_to_target: "answer" 5 | dataset_name: "cn" 6 | output_type: generate_until 7 | doc_to_visual: !function cn_utils.mmbench_doc_to_visual 8 | doc_to_text: !function cn_utils.mmbench_doc_to_text 9 | generation_kwargs: 10 | max_new_tokens: 256 11 | temperature: 0 12 | top_p: 0 13 | num_beams: 1 14 | do_sample: false 15 | process_results: !function cn_utils.mmbench_process_results 16 | model_specific_prompt_kwargs: 17 | default: 18 | pre_prompt: "" 19 | post_prompt: "\n请直接使用所提供的选项字母作为答案回答。" 20 | model_specific_generation_kwargs: 21 | llava: 22 | image_aspect_ratio: original 23 | -------------------------------------------------------------------------------- 
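The task configs in this dump all follow the same wiring: dataset columns are bound to Python hooks through the !function utils.<name> tag, and per-model prompt pieces are injected via model_specific_prompt_kwargs. As a rough sketch of that contract (the mytask_* names below are hypothetical placeholders, modelled on the gqa, stvqa, and docvqa utils that appear later in this dump, not on any file in the repository):

def mytask_doc_to_visual(doc):
    # Return the list of images the model should see for this document.
    return [doc["image"].convert("RGB")]

def mytask_doc_to_text(doc, model_specific_prompt_kwargs):
    # Wrap the raw question with the per-model pre/post prompts from the YAML.
    pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
    post_prompt = model_specific_prompt_kwargs["post_prompt"]
    return f"{pre_prompt}{doc['question']}{post_prompt}"

def mytask_process_results(doc, results):
    # Keys returned here must match the metric names declared in metric_list.
    pred = results[0].strip().lower()
    return {"exact_match": float(pred == doc["answer"].strip().lower())}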
/lmms-eval/lmms_eval/filters/decontamination.py: -------------------------------------------------------------------------------- 1 | from lmms_eval.api.filter import Filter 2 | 3 | 4 | class DecontaminationFilter(Filter): 5 | """ 6 | A filter which evaluates 7 | """ 8 | 9 | name = "track_decontamination" 10 | 11 | def __init__(self, path) -> None: 12 | """ 13 | 14 | TODO: make sure only ever run one time on the train set (should this be cached as a class var? keyed by value for "path"). 15 | should further cache result on a given (task_name, doc_id) 16 | """ 17 | self._decontam_results = None 18 | 19 | def apply(self, resps, docs) -> None: 20 | """ 21 | Return {"no_contamination", "only_contamination"} keys for the 2 different subsets 22 | """ 23 | pass 24 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multidocvqa/multidocvqa_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MP-DocVQA 2 | task: "multidocvqa_test" 3 | test_split: test 4 | output_type: generate_until 5 | doc_to_visual: !function utils.multidocvqa_doc_to_visual 6 | doc_to_text: !function utils.multidocvqa_doc_to_text 7 | doc_to_target: "answers" 8 | generation_kwargs: 9 | max_new_tokens: 32 10 | temperature: 0 11 | do_sample: False 12 | process_results: !function utils.multidocvqa_process_test_results_for_submission 13 | metric_list: 14 | - metric: submission 15 | aggregation: !function utils.multidocvqa_test_aggregate_results_for_submission 16 | model_specific_prompt_kwargs: 17 | default: 18 | pre_prompt: "" 19 | post_prompt: "\nAnswer the question using a single word or phrase." 20 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmmu/mmmu_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMMU 2 | task: "mmmu_test" 3 | test_split: test 4 | output_type: generate_until 5 | doc_to_visual: !function utils.mmmu_doc_to_visual 6 | doc_to_text: !function utils.mmmu_doc_to_text 7 | doc_to_target: "answer" 8 | # The return value of process_results will be used by metrics 9 | process_results: !function utils.mmmu_process_results 10 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | image_aspect_ratio: original 14 | metric_list: 15 | - metric: submission 16 | aggregation: !function utils.mmmu_test_aggregate_results_for_submission 17 | higher_is_better: true 18 | metadata: 19 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/websrc/websrc_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: rootsautomation/websrc 2 | task: "websrc_val" 3 | test_split: dev 4 | output_type: generate_until 5 | doc_to_visual: !function utils.websrc_doc_to_visual 6 | doc_to_text: !function utils.websrc_doc_to_text 7 | doc_to_target: "answer" 8 | # The return value of process_results will be used by metrics 9 | process_results: !function utils.websrc_process_results 10 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | image_aspect_ratio: pad 14 | metric_list: 15 | - metric: websrc_squad_f1 16 
| aggregation: !function utils.websrc_aggregate_results 17 | higher_is_better: true 18 | metadata: 19 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/websrc/websrc_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: rootsautomation/websrc-test 2 | task: "websrc_test" 3 | test_split: test 4 | output_type: generate_until 5 | doc_to_visual: !function utils.websrc_doc_to_visual 6 | doc_to_text: !function utils.websrc_doc_to_text 7 | doc_to_target: "answer" 8 | # The return value of process_results will be used by metrics 9 | process_results: !function utils.websrc_process_results 10 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | image_aspect_ratio: pad 14 | metric_list: 15 | - metric: submission 16 | aggregation: !function utils.websrc_test_aggregate_results_for_submission 17 | higher_is_better: true 18 | metadata: 19 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/_default_template_mmbench_en_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMBench 2 | dataset_kwargs: 3 | token: True 4 | doc_to_target: "answer" 5 | model_specific_prompt_kwargs: 6 | default: 7 | pre_prompt: "" 8 | post_prompt: "\nAnswer with the option's letter from the given choices directly." 9 | doc_to_visual: !function en_utils.mmbench_doc_to_visual 10 | doc_to_text: !function en_utils.mmbench_doc_to_text 11 | doc_to_target: "answer" 12 | process_results: !function en_utils.mmbench_process_results 13 | model_specific_generation_kwargs: 14 | llava: 15 | image_aspect_ratio: original 16 | output_type: generate_until 17 | dataset_name: "en" 18 | generation_kwargs: 19 | until: 20 | - "ASSISTANT:" 21 | max_new_tokens: 1024 22 | temperature: 0 23 | top_p: 0 24 | num_beams: 1 25 | do_sample: false 26 | -------------------------------------------------------------------------------- /ola_vlm/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2 3 | from .clip_convnext_encoder import CLIPConvNextVisionTower 4 | 5 | 6 | def build_vision_tower(vision_tower_cfg, **kwargs): 7 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 8 | if "clip" in vision_tower and "convnext" not in vision_tower: 9 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 10 | elif "convnext" in vision_tower.lower(): 11 | return CLIPConvNextVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 12 | elif "sam" in vision_tower.lower(): 13 | return SAMVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 14 | 15 | raise ValueError(f'Unknown vision tower: {vision_tower}') 16 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmmu/mmmu_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMMU 2 | task: "mmmu_val" 3 | test_split: validation 4 | output_type: generate_until 5 | doc_to_visual: !function utils.mmmu_doc_to_visual 6 | doc_to_text: !function utils.mmmu_doc_to_text 7 | doc_to_target: 
"answer" 8 | # The return value of process_results will be used by metrics 9 | process_results: !function utils.mmmu_process_results 10 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | model_specific_generation_kwargs: 14 | llava: 15 | image_aspect_ratio: original 16 | metric_list: 17 | - metric: mmmu_acc 18 | aggregation: !function utils.mmmu_aggregate_results 19 | higher_is_better: true 20 | metadata: 21 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/olympiadbench/olympiadbench_test_cn.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/OlympiadBench 2 | dataset_kwargs: 3 | token: True 4 | task : "olympiadbench_test_cn" 5 | test_split: test_cn 6 | output_type: generate_until 7 | doc_to_visual: !function cn_utils.olympiadbench_doc_to_visual 8 | doc_to_text: !function cn_utils.olympiadbench_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 1024 14 | temperature: 0 15 | top_p: 0 16 | num_beams: 1 17 | do_sample: false 18 | process_results: !function cn_utils.olympiadbench_process_results 19 | metric_list: 20 | - metric: submission 21 | aggregation: !function cn_utils.olympiadbench_aggregate_results 22 | higher_is_better: true 23 | - metric: exact_match 24 | aggregation: mean 25 | higher_is_better: true -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/olympiadbench/olympiadbench_test_en.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/OlympiadBench 2 | dataset_kwargs: 3 | token: True 4 | task : "olympiadbench_test_en" 5 | test_split: test_en 6 | output_type: generate_until 7 | doc_to_visual: !function en_utils.olympiadbench_doc_to_visual 8 | doc_to_text: !function en_utils.olympiadbench_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 1024 14 | temperature: 0 15 | top_p: 0 16 | num_beams: 1 17 | do_sample: false 18 | process_results: !function en_utils.olympiadbench_process_results 19 | metric_list: 20 | - metric: submission 21 | aggregation: !function en_utils.olympiadbench_aggregate_results 22 | higher_is_better: true 23 | - metric: exact_match 24 | aggregation: mean 25 | higher_is_better: true -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2017_cap_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/COCO-Caption2017 2 | dataset_kwargs: 3 | token: True 4 | task : "coco2017_cap_test" 5 | group : "coco_caption2017" 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.coco_doc_to_visual 9 | doc_to_text: !function utils.coco_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 128 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.coco_test_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: coco_passthrough 21 | aggregation : !function 
utils.coco_test_aggregation_result 22 | higher_is_better : true 23 | metadata: 24 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multidocvqa/multidocvqa_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MP-DocVQA 2 | task: "multidocvqa_val" 3 | test_split: val 4 | output_type: generate_until 5 | doc_to_visual: !function utils.multidocvqa_doc_to_visual 6 | doc_to_text: !function utils.multidocvqa_doc_to_text 7 | doc_to_target: "answers" 8 | generation_kwargs: 9 | max_new_tokens: 32 10 | temperature: 0 11 | do_sample: False 12 | process_results: !function utils.multidocvqa_process_results 13 | metric_list: 14 | - metric: anls 15 | aggregation: !function utils.multidocvqa_aggregate_results_anls 16 | higher_is_better: true 17 | - metric: accuracy 18 | aggregation: !function utils.multidocvqa_aggregate_results_accuracy 19 | higher_is_better: true 20 | model_specific_prompt_kwargs: 21 | default: 22 | pre_prompt: "" 23 | post_prompt: "\nAnswer the question using a single word or phrase." 24 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/gqa/gqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/GQA 2 | dataset_name: testdev_balanced_instructions 3 | dataset_kwargs: 4 | token: True 5 | task: "gqa" 6 | test_split: testdev 7 | output_type: generate_until 8 | doc_to_visual: !function utils.gqa_doc_to_visual 9 | doc_to_text: !function utils.gqa_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | metric_list: 18 | - metric: exact_match 19 | aggregation: mean 20 | higher_is_better: true 21 | ignore_case: true 22 | ignore_punctuation: true 23 | metadata: 24 | - version: 0.0 25 | 26 | model_specific_prompt_kwargs: 27 | default: 28 | pre_prompt: "" 29 | post_prompt: "\nAnswer the question using a single word or phrase." 30 | qwen_vl: 31 | pre_prompt: "" 32 | post_prompt: " Answer:" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2014_cap_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/COCO-Caption 2 | dataset_kwargs: 3 | token: True 4 | task : "coco2014_cap_test" 5 | group : "coco_caption" 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.coco_doc_to_visual 9 | doc_to_text: "Provide a one-sentence caption for the provided image." 
10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 128 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.coco_test_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: coco_passthrough 21 | aggregation : !function utils.coco_test_aggregation_result 22 | higher_is_better : true 23 | metadata: 24 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/gqa/utils.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | GQA_RAW_IMAGE_DATASET = None 4 | GQA_ID2IMAGE = None 5 | 6 | 7 | def gqa_doc_to_visual(doc): 8 | global GQA_RAW_IMAGE_DATASET 9 | global GQA_ID2IMAGE 10 | if GQA_RAW_IMAGE_DATASET is None: 11 | GQA_RAW_IMAGE_DATASET = load_dataset("lmms-lab/GQA", "testdev_balanced_images", split="testdev", token=True) 12 | GQA_ID2IMAGE = {} 13 | for row in GQA_RAW_IMAGE_DATASET: 14 | GQA_ID2IMAGE[row["id"]] = row["image"].convert("RGB") 15 | image = GQA_ID2IMAGE[doc["imageId"]] 16 | return [image] 17 | 18 | 19 | def gqa_doc_to_text(doc, model_specific_prompt_kwargs): 20 | question = doc["question"] 21 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 22 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 23 | return f"{pre_prompt}{question}{post_prompt}" 24 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ok_vqa/_default_template_vqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/OK-VQA 2 | output_type: generate_until 3 | doc_to_visual: !function utils.ok_vqa_doc_to_visual 4 | doc_to_text: !function utils.ok_vqa_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | metric_list: 10 | - metric: exact_match 11 | aggregation: mean 12 | higher_is_better: true 13 | ignore_case: true 14 | ignore_punctuation: true 15 | - metric: submission 16 | aggregation: !function utils.ok_vqa_aggreate_submissions 17 | higher_is_better: true 18 | process_results: !function utils.ok_vqa_process_results 19 | model_specific_prompt_kwargs: 20 | default: 21 | pre_prompt: "" 22 | post_prompt: "\nWhen the provided information is insufficient, respond with 'Unanswerable'.\nAnswer the question using a single word or phrase." 
23 | metadata: 24 | - version: 0.0 -------------------------------------------------------------------------------- /scripts/zero2_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "offload_optimizer": { 19 | "device": "cpu", 20 | "pin_memory": true 21 | }, 22 | "offload_param": { 23 | "device": "cpu", 24 | "pin_memory": true 25 | }, 26 | "overlap_comm": false, 27 | "contiguous_gradients": true, 28 | "sub_group_size": 1e9, 29 | "reduce_bucket_size": "auto" 30 | } 31 | } -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmvet/mmvet.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMVet 2 | dataset_kwargs: 3 | token: True 4 | task: "mmvet" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.mmvet_doc_to_visual 8 | doc_to_text: !function utils.doc_to_text # Such that {{question}} will be replaced by doc["question"] 9 | doc_to_target: "{{answer}}" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 1024 14 | temperature: 0 15 | top_p: 0 16 | num_beams: 1 17 | do_sample: false 18 | process_results: !function utils.mmvet_process_results # apply gpt eval here 19 | metric_list: 20 | - metric: gpt_eval_score 21 | aggregation: !function utils.mmvet_aggregate_results 22 | higher_is_better: true 23 | metadata: 24 | version: 0.0 25 | gpt_eval_model_name: "gpt-4" 26 | model_specific_prompt_kwargs: 27 | default: 28 | pre_prompt: "" 29 | post_prompt: "" 30 | -------------------------------------------------------------------------------- /scripts/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/nocaps/nocaps_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/NoCaps 2 | dataset_kwargs: 3 | token: True 4 | task : "nocaps_test" 5 | group : "nocaps_caption" 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.nocaps_doc_to_visual 9 | doc_to_text: !function utils.nocaps_doc_to_text 10 | doc_to_target: "annotations_captions" 11 | generation_kwargs: 12 | max_new_tokens: 64 13 | temperature: 0 14 | 
top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.nocaps_test_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: nocaps_passthrough 21 | aggregation : !function utils.nocaps_test_aggregation_result 22 | higher_is_better : true 23 | metadata: 24 | - version: 0.0 25 | include: _default_template_nocaps_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathvista/mathvista_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: AI4Math/MathVista 2 | dataset_kwargs: 3 | token: True 4 | task: "mathvista_test" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.mathvista_doc_to_visual 8 | doc_to_text: !function utils.mathvista_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 1024 14 | temperature: 0 15 | top_p: 0 16 | num_beams: 1 17 | do_sample: false 18 | process_results: !function utils.mathvista_process_results 19 | metric_list: 20 | - metric: submission 21 | aggregation: !function utils.mathvista_aggregate_results 22 | higher_is_better: true 23 | 24 | model_specific_prompt_kwargs: 25 | default: 26 | shot_type: "format-prompt" # can be "reason-first", "solution", "step-by-step" 27 | model_specific_generation_kwargs: 28 | llava: 29 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textcaps/textcaps_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/TextCaps 2 | dataset_kwargs: 3 | token: True 4 | task : "textcaps_test" 5 | group : "textcaps_caption" 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.textcaps_doc_to_visual 9 | doc_to_text: !function utils.textcaps_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 64 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.textcaps_test_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: textcaps_passthrough 21 | aggregation : !function utils.textcaps_test_aggregation_result 22 | higher_is_better : true 23 | metadata: 24 | - version: 0.0 25 | include: _default_template_textcaps_yaml -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # https://git-scm.com/docs/gitattributes 2 | # Set the default behavior, in case people don't have core.autocrlf set. 
3 | # https://git-scm.com/docs/gitattributes#_end_of_line_conversion 4 | * text=auto 5 | # common python attributes, taken from https://github.com/alexkaratarakis/gitattributes/blob/710900479a2bedeec7003d381719521ffbb18bf8/Python.gitattributes 6 | # Source files 7 | # ============ 8 | *.pxd text diff=python 9 | *.py text diff=python 10 | *.py3 text diff=python 11 | *.pyw text diff=python 12 | *.pyx text diff=python 13 | *.pyz text diff=python 14 | *.pyi text diff=python 15 | # Binary files 16 | # ============ 17 | *.db binary 18 | *.p binary 19 | *.pkl binary 20 | *.pickle binary 21 | *.pyc binary export-ignore 22 | *.pyo binary export-ignore 23 | *.pyd binary 24 | # Jupyter notebook 25 | *.ipynb text eol=lf 26 | datasets/ocr_vqa/dataset.json filter=lfs diff=lfs merge=lfs -text 27 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/iconqa/_default_template_docvqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ICON-QA 2 | dataset_kwargs: 3 | token: True 4 | output_type: generate_until 5 | doc_to_visual: !function utils.doc_to_visual 6 | doc_to_text: !function utils.doc_to_text 7 | doc_to_target: "answers" 8 | # process_results: !function utils.test_process_results 9 | generation_kwargs: 10 | max_new_tokens: 32 11 | temperature: 0 12 | do_sample: False 13 | model_specific_prompt_kwargs: 14 | default: 15 | pre_prompt: "" 16 | statement: "Given a set of images and a question, please provide the answer to the question.\n" 17 | options_statement: "Question: {question}.\nOptions:\n{options}\nPlease answer with the option letter from the given choices directly." 18 | freeform_statement: "Question: {question}.\nPlease answer the question using a single word or phrase." 19 | metric_list: 20 | - metric: anls 21 | aggregation: mean 22 | higher_is_better: true -------------------------------------------------------------------------------- /scripts/eval/mmstar.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | for IDX in $(seq 0 $((CHUNKS-1))); do 9 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m ola_vlm.eval.model_mmstar_loader \ 10 | --model-path $1 --path datasets/eval/MMStar \ 11 | --answers-file datasets/eval/results/$2/mmstar/${CHUNKS}_${IDX}.jsonl \ 12 | --num-chunks $CHUNKS --chunk-idx $IDX --temperature 0 --conv-mode $3 & 13 | done 14 | 15 | wait 16 | 17 | output_file=datasets/eval/results/$2/mmstar/merge.jsonl 18 | 19 | # Clear out the output file if it exists. 20 | > "$output_file" 21 | 22 | # Loop through the indices and concatenate each file. 
23 | for IDX in $(seq 0 $((CHUNKS-1))); do 24 | cat datasets/eval/results/$2/mmstar/${CHUNKS}_${IDX}.jsonl >> "$output_file" 25 | done 26 | 27 | python ola_vlm/eval/eval_mmstar.py --results_file $output_file 28 | 29 | -------------------------------------------------------------------------------- /lmms-eval/miscs/script.sh: -------------------------------------------------------------------------------- 1 | accelerate launch --num_processes=1 -m lmms_eval --model llava --model_args pretrained="liuhaotian/llava-v1.5-7b" --tasks mme_llava_prompt --batch_size 1 --log_samples --log_samples_sufix debug --output_path ./logs/ 2 | 3 | 4 | gpu = 8 bs 1: 5 | 6 | llava (pretrained=llava-hf/llava-1.5-7b-hf), gen_kwargs: (), limit: None, num_fewshot: None, batch_size: 1 7 | | Tasks |Version|Filter|n-shot| Metric |Value| |Stderr | 8 | |----------------|-------|------|-----:|-----------|----:|---|------:| 9 | |mme_llava_prompt|Yaml |none | 0|exact_match| 1873|± |38.4331| 10 | 11 | gpu = 8 bs 1 use_flash_attention_2=True: 12 | 13 | 14 | 15 | 16 | 17 | gpu = 4 bs 1 use_flash_attention_2=True: 18 | 19 | 20 | 21 | accelerate launch --num_processes=8 -m lmms_eval --model llava --model_args pretrained="liuhaotian/llava-v1.5-13b" --tasks scienceqa --batch_size 1 --log_samples --log_samples_sufix debug --output_path ./logs/ 22 | -------------------------------------------------------------------------------- /scripts/eval/cv-bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | for IDX in $(seq 0 $((CHUNKS-1))); do 9 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m ola_vlm.eval.model_cvbench_loader \ 10 | --model-path $1 --path datasets/eval/CV-Bench \ 11 | --answers-file datasets/eval/results/$2/cv-bench/${CHUNKS}_${IDX}.jsonl \ 12 | --num-chunks $CHUNKS --chunk-idx $IDX --temperature 0 --conv-mode $3 & 13 | done 14 | 15 | wait 16 | 17 | output_file=datasets/eval/results/$2/cv-bench/merge.jsonl 18 | 19 | # Clear out the output file if it exists. 20 | > "$output_file" 21 | 22 | # Loop through the indices and concatenate each file. 
23 | for IDX in $(seq 0 $((CHUNKS-1))); do 24 | cat datasets/eval/results/$2/cv-bench/${CHUNKS}_${IDX}.jsonl >> "$output_file" 25 | done 26 | 27 | python ola_vlm/eval/eval_cv_bench.py --results_file $output_file 28 | 29 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ok_vqa/_generate_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | splits = ["val2014"] 5 | tasks = ["vqa"] 6 | 7 | if __name__ == "__main__": 8 | dump_tasks = [] 9 | for task in tasks: 10 | for split in splits: 11 | yaml_dict = {"group": f"ok_vqa", "task": f"ok_vqa_{split}", "include": f"_default_template_{task}_yaml", "test_split": split} 12 | if split == "train": 13 | yaml_dict.pop("group") 14 | else: 15 | dump_tasks.append(f"ok_vqa_{split}") 16 | 17 | save_path = f"./ok_vqa_{split}.yaml" 18 | print(f"Saving to {save_path}") 19 | with open(save_path, "w") as f: 20 | yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) 21 | 22 | group_dict = {"group": "ok_vqa", "task": dump_tasks} 23 | 24 | with open("./_ok_vqa.yaml", "w") as f: 25 | yaml.dump(group_dict, f, default_flow_style=False, indent=4) 26 | -------------------------------------------------------------------------------- /lmms-eval/llava_repr_requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.21.0 2 | datasets==2.16.1 3 | evaluate==0.4.1 4 | hf_transfer==0.1.6 5 | Jinja2==3.1.3 6 | numpy==1.26.4 7 | openai==1.13.3 8 | packaging==23.2 9 | pandas==2.2.1 10 | Pillow==10.2.0 11 | protobuf==4.25.3 12 | pycocoevalcap==1.2 13 | pycocotools==2.0.7 14 | pytablewriter==1.2.0 15 | pytest==8.0.2 16 | python_Levenshtein==0.25.0 17 | pytz==2024.1 18 | PyYAML==6.0.1 19 | PyYAML==6.0.1 20 | Requests==2.31.0 21 | sacrebleu==2.4.0 22 | scikit_learn==1.2.2 23 | sentencepiece==0.1.99 24 | setuptools==68.2.2 25 | sglang==0.1.12 26 | shortuuid==1.0.12 27 | sqlitedict==2.1.0 28 | tenacity==8.2.3 29 | torch==2.0.1 30 | openai>=1.0.0 31 | pycocoevalcap 32 | tokenizers==0.15.2 33 | tqdm==4.66.2 34 | tqdm-multiprocess 35 | transformers==4.37.2 36 | zstandard 37 | pillow 38 | pyyaml 39 | sympy 40 | mpmath 41 | Jinja2 42 | openpyxl 43 | Levenshtein 44 | hf_transfer 45 | tenacity 46 | wandb>=0.16.0 47 | transformers-stream-generator 48 | tiktoken 49 | pre-commit -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathvista/mathvista_testmini.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: AI4Math/MathVista 2 | dataset_kwargs: 3 | token: True 4 | task: "mathvista_testmini" 5 | test_split: testmini 6 | output_type: generate_until 7 | doc_to_visual: !function utils.mathvista_doc_to_visual 8 | doc_to_text: !function utils.mathvista_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 1024 14 | temperature: 0 15 | top_p: 0 16 | num_beams: 1 17 | do_sample: false 18 | process_results: !function utils.mathvista_process_results 19 | metric_list: 20 | - metric: gpt_eval_score 21 | aggregation: !function utils.mathvista_aggregate_results 22 | higher_is_better: true 23 | 24 | model_specific_prompt_kwargs: 25 | default: 26 | shot_type: "format-prompt" # can be "reason-first", "solution", "step-by-step" 27 | phi3v: 28 | shot_type: "solution" 29 | model_specific_generation_kwargs: 30 | llava: 31 | image_aspect_ratio: 
original -------------------------------------------------------------------------------- /ola_vlm/model/aux_heads/depth_anything_v2/dinov2_layers/layer_scale.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 8 | 9 | from typing import Union 10 | 11 | import torch 12 | from torch import Tensor 13 | from torch import nn 14 | 15 | 16 | class LayerScale(nn.Module): 17 | def __init__( 18 | self, 19 | dim: int, 20 | init_values: Union[float, Tensor] = 1e-5, 21 | inplace: bool = False, 22 | ) -> None: 23 | super().__init__() 24 | self.inplace = inplace 25 | self.gamma = nn.Parameter(init_values * torch.ones(dim)) 26 | 27 | def forward(self, x: Tensor) -> Tensor: 28 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 29 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/stvqa/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | 5 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file 6 | 7 | 8 | def stvqa_doc_to_text(doc, model_specific_prompt_kwargs): 9 | question = doc["question"] 10 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 11 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 12 | return f"{pre_prompt}{question}{post_prompt}" 13 | 14 | 15 | def stvqa_doc_to_visual(doc): 16 | return [doc["image"].convert("RGB")] 17 | 18 | 19 | def stvqa_process_results(doc, results): 20 | answer = results[0] 21 | return {"submission": {"question_id": int(doc["question_id"]), "answer": answer}} 22 | 23 | 24 | def stvqa_aggregate_submissions(results, args): 25 | file = generate_submission_file("stvqa_test_for_submission.json", args) 26 | with open(file, "w") as f: 27 | json.dump(results, f) 28 | logging.getLogger("lmms-eval").info(f"Results saved to {file}") 29 | -------------------------------------------------------------------------------- /ola_vlm/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. 
[Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/_generate_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | splits = ["val", "test"] 5 | tasks = ["vqa"] 6 | 7 | if __name__ == "__main__": 8 | dump_tasks = [] 9 | for task in tasks: 10 | for split in splits: 11 | yaml_dict = {"group": f"vizwiz_{task}", "task": f"vizwiz_{task}_{split}", "include": f"_default_template_{task}_yaml", "test_split": split} 12 | if split == "train": 13 | yaml_dict.pop("group") 14 | else: 15 | dump_tasks.append(f"vizwiz_{task}_{split}") 16 | 17 | save_path = f"./vizwiz_{task}_{split}.yaml" 18 | print(f"Saving to {save_path}") 19 | with open(save_path, "w") as f: 20 | yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) 21 | 22 | group_dict = {"group": "vizwiz_vqa", "task": dump_tasks} 23 | 24 | with open("./_vizwiz_vqa.yaml", "w") as f: 25 | yaml.dump(group_dict, f, default_flow_style=False, indent=4) 26 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/chartqa/chartqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ChartQA 2 | dataset_kwargs: 3 | token: True 4 | task: "chartqa" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.chartqa_doc_to_visual 8 | doc_to_text: !function utils.chartqa_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | max_new_tokens: 16 12 | temperature: 0 13 | do_sample: False 14 | process_results: !function utils.chartqa_process_results 15 | metric_list: 16 | - metric: relaxed_overall 17 | aggregation: mean 18 | higher_is_better: true 19 | - metric: relaxed_human_split 20 | aggregation: mean 21 | higher_is_better: true 22 | - metric: relaxed_augmented_split 23 | aggregation: mean 24 | higher_is_better: true 25 | metadata: 26 | - version: 0.0 27 | model_specific_prompt_kwargs: 28 | default: 29 | pre_prompt: "" 30 | post_prompt: "\nAnswer the question with a single word." 
31 | qwen_vl: 32 | pre_prompt: "" 33 | post_prompt: " Answer:" 34 | 35 | -------------------------------------------------------------------------------- /ola_vlm/eval/merge_json.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser( 6 | description='Probe eval') 7 | parser.add_argument('--ckpt', 8 | help='ckpt', 9 | default='probe_llava-1.5-vicuna-7b-lr-1e-3') 10 | parser.add_argument('--mode', 11 | help='mode', 12 | default='gen') 13 | parser.add_argument("--num-chunks", type=int, default=1) 14 | 15 | 16 | def save_merged_json(data, output_file): 17 | with open(output_file, 'w') as file: 18 | json.dump(data, file, indent=4) 19 | 20 | if __name__ == "__main__": 21 | args = parser.parse_args() 22 | merge_data = {} 23 | name = args.ckpt.split("/")[-1] 24 | 25 | for i in range(args.num_chunks): 26 | with open(f'plots/probe_scores/{name}/{args.mode}/{args.num_chunks}_{i}.json', 'r') as file: 27 | data = json.load(file) 28 | merge_data.update(data) 29 | 30 | save_merged_json(merge_data, f'plots/probe_scores/{name}/{args.mode}.json') -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ai2d/ai2d.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ai2d 2 | task: "ai2d" 3 | dataset_kwargs: 4 | token: True 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.ai2d_doc_to_visual 8 | doc_to_text: !function utils.ai2d_doc_to_text 9 | doc_to_target: !function utils.ai2d_doc_to_target 10 | generation_kwargs: 11 | max_new_tokens: 16 12 | temperature: 0 13 | do_sample: False 14 | metric_list: 15 | - metric: exact_match 16 | aggregation: mean 17 | higher_is_better: true 18 | ignore_case: true 19 | ignore_punctuation: true 20 | metadata: 21 | - version: 0.0 22 | 23 | model_specific_prompt_kwargs: 24 | default: 25 | prompt_format: mcq 26 | pre_prompt: "" 27 | post_prompt: "\nAnswer with the option's letter from the given choices directly." 
28 | # qwen formulate ai2d as question answering instead of mcq 29 | qwen_vl: 30 | prompt_format: qa 31 | pre_prompt: "" 32 | post_prompt: " Answer:" 33 | 34 | model_specific_target_kwargs: 35 | default: "mcq" 36 | qwen_vl: "qa" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/api/instance.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Literal, Tuple 3 | 4 | 5 | @dataclass 6 | class Instance: 7 | request_type: Literal["loglikelihood", "generate_until"] 8 | arguments: tuple 9 | idx: int 10 | metadata: Tuple[str, int, int] = field(default_factory=lambda: (None, None, None)) # TODO: better typehints here 11 | resps: list = field(default_factory=list) 12 | filtered_resps: dict = field(default_factory=dict) 13 | 14 | # initialized after init 15 | task_name: str = None 16 | doc_id: str = None 17 | repeats: str = None 18 | doc: dict = None 19 | 20 | def __post_init__(self) -> None: 21 | # unpack metadata field 22 | self.task_name, self.doc_id, self.repeats = self.metadata 23 | 24 | @property 25 | def args(self): 26 | """ 27 | Returns (string,) where `string` is the string to calculate loglikelihood over 28 | """ 29 | return self.arguments if isinstance(self.arguments, tuple) else (self.arguments,) 30 | -------------------------------------------------------------------------------- /ola_vlm/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m ola_vlm.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from ola_vlm.model import * 10 | from ola_vlm.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/scienceqa/scienceqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ScienceQA 2 | dataset_name: ScienceQA-FULL 3 | task: "scienceqa" 4 | dataset_kwargs: 5 | token: True 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.sqa_doc_to_visual 9 | doc_to_text: !function utils.sqa_doc_to_text 10 | doc_to_target: !function utils.sqa_doc_to_target 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | temperature: 0 14 | do_sample: False 15 | metric_list: 16 | - metric: exact_match 17 | aggregation: mean 18 | higher_is_better: true 19 | ignore_case: true 20 | ignore_punctuation: true 21 | process_results: !function utils.sqa_process_results 22 | metadata: 23 | - version: 0.0 24 | 25 | model_specific_prompt_kwargs: 26 | default: 27 | format: default 28 | pre_prompt: 
"" 29 | post_prompt: "\nAnswer with the option's letter from the given choices directly." 30 | qwen_vl: 31 | format: qwen_vl 32 | 33 | model_specific_generation_kwargs: 34 | llava: 35 | image_aspect_ratio: original 36 | 37 | -------------------------------------------------------------------------------- /lmms-eval/miscs/test_llava.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from PIL import Image 3 | 4 | import torch 5 | from transformers import AutoProcessor, LlavaForConditionalGeneration 6 | 7 | model_id = "llava-hf/llava-1.5-7b-hf" 8 | 9 | prompt_1 = "USER: \nWhat does this image show?\nASSISTANT:" 10 | prompt_2 = "USER: \nWhat is the difference between these two images?\nASSISTANT:" 11 | image_file_1 = "image1.png" 12 | image_file_2 = "image2.png" 13 | model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_flash_attention_2=True).to(0) 14 | processor = AutoProcessor.from_pretrained(model_id) 15 | raw_image_1 = Image.open(image_file_1) 16 | raw_image_2 = Image.open(image_file_2) 17 | inputs = processor([prompt_1, prompt_2], [raw_image_1, raw_image_1, raw_image_2], padding=True, return_tensors="pt").to(0, torch.float16) 18 | import pdb 19 | 20 | pdb.set_trace() 21 | output = model.generate(**inputs, max_new_tokens=200, do_sample=False) 22 | print(processor.batch_decode(output, skip_special_tokens=True)) 23 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/_generate_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | # splits = ["train", "test", "val"] 5 | splits = ["test", "val"] 6 | tasks = ["seg", "bbox"] 7 | 8 | if __name__ == "__main__": 9 | dump_tasks = [] 10 | for task in tasks: 11 | for split in splits: 12 | yaml_dict = {"group": f"refcocog_{task}", "task": f"refcocog_{task}_{split}", "include": f"_default_template_{task}_yaml", "test_split": split} 13 | if split == "train": 14 | yaml_dict.pop("group") 15 | else: 16 | dump_tasks.append(f"refcoco_{task}_{split}") 17 | 18 | save_path = f"./refcocog_{task}_{split}.yaml" 19 | print(f"Saving to {save_path}") 20 | with open(save_path, "w") as f: 21 | yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) 22 | 23 | group_dict = {"group": "refcocog", "task": dump_tasks} 24 | 25 | with open("./_refcoco.yaml", "w") as f: 26 | yaml.dump(group_dict, f, default_flow_style=False, indent=4) 27 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/seedbench/seedbench.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/SEED-Bench 2 | dataset_kwargs: 3 | token: True 4 | task: "seedbench" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.seed_doc_to_visual 8 | doc_to_text: !function utils.seed_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | image_aspect_ratio: original 14 | # The return value of process_results will be used by metrics 15 | process_results: !function utils.seed_process_result 16 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 17 | metric_list: 18 | - metric: seed_image 19 | aggregation: !function utils.seed_aggregation_result 20 | higher_is_better: true 21 
| - metric: seed_video 22 | aggregation: !function utils.seed_aggregation_result 23 | higher_is_better: true 24 | - metric: seed_all 25 | aggregation: !function utils.seed_aggregation_result 26 | higher_is_better: true 27 | metadata: 28 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/_generate_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | # splits = ["train", "val", "testA", "testB"] 5 | splits = ["val", "testA", "testB"] 6 | tasks = ["seg", "bbox"] 7 | 8 | if __name__ == "__main__": 9 | dump_tasks = [] 10 | for task in tasks: 11 | for split in splits: 12 | yaml_dict = {"group": f"refcoco+_{task}", "task": f"refcoco+_{task}_{split}", "include": f"_default_template_{task}_yaml", "test_split": split} 13 | if split == "train": 14 | yaml_dict.pop("group") 15 | else: 16 | dump_tasks.append(f"refcoco_{task}_{split}") 17 | 18 | save_path = f"./refcoco+_{task}_{split}.yaml" 19 | print(f"Saving to {save_path}") 20 | with open(save_path, "w") as f: 21 | yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) 22 | 23 | group_dict = {"group": "refcoco+", "task": dump_tasks} 24 | 25 | with open("./_refcoco.yaml", "w") as f: 26 | yaml.dump(group_dict, f, default_flow_style=False, indent=4) 27 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/_generate_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | # splits = ["train", "test", "val", "testA", "testB"] 5 | splits = ["test", "val", "testA", "testB"] 6 | tasks = ["seg", "bbox"] 7 | 8 | if __name__ == "__main__": 9 | dump_tasks = [] 10 | for task in tasks: 11 | for split in splits: 12 | yaml_dict = {"group": f"refcoco_{task}", "task": f"refcoco_{task}_{split}", "test_split": split, "include": f"_default_template_{task}_yaml"} 13 | if split == "train": 14 | yaml_dict.pop("group") 15 | else: 16 | dump_tasks.append(f"refcoco_{task}_{split}") 17 | 18 | save_path = f"./refcoco_{task}_{split}.yaml" 19 | print(f"Saving to {save_path}") 20 | with open(save_path, "w") as f: 21 | yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) 22 | 23 | group_dict = {"group": "refcoco", "task": dump_tasks} 24 | 25 | with open("./_refcoco.yaml", "w") as f: 26 | yaml.dump(group_dict, f, default_flow_style=False, indent=4) 27 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_cc.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMBench 2 | dataset_name: cc 3 | dataset_kwargs: 4 | token: True 5 | task: "mmbench_cn_cc" 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function cc_utils.mmbench_doc_to_visual 9 | doc_to_text: !function cc_utils.mmbench_cn_cc_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 256 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function cc_utils.mmbench_cn_cc_process_results 18 | metric_list: 19 | - metric: gpt_eval_score 20 | aggregation: !function cc_utils.mmbench_cn_cc_aggregate_dev_results_eval 21 | higher_is_better: true 22 | - metric: submission 23 | aggregation: !function cc_utils.mmbench_cn_cc_aggregate_results 24 | metadata: 25 | version: 0.0 26 | 
gpt_eval_model_name: "gpt-3.5-turbo-0613" 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | pre_prompt: "" 31 | post_prompt: "\n请直接使用所提供的选项字母作为答案回答。" 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/infovqa/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import logging 4 | 5 | 6 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file 7 | 8 | lmms_logger = logging.getLogger("lmms-eval") 9 | 10 | 11 | def infovqa_doc_to_visual(doc): 12 | return [doc["image"].convert("RGB")] 13 | 14 | 15 | def infovqa_doc_to_text(doc, model_specific_prompt_kwargs): 16 | question = doc["question"] 17 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 18 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 19 | return f"{pre_prompt}{question}{post_prompt}" 20 | 21 | 22 | def infovqa_test_process_results(doc, results): 23 | pred = results[0] 24 | questionId = doc["questionId"] 25 | return {"submission": {"questionId": int(questionId), "answer": pred}} 26 | 27 | 28 | def infovqa_test_aggregate_results(results, args): 29 | # save results as json 30 | file = generate_submission_file("infovqa_test_for_submission.json", args) 31 | with open(file, "w") as f: 32 | json.dump(results, f) 33 | lmms_logger.info(f"Results saved to {file}") 34 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/pope/pope.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/POPE 2 | dataset_kwargs: 3 | token: True 4 | task: "pope" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.pope_doc_to_visual 8 | doc_to_text: !function utils.pope_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | max_new_tokens: 128 12 | temperature: 0 13 | top_p: 0 14 | num_beams: 1 15 | do_sample: false 16 | process_results: !function utils.pope_process_results 17 | metric_list: 18 | - metric: pope_accuracy 19 | aggregation: !function utils.pope_aggregate_accuracy 20 | higher_is_better: true 21 | - metric: pope_precision 22 | aggregation: !function utils.pope_aggregate_precision 23 | higher_is_better: true 24 | - metric: pope_recall 25 | aggregation: !function utils.pope_aggregate_recall 26 | higher_is_better: true 27 | - metric: pope_f1_score 28 | aggregation: !function utils.pope_aggregate_f1_score 29 | higher_is_better: true 30 | - metric: pope_yes_ratio 31 | aggregation: !function utils.pope_aggregate_yes_ratio 32 | higher_is_better: true 33 | metadata: 34 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/docvqa/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import logging 4 | 5 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file 6 | 7 | logger = logging.getLogger("lmms-eval") 8 | 9 | 10 | def docvqa_doc_to_visual(doc): 11 | return [doc["image"].convert("RGB")] 12 | 13 | 14 | def docvqa_doc_to_text(doc, model_specific_prompt_kwargs): 15 | question = doc["question"] 16 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 17 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 18 | return f"{pre_prompt}{question}{post_prompt}" 19 | 20 | 21 | def 
docvqa_test_process_results(doc, results): 22 | pred = results[0] 23 | questionId = doc["questionId"] 24 | return {"anls": {"questionId": int(questionId), "answer": pred}, "submission": {"questionId": int(questionId), "answer": pred}} 25 | 26 | 27 | def docvqa_test_aggregate_results(results, args): 28 | # save results as json 29 | path = generate_submission_file("docvqa_test_for_submission.json", args) 30 | with open(path, "w") as f: 31 | json.dump(results, f) 32 | logger.info(f"Results saved to {path}") 33 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/scienceqa/scienceqa_img.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ScienceQA 2 | dataset_name: ScienceQA-IMG 3 | task: "scienceqa_img" 4 | dataset_kwargs: 5 | token: True 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.sqa_doc_to_visual 9 | doc_to_text: !function utils.sqa_doc_to_text 10 | doc_to_target: !function utils.sqa_doc_to_target 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | temperature: 0 14 | do_sample: False 15 | metric_list: 16 | - metric: exact_match 17 | aggregation: mean 18 | higher_is_better: true 19 | ignore_case: true 20 | ignore_punctuation: true 21 | process_results: !function utils.sqa_process_results 22 | metadata: 23 | - version: 0.0 24 | 25 | model_specific_prompt_kwargs: 26 | default: 27 | format: default 28 | pre_prompt: "" 29 | post_prompt: "\nAnswer with the option's letter from the given choices directly." 30 | qwen_vl: 31 | format: qwen_vl 32 | idefics2: 33 | format: default 34 | pre_prompt: "" 35 | post_prompt: "\nAnswer:" 36 | model_specific_generation_kwargs: 37 | llava: 38 | image_aspect_ratio: original 39 | 40 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini" 6 | test_split: testmini 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/pope/pope_adv.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/POPE 2 | dataset_name: Full 3 | dataset_kwargs: 4 | token: True 5 | task: "pope_adv" 6 | test_split: adversarial 7 | output_type: generate_until 8 | doc_to_visual: !function utils.pope_doc_to_visual 9 | 
doc_to_text: !function utils.pope_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 128 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.pope_process_results 18 | metric_list: 19 | - metric: pope_accuracy 20 | aggregation: !function utils.pope_aggregate_accuracy 21 | higher_is_better: true 22 | - metric: pope_precision 23 | aggregation: !function utils.pope_aggregate_precision 24 | higher_is_better: true 25 | - metric: pope_recall 26 | aggregation: !function utils.pope_aggregate_recall 27 | higher_is_better: true 28 | - metric: pope_f1_score 29 | aggregation: !function utils.pope_aggregate_f1_score 30 | higher_is_better: true 31 | - metric: pope_yes_ratio 32 | aggregation: !function utils.pope_aggregate_yes_ratio 33 | higher_is_better: true 34 | metadata: 35 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/pope/pope_pop.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/POPE 2 | dataset_name: Full 3 | dataset_kwargs: 4 | token: True 5 | task: "pope_pop" 6 | test_split: popular 7 | output_type: generate_until 8 | doc_to_visual: !function utils.pope_doc_to_visual 9 | doc_to_text: !function utils.pope_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 128 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.pope_process_results 18 | metric_list: 19 | - metric: pope_accuracy 20 | aggregation: !function utils.pope_aggregate_accuracy 21 | higher_is_better: true 22 | - metric: pope_precision 23 | aggregation: !function utils.pope_aggregate_precision 24 | higher_is_better: true 25 | - metric: pope_recall 26 | aggregation: !function utils.pope_aggregate_recall 27 | higher_is_better: true 28 | - metric: pope_f1_score 29 | aggregation: !function utils.pope_aggregate_f1_score 30 | higher_is_better: true 31 | - metric: pope_yes_ratio 32 | aggregation: !function utils.pope_aggregate_yes_ratio 33 | higher_is_better: true 34 | metadata: 35 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/pope/pope_random.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/POPE 2 | dataset_name: Full 3 | dataset_kwargs: 4 | token: True 5 | task: "pope_random" 6 | test_split: random 7 | output_type: generate_until 8 | doc_to_visual: !function utils.pope_doc_to_visual 9 | doc_to_text: !function utils.pope_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 128 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.pope_process_results 18 | metric_list: 19 | - metric: pope_accuracy 20 | aggregation: !function utils.pope_aggregate_accuracy 21 | higher_is_better: true 22 | - metric: pope_precision 23 | aggregation: !function utils.pope_aggregate_precision 24 | higher_is_better: true 25 | - metric: pope_recall 26 | aggregation: !function utils.pope_aggregate_recall 27 | higher_is_better: true 28 | - metric: pope_f1_score 29 | aggregation: !function utils.pope_aggregate_f1_score 30 | higher_is_better: true 31 | - metric: pope_yes_ratio 32 | aggregation: !function utils.pope_aggregate_yes_ratio 33 | higher_is_better: true 34 | metadata: 35 | - version: 0.0 
-------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/realworldqa/realworldqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RealWorldQA 2 | dataset_kwargs: 3 | token: True 4 | task: "realworldqa" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.realworldqa_doc_to_visual 8 | doc_to_text: !function utils.realworldqa_doc_to_text 9 | doc_to_target: "answer" 10 | 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | 18 | filter_list: 19 | - name: "flexible-extract" 20 | filter: 21 | - function: !function utils.NumberWordsToDigitsFilter 22 | - function: !function utils.MultiChoiceRegexFilter 23 | group_select: 0 24 | ignore_case: true 25 | ignore_punctuation: true 26 | regex_pattern: "(\\([A-Z]\\))" 27 | 28 | metric_list: 29 | - metric: exact_match 30 | aggregation: mean 31 | higher_is_better: true 32 | ignore_case: true 33 | ignore_punctuation: true 34 | 35 | model_specific_prompt_kwargs: 36 | default: 37 | pre_prompt: "" 38 | post_prompt: "" 39 | gpt4v: 40 | pre_prompt: "" 41 | post_prompt: "" 42 | metadata: 43 | - version: 0.0 44 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_text_lite.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_version_split 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_text_lite" 6 | test_split: text_lite 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_text_only.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_text_only 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_text_only" 6 | test_split: text_only 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 
| higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_vision_only.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_version_split 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_vision_only" 6 | test_split: vision_only 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_text_dominant.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_version_split 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_text_dominant" 6 | test_split: text_dominant 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_vision_dominant.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_version_split 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_vision_dominant" 6 | test_split: vision_dominant 
7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_vision_intensive.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_version_split 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_vision_intensive" 6 | test_split: vision_intensive 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/_default_template.yaml: -------------------------------------------------------------------------------- 1 | test_split: train 2 | output_type: generate_until 3 | doc_to_visual: !function utils.llava_doc_to_visual 4 | doc_to_text: !function utils.llava_doc_to_text 5 | doc_to_target: "gpt_answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | image_aspect_ratio: original 10 | max_new_tokens: 1024 11 | temperature: 0 12 | top_p: 0 13 | num_beams: 1 14 | do_sample: false 15 | process_results: !function utils.llava_process_results 16 | metric_list: 17 | - metric: gpt_eval_llava_all 18 | aggregation: !function utils.llava_all_aggregation 19 | higher_is_better: true 20 | - metric: gpt_eval_llava_conv 21 | aggregation: !function utils.llava_conv_aggregation 22 | higher_is_better: true 23 | - metric: gpt_eval_llava_detail 24 | aggregation: !function utils.llava_detail_aggregation 25 | higher_is_better: true 26 | - metric: gpt_eval_llava_complex 27 | aggregation: !function utils.llava_complex_aggregation 28 | higher_is_better: true 29 | metadata: 30 | version: 0.0 31 | gpt_eval_model_name: "gpt-4-0613" 
32 | model_specific_prompt_kwargs: 33 | default: 34 | pre_prompt: "" 35 | post_prompt: "" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mme/mme.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MME 2 | dataset_kwargs: 3 | token: True 4 | task: "mme" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.mme_doc_to_visual 8 | doc_to_text: !function utils.mme_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | max_new_tokens: 16 12 | temperature: 0 13 | top_p: 0 14 | num_beams: 1 15 | do_sample: false 16 | # The return value of process_results will be used by metrics 17 | process_results: !function utils.mme_process_results 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: mme_percetion_score 21 | aggregation: !function utils.mme_aggregate_results 22 | higher_is_better: true 23 | - metric: mme_cognition_score 24 | aggregation: !function utils.mme_aggregate_results 25 | higher_is_better: true 26 | model_specific_prompt_kwargs: 27 | default: 28 | pre_prompt: "" 29 | post_prompt: "\nAnswer the question using a single word or phrase." 30 | qwen_vl: 31 | pre_prompt: "" 32 | post_prompt: " Answer:" 33 | otterhd: 34 | pre_prompt: "" 35 | post_prompt: " Answer:" 36 | metadata: 37 | - version: 0.0 38 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/llava-bench-coco/llava-bench-coco.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/llava-bench-coco 2 | dataset_kwargs: 3 | token: True 4 | task: "llava_bench_coco" 5 | test_split: train 6 | output_type: generate_until 7 | doc_to_visual: !function utils.llava_doc_to_visual 8 | doc_to_text: !function utils.llava_doc_to_text 9 | doc_to_target: "gpt_answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | image_aspect_ratio: original 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | process_results: !function utils.llava_process_results 19 | metric_list: 20 | - metric: gpt_eval_llava_all 21 | aggregation: !function utils.llava_all_aggregation 22 | higher_is_better: true 23 | - metric: gpt_eval_llava_conv 24 | aggregation: !function utils.llava_conv_aggregation 25 | higher_is_better: true 26 | - metric: gpt_eval_llava_detail 27 | aggregation: !function utils.llava_detail_aggregation 28 | higher_is_better: true 29 | - metric: gpt_eval_llava_complex 30 | aggregation: !function utils.llava_complex_aggregation 31 | higher_is_better: true 32 | metadata: 33 | version: 0.0 34 | gpt_eval_model_name: "gpt-4-0314" 35 | model_specific_prompt_kwargs: 36 | default: 37 | pre_prompt: "" 38 | post_prompt: "" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ferret/ferret.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/Ferret-Bench 2 | dataset_kwargs: 3 | token: True 4 | task: "ferret" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.ferret_doc_to_visual 8 | doc_to_text: !function utils.ferret_doc_to_text 9 | doc_to_target: "gpt_answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | image_aspect_ratio: original 14 | max_new_tokens: 1024 15 
| temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.ferret_process_results 20 | metric_list: 21 | - metric: gpt_eval_ferret_all 22 | aggregation: !function utils.ferret_all_aggregation 23 | higher_is_better: true 24 | - metric: gpt_eval_ferret_refer_desc 25 | aggregation: !function utils.ferret_refer_desc_aggregation 26 | higher_is_better: true 27 | - metric: gpt_eval_ferret_refer_reason 28 | aggregation: !function utils.ferret_refer_reason_aggregation 29 | higher_is_better: true 30 | - metric: gpt_eval_ferret_ground_conv 31 | aggregation: !function utils.ferret_ground_conv_aggregation 32 | higher_is_better: true 33 | metadata: 34 | version: 0.0 35 | gpt_eval_model_name: "gpt-4-0314" 36 | model_specific_prompt_kwargs: 37 | default: 38 | pre_prompt: "" 39 | post_prompt: "" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/llava-in-the-wild/llava-in-the-wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/llava-bench-in-the-wild 2 | dataset_kwargs: 3 | token: True 4 | task: "llava_in_the_wild" 5 | test_split: train 6 | output_type: generate_until 7 | doc_to_visual: !function utils.llava_doc_to_visual 8 | doc_to_text: !function utils.llava_doc_to_text 9 | doc_to_target: "gpt_answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | image_aspect_ratio: original 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.llava_process_results 20 | metric_list: 21 | - metric: gpt_eval_llava_all 22 | aggregation: !function utils.llava_all_aggregation 23 | higher_is_better: true 24 | - metric: gpt_eval_llava_conv 25 | aggregation: !function utils.llava_conv_aggregation 26 | higher_is_better: true 27 | - metric: gpt_eval_llava_detail 28 | aggregation: !function utils.llava_detail_aggregation 29 | higher_is_better: true 30 | - metric: gpt_eval_llava_complex 31 | aggregation: !function utils.llava_complex_aggregation 32 | higher_is_better: true 33 | metadata: 34 | version: 0.0 35 | gpt_eval_model_name: "magma" 36 | model_specific_prompt_kwargs: 37 | default: 38 | pre_prompt: "" 39 | post_prompt: "" 40 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/_default_template_bbox_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCO 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_bbox_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | 
aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/_default_template_seg_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCO 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_seg_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/_default_template_seg_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOg 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_seg_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/_default_template_bbox_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOplus 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_bbox_doc_to_visual 4 | doc_to_text: !function 
utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/_default_template_seg_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOplus 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_seg_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/_default_template_bbox_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOg 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_bbox_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | 
higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /ola_vlm/eval/mmstar/smp/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger_initialized = {} 4 | 5 | def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'): 6 | logger = logging.getLogger(name) 7 | if name in logger_initialized: 8 | return logger 9 | 10 | for logger_name in logger_initialized: 11 | if name.startswith(logger_name): 12 | return logger 13 | 14 | stream_handler = logging.StreamHandler() 15 | handlers = [stream_handler] 16 | 17 | try: 18 | import torch.distributed as dist 19 | if dist.is_available() and dist.is_initialized(): 20 | rank = dist.get_rank() 21 | else: 22 | rank = 0 23 | except ImportError: 24 | rank = 0 25 | 26 | if rank == 0 and log_file is not None: 27 | file_handler = logging.FileHandler(log_file, file_mode) 28 | handlers.append(file_handler) 29 | 30 | formatter = logging.Formatter( 31 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s') 32 | for handler in handlers: 33 | handler.setFormatter(formatter) 34 | handler.setLevel(log_level) 35 | logger.addHandler(handler) 36 | 37 | if rank == 0: 38 | logger.setLevel(log_level) 39 | else: 40 | logger.setLevel(logging.ERROR) 41 | 42 | logger_initialized[name] = True 43 | return logger -------------------------------------------------------------------------------- /ola_vlm/model/aux_heads/depth_anything_v2/dinov2_layers/drop_path.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py 10 | 11 | 12 | from torch import nn 13 | 14 | 15 | def drop_path(x, drop_prob: float = 0.0, training: bool = False): 16 | if drop_prob == 0.0 or not training: 17 | return x 18 | keep_prob = 1 - drop_prob 19 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 20 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 21 | if keep_prob > 0.0: 22 | random_tensor.div_(keep_prob) 23 | output = x * random_tensor 24 | return output 25 | 26 | 27 | class DropPath(nn.Module): 28 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" 29 | 30 | def __init__(self, drop_prob=None): 31 | super(DropPath, self).__init__() 32 | self.drop_prob = drop_prob 33 | 34 | def forward(self, x): 35 | return drop_path(x, self.drop_prob, self.training) 36 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/screenspot/_default_template_rec_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: rootsautomation/ScreenSpot 2 | output_type: generate_until 3 | doc_to_visual: !function utils_rec.screenspot_rec_doc_to_visual 4 | doc_to_text: !function utils_rec.screenspot_rec_doc_to_text 5 | doc_to_target: "bbox" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils_rec.screenspot_rec_process_result 10 | metric_list: 11 | - metric: screenspot_IoU 12 | aggregation : !function utils_rec.screenspot_rec_iou 13 | higher_is_better : true 14 | - metric: screenspot_ACC@0.1 15 | aggregation : !function utils_rec.screenspot_rec_acc01 16 | higher_is_better : true 17 | - metric: screenspot_ACC@0.3 18 | aggregation : !function utils_rec.screenspot_rec_acc03 19 | higher_is_better : true 20 | - metric: screenspot_ACC@0.5 21 | aggregation : !function utils_rec.screenspot_rec_acc05 22 | higher_is_better : true 23 | - metric: screenspot_ACC@0.7 24 | aggregation : !function utils_rec.screenspot_rec_acc07 25 | higher_is_better : true 26 | - metric: screenspot_ACC@0.9 27 | aggregation : !function utils_rec.screenspot_rec_acc09 28 | higher_is_better : true 29 | - metric: screenspot_Center_ACC 30 | aggregation : !function utils_rec.screenspot_rec_center_acc 31 | higher_is_better : true 32 | metadata: 33 | version: '0.0' -------------------------------------------------------------------------------- /docs/Evaluation.md: -------------------------------------------------------------------------------- 1 | ## Evaluation 2 | 3 | We evaluate our models on the CV-Bench, MMStar, RealWorldQA, and OK-VQA benchmarks. 4 | 5 | ```bash 6 | # install evaluation specific dependencies 7 | pip install -e .["eval"] 8 | pip install -e lmms-eval/ 9 | ``` 10 | 11 | ### CV-Bench 12 | 13 | ```bash 14 | # prepare benchmark 15 | git lfs install 16 | cd datasets/eval && git clone https://huggingface.co/datasets/nyu-visionx/CV-Bench && cd ../.. 17 | 18 | # run eval on 4 GPUs 19 | CUDA_VISIBLE_DEVICES=0,1,2,3 bash scripts/eval/cv_bench.sh shi-labs/OLA-VLM-CLIP-ViT-Llama3-8b ola_vlm_clip_llama3 llava_llama_3 20 | ``` 21 | 22 | ### MMStar 23 | 24 | ```bash 25 | # prepare benchmark 26 | git lfs install 27 | cd datasets/eval && git clone https://huggingface.co/datasets/Lin-Chen/MMStar && cd ../..
28 | 29 | # run eval on 4 GPUs 30 | CUDA_VISIBLE_DEVICES=0,1,2,3 bash scripts/eval/mmstar.sh shi-labs/OLA-VLM-CLIP-ViT-Llama3-8b ola_vlm_clip_llama3 llava_llama_3 31 | ``` 32 | 33 | ### RealWorldQA (RWQA) and OK-VQA 34 | 35 | ```bash 36 | # run on 4 GPUs 37 | accelerate launch --num_processes=4 -m lmms_eval --model llava --model_args pretrained=shi-labs/OLA-VLM-CLIP-ViT-Llama3-8b,conv_template=llava_llama_3,attn_implementation="eager",device_map="" --tasks realworldqa,ok_vqa --batch_size 1 --log_samples --log_samples_suffix ola_vlm_clip_llama3 --output_path datasets/eval/results/ola_vlm_clip_llama3 38 | ``` 39 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ai2d/utils.py: -------------------------------------------------------------------------------- 1 | def ai2d_doc_to_text(doc, model_specific_prompt_kwargs=None): 2 | question, choices = doc["question"], doc["options"] 3 | len_choices = len(choices) 4 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 5 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 6 | if model_specific_prompt_kwargs["prompt_format"] == "mcq": 7 | options = [chr(ord("A") + i) for i in range(len_choices)] 8 | choices_str = "\n".join([f"{option}. {choice}" for option, choice in zip(options, choices)]) 9 | return f"{pre_prompt}{question}\n{choices_str}{post_prompt}" 10 | elif model_specific_prompt_kwargs["prompt_format"] == "qa": 11 | options = "\n".join(choices) 12 | return f"{pre_prompt}{question}{options}{post_prompt}" 13 | else: 14 | raise ValueError(f"Unknown prompt format: {model_specific_prompt_kwargs['prompt_format']}") 15 | 16 | 17 | def ai2d_doc_to_visual(doc): 18 | return [doc["image"].convert("RGB")] 19 | 20 | 21 | def ai2d_doc_to_target(doc, model_specific_target_kwargs): 22 | if model_specific_target_kwargs == "mcq": 23 | len_choices = len(doc["options"]) 24 | options = [chr(ord("A") + i) for i in range(len_choices)] 25 | return options[int(doc["answer"])] 26 | elif model_specific_target_kwargs == "qa": 27 | return doc["options"][int(doc["answer"])] 28 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/_default_template_bbox_rec_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCO 2 | output_type: generate_until 3 | process_docs: !function utils_rec.refcoco_bbox_rec_preprocess_dataset 4 | doc_to_visual: !function utils_rec.refcoco_bbox_rec_doc_to_visual 5 | doc_to_text: !function utils_rec.refcoco_bbox_rec_doc_to_text 6 | doc_to_target: "bbox" 7 | generation_kwargs: 8 | until: 9 | - "ASSISTANT:" 10 | process_results: !function utils_rec.refcoco_bbox_rec_process_result 11 | metric_list: 12 | - metric: refcoco_IoU 13 | aggregation : !function utils_rec.refcoco_bbox_rec_iou 14 | higher_is_better : true 15 | - metric: refcoco_ACC@0.1 16 | aggregation : !function utils_rec.refcoco_bbox_rec_acc01 17 | higher_is_better : true 18 | - metric: refcoco_ACC@0.3 19 | aggregation : !function utils_rec.refcoco_bbox_rec_acc03 20 | higher_is_better : true 21 | - metric: refcoco_ACC@0.5 22 | aggregation : !function utils_rec.refcoco_bbox_rec_acc05 23 | higher_is_better : true 24 | - metric: refcoco_ACC@0.7 25 | aggregation : !function utils_rec.refcoco_bbox_rec_acc07 26 | higher_is_better : true 27 | - metric: refcoco_ACC@0.9 28 | aggregation : !function utils_rec.refcoco_bbox_rec_acc09 29 | higher_is_better : true 30 | - metric: refcoco_Center_ACC 31 |
aggregation : !function utils_rec.refcoco_bbox_rec_center_acc 32 | higher_is_better : true 33 | metadata: 34 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/_default_template_bbox_rec_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOg 2 | output_type: generate_until 3 | process_docs: !function utils_rec.refcoco_bbox_rec_preprocess_dataset 4 | doc_to_visual: !function utils_rec.refcoco_bbox_rec_doc_to_visual 5 | doc_to_text: !function utils_rec.refcoco_bbox_rec_doc_to_text 6 | doc_to_target: "bbox" 7 | generation_kwargs: 8 | until: 9 | - "ASSISTANT:" 10 | process_results: !function utils_rec.refcoco_bbox_rec_process_result 11 | metric_list: 12 | - metric: refcoco_IoU 13 | aggregation : !function utils_rec.refcoco_bbox_rec_iou 14 | higher_is_better : true 15 | - metric: refcoco_ACC@0.1 16 | aggregation : !function utils_rec.refcoco_bbox_rec_acc01 17 | higher_is_better : true 18 | - metric: refcoco_ACC@0.3 19 | aggregation : !function utils_rec.refcoco_bbox_rec_acc03 20 | higher_is_better : true 21 | - metric: refcoco_ACC@0.5 22 | aggregation : !function utils_rec.refcoco_bbox_rec_acc05 23 | higher_is_better : true 24 | - metric: refcoco_ACC@0.7 25 | aggregation : !function utils_rec.refcoco_bbox_rec_acc07 26 | higher_is_better : true 27 | - metric: refcoco_ACC@0.9 28 | aggregation : !function utils_rec.refcoco_bbox_rec_acc09 29 | higher_is_better : true 30 | - metric: refcoco_Center_ACC 31 | aggregation : !function utils_rec.refcoco_bbox_rec_center_acc 32 | higher_is_better : true 33 | metadata: 34 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/_default_template_bbox_rec_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOPlus 2 | output_type: generate_until 3 | process_docs: !function utils_rec.refcoco_bbox_rec_preprocess_dataset 4 | doc_to_visual: !function utils_rec.refcoco_bbox_rec_doc_to_visual 5 | doc_to_text: !function utils_rec.refcoco_bbox_rec_doc_to_text 6 | doc_to_target: "bbox" 7 | generation_kwargs: 8 | until: 9 | - "ASSISTANT:" 10 | process_results: !function utils_rec.refcoco_bbox_rec_process_result 11 | metric_list: 12 | - metric: refcoco_IoU 13 | aggregation : !function utils_rec.refcoco_bbox_rec_iou 14 | higher_is_better : true 15 | - metric: refcoco_ACC@0.1 16 | aggregation : !function utils_rec.refcoco_bbox_rec_acc01 17 | higher_is_better : true 18 | - metric: refcoco_ACC@0.3 19 | aggregation : !function utils_rec.refcoco_bbox_rec_acc03 20 | higher_is_better : true 21 | - metric: refcoco_ACC@0.5 22 | aggregation : !function utils_rec.refcoco_bbox_rec_acc05 23 | higher_is_better : true 24 | - metric: refcoco_ACC@0.7 25 | aggregation : !function utils_rec.refcoco_bbox_rec_acc07 26 | higher_is_better : true 27 | - metric: refcoco_ACC@0.9 28 | aggregation : !function utils_rec.refcoco_bbox_rec_acc09 29 | higher_is_better : true 30 | - metric: refcoco_Center_ACC 31 | aggregation : !function utils_rec.refcoco_bbox_rec_center_acc 32 | higher_is_better : true 33 | metadata: 34 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/hallusion_bench/hallusion_bench_image.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/HallusionBench 2 | 
dataset_kwargs: 3 | token: True 4 | task: "hallusion_bench_image" 5 | test_split: image 6 | output_type: generate_until 7 | doc_to_visual: !function evaluate_hb.hb_doc_to_visual 8 | doc_to_text: !function evaluate_hb.hb_doc_to_text 9 | doc_to_target: "gt_answer_details" 10 | process_results: !function evaluate_hb.hb_process_results 11 | model_specific_prompt_kwargs: 12 | default: 13 | pre_prompt: "" 14 | post_prompt: "" 15 | generation_kwargs: 16 | max_new_tokens: 128 17 | temperature: 0 18 | top_p: 0 19 | num_beams: 1 20 | do_sample: false 21 | metric_list: 22 | - metric: aAcc 23 | aggregation: !function evaluate_hb.hb_aggregation_result_aAcc 24 | higher_is_better: true 25 | - metric: qAcc 26 | aggregation: !function evaluate_hb.hb_aggregation_result_qAcc 27 | higher_is_better: true 28 | - metric: fAcc 29 | aggregation: !function evaluate_hb.hb_aggregation_result_fAcc 30 | higher_is_better: true 31 | # - metric: aAcc 32 | # aggregation: !function evaluate_hb.hb_aggregation_result_aAcc_intern 33 | # higher_is_better: true 34 | # - metric: qAcc 35 | # aggregation: !function evaluate_hb.hb_aggregation_result_qAcc_intern 36 | # higher_is_better: true 37 | # - metric: fAcc 38 | # aggregation: !function evaluate_hb.hb_aggregation_result_fAcc_intern 39 | # higher_is_better: true 40 | metadata: 41 | - version: 0.0 42 | -------------------------------------------------------------------------------- /ola_vlm/model/aux_heads/depth_anything_v2/dinov2_layers/mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py 10 | 11 | 12 | from typing import Callable, Optional 13 | 14 | from torch import Tensor, nn 15 | 16 | 17 | class Mlp(nn.Module): 18 | def __init__( 19 | self, 20 | in_features: int, 21 | hidden_features: Optional[int] = None, 22 | out_features: Optional[int] = None, 23 | act_layer: Callable[..., nn.Module] = nn.GELU, 24 | drop: float = 0.0, 25 | bias: bool = True, 26 | ) -> None: 27 | super().__init__() 28 | out_features = out_features or in_features 29 | hidden_features = hidden_features or in_features 30 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) 31 | self.act = act_layer() 32 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) 33 | self.drop = nn.Dropout(drop) 34 | 35 | def forward(self, x: Tensor) -> Tensor: 36 | x = self.fc1(x) 37 | x = self.act(x) 38 | x = self.drop(x) 39 | x = self.fc2(x) 40 | x = self.drop(x) 41 | return x 42 | -------------------------------------------------------------------------------- /ola_vlm/eval/get_probe_dsg_scores.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | 4 | import json 5 | import os 6 | from tqdm import tqdm 7 | from icecream import ic 8 | import warnings 9 | warnings.filterwarnings("ignore") 10 | import random 11 | import numpy as np 12 | 13 | 14 | def set_seed(seed): 15 | random.seed(seed) 16 | np.random.seed(seed) 17 | torch.manual_seed(seed) 18 | torch.cuda.manual_seed_all(seed) 19 | 20 | if __name__ == "__main__": 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument("--ckpt", type=str, default="llava-1.5-7b") 23 | parser.add_argument("--mode", type=str, default="gen") 24 | args = parser.parse_args() 25 | 26 | mode = args.mode 27 | name = args.ckpt.split("/")[-1] 28 | 29 | with open(f'plots/probe_scores/{name}/{args.mode}.json') as file: 30 | scores = json.load(file) 31 | 32 | layer_scores = {} 33 | 34 | for img, v in tqdm(scores.items()): 35 | for layer, score in v.items(): 36 | if layer not in layer_scores: 37 | layer_scores[layer] = [] 38 | layer_scores[layer].append(score) 39 | 40 | for layer, scores in layer_scores.items(): 41 | layer_scores[layer] = np.mean(scores) 42 | 43 | with open(f"plots/probe_scores/{name}/{mode}_scores.json", "w") as f: 44 | json.dump(layer_scores, f, indent=2) 45 | 46 | print(f"================Scores: {mode}===============") 47 | for layer, score in layer_scores.items(): 48 | print(f"Layer: {layer}, Score: {score}") 49 | print("===========================================") -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2017_cap_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/COCO-Caption2017 2 | dataset_kwargs: 3 | token: True 4 | task: "coco2017_cap_val" 5 | group : "coco_caption2017" 6 | test_split: val 7 | output_type: generate_until 8 | doc_to_visual: !function utils.coco_doc_to_visual 9 | doc_to_text: !function utils.coco_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 64 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.coco_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name 
returned by process_results 19 | metric_list: 20 | - metric: coco_Bleu_4 21 | aggregation : !function utils.coco_bleu4 22 | higher_is_better : true 23 | - metric: coco_Bleu_3 24 | aggregation : !function utils.coco_bleu3 25 | higher_is_better : true 26 | - metric: coco_Bleu_2 27 | aggregation : !function utils.coco_bleu2 28 | higher_is_better : true 29 | - metric: coco_Bleu_1 30 | aggregation : !function utils.coco_bleu1 31 | higher_is_better : true 32 | - metric: coco_METEOR 33 | aggregation : !function utils.coco_meteor 34 | higher_is_better : true 35 | - metric: coco_ROUGE_L 36 | aggregation : !function utils.coco_rougel 37 | higher_is_better : true 38 | - metric: coco_CIDEr 39 | aggregation : !function utils.coco_cider 40 | higher_is_better : true 41 | #- metric: coco_SPICE 42 | # aggregation : !function utils.coco_spice 43 | # higher_is_better : true 44 | metadata: 45 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/flickr30k/flickr30k_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/flickr30k 2 | dataset_kwargs: 3 | token: True 4 | task : "flickr30k_test" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.flickr_doc_to_visual 8 | doc_to_text: !function utils.flickr_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | max_new_tokens: 64 12 | temperature: 0 13 | top_p: 0 14 | num_beams: 1 15 | do_sample: false 16 | process_results: !function utils.flickr_process_result 17 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 18 | metric_list: 19 | - metric: flickr_Bleu_4 20 | aggregation : !function utils.flickr_bleu4 21 | higher_is_better : true 22 | - metric: flickr_Bleu_3 23 | aggregation : !function utils.flickr_bleu3 24 | higher_is_better : true 25 | - metric: flickr_Bleu_2 26 | aggregation : !function utils.flickr_bleu2 27 | higher_is_better : true 28 | - metric: flickr_Bleu_1 29 | aggregation : !function utils.flickr_bleu1 30 | higher_is_better : true 31 | - metric: flickr_METEOR 32 | aggregation : !function utils.flickr_meteor 33 | higher_is_better : true 34 | - metric: flickr_ROUGE_L 35 | aggregation : !function utils.flickr_rougel 36 | higher_is_better : true 37 | - metric: flickr_CIDEr 38 | aggregation : !function utils.flickr_cider 39 | higher_is_better : true 40 | #- metric: flickr_SPICE 41 | # aggregation : !function utils.flickr_spice 42 | # higher_is_better : true 43 | metadata: 44 | - version: 0.0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="ola_vlm", 5 | version="1.0.0", 6 | long_description=open("README.md").read(), 7 | long_description_content_type="text/markdown", 8 | classifiers=[ 9 | "Programming Language :: Python :: 3", 10 | "License :: OSI Approved :: Apache Software License", 11 | ], 12 | python_requires=">=3.8", 13 | install_requires=[ 14 | "torch==2.2.0", "torchvision==0.17.0", 15 | "tokenizers==0.19.1", "sentencepiece==0.1.99", "shortuuid", 16 | "peft", "bitsandbytes", "open_clip_torch", "diffdist", 17 | "pydantic", "markdown2[all]", "numpy==1.26.2", 18 | "gradio==4.16.0", "gradio_client==0.8.1", "huggingface_hub", 19 | "requests", "httpx==0.24.0", "uvicorn", "fastapi", 20 | 
"einops==0.6.1", "einops-exts==0.0.4", "timm==1.0.8", 21 | "diffusers===0.27.2", "protobuf", "accelerate==0.27.2" 22 | ], 23 | extras_require={ 24 | "train": ["deepspeed==0.12.6", "ninja", "wandb", "huggingface-hub==0.24.2", "peft==0.12.0"], 25 | "eval": ["seaborn", "sty", "tabulate", "spacy", "word2number", "inflect"], 26 | "demo": ["pydantic==2.8.2", "pydantic-core==2.20.1", "fastapi==0.111.0"], 27 | "build": ["build", "twine"] 28 | }, 29 | url="https://praeclarumjj3.github.io/ola_vlm", 30 | project_urls={ 31 | "Bug Tracker": "https://github.com/SHI-Labs/VisPer-LM/issues" 32 | }, 33 | packages=find_packages(exclude=["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]), 34 | include_package_data=True, 35 | ) 36 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2014_cap_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/COCO-Caption 2 | dataset_kwargs: 3 | token: True 4 | task: "coco2014_cap_val" 5 | group : "coco_caption" 6 | test_split: val 7 | output_type: generate_until 8 | doc_to_visual: !function utils.coco_doc_to_visual 9 | doc_to_text: "Provide a one-sentence caption for the provided image." 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 64 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.coco_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: coco_Bleu_4 21 | aggregation : !function utils.coco_bleu4 22 | higher_is_better : true 23 | - metric: coco_Bleu_3 24 | aggregation : !function utils.coco_bleu3 25 | higher_is_better : true 26 | - metric: coco_Bleu_2 27 | aggregation : !function utils.coco_bleu2 28 | higher_is_better : true 29 | - metric: coco_Bleu_1 30 | aggregation : !function utils.coco_bleu1 31 | higher_is_better : true 32 | - metric: coco_METEOR 33 | aggregation : !function utils.coco_meteor 34 | higher_is_better : true 35 | - metric: coco_ROUGE_L 36 | aggregation : !function utils.coco_rougel 37 | higher_is_better : true 38 | - metric: coco_CIDEr 39 | aggregation : !function utils.coco_cider 40 | higher_is_better : true 41 | #- metric: coco_SPICE 42 | # aggregation : !function utils.coco_spice 43 | # higher_is_better : true 44 | metadata: 45 | - version: 0.0 -------------------------------------------------------------------------------- /scripts/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 
43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "gather_16bit_weights_on_model_save": true 49 | }, 50 | "gradient_accumulation_steps": "auto", 51 | "gradient_clipping": "auto", 52 | "train_batch_size": "auto", 53 | "train_micro_batch_size_per_gpu": "auto", 54 | "steps_per_print": 1e5, 55 | "wall_clock_breakdown": false 56 | } -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/nocaps/nocaps_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/NoCaps 2 | dataset_kwargs: 3 | token: True 4 | task: "nocaps_val" 5 | group : "nocaps_caption" 6 | test_split: validation 7 | output_type: generate_until 8 | doc_to_visual: !function utils.nocaps_doc_to_visual 9 | doc_to_text: !function utils.nocaps_doc_to_text 10 | doc_to_target: "annotations_captions" 11 | generation_kwargs: 12 | max_new_tokens: 64 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.nocaps_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: nocaps_Bleu_4 21 | aggregation : !function utils.nocaps_bleu4 22 | higher_is_better : true 23 | - metric: nocaps_Bleu_3 24 | aggregation : !function utils.nocaps_bleu3 25 | higher_is_better : true 26 | - metric: nocaps_Bleu_2 27 | aggregation : !function utils.nocaps_bleu2 28 | higher_is_better : true 29 | - metric: nocaps_Bleu_1 30 | aggregation : !function utils.nocaps_bleu1 31 | higher_is_better : true 32 | - metric: nocaps_METEOR 33 | aggregation : !function utils.nocaps_meteor 34 | higher_is_better : true 35 | - metric: nocaps_ROUGE_L 36 | aggregation : !function utils.nocaps_rougel 37 | higher_is_better : true 38 | - metric: nocaps_CIDEr 39 | aggregation : !function utils.nocaps_cider 40 | higher_is_better : true 41 | #- metric: nocaps_SPICE 42 | # aggregation : !function utils.nocaps_spice 43 | # higher_is_better : true 44 | metadata: 45 | - version: 0.0 46 | include: _default_template_nocaps_yaml -------------------------------------------------------------------------------- /scripts/probe/probe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export WANDB_PROJECT= "VisPer-LM" 4 | export WANDB_NAME="probe_depth_ola-vlm-pt-ift" 5 | 6 | # 8 GPUs 7 | deepspeed ola_vlm/train/probe_dsg_train_mem.py \ 8 | --deepspeed ./scripts/zero2.json \ 9 | --mode $1 \ 10 | --model_name_or_path shi-labs/OLA-VLM-CLIP-ConvNeXT-Llama3-8b \ 11 | --image_generator stabilityai/stable-diffusion-2-1-unclip \ 12 | --image_segmentor shi-labs/oneformer_coco_swin_large \ 13 | --depth_estimator depth_anything_v2_vitl.pth \ 14 | --version llava_llama_3 \ 15 | --data_path /mnt/vlpdatasets/sherlock/coco/annotations/captions_train2017.json \ 16 | --image_folder datasets/coco \ 17 | --vision_tower laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup-res768 \ 18 | --mm_projector_type mlp2x_gelu \ 19 | --mm_vision_select_layer -2 \ 20 | --mm_use_im_start_end False \ 21 | --mm_use_im_patch_token False \ 22 | --image_aspect_ratio pad \ 23 | --group_by_modality_length True \ 24 | --bf16 True \ 25 | --tf32 True \ 26 | --output_dir outputs/probe_${1}_ola-vlm-pt-ift \ 27 | --num_train_epochs 
1 \ 28 | --per_device_train_batch_size 32 \ 29 | --per_device_eval_batch_size 4 \ 30 | --gradient_accumulation_steps 1 \ 31 | --evaluation_strategy "no" \ 32 | --save_strategy "steps" \ 33 | --save_steps 200 \ 34 | --save_total_limit 3 \ 35 | --learning_rate 1e-3 \ 36 | --weight_decay 0. \ 37 | --warmup_ratio 0.03 \ 38 | --lr_scheduler_type "cosine" \ 39 | --logging_steps 1 \ 40 | --model_max_length 4096 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to wandb -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textcaps/textcaps_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/TextCaps 2 | dataset_kwargs: 3 | token: True 4 | task: "textcaps_val" 5 | group : "textcaps_caption" 6 | test_split: val 7 | output_type: generate_until 8 | doc_to_visual: !function utils.textcaps_doc_to_visual 9 | doc_to_text: !function utils.textcaps_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 64 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.textcaps_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: textcaps_Bleu_4 21 | aggregation : !function utils.textcaps_bleu4 22 | higher_is_better : true 23 | - metric: textcaps_Bleu_3 24 | aggregation : !function utils.textcaps_bleu3 25 | higher_is_better : true 26 | - metric: textcaps_Bleu_2 27 | aggregation : !function utils.textcaps_bleu2 28 | higher_is_better : true 29 | - metric: textcaps_Bleu_1 30 | aggregation : !function utils.textcaps_bleu1 31 | higher_is_better : true 32 | - metric: textcaps_METEOR 33 | aggregation : !function utils.textcaps_meteor 34 | higher_is_better : true 35 | - metric: textcaps_ROUGE_L 36 | aggregation : !function utils.textcaps_rougel 37 | higher_is_better : true 38 | - metric: textcaps_CIDEr 39 | aggregation : !function utils.textcaps_cider 40 | higher_is_better : true 41 | #- metric: textcaps_SPICE 42 | # aggregation : !function utils.textcaps_spice 43 | # higher_is_better : true 44 | metadata: 45 | - version: 0.0 46 | include: _default_template_textcaps_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/filters/transformation.py: -------------------------------------------------------------------------------- 1 | from lmms_eval.api.filter import Filter 2 | 3 | 4 | class LowercaseFilter(Filter): 5 | def __init__(self) -> None: 6 | pass 7 | 8 | def apply(self, resps, docs): 9 | def filter_set(inst): 10 | return [resp.lower() for resp in inst] 11 | 12 | return [filter_set(resp) for resp in resps] 13 | 14 | 15 | class UppercaseFilter(Filter): 16 | def __init__(self) -> None: 17 | pass 18 | 19 | def apply(self, resps, docs): 20 | def filter_set(inst): 21 | return [resp.upper() for resp in inst] 22 | 23 | return [filter_set(resp) for resp in resps] 24 | 25 | 26 | class MapFilter(Filter): 27 | def __init__(self, mapping_dict: dict = {}, default_value=None) -> None: 28 | """ 29 | Initializes the MapFilter with a given mapping dictionary and default value. 30 | 31 | Args: 32 | - mapping_dict (dict): A dictionary containing the key-value mappings. 33 | Default is an empty dictionary. 
34 | - default_value (Any): The value to be returned when a key is not found in the mapping_dict. 35 | Default is None. 36 | 37 | Example: 38 | mapper = MapFilter({'A': 1, 'B': 2}, default_value=0) 39 | """ 40 | assert isinstance(mapping_dict, dict), "Provided mapping_dict is not a dictionary" 41 | self.mapping_dict = mapping_dict 42 | self.default_value = default_value 43 | 44 | def apply(self, resps, docs): 45 | def filter_set(inst): 46 | return [self.mapping_dict.get(resp, self.default_value) for resp in inst] 47 | 48 | return [filter_set(resp) for resp in resps] 49 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textcaps/textcaps_train.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/TextCaps 2 | dataset_kwargs: 3 | token: True 4 | task : "textcaps_train" 5 | group : "textcaps_caption" 6 | test_split: train 7 | output_type: generate_until 8 | doc_to_visual: !function utils.textcaps_doc_to_visual 9 | doc_to_text: !function utils.textcaps_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.textcaps_process_result 20 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results 21 | metric_list: 22 | - metric: textcaps_Bleu_4 23 | aggregation : !function utils.textcaps_bleu4 24 | higher_is_better : true 25 | - metric: textcaps_Bleu_3 26 | aggregation : !function utils.textcaps_bleu3 27 | higher_is_better : true 28 | - metric: textcaps_Bleu_2 29 | aggregation : !function utils.textcaps_bleu2 30 | higher_is_better : true 31 | - metric: textcaps_Bleu_1 32 | aggregation : !function utils.textcaps_bleu1 33 | higher_is_better : true 34 | - metric: textcaps_METEOR 35 | aggregation : !function utils.textcaps_meteor 36 | higher_is_better : true 37 | - metric: textcaps_ROUGE_L 38 | aggregation : !function utils.textcaps_rougel 39 | higher_is_better : true 40 | - metric: textcaps_CIDEr 41 | aggregation : !function utils.textcaps_cider 42 | higher_is_better : true 43 | #- metric: textcaps_SPICE 44 | # aggregation : !function utils.textcaps_spice 45 | # higher_is_better : true 46 | metadata: 47 | - version: 0.0 48 | include: _default_template_textcaps_yaml -------------------------------------------------------------------------------- /scripts/train/finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export WANDB_PROJECT="VisPer-LM" 4 | export WANDB_NAME="VisPer-LM-CLIP-ViT-Llama3-8b" 5 | 6 | # Base LLM choices: 7 | # Llama3-8b: meta-llama/Meta-Llama-3-8B-Instruct (llava_llama_3) 8 | # Phi3-4k-mini: microsoft/Phi-3-mini-4k-instruct (llava_phi_3) 9 | 10 | # Base encoder choices: 11 | # CLIP-ViT-L: openai/clip-vit-large-patch14-336 12 | # CLIP-ConvNeXT-XXL: laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup-res768 13 | 14 | # 8 GPUs 15 | deepspeed ola_vlm/train/train_mem.py \ 16 | --deepspeed ./scripts/zero2.json \ 17 | --model_name_or_path outputs/pretrain_dsg_VisPer-LM-CLIP-ViT-Llama3-8b \ 18 | --version llava_llama_3 \ 19 | --data_path datasets/llava_v1_5_mix665k.json \ 20 | --image_folder datasets/ \ 21 | --vision_tower openai/clip-vit-large-patch14-336 \ 22 | --mm_projector_type mlp2x_gelu \ 23 | --mm_vision_select_layer -2 \ 24 |
--mm_use_im_start_end False \ 25 | --mm_use_im_patch_token False \ 26 | --image_aspect_ratio pad \ 27 | --group_by_modality_length True \ 28 | --bf16 True \ 29 | --output_dir outputs/VisPer-LM-CLIP-ViT-Llama3-8b \ 30 | --num_train_epochs 1 \ 31 | --per_device_train_batch_size 16 \ 32 | --per_device_eval_batch_size 4 \ 33 | --gradient_accumulation_steps 1 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 200 \ 37 | --save_total_limit 3 \ 38 | --learning_rate 2e-5 \ 39 | --weight_decay 0. \ 40 | --warmup_ratio 0.03 \ 41 | --lr_scheduler_type "cosine" \ 42 | --logging_steps 1 \ 43 | --tf32 True \ 44 | --model_max_length 4096 \ 45 | --gradient_checkpointing True \ 46 | --dataloader_num_workers 4 \ 47 | --lazy_preprocess True \ 48 | --report_to wandb 49 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/filters/__init__.py: -------------------------------------------------------------------------------- 1 | from lmms_eval.api.filter import FilterEnsemble, Filter 2 | from . import selection 3 | from . import extraction 4 | from . import transformation 5 | 6 | 7 | FILTER_REGISTRY = { 8 | "take_first": selection.TakeFirstFilter, 9 | "regex": extraction.RegexFilter, 10 | "majority_vote": selection.MajorityVoteFilter, 11 | "take_first_k": selection.TakeKFilter, 12 | "remove_whitespace": extraction.WhitespaceFilter, 13 | "lowercase": transformation.LowercaseFilter, 14 | "uppercase": transformation.UppercaseFilter, 15 | "map": transformation.MapFilter, 16 | "multi_choice_regex": extraction.MultiChoiceRegexFilter, 17 | # TODO: implement this filter. either it should take in an arbitrary "scoring"/reward function 18 | # that takes an input and returns a scalar and then should select the max reward, 19 | # or should implement different filters for different ways of handling a reward model's inference. 20 | # "arg_max": selection.ArgMaxFilter, 21 | } 22 | 23 | 24 | def get_filter(filter_name): 25 | if filter_name in FILTER_REGISTRY: 26 | return FILTER_REGISTRY[filter_name] 27 | else: 28 | return filter_name 29 | 30 | 31 | def build_filter_ensemble(filter_name, components): 32 | """ 33 | Create a filtering pipeline. 
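`components` is a list of (filter_name, kwargs) pairs: each name is resolved through FILTER_REGISTRY by get_filter, instantiated with its kwargs (if any), and the resulting filters are applied in order by the returned FilterEnsemble.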
34 | """ 35 | filters = [] 36 | for function, kwargs in components: 37 | if kwargs is None: 38 | f = get_filter(function)() 39 | else: 40 | # create a filter given its name in the registry 41 | f = get_filter(function)(**kwargs) # TODO: pass kwargs to filters properly 42 | # add the filter as a pipeline step 43 | filters.append(f) 44 | 45 | return FilterEnsemble(name=filter_name, filters=filters) 46 | -------------------------------------------------------------------------------- /scripts/train/vpt_ift.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export WANDB_PROJECT= "VisPer-LM" 4 | export WANDB_NAME="vpt_VisPer-LM-CLIP-ViT-Llama3-8b" 5 | 6 | # Base LLM choices: 7 | # Llama3-8b: meta-llama/Meta-Llama-3-8B-Instruct (llava_llama_3) 8 | # Phi3-4k-mini: microsoft/Phi-3-mini-4k-instruct (llava_phi_3) 9 | 10 | # Base encoder choices: 11 | # CLIP-ViT-L: openai/clip-vit-large-patch14-336 12 | # CLIP-ConvNeXT-XXL: laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup-res768 13 | 14 | # 8 GPUs 15 | deepspeed ola_vlm/train/train_mem.py \ 16 | --deepspeed ./scripts/zero2.json \ 17 | --model_name_or_path outputs/v-pretrain_VisPer-LM-CLIP-ViT-Llama3-8b \ 18 | --version llava_llama_3 \ 19 | --data_path datasets/llava_v1_5_mix665k.json \ 20 | --image_folder datasets/ \ 21 | --vision_tower openai/clip-vit-large-patch14-336 \ 22 | --mm_projector_type mlp2x_gelu \ 23 | --mm_vision_select_layer -2 \ 24 | --mm_use_im_start_end False \ 25 | --mm_use_im_patch_token False \ 26 | --image_aspect_ratio pad \ 27 | --group_by_modality_length True \ 28 | --bf16 True \ 29 | --output_dir outputs/vpt_VisPer-LM-CLIP-ViT-Llama3-8b \ 30 | --num_train_epochs 1 \ 31 | --per_device_train_batch_size 16 \ 32 | --per_device_eval_batch_size 4 \ 33 | --gradient_accumulation_steps 1 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 200 \ 37 | --save_total_limit 3 \ 38 | --learning_rate 2e-5 \ 39 | --weight_decay 0. \ 40 | --warmup_ratio 0.03 \ 41 | --lr_scheduler_type "cosine" \ 42 | --logging_steps 1 \ 43 | --tf32 True \ 44 | --model_max_length 4096 \ 45 | --gradient_checkpointing True \ 46 | --dataloader_num_workers 4 \ 47 | --lazy_preprocess True \ 48 | --report_to wandb 49 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/filters/selection.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | from lmms_eval.api.filter import Filter 4 | 5 | 6 | class TakeFirstFilter(Filter): 7 | def __init__(self) -> None: 8 | """ 9 | Can define custom behavior here, if an individual instantiation of a Filter class should have state. 10 | """ 11 | 12 | def apply(self, resps, docs): 13 | """ 14 | Assuming each entry of `resps` is a list of model responses, we discard all but the first response. 15 | """ 16 | return map(lambda r: r[0], resps) 17 | 18 | 19 | class TakeKFilter(Filter): 20 | def __init__(self, *args, **kwargs) -> None: 21 | self.k = kwargs.pop("k") 22 | 23 | super().__init__(*args, **kwargs) 24 | 25 | def apply(self, resps, docs): 26 | # check we have at least k responses per doc, else we can't take the first k 27 | assert len(resps[0]) >= self.k, f"Need at least {self.k} responses per doc to take first {self.k}, but got {len(resps[0])} only! Please increase TaskConfig.repeats ." 
28 | return map(lambda r: r[: self.k], resps) 29 | 30 | 31 | class MajorityVoteFilter(Filter): 32 | def __init__(self) -> None: 33 | """ 34 | Can define custom behavior here, if an individual instantiation of a Filter class should have state. 35 | """ 36 | 37 | def apply(self, resps, docs): 38 | """ 39 | Each entry of `resps` is a list of model responses. 40 | We select the response that occurs most frequently in each entry of `resps`. 41 | """ 42 | 43 | def select_majority(resp): 44 | counts = Counter(resp) 45 | vote = counts.most_common(1)[0][0] 46 | return vote 47 | 48 | return map(lambda r: [select_majority(r)], resps) 49 | -------------------------------------------------------------------------------- /scripts/train/vpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export WANDB_PROJECT="VisPer-LM" 4 | export WANDB_NAME="v-pretrain_VisPer-LM-CLIP-ViT-Llama3-8b" 5 | 6 | # Base LLM choices: 7 | # Llama3-8b: meta-llama/Meta-Llama-3-8B-Instruct (llava_llama_3) 8 | # Phi3-4k-mini: microsoft/Phi-3-mini-4k-instruct (llava_phi_3) 9 | 10 | # Base encoder choices: 11 | # CLIP-ViT-L: openai/clip-vit-large-patch14-336 12 | # CLIP-ConvNeXT-XXL: laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup-res768 13 | 14 | # 8 GPUs 15 | deepspeed ola_vlm/train/train_mem.py \ 16 | --deepspeed ./scripts/zero2.json \ 17 | --model_name_or_path outputs/pretrain_dsg_VisPer-LM-CLIP-ViT-Llama3-8b \ 18 | --version llava_llama_3 \ 19 | --data_path datasets/allava/ALLaVA-Caption.json \ 20 | --image_folder datasets/allava \ 21 | --vision_tower openai/clip-vit-large-patch14-336 \ 22 | --mm_projector_type mlp2x_gelu \ 23 | --mm_vision_select_layer -2 \ 24 | --mm_use_im_start_end False \ 25 | --mm_use_im_patch_token False \ 26 | --image_aspect_ratio pad \ 27 | --group_by_modality_length True \ 28 | --bf16 True \ 29 | --output_dir outputs/v-pretrain_VisPer-LM-CLIP-ViT-Llama3-8b \ 30 | --num_train_epochs 1 \ 31 | --per_device_train_batch_size 16 \ 32 | --per_device_eval_batch_size 4 \ 33 | --gradient_accumulation_steps 1 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 200 \ 37 | --save_total_limit 3 \ 38 | --learning_rate 2e-5 \ 39 | --weight_decay 0. \ 40 | --warmup_ratio 0.03 \ 41 | --lr_scheduler_type "cosine" \ 42 | --logging_steps 1 \ 43 | --tf32 True \ 44 | --model_max_length 4096 \ 45 | --gradient_checkpointing True \ 46 | --dataloader_num_workers 4 \ 47 | --lazy_preprocess True \ 48 | --report_to wandb 49 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/scienceqa/utils.py: -------------------------------------------------------------------------------- 1 | def sqa_doc_to_text(doc, model_specific_prompt_kwargs=None): 2 | context, question, choices = doc["hint"], doc["question"], doc["choices"] 3 | len_choices = len(choices) 4 | options = [chr(ord("A") + i) for i in range(len_choices)] 5 | choices_str = "\n".join([f"{option}.
{choice}" for option, choice in zip(options, choices)]) 6 | if model_specific_prompt_kwargs["format"] == "default": 7 | if context: 8 | context = f"Context: {context}\n" 9 | 10 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 11 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 12 | return f"{pre_prompt}{context}{question}\n{choices_str}{post_prompt}" 13 | elif model_specific_prompt_kwargs["format"] == "qwen_vl": 14 | prompt = "Context: {}\nQuestion: {}\nOptions: {}\nAnswer:" 15 | context = context if context else "N/A" 16 | prompt = prompt.format(context, question, choices_str) 17 | return prompt 18 | else: 19 | raise ValueError(f"Unknown prompt format: {model_specific_prompt_kwargs}") 20 | 21 | 22 | def sqa_doc_to_visual(doc): 23 | if doc["image"] is None: 24 | return [] 25 | return [doc["image"].convert("RGB")] 26 | 27 | 28 | def sqa_doc_to_target(doc): 29 | len_choices = len(doc["choices"]) 30 | options = [chr(ord("A") + i) for i in range(len_choices)] 31 | return options[doc["answer"]] 32 | 33 | 34 | def sqa_process_results(doc, results): 35 | # I know this is weird, but it's how llava parse it. 36 | target = sqa_doc_to_target(doc) 37 | pred = results[0] 38 | if pred == target: 39 | return {"exact_match": 1.0} 40 | # pattern: ^[A-Z]\. .* 41 | if len(pred) >= 2 and pred[0].isupper() and pred[1] == ".": 42 | result = 1.0 if pred[0] == target else 0.0 43 | return {"exact_match": result} 44 | return {"exact_match": 0.0} 45 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/seedbench_2/seedbench_2.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/SEED-Bench-2 2 | dataset_kwargs: 3 | token: True 4 | task: "seedbench-2" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.seed_doc_to_visual 8 | doc_to_text: !function utils.seed_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 16 14 | image_aspect_ratio: original 15 | # The return value of process_results will be used by metrics 16 | process_results: !function utils.seed_process_result 17 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 18 | metric_list: 19 | - metric: seed_Video 20 | aggregation: !function utils.seed_aggregation_result 21 | higher_is_better: true 22 | - metric: seed_Multiple_Images 23 | aggregation: !function utils.seed_aggregation_result 24 | higher_is_better: true 25 | - metric: seed_Image_&_Text_Generation 26 | aggregation: !function utils.seed_aggregation_result 27 | higher_is_better: true 28 | - metric: seed_Single_Image 29 | aggregation: !function utils.seed_aggregation_result 30 | higher_is_better: true 31 | - metric: seed_Image_Generation 32 | aggregation: !function utils.seed_aggregation_result 33 | higher_is_better: true 34 | - metric: seed_Interleaved_Image 35 | aggregation: !function utils.seed_aggregation_result 36 | higher_is_better: true 37 | - metric: seed_all 38 | aggregation: !function utils.seed_aggregation_result 39 | higher_is_better: true 40 | metadata: 41 | - version: 0.0 42 | 43 | model_specific_prompt_kwargs: 44 | llava : 45 | img_token : 46 | post_prompt : "Answer with the option's letter from the given choices directly." 47 | gpt4V : 48 | img_token : 49 | post_prompt : "Answer with the option's letter from the given choices directly." 
-------------------------------------------------------------------------------- /lmms-eval/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 240 3 | 4 | [build-system] 5 | requires = ["setuptools>=42", "wheel", "setuptools_scm[tomli]>=6.3"] 6 | build-backend = "setuptools.build_meta" 7 | 8 | [project] 9 | name = "lmms_eval" 10 | version = "0.1.2" 11 | authors = [ 12 | { name = "LMMMs-Lab Evaluation Team", email = "lmms_eval@outlook.com" }, 13 | ] 14 | description = "A framework for evaluating large multi-modality language models" 15 | readme = "README.md" 16 | classifiers = [ 17 | "Programming Language :: Python :: 3", 18 | "License :: OSI Approved :: MIT License", 19 | "Operating System :: OS Independent", 20 | ] 21 | requires-python = ">=3.8" 22 | license = { text = "MIT" } 23 | dependencies = [ 24 | "accelerate>=0.21.0", 25 | "black==24.1.0", 26 | "datasets==2.16.1", 27 | "evaluate>=0.4.0", 28 | "loguru", 29 | "jsonlines", 30 | "numexpr", 31 | "peft>=0.2.0", 32 | "pybind11>=2.6.2", 33 | "pytablewriter", 34 | "rouge-score>=0.0.4", 35 | "sacrebleu>=1.5.0", 36 | "scikit-learn>=0.24.1", 37 | "sqlitedict", 38 | "torch>=1.8", 39 | "openai>=1.0.0", 40 | "pycocoevalcap", 41 | "tqdm-multiprocess", 42 | "transformers", 43 | "zstandard", 44 | "pillow", 45 | "pyyaml", 46 | "sympy", 47 | "mpmath", 48 | "Jinja2", 49 | "openpyxl", 50 | "Levenshtein", 51 | "hf_transfer", 52 | "tenacity", 53 | "wandb>=0.16.0", 54 | "transformers-stream-generator", 55 | "tiktoken", 56 | "pre-commit", 57 | "pydantic", 58 | ] 59 | 60 | [tool.setuptools.packages.find] 61 | include = ["lmms_eval*"] 62 | 63 | [tool.setuptools.package-data] 64 | lmms_eval = ["**/*.yaml", "tasks/**/*"] 65 | 66 | [project.scripts] 67 | lmms-eval = "lmms_eval.__main__:cli_evaluate" 68 | lmms_eval = "lmms_eval.__main__:cli_evaluate" 69 | 70 | [project.urls] 71 | Homepage = "https://lmms-lab.github.io/lmms-eval-blog/" 72 | Repository = "https://github.com/EvolvingLMMs-Lab/lmms-eval" 73 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/seedbench/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | def seed_doc_to_visual(doc): 5 | return [image.convert("RGB") for image in doc["image"]] 6 | 7 | 8 | def seed_doc_to_text(doc): 9 | question = doc["question"] 10 | question += "\n" + f"A. {doc['choice_a']}\n" 11 | question += f"B. {doc['choice_b']}\n" 12 | question += f"C. {doc['choice_c']}\n" 13 | question += f"D. {doc['choice_d']}" 14 | return f"{question}\nAnswer with the option's letter from the given choices directly." 
15 | 16 | 17 | def seed_process_result(doc, result): 18 | pred = result[0].strip() 19 | if len(pred) > 1: 20 | pred = pred[0] 21 | answer = doc["answer"] 22 | data_type = doc["data_type"] 23 | 24 | return {f"seed_{data_type}": {"pred": pred, "answer": answer, "question_id": doc["question_id"]}, f"seed_all": {"pred": pred, "answer": answer, "question_id": doc["question_id"]}} 25 | 26 | 27 | def seed_aggregation_result(results): 28 | total_count = 0 29 | total_correct = 0 30 | for result in results: 31 | if result["pred"] == result["answer"]: 32 | total_correct += 1 33 | total_count += 1 34 | return total_correct / total_count 35 | 36 | 37 | def seed_aggregation_result_all(results): 38 | score = seed_aggregation_result(results) 39 | stored_results = [] 40 | for result in results: 41 | stored_results.append({"question_id": result["question_id"], "prediction": result["pred"]}) 42 | with open("./seed_submission.json", "w") as f: 43 | json.dump(stored_results, f, indent=4) 44 | print("Storing files for seed_submission ...") 45 | 46 | return score 47 | 48 | 49 | def seed_doc_to_text_mc(doc): 50 | question = doc["question"] 51 | return f"{question} Answer :" 52 | 53 | 54 | def seed_doc_to_choice(doc): 55 | return [doc["choice_a"], doc["choice_b"], doc["choice_c"], doc["choice_d"]] 56 | 57 | 58 | def seed_doc_to_mc_target(doc): 59 | answer2choice = {"A": "choice_a", "B": "choice_b", "C": "choice_c", "D": "choice_d"} 60 | return doc[answer2choice[doc["answer"]]] 61 | -------------------------------------------------------------------------------- /ola_vlm/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava import LlavaLlamaForCausalLM 11 | 12 | 13 | def apply_delta(base_model_path, target_model_path, delta_path): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading delta") 19 | delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 21 | 22 | print("Applying delta") 23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data += base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \ 31 | f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 32 | bparam = base.state_dict()[name] 33 | param.data[:bparam.shape[0], :bparam.shape[1]] += bparam 34 | 35 | print("Saving target model") 36 | delta.save_pretrained(target_model_path) 37 | delta_tokenizer.save_pretrained(target_model_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--base-model-path", type=str, required=True) 43 | parser.add_argument("--target-model-path", type=str, required=True) 44 | parser.add_argument("--delta-path", type=str, required=True) 45 | 46 
| args = parser.parse_args() 47 | 48 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 49 | -------------------------------------------------------------------------------- /scripts/train/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export WANDB_PROJECT="VisPer-LM" 4 | export WANDB_NAME="pretrain_dsg_VisPer-LM-CLIP-ViT-Llama3-8b" 5 | 6 | # Base LLM choices: 7 | # Llama3-8b: meta-llama/Meta-Llama-3-8B-Instruct (llava_llama_3) 8 | # Phi3-4k-mini: microsoft/Phi-3-mini-4k-instruct (llava_phi_3) 9 | 10 | # Base encoder choices: 11 | # CLIP-ViT-L: openai/clip-vit-large-patch14-336 12 | # CLIP-ConvNeXT-XXL: laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup-res768 13 | 14 | # 8 GPUs 15 | deepspeed ola_vlm/train/ola_vlm_train_mem.py \ 16 | --deepspeed ./scripts/zero2.json \ 17 | --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \ 18 | --version llava_llama_3 \ 19 | --mode gen-depth-seg \ 20 | --layer_indices d18-20_s10-18_g12-20 \ 21 | --num_task_tokens 8 \ 22 | --loss_weights d0.5_s0.5_g0.5 \ 23 | --contrastive_loss_weight 0.3 \ 24 | --image_generator stabilityai/stable-diffusion-2-1-unclip \ 25 | --image_segmentor shi-labs/oneformer_coco_swin_large \ 26 | --depth_estimator depth_anything_v2_vitl.pth \ 27 | --data_path datasets/LLaVA-Pretrain/blip_laion_cc_sbu_558k.json \ 28 | --image_folder datasets/LLaVA-Pretrain/images \ 29 | --vision_tower openai/clip-vit-large-patch14-336 \ 30 | --mm_projector_type mlp2x_gelu \ 31 | --tune_mm_mlp_adapter True \ 32 | --mm_vision_select_layer -2 \ 33 | --mm_use_im_start_end False \ 34 | --mm_use_im_patch_token False \ 35 | --bf16 True \ 36 | --output_dir outputs/pretrain_dsg_VisPer-LM-CLIP-ViT-Llama3-8b \ 37 | --num_train_epochs 1 \ 38 | --per_device_train_batch_size 32 \ 39 | --per_device_eval_batch_size 4 \ 40 | --gradient_accumulation_steps 1 \ 41 | --evaluation_strategy "no" \ 42 | --save_strategy "steps" \ 43 | --save_steps 200 \ 44 | --save_total_limit 3 \ 45 | --learning_rate 1e-3 \ 46 | --weight_decay 0. \ 47 | --warmup_ratio 0.03 \ 48 | --lr_scheduler_type "cosine" \ 49 | --logging_steps 1 \ 50 | --tf32 True \ 51 | --model_max_length 4096 \ 52 | --gradient_checkpointing True \ 53 | --dataloader_num_workers 4 \ 54 | --lazy_preprocess True \ 55 | --report_to wandb -------------------------------------------------------------------------------- /ola_vlm/model/aux_heads/depth_anything_v2/dinov2_layers/swiglu_ffn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree.
6 | 7 | from typing import Callable, Optional 8 | 9 | from torch import Tensor, nn 10 | import torch.nn.functional as F 11 | 12 | 13 | class SwiGLUFFN(nn.Module): 14 | def __init__( 15 | self, 16 | in_features: int, 17 | hidden_features: Optional[int] = None, 18 | out_features: Optional[int] = None, 19 | act_layer: Callable[..., nn.Module] = None, 20 | drop: float = 0.0, 21 | bias: bool = True, 22 | ) -> None: 23 | super().__init__() 24 | out_features = out_features or in_features 25 | hidden_features = hidden_features or in_features 26 | self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) 27 | self.w3 = nn.Linear(hidden_features, out_features, bias=bias) 28 | 29 | def forward(self, x: Tensor) -> Tensor: 30 | x12 = self.w12(x) 31 | x1, x2 = x12.chunk(2, dim=-1) 32 | hidden = F.silu(x1) * x2 33 | return self.w3(hidden) 34 | 35 | 36 | try: 37 | from xformers.ops import SwiGLU 38 | 39 | XFORMERS_AVAILABLE = True 40 | except ImportError: 41 | SwiGLU = SwiGLUFFN 42 | XFORMERS_AVAILABLE = False 43 | 44 | 45 | class SwiGLUFFNFused(SwiGLU): 46 | def __init__( 47 | self, 48 | in_features: int, 49 | hidden_features: Optional[int] = None, 50 | out_features: Optional[int] = None, 51 | act_layer: Callable[..., nn.Module] = None, 52 | drop: float = 0.0, 53 | bias: bool = True, 54 | ) -> None: 55 | out_features = out_features or in_features 56 | hidden_features = hidden_features or in_features 57 | hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 58 | super().__init__( 59 | in_features=in_features, 60 | hidden_features=hidden_features, 61 | out_features=out_features, 62 | bias=bias, 63 | ) 64 | -------------------------------------------------------------------------------- /ola_vlm/model/aux_heads/gen_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import torch 8 | from torch import nn 9 | from ola_vlm.model.multimodal_projector.resampler import Resampler, TaskTokenResampler 10 | 11 | 12 | class GenHead(nn.Module): 13 | 14 | def __init__( 15 | self, 16 | proj_config: dict = None, 17 | llm_hidden_size: int = 4096, 18 | ) -> None: 19 | super().__init__() 20 | 21 | self.projector = Resampler( 22 | dim=proj_config["output_dim"], 23 | depth=proj_config["depth"], 24 | dim_head=proj_config["dim_head"], 25 | heads=proj_config["num_heads"], 26 | num_queries=proj_config["num_tokens"], 27 | embedding_dim=llm_hidden_size, 28 | output_dim=proj_config["output_dim"], 29 | ff_mult=proj_config["ff_mult"], 30 | ) 31 | 32 | def forward( 33 | self, 34 | llm_feats: torch.Tensor, 35 | ): 36 | gen_feats = self.projector(llm_feats) 37 | return gen_feats 38 | 39 | class TaskTokenGenHead(nn.Module): 40 | 41 | def __init__( 42 | self, 43 | proj_config: dict = None, 44 | llm_hidden_size: int = 4096, 45 | ) -> None: 46 | super().__init__() 47 | 48 | self.projector = TaskTokenResampler( 49 | dim=proj_config["output_dim"], 50 | depth=proj_config["depth"], 51 | dim_head=proj_config["dim_head"], 52 | heads=proj_config["num_heads"], 53 | num_queries=proj_config["num_tokens"], 54 | embedding_dim=llm_hidden_size, 55 | output_dim=proj_config["output_dim"], 56 | ff_mult=proj_config["ff_mult"], 57 | ) 58 | 59 | def forward( 60 | self, 61 | llm_feats: torch.Tensor, 62 | latents: torch.Tensor 63 | ): 64 | gen_feats = self.projector(llm_feats, latents) 65 | return gen_feats -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/websrc/README.md: -------------------------------------------------------------------------------- 1 | # WebSRC 2 | 3 | ## Paper 4 | 5 | Title: WebSRC: A Dataset for Web-Based Structural Reading Comprehension 6 | 7 | Abstract: https://arxiv.org/abs/2101.09465 8 | 9 | Homepage: https://x-lance.github.io/WebSRC/# 10 | 11 | WebSRC is a dataset for web-based structural reading comprehension. 12 | Its full train/dev/test split contains over 400k questions across 6.4k webpages. 13 | This version of the dataset does not contain OCR or original HTML, it simply treats WebSRC as a image-and-text-based multimodal Q&A benchmark on webpage screenshots. 14 | 15 | ## Citation 16 | 17 | ```bibtex 18 | @inproceedings{chen2021websrc, 19 | title={WebSRC: A Dataset for Web-Based Structural Reading Comprehension}, 20 | author={Chen, Xingyu and Zhao, Zihan and Chen, Lu and Ji, Jiabao and Zhang, Danyang and Luo, Ao and Xiong, Yuxuan and Yu, Kai}, 21 | booktitle={Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing}, 22 | pages={4173--4185}, 23 | year={2021} 24 | } 25 | ``` 26 | 27 | ## Groups & Tasks 28 | 29 | ### Groups 30 | 31 | - `websrc`: Evaluates `websrc-val` and generates a submission file for `websrc-test`. 32 | 33 | ### Tasks 34 | 35 | - `websrc-val`: Given a question and a web page, predict the answer. 36 | - `websrc-test`: Given a question and a web page, predict the answer. Ground truth is not provided for this task. 37 | 38 | ## Metrics 39 | 40 | This task uses SQUAD-style evaluation metrics, of which F1 score over tokens is used. 41 | The orignal paper also uses Exact Match (EM) score, but this is not implemented here as that metric is more conducive for Encoder-only extraction models. 42 | 43 | ### F1 Score 44 | 45 | F1 Score is the harmonic mean of precision and recall. 
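As an illustration of this token-level scoring (a hedged sketch, not the evaluation code bundled with this task), F1 for a single prediction/ground-truth pair can be computed like so:

```python
# Sketch only: SQuAD-style token-level F1 for one prediction/answer pair.
from collections import Counter


def token_f1(prediction: str, ground_truth: str) -> float:
    pred_tokens = prediction.lower().split()
    gt_tokens = ground_truth.lower().split()
    common = Counter(pred_tokens) & Counter(gt_tokens)  # per-token overlap counts
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)


print(token_f1("the red button", "a red button"))  # 0.666...
```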
46 | We calculate precision and recall at the token level, then compute the F1 score as normal using these values. 47 | 48 | ### Test Submission 49 | 50 | When evaluating on the test split, a prediction JSON will be compiled instead of metrics computed. 51 | Instructions for submission are available on the [WebSRC homepage](https://x-lance.github.io/WebSRC/#) and in their [Original GitHub Repo](https://github.com/X-LANCE/WebSRC-Baseline#obtain-test-result). -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/api/filter.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List 3 | 4 | from lmms_eval.api.instance import Instance 5 | from datasets import Dataset 6 | 7 | 8 | class Filter: 9 | """ 10 | Filter classes operate on a per-task level. 11 | They take all model outputs (`instance.resps` for all `task.instances`) 12 | across all instances of a task, and perform operations. 13 | In a single run, one can configure any number of separate filters or lists of filters. 14 | 15 | """ 16 | 17 | def __init__(self, *args, **kwargs) -> None: 18 | """ 19 | Can define custom behavior here, if an individual instantiation of a Filter class should have state. 20 | """ 21 | 22 | def apply(self, resps, docs): 23 | """ 24 | Defines the operation to perform on a list of the `inst.resps` properties of `Instance` objects. 25 | Should return the list of (filtered) response lists *in the same order as they were input*, e.g. 26 | if passed in [<inst.resps for instance 0>, <inst.resps for instance 1>] should return 27 | [<filtered resps for instance 0>, <filtered resps for instance 1>] 28 | """ 29 | return resps 30 | 31 | 32 | @dataclass 33 | class FilterEnsemble: 34 | """ 35 | FilterEnsemble creates a pipeline applying multiple filters. 36 | Its intended usage is to stack multiple post-processing steps in order. 37 | `task.apply_filters` should use a list of FilterEnsemble classes that it stores, to apply each 38 | pipeline separately. 39 | """ 40 | 41 | name: str 42 | filters: List[Filter] 43 | 44 | def apply(self, instances: List[Instance], docs: List[Dataset]) -> None: 45 | resps = [inst.resps for inst in instances] # operate just on the model responses 46 | for f in self.filters: 47 | # apply filters in sequence 48 | resps = f.apply(resps, docs) 49 | 50 | # add the end results after filtering to filtered_resps of their respective source instances. 51 | # has key `self.name`: each FilterEnsemble applied in a given run should use a different name.
52 | for inst, resp in zip(instances, resps): 53 | inst.filtered_resps[self.name] = resp 54 | -------------------------------------------------------------------------------- /ola_vlm/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | from ola_vlm.model.multimodal_projector.resampler import Resampler 5 | 6 | 7 | class IdentityMap(nn.Module): 8 | def __init__(self): 9 | super().__init__() 10 | 11 | def forward(self, x, *args, **kwargs): 12 | return x 13 | 14 | @property 15 | def config(self): 16 | return {"mm_projector_type": 'identity'} 17 | 18 | 19 | class SimpleResBlock(nn.Module): 20 | def __init__(self, channels): 21 | super().__init__() 22 | self.pre_norm = nn.LayerNorm(channels) 23 | 24 | self.proj = nn.Sequential( 25 | nn.Linear(channels, channels), 26 | nn.GELU(), 27 | nn.Linear(channels, channels) 28 | ) 29 | def forward(self, x): 30 | x = self.pre_norm(x) 31 | return x + self.proj(x) 32 | 33 | 34 | def build_resampler(config, num_queries=None): 35 | return Resampler( 36 | dim=config["probe_output_dim"], 37 | depth=config["probe_depth"], 38 | dim_head=config["probe_dim_head"], 39 | heads=config["probe_num_heads"], 40 | num_queries=config["num_queries"] if num_queries is None else num_queries, 41 | embedding_dim=config.hidden_size, 42 | output_dim=config["probe_output_dim"], 43 | ff_mult=config["probe_ff_mult"], 44 | ) 45 | 46 | 47 | def build_vision_projector(config, delay_load=False, **kwargs): 48 | projector_type = getattr(config, 'mm_projector_type', 'linear') 49 | 50 | if projector_type == 'linear': 51 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 52 | 53 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 54 | if mlp_gelu_match: 55 | mlp_depth = int(mlp_gelu_match.group(1)) 56 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 57 | for _ in range(1, mlp_depth): 58 | modules.append(nn.GELU()) 59 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 60 | return nn.Sequential(*modules) 61 | 62 | if projector_type == 'identity': 63 | return IdentityMap() 64 | 65 | raise ValueError(f'Unknown projector type: {projector_type}') 66 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/iconqa/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | 5 | def options_to_str(options_prompt): 6 | option_prompt_str = "" 7 | for i, option in enumerate(options_prompt): 8 | option_choice = chr(ord("A") + i) 9 | option_prompt_str += f"{option_choice}. 
{option}\n" 10 | 11 | option_prompt_str = option_prompt_str.rstrip("\n") 12 | return option_prompt_str 13 | 14 | 15 | def doc_to_visual(doc): 16 | image_list = [] 17 | if "query_image" in doc: 18 | image_list.append(doc["query_image"].convert("RGB")) 19 | for i in range(5): 20 | id = f"choice_image_{i}" 21 | if id in doc and doc[id] is not None: 22 | image_list.append(doc[id].convert("RGB")) 23 | assert len(image_list) < 6, "Maximum 5 images allowed for ICON-QA" 24 | return image_list 25 | 26 | 27 | def doc_to_text(doc, model_specific_prompt_kwargs): 28 | question = doc["question"] 29 | ques_type = doc["ques_type"] 30 | options_prompt = [] 31 | 32 | if ques_type == "choose_img": 33 | options_prompt.append("The first image.") 34 | options_prompt.append("The second image.") 35 | 36 | options_str = options_to_str(options_prompt) 37 | full_prompt = f"{model_specific_prompt_kwargs['pre_prompt']}{model_specific_prompt_kwargs['statement']}{model_specific_prompt_kwargs['options_statement'].format(question=question, options=options_str)}" 38 | 39 | elif ques_type == "choose_txt": 40 | choices = doc["choices"].split(",") 41 | for i, choice in enumerate(choices): 42 | options_prompt.append(f"{choice}") 43 | 44 | options_str = options_to_str(options_prompt) 45 | full_prompt = f"{model_specific_prompt_kwargs['pre_prompt']}{model_specific_prompt_kwargs['statement']}{model_specific_prompt_kwargs['options_statement'].format(question=question, options=options_str)}" 46 | 47 | elif ques_type == "fill_in_blank": 48 | full_prompt = f"{model_specific_prompt_kwargs['pre_prompt']}{model_specific_prompt_kwargs['statement']}{model_specific_prompt_kwargs['freeform_statement'].format(question=question)}" 49 | 50 | return full_prompt 51 | 52 | 53 | def test_process_results(doc, results): 54 | pred = results[0] 55 | questionId = doc["question_id"] 56 | answer = doc["answer"] 57 | return {"anls": {"questionId": int(questionId), "answer": answer, "pred_answer": pred}} 58 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/seedbench_2/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | def seed_doc_to_visual(doc): 5 | return [image.convert("RGB") for image in doc["image"]] 6 | 7 | 8 | def parse_choice_img(choice: str, img_token: str): 9 | if "jpg" in choice or "png" in choice: 10 | return img_token 11 | return choice 12 | 13 | 14 | def seed_doc_to_text(doc, model_specific_kwargs=None): 15 | question = doc["question"] 16 | question.replace("", model_specific_kwargs["img_token"]) 17 | question += "\n" + f"A. {parse_choice_img(doc['choice_a'], model_specific_kwargs['img_token'])}\n" 18 | question += f"B. {parse_choice_img(doc['choice_b'], model_specific_kwargs['img_token'])}\n" 19 | question += f"C. {parse_choice_img(doc['choice_c'], model_specific_kwargs['img_token'])}\n" 20 | question += f"D. 
{parse_choice_img(doc['choice_d'], model_specific_kwargs['img_token'])}" 21 | if doc["data_type"] == "Image Generation": 22 | num_img_in_question = len(doc["data_id"]) - 4 23 | prepend_tokens = [model_specific_kwargs["img_token"]] * num_img_in_question 24 | question = " ".join(prepend_tokens) + "\n" + question 25 | return f"{question}\n{model_specific_kwargs['post_prompt']}" 26 | 27 | 28 | def seed_process_result(doc, result): 29 | pred = result[0].strip() 30 | if len(pred) > 1: 31 | pred = pred[0] 32 | answer = doc["answer"] 33 | data_type = doc["data_type"].split(" ") 34 | data_type = "_".join(data_type) 35 | 36 | return {f"seed_{data_type}": {"pred": pred, "answer": answer, "question_id": doc["question_id"]}, f"seed_all": {"pred": pred, "answer": answer, "question_id": doc["question_id"]}} 37 | 38 | 39 | def seed_aggregation_result(results): 40 | total_count = 0 41 | total_correct = 0 42 | for result in results: 43 | if result["pred"] == result["answer"]: 44 | total_correct += 1 45 | total_count += 1 46 | return total_correct / total_count if total_count != 0 else 0 47 | 48 | 49 | def seed_aggregation_result_all(results): 50 | score = seed_aggregation_result(results) 51 | stored_results = [] 52 | for result in results: 53 | stored_results.append({"question_id": result["question_id"], "prediction": result["pred"]}) 54 | with open("./seed_submission.json", "w") as f: 55 | json.dump(stored_results, f, indent=4) 56 | print("Storing files for seed_submission ...") 57 | 58 | return score 59 | -------------------------------------------------------------------------------- /ola_vlm/model/make_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m ola_vlm.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from ola_vlm.model.utils import auto_upgrade 11 | 12 | 13 | def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading target model") 19 | auto_upgrade(target_model_path) 20 | target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 21 | 22 | print("Calculating delta") 23 | for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data -= base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 31 | bparam = base.state_dict()[name] 32 | param.data[:bparam.shape[0], :bparam.shape[1]] -= bparam 33 | 34 | print("Saving delta") 35 | if hub_repo_id: 36 | kwargs = {"push_to_hub": True, "repo_id": hub_repo_id} 37 | else: 38 | kwargs = {} 39 | target.save_pretrained(delta_path, **kwargs) 40 | target_tokenizer = AutoTokenizer.from_pretrained(target_model_path) 41 | target_tokenizer.save_pretrained(delta_path, **kwargs) 42 | 43 | 44 | if __name__ == 
"__main__": 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument("--base-model-path", type=str, required=True) 47 | parser.add_argument("--target-model-path", type=str, required=True) 48 | parser.add_argument("--delta-path", type=str, required=True) 49 | parser.add_argument("--hub-repo-id", type=str, default=None) 50 | args = parser.parse_args() 51 | 52 | make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id) 53 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/naturalbench/naturalbench.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: BaiqiL/NaturalBench-lmms-eval # The name of the dataset as listed by HF in the datasets Hub. 2 | dataset_kwargs: 3 | token: True # Auxiliary arguments that `datasets.load_dataset` accepts. This can be used to specify arguments such as `data_files` or `data_dir` if you want to use local datafiles such as json or csv. 4 | task: "naturalbench" # The name of the task, this should be registered in the task manager. If successful, you can call lmms_eval with this task name by setting `--tasks mme`. 5 | test_split: test # The split of the dataset to use as the test split. 6 | output_type: generate_until # The type of model output for the given task. Options are `generate_until`, `loglikelihood`, and `multiple_choice`. 7 | doc_to_visual: !function utils.naturalbench_doc_to_visual # The function to process a sample into the appropriate input for the model. 8 | doc_to_text: !function utils.naturalbench_doc_to_text # The function to process a sample into the appropriate target output for the model. 9 | doc_to_target: "answer" # The function to process a sample into a list of possible string choices for `multiple_choice` tasks. 10 | generation_kwargs: # Auxiliary arguments for the `generate` function from HF transformers library. This would be used in different models files. 11 | max_new_tokens: 16 12 | temperature: 0 13 | top_p: 1.0 14 | num_beams: 1 15 | do_sample: false 16 | # The return value of process_results will be used by metrics 17 | process_results: !function utils.naturalbench_process_results 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | # e.g. Following metrics `mme_perception_score` is custom defined. 20 | # So `mme_process_results` function should return the dict `{"mme_perception_score": {sub_k:sub_v, ..., } }` 21 | # And the `mme_aggregate_results` function could get the dict `{sub_k:sub_v, ..., }`, and use the information to gather the final accuracy. 22 | metric_list: 23 | - metric: naturalbench_score # The name of the metric to use for evaluation. The process_results function should return the metric name and the metric value, in format of `{metric_name: results}`. And the aggregation function will use the results to get the final score. 24 | aggregation: !function utils.naturalbench_aggregate_results # The name of the aggregation function to use for evaluation. 25 | higher_is_better: true # Whether the metric is better when the value is higher. 
-------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ok_vqa/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import json 4 | import yaml 5 | import pathlib 6 | import logging 7 | import datetime 8 | import statistics 9 | 10 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file 11 | from lmms_eval.tasks._task_utils.vqa_eval_metric import EvalAIAnswerProcessor 12 | 13 | eval_logger = logging.getLogger("lmms-eval") 14 | 15 | 16 | def ok_vqa_doc_to_visual(doc): 17 | return [doc["image"].convert("RGB")] 18 | 19 | 20 | def ok_vqa_process_results(doc, result): 21 | eval_ai_processor = EvalAIAnswerProcessor() 22 | assert len(result) == 1, f"The result should be a list of length 1, but got {len(result)}." 23 | resAns = eval_ai_processor(result[0]) 24 | accuracy = 0 25 | 26 | if "answers" in doc and doc["answers"] is not None: 27 | gtAcc = [] 28 | 29 | for i in range(len(doc["answers"])): 30 | doc["answers"][i] = eval_ai_processor(doc["answers"][i]) 31 | 32 | for i in range(len(doc["answers"])): 33 | otherGTAns = [doc["answers"][j] for j in range(len(doc["answers"])) if i != j] 34 | matchingAns = [item for item in otherGTAns if item == resAns] 35 | acc = min(1, float(len(matchingAns)) / 3) 36 | gtAcc.append(acc) 37 | if gtAcc: 38 | accuracy = statistics.mean(gtAcc) 39 | else: 40 | accuracy = 0 41 | 42 | return { 43 | "exact_match": accuracy, 44 | "submission": { 45 | "image": f"{doc['question_id']}.jpg", 46 | "answer": resAns, 47 | }, 48 | } 49 | 50 | 51 | def ok_vqa_doc_to_text(doc, model_specific_prompt_kwargs=None): 52 | question = doc["question"] 53 | if model_specific_prompt_kwargs is None: 54 | model_specific_prompt_kwargs = {} 55 | pre_prompt = "" 56 | post_prompt = "" 57 | if "pre_prompt" in model_specific_prompt_kwargs: 58 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 59 | if "post_prompt" in model_specific_prompt_kwargs: 60 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 61 | return f"{pre_prompt}{question}{post_prompt}" 62 | 63 | 64 | def ok_vqa_aggreate_submissions(results, args): 65 | now_date_time = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S") 66 | file = f"ok_vqa-test-submission-{now_date_time}.json" 67 | path = generate_submission_file(file, args) 68 | with open(path, "w") as f: 69 | json.dump(results, f) 70 | print(f"Submission file saved to {path}") 71 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import json 4 | import yaml 5 | import pathlib 6 | import logging 7 | import datetime 8 | import statistics 9 | 10 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file 11 | from lmms_eval.tasks._task_utils.vqa_eval_metric import EvalAIAnswerProcessor 12 | 13 | eval_logger = logging.getLogger("lmms-eval") 14 | 15 | 16 | def vizwiz_vqa_doc_to_visual(doc): 17 | return [doc["image"].convert("RGB")] 18 | 19 | 20 | def vizwiz_vqa_process_results(doc, result): 21 | eval_ai_processor = EvalAIAnswerProcessor() 22 | assert len(result) == 1, f"The result should be a list of length 1, but got {len(result)}." 
23 | resAns = eval_ai_processor(result[0]) 24 | accuracy = 0 25 | 26 | if "answers" in doc and doc["answers"] is not None: 27 | gtAcc = [] 28 | 29 | for i in range(len(doc["answers"])): 30 | doc["answers"][i] = eval_ai_processor(doc["answers"][i]) 31 | 32 | for i in range(len(doc["answers"])): 33 | otherGTAns = [doc["answers"][j] for j in range(len(doc["answers"])) if i != j] 34 | matchingAns = [item for item in otherGTAns if item == resAns] 35 | acc = min(1, float(len(matchingAns)) / 3) 36 | gtAcc.append(acc) 37 | if gtAcc: 38 | accuracy = statistics.mean(gtAcc) 39 | else: 40 | accuracy = 0 41 | 42 | return { 43 | "exact_match": accuracy, 44 | "submission": { 45 | "image": f"{doc['question_id']}.jpg", 46 | "answer": resAns, 47 | }, 48 | } 49 | 50 | 51 | def vizwiz_vqa_doc_to_text(doc, model_specific_prompt_kwargs=None): 52 | if model_specific_prompt_kwargs is None: 53 | model_specific_prompt_kwargs = {} 54 | pre_prompt = "" 55 | post_prompt = "" 56 | if "pre_prompt" in model_specific_prompt_kwargs: 57 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 58 | if "post_prompt" in model_specific_prompt_kwargs: 59 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 60 | text = f"{pre_prompt}{doc['question'].capitalize()}{post_prompt}" 61 | return text 62 | 63 | 64 | def vizwiz_vqa_aggreate_submissions(results, args): 65 | now_date_time = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S") 66 | submission_file_name = f"vizwiz_vqa-test-submission-{now_date_time}.json" 67 | path = generate_submission_file(submission_file_name, args) 68 | with open(path, "w") as f: 69 | json.dump(results, f) 70 | print(f"Submission file saved to {path}") 71 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/olympiadbench/cn_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import datetime 4 | from lmms_eval.tasks.olympiadbench.olympiadbench_evals import OlympiadBenchEvaluator 5 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file 6 | 7 | import logging 8 | 9 | eval_logger = logging.getLogger("lmms-eval") 10 | dir_name = os.path.dirname(os.path.abspath(__file__)) 11 | 12 | try: 13 | olympiadbench_evaluator = OlympiadBenchEvaluator() 14 | except: 15 | pass 16 | 17 | 18 | def olympiadbench_doc_to_visual(doc): 19 | return [image.convert("RGB") for image in doc["images"]] 20 | 21 | 22 | def olympiadbench_doc_to_text(doc): 23 | question = doc["question"] 24 | subject = doc["subfield"] 25 | mul_ans = doc["is_multiple_answer"] 26 | if mul_ans is None: 27 | mul_ans = False 28 | ans_type = doc["answer_type"] 29 | if ans_type == "Need_human_evaluate": 30 | ans_type = "proof based" 31 | 32 | pre_prompt = f"以下是中国{subject}竞赛中的解答题。\n" 33 | 34 | post_prompt = "" 35 | if not mul_ans: 36 | post_prompt += f"答案类型为{ans_type}。\n" 37 | else: 38 | post_prompt += f"题目有多个答案,答案类型均为{ans_type}。\n" 39 | post_prompt += "请根据题目的要求和所提供的信息计算得出答案。解答过程和结果中使用的变量和公式请使用LaTeX格式表示。请在最后以" 40 | if not mul_ans: 41 | post_prompt += '"所以最终答案是\\boxed{答案}。"\n' 42 | else: 43 | post_prompt += '"所以最终答案是\\boxed{用英⽂逗号连接的多个答案}。"\n' 44 | 45 | final_question = pre_prompt + question + "\n" + post_prompt 46 | return final_question 47 | 48 | 49 | def olympiadbench_process_results(doc, results): 50 | precision = doc["error"] 51 | is_proving = "TP" in doc["source"] 52 | if precision is None: 53 | precision = 0 54 | prediction = results[0].strip() 55 | 56 | if is_proving: 57 | return {"submission": 
prediction}
58 |     else:
59 |         prediction = prediction.split("所以最终答案是")[-1]
60 |         prediction = prediction.replace('"', "").replace("\n", "").replace(" ", "").strip(".").strip("。")
61 |         accuracy = olympiadbench_evaluator.judge(prediction, doc["final_answer"][0], precision)
62 |         accuracy = int(accuracy)
63 |         return {"exact_match": accuracy}
64 | 
65 | 
66 | def olympiadbench_aggregate_results(results, args):
67 |     now_date_time = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S")
68 |     submission_file_name = f"olympiadbench-test-cn-submission-{now_date_time}.json"
69 |     path = generate_submission_file(submission_file_name, args)
70 |     with open(path, "w") as f:
71 |         json.dump(results, f, ensure_ascii=False)
72 |     print(f"Submission file saved to {path}")
--------------------------------------------------------------------------------
/ola_vlm/eval/eval_cv_bench.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import json
3 | import argparse
4 | import re
5 | 
6 | def load_jsonl(f):
7 |     lines = open(f, encoding='utf-8').readlines()
8 |     lines = [x.strip() for x in lines]
9 |     if lines[-1] == '':
10 |         lines = lines[:-1]
11 |     data = [json.loads(x) for x in lines]
12 |     return data
13 | 
14 | if __name__ == '__main__':
15 | 
16 |     parser = argparse.ArgumentParser()
17 |     parser.add_argument("--results_file", type=str, default="cv-bench_answer.jsonl")
18 |     args = parser.parse_args()
19 | 
20 |     answers = load_jsonl(args.results_file)
21 | 
22 |     data = {
23 |         "source": [],
24 |         "result": [],
25 |         "task": [],
26 |     }
27 |     for a in answers:
28 |         data["source"].append(a["source"][0])
29 |         if "(" in a["prediction"]:
30 |             match = re.search(r'\(([A-Z])\)', a["prediction"])
31 |             if match:
32 |                 pred = "(" + match.group(1) + ")"
33 |             else:
34 |                 pred = "(" + a["prediction"][0] + ")"
35 |         else:
36 |             # No "(X)" style option in the prediction: fall back to its first
37 |             # character so that `pred` is always defined before the comparison.
38 |             pred = "(" + a["prediction"][0] + ")"
39 |         data["result"].append(pred == a["answer"][0])
40 |         data["task"].append(a["task"][0])
41 | 
42 |     df = pd.DataFrame(data)
43 | 
44 |     def calculate_accuracy(df, source):
45 |         source_df = df[df['source'] == source]
46 |         accuracy = source_df['result'].mean()
47 |         return accuracy
48 | 
49 |     def calculate_task_accuracy(df, task):
50 |         task_df = df[df['task'] == task]
51 |         accuracy = task_df['result'].mean()
52 |         return accuracy
53 | 
54 |     accuracy_2d_ade = calculate_accuracy(df, 'ADE20K')
55 |     accuracy_2d_coco = calculate_accuracy(df, 'COCO')
56 |     accuracy_3d_omni = calculate_accuracy(df, 'Omni3D')
57 | 
58 |     tasks = ["Count", "Depth", "Relation", "Distance"]
59 | 
60 |     scores = {}
61 | 
62 |     # 2D accuracy is the mean of the ADE20K and COCO splits; the overall score is
63 |     # the mean of the 2D and 3D accuracies.
64 |     accuracy_2d = (accuracy_2d_ade + accuracy_2d_coco) / 2
65 |     accuracy_3d = accuracy_3d_omni
66 | 
67 |     combined_accuracy = (accuracy_2d + accuracy_3d) / 2
68 | 
69 |     scores["Overall"] = combined_accuracy
70 | 
71 |     scores["3D"] = accuracy_3d
72 |     scores["2D"] = accuracy_2d
73 | 
74 |     for t in tasks:
75 |         accuracy = calculate_task_accuracy(df, t)
76 |         scores[t] = accuracy
77 | 
78 |     print("\n=========================CV-Bench Scores===============================")
79 |     for key, value in scores.items():
80 |         print(f"{key} -> {value}")
81 |     print("================================================================")
82 | 
83 |     with open(args.results_file.replace('.jsonl', '_score.json'), "w") as f:
84 |         json.dump(scores, f, indent=2)
--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/chartqa/utils.py:
--------------------------------------------------------------------------------
1 | def chartqa_doc_to_visual(doc):
2 |     return [doc["image"].convert("RGB")]
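# Note: unlike the ok_vqa/vizwiz variants, `chartqa_doc_to_text` below indexes
# "pre_prompt" and "post_prompt" directly, so the task config must supply both keys.
# A hypothetical kwargs dict would look like:
#     {"pre_prompt": "", "post_prompt": "\nAnswer the question with a single word."}
# (the exact strings come from the task YAML, not from this module).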
3 | 
4 | 
5 | def chartqa_doc_to_text(doc, model_specific_prompt_kwargs):
6 |     question = doc["question"]
7 |     pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
8 |     post_prompt = model_specific_prompt_kwargs["post_prompt"]
9 |     return f"{pre_prompt}{question}{post_prompt}"
10 | 
11 | 
12 | def chartqa_process_results(doc, results):
13 |     pred = results[0]
14 |     question_type = doc["type"]
15 |     score = relaxed_correctness(pred, doc["answer"])
16 |     score = 1.0 if score else 0.0
17 |     return_dict = {"relaxed_overall": score}
18 |     if question_type == "human_test":
19 |         return_dict["relaxed_human_split"] = score
20 |     else:
21 |         return_dict["relaxed_augmented_split"] = score
22 |     return return_dict
23 | 
24 | 
25 | def relaxed_correctness(prediction, target, max_relative_change: float = 0.05) -> bool:
26 |     """Calculates relaxed correctness.
27 | 
28 |     The correctness tolerates a certain error ratio defined by max_relative_change.
29 |     See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1:
30 |     “Following Methani et al. (2020), we use a relaxed accuracy measure for the
31 |     numeric answers to allow a minor inaccuracy that may result from the automatic
32 |     data extraction process. We consider an answer to be correct if it is within
33 |     5% of the gold answer. For non-numeric answers, we still need an exact match
34 |     to consider an answer to be correct.”
35 | 
36 |     This function is taken from https://github.com/QwenLM/Qwen-VL/blob/34b4c0ee7b07726371b960911f249fe61b362ca3/eval_mm/evaluate_vqa.py#L113
37 |     Args:
38 |         prediction: Predicted answer string.
39 |         target: Ground-truth answer string.
40 |         max_relative_change: Maximum relative change allowed for numeric answers.
41 | 
42 |     Returns:
43 |         Whether the prediction was correct given the specified tolerance.
44 |     """
45 | 
46 |     def _to_float(text: str):
47 |         try:
48 |             if text.endswith("%"):
49 |                 # Convert percentages to floats.
50 |                 return float(text.rstrip("%")) / 100.0
51 |             else:
52 |                 return float(text)
53 |         except ValueError:
54 |             return None
55 | 
56 |     prediction_float = _to_float(prediction)
57 |     target_float = _to_float(target)
58 |     # A target of exactly 0 is treated as non-numeric here and falls back to the
59 |     # exact string match below, mirroring the upstream implementation.
60 |     if prediction_float is not None and target_float:
61 |         relative_change = abs(prediction_float - target_float) / abs(target_float)
62 |         return relative_change <= max_relative_change
63 |     else:
64 |         return prediction.lower() == target.lower()
65 | 
--------------------------------------------------------------------------------
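A quick, self-contained sanity check of the relaxed-correctness rule above (an illustrative sketch, not part of the repository; it assumes the `lmms_eval` package is installed so the import path below resolves):

# Illustrative check of relaxed_correctness from lmms_eval/tasks/chartqa/utils.py.
from lmms_eval.tasks.chartqa.utils import relaxed_correctness

# Numeric answers tolerate up to 5% relative error.
assert relaxed_correctness("0.305", "0.31")        # |0.305 - 0.31| / 0.31 ≈ 1.6%
assert not relaxed_correctness("12", "15")         # 20% relative error, rejected
# Percentages are normalised to fractions before comparison.
assert relaxed_correctness("30%", "0.3")
# Non-numeric answers fall back to a case-insensitive exact match.
assert relaxed_correctness("Two", "two")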