├── .dockerignore
├── .editorconfig
├── .gitattributes
├── .github
    └── ISSUE_TEMPLATE
    │   ├── 1-usage.yaml
    │   ├── 2-feature-request.yaml
    │   ├── 3-question.yaml
    │   └── 4-discussion.yaml
├── .gitignore
├── LICENSE
├── README.md
├── img
    ├── p-mod.png
    ├── table1.png
    └── table2.png
├── llava
    ├── __init__.py
    ├── constants.py
    ├── conversation.py
    ├── eval
    │   ├── eval_gpt_review.py
    │   ├── eval_gpt_review_bench.py
    │   ├── eval_gpt_review_visual.py
    │   ├── eval_pope.py
    │   ├── eval_science_qa.py
    │   ├── eval_science_qa_gpt4.py
    │   ├── eval_science_qa_gpt4_requery.py
    │   ├── eval_textvqa.py
    │   ├── generate_webpage_data_from_table.py
    │   ├── m4c_evaluator.py
    │   ├── model_qa.py
    │   ├── model_vqa.py
    │   ├── model_vqa_loader.py
    │   ├── model_vqa_mmbench.py
    │   ├── model_vqa_science.py
    │   ├── qa_baseline_gpt35.py
    │   ├── run_llava.py
    │   └── summarize_gpt_review.py
    ├── mm_utils.py
    ├── model
    │   ├── __init__.py
    │   ├── apply_delta.py
    │   ├── builder.py
    │   ├── consolidate.py
    │   ├── language_model
    │   │   ├── llava_mistral.py
    │   │   ├── llava_mpt.py
    │   │   ├── modeling_llama_pmod.py
    │   │   └── pmod_llava_llama.py
    │   ├── llava_arch.py
    │   ├── make_delta.py
    │   ├── multimodal_encoder
    │   │   ├── builder.py
    │   │   └── clip_encoder.py
    │   ├── multimodal_projector
    │   │   └── builder.py
    │   └── utils.py
    ├── serve
    │   ├── __init__.py
    │   ├── cli.py
    │   ├── controller.py
    │   ├── examples
    │   │   ├── extreme_ironing.jpg
    │   │   └── waterview.jpg
    │   ├── gradio_web_server.py
    │   ├── model_worker.py
    │   ├── register_worker.py
    │   ├── sglang_worker.py
    │   └── test_message.py
    ├── train
    │   ├── llama_flash_attn_monkey_patch.py
    │   ├── llama_xformers_attn_monkey_patch.py
    │   ├── llava_trainer.py
    │   ├── train.py
    │   ├── train_mem.py
    │   └── train_xformers.py
    └── utils.py
├── lmms-eval
    ├── .github
    │   ├── issue_template.md
    │   ├── pull_request_template.md
    │   └── workflows
    │   │   └── black.yml
    ├── .gitignore
    ├── .pre-commit-config.yaml
    ├── LICENSE
    ├── README.md
    ├── docs
    │   ├── README.md
    │   ├── commands.md
    │   ├── current_tasks.md
    │   ├── model_guide.md
    │   └── task_guide.md
    ├── lmms_eval
    │   ├── __init__.py
    │   ├── __main__.py
    │   ├── api
    │   │   ├── __init__.py
    │   │   ├── filter.py
    │   │   ├── instance.py
    │   │   ├── metrics.py
    │   │   ├── model.py
    │   │   ├── registry.py
    │   │   ├── samplers.py
    │   │   └── task.py
    │   ├── evaluator.py
    │   ├── filters
    │   │   ├── __init__.py
    │   │   ├── decontamination.py
    │   │   ├── extraction.py
    │   │   ├── selection.py
    │   │   └── transformation.py
    │   ├── logging_utils.py
    │   ├── models
    │   │   ├── __init__.py
    │   │   ├── batch_gpt4.py
    │   │   ├── claude.py
    │   │   ├── from_log.py
    │   │   ├── fuyu.py
    │   │   ├── gemini_api.py
    │   │   ├── gpt4v.py
    │   │   ├── idefics2.py
    │   │   ├── instructblip.py
    │   │   ├── internvl.py
    │   │   ├── llama_vid.py
    │   │   ├── llava.py
    │   │   ├── llava_hf.py
    │   │   ├── llava_sglang.py
    │   │   ├── llava_vid.py
    │   │   ├── longva.py
    │   │   ├── minicpm_v.py
    │   │   ├── model_utils
    │   │   │   ├── __init__.py
    │   │   │   ├── load_video.py
    │   │   │   └── qwen
    │   │   │   │   └── qwen_generate_utils.py
    │   │   ├── mplug_owl_video.py
    │   │   ├── mplug_owl_video
    │   │   │   ├── __init__.py
    │   │   │   ├── configuration_mplug_owl.py
    │   │   │   ├── modeling_mplug_owl.py
    │   │   │   ├── processing_mplug_owl.py
    │   │   │   └── tokenization_mplug_owl.py
    │   │   ├── phi3v.py
    │   │   ├── qwen_vl.py
    │   │   ├── qwen_vl_api.py
    │   │   ├── reka.py
    │   │   ├── tinyllava.py
    │   │   ├── video_chatgpt.py
    │   │   ├── video_chatgpt
    │   │   │   ├── __init__.py
    │   │   │   ├── constants.py
    │   │   │   ├── eval
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── model_utils.py
    │   │   │   ├── inference.py
    │   │   │   ├── model
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── consolidate.py
    │   │   │   │   ├── make_delta.py
    │   │   │   │   ├── utils.py
    │   │   │   │   └── video_chatgpt.py
    │   │   │   ├── single_video_inference.py
    │   │   │   ├── utils.py
    │   │   │   └── video_conversation.py
    │   │   ├── video_llava.py
    │   │   ├── xcomposer2_4KHD.py
    │   │   └── xcomposer2_4khd.py
    │   ├── tasks
    │   │   ├── __init__.py
    │   │   ├── _task_utils
    │   │   │   ├── file_utils.py
    │   │   │   ├── gpt_eval_utils.py
    │   │   │   ├── video_loader.py
    │   │   │   └── vqa_eval_metric.py
    │   │   ├── activitynetqa
    │   │   │   ├── _default_template_yaml
    │   │   │   ├── activitynetqa_generation.yaml
    │   │   │   └── utils.py
    │   │   ├── ai2d
    │   │   │   ├── ai2d.yaml
    │   │   │   ├── upload_ai2d.py
    │   │   │   └── utils.py
    │   │   ├── chartqa
    │   │   │   ├── chartqa.yaml
    │   │   │   ├── upload_chartqa.py
    │   │   │   └── utils.py
    │   │   ├── cmmmu
    │   │   │   ├── _cmmmu.yaml
    │   │   │   ├── _default_template_cmmmu_yaml
    │   │   │   ├── cmmmu_test.yaml
    │   │   │   ├── cmmmu_val.yaml
    │   │   │   └── utils.py
    │   │   ├── coco_cap
    │   │   │   ├── coco2014_cap.yaml
    │   │   │   ├── coco2014_cap_test.yaml
    │   │   │   ├── coco2014_cap_val.yaml
    │   │   │   ├── coco2017_cap.yaml
    │   │   │   ├── coco2017_cap_test.yaml
    │   │   │   ├── coco2017_cap_val.yaml
    │   │   │   ├── coco_cap.yaml
    │   │   │   └── utils.py
    │   │   ├── conbench
    │   │   │   ├── conbench.yaml
    │   │   │   └── utils.py
    │   │   ├── cvrr
    │   │   │   ├── _cvrr.yaml
    │   │   │   ├── _default_template_yaml
    │   │   │   ├── cvrr_fine_grained_action_understanding.yaml
    │   │   │   ├── cvrr_interpretation_of_social_context.yaml
    │   │   │   ├── cvrr_interpretation_of_visual_context.yaml
    │   │   │   ├── cvrr_multiple_actions_in_a_single_video.yaml
    │   │   │   ├── cvrr_non_existent_actions_with_existent_scene_depictions.yaml
    │   │   │   ├── cvrr_non_existent_actions_with_non_existent_scene_depictions.yaml
    │   │   │   ├── cvrr_object_instance_count.yaml
    │   │   │   ├── cvrr_partial_actions.yaml
    │   │   │   ├── cvrr_time_order_understanding.yaml
    │   │   │   ├── cvrr_understanding_emotional_context.yaml
    │   │   │   ├── cvrr_unusual_and_physically_anomalous_activities.yaml
    │   │   │   └── utils.py
    │   │   ├── docvqa
    │   │   │   ├── _default_template_docvqa_yaml
    │   │   │   ├── docvqa.yaml
    │   │   │   ├── docvqa_test.yaml
    │   │   │   ├── docvqa_val.yaml
    │   │   │   └── utils.py
    │   │   ├── egoschema
    │   │   │   ├── README.md
    │   │   │   ├── _default_template_yaml
    │   │   │   ├── egoschema.yaml
    │   │   │   ├── egoschema_mcppl.yaml
    │   │   │   ├── egoschema_subset.yaml
    │   │   │   ├── egoschema_subset_mcppl.yaml
    │   │   │   └── utils.py
    │   │   ├── ferret
    │   │   │   ├── ferret.yaml
    │   │   │   └── utils.py
    │   │   ├── flickr30k
    │   │   │   ├── flickr30k.yaml
    │   │   │   ├── flickr30k_test.yaml
    │   │   │   └── utils.py
    │   │   ├── gqa
    │   │   │   ├── gqa.yaml
    │   │   │   └── utils.py
    │   │   ├── gqa_ru
    │   │   │   ├── gqa_ru.yaml
    │   │   │   └── utils.py
    │   │   ├── hallusion_bench
    │   │   │   ├── evaluate_hb.py
    │   │   │   ├── hallusion_bench_image.yaml
    │   │   │   └── utils.py
    │   │   ├── iconqa
    │   │   │   ├── _default_template_docvqa_yaml
    │   │   │   ├── iconqa.yaml
    │   │   │   ├── iconqa_test.yaml
    │   │   │   ├── iconqa_val.yaml
    │   │   │   └── utils.py
    │   │   ├── ii_bench
    │   │   │   ├── ii_bench.yaml
    │   │   │   └── utils.py
    │   │   ├── infovqa
    │   │   │   ├── _default_template_infovqa_yaml
    │   │   │   ├── infovqa.yaml
    │   │   │   ├── infovqa_test.yaml
    │   │   │   ├── infovqa_val.yaml
    │   │   │   └── utils.py
    │   │   ├── internal_eval
    │   │   │   ├── _default_template_internal_eval_yaml
    │   │   │   ├── d170_cn.yaml
    │   │   │   ├── d170_cn_utils.py
    │   │   │   ├── d170_en.yaml
    │   │   │   ├── d170_en_utils.py
    │   │   │   ├── dc100_en.yaml
    │   │   │   ├── dc100_en_utils.py
    │   │   │   ├── dc200_cn.yaml
    │   │   │   ├── dc200_cn_utils.py
    │   │   │   ├── internal_eval.yaml
    │   │   │   └── utils.py
    │   │   ├── llava-bench-coco
    │   │   │   ├── llava-bench-coco.yaml
    │   │   │   └── utils.py
    │   │   ├── llava-in-the-wild
    │   │   │   ├── llava-in-the-wild.yaml
    │   │   │   └── utils.py
    │   │   ├── llava_wilder
    │   │   │   ├── _default_template_wilder_yaml
    │   │   │   ├── llava_wilder_small.yaml
    │   │   │   └── utils.py
    │   │   ├── longvideobench
    │   │   │   ├── longvideobench_val_i.yaml
    │   │   │   ├── longvideobench_val_v.yaml
    │   │   │   └── utils.py
    │   │   ├── mathverse
    │   │   │   ├── mathverse.yaml
    │   │   │   ├── mathverse_evals.py
    │   │   │   ├── mathverse_testmini.yaml
    │   │   │   ├── mathverse_testmini_text_dominant.yaml
    │   │   │   ├── mathverse_testmini_text_lite.yaml
    │   │   │   ├── mathverse_testmini_text_only.yaml
    │   │   │   ├── mathverse_testmini_vision_dominant.yaml
    │   │   │   ├── mathverse_testmini_vision_intensive.yaml
    │   │   │   ├── mathverse_testmini_vision_only.yaml
    │   │   │   └── utils.py
    │   │   ├── mathvista
    │   │   │   ├── mathvista.yaml
    │   │   │   ├── mathvista_evals.py
    │   │   │   ├── mathvista_test.yaml
    │   │   │   ├── mathvista_testmini.yaml
    │   │   │   └── utils.py
    │   │   ├── mmbench
    │   │   │   ├── _default_template_mmbench_cn_yaml
    │   │   │   ├── _default_template_mmbench_en_yaml
    │   │   │   ├── _default_template_mmbench_ru_yaml
    │   │   │   ├── cc_utils.py
    │   │   │   ├── cn_utils.py
    │   │   │   ├── en_utils.py
    │   │   │   ├── mmbench.yaml
    │   │   │   ├── mmbench_cc.yaml
    │   │   │   ├── mmbench_cn.yaml
    │   │   │   ├── mmbench_cn_dev.yaml
    │   │   │   ├── mmbench_cn_test.yaml
    │   │   │   ├── mmbench_en.yaml
    │   │   │   ├── mmbench_en_dev.yaml
    │   │   │   ├── mmbench_en_test.yaml
    │   │   │   ├── mmbench_evals.py
    │   │   │   ├── mmbench_ru_dev.yaml
    │   │   │   └── ru_utils.py
    │   │   ├── mme
    │   │   │   ├── mme.yaml
    │   │   │   └── utils.py
    │   │   ├── mmmu
    │   │   │   ├── arial.ttf
    │   │   │   ├── mmmu.yaml
    │   │   │   ├── mmmu_group_img.yaml
    │   │   │   ├── mmmu_group_img_test.yaml
    │   │   │   ├── mmmu_group_img_val.yaml
    │   │   │   ├── mmmu_test.yaml
    │   │   │   ├── mmmu_val.yaml
    │   │   │   ├── utils.py
    │   │   │   └── utils_group_img.py
    │   │   ├── mmupd
    │   │   │   ├── _default_template_mmupd_yaml
    │   │   │   ├── mmaad_base.yaml
    │   │   │   ├── mmaad_instruction.yaml
    │   │   │   ├── mmaad_option.yaml
    │   │   │   ├── mmiasd_base.yaml
    │   │   │   ├── mmiasd_instruction.yaml
    │   │   │   ├── mmiasd_option.yaml
    │   │   │   ├── mmivqd_base.yaml
    │   │   │   ├── mmivqd_instruction.yaml
    │   │   │   ├── mmivqd_option.yaml
    │   │   │   ├── mmupd.yaml
    │   │   │   ├── mmupd_base.yaml
    │   │   │   ├── mmupd_evals.py
    │   │   │   ├── mmupd_instruction.yaml
    │   │   │   ├── mmupd_option.yaml
    │   │   │   └── utils.py
    │   │   ├── mmvet
    │   │   │   ├── mmvet.yaml
    │   │   │   └── utils.py
    │   │   ├── multidocvqa
    │   │   │   ├── multidocvqa.yaml
    │   │   │   ├── multidocvqa_test.yaml
    │   │   │   ├── multidocvqa_val.yaml
    │   │   │   └── utils.py
    │   │   ├── multilingual-llava-bench-in-the-wild
    │   │   │   ├── README.md
    │   │   │   ├── _default_template.yaml
    │   │   │   ├── arabic_llava_in_the_wild.yaml
    │   │   │   ├── bengali_llava_in_the_wild.yaml
    │   │   │   ├── chinese_llava_in_the_wild.yaml
    │   │   │   ├── french_llava_in_the_wild.yaml
    │   │   │   ├── hindi_llava_in_the_wild.yaml
    │   │   │   ├── japanese_llava_in_the_wild.yaml
    │   │   │   ├── russian_llava_in_the_wild.yaml
    │   │   │   ├── spanish_llava_in_the_wild.yaml
    │   │   │   ├── urdu_llava_in_the_wild.yaml
    │   │   │   └── utils.py
    │   │   ├── nextqa
    │   │   │   ├── _default_template_yaml
    │   │   │   ├── nextqa.yaml
    │   │   │   ├── nextqa_mc_test.yaml
    │   │   │   ├── nextqa_oe_test.yaml
    │   │   │   ├── nextqa_oe_val.yaml
    │   │   │   ├── stopwords.csv
    │   │   │   └── utils.py
    │   │   ├── nocaps
    │   │   │   ├── _default_template_nocaps_yaml
    │   │   │   ├── nocaps.yaml
    │   │   │   ├── nocaps_test.yaml
    │   │   │   ├── nocaps_val.yaml
    │   │   │   └── utils.py
    │   │   ├── ocrbench
    │   │   │   ├── ocrbench.yaml
    │   │   │   ├── upload_ocrbench.py
    │   │   │   └── utils.py
    │   │   ├── ok_vqa
    │   │   │   ├── _default_template_vqa_yaml
    │   │   │   ├── _generate_config.py
    │   │   │   ├── _ok_vqa.yaml
    │   │   │   ├── ok_vqa_val2014.yaml
    │   │   │   └── utils.py
    │   │   ├── olympiadbench
    │   │   │   ├── cn_utils.py
    │   │   │   ├── en_utils.py
    │   │   │   ├── olympiadbench.yaml
    │   │   │   ├── olympiadbench_evals.py
    │   │   │   ├── olympiadbench_test_cn.yaml
    │   │   │   └── olympiadbench_test_en.yaml
    │   │   ├── perceptiontest
    │   │   │   ├── test
    │   │   │   │   ├── _default_template_yaml
    │   │   │   │   ├── perceptiontest_mc.yaml
    │   │   │   │   ├── perceptiontest_mcppl.yaml
    │   │   │   │   └── utils.py
    │   │   │   └── val
    │   │   │   │   ├── _default_template_yaml
    │   │   │   │   ├── perceptiontest_mc.yaml
    │   │   │   │   ├── perceptiontest_mcppl.yaml
    │   │   │   │   └── utils.py
    │   │   ├── pope
    │   │   │   ├── pope.yaml
    │   │   │   ├── pope_adv.yaml
    │   │   │   ├── pope_full.yaml
    │   │   │   ├── pope_pop.yaml
    │   │   │   ├── pope_random.yaml
    │   │   │   └── utils.py
    │   │   ├── qbench
    │   │   │   ├── abench_dev.yaml
    │   │   │   ├── qbench2_dev.yaml
    │   │   │   ├── qbench_dev.yaml
    │   │   │   ├── qbenchs_dev.yaml
    │   │   │   └── utils.py
    │   │   ├── realworldqa
    │   │   │   ├── realworldqa.yaml
    │   │   │   └── utils.py
    │   │   ├── refcoco+
    │   │   │   ├── _default_template_bbox_yaml
    │   │   │   ├── _default_template_seg_yaml
    │   │   │   ├── _generate_config.py
    │   │   │   ├── _refcoco.yaml
    │   │   │   ├── refcoco+_bbox_testA.yaml
    │   │   │   ├── refcoco+_bbox_testB.yaml
    │   │   │   ├── refcoco+_bbox_val.yaml
    │   │   │   ├── refcoco+_seg_testA.yaml
    │   │   │   ├── refcoco+_seg_testB.yaml
    │   │   │   ├── refcoco+_seg_val.yaml
    │   │   │   └── utils.py
    │   │   ├── refcoco
    │   │   │   ├── _default_template_bbox_yaml
    │   │   │   ├── _default_template_seg_yaml
    │   │   │   ├── _generate_config.py
    │   │   │   ├── _refcoco.yaml
    │   │   │   ├── refcoco_bbox_test.yaml
    │   │   │   ├── refcoco_bbox_testA.yaml
    │   │   │   ├── refcoco_bbox_testB.yaml
    │   │   │   ├── refcoco_bbox_val.yaml
    │   │   │   ├── refcoco_seg_test.yaml
    │   │   │   ├── refcoco_seg_testA.yaml
    │   │   │   ├── refcoco_seg_testB.yaml
    │   │   │   ├── refcoco_seg_val.yaml
    │   │   │   └── utils.py
    │   │   ├── refcocog
    │   │   │   ├── _default_template_bbox_yaml
    │   │   │   ├── _default_template_seg_yaml
    │   │   │   ├── _generate_config.py
    │   │   │   ├── _refcoco.yaml
    │   │   │   ├── refcocog_bbox_test.yaml
    │   │   │   ├── refcocog_bbox_val.yaml
    │   │   │   ├── refcocog_seg_test.yaml
    │   │   │   ├── refcocog_seg_val.yaml
    │   │   │   └── utils.py
    │   │   ├── scienceqa
    │   │   │   ├── scienceqa.yaml
    │   │   │   ├── scienceqa_full.yaml
    │   │   │   ├── scienceqa_img.yaml
    │   │   │   └── utils.py
    │   │   ├── screenspot
    │   │   │   ├── README.md
    │   │   │   ├── _default_template_rec_yaml
    │   │   │   ├── _default_template_reg_yaml
    │   │   │   ├── _screenspot.yaml
    │   │   │   ├── screenspot_rec_test.yaml
    │   │   │   ├── screenspot_reg_test.yaml
    │   │   │   ├── utils.py
    │   │   │   └── utils_rec.py
    │   │   ├── seedbench
    │   │   │   ├── seedbench.yaml
    │   │   │   ├── seedbench_ppl.yaml
    │   │   │   └── utils.py
    │   │   ├── seedbench_2
    │   │   │   ├── seedbench_2.yaml
    │   │   │   └── utils.py
    │   │   ├── stvqa
    │   │   │   ├── stvqa.yaml
    │   │   │   └── utils.py
    │   │   ├── synthdog
    │   │   │   ├── donut_evaluator.py
    │   │   │   ├── synthdog.yaml
    │   │   │   ├── synthdog_en.yaml
    │   │   │   ├── synthdog_zh.yaml
    │   │   │   └── utils.py
    │   │   ├── tempcompass
    │   │   │   ├── _default_template_yaml
    │   │   │   ├── _tempcompass.yaml
    │   │   │   ├── tempcompass_caption_matching.yaml
    │   │   │   ├── tempcompass_captioning.yaml
    │   │   │   ├── tempcompass_mc.yaml
    │   │   │   ├── tempcompass_yes_no.yaml
    │   │   │   └── utils.py
    │   │   ├── textcaps
    │   │   │   ├── _default_template_textcaps_yaml
    │   │   │   ├── textcaps.yaml
    │   │   │   ├── textcaps_test.yaml
    │   │   │   ├── textcaps_train.yaml
    │   │   │   ├── textcaps_val.yaml
    │   │   │   └── utils.py
    │   │   ├── textvqa
    │   │   │   ├── _default_template_textvqa_yaml
    │   │   │   ├── _textvqa.yaml
    │   │   │   ├── textvqa_test.yaml
    │   │   │   ├── textvqa_val.yaml
    │   │   │   └── utils.py
    │   │   ├── vatex
    │   │   │   ├── _vatex.yaml
    │   │   │   ├── utils.py
    │   │   │   ├── vatex_test.yaml
    │   │   │   └── vatex_val_zh.yaml
    │   │   ├── vcr_wiki
    │   │   │   ├── _default_template_vcr_yaml
    │   │   │   ├── utils.py
    │   │   │   ├── vcr_wiki_en_easy.yaml
    │   │   │   ├── vcr_wiki_en_easy_100.yaml
    │   │   │   ├── vcr_wiki_en_easy_500.yaml
    │   │   │   ├── vcr_wiki_en_hard.yaml
    │   │   │   ├── vcr_wiki_en_hard_100.yaml
    │   │   │   ├── vcr_wiki_en_hard_500.yaml
    │   │   │   ├── vcr_wiki_zh_easy.yaml
    │   │   │   ├── vcr_wiki_zh_easy_100.yaml
    │   │   │   ├── vcr_wiki_zh_easy_500.yaml
    │   │   │   ├── vcr_wiki_zh_hard.yaml
    │   │   │   ├── vcr_wiki_zh_hard_100.yaml
    │   │   │   └── vcr_wiki_zh_hard_500.yaml
    │   │   ├── video_detail_description
    │   │   │   ├── README.md
    │   │   │   ├── _default_template_yaml
    │   │   │   ├── utils.py
    │   │   │   └── video_detail_description.yaml
    │   │   ├── videochatgpt
    │   │   │   ├── _default_template_yaml
    │   │   │   ├── _videochatgpt.yaml
    │   │   │   ├── utils.py
    │   │   │   ├── videochatgpt_consistency.yaml
    │   │   │   ├── videochatgpt_generic.yaml
    │   │   │   └── videochatgpt_temporal.yaml
    │   │   ├── videomme
    │   │   │   ├── utils.py
    │   │   │   ├── videomme.yaml
    │   │   │   └── videomme_w_subtitle.yaml
    │   │   ├── vitatecs
    │   │   │   ├── _default_template_yaml
    │   │   │   ├── _vitatecs.yaml
    │   │   │   ├── utils.py
    │   │   │   ├── vitatecs_compositionality.yaml
    │   │   │   ├── vitatecs_direction.yaml
    │   │   │   ├── vitatecs_intensity.yaml
    │   │   │   ├── vitatecs_localization.yaml
    │   │   │   ├── vitatecs_sequence.yaml
    │   │   │   └── vitatecs_type.yaml
    │   │   ├── vizwiz_vqa
    │   │   │   ├── _default_template_vqa_yaml
    │   │   │   ├── _generate_config.py
    │   │   │   ├── _vizwiz_vqa.yaml
    │   │   │   ├── utils.py
    │   │   │   ├── vizwiz_vqa_test.yaml
    │   │   │   └── vizwiz_vqa_val.yaml
    │   │   ├── vqav2
    │   │   │   ├── _default_template_vqav2_yaml
    │   │   │   ├── _vqav2.yaml
    │   │   │   ├── utils.py
    │   │   │   ├── vqav2_test.yaml
    │   │   │   └── vqav2_val.yaml
    │   │   ├── websrc
    │   │   │   ├── README.md
    │   │   │   ├── utils.py
    │   │   │   ├── websrc.yaml
    │   │   │   ├── websrc_test.yaml
    │   │   │   └── websrc_val.yaml
    │   │   ├── wild_vision_bench
    │   │   │   ├── _default_template_yaml
    │   │   │   ├── utils.py
    │   │   │   └── wild_vision_bench0617.yaml
    │   │   ├── worldqa
    │   │   │   ├── _default_template_yaml
    │   │   │   ├── utils.py
    │   │   │   ├── worldqa.yaml
    │   │   │   ├── worldqa_generation.yaml
    │   │   │   ├── worldqa_mc.yaml
    │   │   │   ├── worldqa_mc_evaluator.py
    │   │   │   └── worldqa_mcppl.yaml
    │   │   └── youcook2
    │   │   │   ├── _default_template_yaml
    │   │   │   ├── utils.py
    │   │   │   └── youcook2_val.yaml
    │   └── utils.py
    ├── miscs
    │   ├── example_eval.yaml
    │   ├── llava_repr_requirements.txt
    │   ├── llava_result_check.md
    │   ├── llava_sglang_result_check.md
    │   ├── repr_scripts.sh
    │   ├── repr_torch_envs.txt
    │   ├── scienceqa_id.txt
    │   ├── script.sh
    │   ├── test_llava.py
    │   ├── test_scienceqa.py
    │   ├── tinyllava_repr_requirements.txt
    │   └── tinyllava_repr_scripts.sh
    ├── pyproject.toml
    ├── setup.py
    └── tools
    │   ├── get_video_avg_time.py
    │   ├── make_image_hf_dataset.ipynb
    │   ├── make_video_hf_dataset.ipynb
    │   └── makecvrr.ipynb
├── pyproject.toml
├── scripts
    ├── lmms-eval
    │   └── eval.sh
    ├── train
    │   ├── finetune_eval_7b_pmod_llava_1_5.sh
    │   └── finetune_eval_7b_pmod_llava_next.sh
    ├── upload_pypi.sh
    ├── zero1.json
    ├── zero2.json
    ├── zero3.json
    └── zero3_offload.json
└── util_scripts
    ├── clean_data_json.py
    ├── demo.py
    ├── download_llava-next_data.py
    └── setup_env.sh


/.dockerignore:
--------------------------------------------------------------------------------
 1 | # The .dockerignore file excludes files from the container build process.
 2 | #
 3 | # https://docs.docker.com/engine/reference/builder/#dockerignore-file
 4 | 
 5 | # Exclude Git files
 6 | .git
 7 | .github
 8 | .gitignore
 9 | 
10 | # Exclude Python cache files
11 | __pycache__
12 | .mypy_cache
13 | .pytest_cache
14 | .ruff_cache
15 | 
16 | # Exclude Python virtual environment
17 | /venv
18 | 
19 | # Exclude some weights
20 | /openai
21 | /liuhaotian
22 | 


--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
 1 | root = true
 2 | 
 3 | # Unix-style newlines with a newline ending every file
 4 | [*]
 5 | end_of_line = lf
 6 | insert_final_newline = true
 7 | trim_trailing_whitespace = true
 8 | charset = utf-8
 9 | 
10 | # 4 space indentation
11 | [*.{py,json}]
12 | indent_style = space
13 | indent_size = 4
14 | 
15 | # 2 space indentation
16 | [*.{md,sh,yaml,yml}]
17 | indent_style = space
18 | indent_size = 2


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
 1 | # https://git-scm.com/docs/gitattributes
 2 | 
 3 | # Set the default behavior, in case people don't have core.autocrlf set.
 4 | # https://git-scm.com/docs/gitattributes#_end_of_line_conversion
 5 | * text=auto
 6 | 
 7 | # common python attributes, taken from https://github.com/alexkaratarakis/gitattributes/blob/710900479a2bedeec7003d381719521ffbb18bf8/Python.gitattributes
 8 | # Source files
 9 | # ============
10 | *.pxd    text diff=python
11 | *.py     text diff=python
12 | *.py3    text diff=python
13 | *.pyw    text diff=python
14 | *.pyx    text diff=python
15 | *.pyz    text diff=python
16 | *.pyi    text diff=python
17 | 
18 | # Binary files
19 | # ============
20 | *.db     binary
21 | *.p      binary
22 | *.pkl    binary
23 | *.pickle binary
24 | *.pyc    binary export-ignore
25 | *.pyo    binary export-ignore
26 | *.pyd    binary
27 | 
28 | # Jupyter notebook
29 | *.ipynb  text eol=lf
30 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/1-usage.yaml:
--------------------------------------------------------------------------------
 1 | name: Usage issues
 2 | description: Report issues in usage.
 3 | title: "[Usage] "
 4 | body:
 5 |   - type: markdown
 6 |     attributes:
 7 |       value: |
 8 |         Thanks for taking the time to fill out this form.  Please give as detailed a description as possible so that we can better assist with the issue :)
 9 |   - type: textarea
10 |     id: what-happened
11 |     attributes:
12 |       label: Describe the issue
13 |       description: Please give as detailed a description as possible so that we can better assist with the issue.  Please paste the **FULL** error log here, so that we can better understand the issue. Wrap the log with ``` for better readability in GitHub.
14 |       placeholder: Issue
15 |       value: |
16 |         Issue:
17 |         
18 |         Command:
19 |         ```
20 |         PASTE THE COMMANDS HERE.
21 |         ```
22 |         
23 |         Log: 
24 |         ```
25 |         PASTE THE LOGS HERE.
26 |         ```
27 |         
28 |         Screenshots:
29 |         You may attach screenshots if it better explains the issue.
30 |     validations:
31 |       required: true
32 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/2-feature-request.yaml:
--------------------------------------------------------------------------------
 1 | name: Feature Request
 2 | description: Request for a new feature
 3 | title: "[Feature request] "
 4 | body:
 5 |   - type: markdown
 6 |     attributes:
 7 |       value: |
 8 |         Thanks for your interest in our work.  Please share your thoughts on the new feature below.
 9 |   - type: textarea
10 |     id: feature
11 |     attributes:
12 |       label: feature
13 |       placeholder: Start your thoughts here...


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/3-question.yaml:
--------------------------------------------------------------------------------
 1 | name: Questions
 2 | description: General questions about the work
 3 | title: "[Question] "
 4 | body:
 5 |   - type: markdown
 6 |     attributes:
 7 |       value: |
 8 |         Thanks for your interest in our work.  For this type of question, it may be more suitable to go to the [discussions](https://github.com/haotian-liu/LLaVA/discussions) section.  If you believe an issue would be better for your request, please continue your post below :)
 9 |   - type: textarea
10 |     id: question
11 |     attributes:
12 |       label: Question
13 |       placeholder: Start question here...


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/4-discussion.yaml:
--------------------------------------------------------------------------------
 1 | name: Discussions
 2 | description: General discussions about the work
 3 | title: "[Discussion] "
 4 | body:
 5 |   - type: markdown
 6 |     attributes:
 7 |       value: |
 8 |         Thanks for your interest in our work.  For this type of question, it may be more suitable to go to the [discussions](https://github.com/haotian-liu/LLaVA/discussions) section.  If you believe an issue would be better for your request, please continue your post below :)
 9 |   - type: textarea
10 |     id: discussion
11 |     attributes:
12 |       label: Discussion
13 |       placeholder: Start discussion here...


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Python
 2 | __pycache__
 3 | *.pyc
 4 | *.egg-info
 5 | dist
 6 | 
 7 | # Log
 8 | *.log
 9 | *.log.*
10 | *.json
11 | *.jsonl
12 | 
13 | # Data
14 | !**/alpaca-data-conversation.json
15 | 
16 | # Editor
17 | .idea
18 | *.swp
19 | 
20 | # Other
21 | .DS_Store
22 | wandb
23 | output
24 | 
25 | checkpoints
26 | ckpts*
27 | 
28 | .ipynb_checkpoints
29 | # *.ipynb
30 | 
31 | # DevContainer
32 | !.devcontainer/*
33 | 
34 | # Demo
35 | serve_images/
36 | 
37 | # Ignore training and evaluation data
38 | playground/data/*
39 | !playground/data/coco2014_val_qa_eval
40 | !playground/data/prompts
41 | !playground/data/coco2014_val_gpt4_qa_30x3.jsonl
42 | 
43 | # VSCode jsons
44 | #!.vscode/launch.json
45 | #!.vscode/settings.json
46 | 
47 | # deepspeed zero{1,2,3}.json config files
48 | !scripts/*.json
49 | 


--------------------------------------------------------------------------------
/img/p-mod.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MCG-NJU/p-MoD/30c16ecf45e119142a703958c8b1ec349e3891f2/img/p-mod.png


--------------------------------------------------------------------------------
/img/table1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MCG-NJU/p-MoD/30c16ecf45e119142a703958c8b1ec349e3891f2/img/table1.png


--------------------------------------------------------------------------------
/img/table2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MCG-NJU/p-MoD/30c16ecf45e119142a703958c8b1ec349e3891f2/img/table2.png


--------------------------------------------------------------------------------
/llava/__init__.py:
--------------------------------------------------------------------------------
1 | from .model import PmodLlavaLlamaForCausalLM
2 | 


--------------------------------------------------------------------------------
/llava/constants.py:
--------------------------------------------------------------------------------
 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30  # NOTE(review): unit not shown here — presumably seconds; confirm in the controller
 2 | WORKER_HEART_BEAT_INTERVAL = 15  # NOTE(review): unit not shown here — presumably seconds; confirm in the worker
 3 | 
 4 | LOGDIR = "."  # "." is relative, so it resolves against the current working directory
 5 | 
 6 | # Model Constants
 7 | IGNORE_INDEX = -100  # NOTE(review): equals the default ignore_index of torch.nn.CrossEntropyLoss — confirm it is used as a label mask
 8 | IMAGE_TOKEN_INDEX = -200  # negative sentinel id; presumably marks image positions in token id sequences — confirm against callers
 9 | DEFAULT_IMAGE_TOKEN = "<image>"  # presumably the textual image marker in prompts — confirm against callers
10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
11 | DEFAULT_IM_START_TOKEN = "<im_start>"
12 | DEFAULT_IM_END_TOKEN = "<im_end>"
13 | IMAGE_PLACEHOLDER = "<image-placeholder>"
14 | 


--------------------------------------------------------------------------------
/llava/model/__init__.py:
--------------------------------------------------------------------------------
1 | from .language_model.pmod_llava_llama import PmodLlavaLlamaForCausalLM, PmodLlavaConfig
2 | 


--------------------------------------------------------------------------------
/llava/model/consolidate.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Usage:
 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate
 4 | """
 5 | import argparse
 6 | 
 7 | import torch
 8 | from transformers import AutoTokenizer, AutoModelForCausalLM
 9 | from llava.model import *
10 | from llava.model.utils import auto_upgrade
11 | 
12 | 
13 | def consolidate_ckpt(src_path, dst_path):
14 |     print("Loading model")
15 |     auto_upgrade(src_path)
16 |     src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
17 |     src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False)
18 |     src_model.save_pretrained(dst_path)
19 |     src_tokenizer.save_pretrained(dst_path)
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     parser = argparse.ArgumentParser()
24 |     parser.add_argument("--src", type=str, required=True)
25 |     parser.add_argument("--dst", type=str, required=True)
26 | 
27 |     args = parser.parse_args()
28 | 
29 |     consolidate_ckpt(args.src, args.dst)
30 | 


--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/builder.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2
 3 | 
 4 | 
 5 | def build_vision_tower(vision_tower_cfg, **kwargs):
 6 |     vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
 7 |     is_absolute_path_exists = os.path.exists(vision_tower)
 8 |     use_s2 = getattr(vision_tower_cfg, 's2', False)
 9 |     if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower:
10 |         if use_s2:
11 |             return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs)
12 |         else:
13 |             return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
14 | 
15 |     raise ValueError(f'Unknown vision tower: {vision_tower}')
16 | 


--------------------------------------------------------------------------------
/llava/model/utils.py:
--------------------------------------------------------------------------------
 1 | from transformers import AutoConfig
 2 | 
 3 | 
 4 | def auto_upgrade(config):
 5 |     cfg = AutoConfig.from_pretrained(config)
 6 |     if 'llava' in config and 'llava' not in cfg.model_type:
 7 |         assert cfg.model_type == 'llama'
 8 |         print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.")
 9 |         print("You must upgrade the checkpoint to the new code base (this can be done automatically).")
10 |         confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]")
11 |         if confirm.lower() in ["y", "yes"]:
12 |             print("Upgrading checkpoint...")
13 |             assert len(cfg.architectures) == 1
14 |             setattr(cfg.__class__, "model_type", "llava")
15 |             cfg.architectures[0] = 'LlavaLlamaForCausalLM'
16 |             cfg.save_pretrained(config)
17 |             print("Checkpoint upgraded.")
18 |         else:
19 |             print("Checkpoint upgrade aborted.")
20 |             exit(1)
21 | 


--------------------------------------------------------------------------------
/llava/serve/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MCG-NJU/p-MoD/30c16ecf45e119142a703958c8b1ec349e3891f2/llava/serve/__init__.py


--------------------------------------------------------------------------------
/llava/serve/examples/extreme_ironing.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MCG-NJU/p-MoD/30c16ecf45e119142a703958c8b1ec349e3891f2/llava/serve/examples/extreme_ironing.jpg


--------------------------------------------------------------------------------
/llava/serve/examples/waterview.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MCG-NJU/p-MoD/30c16ecf45e119142a703958c8b1ec349e3891f2/llava/serve/examples/waterview.jpg


--------------------------------------------------------------------------------
/llava/serve/register_worker.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Manually register workers.
 3 | 
 4 | Usage:
 5 | python3 -m llava.serve.register_worker --controller-address http://localhost:21001 --worker-name http://localhost:21002
 6 | """
 7 | 
 8 | import argparse
 9 | 
10 | import requests
11 | 
12 | if __name__ == "__main__":
13 |     parser = argparse.ArgumentParser()
14 |     parser.add_argument("--controller-address", type=str)
15 |     parser.add_argument("--worker-name", type=str)
16 |     parser.add_argument("--check-heart-beat", action="store_true")
17 |     args = parser.parse_args()
18 | 
19 |     url = args.controller_address + "/register_worker"
20 |     data = {
21 |         "worker_name": args.worker_name,
22 |         "check_heart_beat": args.check_heart_beat,
23 |         "worker_status": None,
24 |     }
25 |     r = requests.post(url, json=data)
26 |     assert r.status_code == 200
27 | 


--------------------------------------------------------------------------------
/llava/train/train_mem.py:
--------------------------------------------------------------------------------
1 | from llava.train.train import train
2 | 
3 | if __name__ == "__main__":  # script entry point
4 |     train(attn_implementation="flash_attention_2")  # pass the attention implementation name through to train()
5 | 


--------------------------------------------------------------------------------
/llava/train/train_xformers.py:
--------------------------------------------------------------------------------
 1 | # Make it more memory efficient by monkey patching the LLaMA model with xformers attention.
 2 | 
 3 | # Need to call this before importing transformers.
 4 | from llava.train.llama_xformers_attn_monkey_patch import (
 5 |     replace_llama_attn_with_xformers_attn,
 6 | )
 7 | 
 8 | replace_llama_attn_with_xformers_attn()  # runs at import time, before the train import below
 9 | 
10 | from llava.train.train import train  # deliberately placed after the patch call; do not hoist
11 | 
12 | if __name__ == "__main__":  # script entry point
13 |     train()
14 | 


--------------------------------------------------------------------------------
/lmms-eval/.github/issue_template.md:
--------------------------------------------------------------------------------
1 | Before you open an issue, please check if a similar issue already exists or has been closed before.
2 | 
3 | ### When you open an issue, please be sure to include the following
4 | 
5 | - [ ] A descriptive title: [xxx] XXXX
6 | - [ ] A detailed description
7 | 
8 | Thank you for your contributions!
9 | 


--------------------------------------------------------------------------------
/lmms-eval/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | Before you open a pull-request, please check if a similar pull-request already exists or has been closed before.
2 | 
3 | ### When you open a pull-request, please be sure to include the following
4 | 
5 | - [ ] A descriptive title: [xxx] XXXX
6 | - [ ] A detailed description
7 | 
8 | Thank you for your contributions!
9 | 


--------------------------------------------------------------------------------
/lmms-eval/.github/workflows/black.yml:
--------------------------------------------------------------------------------
 1 | name: Lint
 2 | 
 3 | on: [push, pull_request]
 4 | 
 5 | jobs:
 6 |   lint:
 7 |     runs-on: ubuntu-latest
 8 |     steps:
 9 |       - uses: actions/checkout@v3
10 |       - name: Set up Python
11 |         uses: actions/setup-python@v4
12 |         with:
13 |           python-version: '3.9'
14 |       - name: Install specific version of Black
15 |         run: pip install black==23.9.1
16 |       - name: Run Black
17 |         run: black --line-length=240 ./


--------------------------------------------------------------------------------
/lmms-eval/.gitignore:
--------------------------------------------------------------------------------
 1 | env
 2 | *.pyc
 3 | output/
 4 | data/
 5 | lm_cache
 6 | .idea
 7 | build
 8 | dist
 9 | *.egg-info
10 | venv
11 | .vscode/
12 | temp
13 | __pycache__
14 | .ipynb_checkpoints
15 | temp
16 | # IPython
17 | profile_default/
18 | ipython_config.py
19 | logs/
20 | scripts/
21 | wandb/
22 | SimSun.ttf
23 | submissions/
24 | lmms_eval/tasks/hallusion_bench/hallusion_output_vs_model.json
25 | lmms_eval/tasks/hallusion_bench/hallusion_output_vd_model.json
26 | zk.log
27 | cache_dir
28 | ckpt
29 | pretrained/
30 | LLaVA/
31 | *logs
32 | temp/
33 | InternVL/
34 | logs/
35 | data/
36 | llava-video/
37 | Video-MME/
38 | VATEX/
39 | lmms_eval/tasks/vatex/__pycache__/utils.cpython-310.pyc
40 | 


--------------------------------------------------------------------------------
/lmms-eval/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 |   - repo: https://github.com/psf/black
3 |     rev: 23.12.1
4 |     hooks:
5 |       - id: black
6 |         language_version: python3


--------------------------------------------------------------------------------
/lmms-eval/docs/README.md:
--------------------------------------------------------------------------------
 1 | # LMMs Eval Documentation
 2 | 
 3 | Welcome to the docs for `lmms-eval`!
 4 | 
 5 | The majority of this documentation is adapted from [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness/).
 6 | 
 7 | ## Table of Contents
 8 | 
 9 | * To learn about the command line flags, see the [commands](commands.md).
10 | * To learn how to add a new model, see the [Model Guide](model_guide.md).
11 | * For a crash course on adding new tasks to the library, see our [Task Guide](task_guide.md).
12 | * If you need to upload your datasets in the correct HF format with dataset viewer support, please refer to [tools](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/pufanyi/hf_dataset_docs/tools).
13 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MCG-NJU/p-MoD/30c16ecf45e119142a703958c8b1ec349e3891f2/lmms-eval/lmms_eval/__init__.py


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/api/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MCG-NJU/p-MoD/30c16ecf45e119142a703958c8b1ec349e3891f2/lmms-eval/lmms_eval/api/__init__.py


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/api/instance.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass, field
 2 | from typing import Literal, Tuple
 3 | 
 4 | 
 5 | @dataclass
 6 | class Instance:
 7 |     request_type: Literal["loglikelihood", "generate_until"]
 8 |     arguments: tuple
 9 |     idx: int
10 |     metadata: Tuple[str, int, int] = field(default_factory=lambda: (None, None, None))  # TODO: better typehints here
11 |     resps: list = field(default_factory=list)
12 |     filtered_resps: dict = field(default_factory=dict)
13 | 
14 |     # initialized after init
15 |     task_name: str = None
16 |     doc_id: str = None
17 |     repeats: str = None
18 |     doc: dict = None
19 | 
20 |     def __post_init__(self) -> None:
21 |         # unpack metadata field
22 |         self.task_name, self.doc_id, self.repeats = self.metadata
23 | 
24 |     @property
25 |     def args(self):
26 |         """
27 |         Returns (string,) where `string` is the string to calculate loglikelihood over
28 |         """
29 |         return self.arguments if isinstance(self.arguments, tuple) else (self.arguments,)
30 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/filters/decontamination.py:
--------------------------------------------------------------------------------
 1 | from lmms_eval.api.filter import Filter
 2 | 
 3 | 
 4 | class DecontaminationFilter(Filter):
 5 |     """
 6 |     Placeholder filter named "track_decontamination"; `apply` is not implemented yet.
 7 |     """
 8 | 
 9 |     name = "track_decontamination"
10 | 
11 |     def __init__(self, path) -> None:
12 |         """
13 |         `path` is accepted but currently unused; only `_decontam_results` is initialised (to None).
14 |         TODO: make sure only ever run one time on the train set (should this be cached as a class var? keyed by value for "path").
15 |         should further cache result on a given (task_name, doc_id)
16 |         """
17 |         self._decontam_results = None
18 | 
19 |     def apply(self, resps, docs) -> None:
20 |         """
21 |         Stub that currently returns None. Intended to return {"no_contamination", "only_contamination"} keys for the 2 different subsets.
22 |         """
23 |         pass
24 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/models/model_utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MCG-NJU/p-MoD/30c16ecf45e119142a703958c8b1ec349e3891f2/lmms-eval/lmms_eval/models/model_utils/__init__.py


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/models/video_chatgpt/__init__.py:
--------------------------------------------------------------------------------
1 | from .model import VideoChatGPTLlamaForCausalLM
2 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/models/video_chatgpt/constants.py:
--------------------------------------------------------------------------------
 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30  # presumably seconds without a heartbeat before a worker expires — TODO confirm
 2 | WORKER_HEART_BEAT_INTERVAL = 15  # presumably seconds between worker heartbeats — TODO confirm
 3 | 
 4 | LOGDIR = "."  # "." is the current working directory
 5 | 
 6 | 
 7 | # Defining model
 8 | DEFAULT_VIDEO_TOKEN = "<video>"  # NOTE(review): presumably the prompt marker for a video input — confirm against the model code
 9 | DEFAULT_VIDEO_PATCH_TOKEN = "<vid_patch>"
10 | DEFAULT_VID_START_TOKEN = "<vid_start>"
11 | DEFAULT_VID_END_TOKEN = "<vid_end>"
12 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/models/video_chatgpt/eval/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MCG-NJU/p-MoD/30c16ecf45e119142a703958c8b1ec349e3891f2/lmms-eval/lmms_eval/models/video_chatgpt/eval/__init__.py


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/models/video_chatgpt/model/__init__.py:
--------------------------------------------------------------------------------
1 | from lmms_eval.models.video_chatgpt.model.video_chatgpt import VideoChatGPTLlamaForCausalLM, VideoChatGPTConfig
2 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/models/video_chatgpt/model/consolidate.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Usage:
 3 | python3 -m lmms_eval.models.video_chatgpt.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate
 4 | """
 5 | 
 6 | import argparse
 7 | 
 8 | import torch
 9 | from transformers import AutoTokenizer, AutoModelForCausalLM
10 | from lmms_eval.models.video_chatgpt.model import *
11 | 
12 | 
13 | def consolidate_ckpt(src_path, dst_path):
14 |     print("Loading model")
15 |     src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
16 |     src_tokenizer = AutoTokenizer.from_pretrained(src_path)
17 |     src_model.save_pretrained(dst_path)
18 |     src_tokenizer.save_pretrained(dst_path)
19 | 
20 | 
21 | if __name__ == "__main__":
22 |     parser = argparse.ArgumentParser()
23 |     parser.add_argument("--src", type=str, required=True)
24 |     parser.add_argument("--dst", type=str, required=True)
25 | 
26 |     args = parser.parse_args()
27 | 
28 |     consolidate_ckpt(args.src, args.dst)
29 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/models/video_chatgpt/model/utils.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from lmms_eval.models.video_chatgpt.model import *
 3 | from transformers import StoppingCriteria
 4 | 
 5 | 
 6 | class KeywordsStoppingCriteria(StoppingCriteria):
 7 |     def __init__(self, keywords, tokenizer, input_ids):
 8 |         self.keywords = keywords
 9 |         self.keyword_ids = [tokenizer(keyword).input_ids for keyword in keywords]
10 |         self.keyword_ids = [keyword_id[0] for keyword_id in self.keyword_ids if type(keyword_id) is list and len(keyword_id) == 1]
11 |         self.tokenizer = tokenizer
12 |         self.start_len = None
13 |         self.input_ids = input_ids
14 | 
15 |     def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
16 |         if self.start_len is None:
17 |             self.start_len = self.input_ids.shape[1]
18 |         else:
19 |             for keyword_id in self.keyword_ids:
20 |                 if output_ids[0, -1] == keyword_id:
21 |                     return True
22 |             outputs = self.tokenizer.batch_decode(output_ids[:, self.start_len :], skip_special_tokens=True)[0]
23 |             for keyword in self.keywords:
24 |                 if keyword in outputs:
25 |                     return True
26 |         return False
27 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/models/video_chatgpt/utils.py:
--------------------------------------------------------------------------------
1 | def disable_torch_init():
2 |     """
3 |     Disable the redundant torch default initialization to accelerate model creation.
4 |     """
5 |     import torch
6 | 
7 |     setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
8 |     setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
9 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/_task_utils/file_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | 
4 | def generate_submission_file(file_name, args, subpath="submissions"):
5 |     path = os.path.join(args.output_path, subpath)
6 |     os.makedirs(path, exist_ok=True)
7 |     path = os.path.join(path, file_name)
8 |     return os.path.abspath(path)
9 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/_task_utils/gpt_eval_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MCG-NJU/p-MoD/30c16ecf45e119142a703958c8b1ec349e3891f2/lmms-eval/lmms_eval/tasks/_task_utils/gpt_eval_utils.py


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/_task_utils/video_loader.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | 
 4 | def get_cache_dir(config, sub_dir="videos"):
 5 |     HF_HOME = os.environ["HF_HOME"]
 6 |     cache_dir = config["dataset_kwargs"]["cache_dir"]
 7 |     cache_dir = os.path.join(HF_HOME, cache_dir)
 8 |     cache_dir = os.path.join(cache_dir, sub_dir)
 9 |     return cache_dir
10 | 
11 | 
12 | def _get_video_file(prefix: str, video_name: str, suffix: str):
13 |     if not isinstance(video_name, str):
14 |         video_name = str(video_name)
15 |     if not video_name.endswith(suffix):
16 |         video_name = f"{video_name}.{suffix}"
17 |     video_path = os.path.join(prefix, video_name)
18 |     return video_path
19 | 
20 | 
21 | def get_video(prefix: str, video_name: str, suffix: str = "mp4"):
22 |     tried = [os.path.abspath(_get_video_file(prefix, video_name, suffix)), os.path.abspath(_get_video_file(prefix, video_name, suffix.upper())), os.path.abspath(_get_video_file(prefix, video_name, suffix.lower()))]
23 |     for video_path in tried:
24 |         if os.path.exists(video_path):
25 |             return video_path
26 |     raise FileNotFoundError(f"Tried both {tried} but none of them exist, please check")
27 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/activitynetqa/_default_template_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/ActivityNetQA
 2 | dataset_kwargs:
 3 |   token: True
 4 |   video: True
 5 |   force_download: False
 6 |   local_files_only: False
 7 |   cache_dir: activitynetqa
 8 | model_specific_prompt_kwargs:
 9 |   default:
10 |     pre_prompt: ""
11 |     post_prompt: " Answer the question using a single word or phrase."
12 | 
13 | metadata:
14 |   version: 0.0
15 |   gpt_eval_model_name: gpt-3.5-turbo-0613


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/activitynetqa/activitynetqa_generation.yaml:
--------------------------------------------------------------------------------
 1 | task: "activitynetqa"
 2 | test_split: test
 3 | output_type: generate_until
 4 | doc_to_visual: !function utils.activitynetqa_doc_to_visual
 5 | doc_to_text: !function utils.activitynetqa_doc_to_text
 6 | doc_to_target: !function utils.activitynetqa_doc_to_answer
 7 | process_results: !function utils.activitynetqa_process_results # gpt eval here for each QA pairs
 8 | metric_list:
 9 |   - metric: gpt_eval_score
10 |     aggregation: !function utils.activitynetqa_aggregate_score # parse scores from each QA pairs
11 |     higher_is_better: true
12 |   - metric: gpt_eval_accuracy
13 |     aggregation: !function utils.activitynetqa_aggregate_accuracy # parse accuracy from each QA pairs
14 |     higher_is_better: true
15 | 
16 | include: _default_template_yaml
17 | 
18 | generation_kwargs:
19 |   until:
20 |     - "ASSISTANT:"
21 |   image_aspect_ratio: original
22 |   max_new_tokens: 64
23 |   temperature: 0
24 |   top_p: 1.0
25 |   num_beams: 1
26 |   do_sample: false
27 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/chartqa/chartqa.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/ChartQA
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "chartqa"
 5 | test_split: test
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.chartqa_doc_to_visual
 8 | doc_to_text: !function utils.chartqa_doc_to_text
 9 | doc_to_target: "answer"
10 | generation_kwargs:
11 |   max_new_tokens: 16
12 |   temperature: 0
13 |   do_sample: False
14 | process_results: !function utils.chartqa_process_results
15 | metric_list:
16 |   - metric: relaxed_overall
17 |     aggregation: mean
18 |     higher_is_better: true
19 |   - metric: relaxed_human_split
20 |     aggregation: mean
21 |     higher_is_better: true
22 |   - metric: relaxed_augmented_split
23 |     aggregation: mean
24 |     higher_is_better: true
25 | metadata:
26 |   - version: 0.0
27 | model_specific_prompt_kwargs:
28 |   default:
29 |     pre_prompt: ""
30 |     post_prompt: "\nAnswer the question with a single word."
31 |   qwen_vl:
32 |     pre_prompt: ""
33 |     post_prompt: " Answer:"
34 | 
35 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/cmmmu/_cmmmu.yaml:
--------------------------------------------------------------------------------
1 | group: cmmmu
2 | task:
3 | - cmmmu_val
4 | - cmmmu_test
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/cmmmu/_default_template_cmmmu_yaml:
--------------------------------------------------------------------------------
1 | dataset_path: lmms-lab/CMMMU
2 | output_type: generate_until
3 | doc_to_visual: !function utils.cmmmu_doc_to_visual
4 | doc_to_text: !function utils.cmmmu_doc_to_text
5 | doc_to_target: "answer"
6 | generation_kwargs:
7 |   max_new_tokens: 16
8 |   image_aspect_ratio: original


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/cmmmu/cmmmu_test.yaml:
--------------------------------------------------------------------------------
 1 | task: "cmmmu_test"
 2 | test_split: test
 3 | # The return value of process_results will be used by metrics
 4 | process_results: !function utils.cmmmu_process_test_results_for_submission
 5 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
 6 | metric_list:
 7 |   - metric: submission
 8 |     aggregation: !function utils.cmmmu_test_aggregate_results_for_submission
 9 |     higher_is_better: false
10 | metadata:
11 |   - version: 0.0
12 | include: _default_template_cmmmu_yaml
13 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/cmmmu/cmmmu_val.yaml:
--------------------------------------------------------------------------------
 1 | task: "cmmmu_val"
 2 | test_split: val
 3 | # The return value of process_results will be used by metrics
 4 | process_results: !function utils.cmmmu_process_results
 5 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
 6 | generation_kwargs:
 7 |   max_new_tokens: 16
 8 |   image_aspect_ratio: original
 9 | metric_list:
10 |   - metric: cmmmu_acc
11 |     aggregation: !function utils.cmmmu_aggregate_results
12 |     higher_is_better: true
13 | metadata:
14 |   - version: 0.0
15 | include: _default_template_cmmmu_yaml
16 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/coco_cap/coco2014_cap.yaml:
--------------------------------------------------------------------------------
1 | group : coco2014_cap
2 | task:
3 |   - coco2014_cap_val
4 |   - coco2014_cap_test


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/coco_cap/coco2014_cap_test.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/COCO-Caption
 2 | dataset_kwargs:
 3 |   token: True
 4 | task : "coco2014_cap_test"
 5 | group : "coco_caption"
 6 | test_split: test
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.coco_doc_to_visual
 9 | doc_to_text: "Provide a one-sentence caption for the provided image."
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   max_new_tokens: 128
13 |   temperature: 0
14 |   top_p: 1.0
15 |   num_beams: 1
16 |   do_sample: false
17 | process_results: !function utils.coco_test_process_result
18 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
19 | metric_list:
20 |   - metric: coco_passthrough 
21 |     aggregation : !function utils.coco_test_aggregation_result
22 |     higher_is_better : true
23 | metadata:
24 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/coco_cap/coco2017_cap.yaml:
--------------------------------------------------------------------------------
1 | group : coco2017_cap
2 | task:
3 |   - coco2017_cap_val
4 |   - coco2017_cap_test


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/coco_cap/coco2017_cap_test.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/COCO-Caption2017
 2 | dataset_kwargs:
 3 |   token: True
 4 | task : "coco2017_cap_test"
 5 | group : "coco_caption2017"
 6 | test_split: test
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.coco_doc_to_visual
 9 | doc_to_text: !function utils.coco_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   max_new_tokens: 128
13 |   temperature: 0
14 |   top_p: 1.0
15 |   num_beams: 1
16 |   do_sample: false
17 | process_results: !function utils.coco_test_process_result
18 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
19 | metric_list:
20 |   - metric: coco_passthrough 
21 |     aggregation : !function utils.coco_test_aggregation_result
22 |     higher_is_better : true
23 | metadata:
24 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/coco_cap/coco_cap.yaml:
--------------------------------------------------------------------------------
1 | group : coco_cap
2 | task:
3 |   - coco2014_cap_val
4 |   - coco2014_cap_test
5 |   - coco2017_cap_val
6 |   - coco2017_cap_test
7 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/conbench/conbench.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: ConBench/ConBench_D
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "ConBench"
 5 | test_split: test
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.conbench_doc_to_visual
 8 | doc_to_text: !function utils.conbench_doc_to_text
 9 | doc_to_target: "answer"
10 | generation_kwargs:
11 |   max_new_tokens: 1024
12 |   temperature: 0.2
13 |   top_p: 0
14 |   num_beams: 1
15 |   do_sample: True
16 | # The return value of process_results will be used by metrics
17 | process_results: !function utils.conbench_process_results
18 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
19 | metric_list:
20 |   - metric: ConScore_D
21 |     aggregation: !function utils.conbench_aggregate_results
22 |     higher_is_better: true
23 | metadata:
24 |   - version: 0.0
25 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/cvrr/_cvrr.yaml:
--------------------------------------------------------------------------------
 1 | group: cvrr
 2 | task:
 3 | - cvrr_continuity_and_object_instance_count
 4 | - cvrr_fine_grained_action_understanding
 5 | - cvrr_interpretation_of_social_context
 6 | - cvrr_interpretation_of_visual_context
 7 | - cvrr_multiple_actions_in_a_single_video
 8 | - cvrr_non_existent_actions_with_existent_scene_depictions
 9 | - cvrr_non_existent_actions_with_non_existent_scene_depictions
10 | - cvrr_partial_actions
11 | - cvrr_time_order_understanding
12 | - cvrr_understanding_emotional_context
13 | - cvrr_unusual_and_physically_anomalous_activities
14 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/cvrr/_default_template_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/CVRR-ES
 2 | dataset_kwargs:
 3 |   token: True
 4 |   video: True
 5 |   cache_dir: cvrr-es
 6 | model_specific_prompt_kwargs:
 7 |   default:
 8 |     pre_prompt: ""
 9 |     post_prompt: ""
10 | 
11 | metadata:
12 |   version: 0.0
13 |   gpt_eval_model_name: gpt-3.5-turbo-0125


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/cvrr/cvrr_fine_grained_action_understanding.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "fine_grained_action_understanding"
 2 | task: "cvrr_fine_grained_action_understanding"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.cvrr_doc_to_visual
 6 | doc_to_text: !function utils.cvrr_doc_to_text
 7 | doc_to_target: !function utils.cvrr_doc_to_answer
 8 | process_results: !function utils.cvrr_process_results
 9 | metric_list:
10 |   - metric: gpt_eval_accuracy
11 |     aggregation: !function utils.cvrr_aggregate_accuracy
12 |     higher_is_better: true
13 |   - metric: gpt_eval_score
14 |     aggregation: !function utils.cvrr_aggregate_score
15 |     higher_is_better: true
16 | include: _default_template_yaml
17 | 
18 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/cvrr/cvrr_interpretation_of_social_context.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "interpretation_of_social_context"
 2 | task: "cvrr_interpretation_of_social_context"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.cvrr_doc_to_visual
 6 | doc_to_text: !function utils.cvrr_doc_to_text
 7 | doc_to_target: !function utils.cvrr_doc_to_answer
 8 | process_results: !function utils.cvrr_process_results
 9 | metric_list:
10 |   - metric: gpt_eval_accuracy
11 |     aggregation: !function utils.cvrr_aggregate_accuracy
12 |     higher_is_better: true
13 |   - metric: gpt_eval_score
14 |     aggregation: !function utils.cvrr_aggregate_score
15 |     higher_is_better: true
16 | include: _default_template_yaml
17 | 
18 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/cvrr/cvrr_interpretation_of_visual_context.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "interpretation_of_visual_context"
 2 | task: "cvrr_interpretation_of_visual_context"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.cvrr_doc_to_visual
 6 | doc_to_text: !function utils.cvrr_doc_to_text
 7 | doc_to_target: !function utils.cvrr_doc_to_answer
 8 | process_results: !function utils.cvrr_process_results
 9 | metric_list:
10 |   - metric: gpt_eval_accuracy
11 |     aggregation: !function utils.cvrr_aggregate_accuracy
12 |     higher_is_better: true
13 |   - metric: gpt_eval_score
14 |     aggregation: !function utils.cvrr_aggregate_score
15 |     higher_is_better: true
16 | include: _default_template_yaml
17 | 
18 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/cvrr/cvrr_multiple_actions_in_a_single_video.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "multiple_actions_in_a_single_video"
 2 | task: "cvrr_multiple_actions_in_a_single_video"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.cvrr_doc_to_visual
 6 | doc_to_text: !function utils.cvrr_doc_to_text
 7 | doc_to_target: !function utils.cvrr_doc_to_answer
 8 | process_results: !function utils.cvrr_process_results
 9 | metric_list:
10 |   - metric: gpt_eval_accuracy
11 |     aggregation: !function utils.cvrr_aggregate_accuracy
12 |     higher_is_better: true
13 |   - metric: gpt_eval_score
14 |     aggregation: !function utils.cvrr_aggregate_score
15 |     higher_is_better: true
16 | include: _default_template_yaml
17 | 
18 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/cvrr/cvrr_non_existent_actions_with_existent_scene_depictions.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "non_existent_actions_with_existent_scene_depictions"
 2 | task: "cvrr_non_existent_actions_with_existent_scene_depictions"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.cvrr_doc_to_visual
 6 | doc_to_text: !function utils.cvrr_doc_to_text
 7 | doc_to_target: !function utils.cvrr_doc_to_answer
 8 | process_results: !function utils.cvrr_process_results
 9 | metric_list:
10 |   - metric: gpt_eval_accuracy
11 |     aggregation: !function utils.cvrr_aggregate_accuracy
12 |     higher_is_better: true
13 |   - metric: gpt_eval_score
14 |     aggregation: !function utils.cvrr_aggregate_score
15 |     higher_is_better: true
16 | include: _default_template_yaml
17 | 
18 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/cvrr/cvrr_non_existent_actions_with_non_existent_scene_depictions.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "non_existent_actions_with_non_existent_scene_depictions"
 2 | task: "cvrr_non_existent_actions_with_non_existent_scene_depictions"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.cvrr_doc_to_visual
 6 | doc_to_text: !function utils.cvrr_doc_to_text
 7 | doc_to_target: !function utils.cvrr_doc_to_answer
 8 | process_results: !function utils.cvrr_process_results
 9 | metric_list:
10 |   - metric: gpt_eval_accuracy
11 |     aggregation: !function utils.cvrr_aggregate_accuracy
12 |     higher_is_better: true
13 |   - metric: gpt_eval_score
14 |     aggregation: !function utils.cvrr_aggregate_score
15 |     higher_is_better: true
16 | include: _default_template_yaml
17 | 
18 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/cvrr/cvrr_object_instance_count.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "continuity_and_object_instance_count"
 2 | task: "cvrr_continuity_and_object_instance_count"  # NOTE(review): this file is named cvrr_object_instance_count.yaml, without the "continuity_and_" prefix — confirm which name is intended
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.cvrr_doc_to_visual
 6 | doc_to_text: !function utils.cvrr_doc_to_text
 7 | doc_to_target: !function utils.cvrr_doc_to_answer
 8 | process_results: !function utils.cvrr_process_results
 9 | metric_list:
10 |   - metric: gpt_eval_accuracy
11 |     aggregation: !function utils.cvrr_aggregate_accuracy
12 |     higher_is_better: true
13 |   - metric: gpt_eval_score
14 |     aggregation: !function utils.cvrr_aggregate_score
15 |     higher_is_better: true
16 | include: _default_template_yaml
17 | 
18 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/cvrr/cvrr_partial_actions.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "partial_actions"
 2 | task: "cvrr_partial_actions"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.cvrr_doc_to_visual
 6 | doc_to_text: !function utils.cvrr_doc_to_text
 7 | doc_to_target: !function utils.cvrr_doc_to_answer
 8 | process_results: !function utils.cvrr_process_results
 9 | metric_list:
10 |   - metric: gpt_eval_accuracy
11 |     aggregation: !function utils.cvrr_aggregate_accuracy
12 |     higher_is_better: true
13 |   - metric: gpt_eval_score
14 |     aggregation: !function utils.cvrr_aggregate_score
15 |     higher_is_better: true
16 | include: _default_template_yaml
17 | 
18 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/cvrr/cvrr_time_order_understanding.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "time_order_understanding"
 2 | task: "cvrr_time_order_understanding"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.cvrr_doc_to_visual
 6 | doc_to_text: !function utils.cvrr_doc_to_text
 7 | doc_to_target: !function utils.cvrr_doc_to_answer
 8 | process_results: !function utils.cvrr_process_results
 9 | metric_list:
10 |   - metric: gpt_eval_accuracy
11 |     aggregation: !function utils.cvrr_aggregate_accuracy
12 |     higher_is_better: true
13 |   - metric: gpt_eval_score
14 |     aggregation: !function utils.cvrr_aggregate_score
15 |     higher_is_better: true
16 | include: _default_template_yaml
17 | 
18 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/cvrr/cvrr_understanding_emotional_context.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "understanding_emotional_context"
 2 | task: "cvrr_understanding_emotional_context"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.cvrr_doc_to_visual
 6 | doc_to_text: !function utils.cvrr_doc_to_text
 7 | doc_to_target: !function utils.cvrr_doc_to_answer
 8 | process_results: !function utils.cvrr_process_results
 9 | metric_list:
10 |   - metric: gpt_eval_accuracy
11 |     aggregation: !function utils.cvrr_aggregate_accuracy
12 |     higher_is_better: true
13 |   - metric: gpt_eval_score
14 |     aggregation: !function utils.cvrr_aggregate_score
15 |     higher_is_better: true
16 | include: _default_template_yaml
17 | 
18 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/cvrr/cvrr_unusual_and_physically_anomalous_activities.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "unusual_and_physically_anomalous_activities"
 2 | task: "cvrr_unusual_and_physically_anomalous_activities"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.cvrr_doc_to_visual
 6 | doc_to_text: !function utils.cvrr_doc_to_text
 7 | doc_to_target: !function utils.cvrr_doc_to_answer
 8 | process_results: !function utils.cvrr_process_results
 9 | metric_list:
10 |   - metric: gpt_eval_accuracy
11 |     aggregation: !function utils.cvrr_aggregate_accuracy
12 |     higher_is_better: true
13 |   - metric: gpt_eval_score
14 |     aggregation: !function utils.cvrr_aggregate_score
15 |     higher_is_better: true
16 | include: _default_template_yaml
17 | 
18 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/docvqa/_default_template_docvqa_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/DocVQA
 2 | dataset_name: DocVQA
 3 | dataset_kwargs:
 4 |   token: True
 5 | output_type: generate_until
 6 | doc_to_visual: !function utils.docvqa_doc_to_visual
 7 | doc_to_text: !function utils.docvqa_doc_to_text
 8 | doc_to_target: "answers"
 9 | generation_kwargs:
10 |   max_new_tokens: 32
11 |   temperature: 0
12 |   do_sample: False
13 | model_specific_prompt_kwargs:
14 |   default:
15 |     pre_prompt: ""
16 |     post_prompt: "\nAnswer the question using a single word or phrase."
17 |   qwen_vl:
18 |     pre_prompt: ""
19 |     post_prompt: " Answer:"
20 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/docvqa/docvqa.yaml:
--------------------------------------------------------------------------------
1 | group: docvqa
2 | task:
3 | - docvqa_val
4 | - docvqa_test


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/docvqa/docvqa_test.yaml:
--------------------------------------------------------------------------------
1 | task: "docvqa_test"
2 | test_split: test
3 | process_results: !function utils.docvqa_test_process_results
4 | metric_list:
5 |   - metric: submission
6 |     aggregation: !function utils.docvqa_test_aggregate_results
7 |     higher_is_better: true
8 | include: _default_template_docvqa_yaml
9 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/docvqa/docvqa_val.yaml:
--------------------------------------------------------------------------------
1 | task: "docvqa_val"
2 | test_split: validation
3 | metric_list:
4 |   - metric: anls
5 |     aggregation: mean
6 |     higher_is_better: true
7 | include: _default_template_docvqa_yaml
8 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/docvqa/utils.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | 
 4 | 
 5 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
 6 | 
 7 | from loguru import logger
 8 | 
 9 | 
10 | def docvqa_doc_to_visual(doc):  # Visual input for one record: a single-element list.
11 |     return [doc["image"].convert("RGB")]  # doc["image"] is presumably a PIL image; its RGB conversion is the only element
12 | 
13 | 
14 | def docvqa_doc_to_text(doc, model_specific_prompt_kwargs):  # Build the text prompt as pre_prompt + question + post_prompt.
15 |     question = doc["question"]
16 |     pre_prompt = model_specific_prompt_kwargs["pre_prompt"]  # indexed directly, so both prompt keys must be present
17 |     post_prompt = model_specific_prompt_kwargs["post_prompt"]
18 |     return f"{pre_prompt}{question}{post_prompt}"  # plain concatenation; no separator is inserted here
19 | 
20 | 
21 | def docvqa_test_process_results(doc, results):  # Package one prediction, keyed by metric name.
22 |     pred = results[0]  # only the first entry of results is used
23 |     questionId = doc["questionId"]  # cast to int in the returned payload
24 |     return {"anls": {"questionId": int(questionId), "answer": pred}, "submission": {"questionId": int(questionId), "answer": pred}}  # same payload under both keys; NOTE(review): docvqa_test.yaml lists only the "submission" metric — confirm "anls" is still needed
25 | 
26 | 
27 | def docvqa_test_aggregate_results(results, args):  # Write the collected results to a JSON file; nothing is returned.
28 |     # save results as json
29 |     path = generate_submission_file("docvqa_test_for_submission.json", args)  # output path is decided by the project helper from args
30 |     with open(path, "w") as f:  # "w" mode: an existing file at that path is overwritten
31 |         json.dump(results, f)
32 |     logger.info(f"Results saved to {path}")
33 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/egoschema/_default_template_yaml:
--------------------------------------------------------------------------------
1 | dataset_path: lmms-lab/egoschema
2 | dataset_kwargs:
3 |   token: True
4 |   video: True
5 |   cache_dir: egoschema
6 | model_specific_prompt_kwargs:
7 |   default:
8 |     pre_prompt: ""
9 |     post_prompt: ""


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/egoschema/egoschema.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "GENERATION"
 2 | task: "egoschema"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.egoschema_doc_to_visual
 6 | doc_to_text: !function utils.egoschema_doc_to_text
 7 | doc_to_target: !function utils.egoschema_doc_to_answer
 8 | process_results: !function utils.egoschema_process_results_generation
 9 | metric_list:
10 |   - metric: submission
11 |     aggregation: !function utils.egoschema_aggregate_mc
12 |     higher_is_better: true
13 | include: _default_template_yaml
14 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/egoschema/egoschema_mcppl.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "MC_PPL"
 2 | task: "egoschema_mcppl"
 3 | test_split: test
 4 | output_type: multiple_choice
 5 | doc_to_visual: !function utils.egoschema_doc_to_visual
 6 | doc_to_text: "question"
 7 | doc_to_target: !function utils.egoschema_doc_to_answer
 8 | doc_to_choice: !function utils.egoschema_doc_to_choice
 9 | process_results: !function utils.egoschema_process_results
10 | metric_list:
11 |   - metric: submission
12 |     aggregation: !function utils.egoschema_aggregate_mc_ppl
13 |     higher_is_better: true
14 | include: _default_template_yaml
15 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/egoschema/egoschema_subset.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "Subset"
 2 | task: "egoschema_subset"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.egoschema_doc_to_visual
 6 | doc_to_text: !function utils.egoschema_doc_to_text
 7 | doc_to_target: !function utils.egoschema_doc_to_answer
 8 | process_results: !function utils.egoschema_process_results_generation
 9 | metric_list:
10 |   - metric: submission
11 |     aggregation: !function utils.egoschema_aggregate_mc
12 |     higher_is_better: true
13 |   - metric: score
14 |     aggregation: !function utils.egoschema_aggregate_score
15 |     higher_is_better: true
16 | include: _default_template_yaml
17 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/egoschema/egoschema_subset_mcppl.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "Subset"
 2 | task: "egoschema_subset_mcppl"
 3 | test_split: test
 4 | output_type: multiple_choice
 5 | doc_to_visual: !function utils.egoschema_doc_to_visual
 6 | doc_to_text: "question"
 7 | doc_to_target: !function utils.egoschema_doc_to_answer
 8 | doc_to_choice: !function utils.egoschema_doc_to_choice
 9 | process_results: !function utils.egoschema_process_results
10 | metric_list:
11 |   - metric: submission
12 |     aggregation: !function utils.egoschema_aggregate_mc_ppl
13 |     higher_is_better: true
14 |   - metric: score
15 |     aggregation: !function utils.egoschema_aggregate_score
16 |     higher_is_better: true
17 | include: _default_template_yaml
18 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/ferret/ferret.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/Ferret-Bench
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "ferret"
 5 | test_split: test
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.ferret_doc_to_visual
 8 | doc_to_text: !function utils.ferret_doc_to_text
 9 | doc_to_target: "gpt_answer"
10 | generation_kwargs:
11 |   until:
12 |     - "ASSISTANT:"
13 |   image_aspect_ratio: original
14 |   max_new_tokens: 1024
15 |   temperature: 0
16 |   top_p: 1.0
17 |   num_beams: 1
18 |   do_sample: false
19 | process_results: !function utils.ferret_process_results
20 | metric_list:
21 |   - metric: gpt_eval_ferret_all
22 |     aggregation: !function utils.ferret_all_aggregation
23 |     higher_is_better: true
24 |   - metric: gpt_eval_ferret_refer_desc
25 |     aggregation: !function utils.ferret_refer_desc_aggregation
26 |     higher_is_better: true
27 |   - metric: gpt_eval_ferret_refer_reason
28 |     aggregation: !function utils.ferret_refer_reason_aggregation
29 |     higher_is_better: true
30 |   - metric: gpt_eval_ferret_ground_conv
31 |     aggregation: !function utils.ferret_ground_conv_aggregation
32 |     higher_is_better: true
33 | metadata:
34 |   version: 0.0
35 |   gpt_eval_model_name: "gpt-4-0314"
36 | model_specific_prompt_kwargs:
37 |   default:
38 |     pre_prompt: ""
39 |     post_prompt: ""


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/flickr30k/flickr30k.yaml:
--------------------------------------------------------------------------------
1 | group: flickr30k
2 | task:
3 | - flickr30k_test


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/gqa/gqa.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/GQA
 2 | dataset_name: testdev_balanced_instructions
 3 | dataset_kwargs:
 4 |   token: True
 5 | task: "gqa"
 6 | test_split: testdev
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.gqa_doc_to_visual
 9 | doc_to_text: !function utils.gqa_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   max_new_tokens: 16
13 |   temperature: 0
14 |   top_p: 1.0
15 |   num_beams: 1
16 |   do_sample: false
17 | metric_list:
18 |   - metric: exact_match
19 |     aggregation: mean
20 |     higher_is_better: true
21 |     ignore_case: true
22 |     ignore_punctuation: true
23 | metadata:  # NOTE(review): value is a one-item sequence here; ferret.yaml and the cvrr template write metadata as a mapping — confirm the loader reads version from this form
24 |   - version: 0.0
25 |   
26 | model_specific_prompt_kwargs:
27 |   default:
28 |     pre_prompt: ""
29 |     post_prompt: "\nAnswer the question using a single word or phrase."
30 |   qwen_vl:
31 |     pre_prompt: ""
32 |     post_prompt: " Answer:"


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/gqa/utils.py:
--------------------------------------------------------------------------------
 1 | from datasets import load_dataset
 2 | 
 3 | GQA_RAW_IMAGE_DATASET = None  # module-level cache: set by gqa_doc_to_visual the first time it runs
 4 | GQA_ID2IMAGE = None  # row id -> RGB image map, filled in the same first call
 5 | 
 6 | 
 7 | def gqa_doc_to_visual(doc):  # Return [image] for doc["imageId"], loading the image config on first use.
 8 |     global GQA_RAW_IMAGE_DATASET
 9 |     global GQA_ID2IMAGE
10 |     if GQA_RAW_IMAGE_DATASET is None:  # first call only; later calls reuse the module-level cache
11 |         GQA_RAW_IMAGE_DATASET = load_dataset("lmms-lab/GQA", "testdev_balanced_images", split="testdev", token=True)
12 |         GQA_ID2IMAGE = {}
13 |         for row in GQA_RAW_IMAGE_DATASET:  # every row is converted up front rather than on demand
14 |             GQA_ID2IMAGE[row["id"]] = row["image"].convert("RGB")
15 |     image = GQA_ID2IMAGE[doc["imageId"]]  # direct index: an imageId missing from the map raises
16 |     return [image]
17 | 
18 | 
19 | def gqa_doc_to_text(doc, model_specific_prompt_kwargs):  # Build the text prompt as pre_prompt + question + post_prompt.
20 |     question = doc["question"]
21 |     pre_prompt = model_specific_prompt_kwargs["pre_prompt"]  # indexed directly, so both prompt keys must be present
22 |     post_prompt = model_specific_prompt_kwargs["post_prompt"]
23 |     return f"{pre_prompt}{question}{post_prompt}"  # plain concatenation; no separator is inserted here
24 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/gqa_ru/gqa_ru.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: deepvk/GQA-ru
 2 | dataset_name: testdev_balanced_instructions
 3 | dataset_kwargs:
 4 |   token: True
 5 | task: "gqa-ru"
 6 | test_split: testdev
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.gqa_doc_to_visual
 9 | doc_to_text: !function utils.gqa_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   max_new_tokens: 16
13 |   temperature: 0
14 |   top_p: 1.0
15 |   num_beams: 1
16 |   do_sample: false
17 | metric_list:
18 |   - metric: exact_match
19 |     aggregation: mean
20 |     higher_is_better: true
21 |     ignore_case: true
22 |     ignore_punctuation: true
23 | metadata:  # NOTE(review): value is a one-item sequence here; ferret.yaml and the cvrr template write metadata as a mapping — confirm the loader reads version from this form
24 |   - version: 0.0
25 |   
26 | model_specific_prompt_kwargs:
27 |   default:
28 |     pre_prompt: ""
29 |     post_prompt: "\nОтветь одним словом."


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/gqa_ru/utils.py:
--------------------------------------------------------------------------------
 1 | from datasets import load_dataset
 2 | 
 3 | GQA_RAW_IMAGE_DATASET = None  # module-level cache: set by gqa_doc_to_visual the first time it runs
 4 | GQA_ID2IMAGE = None  # row id -> RGB image map, filled in the same first call
 5 | 
 6 | 
 7 | def gqa_doc_to_visual(doc):  # Return [image] for doc["imageId"], loading the image config on first use.
 8 |     global GQA_RAW_IMAGE_DATASET
 9 |     global GQA_ID2IMAGE
10 |     if GQA_RAW_IMAGE_DATASET is None:  # first call only; later calls reuse the module-level cache
11 |         GQA_RAW_IMAGE_DATASET = load_dataset("deepvk/GQA-ru", "testdev_balanced_images", split="testdev", token=True)
12 |         GQA_ID2IMAGE = {}
13 |         for row in GQA_RAW_IMAGE_DATASET:  # every row is converted up front rather than on demand
14 |             GQA_ID2IMAGE[row["id"]] = row["image"].convert("RGB")
15 |     image = GQA_ID2IMAGE[doc["imageId"]]  # direct index: an imageId missing from the map raises
16 |     return [image]
17 | 
18 | 
19 | def gqa_doc_to_text(doc, model_specific_prompt_kwargs):  # Build the text prompt as pre_prompt + question + post_prompt.
20 |     question = doc["question"]
21 |     pre_prompt = model_specific_prompt_kwargs["pre_prompt"]  # indexed directly, so both prompt keys must be present
22 |     post_prompt = model_specific_prompt_kwargs["post_prompt"]
23 |     return f"{pre_prompt}{question}{post_prompt}"  # plain concatenation; no separator is inserted here
24 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/iconqa/_default_template_docvqa_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/ICON-QA
 2 | dataset_kwargs:
 3 |   token: True
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.doc_to_visual
 6 | doc_to_text: !function utils.doc_to_text
 7 | doc_to_target: "answers"
 8 | # process_results: !function utils.test_process_results
 9 | generation_kwargs:
10 |   max_new_tokens: 32
11 |   temperature: 0
12 |   do_sample: False
13 | model_specific_prompt_kwargs:
14 |   default:
15 |     pre_prompt: ""
16 |     statement: "Given a set of images and a question, please provide the answer to the question.\n"
17 |     options_statement: "Question: {question}.\nOptions:\n{options}\nPlease answer with the option letter from the given choices directly."
18 |     freeform_statement: "Question: {question}.\nPlease answer the question using a single word or phrase."
19 | metric_list:
20 |   - metric: anls
21 |     aggregation: mean
22 |     higher_is_better: true


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/iconqa/iconqa.yaml:
--------------------------------------------------------------------------------
1 | group: iconqa
2 | task:
3 | - iconqa_val
4 | - iconqa_test
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/iconqa/iconqa_test.yaml:
--------------------------------------------------------------------------------
1 | task: "iconqa_test"
2 | test_split: test
3 | include: _default_template_docvqa_yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/iconqa/iconqa_val.yaml:
--------------------------------------------------------------------------------
1 | task: "iconqa_val"
2 | test_split: val
3 | include: _default_template_docvqa_yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/ii_bench/ii_bench.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/II-Bench
 2 | task: "ii-bench"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.ii_bench_doc_to_visual
 6 | doc_to_text: !function utils.ii_bench_doc_to_text
 7 | doc_to_target: "answers"
 8 | generation_kwargs:
 9 |   max_new_tokens: 32
10 |   temperature: 0
11 |   do_sample: False
12 | process_results: !function utils.ii_bench_process_results
13 | metric_list:
14 |   - metric: submission
15 |     aggregation: !function utils.ii_bench_aggregate_submissions
16 | model_specific_prompt_kwargs:
17 |   default:
18 |     pre_prompt: "Instruction: Please try to answer the single-answer multiple choice question below based on the picture provided.\n"
19 |     post_prompt: "\nAnswer:"
20 |   


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/infovqa/_default_template_infovqa_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/DocVQA
 2 | dataset_name: InfographicVQA
 3 | dataset_kwargs:
 4 |   token: True
 5 | doc_to_target: "answers"
 6 | doc_to_visual: !function utils.infovqa_doc_to_visual
 7 | doc_to_text: !function utils.infovqa_doc_to_text
 8 | generation_kwargs:
 9 |   max_new_tokens: 32
10 |   temperature: 0
11 |   do_sample: False
12 | model_specific_prompt_kwargs:
13 |   default:
14 |     pre_prompt: ""
15 |     post_prompt: "\nAnswer the question using a single word or phrase."


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/infovqa/infovqa.yaml:
--------------------------------------------------------------------------------
1 | group: infovqa
2 | task:
3 | - infovqa_val
4 | - infovqa_test
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/infovqa/infovqa_test.yaml:
--------------------------------------------------------------------------------
 1 | task: "infovqa_test"
 2 | test_split: test
 3 | output_type: generate_until
 4 | process_results: !function utils.infovqa_test_process_results
 5 | metric_list:
 6 |   - metric: submission
 7 |     aggregation: !function utils.infovqa_test_aggregate_results
 8 |     higher_is_better: true
 9 | include: _default_template_infovqa_yaml
10 |   


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/infovqa/infovqa_val.yaml:
--------------------------------------------------------------------------------
1 | task: "infovqa_val"
2 | test_split: validation
3 | output_type: generate_until
4 | metric_list:
5 |   - metric: anls
6 |     aggregation: mean
7 |     higher_is_better: true
8 | include: _default_template_infovqa_yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/infovqa/utils.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | 
 4 | 
 5 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
 6 | 
 7 | from loguru import logger as eval_logger
 8 | 
 9 | 
10 | def infovqa_doc_to_visual(doc):  # Visual input for one record: a single-element list.
11 |     return [doc["image"].convert("RGB")]  # doc["image"] is presumably a PIL image; its RGB conversion is the only element
12 | 
13 | 
14 | def infovqa_doc_to_text(doc, model_specific_prompt_kwargs):  # Build the text prompt as pre_prompt + question + post_prompt.
15 |     question = doc["question"]
16 |     pre_prompt = model_specific_prompt_kwargs["pre_prompt"]  # indexed directly, so both prompt keys must be present
17 |     post_prompt = model_specific_prompt_kwargs["post_prompt"]
18 |     return f"{pre_prompt}{question}{post_prompt}"  # plain concatenation; no separator is inserted here
19 | 
20 | 
21 | def infovqa_test_process_results(doc, results):  # Package one prediction under the "submission" metric key.
22 |     pred = results[0]  # only the first entry of results is used
23 |     questionId = doc["questionId"]  # cast to int in the returned payload
24 |     return {"submission": {"questionId": int(questionId), "answer": pred}}
25 | 
26 | 
27 | def infovqa_test_aggregate_results(results, args):  # Write the collected results to a JSON file; nothing is returned.
28 |     # save results as json
29 |     file = generate_submission_file("infovqa_test_for_submission.json", args)  # output path is decided by the project helper from args
30 |     with open(file, "w") as f:  # "w" mode: an existing file at that path is overwritten
31 |         json.dump(results, f)
32 |     eval_logger.info(f"Results saved to {file}")
33 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/internal_eval/_default_template_internal_eval_yaml:
--------------------------------------------------------------------------------
1 | model_specific_prompt_kwargs:
2 |   default:
3 |     pre_prompt: ""
4 |     post_prompt: ""


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/internal_eval/d170_cn.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/D170_v4.1_CN
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "d170_cn"
 5 | test_split: test
 6 | output_type: generate_until
 7 | doc_to_text: !function utils.doc_to_text # Such that {{prompt}} will be replaced by doc["question"]
 8 | doc_to_visual: !function d170_cn_utils.doc_to_visual
 9 | doc_to_target: "{{annotation}}"
10 | generation_kwargs:
11 |   until:
12 |     - "ASSISTANT:"
13 |   max_new_tokens: 1024
14 |   temperature: 0
15 |   top_p: 1.0
16 |   num_beams: 1
17 |   do_sample: false
18 | process_results: !function d170_cn_utils.process_results # apply gpt eval here
19 | metric_list:
20 |   - metric: gpt_eval_info
21 |     aggregation: !function d170_cn_utils.d170_cn_aggregate_info
22 |     higher_is_better: false
23 |   - metric: gpt_eval_avg_score
24 |     aggregation: !function d170_cn_utils.d170_cn_aggregate_avg_score
25 |     higher_is_better: true
26 |   - metric: gpt_eval_score2_rate
27 |     aggregation: !function d170_cn_utils.d170_cn_aggregate_score2_rate
28 |     higher_is_better: true
29 | metadata:
30 |   version: 0.0
31 |   gpt_eval_model_name: "gpt-4-1106-preview"
32 | include: _default_template_internal_eval_yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/internal_eval/d170_en.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/D170_v4.1_EN
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "d170_en"
 5 | test_split: test
 6 | output_type: generate_until
 7 | doc_to_visual: !function d170_en_utils.doc_to_visual
 8 | doc_to_text: !function utils.doc_to_text # Such that {{prompt}} will be replaced by doc["question"]
 9 | doc_to_target: "{{annotation}}"
10 | generation_kwargs:
11 |   until:
12 |     - "ASSISTANT:"
13 |   max_new_tokens: 1024
14 |   temperature: 0
15 |   top_p: 1.0
16 |   num_beams: 1
17 |   do_sample: false
18 | process_results: !function d170_en_utils.process_results # apply gpt eval here
19 | metric_list:
20 |   - metric: gpt_eval_info
21 |     aggregation: !function d170_en_utils.d170_en_aggregate_info
22 |     higher_is_better: false
23 |   - metric: gpt_eval_avg_score
24 |     aggregation: !function d170_en_utils.d170_en_aggregate_avg_score
25 |     higher_is_better: true
26 |   - metric: gpt_eval_score2_rate
27 |     aggregation: !function d170_en_utils.d170_en_aggregate_score2_rate
28 |     higher_is_better: true
29 | metadata:
30 |   version: 0.0
31 |   gpt_eval_model_name: "gpt-4-1106-preview"
32 | include: _default_template_internal_eval_yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/internal_eval/dc100_en.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/DC100_EN
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "dc100_en"
 5 | test_split: test
 6 | output_type: generate_until
 7 | doc_to_visual: !function dc100_en_utils.doc_to_visual
 8 | doc_to_text: !function utils.doc_to_text # Such that {{prompt}} will be replaced by doc["question"]
 9 | doc_to_target: "answer"
10 | generation_kwargs:
11 |   until:
12 |     - "ASSISTANT:"
13 |   max_new_tokens: 1024
14 |   temperature: 0
15 |   top_p: 1.0
16 |   num_beams: 1
17 |   do_sample: false
18 | process_results: !function dc100_en_utils.process_results # apply gpt eval here
19 | metric_list:
20 |   - metric: gpt_eval_info
21 |     aggregation: !function dc100_en_utils.dc100_en_aggregate_info
22 |     higher_is_better: false
23 |   - metric: gpt_eval_avg_score
24 |     aggregation: !function dc100_en_utils.dc100_en_aggregate_avg_score
25 |     higher_is_better: true
26 | metadata:
27 |   version: 0.0
28 |   gpt_eval_model_name: "gpt-4-vision-preview"
29 | include: _default_template_internal_eval_yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/internal_eval/dc200_cn.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/DC200_CN
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "dc200_cn"
 5 | test_split: test
 6 | output_type: generate_until
 7 | doc_to_visual: !function dc200_cn_utils.doc_to_visual
 8 | doc_to_text: !function utils.doc_to_text # Such that {{prompt}} will be replaced by doc["question"]
 9 | doc_to_target: "answer"
10 | generation_kwargs:
11 |   until:
12 |     - "ASSISTANT:"
13 |   max_new_tokens: 1024
14 |   temperature: 0
15 |   top_p: 1.0
16 |   num_beams: 1
17 |   do_sample: false
18 | process_results: !function dc200_cn_utils.process_results # apply gpt eval here
19 | metric_list:
20 |   - metric: gpt_eval_info
21 |     aggregation: !function dc200_cn_utils.dc200_cn_aggregate_info
22 |     higher_is_better: false
23 |   - metric: gpt_eval_avg_score
24 |     aggregation: !function dc200_cn_utils.dc200_cn_aggregate_avg_score
25 |     higher_is_better: true
26 | metadata:
27 |   version: 0.0
28 |   gpt_eval_model_name: "gpt-4-vision-preview"
29 | include: _default_template_internal_eval_yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/internal_eval/internal_eval.yaml:
--------------------------------------------------------------------------------
1 | group: internal_eval
2 | task:
3 | - d170_cn
4 | - d170_en
5 | - dc100_en
6 | - dc200_cn
7 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/internal_eval/utils.py:
--------------------------------------------------------------------------------
1 | def doc_to_text(doc, model_specific_prompt_kwargs=None):  # Build the prompt text for one dataset record.
2 |     if model_specific_prompt_kwargs is None:  # no per-model prompt wrapping was supplied
3 |         return doc["question"]  # return the bare question unchanged
4 |     question = doc["question"]
5 |     pre_prompt = model_specific_prompt_kwargs.get("pre_prompt", "")  # optional prefix; empty string when the key is absent
6 |     post_prompt = model_specific_prompt_kwargs.get("post_prompt", "")  # optional suffix; empty string when the key is absent
7 |     return f"{pre_prompt}{question}{post_prompt}"  # plain concatenation, no separator is inserted
8 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/llava-bench-coco/llava-bench-coco.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/llava-bench-coco
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "llava_bench_coco"
 5 | test_split: train
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.llava_doc_to_visual
 8 | doc_to_text: !function utils.llava_doc_to_text
 9 | doc_to_target: "gpt_answer"
10 | generation_kwargs:
11 |   until:
12 |     - "ASSISTANT:"
13 |   image_aspect_ratio: original
14 |   max_new_tokens: 1024
15 |   temperature: 0
16 |   top_p: 1.0
17 |   num_beams: 1
18 | process_results: !function utils.llava_process_results
19 | metric_list:
20 |   - metric: gpt_eval_llava_all
21 |     aggregation: !function utils.llava_all_aggregation
22 |     higher_is_better: true
23 |   - metric: gpt_eval_llava_conv
24 |     aggregation: !function utils.llava_conv_aggregation
25 |     higher_is_better: true
26 |   - metric: gpt_eval_llava_detail
27 |     aggregation: !function utils.llava_detail_aggregation
28 |     higher_is_better: true
29 |   - metric: gpt_eval_llava_complex
30 |     aggregation: !function utils.llava_complex_aggregation
31 |     higher_is_better: true
32 | metadata:
33 |   version: 0.0
34 |   gpt_eval_model_name: "gpt-4-0314"
35 | model_specific_prompt_kwargs:
36 |   default:
37 |     pre_prompt: ""
38 |     post_prompt: ""


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/llava-in-the-wild/llava-in-the-wild.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/llava-bench-in-the-wild
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "llava_in_the_wild"
 5 | test_split: train
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.llava_doc_to_visual
 8 | doc_to_text: !function utils.llava_doc_to_text
 9 | doc_to_target: "gpt_answer"
10 | generation_kwargs:
11 |   until:
12 |     - "ASSISTANT:"
13 |   image_aspect_ratio: original
14 |   max_new_tokens: 32768
15 |   temperature: 0
16 |   top_p: 1.0
17 |   num_beams: 1
18 |   do_sample: false
19 | process_results: !function utils.llava_process_results
20 | metric_list:
21 |   - metric: gpt_eval_llava_all
22 |     aggregation: !function utils.llava_all_aggregation
23 |     higher_is_better: true
24 |   - metric: gpt_eval_llava_conv
25 |     aggregation: !function utils.llava_conv_aggregation
26 |     higher_is_better: true
27 |   - metric: gpt_eval_llava_detail
28 |     aggregation: !function utils.llava_detail_aggregation
29 |     higher_is_better: true
30 |   - metric: gpt_eval_llava_complex
31 |     aggregation: !function utils.llava_complex_aggregation
32 |     higher_is_better: true
33 | metadata:
34 |   version: 0.0
35 |   gpt_eval_model_name: "gpt-4-0613"
36 | model_specific_prompt_kwargs:
37 |   default:
38 |     pre_prompt: ""
39 |     post_prompt: ""
40 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/llava_wilder/_default_template_wilder_yaml:
--------------------------------------------------------------------------------
 1 | output_type: generate_until
 2 | doc_to_visual: !function utils.llava_doc_to_visual
 3 | doc_to_text: !function utils.llava_doc_to_text
 4 | doc_to_target: "gpt4v_answer"
 5 | generation_kwargs:
 6 |   until:
 7 |     - "ASSISTANT:"
 8 |   image_aspect_ratio: original
 9 |   max_new_tokens: 4096
10 |   temperature: 0
11 |   top_p: 1.0
12 |   num_beams: 1
13 |   do_sample: false
14 | process_results: !function utils.llava_process_results
15 | metric_list:
16 |   - metric: gpt_eval_llava_all
17 |     aggregation: !function utils.llava_all_aggregation
18 |     higher_is_better: true
19 | metadata:
20 |   version: 0.0
21 |   api_type: openai
22 |   gpt_eval_model_name: "gpt-4-vision-preview"


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/llava_wilder/llava_wilder_small.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/llava-bench-wilder
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "llava_wilder_small"
 5 | test_split: small
 6 | model_specific_prompt_kwargs:
 7 |   default:
 8 |     pre_prompt: ""
 9 |     post_prompt: ""
10 |   xcomposer2_4khd:
11 |     pre_prompt: "[UNUSED_TOKEN_146]user\nQuestion: "
12 |     post_prompt: "[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n"
13 | include: _default_template_wilder_yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/longvideobench/longvideobench_val_i.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: longvideobench/LongVideoBench
 2 | dataset_kwargs:
 3 |   token: True
 4 |   cache_dir: longvideobench
 5 |   video: True
 6 |   force_download: False
 7 |   local_files_only: False
 8 |   # From_YouTube: True
 9 | task: longvideobench_val_i
10 | test_split: validation
11 | doc_to_visual: !function utils.longvideobench_doc_to_visual_i
12 | doc_to_text: !function utils.longvideobench_doc_to_text
13 | doc_to_target: "correct_choice"
14 | generation_kwargs:
15 |   max_new_tokens: 32
16 |   temperature: 0
17 |   do_sample: False
18 | process_results: !function utils.longvideobench_process_results
19 | metric_list:
20 |   - metric: lvb_acc
21 |     aggregation: !function utils.longvideobench_aggregate_results
22 |     higher_is_better: true
23 | 
24 | model_specific_prompt_kwargs:
25 |   default:
26 |     pre_prompt: ""
27 |     post_prompt: "Answer with the option's letter from the given choices directly.\n"
28 |     insert_interleave_subtitles: True
29 |     


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/longvideobench/longvideobench_val_v.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: longvideobench/LongVideoBench
 2 | dataset_kwargs:
 3 |   token: True
 4 |   cache_dir: longvideobench
 5 |   video: True
 6 |   force_download: False
 7 |   local_files_only: False
 8 |   # From_YouTube: True
 9 | task: longvideobench_val_v
10 | test_split: validation
11 | doc_to_visual: !function utils.longvideobench_doc_to_visual_v
12 | doc_to_text: !function utils.longvideobench_doc_to_text
13 | doc_to_target: "correct_choice"
14 | generation_kwargs:
15 |   max_new_tokens: 32
16 |   temperature: 0
17 |   do_sample: False
18 | process_results: !function utils.longvideobench_process_results
19 | metric_list:
20 |   - metric: lvb_acc
21 |     aggregation: !function utils.longvideobench_aggregate_results
22 |     higher_is_better: true
23 | 
24 | model_specific_prompt_kwargs:
25 |   default:
26 |     pre_prompt: ""
27 |     post_prompt: "Answer with the option's letter from the given choices directly.\n"
28 |   


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mathverse/mathverse.yaml:
--------------------------------------------------------------------------------
 1 | group: mathverse
 2 | task:
 3 |   - mathverse_testmini
 4 |   - mathverse_testmini_text_only
 5 |   - mathverse_testmini_text_lite
 6 |   - mathverse_testmini_text_dominant
 7 |   - mathverse_testmini_vision_intensive
 8 |   - mathverse_testmini_vision_dominant
 9 |   - mathverse_testmini_vision_only
10 | metadata:
11 |   version: 0.0
12 |   gpt_eval_model_name: "gpt-3.5-turbo"
13 |   trunk_response: 30
14 |   quick_match: false


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: CaraJ/MathVerse-lmmseval
 2 | dataset_name: testmini
 3 | dataset_kwargs:
 4 |   token: False
 5 | task: "mathverse_testmini"
 6 | test_split: testmini
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.mathverse_doc_to_visual
 9 | doc_to_text: !function utils.mathverse_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   until:
13 |     - "ASSISTANT:"
14 |   max_new_tokens: 1024
15 |   temperature: 0
16 |   top_p: 0
17 |   num_beams: 1
18 |   do_sample: false
19 | process_results: !function utils.mathverse_process_results
20 | metric_list:
21 |   - metric: gpt_eval_score
22 |     aggregation: !function utils.mathverse_aggregate_results_eval
23 |     higher_is_better: true
24 |   - metric: submission
25 |     aggregation: !function utils.mathverse_aggregate_results_submission
26 |     higher_is_better: true
27 |   
28 | model_specific_prompt_kwargs:
29 |   default:
30 |     shot_type: "format-prompt" # can also be "custom-prompt"
31 |     query_type: "query_wo" # now only support query_wo
32 | model_specific_generation_kwargs:
33 |   llava:
34 |     image_aspect_ratio: original


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_text_dominant.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: CaraJ/MathVerse-lmmseval
 2 | dataset_name: testmini_version_split
 3 | dataset_kwargs:
 4 |   token: False
 5 | task: "mathverse_testmini_text_dominant"
 6 | test_split: text_dominant
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.mathverse_doc_to_visual
 9 | doc_to_text: !function utils.mathverse_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   until:
13 |     - "ASSISTANT:"
14 |   max_new_tokens: 1024
15 |   temperature: 0
16 |   top_p: 0
17 |   num_beams: 1
18 |   do_sample: false
19 | process_results: !function utils.mathverse_process_results
20 | metric_list:
21 |   - metric: gpt_eval_score
22 |     aggregation: !function utils.mathverse_aggregate_results_eval
23 |     higher_is_better: true
24 |   - metric: submission
25 |     aggregation: !function utils.mathverse_aggregate_results_submission
26 |     higher_is_better: true
27 | 
28 | model_specific_prompt_kwargs:
29 |   default:
30 |     shot_type: "format-prompt" # can also be "custom-prompt"
31 |     query_type: "query_wo" # now only support query_wo
32 | model_specific_generation_kwargs:
33 |   llava:
34 |     image_aspect_ratio: original


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_text_lite.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: CaraJ/MathVerse-lmmseval
 2 | dataset_name: testmini_version_split
 3 | dataset_kwargs:
 4 |   token: False
 5 | task: "mathverse_testmini_text_lite"
 6 | test_split: text_lite
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.mathverse_doc_to_visual
 9 | doc_to_text: !function utils.mathverse_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   until:
13 |     - "ASSISTANT:"
14 |   max_new_tokens: 1024
15 |   temperature: 0
16 |   top_p: 0
17 |   num_beams: 1
18 |   do_sample: false
19 | process_results: !function utils.mathverse_process_results
20 | metric_list:
21 |   - metric: gpt_eval_score
22 |     aggregation: !function utils.mathverse_aggregate_results_eval
23 |     higher_is_better: true
24 |   - metric: submission
25 |     aggregation: !function utils.mathverse_aggregate_results_submission
26 |     higher_is_better: true
27 | 
28 | model_specific_prompt_kwargs:
29 |   default:
30 |     shot_type: "format-prompt" # can also be "custom-prompt"
31 |     query_type: "query_wo" # now only support query_wo
32 | model_specific_generation_kwargs:
33 |   llava:
34 |     image_aspect_ratio: original


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_text_only.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: CaraJ/MathVerse-lmmseval
 2 | dataset_name: testmini_text_only
 3 | dataset_kwargs:
 4 |   token: False
 5 | task: "mathverse_testmini_text_only"
 6 | test_split: text_only
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.mathverse_doc_to_visual
 9 | doc_to_text: !function utils.mathverse_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   until:
13 |     - "ASSISTANT:"
14 |   max_new_tokens: 1024
15 |   temperature: 0
16 |   top_p: 0
17 |   num_beams: 1
18 |   do_sample: false
19 | process_results: !function utils.mathverse_process_results
20 | metric_list:
21 |   - metric: gpt_eval_score
22 |     aggregation: !function utils.mathverse_aggregate_results_eval
23 |     higher_is_better: true
24 |   - metric: submission
25 |     aggregation: !function utils.mathverse_aggregate_results_submission
26 |     higher_is_better: true
27 |   
28 | model_specific_prompt_kwargs:
29 |   default:
30 |     shot_type: "format-prompt" # can also be "custom-prompt"
31 |     query_type: "query_wo" # now only support query_wo
32 | model_specific_generation_kwargs:
33 |   llava:
34 |     image_aspect_ratio: original


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_vision_dominant.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: CaraJ/MathVerse-lmmseval
 2 | dataset_name: testmini_version_split
 3 | dataset_kwargs:
 4 |   token: False
 5 | task: "mathverse_testmini_vision_dominant"
 6 | test_split: vision_dominant
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.mathverse_doc_to_visual
 9 | doc_to_text: !function utils.mathverse_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   until:
13 |     - "ASSISTANT:"
14 |   max_new_tokens: 1024
15 |   temperature: 0
16 |   top_p: 0
17 |   num_beams: 1
18 |   do_sample: false
19 | process_results: !function utils.mathverse_process_results
20 | metric_list:
21 |   - metric: gpt_eval_score
22 |     aggregation: !function utils.mathverse_aggregate_results_eval
23 |     higher_is_better: true
24 |   - metric: submission
25 |     aggregation: !function utils.mathverse_aggregate_results_submission
26 |     higher_is_better: true
27 |   
28 | model_specific_prompt_kwargs:
29 |   default:
30 |     shot_type: "format-prompt" # can also be "custom-prompt"
31 |     query_type: "query_wo" # now only support query_wo
32 | model_specific_generation_kwargs:
33 |   llava:
34 |     image_aspect_ratio: original


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_vision_intensive.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: CaraJ/MathVerse-lmmseval
 2 | dataset_name: testmini_version_split
 3 | dataset_kwargs:
 4 |   token: False
 5 | task: "mathverse_testmini_vision_intensive"
 6 | test_split: vision_intensive
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.mathverse_doc_to_visual
 9 | doc_to_text: !function utils.mathverse_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   until:
13 |     - "ASSISTANT:"
14 |   max_new_tokens: 1024
15 |   temperature: 0
16 |   top_p: 0
17 |   num_beams: 1
18 |   do_sample: false
19 | process_results: !function utils.mathverse_process_results
20 | metric_list:
21 |   - metric: gpt_eval_score
22 |     aggregation: !function utils.mathverse_aggregate_results_eval
23 |     higher_is_better: true
24 |   - metric: submission
25 |     aggregation: !function utils.mathverse_aggregate_results_submission
26 |     higher_is_better: true
27 |   
28 | model_specific_prompt_kwargs:
29 |   default:
30 |     shot_type: "format-prompt" # can also be "custom-prompt"
31 |     query_type: "query_wo" # now only support query_wo
32 | model_specific_generation_kwargs:
33 |   llava:
34 |     image_aspect_ratio: original


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_vision_only.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: CaraJ/MathVerse-lmmseval
 2 | dataset_name: testmini_version_split
 3 | dataset_kwargs:
 4 |   token: False
 5 | task: "mathverse_testmini_vision_only"
 6 | test_split: vision_only
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.mathverse_doc_to_visual
 9 | doc_to_text: !function utils.mathverse_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   until:
13 |     - "ASSISTANT:"
14 |   max_new_tokens: 1024
15 |   temperature: 0
16 |   top_p: 0
17 |   num_beams: 1
18 |   do_sample: false
19 | process_results: !function utils.mathverse_process_results
20 | metric_list:
21 |   - metric: gpt_eval_score
22 |     aggregation: !function utils.mathverse_aggregate_results_eval
23 |     higher_is_better: true
24 |   - metric: submission
25 |     aggregation: !function utils.mathverse_aggregate_results_submission
26 |     higher_is_better: true
27 |   
28 | model_specific_prompt_kwargs:
29 |   default:
30 |     shot_type: "format-prompt" # can also be "custom-prompt"
31 |     query_type: "query_wo" # now only support query_wo
32 | model_specific_generation_kwargs:
33 |   llava:
34 |     image_aspect_ratio: original


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mathvista/mathvista.yaml:
--------------------------------------------------------------------------------
1 | group: mathvista
2 | task:
3 |   - mathvista_testmini
4 |   - mathvista_test
5 | metadata:
6 |   version: 0.0
7 |   gpt_eval_model_name: "gpt-4-0613"
8 |   quick_extract: false


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mathvista/mathvista_test.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: AI4Math/MathVista
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "mathvista_test"
 5 | test_split: test
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.mathvista_doc_to_visual
 8 | doc_to_text: !function utils.mathvista_doc_to_text
 9 | doc_to_target: "answer"
10 | generation_kwargs:
11 |   until:
12 |     - "ASSISTANT:"
13 |   max_new_tokens: 1024
14 |   temperature: 0
15 |   top_p: 1.0
16 |   num_beams: 1
17 |   do_sample: false
18 | process_results: !function utils.mathvista_process_results
19 | metric_list:
20 |   - metric: submission
21 |     aggregation: !function utils.mathvista_aggregate_results
22 |     higher_is_better: true
23 | 
24 | model_specific_prompt_kwargs:
25 |   default:
26 |     shot_type: "format-prompt" # can be "reason-first", "solution", "step-by-step"
27 | model_specific_generation_kwargs:
28 |   llava:
29 |     image_aspect_ratio: original


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mathvista/mathvista_testmini.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: AI4Math/MathVista
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "mathvista_testmini"
 5 | test_split: testmini
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.mathvista_doc_to_visual
 8 | doc_to_text: !function utils.mathvista_doc_to_text
 9 | doc_to_target: "answer"
10 | generation_kwargs:
11 |   until:
12 |     - "ASSISTANT:"
13 |   max_new_tokens: 1024
14 |   temperature: 0
15 |   top_p: 1.0
16 |   num_beams: 1
17 |   do_sample: false
18 | process_results: !function utils.mathvista_process_results
19 | metric_list:
20 |   - metric: gpt_eval_score
21 |     aggregation: !function utils.mathvista_aggregate_results
22 |     higher_is_better: true
23 | 
24 | model_specific_prompt_kwargs:
25 |   default:
26 |     shot_type: "format-prompt" # can be "reason-first", "solution", "step-by-step"
27 |     shot: 0
28 |     use_caption: False
29 |     use_ocr: False
30 |   phi3v:
31 |     shot_type: "solution"
32 | model_specific_generation_kwargs:
33 |   llava:
34 |     image_aspect_ratio: original


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmbench/_default_template_mmbench_cn_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/MMBench
 2 | dataset_kwargs:
 3 |   token: True
 4 | doc_to_target: "answer"
 5 | dataset_name: "cn"
 6 | output_type: generate_until
 7 | doc_to_visual: !function cn_utils.mmbench_doc_to_visual
 8 | doc_to_text: !function cn_utils.mmbench_doc_to_text
 9 | generation_kwargs:
10 |   max_new_tokens: 256
11 |   temperature: 0
12 |   top_p: 1.0
13 |   num_beams: 1
14 |   do_sample: false
15 | process_results: !function cn_utils.mmbench_process_results
16 | model_specific_prompt_kwargs:
17 |   default:
18 |     pre_prompt: ""
19 |     post_prompt: "\n请直接使用所提供的选项字母作为答案回答。"
20 | model_specific_generation_kwargs:
21 |   llava:
22 |     image_aspect_ratio: original
23 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmbench/_default_template_mmbench_en_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/MMBench
 2 | dataset_kwargs:
 3 |   token: True
 4 | doc_to_target: "answer"
 5 | model_specific_prompt_kwargs:
 6 |   default:
 7 |     pre_prompt: ""
 8 |     post_prompt: "\nAnswer with the option's letter from the given choices directly."
 9 | doc_to_visual: !function en_utils.mmbench_doc_to_visual
10 | doc_to_text: !function en_utils.mmbench_doc_to_text
11 | # doc_to_target is declared once, near the top of this template; YAML mapping keys must be unique.
12 | process_results: !function en_utils.mmbench_process_results
13 | model_specific_generation_kwargs:
14 |   llava:
15 |     image_aspect_ratio: original
16 | output_type: generate_until
17 | dataset_name: "en"
18 | generation_kwargs:
19 |   until:
20 |     - "ASSISTANT:"
21 |   max_new_tokens: 1024
22 |   temperature: 0
23 |   top_p: 1.0
24 |   num_beams: 1
25 |   do_sample: false
26 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmbench/_default_template_mmbench_ru_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: deepvk/MMBench-ru
 2 | dataset_kwargs:
 3 |   token: True
 4 | doc_to_target: "answer"
 5 | model_specific_prompt_kwargs:
 6 |   default:
 7 |     pre_prompt: ""
 8 |     post_prompt: "\nВыбери правильный вариант ответа буквой."
 9 | doc_to_visual: !function ru_utils.mmbench_doc_to_visual
10 | doc_to_text: !function ru_utils.mmbench_doc_to_text
11 | # doc_to_target is declared once, near the top of this template; YAML mapping keys must be unique.
12 | process_results: !function ru_utils.mmbench_process_results
13 | model_specific_generation_kwargs:
14 |   llava:
15 |     image_aspect_ratio: original
16 | output_type: generate_until
17 | generation_kwargs:
18 |   until:
19 |     - "ASSISTANT:"
20 |   max_new_tokens: 1024
21 |   temperature: 0
22 |   top_p: 1.0
23 |   num_beams: 1
24 |   do_sample: false
25 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmbench/mmbench.yaml:
--------------------------------------------------------------------------------
 1 | group: mmbench
 2 | task:
 3 |   - mmbench_en_dev
 4 |   - mmbench_en_test
 5 |   - mmbench_cn_dev
 6 |   - mmbench_cn_test
 7 |   - mmbench_cn_cc
 8 |   - mmbench_ru_dev
 9 | metadata:
10 |   version: 0.0
11 |   sys_prompt: "There are several options:"
12 |   gpt_eval_model_name: "gpt-3.5-turbo"
13 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmbench/mmbench_cc.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/MMBench
 2 | dataset_name: cc
 3 | dataset_kwargs:
 4 |   token: True
 5 | task: "mmbench_cn_cc"
 6 | test_split: test
 7 | output_type: generate_until
 8 | doc_to_visual: !function cc_utils.mmbench_doc_to_visual
 9 | doc_to_text: !function cc_utils.mmbench_cn_cc_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   max_new_tokens: 256
13 |   temperature: 0
14 |   top_p: 1.0
15 |   num_beams: 1
16 |   do_sample: false
17 | process_results: !function cc_utils.mmbench_cn_cc_process_results
18 | metric_list:
19 |   - metric: gpt_eval_score
20 |     aggregation: !function cc_utils.mmbench_cn_cc_aggregate_dev_results_eval
21 |     higher_is_better: true
22 |   - metric: submission
23 |     aggregation: !function cc_utils.mmbench_cn_cc_aggregate_results
24 |     higher_is_better: true
25 | metadata:
26 |   version: 0.0
27 |   gpt_eval_model_name: "gpt-3.5-turbo"
28 | model_specific_prompt_kwargs:
29 |   default:
30 |     pre_prompt: ""
31 |     post_prompt: "\n请直接使用所提供的选项字母作为答案回答。"
32 | model_specific_generation_kwargs:
33 |   llava:
34 |     image_aspect_ratio: original
35 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmbench/mmbench_cn.yaml:
--------------------------------------------------------------------------------
 1 | group: mmbench_cn
 2 | task:
 3 |   - mmbench_cn_dev
 4 |   - mmbench_cn_test
 5 |   - mmbench_cn_cc
 6 | metadata:
 7 |   version: 0.0
 8 |   gpt_eval_model_name: "gpt-3.5-turbo"
 9 |   sys_prompt: "有如下几个选项："
10 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml:
--------------------------------------------------------------------------------
 1 | task: "mmbench_cn_dev"
 2 | test_split: "dev"
 3 | metric_list:
 4 |   - metric: gpt_eval_score
 5 |     aggregation: !function cn_utils.mmbench_aggregate_dev_results_eval
 6 |     higher_is_better: true
 7 |   - metric: submission
 8 |     higher_is_better: true
 9 |     aggregation: !function cn_utils.mmbench_aggregate_dev_results
10 | include: _default_template_mmbench_cn_yaml
11 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmbench/mmbench_cn_test.yaml:
--------------------------------------------------------------------------------
1 | task: mmbench_cn_test
2 | test_split: test
3 | metric_list:
4 |   - metric: submission
5 |     aggregation: !function cn_utils.mmbench_aggregate_test_results
6 |     higher_is_better: true
7 | include: _default_template_mmbench_cn_yaml
8 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmbench/mmbench_en.yaml:
--------------------------------------------------------------------------------
1 | group: mmbench_en
2 | task:
3 |   - mmbench_en_dev
4 |   - mmbench_en_test
5 | metadata:
6 |   version: 0.0
7 |   sys_prompt: "There are several options:"
8 |   gpt_eval_model_name: "gpt-3.5-turbo"
9 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmbench/mmbench_en_dev.yaml:
--------------------------------------------------------------------------------
 1 | task: "mmbench_en_dev"
 2 | test_split: dev
 3 | include: _default_template_mmbench_en_yaml
 4 | metric_list:
 5 |   - metric: gpt_eval_score
 6 |     aggregation: !function en_utils.mmbench_aggregate_dev_results_eval
 7 |     higher_is_better: true
 8 |   - metric: submission
 9 |     aggregation: !function en_utils.mmbench_aggregate_dev_results_submission
10 |     higher_is_better: true


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmbench/mmbench_en_test.yaml:
--------------------------------------------------------------------------------
1 | task: "mmbench_en_test"
2 | test_split: test
3 | include: _default_template_mmbench_en_yaml
4 | metric_list:
5 |   - metric: submission
6 |     aggregation: !function en_utils.mmbench_aggregate_test_results
7 |     higher_is_better: true
8 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmbench/mmbench_ru_dev.yaml:
--------------------------------------------------------------------------------
 1 | task: "mmbench_ru_dev"
 2 | test_split: dev
 3 | include: _default_template_mmbench_ru_yaml
 4 | metric_list:
 5 |   - metric: gpt_eval_score
 6 |     aggregation: !function ru_utils.mmbench_aggregate_dev_results_eval
 7 |     higher_is_better: true
 8 |   - metric: submission
 9 |     aggregation: !function ru_utils.mmbench_aggregate_dev_results_submission
10 |     higher_is_better: true


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmmu/arial.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MCG-NJU/p-MoD/30c16ecf45e119142a703958c8b1ec349e3891f2/lmms-eval/lmms_eval/tasks/mmmu/arial.ttf


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmmu/mmmu.yaml:
--------------------------------------------------------------------------------
1 | group: mmmu
2 | task:
3 | - mmmu_val
4 | - mmmu_test
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmmu/mmmu_group_img.yaml:
--------------------------------------------------------------------------------
1 | group: mmmu_group_img
2 | task:
3 | - mmmu_val_group_img
4 | - mmmu_test_group_img
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmmu/mmmu_group_img_test.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/MMMU
 2 | task: "mmmu_test_group_img"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils_group_img.mmmu_doc_to_visual
 6 | doc_to_text: !function utils_group_img.mmmu_doc_to_text
 7 | doc_to_target: "answer"
 8 | # The return value of process_results will be used by metrics
 9 | process_results: !function utils_group_img.mmmu_process_results
10 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
11 | generation_kwargs:
12 |   max_new_tokens: 16
13 |   image_aspect_ratio: original
14 | metric_list:
15 |   - metric: submission
16 |     aggregation: !function utils_group_img.mmmu_test_aggregate_results_for_submission
17 |     higher_is_better: true
18 | metadata:
19 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmmu/mmmu_group_img_val.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/MMMU
 2 | task: "mmmu_val_group_img"
 3 | test_split: validation
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils_group_img.mmmu_doc_to_visual
 6 | doc_to_text: !function utils_group_img.mmmu_doc_to_text
 7 | doc_to_target: "answer"
 8 | # The return value of process_results will be used by metrics
 9 | process_results: !function utils_group_img.mmmu_process_results
10 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
11 | generation_kwargs:
12 |   max_new_tokens: 128
13 | model_specific_generation_kwargs:
14 |   llava:
15 |     image_aspect_ratio: original
16 | metric_list:
17 |   - metric: mmmu_acc
18 |     aggregation: !function utils_group_img.mmmu_aggregate_results
19 |     higher_is_better: true
20 | metadata:
21 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmmu/mmmu_test.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/MMMU
 2 | task: "mmmu_test"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.mmmu_doc_to_visual
 6 | doc_to_text: !function utils.mmmu_doc_to_text
 7 | doc_to_target: "answer"
 8 | # The return value of process_results will be used by metrics
 9 | process_results: !function utils.mmmu_process_results
10 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
11 | generation_kwargs:
12 |   max_new_tokens: 16
13 |   image_aspect_ratio: original
14 | metric_list:
15 |   - metric: submission
16 |     aggregation: !function utils.mmmu_test_aggregate_results_for_submission
17 |     higher_is_better: true
18 | metadata:
19 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmmu/mmmu_val.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/MMMU
 2 | task: "mmmu_val"
 3 | test_split: validation
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.mmmu_doc_to_visual
 6 | doc_to_text: !function utils.mmmu_doc_to_text
 7 | doc_to_target: "answer"
 8 | # The return value of process_results will be used by metrics
 9 | process_results: !function utils.mmmu_process_results
10 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
11 | generation_kwargs:
12 |   max_new_tokens: 128
13 | model_specific_generation_kwargs:
14 |   llava:
15 |     image_aspect_ratio: original
16 | metric_list:
17 |   - metric: mmmu_acc
18 |     aggregation: !function utils.mmmu_aggregate_results
19 |     higher_is_better: true
20 | metadata:
21 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmupd/_default_template_mmupd_yaml:
--------------------------------------------------------------------------------
 1 | # Shared settings for the MM-UPD task configs that reference this file via `include`.
 2 | dataset_path: MM-UPD/MM-UPD
 3 | doc_to_visual: !function utils.mmupd_doc_to_visual
 4 | doc_to_text: !function utils.mmupd_doc_to_text
 5 | doc_to_target: "answer"
 6 | process_results: !function utils.mmupd_process_results
 7 | model_specific_generation_kwargs:
 8 |   llava:
 9 |     image_aspect_ratio: original
10 | output_type: generate_until
11 | generation_kwargs:
12 |   until:
13 |     - "ASSISTANT:"
14 |   max_new_tokens: 1024
15 |   temperature: 0
16 |   top_p: 0
17 |   num_beams: 1
18 |   do_sample: false
19 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmupd/mmaad_base.yaml:
--------------------------------------------------------------------------------
 1 | task: "mmaad_base"
 2 | test_split: test
 3 | dataset_name: mmaad_base
 4 | model_specific_prompt_kwargs:
 5 |   default:
 6 |     pre_prompt: ""
 7 |     post_prompt: "\n"
 8 | include: _default_template_mmupd_yaml
 9 | metric_list:
10 |   - metric: gpt_eval_score
11 |     aggregation: !function utils.mmaad_base
12 |     higher_is_better: true


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmupd/mmaad_instruction.yaml:
--------------------------------------------------------------------------------
 1 | task: "mmaad_instruction"
 2 | test_split: test
 3 | dataset_name: mmaad_base
 4 | model_specific_prompt_kwargs:
 5 |   default:
 6 |     pre_prompt: ""
 7 |     post_prompt: "\nIf all the options are incorrect, answer \"F. None of the above\"."
 8 | include: _default_template_mmupd_yaml
 9 | metric_list:
10 |   - metric: gpt_eval_score
11 |     aggregation: !function utils.mmaad_instruction
12 |     higher_is_better: true


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmupd/mmaad_option.yaml:
--------------------------------------------------------------------------------
 1 | task: "mmaad_option"
 2 | test_split: test
 3 | dataset_name: mmaad_option
 4 | model_specific_prompt_kwargs:
 5 |   default:
 6 |     pre_prompt: ""
 7 |     post_prompt: "\nAnswer with the option's letter from the given choices directly."
 8 | include: _default_template_mmupd_yaml
 9 | metric_list:
10 |   - metric: gpt_eval_score
11 |     aggregation: !function utils.mmaad_option
12 |     higher_is_better: true


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmupd/mmiasd_base.yaml:
--------------------------------------------------------------------------------
 1 | task: "mmiasd_base"
 2 | test_split: test
 3 | dataset_name: mmiasd_base
 4 | model_specific_prompt_kwargs:
 5 |   default:
 6 |     pre_prompt: ""
 7 |     post_prompt: "\n"
 8 | include: _default_template_mmupd_yaml
 9 | metric_list:
10 |   - metric: gpt_eval_score
11 |     aggregation: !function utils.mmiasd_base
12 |     higher_is_better: true


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmupd/mmiasd_instruction.yaml:
--------------------------------------------------------------------------------
 1 | task: "mmiasd_instruction"
 2 | test_split: test
 3 | dataset_name: mmiasd_base
 4 | model_specific_prompt_kwargs:
 5 |   default:
 6 |     pre_prompt: ""
 7 |     post_prompt: "\nIf all the options are incorrect, answer \"F. None of the above\"."
 8 | include: _default_template_mmupd_yaml
 9 | metric_list:
10 |   - metric: gpt_eval_score
11 |     aggregation: !function utils.mmiasd_instruction
12 |     higher_is_better: true


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmupd/mmiasd_option.yaml:
--------------------------------------------------------------------------------
 1 | task: "mmiasd_option"
 2 | test_split: test
 3 | dataset_name: mmiasd_option
 4 | model_specific_prompt_kwargs:
 5 |   default:
 6 |     pre_prompt: ""
 7 |     post_prompt: "\nAnswer with the option's letter from the given choices directly."
 8 | include: _default_template_mmupd_yaml
 9 | metric_list:
10 |   - metric: gpt_eval_score
11 |     aggregation: !function utils.mmiasd_option
12 |     higher_is_better: true


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmupd/mmivqd_base.yaml:
--------------------------------------------------------------------------------
 1 | task: "mmivqd_base"
 2 | test_split: test
 3 | dataset_name: mmivqd_base
 4 | model_specific_prompt_kwargs:
 5 |   default:
 6 |     pre_prompt: ""
 7 |     post_prompt: "\n"
 8 | include: _default_template_mmupd_yaml
 9 | metric_list:
10 |   - metric: gpt_eval_score
11 |     aggregation: !function utils.mmivqd_base
12 |     higher_is_better: true


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmupd/mmivqd_instruction.yaml:
--------------------------------------------------------------------------------
 1 | task: "mmivqd_instruction"
 2 | test_split: test
 3 | dataset_name: mmivqd_base
 4 | model_specific_prompt_kwargs:
 5 |   default:
 6 |     pre_prompt: ""
 7 |     post_prompt: "\nIf the given image is irrelevant to the question, answer \"F. The image and question are irrelevant.\"."
 8 | include: _default_template_mmupd_yaml
 9 | metric_list:
10 |   - metric: gpt_eval_score
11 |     aggregation: !function utils.mmivqd_instruction
12 |     higher_is_better: true


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmupd/mmivqd_option.yaml:
--------------------------------------------------------------------------------
 1 | task: "mmivqd_option"
 2 | test_split: test
 3 | dataset_name: mmivqd_option
 4 | model_specific_prompt_kwargs:
 5 |   default:
 6 |     pre_prompt: ""
 7 |     post_prompt: "\nAnswer with the option's letter from the given choices directly."
 8 | include: _default_template_mmupd_yaml
 9 | metric_list:
10 |   - metric: gpt_eval_score
11 |     aggregation: !function utils.mmivqd_option
12 |     higher_is_better: true


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmupd/mmupd.yaml:
--------------------------------------------------------------------------------
 1 | group: mmupd
 2 | task:
 3 |   - mmaad_base
 4 |   - mmaad_option
 5 |   - mmaad_instruction
 6 |   - mmiasd_base
 7 |   - mmiasd_option
 8 |   - mmiasd_instruction
 9 |   - mmivqd_base
10 |   - mmivqd_option
11 |   - mmivqd_instruction
12 | metadata:
13 |   version: 0.0
14 |   sys_prompt: ""
15 |   gpt_eval_model_name: "gpt-3.5-turbo-0125"


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmupd/mmupd_base.yaml:
--------------------------------------------------------------------------------
 1 | group: mmupd_base
 2 | task:
 3 |   - mmaad_base
 4 |   - mmiasd_base
 5 |   - mmivqd_base
 6 | metadata:
 7 |   version: 0.0
 8 |   sys_prompt: ""
 9 |   gpt_eval_model_name: "gpt-3.5-turbo-0125"
10 |   


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmupd/mmupd_instruction.yaml:
--------------------------------------------------------------------------------
1 | group: mmupd_instruction
2 | task:
3 |   - mmaad_instruction
4 |   - mmiasd_instruction
5 |   - mmivqd_instruction
6 | metadata:
7 |   version: 0.0
8 |   sys_prompt: ""
9 |   gpt_eval_model_name: "gpt-3.5-turbo-0125"


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmupd/mmupd_option.yaml:
--------------------------------------------------------------------------------
1 | group: mmupd_option
2 | task:
3 |   - mmaad_option
4 |   - mmiasd_option
5 |   - mmivqd_option
6 | metadata:
7 |   version: 0.0
8 |   sys_prompt: ""
9 |   gpt_eval_model_name: "gpt-3.5-turbo-0125"


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmvet/mmvet.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/MMVet
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "mmvet"
 5 | test_split: test
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.mmvet_doc_to_visual
 8 | doc_to_text: !function utils.doc_to_text # Such that {{question}} will be replaced by doc["question"]
 9 | doc_to_target: "{{answer}}"
10 | generation_kwargs:
11 |   until:
12 |     - "ASSISTANT:"
13 |   max_new_tokens: 32768
14 |   temperature: 0
15 |   top_p: 1.0
16 |   num_beams: 1
17 |   do_sample: false
18 | process_results: !function utils.mmvet_process_results # apply gpt eval here
19 | metric_list:
20 |   - metric: gpt_eval_score
21 |     aggregation: !function utils.mmvet_aggregate_results
22 |     higher_is_better: true
23 | metadata:
24 |   version: 0.0
25 |   gpt_eval_model_name: "gpt-4-0613"
26 | model_specific_prompt_kwargs:
27 |   default:
28 |     pre_prompt: "Please think step by step and try to provide best answer to the following question: \n\n"
29 |     post_prompt: ""
30 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multidocvqa/multidocvqa.yaml:
--------------------------------------------------------------------------------
1 | group: multidocvqa
2 | task:
3 | - multidocvqa_val
4 | - multidocvqa_test
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multidocvqa/multidocvqa_test.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/MP-DocVQA
 2 | task: "multidocvqa_test"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.multidocvqa_doc_to_visual
 6 | doc_to_text: !function utils.multidocvqa_doc_to_text
 7 | doc_to_target: "answers"
 8 | generation_kwargs:
 9 |   max_new_tokens: 32
10 |   temperature: 0
11 |   do_sample: False
12 | process_results: !function utils.multidocvqa_process_test_results_for_submission
13 | metric_list:
14 |   - metric: submission
15 |     aggregation: !function utils.multidocvqa_test_aggregate_results_for_submission
16 | model_specific_prompt_kwargs:
17 |   default:
18 |     pre_prompt: ""
19 |     post_prompt: "\nAnswer the question using a single word or phrase."
20 |   


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multidocvqa/multidocvqa_val.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/MP-DocVQA
 2 | task: "multidocvqa_val"
 3 | test_split: val
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.multidocvqa_doc_to_visual
 6 | doc_to_text: !function utils.multidocvqa_doc_to_text
 7 | doc_to_target: "answers"
 8 | generation_kwargs:
 9 |   max_new_tokens: 32
10 |   temperature: 0
11 |   do_sample: False
12 | process_results: !function utils.multidocvqa_process_results
13 | metric_list:
14 |   - metric: anls
15 |     aggregation: !function utils.multidocvqa_aggregate_results_anls
16 |     higher_is_better: true
17 |   - metric: accuracy
18 |     aggregation: !function utils.multidocvqa_aggregate_results_accuracy
19 |     higher_is_better: true
20 | model_specific_prompt_kwargs:
21 |   default:
22 |     pre_prompt: ""
23 |     post_prompt: "\nAnswer the question using a single word or phrase."
24 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/README.md:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/_default_template.yaml:
--------------------------------------------------------------------------------
 1 | test_split: train
 2 | output_type: generate_until
 3 | doc_to_visual: !function utils.llava_doc_to_visual
 4 | doc_to_text: !function utils.llava_doc_to_text
 5 | doc_to_target: "gpt_answer"
 6 | generation_kwargs:
 7 |   until:
 8 |     - "ASSISTANT:"
 9 |   image_aspect_ratio: original
10 |   max_new_tokens: 1024
11 |   temperature: 0
12 |   top_p: 0
13 |   num_beams: 1
14 |   do_sample: false
15 | process_results: !function utils.llava_process_results
16 | metric_list:
17 |   - metric: gpt_eval_llava_all
18 |     aggregation: !function utils.llava_all_aggregation
19 |     higher_is_better: true
20 |   - metric: gpt_eval_llava_conv
21 |     aggregation: !function utils.llava_conv_aggregation
22 |     higher_is_better: true
23 |   - metric: gpt_eval_llava_detail
24 |     aggregation: !function utils.llava_detail_aggregation
25 |     higher_is_better: true
26 |   - metric: gpt_eval_llava_complex
27 |     aggregation: !function utils.llava_complex_aggregation
28 |     higher_is_better: true
29 | metadata:
30 |   version: 0.0
31 |   gpt_eval_model_name: "gpt-4-0613"
32 | model_specific_prompt_kwargs:
33 |   default:
34 |     pre_prompt: ""
35 |     post_prompt: ""


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/arabic_llava_in_the_wild.yaml:
--------------------------------------------------------------------------------
1 | dataset_path: "gagan3012/multilingual-llava-bench"
2 | dataset_kwargs:
3 |     config: arabic
4 |     token: True
5 | task: "llava_in_the_wild_arabic"
6 | include: _default_template.yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/bengali_llava_in_the_wild.yaml:
--------------------------------------------------------------------------------
1 | dataset_path: "gagan3012/multilingual-llava-bench"
2 | dataset_kwargs:
3 |     config: bengali
4 |     token: True
5 | task: "llava_in_the_wild_bengali"
6 | include: _default_template.yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/chinese_llava_in_the_wild.yaml:
--------------------------------------------------------------------------------
1 | dataset_path: "gagan3012/multilingual-llava-bench"
2 | dataset_kwargs:
3 |     config: chinese
4 |     token: True
5 | task: "llava_in_the_wild_chinese"
6 | include: _default_template.yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/french_llava_in_the_wild.yaml:
--------------------------------------------------------------------------------
1 | dataset_path: "gagan3012/multilingual-llava-bench"
2 | dataset_kwargs:
3 |     config: french
4 |     token: True
5 | task: "llava_in_the_wild_french"
6 | include: _default_template.yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/hindi_llava_in_the_wild.yaml:
--------------------------------------------------------------------------------
1 | dataset_path: "gagan3012/multilingual-llava-bench"
2 | dataset_kwargs:
3 |     config: hindi
4 |     token: True
5 | task: "llava_in_the_wild_hindi"
6 | include: _default_template.yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/japanese_llava_in_the_wild.yaml:
--------------------------------------------------------------------------------
1 | dataset_path: "gagan3012/multilingual-llava-bench"
2 | dataset_kwargs:
3 |     config: japanese
4 |     token: True
5 | task: "llava_in_the_wild_japanese"
6 | include: _default_template.yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/russian_llava_in_the_wild.yaml:
--------------------------------------------------------------------------------
1 | dataset_path: "gagan3012/multilingual-llava-bench"
2 | dataset_kwargs:
3 |     config: russian
4 |     token: True
5 | task: "llava_in_the_wild_russian"
6 | include: _default_template.yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/spanish_llava_in_the_wild.yaml:
--------------------------------------------------------------------------------
1 | dataset_path: "gagan3012/multilingual-llava-bench"
2 | dataset_kwargs:
3 |   config: spanish
4 |   token: True
5 | task: "llava_in_the_wild_spanish"
6 | include: _default_template.yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/urdu_llava_in_the_wild.yaml:
--------------------------------------------------------------------------------
1 | dataset_path: "gagan3012/multilingual-llava-bench"
2 | dataset_kwargs:
3 |     config: urdu
4 |     token: True
5 | task: "llava_in_the_wild_urdu"
6 | include: _default_template.yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/nextqa/_default_template_yaml:
--------------------------------------------------------------------------------
1 | dataset_path: lmms-lab/NExTQA
2 | dataset_kwargs:
3 |   token: True
4 |   video: True
5 |   cache_dir: nextqa
6 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/nextqa/nextqa.yaml:
--------------------------------------------------------------------------------
1 | group: nextqa
2 | task:
3 | - nextqa_oe_test
4 | - nextqa_oe_val
5 | - nextqa_mc_test
6 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/nextqa/nextqa_mc_test.yaml:
--------------------------------------------------------------------------------
 1 | task: "nextqa_mc_test"
 2 | dataset_name: MC
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.nextqa_doc_to_visual
 6 | doc_to_text: !function utils.nextqa_doc_to_text_mc
 7 | doc_to_target: !function utils.nextqa_doc_to_target
 8 | process_results: !function utils.nextqa_mc_process_results
 9 | metric_list:
10 |   - metric: exact_match
11 |     aggregation: mean
12 |     higher_is_better: true
13 | include: _default_template_yaml
14 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/nextqa/nextqa_oe_test.yaml:
--------------------------------------------------------------------------------
 1 | task: "nextqa_oe_test"
 2 | dataset_name: OE
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.nextqa_doc_to_visual
 6 | doc_to_text: !function utils.nextqa_doc_to_text
 7 | doc_to_target: !function utils.nextqa_doc_to_target
 8 | process_results: !function utils.nextqa_process_results
 9 | metric_list:
10 |   - metric: WUPS
11 |     aggregation: !function utils.nextqa_aggregate_results
12 |     higher_is_better: true
13 | include: _default_template_yaml
14 | model_specific_prompt_kwargs:
15 |   default:
16 |     pre_prompt: ""
17 |     post_prompt: "\nAnswer a question using a short phrase or sentence."
18 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/nextqa/nextqa_oe_val.yaml:
--------------------------------------------------------------------------------
 1 | task: "nextqa_oe_val"
 2 | dataset_name: OE
 3 | test_split: validation
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.nextqa_doc_to_visual
 6 | doc_to_text: !function utils.nextqa_doc_to_text
 7 | doc_to_target: !function utils.nextqa_doc_to_target
 8 | process_results: !function utils.nextqa_process_results
 9 | metric_list:
10 |   - metric: WUPS
11 |     aggregation: !function utils.nextqa_aggregate_results
12 |     higher_is_better: true
13 | include: _default_template_yaml
14 | model_specific_prompt_kwargs:
15 |   default:
16 |     pre_prompt: ""
17 |     post_prompt: "\nAnswer a question using a short phrase or sentence."
18 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/nocaps/_default_template_nocaps_yaml:
--------------------------------------------------------------------------------
1 | model_specific_prompt_kwargs:
2 |   default:
3 |     prompt: "Provide a one-sentence caption for the provided image."


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/nocaps/nocaps.yaml:
--------------------------------------------------------------------------------
1 | group : nocaps
2 | task:
3 |   - nocaps_test
4 |   - nocaps_val


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/nocaps/nocaps_test.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/NoCaps
 2 | dataset_kwargs:
 3 |   token: True
 4 | task : "nocaps_test"
 5 | group : "nocaps_caption"
 6 | test_split: test
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.nocaps_doc_to_visual
 9 | doc_to_text: !function utils.nocaps_doc_to_text
10 | doc_to_target: "annotations_captions"
11 | generation_kwargs:
12 |   max_new_tokens: 64
13 |   temperature: 0
14 |   top_p: 1.0
15 |   num_beams: 1
16 |   do_sample: false
17 | process_results: !function utils.nocaps_test_process_result
18 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
19 | metric_list:
20 |   - metric: nocaps_passthrough 
21 |     aggregation : !function utils.nocaps_test_aggregation_result
22 |     higher_is_better : true
23 | metadata:
24 |   - version: 0.0
25 | include: _default_template_nocaps_yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/ocrbench/ocrbench.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: echo840/OCRBench
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "ocrbench"
 5 | test_split: test
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.ocrbench_doc_to_visual
 8 | doc_to_text: !function utils.ocrbench_doc_to_text
 9 | doc_to_target: "answer"
10 | generation_kwargs:
11 |   max_new_tokens: 128
12 |   temperature: 0
13 |   top_p: 1.0
14 |   num_beams: 1
15 |   do_sample: false
16 | process_results: !function utils.ocrbench_process_results
17 | metric_list:
18 |   - metric: ocrbench_accuracy
19 |     aggregation: !function utils.ocrbench_aggregate_accuracy
20 |     higher_is_better: true
21 | metadata:
22 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/ok_vqa/_default_template_vqa_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/OK-VQA
 2 | output_type: generate_until
 3 | doc_to_visual: !function utils.ok_vqa_doc_to_visual
 4 | doc_to_text: !function utils.ok_vqa_doc_to_text
 5 | doc_to_target: "answer"
 6 | generation_kwargs:
 7 |   until:
 8 |     - "ASSISTANT:"
 9 | metric_list:
10 |   - metric: exact_match
11 |     aggregation: mean
12 |     higher_is_better: true
13 |     ignore_case: true
14 |     ignore_punctuation: true
15 |   - metric: submission
16 |     aggregation: !function utils.ok_vqa_aggregate_submissions
17 |     higher_is_better: true
18 | process_results: !function utils.ok_vqa_process_results
19 | model_specific_prompt_kwargs:
20 |   default:
21 |     pre_prompt: ""
22 |     post_prompt: "\nWhen the provided information is insufficient, respond with 'Unanswerable'.\nAnswer the question using a single word or phrase."
23 | metadata:
24 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/ok_vqa/_generate_config.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import yaml
 3 | 
 4 | splits = ["val2014"]
 5 | tasks = ["vqa"]
 6 | 
 7 | if __name__ == "__main__":
 8 |     dump_tasks = []
 9 |     for task in tasks:
10 |         for split in splits:
11 |             yaml_dict = {"group": f"ok_vqa", "task": f"ok_vqa_{split}", "include": f"_default_template_{task}_yaml", "test_split": split}
12 |             if split == "train":
13 |                 yaml_dict.pop("group")
14 |             else:
15 |                 dump_tasks.append(f"ok_vqa_{split}")
16 | 
17 |             save_path = f"./ok_vqa_{split}.yaml"
18 |             print(f"Saving to {save_path}")
19 |             with open(save_path, "w") as f:
20 |                 yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False)
21 | 
22 |     group_dict = {"group": "ok_vqa", "task": dump_tasks}
23 | 
24 |     with open("./_ok_vqa.yaml", "w") as f:
25 |         yaml.dump(group_dict, f, default_flow_style=False, indent=4)
26 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/ok_vqa/_ok_vqa.yaml:
--------------------------------------------------------------------------------
1 | group: ok_vqa
2 | task:
3 | - ok_vqa_val2014


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/ok_vqa/ok_vqa_val2014.yaml:
--------------------------------------------------------------------------------
1 | group: ok_vqa
2 | task: ok_vqa_val2014
3 | test_split: val2014
4 | include: _default_template_vqa_yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/olympiadbench/olympiadbench.yaml:
--------------------------------------------------------------------------------
1 | group: olympiadbench
2 | task:
3 | - olympiadbench_test_en
4 | - olympiadbench_test_cn
5 | metadata:
6 |   - version: 0.0
7 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/olympiadbench/olympiadbench_test_cn.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/OlympiadBench
 2 | dataset_kwargs:
 3 |   token: True
 4 | task : "olympiadbench_test_cn"
 5 | test_split: test_cn
 6 | output_type: generate_until
 7 | doc_to_visual: !function cn_utils.olympiadbench_doc_to_visual
 8 | doc_to_text: !function cn_utils.olympiadbench_doc_to_text
 9 | doc_to_target: "answer"
10 | generation_kwargs:
11 |   until:
12 |     - "ASSISTANT:"
13 |   max_new_tokens: 1024
14 |   temperature: 0
15 |   top_p: 1.0
16 |   num_beams: 1
17 |   do_sample: false
18 | process_results: !function cn_utils.olympiadbench_process_results
19 | metric_list:
20 |   - metric: submission
21 |     aggregation: !function cn_utils.olympiadbench_aggregate_results
22 |     higher_is_better: true
23 |   - metric: exact_match
24 |     aggregation: mean
25 |     higher_is_better: true


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/olympiadbench/olympiadbench_test_en.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/OlympiadBench
 2 | dataset_kwargs:
 3 |   token: True
 4 | task : "olympiadbench_test_en"
 5 | test_split: test_en
 6 | output_type: generate_until
 7 | doc_to_visual: !function en_utils.olympiadbench_doc_to_visual
 8 | doc_to_text: !function en_utils.olympiadbench_doc_to_text
 9 | doc_to_target: "answer"
10 | generation_kwargs:
11 |   until:
12 |     - "ASSISTANT:"
13 |   max_new_tokens: 1024
14 |   temperature: 0
15 |   top_p: 1.0
16 |   num_beams: 1
17 |   do_sample: false
18 | process_results: !function en_utils.olympiadbench_process_results
19 | metric_list:
20 |   - metric: submission
21 |     aggregation: !function en_utils.olympiadbench_aggregate_results
22 |     higher_is_better: true
23 |   - metric: exact_match
24 |     aggregation: mean
25 |     higher_is_better: true


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/perceptiontest/test/_default_template_yaml:
--------------------------------------------------------------------------------
1 | dataset_path: lmms-lab/PerceptionTest
2 | dataset_kwargs:
3 |   token: True
4 |   video: True
5 |   cache_dir: perceptiontest
6 | model_specific_prompt_kwargs:
7 |   default:
8 |     pre_prompt: ""
9 |     post_prompt: ""


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/perceptiontest/test/perceptiontest_mc.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "mc_question"
 2 | task: "perceptiontest_test_mc"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.perceptiontest_doc_to_visual
 6 | doc_to_text: !function utils.perceptiontest_doc_to_text
 7 | doc_to_target: !function utils.perceptiontest_doc_to_answer_mc
 8 | process_results: !function utils.perceptiontest_process_results_mc
 9 | metric_list:
10 |   - metric: submission
11 |     aggregation: !function utils.perceptiontest_aggregate_mc
12 |     higher_is_better: true
13 | include: _default_template_yaml
14 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/perceptiontest/test/perceptiontest_mcppl.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "mc_question"
 2 | task: "perceptiontest_test_mcppl"
 3 | test_split: test
 4 | output_type: multiple_choice
 5 | doc_to_visual: !function utils.perceptiontest_doc_to_visual
 6 | doc_to_text: "question"
 7 | doc_to_target: !function utils.perceptiontest_doc_to_answer_mc
 8 | doc_to_choice: !function utils.perceptiontest_doc_to_choice
 9 | process_results: !function utils.perceptiontest_process_results_mc_ppl
10 | metric_list:
11 |   - metric: submission
12 |     aggregation: !function utils.perceptiontest_aggregate_mc_ppl
13 |     higher_is_better: true
14 | include: _default_template_yaml
15 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/perceptiontest/val/_default_template_yaml:
--------------------------------------------------------------------------------
1 | dataset_path: lmms-lab/PerceptionTest_Val
2 | dataset_kwargs:
3 |   token: True
4 |   video: True
5 |   cache_dir: perceptiontest_val
6 | model_specific_prompt_kwargs:
7 |   default:
8 |     pre_prompt: ""
9 |     post_prompt: ""


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/perceptiontest/val/perceptiontest_mc.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "mc_question_val"
 2 | task: "perceptiontest_val_mc"
 3 | test_split: validation
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.perceptiontest_val_doc_to_visual
 6 | doc_to_text: !function utils.perceptiontest_val_doc_to_text
 7 | doc_to_target: !function utils.perceptiontest_val_doc_to_answer
 8 | process_results: !function utils.perceptiontest_val_process_results_mc
 9 | metric_list:
10 |   - metric: accuracy
11 |     aggregation: !function utils.perceptiontest_val_aggregate_accuracy
12 |     higher_is_better: true
13 | include: _default_template_yaml
14 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/perceptiontest/val/perceptiontest_mcppl.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "mc_question_val"
 2 | task: "perceptiontest_val_mcppl"
 3 | test_split: validation
 4 | output_type: multiple_choice
 5 | doc_to_visual: !function utils.perceptiontest_val_doc_to_visual
 6 | doc_to_text: "question"
 7 | doc_to_target: !function utils.perceptiontest_val_doc_to_answer
 8 | doc_to_choice: !function utils.perceptiontest_val_doc_to_choice
 9 | process_results: !function utils.perceptiontest_val_process_results_mc_ppl
10 | metric_list:
11 |   - metric: accuracy
12 |     aggregation: !function utils.perceptiontest_val_aggregate_accuracy
13 |     higher_is_better: true
14 | include: _default_template_yaml
15 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/pope/pope.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/POPE
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "pope"
 5 | test_split: test
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.pope_doc_to_visual
 8 | doc_to_text: !function utils.pope_doc_to_text
 9 | doc_to_target: "answer"
10 | generation_kwargs:
11 |   max_new_tokens: 128
12 |   temperature: 0
13 |   top_p: 1.0
14 |   num_beams: 1
15 |   do_sample: false
16 | process_results: !function utils.pope_process_results
17 | metric_list:
18 |   - metric: pope_accuracy
19 |     aggregation: !function utils.pope_aggregate_accuracy
20 |     higher_is_better: true
21 |   - metric: pope_precision
22 |     aggregation: !function utils.pope_aggregate_precision
23 |     higher_is_better: true
24 |   - metric: pope_recall
25 |     aggregation: !function utils.pope_aggregate_recall
26 |     higher_is_better: true
27 |   - metric: pope_f1_score
28 |     aggregation: !function utils.pope_aggregate_f1_score
29 |     higher_is_better: true
30 |   - metric: pope_yes_ratio
31 |     aggregation: !function utils.pope_aggregate_yes_ratio
32 |     higher_is_better: true
33 | metadata:
34 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/pope/pope_adv.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/POPE
 2 | dataset_name: Full
 3 | dataset_kwargs:
 4 |   token: True
 5 | task: "pope_adv"
 6 | test_split: adversarial 
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.pope_doc_to_visual
 9 | doc_to_text: !function utils.pope_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   max_new_tokens: 128
13 |   temperature: 0
14 |   top_p: 0
15 |   num_beams: 1
16 |   do_sample: false
17 | process_results: !function utils.pope_process_results
18 | metric_list:
19 |   - metric: pope_accuracy
20 |     aggregation: !function utils.pope_aggregate_accuracy
21 |     higher_is_better: true
22 |   - metric: pope_precision
23 |     aggregation: !function utils.pope_aggregate_precision
24 |     higher_is_better: true
25 |   - metric: pope_recall
26 |     aggregation: !function utils.pope_aggregate_recall
27 |     higher_is_better: true
28 |   - metric: pope_f1_score
29 |     aggregation: !function utils.pope_aggregate_f1_score
30 |     higher_is_better: true
31 |   - metric: pope_yes_ratio
32 |     aggregation: !function utils.pope_aggregate_yes_ratio
33 |     higher_is_better: true
34 | metadata:
35 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/pope/pope_full.yaml:
--------------------------------------------------------------------------------
1 | group : pope_full
2 | task:
3 |   - pope_adv
4 |   - pope_pop
5 |   - pope_random


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/pope/pope_pop.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/POPE
 2 | dataset_name: Full
 3 | dataset_kwargs:
 4 |   token: True
 5 | task: "pope_pop"
 6 | test_split: popular 
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.pope_doc_to_visual
 9 | doc_to_text: !function utils.pope_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   max_new_tokens: 128
13 |   temperature: 0
14 |   top_p: 0
15 |   num_beams: 1
16 |   do_sample: false
17 | process_results: !function utils.pope_process_results
18 | metric_list:
19 |   - metric: pope_accuracy
20 |     aggregation: !function utils.pope_aggregate_accuracy
21 |     higher_is_better: true
22 |   - metric: pope_precision
23 |     aggregation: !function utils.pope_aggregate_precision
24 |     higher_is_better: true
25 |   - metric: pope_recall
26 |     aggregation: !function utils.pope_aggregate_recall
27 |     higher_is_better: true
28 |   - metric: pope_f1_score
29 |     aggregation: !function utils.pope_aggregate_f1_score
30 |     higher_is_better: true
31 |   - metric: pope_yes_ratio
32 |     aggregation: !function utils.pope_aggregate_yes_ratio
33 |     higher_is_better: true
34 | metadata:
35 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/pope/pope_random.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/POPE
 2 | dataset_name: Full
 3 | dataset_kwargs:
 4 |   token: True
 5 | task: "pope_random"
 6 | test_split: random 
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.pope_doc_to_visual
 9 | doc_to_text: !function utils.pope_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   max_new_tokens: 128
13 |   temperature: 0
14 |   top_p: 0
15 |   num_beams: 1
16 |   do_sample: false
17 | process_results: !function utils.pope_process_results
18 | metric_list:
19 |   - metric: pope_accuracy
20 |     aggregation: !function utils.pope_aggregate_accuracy
21 |     higher_is_better: true
22 |   - metric: pope_precision
23 |     aggregation: !function utils.pope_aggregate_precision
24 |     higher_is_better: true
25 |   - metric: pope_recall
26 |     aggregation: !function utils.pope_aggregate_recall
27 |     higher_is_better: true
28 |   - metric: pope_f1_score
29 |     aggregation: !function utils.pope_aggregate_f1_score
30 |     higher_is_better: true
31 |   - metric: pope_yes_ratio
32 |     aggregation: !function utils.pope_aggregate_yes_ratio
33 |     higher_is_better: true
34 | metadata:
35 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/qbench/abench_dev.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: q-future/A-Bench-HF
 2 | task: "abench_dev"
 3 | test_split: dev
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.q_bench_doc_to_visual
 6 | doc_to_text: !function utils.q_bench_doc_to_text
 7 | doc_to_target: "correct_choice"
 8 | generation_kwargs:
 9 |   max_new_tokens: 32
10 |   temperature: 0
11 |   do_sample: False
12 | process_results: !function utils.a_bench_process_results
13 | metric_list:
14 |   - metric: abench_acc
15 |     aggregation: !function utils.a_bench_aggregate_results
16 |     higher_is_better: true
17 | 
18 | model_specific_prompt_kwargs:
19 |   default:
20 |     pre_prompt: ""
21 |     post_prompt: "Answer with the option's letter from the given choices directly.\n"
22 |   


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/qbench/qbench2_dev.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: q-future/Q-Bench2-HF
 2 | task: "qbench2_dev"
 3 | test_split: dev
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.q_bench_doc_to_visual
 6 | doc_to_text: !function utils.q_bench_doc_to_text
 7 | doc_to_target: "correct_choice"
 8 | generation_kwargs:
 9 |   max_new_tokens: 32
10 |   temperature: 0
11 |   do_sample: False
12 | process_results: !function utils.q_bench_process_results
13 | metric_list:
14 |   - metric: qbench_acc
15 |     aggregation: !function utils.q_bench_aggregate_results
16 |     higher_is_better: true
17 | 
18 | model_specific_prompt_kwargs:
19 |   default:
20 |     pre_prompt: ""
21 |     post_prompt: "Answer with the option's letter from the given choices directly.\n"
22 |   


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/qbench/qbench_dev.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: q-future/Q-Bench-HF
 2 | task: "qbench_dev"
 3 | test_split: dev
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.q_bench_doc_to_visual
 6 | doc_to_text: !function utils.q_bench_doc_to_text
 7 | doc_to_target: "correct_choice"
 8 | generation_kwargs:
 9 |   max_new_tokens: 32
10 |   temperature: 0
11 |   do_sample: False
12 | process_results: !function utils.q_bench_process_results
13 | metric_list:
14 |   - metric: qbench_acc
15 |     aggregation: !function utils.q_bench_aggregate_results
16 |     higher_is_better: true
17 | 
18 | model_specific_prompt_kwargs:
19 |   default:
20 |     pre_prompt: ""
21 |     post_prompt: "Answer with the option's letter from the given choices directly.\n"
22 |   


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/qbench/qbenchs_dev.yaml:
--------------------------------------------------------------------------------
1 | group: qbenchs_dev
2 | task:
3 | - qbench_dev
4 | - qbench2_dev
5 | - abench_dev
6 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/realworldqa/realworldqa.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/RealWorldQA
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "realworldqa"
 5 | test_split: test
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.realworldqa_doc_to_visual
 8 | doc_to_text: !function utils.realworldqa_doc_to_text
 9 | doc_to_target: "answer"
10 | 
11 | generation_kwargs:
12 |   max_new_tokens: 16
13 |   temperature: 0
14 |   top_p: 1.0
15 |   num_beams: 1
16 |   do_sample: false
17 | 
18 | filter_list:
19 |   - name: "flexible-extract"
20 |     filter:
21 |       - function: !function utils.NumberWordsToDigitsFilter
22 |       - function: !function utils.MultiChoiceRegexFilter
23 |         group_select: 0
24 |         ignore_case: true
25 |         ignore_punctuation: true
26 |         regex_pattern: "(\\([A-Z]\\))"
27 | 
28 | metric_list:
29 |   - metric: exact_match
30 |     aggregation: mean
31 |     higher_is_better: true
32 |     ignore_case: true
33 |     ignore_punctuation: true
34 |       
35 | model_specific_prompt_kwargs:
36 |   default:
37 |     pre_prompt: ""
38 |     post_prompt: ""
39 |   gpt4v:
40 |     pre_prompt: ""
41 |     post_prompt: ""
42 |   xcomposer2_4khd:
43 |     pre_prompt: "[UNUSED_TOKEN_146]user\nQuestion: "
44 |     post_prompt: "[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\nThe answer is"
45 | metadata:
46 |   - version: 0.0
47 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco+/_default_template_bbox_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/RefCOCOplus
 2 | output_type: generate_until
 3 | doc_to_visual: !function utils.refcoco_bbox_doc_to_visual
 4 | doc_to_text: !function utils.refcoco_doc_to_text
 5 | doc_to_target: "answer"
 6 | generation_kwargs:
 7 |   until:
 8 |     - "ASSISTANT:"
 9 | process_results: !function utils.refcoco_process_result
10 | metric_list:
11 |   - metric: refcoco_Bleu_4 
12 |     aggregation : !function utils.refcoco_bleu4
13 |     higher_is_better : true
14 |   - metric: refcoco_Bleu_3
15 |     aggregation : !function utils.refcoco_bleu3
16 |     higher_is_better : true
17 |   - metric: refcoco_Bleu_2
18 |     aggregation : !function utils.refcoco_bleu2
19 |     higher_is_better : true
20 |   - metric: refcoco_Bleu_1
21 |     aggregation : !function utils.refcoco_bleu1
22 |     higher_is_better : true
23 |   - metric: refcoco_METEOR
24 |     aggregation : !function utils.refcoco_meteor
25 |     higher_is_better : true
26 |   - metric: refcoco_ROUGE_L
27 |     aggregation : !function utils.refcoco_rougel
28 |     higher_is_better : true
29 |   - metric: refcoco_CIDEr
30 |     aggregation : !function utils.refcoco_cider
31 |     higher_is_better : true
32 |   #- metric: refcoco_SPICE
33 |   #  aggregation : !function utils.refcoco_spice
34 |   #  higher_is_better : true
35 | metadata:
36 |   version: '0.0'


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco+/_default_template_seg_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/RefCOCOplus
 2 | output_type: generate_until
 3 | doc_to_visual: !function utils.refcoco_seg_doc_to_visual
 4 | doc_to_text: !function utils.refcoco_doc_to_text
 5 | doc_to_target: "answer"
 6 | generation_kwargs:
 7 |   until:
 8 |     - "ASSISTANT:"
 9 | process_results: !function utils.refcoco_process_result
10 | metric_list:
11 |   - metric: refcoco_Bleu_4 
12 |     aggregation : !function utils.refcoco_bleu4
13 |     higher_is_better : true
14 |   - metric: refcoco_Bleu_3
15 |     aggregation : !function utils.refcoco_bleu3
16 |     higher_is_better : true
17 |   - metric: refcoco_Bleu_2
18 |     aggregation : !function utils.refcoco_bleu2
19 |     higher_is_better : true
20 |   - metric: refcoco_Bleu_1
21 |     aggregation : !function utils.refcoco_bleu1
22 |     higher_is_better : true
23 |   - metric: refcoco_METEOR
24 |     aggregation : !function utils.refcoco_meteor
25 |     higher_is_better : true
26 |   - metric: refcoco_ROUGE_L
27 |     aggregation : !function utils.refcoco_rougel
28 |     higher_is_better : true
29 |   - metric: refcoco_CIDEr
30 |     aggregation : !function utils.refcoco_cider
31 |     higher_is_better : true
32 |   #- metric: refcoco_SPICE
33 |   #  aggregation : !function utils.refcoco_spice
34 |   #  higher_is_better : true
35 | metadata:
36 |   version: '0.0'


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco+/_generate_config.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import yaml
 3 | 
 4 | # splits = ["train", "val", "testA", "testB"]
 5 | splits = ["val", "testA", "testB"]
 6 | tasks = ["seg", "bbox"]
 7 | 
 8 | if __name__ == "__main__":
 9 |     dump_tasks = []
10 |     for task in tasks:
11 |         for split in splits:
12 |             yaml_dict = {"group": f"refcoco+_{task}", "task": f"refcoco+_{task}_{split}", "include": f"_default_template_{task}_yaml", "test_split": split}
13 |             if split == "train":
14 |                 yaml_dict.pop("group")
15 |             else:
16 |                 dump_tasks.append(f"refcoco_{task}_{split}")
17 | 
18 |             save_path = f"./refcoco+_{task}_{split}.yaml"
19 |             print(f"Saving to {save_path}")
20 |             with open(save_path, "w") as f:
21 |                 yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False)
22 | 
23 |     group_dict = {"group": "refcoco+", "task": dump_tasks}
24 | 
25 |     with open("./_refcoco.yaml", "w") as f:
26 |         yaml.dump(group_dict, f, default_flow_style=False, indent=4)
27 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco+/_refcoco.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco+
2 | task:
3 | - refcoco+_seg_val
4 | - refcoco+_seg_testA
5 | - refcoco+_seg_testB
6 | - refcoco+_bbox_val
7 | - refcoco+_bbox_testA
8 | - refcoco+_bbox_testB
9 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_testA.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco+_bbox
2 | task: refcoco+_bbox_testA
3 | include: _default_template_bbox_yaml
4 | test_split: testA
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_testB.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco+_bbox
2 | task: refcoco+_bbox_testB
3 | include: _default_template_bbox_yaml
4 | test_split: testB
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_val.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco+_bbox
2 | task: refcoco+_bbox_val
3 | include: _default_template_bbox_yaml
4 | test_split: val
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_seg_testA.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco+_seg
2 | task: refcoco+_seg_testA
3 | include: _default_template_seg_yaml
4 | test_split: testA
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_seg_testB.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco+_seg
2 | task: refcoco+_seg_testB
3 | include: _default_template_seg_yaml
4 | test_split: testB
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_seg_val.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco+_seg
2 | task: refcoco+_seg_val
3 | include: _default_template_seg_yaml
4 | test_split: val
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/_default_template_bbox_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/RefCOCO
 2 | output_type: generate_until
 3 | doc_to_visual: !function utils.refcoco_bbox_doc_to_visual
 4 | doc_to_text: !function utils.refcoco_doc_to_text
 5 | doc_to_target: "answer"
 6 | generation_kwargs:
 7 |   until:
 8 |     - "ASSISTANT:"
 9 | process_results: !function utils.refcoco_process_result
10 | metric_list:
11 |   - metric: refcoco_Bleu_4 
12 |     aggregation : !function utils.refcoco_bleu4
13 |     higher_is_better : true
14 |   - metric: refcoco_Bleu_3
15 |     aggregation : !function utils.refcoco_bleu3
16 |     higher_is_better : true
17 |   - metric: refcoco_Bleu_2
18 |     aggregation : !function utils.refcoco_bleu2
19 |     higher_is_better : true
20 |   - metric: refcoco_Bleu_1
21 |     aggregation : !function utils.refcoco_bleu1
22 |     higher_is_better : true
23 |   - metric: refcoco_METEOR
24 |     aggregation : !function utils.refcoco_meteor
25 |     higher_is_better : true
26 |   - metric: refcoco_ROUGE_L
27 |     aggregation : !function utils.refcoco_rougel
28 |     higher_is_better : true
29 |   - metric: refcoco_CIDEr
30 |     aggregation : !function utils.refcoco_cider
31 |     higher_is_better : true
32 |   #- metric: refcoco_SPICE
33 |   #  aggregation : !function utils.refcoco_spice
34 |   #  higher_is_better : true
35 | metadata:
36 |   version: '0.0'


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/_default_template_seg_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/RefCOCO
 2 | output_type: generate_until
 3 | doc_to_visual: !function utils.refcoco_seg_doc_to_visual
 4 | doc_to_text: !function utils.refcoco_doc_to_text
 5 | doc_to_target: "answer"
 6 | generation_kwargs:
 7 |   until:
 8 |     - "ASSISTANT:"
 9 | process_results: !function utils.refcoco_process_result
10 | metric_list:
11 |   - metric: refcoco_Bleu_4 
12 |     aggregation : !function utils.refcoco_bleu4
13 |     higher_is_better : true
14 |   - metric: refcoco_Bleu_3
15 |     aggregation : !function utils.refcoco_bleu3
16 |     higher_is_better : true
17 |   - metric: refcoco_Bleu_2
18 |     aggregation : !function utils.refcoco_bleu2
19 |     higher_is_better : true
20 |   - metric: refcoco_Bleu_1
21 |     aggregation : !function utils.refcoco_bleu1
22 |     higher_is_better : true
23 |   - metric: refcoco_METEOR
24 |     aggregation : !function utils.refcoco_meteor
25 |     higher_is_better : true
26 |   - metric: refcoco_ROUGE_L
27 |     aggregation : !function utils.refcoco_rougel
28 |     higher_is_better : true
29 |   - metric: refcoco_CIDEr
30 |     aggregation : !function utils.refcoco_cider
31 |     higher_is_better : true
32 |   #- metric: refcoco_SPICE
33 |   #  aggregation : !function utils.refcoco_spice
34 |   #  higher_is_better : true
35 | metadata:
36 |   version: '0.0'


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/_generate_config.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import yaml
 3 | 
 4 | # splits = ["train", "test", "val", "testA", "testB"]
 5 | splits = ["test", "val", "testA", "testB"]
 6 | tasks = ["seg", "bbox"]
 7 | 
 8 | if __name__ == "__main__":
 9 |     dump_tasks = []
10 |     for task in tasks:
11 |         for split in splits:
12 |             yaml_dict = {"group": f"refcoco_{task}", "task": f"refcoco_{task}_{split}", "test_split": split, "include": f"_default_template_{task}_yaml"}
13 |             if split == "train":
14 |                 yaml_dict.pop("group")
15 |             else:
16 |                 dump_tasks.append(f"refcoco_{task}_{split}")
17 | 
18 |             save_path = f"./refcoco_{task}_{split}.yaml"
19 |             print(f"Saving to {save_path}")
20 |             with open(save_path, "w") as f:
21 |                 yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False)
22 | 
23 |     group_dict = {"group": "refcoco", "task": dump_tasks}
24 | 
25 |     with open("./_refcoco.yaml", "w") as f:
26 |         yaml.dump(group_dict, f, default_flow_style=False, indent=4)
27 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/_refcoco.yaml:
--------------------------------------------------------------------------------
 1 | group: refcoco
 2 | task:
 3 | - refcoco_seg_test
 4 | - refcoco_seg_val
 5 | - refcoco_seg_testA
 6 | - refcoco_seg_testB
 7 | - refcoco_bbox_test
 8 | - refcoco_bbox_val
 9 | - refcoco_bbox_testA
10 | - refcoco_bbox_testB
11 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_test.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco_bbox
2 | task: refcoco_bbox_test
3 | test_split: test
4 | include: _default_template_bbox_yaml
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_testA.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco_bbox
2 | task: refcoco_bbox_testA
3 | test_split: testA
4 | include: _default_template_bbox_yaml
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_testB.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco_bbox
2 | task: refcoco_bbox_testB
3 | test_split: testB
4 | include: _default_template_bbox_yaml
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_val.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco_bbox
2 | task: refcoco_bbox_val
3 | test_split: val
4 | include: _default_template_bbox_yaml
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_test.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco_seg
2 | task: refcoco_seg_test
3 | test_split: test
4 | include: _default_template_seg_yaml
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_testA.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco_seg
2 | task: refcoco_seg_testA
3 | test_split: testA
4 | include: _default_template_seg_yaml
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_testB.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco_seg
2 | task: refcoco_seg_testB
3 | test_split: testB
4 | include: _default_template_seg_yaml
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_val.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco_seg
2 | task: refcoco_seg_val
3 | test_split: val
4 | include: _default_template_seg_yaml
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcocog/_default_template_bbox_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/RefCOCOg
 2 | output_type: generate_until
 3 | doc_to_visual: !function utils.refcoco_bbox_doc_to_visual
 4 | doc_to_text: !function utils.refcoco_doc_to_text
 5 | doc_to_target: "answer"
 6 | generation_kwargs:
 7 |   until:
 8 |     - "ASSISTANT:"
 9 | process_results: !function utils.refcoco_process_result
10 | metric_list:
11 |   - metric: refcoco_Bleu_4 
12 |     aggregation : !function utils.refcoco_bleu4
13 |     higher_is_better : true
14 |   - metric: refcoco_Bleu_3
15 |     aggregation : !function utils.refcoco_bleu3
16 |     higher_is_better : true
17 |   - metric: refcoco_Bleu_2
18 |     aggregation : !function utils.refcoco_bleu2
19 |     higher_is_better : true
20 |   - metric: refcoco_Bleu_1
21 |     aggregation : !function utils.refcoco_bleu1
22 |     higher_is_better : true
23 |   - metric: refcoco_METEOR
24 |     aggregation : !function utils.refcoco_meteor
25 |     higher_is_better : true
26 |   - metric: refcoco_ROUGE_L
27 |     aggregation : !function utils.refcoco_rougel
28 |     higher_is_better : true
29 |   - metric: refcoco_CIDEr
30 |     aggregation : !function utils.refcoco_cider
31 |     higher_is_better : true
32 |   #- metric: refcoco_SPICE
33 |   #  aggregation : !function utils.refcoco_spice
34 |   #  higher_is_better : true
35 | metadata:
36 |   version: '0.0'


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcocog/_default_template_seg_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/RefCOCOg
 2 | output_type: generate_until
 3 | doc_to_visual: !function utils.refcoco_seg_doc_to_visual
 4 | doc_to_text: !function utils.refcoco_doc_to_text
 5 | doc_to_target: "answer"
 6 | generation_kwargs:
 7 |   until:
 8 |     - "ASSISTANT:"
 9 | process_results: !function utils.refcoco_process_result
10 | metric_list:
11 |   - metric: refcoco_Bleu_4 
12 |     aggregation : !function utils.refcoco_bleu4
13 |     higher_is_better : true
14 |   - metric: refcoco_Bleu_3
15 |     aggregation : !function utils.refcoco_bleu3
16 |     higher_is_better : true
17 |   - metric: refcoco_Bleu_2
18 |     aggregation : !function utils.refcoco_bleu2
19 |     higher_is_better : true
20 |   - metric: refcoco_Bleu_1
21 |     aggregation : !function utils.refcoco_bleu1
22 |     higher_is_better : true
23 |   - metric: refcoco_METEOR
24 |     aggregation : !function utils.refcoco_meteor
25 |     higher_is_better : true
26 |   - metric: refcoco_ROUGE_L
27 |     aggregation : !function utils.refcoco_rougel
28 |     higher_is_better : true
29 |   - metric: refcoco_CIDEr
30 |     aggregation : !function utils.refcoco_cider
31 |     higher_is_better : true
32 |   #- metric: refcoco_SPICE
33 |   #  aggregation : !function utils.refcoco_spice
34 |   #  higher_is_better : true
35 | metadata:
36 |   version: '0.0'


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcocog/_generate_config.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import yaml
 3 | 
 4 | # splits = ["train", "test", "val"]
 5 | splits = ["test", "val"]
 6 | tasks = ["seg", "bbox"]
 7 | 
 8 | if __name__ == "__main__":
 9 |     dump_tasks = []
10 |     for task in tasks:
11 |         for split in splits:
12 |             yaml_dict = {"group": f"refcocog_{task}", "task": f"refcocog_{task}_{split}", "include": f"_default_template_{task}_yaml", "test_split": split}
13 |             if split == "train":
14 |                 yaml_dict.pop("group")
15 |             else:
16 |                 dump_tasks.append(f"refcoco_{task}_{split}")
17 | 
18 |             save_path = f"./refcocog_{task}_{split}.yaml"
19 |             print(f"Saving to {save_path}")
20 |             with open(save_path, "w") as f:
21 |                 yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False)
22 | 
23 |     group_dict = {"group": "refcocog", "task": dump_tasks}
24 | 
25 |     with open("./_refcoco.yaml", "w") as f:
26 |         yaml.dump(group_dict, f, default_flow_style=False, indent=4)
27 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcocog/_refcoco.yaml:
--------------------------------------------------------------------------------
1 | group: refcocog
2 | task:
3 | - refcocog_seg_test
4 | - refcocog_seg_val
5 | - refcocog_bbox_test
6 | - refcocog_bbox_val
7 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcocog/refcocog_bbox_test.yaml:
--------------------------------------------------------------------------------
1 | group: refcocog_bbox
2 | task: refcocog_bbox_test
3 | include: _default_template_bbox_yaml
4 | test_split: test
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcocog/refcocog_bbox_val.yaml:
--------------------------------------------------------------------------------
1 | group: refcocog_bbox
2 | task: refcocog_bbox_val
3 | include: _default_template_bbox_yaml
4 | test_split: val
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcocog/refcocog_seg_test.yaml:
--------------------------------------------------------------------------------
1 | group: refcocog_seg
2 | task: refcocog_seg_test
3 | include: _default_template_seg_yaml
4 | test_split: test
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcocog/refcocog_seg_val.yaml:
--------------------------------------------------------------------------------
1 | group: refcocog_seg
2 | task: refcocog_seg_val
3 | include: _default_template_seg_yaml
4 | test_split: val
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/scienceqa/scienceqa.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/ScienceQA
 2 | dataset_name: ScienceQA-FULL
 3 | task: "scienceqa"
 4 | dataset_kwargs:
 5 |   token: True
 6 | test_split: test
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.sqa_doc_to_visual
 9 | doc_to_text: !function utils.sqa_doc_to_text
10 | doc_to_target: !function utils.sqa_doc_to_target
11 | generation_kwargs:
12 |   max_new_tokens: 16
13 |   temperature: 0
14 |   do_sample: False
15 | metric_list:
16 |   - metric: exact_match
17 |     aggregation: mean
18 |     higher_is_better: true
19 |     ignore_case: true
20 |     ignore_punctuation: true
21 | process_results: !function utils.sqa_process_results
22 | metadata:
23 |   - version: 0.0
24 | 
25 | model_specific_prompt_kwargs:
26 |   default:
27 |     format: default
28 |     pre_prompt: ""
29 |     post_prompt: "\nAnswer with the option's letter from the given choices directly."
30 |   qwen_vl:
31 |     format: qwen_vl
32 |   
33 | model_specific_generation_kwargs:
34 |   llava:
35 |     image_aspect_ratio: original
36 |   
37 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/scienceqa/scienceqa_full.yaml:
--------------------------------------------------------------------------------
1 | group: scienceqa_full
2 | task:
3 |   - scienceqa
4 |   - scienceqa_img


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/scienceqa/scienceqa_img.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/ScienceQA
 2 | dataset_name: ScienceQA-IMG
 3 | task: "scienceqa_img"
 4 | dataset_kwargs:
 5 |   token: True
 6 | test_split: test
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.sqa_doc_to_visual
 9 | doc_to_text: !function utils.sqa_doc_to_text
10 | doc_to_target: !function utils.sqa_doc_to_target
11 | generation_kwargs:
12 |   max_new_tokens: 16
13 |   temperature: 0
14 |   do_sample: False
15 | metric_list:
16 |   - metric: exact_match
17 |     aggregation: mean
18 |     higher_is_better: true
19 |     ignore_case: true
20 |     ignore_punctuation: true
21 | process_results: !function utils.sqa_process_results
22 | metadata:
23 |   - version: 0.0
24 | 
25 | model_specific_prompt_kwargs:
26 |   default:
27 |     format: default
28 |     pre_prompt: ""
29 |     post_prompt: "\nAnswer with the option's letter from the given choices directly."
30 |   qwen_vl:
31 |     format: qwen_vl
32 |   idefics2:
33 |     format: default
34 |     pre_prompt: ""
35 |     post_prompt: "\nAnswer:"
36 | model_specific_generation_kwargs:
37 |   llava:
38 |     image_aspect_ratio: original
39 |   
40 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/screenspot/_default_template_reg_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: rootsautomation/ScreenSpot
 2 | output_type: generate_until
 3 | doc_to_visual: !function utils.screenspot_bbox_doc_to_visual
 4 | doc_to_text: !function utils.screenspot_doc_to_text
 5 | doc_to_target: "instruction"
 6 | generation_kwargs:
 7 |   until:
 8 |     - "ASSISTANT:"
 9 | process_results: !function utils.screenspot_process_result
10 | metric_list:
11 |   - metric: screenspot_CIDEr
12 |     aggregation : !function utils.screenspot_cider
13 |     higher_is_better : true
14 | metadata:
15 |   version: '0.0'


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/screenspot/_screenspot.yaml:
--------------------------------------------------------------------------------
1 | group: screenspot
2 | task:
3 | - screenspot_reg_test
4 | - screenspot_rec_test


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/screenspot/screenspot_rec_test.yaml:
--------------------------------------------------------------------------------
1 | group: screenspot_rec
2 | task: screenspot_rec_test
3 | include: _default_template_rec_yaml
4 | test_split: test
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/screenspot/screenspot_reg_test.yaml:
--------------------------------------------------------------------------------
1 | group: screenspot_reg
2 | task: screenspot_reg_test
3 | include: _default_template_reg_yaml
4 | test_split: test
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/seedbench/seedbench.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/SEED-Bench
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "seedbench"
 5 | test_split: test
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.seed_doc_to_visual
 8 | doc_to_text: !function utils.seed_doc_to_text
 9 | doc_to_target: "answer"
10 | generation_kwargs:
11 |   until:
12 |     - "ASSISTANT:"
13 |   image_aspect_ratio: original
14 | # The return value of process_results will be used by metrics
15 | process_results: !function utils.seed_process_result
16 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
17 | metric_list:
18 |   - metric: seed_image
19 |     aggregation: !function utils.seed_aggregation_result
20 |     higher_is_better: true
21 |   - metric: seed_video
22 |     aggregation: !function utils.seed_aggregation_result
23 |     higher_is_better: true
24 |   - metric: seed_all
25 |     aggregation: !function utils.seed_aggregation_result
26 |     higher_is_better: true
27 | metadata:
28 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/seedbench/seedbench_ppl.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/SEED-Bench
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "seedbench_ppl"
 5 | test_split: test
 6 | output_type: multiple_choice
 7 | doc_to_visual: !function utils.seed_doc_to_visual
 8 | doc_to_text: !function utils.seed_doc_to_text_mc
 9 | doc_to_choice : !function utils.seed_doc_to_choice
10 | doc_to_target: !function utils.seed_doc_to_mc_target
11 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
12 | metric_list:
13 |   - metric: acc
14 | metadata:
15 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/stvqa/stvqa.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/ST-VQA
 2 | task: "stvqa"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.stvqa_doc_to_visual
 6 | doc_to_text: !function utils.stvqa_doc_to_text
 7 | doc_to_target: "answers"
 8 | generation_kwargs:
 9 |   max_new_tokens: 32
10 |   temperature: 0
11 |   do_sample: False
12 | process_results: !function utils.stvqa_process_results
13 | metric_list:
14 |   - metric: submission
15 |     aggregation: !function utils.stvqa_aggregate_submissions
16 | model_specific_prompt_kwargs:
17 |   default:
18 |     pre_prompt: ""
19 |     post_prompt: "\nAnswer the question using a single word or phrase."
20 |   


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/stvqa/utils.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from loguru import logger
 3 | 
 4 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
 5 | 
 6 | 
 7 | def stvqa_doc_to_text(doc, model_specific_prompt_kwargs):
 8 |     question = doc["question"]
 9 |     pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
10 |     post_prompt = model_specific_prompt_kwargs["post_prompt"]
11 |     return f"{pre_prompt}{question}{post_prompt}"
12 | 
13 | 
14 | def stvqa_doc_to_visual(doc):
15 |     return [doc["image"].convert("RGB")]
16 | 
17 | 
18 | def stvqa_process_results(doc, results):
19 |     answer = results[0]
20 |     return {"submission": {"question_id": int(doc["question_id"]), "answer": answer}}
21 | 
22 | 
23 | def stvqa_aggregate_submissions(results, args):
24 |     file = generate_submission_file("stvqa_test_for_submission.json", args)
25 |     with open(file, "w") as f:
26 |         json.dump(results, f)
27 |     logger.info(f"Results saved to {file}")
28 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/synthdog/synthdog.yaml:
--------------------------------------------------------------------------------
1 | group: synthdog
2 | task:
3 | - synthdog_en
4 | - synthdog_zh


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/synthdog/synthdog_en.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: naver-clova-ix/synthdog-en
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "synthdog_en"
 5 | test_split: validation
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.synthdog_doc_to_visual
 8 | doc_to_text: OCR this image section by section, from top to bottom, and left to right. Do not insert line breaks in the output text. If a word is split due to a line break in the image, use a space instead.
 9 | doc_to_target: !function utils.synthdog_doc_to_target
10 | generation_kwargs:
11 |   max_new_tokens: 1024
12 |   temperature: 0
13 |   top_p: 1.0
14 |   num_beams: 1
15 |   do_sample: false
16 | process_results: !function utils.synthdog_process_results
17 | metric_list:
18 |   - metric: tree_edit_distance
19 |     aggregation: !function utils.synthdog_aggregate_ted
20 |     higher_is_better: true
21 | metadata:
22 |   - version: 0.0
23 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/synthdog/synthdog_zh.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: naver-clova-ix/synthdog-zh
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "synthdog_zh"
 5 | test_split: validation
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.synthdog_doc_to_visual
 8 | doc_to_text: OCR this image section by section, from top to bottom, and left to right. Do not insert line breaks in the output text. If a word is split due to a line break in the image, use a space instead.
 9 | doc_to_target: !function utils.synthdog_doc_to_target
10 | generation_kwargs:
11 |   max_new_tokens: 1024
12 |   temperature: 0
13 |   top_p: 1.0
14 |   num_beams: 1
15 |   do_sample: false
16 | process_results: !function utils.synthdog_process_results
17 | metric_list:
18 |   - metric: tree_edit_distance
19 |     aggregation: !function utils.synthdog_aggregate_ted
20 |     higher_is_better: true
21 | metadata:
22 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/tempcompass/_default_template_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/TempCompass
 2 | dataset_kwargs:
 3 |   token: True
 4 |   video: True
 5 |   cache_dir: tempcompass
 6 | model_specific_prompt_kwargs:
 7 |   default:
 8 |     pre_prompt: ""
 9 |     post_prompt: {
10 |     "multi-choice": "\nPlease directly give the best option:",
11 |     "yes_no": "\nPlease answer yes or no:",
12 |     "caption_matching": "\nPlease directly give the best option:",
13 |     "captioning": ""
14 | }


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/tempcompass/_tempcompass.yaml:
--------------------------------------------------------------------------------
1 | group: tempcompass
2 | task:
3 | - tempcompass_multi_choice
4 | - tempcompass_yes_no
5 | - tempcompass_caption_matching
6 | - tempcompass_captioning
7 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/tempcompass/tempcompass_caption_matching.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "caption_matching"
 2 | task: "tempcompass_caption_matching"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.tempcompass_doc_to_visual
 6 | doc_to_text: !function utils.tempcompass_doc_to_text_caption_matching
 7 | doc_to_target: !function utils.tempcompass_doc_to_answer
 8 | process_results: !function utils.tempcompass_process_results_caption_matching
 9 | metric_list:
10 |   - metric: avg_accuracy
11 |     aggregation: !function utils.tempcompass_aggregate_rating
12 |     higher_is_better: true
13 |   - metric: speed_accuracy
14 |     aggregation: !function utils.tempcompass_aggregate_rating
15 |     higher_is_better: true
16 |   - metric: direction_accuracy
17 |     aggregation: !function utils.tempcompass_aggregate_rating
18 |     higher_is_better: true
19 |   - metric: action_accuracy
20 |     aggregation: !function utils.tempcompass_aggregate_rating
21 |     higher_is_better: true
22 |   - metric: order_accuracy
23 |     aggregation: !function utils.tempcompass_aggregate_rating
24 |     higher_is_better: true
25 |   - metric: attribute_change_accuracy
26 |     aggregation: !function utils.tempcompass_aggregate_rating
27 |     higher_is_better: true
28 | include: _default_template_yaml
29 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/tempcompass/tempcompass_captioning.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "captioning"
 2 | task: "tempcompass_captioning"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.tempcompass_doc_to_visual
 6 | doc_to_text: !function utils.tempcompass_doc_to_text_captioning
 7 | doc_to_target: !function utils.tempcompass_doc_to_answer
 8 | process_results: !function utils.tempcompass_process_results_captioning
 9 | metric_list:
10 |   - metric: avg_accuracy
11 |     aggregation: !function utils.tempcompass_aggregate_rating
12 |     higher_is_better: true
13 |   - metric: speed_accuracy
14 |     aggregation: !function utils.tempcompass_aggregate_rating
15 |     higher_is_better: true
16 |   - metric: direction_accuracy
17 |     aggregation: !function utils.tempcompass_aggregate_rating
18 |     higher_is_better: true
19 |   - metric: action_accuracy
20 |     aggregation: !function utils.tempcompass_aggregate_rating
21 |     higher_is_better: true
22 |   - metric: order_accuracy
23 |     aggregation: !function utils.tempcompass_aggregate_rating
24 |     higher_is_better: true
25 |   - metric: attribute_change_accuracy
26 |     aggregation: !function utils.tempcompass_aggregate_rating
27 |     higher_is_better: true
28 | include: _default_template_yaml
29 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/tempcompass/tempcompass_mc.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "multi-choice"
 2 | task: "tempcompass_multi_choice"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.tempcompass_doc_to_visual
 6 | doc_to_text: !function utils.tempcompass_doc_to_text_multi_choice
 7 | doc_to_target: !function utils.tempcompass_doc_to_answer
 8 | process_results: !function utils.tempcompass_process_results_multi_choice
 9 | metric_list:
10 |   - metric: avg_accuracy
11 |     aggregation: !function utils.tempcompass_aggregate_rating
12 |     higher_is_better: true
13 |   - metric: speed_accuracy
14 |     aggregation: !function utils.tempcompass_aggregate_rating
15 |     higher_is_better: true
16 |   - metric: direction_accuracy
17 |     aggregation: !function utils.tempcompass_aggregate_rating
18 |     higher_is_better: true
19 |   - metric: action_accuracy
20 |     aggregation: !function utils.tempcompass_aggregate_rating
21 |     higher_is_better: true
22 |   - metric: order_accuracy
23 |     aggregation: !function utils.tempcompass_aggregate_rating
24 |     higher_is_better: true
25 |   - metric: attribute_change_accuracy
26 |     aggregation: !function utils.tempcompass_aggregate_rating
27 |     higher_is_better: true
28 | include: _default_template_yaml
29 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/tempcompass/tempcompass_yes_no.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "yes_no"
 2 | task: "tempcompass_yes_no"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.tempcompass_doc_to_visual
 6 | doc_to_text: !function utils.tempcompass_doc_to_text_yes_no
 7 | doc_to_target: !function utils.tempcompass_doc_to_answer
 8 | process_results: !function utils.tempcompass_process_results_yes_no
 9 | metric_list:
10 |   - metric: avg_accuracy
11 |     aggregation: !function utils.tempcompass_aggregate_rating
12 |     higher_is_better: true
13 |   - metric: speed_accuracy
14 |     aggregation: !function utils.tempcompass_aggregate_rating
15 |     higher_is_better: true
16 |   - metric: direction_accuracy
17 |     aggregation: !function utils.tempcompass_aggregate_rating
18 |     higher_is_better: true
19 |   - metric: action_accuracy
20 |     aggregation: !function utils.tempcompass_aggregate_rating
21 |     higher_is_better: true
22 |   - metric: order_accuracy
23 |     aggregation: !function utils.tempcompass_aggregate_rating
24 |     higher_is_better: true
25 |   - metric: attribute_change_accuracy
26 |     aggregation: !function utils.tempcompass_aggregate_rating
27 |     higher_is_better: true
28 | include: _default_template_yaml
29 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/textcaps/_default_template_textcaps_yaml:
--------------------------------------------------------------------------------
1 | model_specific_prompt_kwargs:
2 |   default:
3 |     prompt: Provide a one-sentence caption for the provided image.


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/textcaps/textcaps.yaml:
--------------------------------------------------------------------------------
1 | group : textcaps
2 | task:
3 |   - textcaps_val
4 |   - textcaps_test


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/textcaps/textcaps_test.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/TextCaps
 2 | dataset_kwargs:
 3 |   token: True
 4 | task : "textcaps_test"
 5 | group : "textcaps_caption"
 6 | test_split: test
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.textcaps_doc_to_visual
 9 | doc_to_text: !function utils.textcaps_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   max_new_tokens: 64
13 |   temperature: 0
14 |   top_p: 1.0
15 |   num_beams: 1
16 |   do_sample: false
17 | process_results: !function utils.textcaps_test_process_result
18 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
19 | metric_list:
20 |   - metric: textcaps_passthrough 
21 |     aggregation : !function utils.textcaps_test_aggregation_result
22 |     higher_is_better : true
23 | metadata:
24 |   - version: 0.0
25 | include: _default_template_textcaps_yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/textvqa/_default_template_textvqa_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/textvqa
 2 | output_type: generate_until
 3 | doc_to_visual: !function utils.textvqa_doc_to_visual
 4 | doc_to_text: !function utils.textvqa_doc_to_text
 5 | doc_to_target: "answer"
 6 | generation_kwargs:
 7 |   until:
 8 |     - "ASSISTANT:"
 9 | process_results: !function utils.textvqa_process_results
10 | model_specific_prompt_kwargs:
11 |   default:
12 |     pre_prompt: ""
13 |     post_prompt: "\nAnswer the question using a single word or phrase."
14 |     ocr: false
15 |   qwen_vl:
16 |     pre_prompt: ""
17 |     post_prompt: " Answer:"
18 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/textvqa/_textvqa.yaml:
--------------------------------------------------------------------------------
1 | group: textvqa
2 | task:
3 | - textvqa_val
4 | - textvqa_test


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/textvqa/textvqa_test.yaml:
--------------------------------------------------------------------------------
1 | task: textvqa_test
2 | test_split: test
3 | metric_list:
4 |   - metric: submission
5 |     aggregation: !function utils.textvqa_aggregate_submissions
6 |     higher_is_better: true
7 | include: _default_template_textvqa_yaml
8 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/textvqa/textvqa_val.yaml:
--------------------------------------------------------------------------------
 1 | task: textvqa_val
 2 | test_split: validation
 3 | metric_list:
 4 |   - metric: exact_match
 5 |     aggregation: mean
 6 |     higher_is_better: true
 7 |     ignore_case: true
 8 |     ignore_punctuation: true
 9 |   - metric: submission
10 |     aggregation: !function utils.textvqa_aggregate_submissions
11 |     higher_is_better: true
12 | include: _default_template_textvqa_yaml
13 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vatex/_vatex.yaml:
--------------------------------------------------------------------------------
1 | group : vatex
2 | task:
3 | - vatex_val_zh
4 | - vatex_test


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vcr_wiki/_default_template_vcr_yaml:
--------------------------------------------------------------------------------
 1 | 
 2 | dataset_kwargs:
 3 |   token: True
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.vcr_doc_to_visual
 6 | doc_to_text: !function utils.vcr_doc_to_text
 7 | doc_to_target: "answer"
 8 | generation_kwargs:
 9 |   max_new_tokens: 120
10 |   temperature: 0
11 |   top_p: 0
12 |   num_beams: 1
13 |   do_sample: false
14 | # The return value of process_results will be used by metrics
15 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
16 | metadata:
17 |   - version: 0.0.1


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vcr_wiki/vcr_wiki_en_easy.yaml:
--------------------------------------------------------------------------------
 1 | "include": "_default_template_vcr_yaml"
 2 | dataset_path: vcr-org/VCR-wiki-en-easy-test
 3 | task: "vcr_wiki_en_easy"
 4 | test_split: test
 5 | process_results: !function utils.vcr_en_process_results
 6 | metric_list:
 7 |   - metric: jaccard
 8 |     aggregation: !function utils.vcr_aggregate_jaccard
 9 |     higher_is_better: true
10 |   - metric: exact_match
11 |     aggregation: !function utils.vcr_aggregate_exact_match
12 |     higher_is_better: true
13 | model_specific_prompt_kwargs:
14 |   default:
15 |     pre_prompt: ""
16 |     post_prompt: "What is the covered texts in the image? Please restore the covered texts without outputting the explanations."


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vcr_wiki/vcr_wiki_en_easy_100.yaml:
--------------------------------------------------------------------------------
 1 | "include": "_default_template_vcr_yaml"
 2 | dataset_path: vcr-org/VCR-wiki-en-easy-test-100
 3 | task: "vcr_wiki_en_easy_100"
 4 | test_split: test
 5 | process_results: !function utils.vcr_en_process_results
 6 | metric_list:
 7 |   - metric: jaccard
 8 |     aggregation: !function utils.vcr_aggregate_jaccard
 9 |     higher_is_better: true
10 |   - metric: exact_match
11 |     aggregation: !function utils.vcr_aggregate_exact_match
12 |     higher_is_better: true
13 | model_specific_prompt_kwargs:
14 |   default:
15 |     pre_prompt: ""
16 |     post_prompt: "What is the covered texts in the image? Please restore the covered texts without outputting the explanations."


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vcr_wiki/vcr_wiki_en_easy_500.yaml:
--------------------------------------------------------------------------------
 1 | "include": "_default_template_vcr_yaml"
 2 | dataset_path: vcr-org/VCR-wiki-en-easy-test-500
 3 | task: "vcr_wiki_en_easy_500"
 4 | test_split: test
 5 | process_results: !function utils.vcr_en_process_results
 6 | metric_list:
 7 |   - metric: jaccard
 8 |     aggregation: !function utils.vcr_aggregate_jaccard
 9 |     higher_is_better: true
10 |   - metric: exact_match
11 |     aggregation: !function utils.vcr_aggregate_exact_match
12 |     higher_is_better: true
13 | model_specific_prompt_kwargs:
14 |   default:
15 |     pre_prompt: ""
16 |     post_prompt: "What is the covered texts in the image? Please restore the covered texts without outputting the explanations."


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vcr_wiki/vcr_wiki_en_hard.yaml:
--------------------------------------------------------------------------------
 1 | "include": "_default_template_vcr_yaml"
 2 | dataset_path: vcr-org/VCR-wiki-en-hard-test
 3 | task: "vcr_wiki_en_hard"
 4 | test_split: test
 5 | process_results: !function utils.vcr_en_process_results
 6 | metric_list:
 7 |   - metric: jaccard
 8 |     aggregation: !function utils.vcr_aggregate_jaccard
 9 |     higher_is_better: true
10 |   - metric: exact_match
11 |     aggregation: !function utils.vcr_aggregate_exact_match
12 |     higher_is_better: true
13 | model_specific_prompt_kwargs:
14 |   default:
15 |     pre_prompt: ""
16 |     post_prompt: "What is the covered texts in the image? Please restore the covered texts without outputting the explanations."


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vcr_wiki/vcr_wiki_en_hard_100.yaml:
--------------------------------------------------------------------------------
 1 | "include": "_default_template_vcr_yaml"
 2 | dataset_path: vcr-org/VCR-wiki-en-hard-test-100
 3 | task: "vcr_wiki_en_hard_100"
 4 | test_split: test
 5 | process_results: !function utils.vcr_en_process_results
 6 | metric_list:
 7 |   - metric: jaccard
 8 |     aggregation: !function utils.vcr_aggregate_jaccard
 9 |     higher_is_better: true
10 |   - metric: exact_match
11 |     aggregation: !function utils.vcr_aggregate_exact_match
12 |     higher_is_better: true
13 | model_specific_prompt_kwargs:
14 |   default:
15 |     pre_prompt: ""
16 |     post_prompt: "What is the covered texts in the image? Please restore the covered texts without outputting the explanations."


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vcr_wiki/vcr_wiki_en_hard_500.yaml:
--------------------------------------------------------------------------------
 1 | "include": "_default_template_vcr_yaml"
 2 | dataset_path: vcr-org/VCR-wiki-en-hard-test-500
 3 | task: "vcr_wiki_en_hard_500"
 4 | test_split: test
 5 | process_results: !function utils.vcr_en_process_results
 6 | metric_list:
 7 |   - metric: jaccard
 8 |     aggregation: !function utils.vcr_aggregate_jaccard
 9 |     higher_is_better: true
10 |   - metric: exact_match
11 |     aggregation: !function utils.vcr_aggregate_exact_match
12 |     higher_is_better: true
13 | model_specific_prompt_kwargs:
14 |   default:
15 |     pre_prompt: ""
16 |     post_prompt: "What is the covered texts in the image? Please restore the covered texts without outputting the explanations."


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_easy.yaml:
--------------------------------------------------------------------------------
 1 | "include": "_default_template_vcr_yaml"
 2 | dataset_path: vcr-org/VCR-wiki-zh-easy-test
 3 | task: "vcr_wiki_zh_easy"
 4 | test_split: test
 5 | process_results: !function utils.vcr_zh_process_results
 6 | metric_list:
 7 |   - metric: jaccard
 8 |     aggregation: !function utils.vcr_aggregate_jaccard
 9 |     higher_is_better: true
10 |   - metric: exact_match
11 |     aggregation: !function utils.vcr_aggregate_exact_match
12 |     higher_is_better: true
13 | model_specific_prompt_kwargs:
14 |   default:
15 |     pre_prompt: ""
16 |     post_prompt: "图像中被覆盖的文本是什么？请在不输出解释的情况下还原被覆盖的文本。"


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_easy_100.yaml:
--------------------------------------------------------------------------------
 1 | "include": "_default_template_vcr_yaml"
 2 | dataset_path: vcr-org/VCR-wiki-zh-easy-test-100
 3 | task: "vcr_wiki_zh_easy_100"
 4 | test_split: test
 5 | process_results: !function utils.vcr_zh_process_results
 6 | metric_list:
 7 |   - metric: jaccard
 8 |     aggregation: !function utils.vcr_aggregate_jaccard
 9 |     higher_is_better: true
10 |   - metric: exact_match
11 |     aggregation: !function utils.vcr_aggregate_exact_match
12 |     higher_is_better: true
13 | model_specific_prompt_kwargs:
14 |   default:
15 |     pre_prompt: ""
16 |     post_prompt: "图像中被覆盖的文本是什么？请在不输出解释的情况下还原被覆盖的文本。"


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_easy_500.yaml:
--------------------------------------------------------------------------------
 1 | "include": "_default_template_vcr_yaml"
 2 | dataset_path: vcr-org/VCR-wiki-zh-easy-test-500
 3 | task: "vcr_wiki_zh_easy_500"
 4 | test_split: test
 5 | process_results: !function utils.vcr_zh_process_results
 6 | metric_list:
 7 |   - metric: jaccard
 8 |     aggregation: !function utils.vcr_aggregate_jaccard
 9 |     higher_is_better: true
10 |   - metric: exact_match
11 |     aggregation: !function utils.vcr_aggregate_exact_match
12 |     higher_is_better: true
13 | model_specific_prompt_kwargs:
14 |   default:
15 |     pre_prompt: ""
16 |     post_prompt: "图像中被覆盖的文本是什么？请在不输出解释的情况下还原被覆盖的文本。"


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_hard.yaml:
--------------------------------------------------------------------------------
 1 | "include": "_default_template_vcr_yaml"
 2 | dataset_path: vcr-org/VCR-wiki-zh-hard-test
 3 | task: "vcr_wiki_zh_hard"
 4 | test_split: test
 5 | process_results: !function utils.vcr_zh_process_results
 6 | metric_list:
 7 |   - metric: jaccard
 8 |     aggregation: !function utils.vcr_aggregate_jaccard
 9 |     higher_is_better: true
10 |   - metric: exact_match
11 |     aggregation: !function utils.vcr_aggregate_exact_match
12 |     higher_is_better: true
13 | model_specific_prompt_kwargs:
14 |   default:
15 |     pre_prompt: ""
16 |     post_prompt: "图像中被覆盖的文本是什么？请在不输出解释的情况下还原被覆盖的文本。"


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_hard_100.yaml:
--------------------------------------------------------------------------------
 1 | "include": "_default_template_vcr_yaml"
 2 | dataset_path: vcr-org/VCR-wiki-zh-hard-test-100
 3 | task: "vcr_wiki_zh_hard_100"
 4 | test_split: test
 5 | process_results: !function utils.vcr_zh_process_results
 6 | metric_list:
 7 |   - metric: jaccard
 8 |     aggregation: !function utils.vcr_aggregate_jaccard
 9 |     higher_is_better: true
10 |   - metric: exact_match
11 |     aggregation: !function utils.vcr_aggregate_exact_match
12 |     higher_is_better: true
13 | model_specific_prompt_kwargs:
14 |   default:
15 |     pre_prompt: ""
16 |     post_prompt: "图像中被覆盖的文本是什么？请在不输出解释的情况下还原被覆盖的文本。"


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vcr_wiki/vcr_wiki_zh_hard_500.yaml:
--------------------------------------------------------------------------------
 1 | "include": "_default_template_vcr_yaml"
 2 | dataset_path: vcr-org/VCR-wiki-zh-hard-test-500
 3 | task: "vcr_wiki_zh_hard_500"
 4 | test_split: test
 5 | process_results: !function utils.vcr_zh_process_results
 6 | metric_list:
 7 |   - metric: jaccard
 8 |     aggregation: !function utils.vcr_aggregate_jaccard
 9 |     higher_is_better: true
10 |   - metric: exact_match
11 |     aggregation: !function utils.vcr_aggregate_exact_match
12 |     higher_is_better: true
13 | model_specific_prompt_kwargs:
14 |   default:
15 |     pre_prompt: ""
16 |     post_prompt: "图像中被覆盖的文本是什么？请在不输出解释的情况下还原被覆盖的文本。"


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/video_detail_description/_default_template_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/VideoDetailDescription
 2 | dataset_kwargs:
 3 |   token: True
 4 |   video: True
 5 |   cache_dir: videochatgpt
 6 | model_specific_prompt_kwargs:
 7 |   default:
 8 |     pre_prompt: ""
 9 |     post_prompt: ""
10 | 
11 | metadata:
12 |   version: 0.0
13 |   gpt_eval_model_name: gpt-3.5-turbo-0613


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/video_detail_description/video_detail_description.yaml:
--------------------------------------------------------------------------------
 1 | task: "video_dc499"
 2 | test_split: test
 3 | output_type: generate_until
 4 | doc_to_visual: !function utils.video_detail_description_doc_to_visual
 5 | doc_to_text: !function utils.video_detail_description_doc_to_text
 6 | doc_to_target: !function utils.video_detail_description_doc_to_answer
 7 | process_results: !function utils.video_detail_description_process_results_generic
 8 | metric_list:
 9 |   - metric: gpt_eval_score
10 |     aggregation: !function utils.video_detail_description_aggregate_score
11 |     higher_is_better: true
12 | include: _default_template_yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/videochatgpt/_default_template_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/VideoChatGPT
 2 | dataset_kwargs:
 3 |   token: True
 4 |   video: True
 5 |   cache_dir: videochatgpt
 6 | model_specific_prompt_kwargs:
 7 |   default:
 8 |     pre_prompt: ""
 9 |     post_prompt: ""
10 | 
11 | metadata:
12 |   version: 0.0
13 |   gpt_eval_model_name: gpt-3.5-turbo-0613


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/videochatgpt/_videochatgpt.yaml:
--------------------------------------------------------------------------------
1 | group: videochatgpt
2 | task:
3 | - videochatgpt_gen
4 | - videochatgpt_temporal
5 | - videochatgpt_consistency
6 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/videochatgpt/videochatgpt_consistency.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "Consistency"
 2 | task: "videochatgpt_consistency"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.videochatgpt_doc_to_visual
 6 | doc_to_text: !function utils.videochatgpt_doc_to_text_consistency
 7 | doc_to_target: !function utils.videochatgpt_doc_to_answer
 8 | process_results: !function utils.videochatgpt_process_results_consistency
 9 | metric_list:
10 |   - metric: gpt_eval_score_consistency
11 |     aggregation: !function utils.videochatgpt_aggregate_consistency
12 |     higher_is_better: true
13 | include: _default_template_yaml
14 | 
15 | generation_kwargs:
16 |   until:
17 |     - "ASSISTANT:"
18 |   image_aspect_ratio: original
19 |   max_new_tokens: 1024
20 |   temperature: 0
21 |   top_p: 1.0
22 |   num_beams: 1
23 |   do_sample: false
24 |   


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/videochatgpt/videochatgpt_generic.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "Generic"
 2 | task: "videochatgpt_gen"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.videochatgpt_doc_to_visual
 6 | doc_to_text: !function utils.videochatgpt_doc_to_text
 7 | doc_to_target: !function utils.videochatgpt_doc_to_answer
 8 | process_results: !function utils.videochatgpt_process_results_generic
 9 | metric_list:
10 |   - metric: gpt_eval_score_correctness
11 |     aggregation: !function utils.videochatgpt_aggregate_score
12 |     higher_is_better: true
13 |   - metric: gpt_eval_score_detailed_orientation
14 |     aggregation: !function utils.videochatgpt_aggregate_score
15 |     higher_is_better: true
16 |   - metric: gpt_eval_score_context
17 |     aggregation: !function utils.videochatgpt_aggregate_score
18 |     higher_is_better: true
19 | include: _default_template_yaml
20 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/videochatgpt/videochatgpt_temporal.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "Temporal"
 2 | task: "videochatgpt_temporal"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.videochatgpt_doc_to_visual
 6 | doc_to_text: !function utils.videochatgpt_doc_to_text
 7 | doc_to_target: !function utils.videochatgpt_doc_to_answer
 8 | process_results: !function utils.videochatgpt_process_results_temporal
 9 | metric_list:
10 |   - metric: gpt_eval_score_temporal
11 |     aggregation: !function utils.videochatgpt_aggregate_score
12 |     higher_is_better: true
13 | include: _default_template_yaml
14 | 
15 | generation_kwargs:
16 |   until:
17 |     - "ASSISTANT:"
18 |   image_aspect_ratio: original
19 |   max_new_tokens: 1024
20 |   temperature: 0
21 |   top_p: 1.0
22 |   num_beams: 1
23 |   do_sample: false
24 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vitatecs/_default_template_yaml:
--------------------------------------------------------------------------------
1 | dataset_path: lscpku/VITATECS
2 | dataset_kwargs:
3 |   token: True
4 |   video: True
5 |   cache_dir: vitatecs
6 | model_specific_prompt_kwargs:
7 |   default:
8 |     pre_prompt: ""
9 |     post_prompt: "\nPlease response with a single letter (A or B):"


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vitatecs/_vitatecs.yaml:
--------------------------------------------------------------------------------
1 | group: vitatecs
2 | task:
3 | - vitatecs_direction
4 | - vitatecs_intensity
5 | - vitatecs_sequence
6 | - vitatecs_compositionality
7 | - vitatecs_localization
8 | - vitatecs_type
9 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vitatecs/vitatecs_compositionality.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "Compositionality"
 2 | task: "vitatecs_compositionality"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.vitatecs_doc_to_visual
 6 | doc_to_text: !function utils.vitatecs_doc_to_text
 7 | doc_to_target: !function utils.vitatecs_doc_to_answer
 8 | process_results: !function utils.vitatecs_process_results
 9 | metric_list:
10 |   - metric: accuracy
11 |     aggregation: !function utils.vitatecs_aggregate_rating
12 |     higher_is_better: true
13 | include: _default_template_yaml
14 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vitatecs/vitatecs_direction.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "Direction"
 2 | task: "vitatecs_direction"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.vitatecs_doc_to_visual
 6 | doc_to_text: !function utils.vitatecs_doc_to_text
 7 | doc_to_target: !function utils.vitatecs_doc_to_answer
 8 | process_results: !function utils.vitatecs_process_results
 9 | metric_list:
10 |   - metric: accuracy
11 |     aggregation: !function utils.vitatecs_aggregate_rating
12 |     higher_is_better: true
13 | include: _default_template_yaml
14 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vitatecs/vitatecs_intensity.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "Intensity"
 2 | task: "vitatecs_intensity"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.vitatecs_doc_to_visual
 6 | doc_to_text: !function utils.vitatecs_doc_to_text
 7 | doc_to_target: !function utils.vitatecs_doc_to_answer
 8 | process_results: !function utils.vitatecs_process_results
 9 | metric_list:
10 |   - metric: accuracy
11 |     aggregation: !function utils.vitatecs_aggregate_rating
12 |     higher_is_better: true
13 | include: _default_template_yaml
14 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vitatecs/vitatecs_localization.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "Localization"
 2 | task: "vitatecs_localization"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.vitatecs_doc_to_visual
 6 | doc_to_text: !function utils.vitatecs_doc_to_text
 7 | doc_to_target: !function utils.vitatecs_doc_to_answer
 8 | process_results: !function utils.vitatecs_process_results
 9 | metric_list:
10 |   - metric: accuracy
11 |     aggregation: !function utils.vitatecs_aggregate_rating
12 |     higher_is_better: true
13 | include: _default_template_yaml
14 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vitatecs/vitatecs_sequence.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "Sequence"
 2 | task: "vitatecs_sequence"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.vitatecs_doc_to_visual
 6 | doc_to_text: !function utils.vitatecs_doc_to_text
 7 | doc_to_target: !function utils.vitatecs_doc_to_answer
 8 | process_results: !function utils.vitatecs_process_results
 9 | metric_list:
10 |   - metric: accuracy
11 |     aggregation: !function utils.vitatecs_aggregate_rating
12 |     higher_is_better: true
13 | include: _default_template_yaml
14 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vitatecs/vitatecs_type.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "Type"
 2 | task: "vitatecs_type"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.vitatecs_doc_to_visual
 6 | doc_to_text: !function utils.vitatecs_doc_to_text
 7 | doc_to_target: !function utils.vitatecs_doc_to_answer
 8 | process_results: !function utils.vitatecs_process_results
 9 | metric_list:
10 |   - metric: accuracy
11 |     aggregation: !function utils.vitatecs_aggregate_rating
12 |     higher_is_better: true
13 | include: _default_template_yaml
14 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vizwiz_vqa/_default_template_vqa_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/VizWiz-VQA
 2 | output_type: generate_until
 3 | doc_to_visual: !function utils.vizwiz_vqa_doc_to_visual
 4 | doc_to_text: !function utils.vizwiz_vqa_doc_to_text
 5 | doc_to_target: "answer"
 6 | generation_kwargs:
 7 |   until:
 8 |     - "ASSISTANT:"
 9 | metadata:
10 |   - version: 0.0
11 | model_specific_prompt_kwargs:
12 |   default:
13 |     pre_prompt: ""
14 |     post_prompt: "\nWhen the provided information is insufficient, respond with 'Unanswerable'.\nAnswer the question using a single word or phrase."
15 | process_results: !function utils.vizwiz_vqa_process_results
16 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vizwiz_vqa/_generate_config.py:
--------------------------------------------------------------------------------
 1 | import os  # NOTE(review): `os` is not referenced anywhere in this script
 2 | import yaml
 3 | # Writes one task YAML per (task, split) pair plus a group YAML listing them; all paths are relative to the current working directory.
 4 | splits = ["val", "test"]
 5 | tasks = ["vqa"]
 6 | 
 7 | if __name__ == "__main__":
 8 |     dump_tasks = []  # task names collected for the group file written at the end
 9 |     for task in tasks:
10 |         for split in splits:
11 |             yaml_dict = {"group": f"vizwiz_{task}", "task": f"vizwiz_{task}_{split}", "include": f"_default_template_{task}_yaml", "test_split": split}
12 |             if split == "train":  # never true with the current `splits`; a train split would lose its "group" key and be left out of the group file
13 |                 yaml_dict.pop("group")
14 |             else:
15 |                 dump_tasks.append(f"vizwiz_{task}_{split}")
16 | 
17 |             save_path = f"./vizwiz_{task}_{split}.yaml"
18 |             print(f"Saving to {save_path}")
19 |             with open(save_path, "w") as f:  # "w" mode replaces any existing file at this path
20 |                 yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False)  # sort_keys=False keeps the keys in yaml_dict's insertion order
21 | 
22 |     group_dict = {"group": "vizwiz_vqa", "task": dump_tasks}
23 | 
24 |     with open("./_vizwiz_vqa.yaml", "w") as f:  # group file name is hard-coded rather than derived from `tasks`
25 |         yaml.dump(group_dict, f, default_flow_style=False, indent=4)
26 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vizwiz_vqa/_vizwiz_vqa.yaml:
--------------------------------------------------------------------------------
1 | group: vizwiz_vqa
2 | task:
3 | - vizwiz_vqa_val
4 | - vizwiz_vqa_test


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_test.yaml:
--------------------------------------------------------------------------------
 1 | group: vizwiz_vqa
 2 | task: vizwiz_vqa_test
 3 | test_split: test
 4 | include: _default_template_vqa_yaml
 5 | process_results: !function utils.vizwiz_vqa_process_results
 6 | metric_list:
 7 |   # - metric: exact_match
 8 |   #   aggregation: mean
 9 |   #   higher_is_better: true
10 |   #   ignore_case: true
11 |   #   ignore_punctuation: true
12 |   - metric: submission
13 |     aggregation: !function utils.vizwiz_vqa_aggregate_submissions
14 |     higher_is_better: true
15 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_val.yaml:
--------------------------------------------------------------------------------
 1 | group: vizwiz_vqa
 2 | task: vizwiz_vqa_val
 3 | test_split: val
 4 | include: _default_template_vqa_yaml
 5 | metric_list:
 6 |   - metric: exact_match
 7 |     aggregation: mean
 8 |     higher_is_better: true
 9 |     ignore_case: true
10 |     ignore_punctuation: true
11 |   # - metric: submission
12 |   #   aggregation: !function utils.vizwiz_vqa_aggregate_submissions
13 |   #   higher_is_better: true


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vqav2/_default_template_vqav2_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/VQAv2
 2 | dataset_kwargs:
 3 |   token: True
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.vqav2_doc_to_visual
 6 | doc_to_text: !function utils.vqav2_doc_to_text
 7 | doc_to_target: "answer"
 8 | generation_kwargs:
 9 |   max_new_tokens: 16
10 | metadata:
11 |   - version: 0.0
12 | model_specific_prompt_kwargs:
13 |   default:
14 |     pre_prompt: ""
15 |     post_prompt: "\nAnswer the question using a single word or phrase."


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vqav2/_vqav2.yaml:
--------------------------------------------------------------------------------
1 | group: vqav2
2 | task:
3 | - vqav2_val
4 | - vqav2_test


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vqav2/vqav2_test.yaml:
--------------------------------------------------------------------------------
1 | task: "vqav2_test"
2 | include: _default_template_vqav2_yaml
3 | test_split: test
4 | metric_list:
5 |   - metric: submission
6 |     aggregation: !function utils.vqav2_aggregate_submissions
7 |     higher_is_better: true
8 | process_results: !function utils.vqav2_process_results_test
9 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vqav2/vqav2_val.yaml:
--------------------------------------------------------------------------------
 1 | task: "vqav2_val"
 2 | include: _default_template_vqav2_yaml
 3 | test_split: validation
 4 | metric_list:
 5 |   - metric: exact_match
 6 |     aggregation: mean
 7 |     higher_is_better: true
 8 |     ignore_case: true
 9 |     ignore_punctuation: true
10 | process_results: !function utils.vqav2_process_results_val
11 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/websrc/websrc.yaml:
--------------------------------------------------------------------------------
1 | group: websrc
2 | task:
3 | - websrc_val
4 | - websrc_test
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/websrc/websrc_test.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: rootsautomation/websrc-test
 2 | task: "websrc_test"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.websrc_doc_to_visual
 6 | doc_to_text: !function utils.websrc_doc_to_text
 7 | doc_to_target: "answer"
 8 | # The return value of process_results will be used by metrics
 9 | process_results: !function utils.websrc_process_results
10 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
11 | generation_kwargs:
12 |   max_new_tokens: 16
13 |   image_aspect_ratio: pad
14 | metric_list:
15 |   - metric: submission
16 |     aggregation: !function utils.websrc_test_aggregate_results_for_submission
17 |     higher_is_better: true
18 | metadata:
19 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/websrc/websrc_val.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: rootsautomation/websrc
 2 | task: "websrc_val"
 3 | test_split: dev
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.websrc_doc_to_visual
 6 | doc_to_text: !function utils.websrc_doc_to_text
 7 | doc_to_target: "answer"
 8 | # The return value of process_results will be used by metrics
 9 | process_results: !function utils.websrc_process_results
10 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
11 | generation_kwargs:
12 |   max_new_tokens: 16
13 |   image_aspect_ratio: pad
14 | metric_list:
15 |   - metric: websrc_squad_f1
16 |     aggregation: !function utils.websrc_aggregate_results
17 |     higher_is_better: true
18 | metadata:
19 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/wild_vision_bench/_default_template_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: WildVision/wildvision-arena-data
 2 | dataset_kwargs:
 3 |   token: True
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.wild_vision_doc_to_visual
 6 | doc_to_text: !function utils.wild_vision_doc_to_text
 7 | doc_to_target: !function utils.wild_vision_doc_to_target
 8 | generation_kwargs:
 9 |   max_new_tokens: 4096
10 |   temperature: 0
11 |   top_p: 1.0
12 |   num_beams: 1
13 |   do_sample: false
14 | # The return value of process_results will be used by metrics
15 | process_results: !function utils.wild_vision_process_results
16 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
17 | metric_list:
18 |   - metric: gpt_eval_score
19 |     aggregation: !function utils.wild_vision_aggregation
20 |     higher_is_better: true
21 | metadata:
22 |   judge_model: gpt-4o
23 |   baseline_model: claude-3-sonnet-20240229
24 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/wild_vision_bench/wild_vision_bench0617.yaml:
--------------------------------------------------------------------------------
 1 | task: wildvision_0617
 2 | dataset_name: release_bench_0617_with_modelresponse 
 3 | test_split: test500
 4 | output_type: generate_until
 5 | include: _default_template_yaml
 6 | model_specific_prompt_kwargs:
 7 |   default:
 8 |     pre_prompt: ""
 9 |     post_prompt: ""
10 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/worldqa/_default_template_yaml:
--------------------------------------------------------------------------------
1 | dataset_path: lmms-lab/worldqa
2 | dataset_kwargs:
3 |   token: True
4 |   video: True
5 |   cache_dir: multi-hop-reasoning 
6 | metadata:
7 |   version: 0.0
8 |   gpt_eval_model_name: "gpt-4-0613"


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/worldqa/worldqa.yaml:
--------------------------------------------------------------------------------
1 | group: worldqa
2 | task:
3 | - worldqa_gen
4 | - worldqa_mc
5 | 
6 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/worldqa/worldqa_generation.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "Generation"
 2 | task: "worldqa_gen"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.worldqa_doc_to_visual
 6 | doc_to_text: !function utils.worldqa_doc_to_text
 7 | doc_to_target: !function utils.worldqa_doc_to_answer
 8 | process_results: !function utils.worldqa_process_results
 9 | metric_list:
10 |   - metric: submission
11 |     aggregation: !function utils.worldqa_aggregate_gen
12 |     higher_is_better: true
13 |   - metric: gpt_eval
14 |     aggregation: !function utils.worldq_gen_gpt_eval
15 |     higher_is_better: true  
16 | model_specific_prompt_kwargs:
17 |   default:
18 |     pre_prompt: ""
19 |     post_prompt: ""
20 | include: _default_template_yaml
21 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/worldqa/worldqa_mc.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "MC"
 2 | task: "worldqa_mc"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.worldqa_doc_to_visual
 6 | doc_to_text: !function utils.worldqa_doc_to_text
 7 | doc_to_target: !function utils.worldqa_doc_to_answer_mc
 8 | process_results: !function utils.worldqa_process_results_mc
 9 | metric_list:
10 |   - metric: gpt_eval
11 |     aggregation: !function utils.worldqa_aggregate_mc_eval
12 |     higher_is_better: true
13 | model_specific_prompt_kwargs:
14 |   default:
15 |     pre_prompt: ""
16 |     post_prompt: "\nAnswer with the option's letter from the given choices directly."
17 | filter_list:
18 |   - name: "flexible-extract"
19 |     filter:
20 |       - function: !function utils.MultiChoiceRegexFilter
21 |         group_select: 0
22 |         ignore_case: true
23 |         ignore_punctuation: true
24 |         regex_pattern: "(\\([A-Z]\\))"
25 | 
26 | include: _default_template_yaml
27 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/worldqa/worldqa_mcppl.yaml:
--------------------------------------------------------------------------------
 1 | dataset_name: "MC_PPL"
 2 | task: "worldqa_mc_ppl"
 3 | test_split: test
 4 | output_type: multiple_choice
 5 | doc_to_visual: !function utils.worldqa_doc_to_visual
 6 | doc_to_text: "question"
 7 | doc_to_target: !function utils.worldqa_doc_to_answer_mc_ppl
 8 | doc_to_choice: !function utils.worldqa_doc_to_choice
 9 | metric_list:
10 |   - metric: acc
11 | model_specific_prompt_kwargs:
12 |   default:
13 |     pre_prompt: ""
14 |     post_prompt: ""
15 | include: _default_template_yaml
16 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/youcook2/_default_template_yaml:
--------------------------------------------------------------------------------
1 | dataset_path: lmms-lab/YouCook2
2 | dataset_kwargs:
3 |   token: True
4 |   video: True
5 |   cache_dir: YouCookIIVideos
6 | 


--------------------------------------------------------------------------------
/lmms-eval/miscs/example_eval.yaml:
--------------------------------------------------------------------------------
1 | - model: llava
2 |   model_args: pretrained=liuhaotian/llava-v1.5-7b
3 |   tasks: mmmu_val
4 |   batch_size: 1
5 |   log_samples: true
6 |   log_samples_suffix: eval_mmmu
7 |   output_path: "./logs/"
8 | 
9 | 


--------------------------------------------------------------------------------
/lmms-eval/miscs/llava_repr_requirements.txt:
--------------------------------------------------------------------------------
 1 | accelerate==0.21.0
 2 | datasets==2.16.1
 3 | evaluate==0.4.1
 4 | hf_transfer==0.1.6
 5 | Jinja2==3.1.3
 6 | numpy==1.26.4
 7 | openai==1.13.3
 8 | packaging==23.2
 9 | pandas==2.2.1
10 | Pillow==10.2.0
11 | protobuf==4.25.3
12 | pycocoevalcap==1.2
13 | pycocotools==2.0.7
14 | pytablewriter==1.2.0
15 | pytest==8.0.2
16 | python_Levenshtein==0.25.0
17 | pytz==2024.1
18 | PyYAML==6.0.1
19 | PyYAML==6.0.1
20 | Requests==2.31.0
21 | sacrebleu==2.4.0
22 | scikit_learn==1.2.2
23 | sentencepiece==0.1.99
24 | setuptools==68.2.2
25 | sglang==0.1.12
26 | shortuuid==1.0.12
27 | sqlitedict==2.1.0
28 | tenacity==8.2.3
29 | torch==2.0.1
30 | tokenizers==0.15.2
31 | tqdm==4.66.2
32 | transformers==4.37.2
33 | 


--------------------------------------------------------------------------------
/lmms-eval/miscs/llava_result_check.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MCG-NJU/p-MoD/30c16ecf45e119142a703958c8b1ec349e3891f2/lmms-eval/miscs/llava_result_check.md


--------------------------------------------------------------------------------
/lmms-eval/miscs/llava_sglang_result_check.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MCG-NJU/p-MoD/30c16ecf45e119142a703958c8b1ec349e3891f2/lmms-eval/miscs/llava_sglang_result_check.md


--------------------------------------------------------------------------------
/lmms-eval/miscs/repr_scripts.sh:
--------------------------------------------------------------------------------
 1 | # install lmms_eval without building dependencies
 2 | cd lmms_eval;
 3 | pip install --no-deps -U -e .
 4 | 
 5 | # install LLaVA without building dependencies
 6 | cd LLaVA  # NOTE(review): no `cd ..` after the step above, so this resolves to lmms_eval/LLaVA - confirm
 7 | pip install --no-deps -U -e .
 8 | 
 9 | # install all the requirements needed to reproduce the LLaVA results
10 | pip install -r llava_repr_requirements.txt  # NOTE(review): path is relative to the current directory - confirm
11 | 
12 | # Run and exactly reproduce llava_v1.5 results!
13 | # mme as an example
14 | accelerate launch --num_processes=1 -m lmms_eval --model llava   --model_args pretrained="liuhaotian/llava-v1.5-7b,use_flash_attention_2=False,device_map=auto"   --tasks mme  --batch_size 1 --log_samples --log_samples_suffix reproduce --output_path ./logs/


--------------------------------------------------------------------------------
/lmms-eval/miscs/script.sh:
--------------------------------------------------------------------------------
 1 | accelerate launch --num_processes=1 -m lmms_eval --model llava   --model_args pretrained="liuhaotian/llava-v1.5-7b"   --tasks mme_llava_prompt  --batch_size 1 --log_samples --log_samples_suffix debug --output_path ./logs/
 2 | 
 3 | 
 4 | gpu = 8 bs 1:
 5 | 
 6 | llava (pretrained=llava-hf/llava-1.5-7b-hf), gen_kwargs: (), limit: None, num_fewshot: None, batch_size: 1
 7 | |     Tasks      |Version|Filter|n-shot|  Metric   |Value|   |Stderr |
 8 | |----------------|-------|------|-----:|-----------|----:|---|------:|
 9 | |mme_llava_prompt|Yaml   |none  |     0|exact_match| 1873|±  |38.4331|
10 | 
11 | gpu = 8 bs 1 use_flash_attention_2=True:
12 | 
13 | 
14 | 
15 | 
16 | 
17 | gpu = 4 bs 1 use_flash_attention_2=True:
18 | 
19 | 
20 | 
21 | accelerate launch --num_processes=8 --main_process_port 12345 -m lmms_eval --model qwen_vl   --model_args pretrained="Qwen/Qwen-VL"   --tasks mme  --batch_size 1 --log_samples --log_samples_suffix debug --output_path ./logs/
22 | 


--------------------------------------------------------------------------------
/lmms-eval/miscs/test_llava.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | from PIL import Image
 3 | 
 4 | import torch
 5 | from transformers import AutoProcessor, LlavaForConditionalGeneration
 6 | 
 7 | model_id = "llava-hf/llava-1.5-7b-hf"
 8 | 
 9 | prompt_1 = "USER: <image>\nWhat does this image show?\nASSISTANT:"
10 | prompt_2 = "USER: <image> <image> \nWhat is the difference between these two images?\nASSISTANT:"
11 | image_file_1 = "image1.png"
12 | image_file_2 = "image2.png"
13 | model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_flash_attention_2=True).to(0)
14 | processor = AutoProcessor.from_pretrained(model_id)
15 | raw_image_1 = Image.open(image_file_1)
16 | raw_image_2 = Image.open(image_file_2)
17 | inputs = processor([prompt_1, prompt_2], [raw_image_1, raw_image_1, raw_image_2], padding=True, return_tensors="pt").to(0, torch.float16)
18 | import pdb
19 | 
20 | pdb.set_trace()
21 | output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
22 | print(processor.batch_decode(output, skip_special_tokens=True))
23 | 


--------------------------------------------------------------------------------
/lmms-eval/miscs/test_scienceqa.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset
2 | 
3 | dataset = load_dataset("Otter-AI/ScienceQA", trust_remote_code=True)["test"]
4 | for doc in dataset:
5 |     print(doc["id"])
6 | 


--------------------------------------------------------------------------------
/lmms-eval/miscs/tinyllava_repr_requirements.txt:
--------------------------------------------------------------------------------
 1 | accelerate==0.27.2
 2 | datasets==2.16.1
 3 | deepspeed==0.14.0
 4 | einops==0.6.1
 5 | einops-exts==0.0.4
 6 | evaluate==0.4.1
 7 | hf_transfer==0.1.6
 8 | Jinja2==3.1.3
 9 | numpy==1.26.4
10 | openai==1.13.3
11 | openpyxl
12 | packaging==23.2
13 | pandas==2.2.1
14 | peft==0.10.0
15 | Pillow==10.2.0
16 | protobuf==4.25.3
17 | pycocoevalcap==1.2
18 | pycocotools==2.0.7
19 | pytablewriter==1.2.0
20 | pytest==8.0.2
21 | python_Levenshtein==0.25.0
22 | pytz==2024.1
23 | PyYAML==6.0.1
24 | Requests==2.31.0
25 | sacrebleu==2.4.0
26 | scikit_learn==1.2.2
27 | sentencepiece==0.1.99
28 | setuptools==68.2.2
29 | sglang==0.1.12
30 | shortuuid==1.0.12
31 | sqlitedict==2.1.0
32 | tenacity==8.2.3
33 | tiktoken
34 | # torch==2.0.1
35 | # torchvision==0.15.2
36 | tokenizers==0.15.1
37 | timm==0.6.13
38 | tqdm==4.66.2
39 | transformers==4.39.3
40 | 


--------------------------------------------------------------------------------
/lmms-eval/miscs/tinyllava_repr_scripts.sh:
--------------------------------------------------------------------------------
 1 | # install lmms_eval without building dependencies
 2 | cd lmms_eval;
 3 | pip install --no-deps -U -e .
 4 | 
 5 | # install TinyLLaVA without building dependencies
 6 | cd ..
 7 | git clone https://github.com/TinyLLaVA/TinyLLaVA_Factory
 8 | cd TinyLLaVA_Factory
 9 | pip install --no-deps -U -e .
10 | 
11 | # install all the requirements needed to reproduce the TinyLLaVA results
12 | pip install torch==2.0.1 torchvision==0.15.2 --index-url https://download.pytorch.org/whl/cu118
13 | pip install -r tinyllava_repr_requirements.txt  # NOTE(review): cwd is TinyLLaVA_Factory/ at this point - confirm the file is reachable from there
14 | 
15 | # Run and reproduce tinyllava best results!
16 | accelerate launch \
17 |     --num_processes=1 \
18 |     -m lmms_eval \
19 |     --model tinyllava \
20 |     --model_args pretrained=tinyllava/TinyLLaVA-Phi-2-SigLIP-3.1B,conv_mode=phi \
21 |     --tasks vqav2,gqa,scienceqa_img,textvqa,mmvet,pope,mme,mmmu_val \
22 |     --batch_size 1 \
23 |     --log_samples \
24 |     --log_samples_suffix tinyllava-phi2-siglip-3.1b \
25 |     --output_path ./logs/


--------------------------------------------------------------------------------
/lmms-eval/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 | 
3 | # Shim kept so the package supports editable installs; setup() is called with no arguments here.
4 | setuptools.setup()
5 | 


--------------------------------------------------------------------------------
/scripts/upload_pypi.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Step 0: Clean up
 4 | rm -rf dist
 5 | 
 6 | # Step 1: Change the package name to "llava-torch"
 7 | sed -i 's/name = "llava"/name = "llava-torch"/' pyproject.toml
 8 | 
 9 | # Step 2: Build the package
10 | python -m build
11 | 
12 | # Step 3: Revert the changes in pyproject.toml to the original
13 | sed -i 's/name = "llava-torch"/name = "llava"/' pyproject.toml
14 | 
15 | # Step 4: Upload to PyPI
16 | python -m twine upload dist/*
17 | 


--------------------------------------------------------------------------------
/scripts/zero1.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "fp16": {
 3 |         "enabled": "auto",
 4 |         "loss_scale": 0,
 5 |         "loss_scale_window": 1000,
 6 |         "initial_scale_power": 16,
 7 |         "hysteresis": 2,
 8 |         "min_loss_scale": 1
 9 |     },
10 |     "bf16": {
11 |         "enabled": "auto"
12 |     },
13 |     "train_micro_batch_size_per_gpu": "auto",
14 |     "train_batch_size": "auto",
15 |     "gradient_accumulation_steps": "auto",
16 |     "zero_optimization": {
17 |         "stage": 1,
18 |         "overlap_comm": true,
19 |         "contiguous_gradients": true,
20 |         "sub_group_size": 1e9,
21 |         "reduce_bucket_size": "auto"
22 |     }
23 | }


--------------------------------------------------------------------------------
/scripts/zero2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "fp16": {
 3 |         "enabled": "auto",
 4 |         "loss_scale": 0,
 5 |         "loss_scale_window": 1000,
 6 |         "initial_scale_power": 16,
 7 |         "hysteresis": 2,
 8 |         "min_loss_scale": 1
 9 |     },
10 |     "bf16": {
11 |         "enabled": "auto"
12 |     },
13 |     "train_micro_batch_size_per_gpu": "auto",
14 |     "train_batch_size": "auto",
15 |     "gradient_accumulation_steps": "auto",
16 |     "zero_optimization": {
17 |         "stage": 2,
18 |         "overlap_comm": true,
19 |         "contiguous_gradients": true,
20 |         "sub_group_size": 1e9,
21 |         "reduce_bucket_size": "auto"
22 |     }
23 | }


--------------------------------------------------------------------------------
/scripts/zero3.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "fp16": {
 3 |         "enabled": "auto",
 4 |         "loss_scale": 0,
 5 |         "loss_scale_window": 1000,
 6 |         "initial_scale_power": 16,
 7 |         "hysteresis": 2,
 8 |         "min_loss_scale": 1
 9 |     },
10 |     "bf16": {
11 |         "enabled": "auto"
12 |     },
13 |     "train_micro_batch_size_per_gpu": "auto",
14 |     "train_batch_size": "auto",
15 |     "gradient_accumulation_steps": "auto",
16 |     "zero_optimization": {
17 |         "stage": 3,
18 |         "overlap_comm": true,
19 |         "contiguous_gradients": true,
20 |         "sub_group_size": 1e9,
21 |         "reduce_bucket_size": "auto",
22 |         "stage3_prefetch_bucket_size": "auto",
23 |         "stage3_param_persistence_threshold": "auto",
24 |         "stage3_max_live_parameters": 1e9,
25 |         "stage3_max_reuse_distance": 1e9,
26 |         "stage3_gather_16bit_weights_on_model_save": true
27 |     }
28 | }


--------------------------------------------------------------------------------
/util_scripts/demo.py:
--------------------------------------------------------------------------------
 1 | # llava inference demo from llava README.md
 2 | import os
 3 | os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 4 | 
 5 | from llava.mm_utils import get_model_name_from_path
 6 | from llava.eval.run_llava import eval_model
 7 | 
 8 | from transformers.utils import logging
 9 | logging.set_verbosity_info()
10 | logger = logging.get_logger("transformers")
11 | 
12 | model_path = "MCG-NJU/p-MoD-LLaVA-NeXT-7B"
13 | 
14 | prompt = "What are the things I should be cautious about when I visit here?"
15 | image_file = "https://llava-vl.github.io/static/images/view.jpg"
16 | 
17 | args = type('Args', (), {
18 |     "model_path": model_path,
19 |     "model_base": None,
20 |     "model_name": get_model_name_from_path(model_path),
21 |     "query": prompt,
22 |     "conv_mode": None,
23 |     "image_file": image_file,
24 |     "sep": ",",
25 |     "temperature": 0,
26 |     "top_p": None,
27 |     "num_beams": 1,
28 |     "use_flash_attn": False,
29 |     "max_new_tokens": 1000,
30 | })()
31 | 
32 | eval_model(args)
33 | 


--------------------------------------------------------------------------------
/util_scripts/setup_env.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Set up a p-MoD environment: clone the repo, create and activate the
 3 | # "p-mod" conda env, install the packages, then log in to HF and wandb.
 4 | set -x
 5 | 
 6 | # git clone
 7 | git clone https://github.com/MCG-NJU/p-MoD.git
 8 | cd p-MoD
 9 | 
10 | # Install dependencies
11 | conda create -n p-mod python=3.10 -y
12 | # Activate the env by sourcing conda's activate script. CONDA_PATH may be
13 | # exported beforehand to point at a conda install other than ~/miniconda3.
14 | # (Shell assignments take no leading "$", and "~" is not expanded inside
15 | # quotes, hence $HOME.)
16 | CONDA_PATH="${CONDA_PATH:-$HOME/miniconda3}"
17 | source "$CONDA_PATH/bin/activate" p-mod
18 | 
19 | pip install --upgrade pip  # enable PEP 660 support
20 | pip install -e .
21 | pip install -e lmms-eval
22 | 
23 | # packages for training
24 | pip install -e ".[train]"
25 | pip install flash-attn --no-build-isolation --no-cache-dir
26 | 
27 | # Login to huggingface and wandb
28 | huggingface-cli login
29 | wandb login
30 | 


--------------------------------------------------------------------------------