├── .devcontainer ├── Dockerfile ├── devcontainer.env ├── devcontainer.json └── postCreateCommand.sh ├── .dockerignore ├── .editorconfig ├── .gitattributes ├── .github └── ISSUE_TEMPLATE │ ├── 1-usage.yaml │ ├── 2-feature-request.yaml │ ├── 3-question.yaml │ └── 4-discussion.yaml ├── .gitignore ├── IG-VLM ├── LICENSE ├── README.md ├── data │ ├── multiple_choice_qa │ │ ├── EgoSchema.csv │ │ ├── IntentQA.csv │ │ ├── NExT_QA.csv │ │ ├── STAR.csv │ │ └── TVQA.csv │ ├── open_ended_qa │ │ ├── ActivityNet_QA.csv │ │ ├── MSRVTT_QA.csv │ │ ├── MSVD_QA.csv │ │ └── TGIF_FrameQA.csv │ └── text_generation_benchmark │ │ ├── Consistency_QA1.csv │ │ ├── Consistency_QA2.csv │ │ ├── Generic_QA.csv │ │ └── Temporal_QA.csv ├── eval_gpt4v_multiplechoice.py ├── eval_gpt4v_openended.py ├── eval_gpt4v_textgeneration_openended.py ├── eval_llava_multiplechoice.py ├── eval_llava_openended.py ├── eval_llava_textgeneration_openended.py ├── evaluation │ ├── __init__.py │ ├── direct_answer_eval.py │ ├── gpt3_consistency_utils.py │ └── gpt3_evaluation_utils.py ├── example │ ├── QzulzuWr0Zw_20_45.mp4 │ ├── extraction_sample │ │ ├── rlQ2kW-FvMk_66_79 │ │ │ ├── 1.jpg │ │ │ ├── 2.jpg │ │ │ ├── 3.jpg │ │ │ ├── 4.jpg │ │ │ ├── 5.jpg │ │ │ └── 6.jpg │ │ └── ysTmUTQ5wZE_17_45 │ │ │ ├── 1.jpg │ │ │ ├── 2.jpg │ │ │ ├── 3.jpg │ │ │ ├── 4.jpg │ │ │ ├── 5.jpg │ │ │ └── 6.jpg │ ├── imagegrid_sample │ │ ├── QzulzuWr0Zw_20_45.jpg │ │ └── rlQ2kW-FvMk_66_79.jpg │ ├── rlQ2kW-FvMk_66_79.mp4 │ └── ysTmUTQ5wZE_17_45.mp4 ├── model_processor │ ├── __init__.py │ ├── base_model_inference.py │ ├── gpt4_model_processor.py │ └── llava2_model_processor.py ├── pipeline_processor │ ├── __init__.py │ ├── gpt4_pipeline.py │ ├── llava_pipeline.py │ └── record.py ├── requirements.txt ├── scripts │ └── nextqa_multi.sh └── vision_processor │ ├── __init__.py │ ├── base_frame_extractor.py │ ├── base_post_processor.py │ ├── decorator_processor.py │ ├── fps_extractor.py │ ├── fps_gridview_processor.py │ ├── gridview_generator.py │ └── video_validation.py ├── LICENSE ├── README.md ├── cog.yaml ├── docs ├── Customize_Component.md ├── Data.md ├── Evaluation.md ├── Finetune_Custom_Data.md ├── Intel.md ├── LLaVA_Bench.md ├── LLaVA_from_LLaMA2.md ├── LoRA.md ├── MODEL_ZOO.md ├── ScienceQA.md ├── Windows.md └── macOS.md ├── images ├── demo_cli.gif ├── llava_example_cmp.png ├── llava_logo.png ├── llava_v1_5_radar.jpg └── teaser.png ├── llava ├── __init__.py ├── constants.py ├── conversation.py ├── eval │ ├── eval_gpt_review.py │ ├── eval_gpt_review_bench.py │ ├── eval_gpt_review_visual.py │ ├── eval_pope.py │ ├── eval_science_qa.py │ ├── eval_science_qa_gpt4.py │ ├── eval_science_qa_gpt4_requery.py │ ├── eval_textvqa.py │ ├── generate_webpage_data_from_table.py │ ├── m4c_evaluator.py │ ├── model_qa.py │ ├── model_vqa.py │ ├── model_vqa_loader.py │ ├── model_vqa_mmbench.py │ ├── model_vqa_science.py │ ├── qa_baseline_gpt35.py │ ├── run_llava.py │ ├── summarize_gpt_review.py │ ├── table │ │ ├── answer │ │ │ ├── answer_alpaca-13b.jsonl │ │ │ ├── answer_bard.jsonl │ │ │ ├── answer_gpt35.jsonl │ │ │ ├── answer_llama-13b.jsonl │ │ │ └── answer_vicuna-13b.jsonl │ │ ├── caps_boxes_coco2014_val_80.jsonl │ │ ├── model.jsonl │ │ ├── prompt.jsonl │ │ ├── question.jsonl │ │ ├── results │ │ │ ├── test_sqa_llava_13b_v0.json │ │ │ └── test_sqa_llava_lcs_558k_sqa_12e_vicuna_v1_3_13b.json │ │ ├── review │ │ │ ├── review_alpaca-13b_vicuna-13b.jsonl │ │ │ ├── review_bard_vicuna-13b.jsonl │ │ │ ├── review_gpt35_vicuna-13b.jsonl │ │ │ └── review_llama-13b_vicuna-13b.jsonl │ │ ├── 
reviewer.jsonl │ │ └── rule.json │ └── webpage │ │ ├── figures │ │ ├── alpaca.png │ │ ├── bard.jpg │ │ ├── chatgpt.svg │ │ ├── llama.jpg │ │ ├── swords_FILL0_wght300_GRAD0_opsz48.svg │ │ └── vicuna.jpeg │ │ ├── index.html │ │ ├── script.js │ │ └── styles.css ├── mm_utils.py ├── model │ ├── __init__.py │ ├── apply_delta.py │ ├── builder.py │ ├── consolidate.py │ ├── language_model │ │ ├── llava_llama.py │ │ ├── llava_mistral.py │ │ └── llava_mpt.py │ ├── llava_arch.py │ ├── make_delta.py │ ├── multimodal_encoder │ │ ├── builder.py │ │ └── clip_encoder.py │ ├── multimodal_projector │ │ └── builder.py │ └── utils.py ├── serve │ ├── __init__.py │ ├── cli.py │ ├── controller.py │ ├── examples │ │ ├── extreme_ironing.jpg │ │ ├── m3-demo-1.jpg │ │ ├── m3-demo.jpg │ │ ├── sample_demo_1.mp4 │ │ ├── sample_demo_3.mp4 │ │ └── waterview.jpg │ ├── gradio_web_server.py │ ├── model_worker.py │ ├── register_worker.py │ ├── sglang_worker.py │ └── test_message.py ├── train │ ├── llama_flash_attn_monkey_patch.py │ ├── llama_xformers_attn_monkey_patch.py │ ├── llava_trainer.py │ ├── train.py │ ├── train_mem.py │ └── train_xformers.py └── utils.py ├── lmms-eval ├── LICENSE ├── README.md ├── docs │ ├── README.md │ ├── commands.md │ ├── model_guide.md │ └── task_guide.md ├── example_eval.yaml ├── llava_repr_requirements.txt ├── lmms_eval │ ├── __init__.py │ ├── __main__.py │ ├── api │ │ ├── __init__.py │ │ ├── filter.py │ │ ├── instance.py │ │ ├── metrics.py │ │ ├── model.py │ │ ├── registry.py │ │ ├── samplers.py │ │ └── task.py │ ├── evaluator.py │ ├── filters │ │ ├── __init__.py │ │ ├── decontamination.py │ │ ├── extraction.py │ │ ├── selection.py │ │ └── transformation.py │ ├── logging_utils.py │ ├── models │ │ ├── __init__.py │ │ ├── fuyu.py │ │ ├── gpt4v.py │ │ ├── idefics2.py │ │ ├── instructblip.py │ │ ├── llava.py │ │ ├── llava_hf.py │ │ ├── llava_sglang.py │ │ ├── minicpm_v.py │ │ ├── model_utils │ │ │ ├── __init__.py │ │ │ └── qwen │ │ │ │ └── qwen_generate_utils.py │ │ ├── phi3v.py │ │ ├── qwen_vl.py │ │ ├── qwen_vl_api.py │ │ └── xcomposer2_4KHD.py │ ├── tasks │ │ ├── __init__.py │ │ ├── _task_utils │ │ │ ├── file_utils.py │ │ │ ├── gpt_eval_utils.py │ │ │ └── vqa_eval_metric.py │ │ ├── ai2d │ │ │ ├── ai2d.yaml │ │ │ ├── upload_ai2d.py │ │ │ └── utils.py │ │ ├── chartqa │ │ │ ├── chartqa.yaml │ │ │ ├── upload_chartqa.py │ │ │ └── utils.py │ │ ├── cmmmu │ │ │ ├── _cmmmu.yaml │ │ │ ├── _default_template_cmmmu_yaml │ │ │ ├── cmmmu_test.yaml │ │ │ ├── cmmmu_val.yaml │ │ │ └── utils.py │ │ ├── coco_cap │ │ │ ├── coco2014_cap.yaml │ │ │ ├── coco2014_cap_test.yaml │ │ │ ├── coco2014_cap_val.yaml │ │ │ ├── coco2017_cap.yaml │ │ │ ├── coco2017_cap_test.yaml │ │ │ ├── coco2017_cap_val.yaml │ │ │ ├── coco_cap.yaml │ │ │ └── utils.py │ │ ├── docvqa │ │ │ ├── _default_template_docvqa_yaml │ │ │ ├── docvqa.yaml │ │ │ ├── docvqa_test.yaml │ │ │ ├── docvqa_val.yaml │ │ │ └── utils.py │ │ ├── ferret │ │ │ ├── ferret.yaml │ │ │ └── utils.py │ │ ├── flickr30k │ │ │ ├── flickr30k.yaml │ │ │ ├── flickr30k_test.yaml │ │ │ └── utils.py │ │ ├── gqa │ │ │ ├── gqa.yaml │ │ │ └── utils.py │ │ ├── hallusion_bench │ │ │ ├── evaluate_hb.py │ │ │ ├── hallusion_bench_image.yaml │ │ │ └── utils.py │ │ ├── iconqa │ │ │ ├── _default_template_docvqa_yaml │ │ │ ├── iconqa.yaml │ │ │ ├── iconqa_test.yaml │ │ │ ├── iconqa_val.yaml │ │ │ └── utils.py │ │ ├── infovqa │ │ │ ├── _default_template_infovqa_yaml │ │ │ ├── infovqa.yaml │ │ │ ├── infovqa_test.yaml │ │ │ ├── infovqa_val.yaml │ │ │ └── utils.py │ │ ├── llava-bench-coco │ │ │ ├── 
llava-bench-coco.yaml │ │ │ └── utils.py │ │ ├── llava-in-the-wild │ │ │ ├── llava-in-the-wild.yaml │ │ │ └── utils.py │ │ ├── mathverse │ │ │ ├── mathverse.yaml │ │ │ ├── mathverse_evals.py │ │ │ ├── mathverse_testmini.yaml │ │ │ ├── mathverse_testmini_text_dominant.yaml │ │ │ ├── mathverse_testmini_text_lite.yaml │ │ │ ├── mathverse_testmini_text_only.yaml │ │ │ ├── mathverse_testmini_vision_dominant.yaml │ │ │ ├── mathverse_testmini_vision_intensive.yaml │ │ │ ├── mathverse_testmini_vision_only.yaml │ │ │ └── utils.py │ │ ├── mathvista │ │ │ ├── mathvista.yaml │ │ │ ├── mathvista_evals.py │ │ │ ├── mathvista_test.yaml │ │ │ ├── mathvista_testmini.yaml │ │ │ └── utils.py │ │ ├── mmbench │ │ │ ├── _default_template_mmbench_cn_yaml │ │ │ ├── _default_template_mmbench_en_yaml │ │ │ ├── cc_utils.py │ │ │ ├── cn_utils.py │ │ │ ├── en_utils.py │ │ │ ├── mmbench.yaml │ │ │ ├── mmbench_cc.yaml │ │ │ ├── mmbench_cn.yaml │ │ │ ├── mmbench_cn_dev.yaml │ │ │ ├── mmbench_cn_test.yaml │ │ │ ├── mmbench_en.yaml │ │ │ ├── mmbench_en_dev.yaml │ │ │ ├── mmbench_en_test.yaml │ │ │ └── mmbench_evals.py │ │ ├── mme │ │ │ ├── mme.yaml │ │ │ └── utils.py │ │ ├── mmmu │ │ │ ├── mmmu.yaml │ │ │ ├── mmmu_group_img.yaml │ │ │ ├── mmmu_group_img_test.yaml │ │ │ ├── mmmu_group_img_val.yaml │ │ │ ├── mmmu_test.yaml │ │ │ ├── mmmu_val.yaml │ │ │ ├── utils.py │ │ │ └── utils_group_img.py │ │ ├── mmvet │ │ │ ├── mmvet.yaml │ │ │ └── utils.py │ │ ├── multidocvqa │ │ │ ├── multidocvqa.yaml │ │ │ ├── multidocvqa_test.yaml │ │ │ ├── multidocvqa_val.yaml │ │ │ └── utils.py │ │ ├── multilingual-llava-bench-in-the-wild │ │ │ ├── README.md │ │ │ ├── _default_template.yaml │ │ │ ├── arabic_llava_in_the_wild.yaml │ │ │ ├── bengali_llava_in_the_wild.yaml │ │ │ ├── chinese_llava_in_the_wild.yaml │ │ │ ├── french_llava_in_the_wild.yaml │ │ │ ├── hindi_llava_in_the_wild.yaml │ │ │ ├── japanese_llava_in_the_wild.yaml │ │ │ ├── russian_llava_in_the_wild.yaml │ │ │ ├── spanish_llava_in_the_wild.yaml │ │ │ ├── urdu_llava_in_the_wild.yaml │ │ │ └── utils.py │ │ ├── nocaps │ │ │ ├── _default_template_nocaps_yaml │ │ │ ├── nocaps.yaml │ │ │ ├── nocaps_test.yaml │ │ │ ├── nocaps_val.yaml │ │ │ └── utils.py │ │ ├── ocrbench │ │ │ ├── ocrbench.yaml │ │ │ ├── upload_ocrbench.py │ │ │ └── utils.py │ │ ├── ok_vqa │ │ │ ├── _default_template_vqa_yaml │ │ │ ├── _generate_config.py │ │ │ ├── _ok_vqa.yaml │ │ │ ├── ok_vqa_val2014.yaml │ │ │ └── utils.py │ │ ├── olympiadbench │ │ │ ├── cn_utils.py │ │ │ ├── en_utils.py │ │ │ ├── olympiadbench.yaml │ │ │ ├── olympiadbench_evals.py │ │ │ ├── olympiadbench_test_cn.yaml │ │ │ └── olympiadbench_test_en.yaml │ │ ├── pope │ │ │ ├── pope.yaml │ │ │ ├── pope_adv.yaml │ │ │ ├── pope_full.yaml │ │ │ ├── pope_pop.yaml │ │ │ ├── pope_random.yaml │ │ │ └── utils.py │ │ ├── realworldqa │ │ │ ├── realworldqa.yaml │ │ │ └── utils.py │ │ ├── refcoco+ │ │ │ ├── _default_template_bbox_rec_yaml │ │ │ ├── _default_template_bbox_yaml │ │ │ ├── _default_template_seg_yaml │ │ │ ├── _generate_config.py │ │ │ ├── _refcoco.yaml │ │ │ ├── refcoco+_bbox_rec_testA.yaml │ │ │ ├── refcoco+_bbox_rec_testB.yaml │ │ │ ├── refcoco+_bbox_rec_val.yaml │ │ │ ├── refcoco+_bbox_testA.yaml │ │ │ ├── refcoco+_bbox_testB.yaml │ │ │ ├── refcoco+_bbox_val.yaml │ │ │ ├── refcoco+_seg_testA.yaml │ │ │ ├── refcoco+_seg_testB.yaml │ │ │ ├── refcoco+_seg_val.yaml │ │ │ ├── utils.py │ │ │ └── utils_rec.py │ │ ├── refcoco │ │ │ ├── _default_template_bbox_rec_yaml │ │ │ ├── _default_template_bbox_yaml │ │ │ ├── _default_template_seg_yaml │ │ │ ├── 
_generate_config.py │ │ │ ├── _refcoco.yaml │ │ │ ├── refcoco_bbox_rec_test.yaml │ │ │ ├── refcoco_bbox_rec_testA.yaml │ │ │ ├── refcoco_bbox_rec_testB.yaml │ │ │ ├── refcoco_bbox_rec_val.yaml │ │ │ ├── refcoco_bbox_test.yaml │ │ │ ├── refcoco_bbox_testA.yaml │ │ │ ├── refcoco_bbox_testB.yaml │ │ │ ├── refcoco_bbox_val.yaml │ │ │ ├── refcoco_seg_test.yaml │ │ │ ├── refcoco_seg_testA.yaml │ │ │ ├── refcoco_seg_testB.yaml │ │ │ ├── refcoco_seg_val.yaml │ │ │ ├── utils.py │ │ │ └── utils_rec.py │ │ ├── refcocog │ │ │ ├── _default_template_bbox_rec_yaml │ │ │ ├── _default_template_bbox_yaml │ │ │ ├── _default_template_seg_yaml │ │ │ ├── _generate_config.py │ │ │ ├── _refcoco.yaml │ │ │ ├── refcocog_bbox_rec_test.yaml │ │ │ ├── refcocog_bbox_rec_val.yaml │ │ │ ├── refcocog_bbox_test.yaml │ │ │ ├── refcocog_bbox_val.yaml │ │ │ ├── refcocog_seg_test.yaml │ │ │ ├── refcocog_seg_val.yaml │ │ │ ├── utils.py │ │ │ └── utils_rec.py │ │ ├── scienceqa │ │ │ ├── scienceqa.yaml │ │ │ ├── scienceqa_full.yaml │ │ │ ├── scienceqa_img.yaml │ │ │ └── utils.py │ │ ├── screenspot │ │ │ ├── README.md │ │ │ ├── _default_template_rec_yaml │ │ │ ├── _default_template_reg_yaml │ │ │ ├── _screenspot.yaml │ │ │ ├── screenspot_rec_test.yaml │ │ │ ├── screenspot_reg_test.yaml │ │ │ ├── utils.py │ │ │ └── utils_rec.py │ │ ├── seedbench │ │ │ ├── seedbench.yaml │ │ │ ├── seedbench_ppl.yaml │ │ │ └── utils.py │ │ ├── seedbench_2 │ │ │ ├── seedbench_2.yaml │ │ │ └── utils.py │ │ ├── stvqa │ │ │ ├── stvqa.yaml │ │ │ └── utils.py │ │ ├── textcaps │ │ │ ├── _default_template_textcaps_yaml │ │ │ ├── textcaps.yaml │ │ │ ├── textcaps_test.yaml │ │ │ ├── textcaps_train.yaml │ │ │ ├── textcaps_val.yaml │ │ │ └── utils.py │ │ ├── textvqa │ │ │ ├── _default_template_textvqa_yaml │ │ │ ├── _textvqa.yaml │ │ │ ├── textvqa_test.yaml │ │ │ ├── textvqa_val.yaml │ │ │ └── utils.py │ │ ├── vizwiz_vqa │ │ │ ├── _default_template_vqa_yaml │ │ │ ├── _generate_config.py │ │ │ ├── _vizwiz_vqa.yaml │ │ │ ├── utils.py │ │ │ ├── vizwiz_vqa_test.yaml │ │ │ └── vizwiz_vqa_val.yaml │ │ ├── vqav2 │ │ │ ├── _default_template_vqav2_yaml │ │ │ ├── _vqav2.yaml │ │ │ ├── utils.py │ │ │ ├── vqav2_test.yaml │ │ │ └── vqav2_val.yaml │ │ └── websrc │ │ │ ├── README.md │ │ │ ├── utils.py │ │ │ ├── websrc.yaml │ │ │ ├── websrc_test.yaml │ │ │ └── websrc_val.yaml │ └── utils.py ├── miscs │ ├── llava_result_check.md │ ├── repr_scripts.sh │ ├── repr_torch_envs.txt │ ├── scienceqa_id.txt │ ├── script.sh │ ├── test_llava.py │ └── test_scienceqa.py ├── pyproject.toml └── setup.py ├── playground └── data │ ├── coco2014_val_gpt4_qa_30x3.jsonl │ ├── coco2014_val_qa_eval │ ├── qa90_gpt4_answer.jsonl │ └── qa90_questions.jsonl │ └── prompts │ ├── complex_reasoning │ ├── 000_caps.txt │ ├── 000_conv.txt │ ├── 001_caps.txt │ ├── 001_conv.txt │ ├── 002_caps.txt │ ├── 002_conv.txt │ └── system_message.txt │ ├── conversation │ ├── 000_caps.txt │ ├── 000_conv.txt │ ├── 001_caps.txt │ ├── 001_conv.txt │ └── system_message.txt │ └── detail_description │ ├── 000_caps.txt │ ├── 000_conv.txt │ ├── 001_caps.txt │ ├── 001_conv.txt │ ├── 002_caps.txt │ ├── 002_conv.txt │ └── system_message.txt ├── predict.py ├── pyproject.toml └── scripts ├── convert_gqa_for_eval.py ├── convert_mmbench_for_submission.py ├── convert_mmvet_for_eval.py ├── convert_seed_for_submission.py ├── convert_sqa_to_llava.py ├── convert_sqa_to_llava_base_prompt.py ├── convert_vizwiz_for_submission.py ├── convert_vqav2_for_submission.py ├── extract_mm_projector.py ├── finetune.sh ├── finetune_full_schedule.sh ├── 
finetune_lora.sh ├── finetune_qlora.sh ├── finetune_sqa.sh ├── merge_lora_weights.py ├── pretrain.sh ├── pretrain_xformers.sh ├── sqa_eval_batch.sh ├── sqa_eval_gather.sh ├── upload_pypi.sh ├── v1_5 ├── eval │ ├── gqa.sh │ ├── llavabench.sh │ ├── mmbench.sh │ ├── mmbench_cn.sh │ ├── mme-llava-next.sh │ ├── mme.sh │ ├── mmvet.sh │ ├── pope.sh │ ├── qbench.sh │ ├── qbench_zh.sh │ ├── seed.sh │ ├── sqa.sh │ ├── textvqa.sh │ ├── vizwiz.sh │ └── vqav2.sh ├── finetune.sh ├── finetune_lora.sh ├── finetune_task.sh ├── finetune_task_lora.sh └── pretrain.sh ├── zero2.json ├── zero3.json └── zero3_offload.json /.devcontainer/devcontainer.env: -------------------------------------------------------------------------------- 1 | SAMPLE_ENV_VAR1="Sample Value" 2 | SAMPLE_ENV_VAR2=332431bf-68bf -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # The .dockerignore file excludes files from the container build process. 2 | # 3 | # https://docs.docker.com/engine/reference/builder/#dockerignore-file 4 | 5 | # Exclude Git files 6 | .git 7 | .github 8 | .gitignore 9 | 10 | # Exclude Python cache files 11 | __pycache__ 12 | .mypy_cache 13 | .pytest_cache 14 | .ruff_cache 15 | 16 | # Exclude Python virtual environment 17 | /venv 18 | 19 | # Exclude some weights 20 | /openai 21 | /liuhaotian 22 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | # Unix-style newlines with a newline ending every file 4 | [*] 5 | end_of_line = lf 6 | insert_final_newline = true 7 | trim_trailing_whitespace = true 8 | charset = utf-8 9 | 10 | # 4 space indentation 11 | [*.{py,json}] 12 | indent_style = space 13 | indent_size = 4 14 | 15 | # 2 space indentation 16 | [*.{md,sh,yaml,yml}] 17 | indent_style = space 18 | indent_size = 2 -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # https://git-scm.com/docs/gitattributes 2 | 3 | # Set the default behavior, in case people don't have core.autocrlf set. 4 | # https://git-scm.com/docs/gitattributes#_end_of_line_conversion 5 | * text=auto 6 | 7 | # common python attributes, taken from https://github.com/alexkaratarakis/gitattributes/blob/710900479a2bedeec7003d381719521ffbb18bf8/Python.gitattributes 8 | # Source files 9 | # ============ 10 | *.pxd text diff=python 11 | *.py text diff=python 12 | *.py3 text diff=python 13 | *.pyw text diff=python 14 | *.pyx text diff=python 15 | *.pyz text diff=python 16 | *.pyi text diff=python 17 | 18 | # Binary files 19 | # ============ 20 | *.db binary 21 | *.p binary 22 | *.pkl binary 23 | *.pickle binary 24 | *.pyc binary export-ignore 25 | *.pyo binary export-ignore 26 | *.pyd binary 27 | 28 | # Jupyter notebook 29 | *.ipynb text eol=lf 30 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/1-usage.yaml: -------------------------------------------------------------------------------- 1 | name: Usage issues 2 | description: Report issues in usage. 3 | title: "[Usage] " 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Thanks for taking the time to fill out this form. 
Please give as detailed a description as possible for us to better assist with the issue :) 9 | - type: textarea 10 | id: what-happened 11 | attributes: 12 | label: Describe the issue 13 | description: Please give as detailed a description as possible for us to better assist with the issue. Please paste the **FULL** error log here, so that we can better understand the issue. Wrap the log with ``` for better readability in GitHub. 14 | placeholder: Issue 15 | value: | 16 | Issue: 17 | 18 | Command: 19 | ``` 20 | PASTE THE COMMANDS HERE. 21 | ``` 22 | 23 | Log: 24 | ``` 25 | PASTE THE LOGS HERE. 26 | ``` 27 | 28 | Screenshots: 29 | You may attach screenshots if it better explains the issue. 30 | validations: 31 | required: true 32 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/2-feature-request.yaml: -------------------------------------------------------------------------------- 1 | name: Feature Request 2 | description: Request for a new feature 3 | title: "[Feature request] " 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Thanks for your interest in our work. Please share your thoughts on the new feature below. 9 | - type: textarea 10 | id: feature 11 | attributes: 12 | label: feature 13 | placeholder: Start your thoughts here... -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/3-question.yaml: -------------------------------------------------------------------------------- 1 | name: Questions 2 | description: General questions about the work 3 | title: "[Question] " 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Thanks for your interest in our work. For this type of question, it may be more suitable to go to the [discussions](https://github.com/haotian-liu/LLaVA/discussions) section. If you believe an issue would be better for your request, please continue your post below :) 9 | - type: textarea 10 | id: question 11 | attributes: 12 | label: Question 13 | placeholder: Start question here... -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/4-discussion.yaml: -------------------------------------------------------------------------------- 1 | name: Discussions 2 | description: General discussions about the work 3 | title: "[Discussion] " 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Thanks for your interest in our work. For this type of discussion, it may be more suitable to go to the [discussions](https://github.com/haotian-liu/LLaVA/discussions) section. If you believe an issue would be better for your request, please continue your post below :) 9 | - type: textarea 10 | id: discussion 11 | attributes: 12 | label: Discussion 13 | placeholder: Start discussion here...
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__ 3 | *.pyc 4 | *.egg-info 5 | dist 6 | 7 | # Log 8 | *.log 9 | *.log.* 10 | *.json 11 | *.jsonl 12 | 13 | # Data 14 | !**/alpaca-data-conversation.json 15 | 16 | # Editor 17 | .idea 18 | *.swp 19 | 20 | # Other 21 | .DS_Store 22 | wandb 23 | output 24 | 25 | checkpoints 26 | ckpts* 27 | 28 | .ipynb_checkpoints 29 | *.ipynb 30 | 31 | # DevContainer 32 | !.devcontainer/* 33 | 34 | # Demo 35 | serve_images/ 36 | llava/eval/table/results/test_sqa_llava_lcs_558k_sqa_12e_vicuna_v1_3_13b.json 37 | IG-VLM/.git_org 38 | lmms-eval/.git_org 39 | llava/eval/table/results/test_sqa_llava_13b_v0.json 40 | -------------------------------------------------------------------------------- /IG-VLM/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright <2024> 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 4 | 5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | 7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | 9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
12 | 13 | -------------------------------------------------------------------------------- /IG-VLM/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/evaluation/__init__.py -------------------------------------------------------------------------------- /IG-VLM/example/QzulzuWr0Zw_20_45.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/QzulzuWr0Zw_20_45.mp4 -------------------------------------------------------------------------------- /IG-VLM/example/extraction_sample/rlQ2kW-FvMk_66_79/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/extraction_sample/rlQ2kW-FvMk_66_79/1.jpg -------------------------------------------------------------------------------- /IG-VLM/example/extraction_sample/rlQ2kW-FvMk_66_79/2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/extraction_sample/rlQ2kW-FvMk_66_79/2.jpg -------------------------------------------------------------------------------- /IG-VLM/example/extraction_sample/rlQ2kW-FvMk_66_79/3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/extraction_sample/rlQ2kW-FvMk_66_79/3.jpg -------------------------------------------------------------------------------- /IG-VLM/example/extraction_sample/rlQ2kW-FvMk_66_79/4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/extraction_sample/rlQ2kW-FvMk_66_79/4.jpg -------------------------------------------------------------------------------- /IG-VLM/example/extraction_sample/rlQ2kW-FvMk_66_79/5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/extraction_sample/rlQ2kW-FvMk_66_79/5.jpg -------------------------------------------------------------------------------- /IG-VLM/example/extraction_sample/rlQ2kW-FvMk_66_79/6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/extraction_sample/rlQ2kW-FvMk_66_79/6.jpg -------------------------------------------------------------------------------- /IG-VLM/example/extraction_sample/ysTmUTQ5wZE_17_45/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/extraction_sample/ysTmUTQ5wZE_17_45/1.jpg -------------------------------------------------------------------------------- /IG-VLM/example/extraction_sample/ysTmUTQ5wZE_17_45/2.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/extraction_sample/ysTmUTQ5wZE_17_45/2.jpg -------------------------------------------------------------------------------- /IG-VLM/example/extraction_sample/ysTmUTQ5wZE_17_45/3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/extraction_sample/ysTmUTQ5wZE_17_45/3.jpg -------------------------------------------------------------------------------- /IG-VLM/example/extraction_sample/ysTmUTQ5wZE_17_45/4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/extraction_sample/ysTmUTQ5wZE_17_45/4.jpg -------------------------------------------------------------------------------- /IG-VLM/example/extraction_sample/ysTmUTQ5wZE_17_45/5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/extraction_sample/ysTmUTQ5wZE_17_45/5.jpg -------------------------------------------------------------------------------- /IG-VLM/example/extraction_sample/ysTmUTQ5wZE_17_45/6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/extraction_sample/ysTmUTQ5wZE_17_45/6.jpg -------------------------------------------------------------------------------- /IG-VLM/example/imagegrid_sample/QzulzuWr0Zw_20_45.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/imagegrid_sample/QzulzuWr0Zw_20_45.jpg -------------------------------------------------------------------------------- /IG-VLM/example/imagegrid_sample/rlQ2kW-FvMk_66_79.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/imagegrid_sample/rlQ2kW-FvMk_66_79.jpg -------------------------------------------------------------------------------- /IG-VLM/example/rlQ2kW-FvMk_66_79.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/rlQ2kW-FvMk_66_79.mp4 -------------------------------------------------------------------------------- /IG-VLM/example/ysTmUTQ5wZE_17_45.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/ysTmUTQ5wZE_17_45.mp4 -------------------------------------------------------------------------------- /IG-VLM/model_processor/__init__.py: -------------------------------------------------------------------------------- 1 | # from llava_model_processor import LlavaProcessor 2 | # from base_model_inference import BaseModelInference 3 | # from videollava_model_processor import VideoLLavaProcessor 4 | -------------------------------------------------------------------------------- 
/IG-VLM/model_processor/base_model_inference.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024, Deep Representation Learning Research Group, Seoul National University. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | from abc import ABC, abstractmethod 10 | from enum import Enum 11 | 12 | 13 | class BaseModelInference(ABC): 14 | def __init__(self, model_name, local_save_path): 15 | self.model_name = model_name 16 | self.local_save_path = local_save_path 17 | self.error_list = [] 18 | 19 | @abstractmethod 20 | def load_model(self, **kwargs): 21 | pass 22 | 23 | @abstractmethod 24 | def inference(self, **kwargs): 25 | pass 26 | 27 | @abstractmethod 28 | def extract_answers(self): 29 | pass 30 | 31 | def save_local_file(self, answer): 32 | directory = os.path.dirname(self.local_save_path) 33 | if not os.path.exists(directory): 34 | os.makedirs(directory) 35 | 36 | with open(self.local_save_path, "w") as file: 37 | file.write(answer) 38 | 39 | def infer_and_save(self, **kwargs): 40 | try: 41 | self.inference(**kwargs) 42 | answer = self.extract_answers() 43 | return answer 44 | except Exception as e: 45 | self.error_list.append(e) 46 | print(e) 47 | return -1 48 | -------------------------------------------------------------------------------- /IG-VLM/pipeline_processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/pipeline_processor/__init__.py -------------------------------------------------------------------------------- /IG-VLM/pipeline_processor/record.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024, Deep Representation Learning Research Group, Seoul National University. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from enum import Enum 9 | 10 | 11 | class EvaluationType(Enum): 12 | DEFAULT = 0 13 | CORRECTNESS = 1 14 | DETAILED_ORIENTATION = 2 15 | CONTEXT = 3 16 | TEMPORAL = 4 17 | -------------------------------------------------------------------------------- /IG-VLM/requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | moviepy 3 | opencv-python 4 | openai==1.14.0 5 | torch==2.2.0 6 | bitsandbytes==0.42.0 7 | flash_attn==2.5.3 8 | transformers==4.36.2 9 | transformers-stream-generator==0.0.4 10 | torchvision==0.17.0 11 | pytorchvideo @ git+https://github.com/facebookresearch/pytorchvideo.git@28fe037d212663c6a24f373b94cc5d478c8c1a1d -------------------------------------------------------------------------------- /IG-VLM/scripts/nextqa_multi.sh: -------------------------------------------------------------------------------- 1 | matryoshka_vis_token_scale=1 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | echo 'CHUNKS NUM:' $CHUNKS 9 | for IDX in $(seq 0 $((CHUNKS-1))); do 10 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python eval_llava_multiplechoice.py --path_qa_pair_csv ./data/multiple_choice_qa/NExT_QA.csv --path_video /data/NExTVideo_all/%s.mp4 --path_result ./result_nextqa-$matryoshka_vis_token_scale/ --llm_size 7b --matryoshka_vis_token_scale=$matryoshka_vis_token_scale \ 11 | --num_chunks $CHUNKS \ 12 | --chunk_idx $IDX & 13 | done 14 | 15 | wait 16 | 17 | -------------------------------------------------------------------------------- /IG-VLM/vision_processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/vision_processor/__init__.py -------------------------------------------------------------------------------- /IG-VLM/vision_processor/base_post_processor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | from abc import ABC, abstractmethod 4 | import numpy as np 5 | from enum import Enum 6 | 7 | from PIL import Image 8 | 9 | import os 10 | import sys 11 | 12 | sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | from .decorator_processor import * 14 | from .base_frame_extractor import * 15 | 16 | 17 | class BasePostProcessor(ABC): 18 | def __init__(self): 19 | pass 20 | 21 | @abstractmethod 22 | def post_process(self, **kwargs): 23 | pass 24 | 25 | def _post_process_on_option(self, *args, **kwargs): 26 | data = self.post_process(*args, **kwargs) 27 | return data 28 | 29 | def post_process_based_on_options(self, option, *args, **kwargs): 30 | self.option = option 31 | data = self._post_process_on_option(*args, **kwargs) 32 | if option == SaveOption.BASE64: 33 | return self._save_data_to_base64(data, quality=self.quality) 34 | elif option == SaveOption.IMAGE: 35 | return Image.fromarray(data) 36 | else: 37 | raise ValueError("Invalid option: {}".format(option)) 38 | 39 | @save_to_base64 40 | def _save_data_to_base64(self, data, quality=95): 41 | return data 42 | 43 | @save_to_file 44 | def _save_data_to_file(self, data, filename=None, quality=95): 45 | cv2.imwrite(filename, data, [int(cv2.IMWRITE_JPEG_QUALITY), quality]) 46 | pass 47 |
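# --- Editor's note (illustrative, not part of the original file) ---------------------
# BasePostProcessor is a small template-method base class: a concrete subclass implements
# `post_process` to produce one numpy image (e.g. an image grid built from sampled video
# frames) and sets `self.quality` for JPEG encoding; callers then choose the output form via
# `post_process_based_on_options(SaveOption.BASE64, ...)` (a base64-encoded JPEG through the
# `save_to_base64` decorator) or `post_process_based_on_options(SaveOption.IMAGE, ...)`
# (a PIL.Image). A minimal hypothetical subclass:
#
#     class HorizontalGridProcessor(BasePostProcessor):
#         def __init__(self, quality=95):
#             super().__init__()
#             self.quality = quality
#
#         def post_process(self, frames):
#             return np.concatenate(frames, axis=1)  # stitch a frame list into one wide image
#
#     encoded = HorizontalGridProcessor().post_process_based_on_options(SaveOption.BASE64, frames)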
-------------------------------------------------------------------------------- /IG-VLM/vision_processor/decorator_processor.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import base64 3 | import os 4 | from io import BytesIO 5 | from PIL import Image 6 | 7 | 8 | def save_to_bytes(func): 9 | def wrapper(self, data): 10 | bytes_data = pickle.dumps(data) 11 | return func(self, bytes_data) 12 | 13 | return wrapper 14 | 15 | 16 | def save_to_one_file(func): 17 | def wrapper(self, data, filename): 18 | with open(filename, "wb") as file: 19 | func(self, data, file) 20 | 21 | return wrapper 22 | 23 | 24 | def save_to_file(func): 25 | def wrapper(self, data, filename, quality): 26 | os.makedirs(filename, exist_ok=True) 27 | for i, image_data in enumerate(data): 28 | file_path = os.path.join(filename, f"{i+1}.jpg") 29 | 30 | func(self, image_data, file_path, quality) 31 | 32 | return wrapper 33 | 34 | 35 | def save_to_base64(func): 36 | def wrapper(self, data, quality=95): 37 | rlt = Image.fromarray(data) 38 | 39 | with BytesIO() as byte_output: 40 | rlt.save(byte_output, format="JPEG", quality=quality) 41 | byte_output.seek(0) 42 | byte_data = byte_output.read() 43 | base64_data = base64.b64encode(byte_data).decode("utf-8") 44 | return func(self, base64_data) 45 | 46 | return wrapper 47 | -------------------------------------------------------------------------------- /cog.yaml: -------------------------------------------------------------------------------- 1 | # Configuration for Cog ⚙️ 2 | # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md 3 | 4 | build: 5 | gpu: true 6 | 7 | python_version: "3.11" 8 | 9 | python_packages: 10 | - "torch==2.0.1" 11 | - "accelerate==0.21.0" 12 | - "bitsandbytes==0.41.0" 13 | - "deepspeed==0.9.5" 14 | - "einops-exts==0.0.4" 15 | - "einops==0.6.1" 16 | - "gradio==3.35.2" 17 | - "gradio_client==0.2.9" 18 | - "httpx==0.24.0" 19 | - "markdown2==2.4.10" 20 | - "numpy==1.26.0" 21 | - "peft==0.4.0" 22 | - "scikit-learn==1.2.2" 23 | - "sentencepiece==0.1.99" 24 | - "shortuuid==1.0.11" 25 | - "timm==0.6.13" 26 | - "tokenizers==0.13.3" 27 | - "torch==2.0.1" 28 | - "torchvision==0.15.2" 29 | - "transformers==4.31.0" 30 | - "wandb==0.15.12" 31 | - "wavedrom==2.0.3.post3" 32 | - "Pygments==2.16.1" 33 | run: 34 | - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.0.3/pget" && chmod +x /usr/local/bin/pget 35 | 36 | # predict.py defines how predictions are run on your model 37 | predict: "predict.py:Predictor" 38 | -------------------------------------------------------------------------------- /docs/Customize_Component.md: -------------------------------------------------------------------------------- 1 | # Customize Components in M3 2 | 3 | This is an initial guide on how to replace the LLMs, visual encoders, etc. with your choice of components. 4 | 5 | ## LLM 6 | 7 | It is quite simple to swap out LLaMA to any other LLMs. You can refer to our implementation of [`llava_llama.py`](https://raw.githubusercontent.com/mu-cai/matryoshka-mm/main/llava/model/language_model/llava_llama.py) for an example of how to replace the LLM. 8 | 9 | Although it may seem that it still needs ~100 lines of code, most of them are copied from the original `llama.py` from HF. The only part that is different is to insert some lines for processing the multimodal inputs. 
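In skeleton form, those extra lines look roughly like the sketch below. This is an editor's illustration only, with an invented class name and simplified signatures; the actual implementation is `llava/model/language_model/llava_llama.py`, and the two hooks are walked through in the next two paragraphs.

```python
# Illustrative sketch: "MyLm" stands in for whichever HF causal LM you are wrapping.
from transformers import MistralForCausalLM  # e.g. your chosen base LLM

from llava.model.llava_arch import LlavaMetaForCausalLM


class LlavaMyLmForCausalLM(MistralForCausalLM, LlavaMetaForCausalLM):
    def forward(self, input_ids=None, attention_mask=None, labels=None,
                images=None, inputs_embeds=None, **kwargs):
        if inputs_embeds is None:
            # Hook 1: fold the image features into the token embeddings before the LLM runs.
            (input_ids, _, attention_mask, _, inputs_embeds, labels) = \
                self.prepare_inputs_labels_for_multimodal(
                    input_ids, None, attention_mask, None, labels, images)
        return super().forward(input_ids=input_ids, attention_mask=attention_mask,
                               inputs_embeds=inputs_embeds, labels=labels, **kwargs)

    def prepare_inputs_for_generation(self, input_ids, images=None, **kwargs):
        # Hook 2: keep `images` in model_inputs so they stay available during generation.
        model_inputs = super().prepare_inputs_for_generation(input_ids, **kwargs)
        if images is not None:
            model_inputs["images"] = images
        return model_inputs
```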
10 | 11 | In the `forward` function, you can see that we call `self.prepare_inputs_labels_for_multimodal` to process the multimodal inputs. This function is defined in `LlavaMetaForCausalLM` and you just need to insert it into the `forward` function of your LLM. 12 | 13 | In the `prepare_inputs_for_generation` function, you can see that we add `images` to the `model_inputs`. This is because we need to pass the images to the LLM during generation. 14 | 15 | These are basically all the changes you need to make to replace the LLM. 16 | 17 | ## Visual Encoder 18 | 19 | You can check out [`clip_encoder.py`](https://github.com/mu-cai/matryoshka-mm/blob/main/llava/model/multimodal_encoder/clip_encoder.py) to see how we implement the CLIP visual encoder. 20 | 21 | -------------------------------------------------------------------------------- /docs/Intel.md: -------------------------------------------------------------------------------- 1 | # Intel Platforms 2 | 3 | * Support [Intel GPU Max Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/max-series.html) 4 | * Support [Intel CPU Sapphire Rapids](https://ark.intel.com/content/www/us/en/ark/products/codename/126212/products-formerly-sapphire-rapids.html) 5 | * Based on [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch) 6 | 7 | More details are in the [**intel branch**](https://github.com/mu-cai/matryoshka-mm/tree/intel/docs/intel) 8 | -------------------------------------------------------------------------------- /docs/MODEL_ZOO.md: -------------------------------------------------------------------------------- 1 | # Model Zoo 2 | 3 | 4 | If you are interested in including any other details in the Model Zoo, please open an issue :) 5 | 6 | The usage of LLaVA checkpoints should comply with the base LLM's model license. 7 | 8 | ## LLaVA-NeXT-M3 9 | 10 | 11 | Checkpoint is [here](https://huggingface.co/mucai/llava-next-vicuna-7b-m3). 12 | 13 | 14 | Base model: Vicuna v1.5-7B 15 | 16 | 17 | ## LLaVA-v1.5-M3 18 | 19 | 20 | Checkpoint is [here](https://huggingface.co/mucai/llava-v1.5-7b-m3). 21 | 22 | Base model: Vicuna v1.5-7B 23 | -------------------------------------------------------------------------------- /docs/Windows.md: -------------------------------------------------------------------------------- 1 | # Run LLaVA on Windows 2 | 3 | *NOTE: LLaVA on Windows is not fully supported. Currently we only support 16-bit inference. For more complete support, please use [WSL2](https://learn.microsoft.com/en-us/windows/wsl/install) for now. More functionality on Windows will be added soon, stay tuned.* 4 | 5 | ## Installation 6 | 7 | 1. Clone this repository and navigate to the LLaVA folder 8 | ```bash 9 | git clone https://github.com/mu-cai/matryoshka-mm.git 10 | cd LLaVA 11 | ``` 12 | 13 | 2. Install Package 14 | ```Shell 15 | conda create -n matryoshka-mm python=3.10 -y 16 | conda activate matryoshka-mm 17 | python -m pip install --upgrade pip # enable PEP 660 support 18 | pip install torch==2.0.1+cu117 torchvision==0.15.2+cu117 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu117 19 | pip install -e . 20 | pip uninstall bitsandbytes 21 | ``` 22 | 23 | ## Run demo 24 | 25 | See instructions [here](https://github.com/mu-cai/matryoshka-mm#demo). 26 | 27 | Note that quantization (4-bit, 8-bit) is *NOT* supported on Windows. Stay tuned for 4-bit support on Windows!
28 | -------------------------------------------------------------------------------- /docs/macOS.md: -------------------------------------------------------------------------------- 1 | # Run LLaVA on macOS 2 | 3 | *NOTE: LLaVA on macOS is not fully supported. Currently we only support 16-bit inference. More functionality on macOS will be added soon, stay tuned.* 4 | 5 | ## Installation 6 | 7 | 1. Clone this repository and navigate to the LLaVA folder 8 | ```bash 9 | git clone https://github.com/mu-cai/matryoshka-mm.git 10 | cd LLaVA 11 | ``` 12 | 13 | 2. Install Package 14 | ```Shell 15 | conda create -n llava python=3.10 -y 16 | conda activate llava 17 | python -m pip install --upgrade pip # enable PEP 660 support 18 | pip install -e . 19 | pip install torch==2.1.0 torchvision==0.16.0 20 | pip uninstall bitsandbytes 21 | ``` 22 | 23 | ## Run demo 24 | 25 | Specify `--device mps` when launching model worker or CLI. 26 | 27 | See instructions [here](https://github.com/mu-cai/matryoshka-mm#demo). 28 | 29 | Note that quantization (4-bit, 8-bit) is *NOT* supported on macOS. Stay tuned for 4-bit support on macOS! 30 | -------------------------------------------------------------------------------- /images/demo_cli.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/images/demo_cli.gif -------------------------------------------------------------------------------- /images/llava_example_cmp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/images/llava_example_cmp.png -------------------------------------------------------------------------------- /images/llava_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/images/llava_logo.png -------------------------------------------------------------------------------- /images/llava_v1_5_radar.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/images/llava_v1_5_radar.jpg -------------------------------------------------------------------------------- /images/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/images/teaser.png -------------------------------------------------------------------------------- /llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /llava/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "."
5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "" 11 | DEFAULT_IM_START_TOKEN = "" 12 | DEFAULT_IM_END_TOKEN = "" 13 | IMAGE_PLACEHOLDER = "" 14 | -------------------------------------------------------------------------------- /llava/eval/table/model.jsonl: -------------------------------------------------------------------------------- 1 | {"model_id": "vicuna-13b:20230322-clean-lang", "model_name": "vicuna-13b", "model_version": "20230322-clean-lang", "model_metadata": "vicuna-13b-20230322-clean-lang"} 2 | {"model_id": "alpaca-13b:v1", "model_name": "alpaca-13b", "model_version": "v1", "model_metadata": "alpaca-13b"} 3 | {"model_id": "llama-13b:v1", "model_name": "llama-13b", "model_version": "v1", "model_metadata": "hf-llama-13b"} 4 | {"model_id": "bard:20230327", "model_name": "bard", "model_version": "20230327", "model_metadata": "Google Bard 20230327"} 5 | {"model_id": "gpt-3.5-turbo:20230327", "model_name": "gpt-3.5-turbo", "model_version": "20230327", "model_metadata": "OpenAI ChatGPT gpt-3.5-turbo Chat Completion"} 6 | -------------------------------------------------------------------------------- /llava/eval/table/reviewer.jsonl: -------------------------------------------------------------------------------- 1 | {"reviewer_id": "gpt-4-0328-default", "prompt_id": 1, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for general questions"} 2 | {"reviewer_id": "gpt-4-0328-coding", "prompt_id": 2, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for coding questions"} 3 | {"reviewer_id": "gpt-4-0328-math", "prompt_id": 3, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 4 | {"reviewer_id": "gpt-4-0417-visual", "prompt_id": 4, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 5 | -------------------------------------------------------------------------------- /llava/eval/webpage/figures/alpaca.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/llava/eval/webpage/figures/alpaca.png -------------------------------------------------------------------------------- /llava/eval/webpage/figures/bard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/llava/eval/webpage/figures/bard.jpg -------------------------------------------------------------------------------- /llava/eval/webpage/figures/llama.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/llava/eval/webpage/figures/llama.jpg -------------------------------------------------------------------------------- /llava/eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llava/eval/webpage/figures/vicuna.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/llava/eval/webpage/figures/vicuna.jpeg 
-------------------------------------------------------------------------------- /llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 3 | from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig 4 | from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig 5 | except: 6 | pass 7 | -------------------------------------------------------------------------------- /llava/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava.model import * 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | use_s2 = getattr(vision_tower_cfg, 's2', False) 9 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 10 | if use_s2: 11 | return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) 12 | else: 13 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 14 | 15 | raise ValueError(f'Unknown vision tower: {vision_tower}') 16 | -------------------------------------------------------------------------------- /llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. 
[Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /llava/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/llava/serve/__init__.py -------------------------------------------------------------------------------- /llava/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/llava/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /llava/serve/examples/m3-demo-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/llava/serve/examples/m3-demo-1.jpg -------------------------------------------------------------------------------- /llava/serve/examples/m3-demo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/llava/serve/examples/m3-demo.jpg -------------------------------------------------------------------------------- /llava/serve/examples/sample_demo_1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/llava/serve/examples/sample_demo_1.mp4 -------------------------------------------------------------------------------- /llava/serve/examples/sample_demo_3.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/llava/serve/examples/sample_demo_3.mp4 -------------------------------------------------------------------------------- /llava/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/llava/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /llava/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 
3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /llava/train/train_mem.py: -------------------------------------------------------------------------------- 1 | from llava.train.train import train 2 | 3 | if __name__ == "__main__": 4 | train(attn_implementation="flash_attention_2") 5 | -------------------------------------------------------------------------------- /llava/train/train_xformers.py: -------------------------------------------------------------------------------- 1 | # Make it more memory efficient by monkey patching the LLaMA model with xformers attention. 2 | 3 | # Need to call this before importing transformers. 4 | from llava.train.llama_xformers_attn_monkey_patch import ( 5 | replace_llama_attn_with_xformers_attn, 6 | ) 7 | 8 | replace_llama_attn_with_xformers_attn() 9 | 10 | from llava.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /lmms-eval/docs/README.md: -------------------------------------------------------------------------------- 1 | # LMMs Eval Documentation 2 | 3 | Welcome to the docs for `lmms-eval`! 4 | 5 | The majority of this documentation is adapted from [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness/) 6 | 7 | ## Table of Contents 8 | 9 | * To learn about the command-line flags, see the [commands](commands.md) guide. 10 | * To learn how to add a new model, see the [Model Guide](model_guide.md). 11 | * For a crash course on adding new tasks to the library, see our [Task Guide](task_guide.md).
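## Quick start (illustrative)

For orientation, an evaluation can be launched from the command line or through a YAML config such as `example_eval.yaml`. The invocation below is an editor's sketch only: the flag names mirror the keys used in `example_eval.yaml`, and [commands](commands.md) remains the authoritative reference for the options supported by your version.

```bash
# Hypothetical single-process run; model, checkpoint, and task are taken from example_eval.yaml.
python3 -m lmms_eval \
    --model llava \
    --model_args pretrained=liuhaotian/llava-v1.5-7b \
    --tasks ai2d \
    --batch_size 1 \
    --log_samples \
    --output_path ./logs/
```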
-------------------------------------------------------------------------------- /lmms-eval/example_eval.yaml: -------------------------------------------------------------------------------- 1 | - model: llava 2 | model_args: pretrained=liuhaotian/llava-v1.5-7b 3 | tasks: ai2d 4 | batch_size: 1 5 | log_samples: true 6 | log_samples_suffix: eval_vizwiz_vqa 7 | output_path: "./logs/" 8 | 9 | - model: llava 10 | model_args: pretrained=liuhaotian/llava-v1.5-13b 11 | tasks: mme 12 | batch_size: 1 13 | log_samples: true 14 | log_samples_suffix: mme 15 | output_path: "./logs/" 16 | -------------------------------------------------------------------------------- /lmms-eval/llava_repr_requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.21.0 2 | datasets==2.16.1 3 | evaluate==0.4.1 4 | hf_transfer==0.1.6 5 | Jinja2==3.1.3 6 | numpy==1.26.4 7 | openai==1.13.3 8 | packaging==23.2 9 | pandas==2.2.1 10 | Pillow==10.2.0 11 | protobuf==4.25.3 12 | pycocoevalcap==1.2 13 | pycocotools==2.0.7 14 | pytablewriter==1.2.0 15 | pytest==8.0.2 16 | python_Levenshtein==0.25.0 17 | pytz==2024.1 18 | PyYAML==6.0.1 19 | PyYAML==6.0.1 20 | Requests==2.31.0 21 | sacrebleu==2.4.0 22 | scikit_learn==1.2.2 23 | sentencepiece==0.1.99 24 | setuptools==68.2.2 25 | sglang==0.1.12 26 | shortuuid==1.0.12 27 | sqlitedict==2.1.0 28 | tenacity==8.2.3 29 | torch==2.0.1 30 | openai>=1.0.0 31 | pycocoevalcap 32 | tokenizers==0.15.2 33 | tqdm==4.66.2 34 | tqdm-multiprocess 35 | transformers==4.37.2 36 | zstandard 37 | pillow 38 | pyyaml 39 | sympy 40 | mpmath 41 | Jinja2 42 | openpyxl 43 | Levenshtein 44 | hf_transfer 45 | tenacity 46 | wandb>=0.16.0 47 | transformers-stream-generator 48 | tiktoken 49 | pre-commit -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/lmms-eval/lmms_eval/__init__.py -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/lmms-eval/lmms_eval/api/__init__.py -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/api/instance.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Literal, Tuple 3 | 4 | 5 | @dataclass 6 | class Instance: 7 | request_type: Literal["loglikelihood", "generate_until"] 8 | arguments: tuple 9 | idx: int 10 | metadata: Tuple[str, int, int] = field(default_factory=lambda: (None, None, None)) # TODO: better typehints here 11 | resps: list = field(default_factory=list) 12 | filtered_resps: dict = field(default_factory=dict) 13 | 14 | # initialized after init 15 | task_name: str = None 16 | doc_id: str = None 17 | repeats: str = None 18 | doc: dict = None 19 | 20 | def __post_init__(self) -> None: 21 | # unpack metadata field 22 | self.task_name, self.doc_id, self.repeats = self.metadata 23 | 24 | @property 25 | def args(self): 26 | """ 27 | Returns (string,) where `string` is the string to calculate loglikelihood over 28 | """ 29 | return self.arguments if isinstance(self.arguments, tuple) else 
(self.arguments,) 30 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/filters/decontamination.py: -------------------------------------------------------------------------------- 1 | from lmms_eval.api.filter import Filter 2 | 3 | 4 | class DecontaminationFilter(Filter): 5 | """ 6 | A filter which evaluates 7 | """ 8 | 9 | name = "track_decontamination" 10 | 11 | def __init__(self, path) -> None: 12 | """ 13 | 14 | TODO: make sure only ever run one time on the train set (should this be cached as a class var? keyed by value for "path"). 15 | should further cache result on a given (task_name, doc_id) 16 | """ 17 | self._decontam_results = None 18 | 19 | def apply(self, resps, docs) -> None: 20 | """ 21 | Return {"no_contamination", "only_contamination"} keys for the 2 different subsets 22 | """ 23 | pass 24 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/models/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | AVAILABLE_MODELS = { 4 | "llava": "Llava", 5 | "llava_hf": "LlavaHf", 6 | "llava_sglang": "LlavaSglang", 7 | "qwen_vl": "Qwen_VL", 8 | "fuyu": "Fuyu", 9 | "gpt4v": "GPT4V", 10 | "instructblip": "InstructBLIP", 11 | "minicpm_v": "MiniCPM_V", 12 | "idefics2": "Idefics2", 13 | "qwen_vl_api": "Qwen_VL_API", 14 | "phi3v": "Phi3v", 15 | } 16 | 17 | for model_name, model_class in AVAILABLE_MODELS.items(): 18 | try: 19 | exec(f"from .{model_name} import {model_class}") 20 | except ImportError: 21 | pass 22 | 23 | 24 | import hf_transfer 25 | 26 | os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" 27 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/models/model_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/lmms-eval/lmms_eval/models/model_utils/__init__.py -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/_task_utils/file_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def generate_submission_file(file_name, args, subpath="submissions"): 5 | path = os.path.join(args.output_path, subpath) 6 | os.makedirs(path, exist_ok=True) 7 | path = os.path.join(path, file_name) 8 | return os.path.abspath(path) 9 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/_task_utils/gpt_eval_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/lmms-eval/lmms_eval/tasks/_task_utils/gpt_eval_utils.py -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ai2d/ai2d.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ai2d 2 | task: "ai2d" 3 | dataset_kwargs: 4 | token: True 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.ai2d_doc_to_visual 8 | doc_to_text: !function utils.ai2d_doc_to_text 9 | doc_to_target: !function utils.ai2d_doc_to_target 10 | generation_kwargs: 11 | max_new_tokens: 16 12 | temperature: 0 13 | do_sample: False 14 | metric_list: 15 | - metric: exact_match 16 | 
aggregation: mean 17 | higher_is_better: true 18 | ignore_case: true 19 | ignore_punctuation: true 20 | metadata: 21 | - version: 0.0 22 | 23 | model_specific_prompt_kwargs: 24 | default: 25 | prompt_format: mcq 26 | pre_prompt: "" 27 | post_prompt: "\nAnswer with the option's letter from the given choices directly." 28 | # qwen formulate ai2d as question answering instead of mcq 29 | qwen_vl: 30 | prompt_format: qa 31 | pre_prompt: "" 32 | post_prompt: " Answer:" 33 | 34 | model_specific_target_kwargs: 35 | default: "mcq" 36 | qwen_vl: "qa" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ai2d/utils.py: -------------------------------------------------------------------------------- 1 | def ai2d_doc_to_text(doc, model_specific_prompt_kwargs=None): 2 | question, choices = doc["question"], doc["options"] 3 | len_choices = len(choices) 4 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 5 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 6 | if model_specific_prompt_kwargs["prompt_format"] == "mcq": 7 | options = [chr(ord("A") + i) for i in range(len_choices)] 8 | choices_str = "\n".join([f"{option}. {choice}" for option, choice in zip(options, choices)]) 9 | return f"{pre_prompt}{question}\n{choices_str}{post_prompt}" 10 | elif model_specific_prompt_kwargs["prompt_format"] == "qa": 11 | options = "\n".join(choices) 12 | return f"{pre_prompt}{question}{options}{post_prompt}" 13 | else: 14 | raise ValueError(f"Unknown prompt format: {model_specific_prompt_kwargs['prompt_format']}") 15 | 16 | 17 | def ai2d_doc_to_visual(doc): 18 | return [doc["image"].convert("RGB")] 19 | 20 | 21 | def ai2d_doc_to_target(doc, model_specific_target_kwargs): 22 | if model_specific_target_kwargs == "mcq": 23 | len_choices = len(doc["options"]) 24 | options = [chr(ord("A") + i) for i in range(len_choices)] 25 | return options[int(doc["answer"])] 26 | elif model_specific_target_kwargs == "qa": 27 | return doc["options"][int(doc["answer"])] 28 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/chartqa/chartqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ChartQA 2 | dataset_kwargs: 3 | token: True 4 | task: "chartqa" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.chartqa_doc_to_visual 8 | doc_to_text: !function utils.chartqa_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | max_new_tokens: 16 12 | temperature: 0 13 | do_sample: False 14 | process_results: !function utils.chartqa_process_results 15 | metric_list: 16 | - metric: relaxed_overall 17 | aggregation: mean 18 | higher_is_better: true 19 | - metric: relaxed_human_split 20 | aggregation: mean 21 | higher_is_better: true 22 | - metric: relaxed_augmented_split 23 | aggregation: mean 24 | higher_is_better: true 25 | metadata: 26 | - version: 0.0 27 | model_specific_prompt_kwargs: 28 | default: 29 | pre_prompt: "" 30 | post_prompt: "\nAnswer the question with a single word." 
31 | qwen_vl: 32 | pre_prompt: "" 33 | post_prompt: " Answer:" 34 | 35 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/cmmmu/_cmmmu.yaml: -------------------------------------------------------------------------------- 1 | group: cmmmu 2 | task: 3 | - cmmmu_val 4 | - cmmmu_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/cmmmu/_default_template_cmmmu_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/CMMMU 2 | output_type: generate_until 3 | doc_to_visual: !function utils.cmmmu_doc_to_visual 4 | doc_to_text: !function utils.cmmmu_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | max_new_tokens: 16 8 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/cmmmu/cmmmu_test.yaml: -------------------------------------------------------------------------------- 1 | task: "cmmmu_test" 2 | test_split: test 3 | # The return value of process_results will be used by metrics 4 | process_results: !function utils.cmmmu_process_test_results_for_submission 5 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 6 | metric_list: 7 | - metric: submission 8 | aggregation: !function utils.cmmmu_test_aggregate_results_for_submission 9 | higher_is_better: false 10 | metadata: 11 | - version: 0.0 12 | include: _default_template_cmmmu_yaml 13 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/cmmmu/cmmmu_val.yaml: -------------------------------------------------------------------------------- 1 | task: "cmmmu_val" 2 | test_split: val 3 | # The return value of process_results will be used by metrics 4 | process_results: !function utils.cmmmu_process_results 5 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 6 | generation_kwargs: 7 | max_new_tokens: 16 8 | image_aspect_ratio: original 9 | metric_list: 10 | - metric: cmmmu_acc 11 | aggregation: !function utils.cmmmu_aggregate_results 12 | higher_is_better: true 13 | metadata: 14 | - version: 0.0 15 | include: _default_template_cmmmu_yaml 16 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2014_cap.yaml: -------------------------------------------------------------------------------- 1 | group : coco2014_cap 2 | task: 3 | - coco2014_cap_val 4 | - coco2014_cap_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2014_cap_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/COCO-Caption 2 | dataset_kwargs: 3 | token: True 4 | task : "coco2014_cap_test" 5 | group : "coco_caption" 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.coco_doc_to_visual 9 | doc_to_text: "Provide a one-sentence caption for the provided image." 
10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 128 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.coco_test_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: coco_passthrough 21 | aggregation : !function utils.coco_test_aggregation_result 22 | higher_is_better : true 23 | metadata: 24 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2017_cap.yaml: -------------------------------------------------------------------------------- 1 | group : coco2017_cap 2 | task: 3 | - coco2017_cap_val 4 | - coco2017_cap_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2017_cap_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/COCO-Caption2017 2 | dataset_kwargs: 3 | token: True 4 | task : "coco2017_cap_test" 5 | group : "coco_caption2017" 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.coco_doc_to_visual 9 | doc_to_text: !function utils.coco_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 128 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.coco_test_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: coco_passthrough 21 | aggregation : !function utils.coco_test_aggregation_result 22 | higher_is_better : true 23 | metadata: 24 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2017_cap_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/COCO-Caption2017 2 | dataset_kwargs: 3 | token: True 4 | task: "coco2017_cap_val" 5 | group : "coco_caption2017" 6 | test_split: val 7 | output_type: generate_until 8 | doc_to_visual: !function utils.coco_doc_to_visual 9 | doc_to_text: !function utils.coco_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 64 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.coco_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: coco_Bleu_4 21 | aggregation : !function utils.coco_bleu4 22 | higher_is_better : true 23 | - metric: coco_Bleu_3 24 | aggregation : !function utils.coco_bleu3 25 | higher_is_better : true 26 | - metric: coco_Bleu_2 27 | aggregation : !function utils.coco_bleu2 28 | higher_is_better : true 29 | - metric: coco_Bleu_1 30 | aggregation : !function utils.coco_bleu1 31 | higher_is_better : true 32 | - metric: coco_METEOR 33 | aggregation : !function utils.coco_meteor 34 | higher_is_better : true 35 | - metric: coco_ROUGE_L 36 | aggregation : !function utils.coco_rougel 37 | higher_is_better : true 38 | - metric: coco_CIDEr 39 | aggregation : !function utils.coco_cider 40 | higher_is_better : true 41 | #- metric: coco_SPICE 42 | # aggregation : !function 
utils.coco_spice 43 | # higher_is_better : true 44 | metadata: 45 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco_cap.yaml: -------------------------------------------------------------------------------- 1 | group : coco_cap 2 | task: 3 | - coco2014_cap_val 4 | - coco2014_cap_test 5 | - coco2017_cap_val 6 | - coco2017_cap_test 7 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/docvqa/_default_template_docvqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/DocVQA 2 | dataset_name: DocVQA 3 | dataset_kwargs: 4 | token: True 5 | output_type: generate_until 6 | doc_to_visual: !function utils.docvqa_doc_to_visual 7 | doc_to_text: !function utils.docvqa_doc_to_text 8 | doc_to_target: "answers" 9 | generation_kwargs: 10 | max_new_tokens: 32 11 | temperature: 0 12 | do_sample: False 13 | model_specific_prompt_kwargs: 14 | default: 15 | pre_prompt: "" 16 | post_prompt: "\nAnswer the question using a single word or phrase." 17 | qwen_vl: 18 | pre_prompt: "" 19 | post_prompt: " Answer:" 20 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/docvqa/docvqa.yaml: -------------------------------------------------------------------------------- 1 | group: docvqa 2 | task: 3 | - docvqa_val 4 | - docvqa_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/docvqa/docvqa_test.yaml: -------------------------------------------------------------------------------- 1 | task: "docvqa_test" 2 | test_split: test 3 | process_results: !function utils.docvqa_test_process_results 4 | metric_list: 5 | - metric: submission 6 | aggregation: !function utils.docvqa_test_aggregate_results 7 | higher_is_better: true 8 | include: _default_template_docvqa_yaml 9 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/docvqa/docvqa_val.yaml: -------------------------------------------------------------------------------- 1 | task: "docvqa_val" 2 | test_split: validation 3 | metric_list: 4 | - metric: anls 5 | aggregation: mean 6 | higher_is_better: true 7 | include: _default_template_docvqa_yaml 8 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/docvqa/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import logging 4 | 5 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file 6 | 7 | logger = logging.getLogger("lmms-eval") 8 | 9 | 10 | def docvqa_doc_to_visual(doc): 11 | return [doc["image"].convert("RGB")] 12 | 13 | 14 | def docvqa_doc_to_text(doc, model_specific_prompt_kwargs): 15 | question = doc["question"] 16 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 17 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 18 | return f"{pre_prompt}{question}{post_prompt}" 19 | 20 | 21 | def docvqa_test_process_results(doc, results): 22 | pred = results[0] 23 | questionId = doc["questionId"] 24 | return {"anls": {"questionId": int(questionId), "answer": pred}, "submission": {"questionId": int(questionId), "answer": pred}} 25 | 26 | 27 | def docvqa_test_aggregate_results(results, args): 28 | # save results as json 29 | path = 
generate_submission_file("docvqa_test_for_submission.json", args) 30 | with open(path, "w") as f: 31 | json.dump(results, f) 32 | logger.info(f"Results saved to {path}") 33 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ferret/ferret.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/Ferret-Bench 2 | dataset_kwargs: 3 | token: True 4 | task: "ferret" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.ferret_doc_to_visual 8 | doc_to_text: !function utils.ferret_doc_to_text 9 | doc_to_target: "gpt_answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | image_aspect_ratio: original 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.ferret_process_results 20 | metric_list: 21 | - metric: gpt_eval_ferret_all 22 | aggregation: !function utils.ferret_all_aggregation 23 | higher_is_better: true 24 | - metric: gpt_eval_ferret_refer_desc 25 | aggregation: !function utils.ferret_refer_desc_aggregation 26 | higher_is_better: true 27 | - metric: gpt_eval_ferret_refer_reason 28 | aggregation: !function utils.ferret_refer_reason_aggregation 29 | higher_is_better: true 30 | - metric: gpt_eval_ferret_ground_conv 31 | aggregation: !function utils.ferret_ground_conv_aggregation 32 | higher_is_better: true 33 | metadata: 34 | version: 0.0 35 | gpt_eval_model_name: "gpt-4-0314" 36 | model_specific_prompt_kwargs: 37 | default: 38 | pre_prompt: "" 39 | post_prompt: "" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/flickr30k/flickr30k.yaml: -------------------------------------------------------------------------------- 1 | group: flickr30k 2 | task: 3 | - flickr30k_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/gqa/gqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/GQA 2 | dataset_name: testdev_balanced_instructions 3 | dataset_kwargs: 4 | token: True 5 | task: "gqa" 6 | test_split: testdev 7 | output_type: generate_until 8 | doc_to_visual: !function utils.gqa_doc_to_visual 9 | doc_to_text: !function utils.gqa_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | metric_list: 18 | - metric: exact_match 19 | aggregation: mean 20 | higher_is_better: true 21 | ignore_case: true 22 | ignore_punctuation: true 23 | metadata: 24 | - version: 0.0 25 | 26 | model_specific_prompt_kwargs: 27 | default: 28 | pre_prompt: "" 29 | post_prompt: "\nAnswer the question using a single word or phrase." 
30 | qwen_vl: 31 | pre_prompt: "" 32 | post_prompt: " Answer:" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/gqa/utils.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | GQA_RAW_IMAGE_DATASET = None 4 | GQA_ID2IMAGE = None 5 | 6 | 7 | def gqa_doc_to_visual(doc): 8 | global GQA_RAW_IMAGE_DATASET 9 | global GQA_ID2IMAGE 10 | if GQA_RAW_IMAGE_DATASET is None: 11 | GQA_RAW_IMAGE_DATASET = load_dataset("lmms-lab/GQA", "testdev_balanced_images", split="testdev", token=True) 12 | GQA_ID2IMAGE = {} 13 | for row in GQA_RAW_IMAGE_DATASET: 14 | GQA_ID2IMAGE[row["id"]] = row["image"].convert("RGB") 15 | image = GQA_ID2IMAGE[doc["imageId"]] 16 | return [image] 17 | 18 | 19 | def gqa_doc_to_text(doc, model_specific_prompt_kwargs): 20 | question = doc["question"] 21 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 22 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 23 | return f"{pre_prompt}{question}{post_prompt}" 24 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/hallusion_bench/hallusion_bench_image.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/HallusionBench 2 | dataset_kwargs: 3 | token: True 4 | task: "hallusion_bench_image" 5 | test_split: image 6 | output_type: generate_until 7 | doc_to_visual: !function evaluate_hb.hb_doc_to_visual 8 | doc_to_text: !function evaluate_hb.hb_doc_to_text 9 | doc_to_target: "gt_answer_details" 10 | process_results: !function evaluate_hb.hb_process_results 11 | model_specific_prompt_kwargs: 12 | default: 13 | pre_prompt: "" 14 | post_prompt: "" 15 | generation_kwargs: 16 | max_new_tokens: 128 17 | temperature: 0 18 | top_p: 0 19 | num_beams: 1 20 | do_sample: false 21 | metric_list: 22 | - metric: aAcc 23 | aggregation: !function evaluate_hb.hb_aggregation_result_aAcc 24 | higher_is_better: true 25 | - metric: qAcc 26 | aggregation: !function evaluate_hb.hb_aggregation_result_qAcc 27 | higher_is_better: true 28 | - metric: fAcc 29 | aggregation: !function evaluate_hb.hb_aggregation_result_fAcc 30 | higher_is_better: true 31 | # - metric: aAcc 32 | # aggregation: !function evaluate_hb.hb_aggregation_result_aAcc_intern 33 | # higher_is_better: true 34 | # - metric: qAcc 35 | # aggregation: !function evaluate_hb.hb_aggregation_result_qAcc_intern 36 | # higher_is_better: true 37 | # - metric: fAcc 38 | # aggregation: !function evaluate_hb.hb_aggregation_result_fAcc_intern 39 | # higher_is_better: true 40 | metadata: 41 | - version: 0.0 42 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/iconqa/_default_template_docvqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ICON-QA 2 | dataset_kwargs: 3 | token: True 4 | output_type: generate_until 5 | doc_to_visual: !function utils.doc_to_visual 6 | doc_to_text: !function utils.doc_to_text 7 | doc_to_target: "answers" 8 | # process_results: !function utils.test_process_results 9 | generation_kwargs: 10 | max_new_tokens: 32 11 | temperature: 0 12 | do_sample: False 13 | model_specific_prompt_kwargs: 14 | default: 15 | pre_prompt: "" 16 | statement: "Given a set of images and a question, please provide the answer to the question.\n" 17 | options_statement: "Question: {question}.\nOptions:\n{options}\nPlease answer with 
the option letter from the given choices directly." 18 | freeform_statement: "Question: {question}.\nPlease answer the question using a single word or phrase." 19 | metric_list: 20 | - metric: anls 21 | aggregation: mean 22 | higher_is_better: true -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/iconqa/iconqa.yaml: -------------------------------------------------------------------------------- 1 | group: iconqa 2 | task: 3 | - iconqa_val 4 | - iconqa_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/iconqa/iconqa_test.yaml: -------------------------------------------------------------------------------- 1 | task: "iconqa_test" 2 | test_split: test 3 | include: _default_template_docvqa_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/iconqa/iconqa_val.yaml: -------------------------------------------------------------------------------- 1 | task: "iconqa_val" 2 | test_split: val 3 | include: _default_template_docvqa_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/infovqa/_default_template_infovqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/DocVQA 2 | dataset_name: InfographicVQA 3 | dataset_kwargs: 4 | token: True 5 | doc_to_target: "answers" 6 | doc_to_visual: !function utils.infovqa_doc_to_visual 7 | doc_to_text: !function utils.infovqa_doc_to_text 8 | generation_kwargs: 9 | max_new_tokens: 32 10 | temperature: 0 11 | do_sample: False 12 | model_specific_prompt_kwargs: 13 | default: 14 | pre_prompt: "" 15 | post_prompt: "\nAnswer the question using a single word or phrase." 
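The DocVQA, InfoVQA, IconQA, and MP-DocVQA configs in this tree all score their validation splits with the `anls` metric (Average Normalized Levenshtein Similarity). As a reference for what that number means, here is a minimal sketch of the standard ANLS computation with the usual 0.5 threshold, using the `Levenshtein` package pinned in `llava_repr_requirements.txt`; it illustrates the metric itself, not the harness's implementation.

```python
# Minimal ANLS sketch (not the lmms-eval implementation): score a prediction
# against every reference answer with 1 - normalized edit distance, zero out
# scores below the usual 0.5 threshold, keep the best reference, then average
# over examples.
import Levenshtein  # pinned as python_Levenshtein / Levenshtein in the requirements


def anls_score(prediction: str, references: list[str], threshold: float = 0.5) -> float:
    best = 0.0
    for ref in references:
        pred, gold = prediction.strip().lower(), ref.strip().lower()
        if len(pred) == 0 and len(gold) == 0:
            similarity = 1.0
        else:
            dist = Levenshtein.distance(pred, gold)
            similarity = 1.0 - dist / max(len(pred), len(gold))
        best = max(best, similarity if similarity >= threshold else 0.0)
    return best


# Example: mean ANLS over a tiny batch of (prediction, references) pairs.
batch = [("the eiffel tower", ["Eiffel Tower"]), ("42", ["forty two", "42"])]
print(sum(anls_score(p, r) for p, r in batch) / len(batch))
```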
-------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/infovqa/infovqa.yaml: -------------------------------------------------------------------------------- 1 | group: infovqa 2 | task: 3 | - infovqa_val 4 | - infovqa_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/infovqa/infovqa_test.yaml: -------------------------------------------------------------------------------- 1 | task: "infovqa_test" 2 | test_split: test 3 | output_type: generate_until 4 | process_results: !function utils.infovqa_test_process_results 5 | metric_list: 6 | - metric: submission 7 | aggregation: !function utils.infovqa_test_aggregate_results 8 | higher_is_better: true 9 | include: _default_template_infovqa_yaml 10 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/infovqa/infovqa_val.yaml: -------------------------------------------------------------------------------- 1 | task: "infovqa_val" 2 | test_split: validation 3 | output_type: generate_until 4 | metric_list: 5 | - metric: anls 6 | aggregation: mean 7 | higher_is_better: true 8 | include: _default_template_infovqa_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/infovqa/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import logging 4 | 5 | 6 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file 7 | 8 | lmms_logger = logging.getLogger("lmms-eval") 9 | 10 | 11 | def infovqa_doc_to_visual(doc): 12 | return [doc["image"].convert("RGB")] 13 | 14 | 15 | def infovqa_doc_to_text(doc, model_specific_prompt_kwargs): 16 | question = doc["question"] 17 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 18 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 19 | return f"{pre_prompt}{question}{post_prompt}" 20 | 21 | 22 | def infovqa_test_process_results(doc, results): 23 | pred = results[0] 24 | questionId = doc["questionId"] 25 | return {"submission": {"questionId": int(questionId), "answer": pred}} 26 | 27 | 28 | def infovqa_test_aggregate_results(results, args): 29 | # save results as json 30 | file = generate_submission_file("infovqa_test_for_submission.json", args) 31 | with open(file, "w") as f: 32 | json.dump(results, f) 33 | lmms_logger.info(f"Results saved to {file}") 34 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/llava-bench-coco/llava-bench-coco.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/llava-bench-coco 2 | dataset_kwargs: 3 | token: True 4 | task: "llava_bench_coco" 5 | test_split: train 6 | output_type: generate_until 7 | doc_to_visual: !function utils.llava_doc_to_visual 8 | doc_to_text: !function utils.llava_doc_to_text 9 | doc_to_target: "gpt_answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | image_aspect_ratio: original 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | process_results: !function utils.llava_process_results 19 | metric_list: 20 | - metric: gpt_eval_llava_all 21 | aggregation: !function utils.llava_all_aggregation 22 | higher_is_better: true 23 | - metric: gpt_eval_llava_conv 24 | aggregation: !function utils.llava_conv_aggregation 25 | higher_is_better: true 26 | - metric: gpt_eval_llava_detail 
27 | aggregation: !function utils.llava_detail_aggregation 28 | higher_is_better: true 29 | - metric: gpt_eval_llava_complex 30 | aggregation: !function utils.llava_complex_aggregation 31 | higher_is_better: true 32 | metadata: 33 | version: 0.0 34 | gpt_eval_model_name: "gpt-4-0314" 35 | model_specific_prompt_kwargs: 36 | default: 37 | pre_prompt: "" 38 | post_prompt: "" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/llava-in-the-wild/llava-in-the-wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/llava-bench-in-the-wild 2 | dataset_kwargs: 3 | token: True 4 | task: "llava_in_the_wild" 5 | test_split: train 6 | output_type: generate_until 7 | doc_to_visual: !function utils.llava_doc_to_visual 8 | doc_to_text: !function utils.llava_doc_to_text 9 | doc_to_target: "gpt_answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | image_aspect_ratio: original 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.llava_process_results 20 | metric_list: 21 | - metric: gpt_eval_llava_all 22 | aggregation: !function utils.llava_all_aggregation 23 | higher_is_better: true 24 | - metric: gpt_eval_llava_conv 25 | aggregation: !function utils.llava_conv_aggregation 26 | higher_is_better: true 27 | - metric: gpt_eval_llava_detail 28 | aggregation: !function utils.llava_detail_aggregation 29 | higher_is_better: true 30 | - metric: gpt_eval_llava_complex 31 | aggregation: !function utils.llava_complex_aggregation 32 | higher_is_better: true 33 | metadata: 34 | version: 0.0 35 | gpt_eval_model_name: "gpt-4-0613" 36 | model_specific_prompt_kwargs: 37 | default: 38 | pre_prompt: "" 39 | post_prompt: "" 40 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse.yaml: -------------------------------------------------------------------------------- 1 | group: mathverse 2 | task: 3 | - mathverse_testmini 4 | - mathverse_testmini_text_only 5 | - mathverse_testmini_text_lite 6 | - mathverse_testmini_text_dominant 7 | - mathverse_testmini_vision_intensive 8 | - mathverse_testmini_vision_dominant 9 | - mathverse_testmini_vision_only 10 | metadata: 11 | version: 0.0 12 | gpt_eval_model_name: "gpt-3.5-turbo" 13 | trunk_response: 30 14 | quick_match: false -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini" 6 | test_split: testmini 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 
29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_text_dominant.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_version_split 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_text_dominant" 6 | test_split: text_dominant 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_text_lite.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_version_split 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_text_lite" 6 | test_split: text_lite 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_text_only.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_text_only 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_text_only" 6 | test_split: text_only 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - 
"ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_vision_dominant.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_version_split 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_vision_dominant" 6 | test_split: vision_dominant 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_vision_intensive.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_version_split 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_vision_intensive" 6 | test_split: vision_intensive 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- 
/lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_vision_only.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_version_split 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_vision_only" 6 | test_split: vision_only 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathvista/mathvista.yaml: -------------------------------------------------------------------------------- 1 | group: mathvista 2 | task: 3 | - mathvista_testmini 4 | - mathvista_test 5 | metadata: 6 | version: 0.0 7 | gpt_eval_model_name: "gpt-4-0613" 8 | quick_extract: false -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathvista/mathvista_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: AI4Math/MathVista 2 | dataset_kwargs: 3 | token: True 4 | task: "mathvista_test" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.mathvista_doc_to_visual 8 | doc_to_text: !function utils.mathvista_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 1024 14 | temperature: 0 15 | top_p: 0 16 | num_beams: 1 17 | do_sample: false 18 | process_results: !function utils.mathvista_process_results 19 | metric_list: 20 | - metric: submission 21 | aggregation: !function utils.mathvista_aggregate_results 22 | higher_is_better: true 23 | 24 | model_specific_prompt_kwargs: 25 | default: 26 | shot_type: "format-prompt" # can be "reason-first", "solution", "step-by-step" 27 | model_specific_generation_kwargs: 28 | llava: 29 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathvista/mathvista_testmini.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: AI4Math/MathVista 2 | dataset_kwargs: 3 | token: True 4 | task: "mathvista_testmini" 5 | test_split: testmini 6 | output_type: generate_until 7 | doc_to_visual: !function utils.mathvista_doc_to_visual 8 | doc_to_text: !function utils.mathvista_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 1024 14 | temperature: 0 15 | top_p: 0 16 | num_beams: 1 17 | do_sample: false 18 | process_results: !function utils.mathvista_process_results 19 | metric_list: 20 | - metric: gpt_eval_score 21 | 
aggregation: !function utils.mathvista_aggregate_results 22 | higher_is_better: true 23 | 24 | model_specific_prompt_kwargs: 25 | default: 26 | shot_type: "format-prompt" # can be "reason-first", "solution", "step-by-step" 27 | phi3v: 28 | shot_type: "solution" 29 | model_specific_generation_kwargs: 30 | llava: 31 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/_default_template_mmbench_cn_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMBench 2 | dataset_kwargs: 3 | token: True 4 | doc_to_target: "answer" 5 | dataset_name: "cn" 6 | output_type: generate_until 7 | doc_to_visual: !function cn_utils.mmbench_doc_to_visual 8 | doc_to_text: !function cn_utils.mmbench_doc_to_text 9 | generation_kwargs: 10 | max_new_tokens: 256 11 | temperature: 0 12 | top_p: 0 13 | num_beams: 1 14 | do_sample: false 15 | process_results: !function cn_utils.mmbench_process_results 16 | model_specific_prompt_kwargs: 17 | default: 18 | pre_prompt: "" 19 | post_prompt: "\n请直接使用所提供的选项字母作为答案回答。" 20 | model_specific_generation_kwargs: 21 | llava: 22 | image_aspect_ratio: original 23 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/_default_template_mmbench_en_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMBench 2 | dataset_kwargs: 3 | token: True 4 | doc_to_target: "answer" 5 | model_specific_prompt_kwargs: 6 | default: 7 | pre_prompt: "" 8 | post_prompt: "\nAnswer with the option's letter from the given choices directly." 9 | doc_to_visual: !function en_utils.mmbench_doc_to_visual 10 | doc_to_text: !function en_utils.mmbench_doc_to_text 11 | doc_to_target: "answer" 12 | process_results: !function en_utils.mmbench_process_results 13 | model_specific_generation_kwargs: 14 | llava: 15 | image_aspect_ratio: original 16 | output_type: generate_until 17 | dataset_name: "en" 18 | generation_kwargs: 19 | until: 20 | - "ASSISTANT:" 21 | max_new_tokens: 1024 22 | temperature: 0 23 | top_p: 0 24 | num_beams: 1 25 | do_sample: false 26 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench.yaml: -------------------------------------------------------------------------------- 1 | group: mmbench 2 | task: 3 | - mmbench_en_dev 4 | - mmbench_en_test 5 | - mmbench_cn_dev 6 | - mmbench_cn_test 7 | - mmbench_cn_cc 8 | metadata: 9 | version: 0.0 10 | sys_prompt: "There are several options:" 11 | gpt_eval_model_name: "gpt-3.5-turbo-0613" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_cc.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMBench 2 | dataset_name: cc 3 | dataset_kwargs: 4 | token: True 5 | task: "mmbench_cn_cc" 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function cc_utils.mmbench_doc_to_visual 9 | doc_to_text: !function cc_utils.mmbench_cn_cc_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 256 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function cc_utils.mmbench_cn_cc_process_results 18 | metric_list: 19 | - metric: gpt_eval_score 20 | aggregation: !function 
cc_utils.mmbench_cn_cc_aggregate_dev_results_eval 21 | higher_is_better: true 22 | - metric: submission 23 | aggregation: !function cc_utils.mmbench_cn_cc_aggregate_results 24 | metadata: 25 | version: 0.0 26 | gpt_eval_model_name: "gpt-3.5-turbo-0613" 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | pre_prompt: "" 31 | post_prompt: "\n请直接使用所提供的选项字母作为答案回答。" 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_cn.yaml: -------------------------------------------------------------------------------- 1 | group: mmbench_cn 2 | task: 3 | - mmbench_cn_dev 4 | - mmbench_cn_test 5 | - mmbench_cn_cc 6 | metadata: 7 | version: 0.0 8 | gpt_eval_model_name: "gpt-3.5-turbo-0613" 9 | sys_prompt: "有如下几个选项:" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml: -------------------------------------------------------------------------------- 1 | task: "mmbench_cn_dev" 2 | test_split: "dev" 3 | metric_list: 4 | - metric: gpt_eval_score 5 | aggregation: !function cn_utils.mmbench_aggregate_dev_results_eval 6 | higher_is_better: true 7 | - metric: submission 8 | higher_is_better: true 9 | aggregation: !function cn_utils.mmbench_aggregate_dev_results 10 | include: _default_template_mmbench_cn_yaml 11 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_cn_test.yaml: -------------------------------------------------------------------------------- 1 | task: mmbench_cn_test 2 | test_split: test 3 | metric_list: 4 | - metric: submission 5 | aggregation: !function cn_utils.mmbench_aggregate_test_results 6 | higher_is_better: true 7 | include: _default_template_mmbench_cn_yaml 8 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_en.yaml: -------------------------------------------------------------------------------- 1 | group: mmbench_en 2 | task: 3 | - mmbench_en_dev 4 | - mmbench_en_test 5 | metadata: 6 | version: 0.0 7 | sys_prompt: "There are several options:" 8 | gpt_eval_model_name: "gpt-3.5-turbo-0613" 9 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_en_dev.yaml: -------------------------------------------------------------------------------- 1 | task: "mmbench_en_dev" 2 | test_split: dev 3 | include: _default_template_mmbench_en_yaml 4 | metric_list: 5 | - metric: gpt_eval_score 6 | aggregation: !function en_utils.mmbench_aggregate_dev_results_eval 7 | higher_is_better: true 8 | - metric: submission 9 | aggregation: !function en_utils.mmbench_aggregate_dev_results_submission 10 | higher_is_better: true -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_en_test.yaml: -------------------------------------------------------------------------------- 1 | task: "mmbench_en_test" 2 | test_split: test 3 | include: _default_template_mmbench_en_yaml 4 | metric_list: 5 | - metric: submission 6 | aggregation: !function en_utils.mmbench_aggregate_test_results 7 | higher_is_better: true 8 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mme/mme.yaml: 
-------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MME 2 | dataset_kwargs: 3 | token: True 4 | task: "mme" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.mme_doc_to_visual 8 | doc_to_text: !function utils.mme_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | max_new_tokens: 16 12 | temperature: 0 13 | top_p: 0 14 | num_beams: 1 15 | do_sample: false 16 | # The return value of process_results will be used by metrics 17 | process_results: !function utils.mme_process_results 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: mme_percetion_score 21 | aggregation: !function utils.mme_aggregate_results 22 | higher_is_better: true 23 | - metric: mme_cognition_score 24 | aggregation: !function utils.mme_aggregate_results 25 | higher_is_better: true 26 | model_specific_prompt_kwargs: 27 | default: 28 | pre_prompt: "" 29 | post_prompt: "\nAnswer the question using a single word or phrase." 30 | qwen_vl: 31 | pre_prompt: "" 32 | post_prompt: " Answer:" 33 | otterhd: 34 | pre_prompt: "" 35 | post_prompt: " Answer:" 36 | metadata: 37 | - version: 0.0 38 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmmu/mmmu.yaml: -------------------------------------------------------------------------------- 1 | group: mmmu 2 | task: 3 | - mmmu_val 4 | - mmmu_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmmu/mmmu_group_img.yaml: -------------------------------------------------------------------------------- 1 | group: mmmu_group_img 2 | task: 3 | - mmmu_val_group_img 4 | - mmmu_test_group_img 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmmu/mmmu_group_img_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMMU 2 | task: "mmmu_test_group_img" 3 | test_split: test 4 | output_type: generate_until 5 | doc_to_visual: !function utils_group_img.mmmu_doc_to_visual 6 | doc_to_text: !function utils_group_img.mmmu_doc_to_text 7 | doc_to_target: "answer" 8 | # The return value of process_results will be used by metrics 9 | process_results: !function utils_group_img.mmmu_process_results 10 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | image_aspect_ratio: original 14 | metric_list: 15 | - metric: submission 16 | aggregation: !function utils_group_img.mmmu_test_aggregate_results_for_submission 17 | higher_is_better: true 18 | metadata: 19 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmmu/mmmu_group_img_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMMU 2 | task: "mmmu_val_group_img" 3 | test_split: validation 4 | output_type: generate_until 5 | doc_to_visual: !function utils_group_img.mmmu_doc_to_visual 6 | doc_to_text: !function utils_group_img.mmmu_doc_to_text 7 | doc_to_target: "answer" 8 | # The return value of process_results will be used by metrics 9 | process_results: !function utils_group_img.mmmu_process_results 10 | # Note 
that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | model_specific_generation_kwargs: 14 | llava: 15 | image_aspect_ratio: original 16 | metric_list: 17 | - metric: mmmu_acc 18 | aggregation: !function utils_group_img.mmmu_aggregate_results 19 | higher_is_better: true 20 | metadata: 21 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmmu/mmmu_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMMU 2 | task: "mmmu_test" 3 | test_split: test 4 | output_type: generate_until 5 | doc_to_visual: !function utils.mmmu_doc_to_visual 6 | doc_to_text: !function utils.mmmu_doc_to_text 7 | doc_to_target: "answer" 8 | # The return value of process_results will be used by metrics 9 | process_results: !function utils.mmmu_process_results 10 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | image_aspect_ratio: original 14 | metric_list: 15 | - metric: submission 16 | aggregation: !function utils.mmmu_test_aggregate_results_for_submission 17 | higher_is_better: true 18 | metadata: 19 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmmu/mmmu_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMMU 2 | task: "mmmu_val" 3 | test_split: validation 4 | output_type: generate_until 5 | doc_to_visual: !function utils.mmmu_doc_to_visual 6 | doc_to_text: !function utils.mmmu_doc_to_text 7 | doc_to_target: "answer" 8 | # The return value of process_results will be used by metrics 9 | process_results: !function utils.mmmu_process_results 10 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | model_specific_generation_kwargs: 14 | llava: 15 | image_aspect_ratio: original 16 | metric_list: 17 | - metric: mmmu_acc 18 | aggregation: !function utils.mmmu_aggregate_results 19 | higher_is_better: true 20 | metadata: 21 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmvet/mmvet.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMVet 2 | dataset_kwargs: 3 | token: True 4 | task: "mmvet" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.mmvet_doc_to_visual 8 | doc_to_text: !function utils.doc_to_text # Such that {{question}} will be replaced by doc["question"] 9 | doc_to_target: "{{answer}}" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 1024 14 | temperature: 0 15 | top_p: 0 16 | num_beams: 1 17 | do_sample: false 18 | process_results: !function utils.mmvet_process_results # apply gpt eval here 19 | metric_list: 20 | - metric: gpt_eval_score 21 | aggregation: !function utils.mmvet_aggregate_results 22 | higher_is_better: true 23 | metadata: 24 | version: 0.0 25 | gpt_eval_model_name: "gpt-4" 26 | model_specific_prompt_kwargs: 27 | default: 28 | pre_prompt: "" 29 | post_prompt: "" 30 |
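The `!function utils.<name>` hooks referenced in the task YAMLs above resolve to Python callables in each task's `utils.py`. As a rough sketch of the contract these configs rely on (function names and result fields here are hypothetical, not the actual MME/MMMU/MM-Vet implementations): `process_results` returns a dict keyed by the metric names listed under `metric_list`, and the paired aggregation function reduces those per-document entries to a single reported score.

```python
# Illustrative sketch of the process_results / aggregation contract assumed by
# the task YAMLs above. Names and fields are hypothetical; the real hooks live
# in each task's utils.py.
def example_process_results(doc, results):
    # `results` holds the model's generations for this document; return a dict
    # keyed by the metric name(s) declared under metric_list in the YAML.
    prediction = results[0].strip() if results else ""
    return {"example_score": {"question_id": doc.get("question_id"), "prediction": prediction}}


def example_aggregate_results(per_doc_entries):
    # Receives the list of per-document values produced above and reduces them
    # to the scalar reported for the metric (here: fraction of non-empty answers).
    if not per_doc_entries:
        return 0.0
    return sum(1 for e in per_doc_entries if e["prediction"]) / len(per_doc_entries)
```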
-------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multidocvqa/multidocvqa.yaml: -------------------------------------------------------------------------------- 1 | group: multidocvqa 2 | task: 3 | - multidocvqa_val 4 | - multidocvqa_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multidocvqa/multidocvqa_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MP-DocVQA 2 | task: "multidocvqa_test" 3 | test_split: test 4 | output_type: generate_until 5 | doc_to_visual: !function utils.multidocvqa_doc_to_visual 6 | doc_to_text: !function utils.multidocvqa_doc_to_text 7 | doc_to_target: "answers" 8 | generation_kwargs: 9 | max_new_tokens: 32 10 | temperature: 0 11 | do_sample: False 12 | process_results: !function utils.multidocvqa_process_test_results_for_submission 13 | metric_list: 14 | - metric: submission 15 | aggregation: !function utils.multidocvqa_test_aggregate_results_for_submission 16 | model_specific_prompt_kwargs: 17 | default: 18 | pre_prompt: "" 19 | post_prompt: "\nAnswer the question using a single word or phrase." 20 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multidocvqa/multidocvqa_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MP-DocVQA 2 | task: "multidocvqa_val" 3 | test_split: val 4 | output_type: generate_until 5 | doc_to_visual: !function utils.multidocvqa_doc_to_visual 6 | doc_to_text: !function utils.multidocvqa_doc_to_text 7 | doc_to_target: "answers" 8 | generation_kwargs: 9 | max_new_tokens: 32 10 | temperature: 0 11 | do_sample: False 12 | process_results: !function utils.multidocvqa_process_results 13 | metric_list: 14 | - metric: anls 15 | aggregation: !function utils.multidocvqa_aggregate_results_anls 16 | higher_is_better: true 17 | - metric: accuracy 18 | aggregation: !function utils.multidocvqa_aggregate_results_accuracy 19 | higher_is_better: true 20 | model_specific_prompt_kwargs: 21 | default: 22 | pre_prompt: "" 23 | post_prompt: "\nAnswer the question using a single word or phrase." 
24 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/_default_template.yaml: -------------------------------------------------------------------------------- 1 | test_split: train 2 | output_type: generate_until 3 | doc_to_visual: !function utils.llava_doc_to_visual 4 | doc_to_text: !function utils.llava_doc_to_text 5 | doc_to_target: "gpt_answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | image_aspect_ratio: original 10 | max_new_tokens: 1024 11 | temperature: 0 12 | top_p: 0 13 | num_beams: 1 14 | do_sample: false 15 | process_results: !function utils.llava_process_results 16 | metric_list: 17 | - metric: gpt_eval_llava_all 18 | aggregation: !function utils.llava_all_aggregation 19 | higher_is_better: true 20 | - metric: gpt_eval_llava_conv 21 | aggregation: !function utils.llava_conv_aggregation 22 | higher_is_better: true 23 | - metric: gpt_eval_llava_detail 24 | aggregation: !function utils.llava_detail_aggregation 25 | higher_is_better: true 26 | - metric: gpt_eval_llava_complex 27 | aggregation: !function utils.llava_complex_aggregation 28 | higher_is_better: true 29 | metadata: 30 | version: 0.0 31 | gpt_eval_model_name: "gpt-4-0613" 32 | model_specific_prompt_kwargs: 33 | default: 34 | pre_prompt: "" 35 | post_prompt: "" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/arabic_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: arabic 4 | token: True 5 | task: "llava_in_the_wild_arabic" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/bengali_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: bengali 4 | token: True 5 | task: "llava_in_the_wild_bengali" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/chinese_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: chinese 4 | token: True 5 | task: "llava_in_the_wild_chinese" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/french_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: french 4 | token: True 5 | task: "llava_in_the_wild_french" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/hindi_llava_in_the_wild.yaml: 
-------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: hindi 4 | token: True 5 | task: "llava_in_the_wild_hindi" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/japanese_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: japanese 4 | token: True 5 | task: "llava_in_the_wild_japanese" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/russian_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: russian 4 | token: True 5 | task: "llava_in_the_wild_russian" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/spanish_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: spanish 4 | token: True 5 | task: "llava_in_the_wild_spanish" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/urdu_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: urdu 4 | token: True 5 | task: "llava_in_the_wild_urdu" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/nocaps/_default_template_nocaps_yaml: -------------------------------------------------------------------------------- 1 | model_specific_prompt_kwargs: 2 | default: 3 | prompt: "Provide a one-sentence caption for the provided image." 
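Each per-language config above (arabic through urdu) supplies only its dataset config name and task id and pulls everything else from `_default_template.yaml` via `include:`. A minimal sketch of that override semantics follows; this is not lmms-eval's actual config loader, only an illustration under simplified assumptions.

```python
# Minimal sketch of `include:`-style config merging, as used by the per-language
# llava-in-the-wild YAMLs above. NOT lmms-eval's actual loader; it only shows
# that task-specific keys override the shared template. (The real task files
# also use custom !function tags, which yaml.safe_load would not accept.)
import yaml

def load_task_config(path):
    with open(path) as f:
        cfg = yaml.safe_load(f) or {}
    template = cfg.pop("include", None)
    if template:
        merged = load_task_config(template)  # resolve the shared template first
        merged.update(cfg)                   # then let task-specific keys win
        cfg = merged
    return cfg
```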
-------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/nocaps/nocaps.yaml: -------------------------------------------------------------------------------- 1 | group : nocaps 2 | task: 3 | - nocaps_test 4 | - nocaps_val -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/nocaps/nocaps_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/NoCaps 2 | dataset_kwargs: 3 | token: True 4 | task : "nocaps_test" 5 | group : "nocaps_caption" 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.nocaps_doc_to_visual 9 | doc_to_text: !function utils.nocaps_doc_to_text 10 | doc_to_target: "annotations_captions" 11 | generation_kwargs: 12 | max_new_tokens: 64 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.nocaps_test_process_result 18 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: nocaps_passthrough 21 | aggregation : !function utils.nocaps_test_aggregation_result 22 | higher_is_better : true 23 | metadata: 24 | - version: 0.0 25 | include: _default_template_nocaps_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ocrbench/ocrbench.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: echo840/OCRBench 2 | dataset_kwargs: 3 | token: True 4 | task: "ocrbench" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.ocrbench_doc_to_visual 8 | doc_to_text: !function utils.ocrbench_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | max_new_tokens: 128 12 | temperature: 0 13 | top_p: 0 14 | num_beams: 1 15 | do_sample: false 16 | process_results: !function utils.ocrbench_process_results 17 | metric_list: 18 | - metric: ocrbench_accuracy 19 | aggregation: !function utils.ocrbench_aggregate_accuracy 20 | higher_is_better: true 21 | metadata: 22 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ok_vqa/_default_template_vqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/OK-VQA 2 | output_type: generate_until 3 | doc_to_visual: !function utils.ok_vqa_doc_to_visual 4 | doc_to_text: !function utils.ok_vqa_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | metric_list: 10 | - metric: exact_match 11 | aggregation: mean 12 | higher_is_better: true 13 | ignore_case: true 14 | ignore_punctuation: true 15 | - metric: submission 16 | aggregation: !function utils.ok_vqa_aggreate_submissions 17 | higher_is_better: true 18 | process_results: !function utils.ok_vqa_process_results 19 | model_specific_prompt_kwargs: 20 | default: 21 | pre_prompt: "" 22 | post_prompt: "\nWhen the provided information is insufficient, respond with 'Unanswerable'.\nAnswer the question using a single word or phrase."
23 | metadata: 24 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ok_vqa/_generate_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | splits = ["val2014"] 5 | tasks = ["vqa"] 6 | 7 | if __name__ == "__main__": 8 | dump_tasks = [] 9 | for task in tasks: 10 | for split in splits: 11 | yaml_dict = {"group": f"ok_vqa", "task": f"ok_vqa_{split}", "include": f"_default_template_{task}_yaml", "test_split": split} 12 | if split == "train": 13 | yaml_dict.pop("group") 14 | else: 15 | dump_tasks.append(f"ok_vqa_{split}") 16 | 17 | save_path = f"./ok_vqa_{split}.yaml" 18 | print(f"Saving to {save_path}") 19 | with open(save_path, "w") as f: 20 | yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) 21 | 22 | group_dict = {"group": "ok_vqa", "task": dump_tasks} 23 | 24 | with open("./_ok_vqa.yaml", "w") as f: 25 | yaml.dump(group_dict, f, default_flow_style=False, indent=4) 26 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ok_vqa/_ok_vqa.yaml: -------------------------------------------------------------------------------- 1 | group: ok_vqa 2 | task: 3 | - ok_vqa_val2014 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ok_vqa/ok_vqa_val2014.yaml: -------------------------------------------------------------------------------- 1 | group: ok_vqa 2 | task: ok_vqa_val2014 3 | test_split: val2014 4 | include: _default_template_vqa_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/olympiadbench/olympiadbench.yaml: -------------------------------------------------------------------------------- 1 | group: olympiadbench 2 | task: 3 | - olympiadbench_test_en 4 | - olympiadbench_test_cn 5 | metadata: 6 | - version: 0.0 7 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/olympiadbench/olympiadbench_test_cn.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/OlympiadBench 2 | dataset_kwargs: 3 | token: True 4 | task : "olympiadbench_test_cn" 5 | test_split: test_cn 6 | output_type: generate_until 7 | doc_to_visual: !function cn_utils.olympiadbench_doc_to_visual 8 | doc_to_text: !function cn_utils.olympiadbench_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 1024 14 | temperature: 0 15 | top_p: 0 16 | num_beams: 1 17 | do_sample: false 18 | process_results: !function cn_utils.olympiadbench_process_results 19 | metric_list: 20 | - metric: submission 21 | aggregation: !function cn_utils.olympiadbench_aggregate_results 22 | higher_is_better: true 23 | - metric: exact_match 24 | aggregation: mean 25 | higher_is_better: true -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/olympiadbench/olympiadbench_test_en.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/OlympiadBench 2 | dataset_kwargs: 3 | token: True 4 | task : "olympiadbench_test_en" 5 | test_split: test_en 6 | output_type: generate_until 7 | doc_to_visual: !function en_utils.olympiadbench_doc_to_visual 8 | doc_to_text: !function en_utils.olympiadbench_doc_to_text 9 | doc_to_target: "answer" 10 | 
generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 1024 14 | temperature: 0 15 | top_p: 0 16 | num_beams: 1 17 | do_sample: false 18 | process_results: !function en_utils.olympiadbench_process_results 19 | metric_list: 20 | - metric: submission 21 | aggregation: !function en_utils.olympiadbench_aggregate_results 22 | higher_is_better: true 23 | - metric: exact_match 24 | aggregation: mean 25 | higher_is_better: true -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/pope/pope.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/POPE 2 | dataset_kwargs: 3 | token: True 4 | task: "pope" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.pope_doc_to_visual 8 | doc_to_text: !function utils.pope_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | max_new_tokens: 128 12 | temperature: 0 13 | top_p: 0 14 | num_beams: 1 15 | do_sample: false 16 | process_results: !function utils.pope_process_results 17 | metric_list: 18 | - metric: pope_accuracy 19 | aggregation: !function utils.pope_aggregate_accuracy 20 | higher_is_better: true 21 | - metric: pope_precision 22 | aggregation: !function utils.pope_aggregate_precision 23 | higher_is_better: true 24 | - metric: pope_recall 25 | aggregation: !function utils.pope_aggregate_recall 26 | higher_is_better: true 27 | - metric: pope_f1_score 28 | aggregation: !function utils.pope_aggregate_f1_score 29 | higher_is_better: true 30 | - metric: pope_yes_ratio 31 | aggregation: !function utils.pope_aggregate_yes_ratio 32 | higher_is_better: true 33 | metadata: 34 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/pope/pope_adv.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/POPE 2 | dataset_name: Full 3 | dataset_kwargs: 4 | token: True 5 | task: "pope_adv" 6 | test_split: adversarial 7 | output_type: generate_until 8 | doc_to_visual: !function utils.pope_doc_to_visual 9 | doc_to_text: !function utils.pope_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 128 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.pope_process_results 18 | metric_list: 19 | - metric: pope_accuracy 20 | aggregation: !function utils.pope_aggregate_accuracy 21 | higher_is_better: true 22 | - metric: pope_precision 23 | aggregation: !function utils.pope_aggregate_precision 24 | higher_is_better: true 25 | - metric: pope_recall 26 | aggregation: !function utils.pope_aggregate_recall 27 | higher_is_better: true 28 | - metric: pope_f1_score 29 | aggregation: !function utils.pope_aggregate_f1_score 30 | higher_is_better: true 31 | - metric: pope_yes_ratio 32 | aggregation: !function utils.pope_aggregate_yes_ratio 33 | higher_is_better: true 34 | metadata: 35 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/pope/pope_full.yaml: -------------------------------------------------------------------------------- 1 | group : pope_full 2 | task: 3 | - pope_adv 4 | - pope_pop 5 | - pope_random -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/pope/pope_pop.yaml: -------------------------------------------------------------------------------- 1 | 
dataset_path: lmms-lab/POPE 2 | dataset_name: Full 3 | dataset_kwargs: 4 | token: True 5 | task: "pope_pop" 6 | test_split: popular 7 | output_type: generate_until 8 | doc_to_visual: !function utils.pope_doc_to_visual 9 | doc_to_text: !function utils.pope_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 128 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.pope_process_results 18 | metric_list: 19 | - metric: pope_accuracy 20 | aggregation: !function utils.pope_aggregate_accuracy 21 | higher_is_better: true 22 | - metric: pope_precision 23 | aggregation: !function utils.pope_aggregate_precision 24 | higher_is_better: true 25 | - metric: pope_recall 26 | aggregation: !function utils.pope_aggregate_recall 27 | higher_is_better: true 28 | - metric: pope_f1_score 29 | aggregation: !function utils.pope_aggregate_f1_score 30 | higher_is_better: true 31 | - metric: pope_yes_ratio 32 | aggregation: !function utils.pope_aggregate_yes_ratio 33 | higher_is_better: true 34 | metadata: 35 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/pope/pope_random.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/POPE 2 | dataset_name: Full 3 | dataset_kwargs: 4 | token: True 5 | task: "pope_random" 6 | test_split: random 7 | output_type: generate_until 8 | doc_to_visual: !function utils.pope_doc_to_visual 9 | doc_to_text: !function utils.pope_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 128 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.pope_process_results 18 | metric_list: 19 | - metric: pope_accuracy 20 | aggregation: !function utils.pope_aggregate_accuracy 21 | higher_is_better: true 22 | - metric: pope_precision 23 | aggregation: !function utils.pope_aggregate_precision 24 | higher_is_better: true 25 | - metric: pope_recall 26 | aggregation: !function utils.pope_aggregate_recall 27 | higher_is_better: true 28 | - metric: pope_f1_score 29 | aggregation: !function utils.pope_aggregate_f1_score 30 | higher_is_better: true 31 | - metric: pope_yes_ratio 32 | aggregation: !function utils.pope_aggregate_yes_ratio 33 | higher_is_better: true 34 | metadata: 35 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/realworldqa/realworldqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RealWorldQA 2 | dataset_kwargs: 3 | token: True 4 | task: "realworldqa" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.realworldqa_doc_to_visual 8 | doc_to_text: !function utils.realworldqa_doc_to_text 9 | doc_to_target: "answer" 10 | 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | 18 | filter_list: 19 | - name: "flexible-extract" 20 | filter: 21 | - function: !function utils.NumberWordsToDigitsFilter 22 | - function: !function utils.MultiChoiceRegexFilter 23 | group_select: 0 24 | ignore_case: true 25 | ignore_punctuation: true 26 | regex_pattern: "(\\([A-Z]\\))" 27 | 28 | metric_list: 29 | - metric: exact_match 30 | aggregation: mean 31 | higher_is_better: true 32 | ignore_case: true 33 | ignore_punctuation: true 34 | 35 | 
model_specific_prompt_kwargs: 36 | default: 37 | pre_prompt: "" 38 | post_prompt: "" 39 | gpt4v: 40 | pre_prompt: "" 41 | post_prompt: "" 42 | metadata: 43 | - version: 0.0 44 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/_default_template_bbox_rec_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOPlus 2 | output_type: generate_until 3 | process_docs: !function utils_rec.refcoco_bbox_rec_preprocess_dataset 4 | doc_to_visual: !function utils_rec.refcoco_bbox_rec_doc_to_visual 5 | doc_to_text: !function utils_rec.refcoco_bbox_rec_doc_to_text 6 | doc_to_target: "bbox" 7 | generation_kwargs: 8 | until: 9 | - "ASSISTANT:" 10 | process_results: !function utils_rec.refcoco_bbox_rec_process_result 11 | metric_list: 12 | - metric: refcoco_IoU 13 | aggregation : !function utils_rec.refcoco_bbox_rec_iou 14 | higher_is_better : true 15 | - metric: refcoco_ACC@0.1 16 | aggregation : !function utils_rec.refcoco_bbox_rec_acc01 17 | higher_is_better : true 18 | - metric: refcoco_ACC@0.3 19 | aggregation : !function utils_rec.refcoco_bbox_rec_acc03 20 | higher_is_better : true 21 | - metric: refcoco_ACC@0.5 22 | aggregation : !function utils_rec.refcoco_bbox_rec_acc05 23 | higher_is_better : true 24 | - metric: refcoco_ACC@0.7 25 | aggregation : !function utils_rec.refcoco_bbox_rec_acc07 26 | higher_is_better : true 27 | - metric: refcoco_ACC@0.9 28 | aggregation : !function utils_rec.refcoco_bbox_rec_acc09 29 | higher_is_better : true 30 | - metric: refcoco_Center_ACC 31 | aggregation : !function utils_rec.refcoco_bbox_rec_center_acc 32 | higher_is_better : true 33 | metadata: 34 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/_default_template_bbox_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOplus 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_bbox_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/_default_template_seg_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOplus 2 | output_type: generate_until 3 | doc_to_visual: !function 
utils.refcoco_seg_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/_generate_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | # splits = ["train", "val", "testA", "testB"] 5 | splits = ["val", "testA", "testB"] 6 | tasks = ["seg", "bbox"] 7 | 8 | if __name__ == "__main__": 9 | dump_tasks = [] 10 | for task in tasks: 11 | for split in splits: 12 | yaml_dict = {"group": f"refcoco+_{task}", "task": f"refcoco+_{task}_{split}", "include": f"_default_template_{task}_yaml", "test_split": split} 13 | if split == "train": 14 | yaml_dict.pop("group") 15 | else: 16 | dump_tasks.append(f"refcoco+_{task}_{split}") 17 | 18 | save_path = f"./refcoco+_{task}_{split}.yaml" 19 | print(f"Saving to {save_path}") 20 | with open(save_path, "w") as f: 21 | yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) 22 | 23 | group_dict = {"group": "refcoco+", "task": dump_tasks} 24 | 25 | with open("./_refcoco.yaml", "w") as f: 26 | yaml.dump(group_dict, f, default_flow_style=False, indent=4) 27 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/_refcoco.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+ 2 | task: 3 | - refcoco+_seg_val 4 | - refcoco+_seg_testA 5 | - refcoco+_seg_testB 6 | - refcoco+_bbox_val 7 | - refcoco+_bbox_testA 8 | - refcoco+_bbox_testB 9 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_rec_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox_rec 2 | task: refcoco+_bbox_rec_testA 3 | include: _default_template_bbox_rec_yaml 4 | test_split: testA 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_rec_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox_rec 2 | task: refcoco+_bbox_rec_testB 3 | include: _default_template_bbox_rec_yaml 4 | test_split: testB 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_rec_val.yaml:
-------------------------------------------------------------------------------- 1 | group: refcoco+_bbox_rec 2 | task: refcoco+_bbox_rec_val 3 | include: _default_template_bbox_rec_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox 2 | task: refcoco+_bbox_testA 3 | include: _default_template_bbox_yaml 4 | test_split: testA 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox 2 | task: refcoco+_bbox_testB 3 | include: _default_template_bbox_yaml 4 | test_split: testB 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox 2 | task: refcoco+_bbox_val 3 | include: _default_template_bbox_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_seg_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_seg 2 | task: refcoco+_seg_testA 3 | include: _default_template_seg_yaml 4 | test_split: testA 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_seg_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_seg 2 | task: refcoco+_seg_testB 3 | include: _default_template_seg_yaml 4 | test_split: testB 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_seg_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_seg 2 | task: refcoco+_seg_val 3 | include: _default_template_seg_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/_default_template_bbox_rec_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCO 2 | output_type: generate_until 3 | process_docs: !function utils_rec.refcoco_bbox_rec_preprocess_dataset 4 | doc_to_visual: !function utils_rec.refcoco_bbox_rec_doc_to_visual 5 | doc_to_text: !function utils_rec.refcoco_bbox_rec_doc_to_text 6 | doc_to_target: "bbox" 7 | generation_kwargs: 8 | until: 9 | - "ASSISTANT:" 10 | process_results: !function utils_rec.refcoco_bbox_rec_process_result 11 | metric_list: 12 | - metric: refcoco_IoU 13 | aggregation : !function utils_rec.refcoco_bbox_rec_iou 14 | higher_is_better : true 15 | - metric: refcoco_ACC@0.1 16 | aggregation : !function utils_rec.refcoco_bbox_rec_acc01 17 | higher_is_better : true 18 | - metric: refcoco_ACC@0.3 19 | aggregation : !function utils_rec.refcoco_bbox_rec_acc03 20 | higher_is_better : true 21 | - metric: refcoco_ACC@0.5 22 | aggregation : !function utils_rec.refcoco_bbox_rec_acc05 23 | higher_is_better : true 24 | - metric: refcoco_ACC@0.7 25 | aggregation : !function utils_rec.refcoco_bbox_rec_acc07 26 | higher_is_better : true 27 | - 
metric: refcoco_ACC@0.9 28 | aggregation : !function utils_rec.refcoco_bbox_rec_acc09 29 | higher_is_better : true 30 | - metric: refcoco_Center_ACC 31 | aggregation : !function utils_rec.refcoco_bbox_rec_center_acc 32 | higher_is_better : true 33 | metadata: 34 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/_default_template_bbox_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCO 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_bbox_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/_default_template_seg_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCO 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_seg_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/_generate_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | # splits = ["train", "test", "val", "testA", "testB"] 5 | splits = ["test", 
"val", "testA", "testB"] 6 | tasks = ["seg", "bbox"] 7 | 8 | if __name__ == "__main__": 9 | dump_tasks = [] 10 | for task in tasks: 11 | for split in splits: 12 | yaml_dict = {"group": f"refcoco_{task}", "task": f"refcoco_{task}_{split}", "test_split": split, "include": f"_default_template_{task}_yaml"} 13 | if split == "train": 14 | yaml_dict.pop("group") 15 | else: 16 | dump_tasks.append(f"refcoco_{task}_{split}") 17 | 18 | save_path = f"./refcoco_{task}_{split}.yaml" 19 | print(f"Saving to {save_path}") 20 | with open(save_path, "w") as f: 21 | yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) 22 | 23 | group_dict = {"group": "refcoco", "task": dump_tasks} 24 | 25 | with open("./_refcoco.yaml", "w") as f: 26 | yaml.dump(group_dict, f, default_flow_style=False, indent=4) 27 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/_refcoco.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco 2 | task: 3 | - refcoco_seg_test 4 | - refcoco_seg_val 5 | - refcoco_seg_testA 6 | - refcoco_seg_testB 7 | - refcoco_bbox_test 8 | - refcoco_bbox_val 9 | - refcoco_bbox_testA 10 | - refcoco_bbox_testB 11 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_rec_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox_rec 2 | task: refcoco_bbox_rec_test 3 | test_split: test 4 | include: _default_template_bbox_rec_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_rec_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox_rec 2 | task: refcoco_bbox_rec_testA 3 | test_split: testA 4 | include: _default_template_bbox_rec_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_rec_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox_rec 2 | task: refcoco_bbox_rec_testB 3 | test_split: testB 4 | include: _default_template_bbox_rec_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_rec_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox_rec 2 | task: refcoco_bbox_rec_val 3 | test_split: val 4 | include: _default_template_bbox_rec_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox 2 | task: refcoco_bbox_test 3 | test_split: test 4 | include: _default_template_bbox_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox 2 | task: refcoco_bbox_testA 3 | test_split: testA 4 | include: _default_template_bbox_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_testB.yaml: 
-------------------------------------------------------------------------------- 1 | group: refcoco_bbox 2 | task: refcoco_bbox_testB 3 | test_split: testB 4 | include: _default_template_bbox_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox 2 | task: refcoco_bbox_val 3 | test_split: val 4 | include: _default_template_bbox_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_seg 2 | task: refcoco_seg_test 3 | test_split: test 4 | include: _default_template_seg_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_seg 2 | task: refcoco_seg_testA 3 | test_split: testA 4 | include: _default_template_seg_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_seg 2 | task: refcoco_seg_testB 3 | test_split: testB 4 | include: _default_template_seg_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_seg 2 | task: refcoco_seg_val 3 | test_split: val 4 | include: _default_template_seg_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/_default_template_bbox_rec_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOg 2 | output_type: generate_until 3 | process_docs: !function utils_rec.refcoco_bbox_rec_preprocess_dataset 4 | doc_to_visual: !function utils_rec.refcoco_bbox_rec_doc_to_visual 5 | doc_to_text: !function utils_rec.refcoco_bbox_rec_doc_to_text 6 | doc_to_target: "bbox" 7 | generation_kwargs: 8 | until: 9 | - "ASSISTANT:" 10 | process_results: !function utils_rec.refcoco_bbox_rec_process_result 11 | metric_list: 12 | - metric: refcoco_IoU 13 | aggregation : !function utils_rec.refcoco_bbox_rec_iou 14 | higher_is_better : true 15 | - metric: refcoco_ACC@0.1 16 | aggregation : !function utils_rec.refcoco_bbox_rec_acc01 17 | higher_is_better : true 18 | - metric: refcoco_ACC@0.3 19 | aggregation : !function utils_rec.refcoco_bbox_rec_acc03 20 | higher_is_better : true 21 | - metric: refcoco_ACC@0.5 22 | aggregation : !function utils_rec.refcoco_bbox_rec_acc05 23 | higher_is_better : true 24 | - metric: refcoco_ACC@0.7 25 | aggregation : !function utils_rec.refcoco_bbox_rec_acc07 26 | higher_is_better : true 27 | - metric: refcoco_ACC@0.9 28 | aggregation : !function utils_rec.refcoco_bbox_rec_acc09 29 | higher_is_better : true 30 | - metric: refcoco_Center_ACC 31 | aggregation : !function utils_rec.refcoco_bbox_rec_center_acc 32 | higher_is_better : true 33 | metadata: 34 | version: '0.0' -------------------------------------------------------------------------------- 
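The `_default_template_bbox_rec_yaml` files above (for refcoco, refcoco+, and refcocog) all score referring-expression grounding with an IoU metric, ACC@{0.1..0.9} thresholds, and a center-hit accuracy. A rough sketch of what such scoring looks like is given below; it is illustrative only, not the exact `utils_rec.py` implementation the configs bind to.

```python
# Illustrative IoU / ACC@threshold scoring for the bbox REC metrics configured
# above; not the exact utils_rec.py implementation.
def box_iou(pred, gold):
    # Boxes are (x1, y1, x2, y2) in the same coordinate system.
    ix1, iy1 = max(pred[0], gold[0]), max(pred[1], gold[1])
    ix2, iy2 = min(pred[2], gold[2]), min(pred[3], gold[3])
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    union = ((pred[2] - pred[0]) * (pred[3] - pred[1])
             + (gold[2] - gold[0]) * (gold[3] - gold[1]) - inter)
    return inter / union if union > 0 else 0.0


def acc_at_threshold(pairs, threshold):
    # Fraction of (pred, gold) box pairs whose IoU meets the threshold,
    # e.g. threshold=0.5 corresponds to the ACC@0.5 metric.
    return sum(box_iou(p, g) >= threshold for p, g in pairs) / max(len(pairs), 1)
```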
/lmms-eval/lmms_eval/tasks/refcocog/_default_template_bbox_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOg 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_bbox_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/_default_template_seg_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOg 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_seg_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/_generate_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | # splits = ["train", "test", "val"] 5 | splits = ["test", "val"] 6 | tasks = ["seg", "bbox"] 7 | 8 | if __name__ == "__main__": 9 | dump_tasks = [] 10 | for task in tasks: 11 | for split in splits: 12 | yaml_dict = {"group": f"refcocog_{task}", "task": f"refcocog_{task}_{split}", "include": f"_default_template_{task}_yaml", "test_split": split} 13 | if split == "train": 14 | yaml_dict.pop("group") 15 | else: 16 | 
dump_tasks.append(f"refcocog_{task}_{split}") 17 | 18 | save_path = f"./refcocog_{task}_{split}.yaml" 19 | print(f"Saving to {save_path}") 20 | with open(save_path, "w") as f: 21 | yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) 22 | 23 | group_dict = {"group": "refcocog", "task": dump_tasks} 24 | 25 | with open("./_refcoco.yaml", "w") as f: 26 | yaml.dump(group_dict, f, default_flow_style=False, indent=4) 27 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/_refcoco.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog 2 | task: 3 | - refcocog_seg_test 4 | - refcocog_seg_val 5 | - refcocog_bbox_test 6 | - refcocog_bbox_val 7 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_bbox_rec_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_bbox_rec 2 | task: refcocog_bbox_rec_test 3 | include: _default_template_bbox_rec_yaml 4 | test_split: test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_bbox_rec_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_bbox_rec 2 | task: refcocog_bbox_rec_val 3 | include: _default_template_bbox_rec_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_bbox_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_bbox 2 | task: refcocog_bbox_test 3 | include: _default_template_bbox_yaml 4 | test_split: test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_bbox_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_bbox 2 | task: refcocog_bbox_val 3 | include: _default_template_bbox_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_seg_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_seg 2 | task: refcocog_seg_test 3 | include: _default_template_seg_yaml 4 | test_split: test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_seg_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_seg 2 | task: refcocog_seg_val 3 | include: _default_template_seg_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/scienceqa/scienceqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ScienceQA 2 | dataset_name: ScienceQA-FULL 3 | task: "scienceqa" 4 | dataset_kwargs: 5 | token: True 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.sqa_doc_to_visual 9 | doc_to_text: !function utils.sqa_doc_to_text 10 | doc_to_target: !function utils.sqa_doc_to_target 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | temperature: 0 14 | do_sample: False 15 | metric_list: 16 | - metric: exact_match 17 | aggregation:
mean 18 | higher_is_better: true 19 | ignore_case: true 20 | ignore_punctuation: true 21 | process_results: !function utils.sqa_process_results 22 | metadata: 23 | - version: 0.0 24 | 25 | model_specific_prompt_kwargs: 26 | default: 27 | format: default 28 | pre_prompt: "" 29 | post_prompt: "\nAnswer with the option's letter from the given choices directly." 30 | qwen_vl: 31 | format: qwen_vl 32 | 33 | model_specific_generation_kwargs: 34 | llava: 35 | image_aspect_ratio: original 36 | 37 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/scienceqa/scienceqa_full.yaml: -------------------------------------------------------------------------------- 1 | group: scienceqa_full 2 | task: 3 | - scienceqa 4 | - scienceqa_img -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/scienceqa/scienceqa_img.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ScienceQA 2 | dataset_name: ScienceQA-IMG 3 | task: "scienceqa_img" 4 | dataset_kwargs: 5 | token: True 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.sqa_doc_to_visual 9 | doc_to_text: !function utils.sqa_doc_to_text 10 | doc_to_target: !function utils.sqa_doc_to_target 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | temperature: 0 14 | do_sample: False 15 | metric_list: 16 | - metric: exact_match 17 | aggregation: mean 18 | higher_is_better: true 19 | ignore_case: true 20 | ignore_punctuation: true 21 | process_results: !function utils.sqa_process_results 22 | metadata: 23 | - version: 0.0 24 | 25 | model_specific_prompt_kwargs: 26 | default: 27 | format: default 28 | pre_prompt: "" 29 | post_prompt: "\nAnswer with the option's letter from the given choices directly." 
30 | qwen_vl: 31 | format: qwen_vl 32 | idefics2: 33 | format: default 34 | pre_prompt: "" 35 | post_prompt: "\nAnswer:" 36 | model_specific_generation_kwargs: 37 | llava: 38 | image_aspect_ratio: original 39 | 40 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/screenspot/_default_template_rec_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: rootsautomation/ScreenSpot 2 | output_type: generate_until 3 | doc_to_visual: !function utils_rec.screenspot_rec_doc_to_visual 4 | doc_to_text: !function utils_rec.screenspot_rec_doc_to_text 5 | doc_to_target: "bbox" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils_rec.screenspot_rec_process_result 10 | metric_list: 11 | - metric: screenspot_IoU 12 | aggregation : !function utils_rec.screenspot_rec_iou 13 | higher_is_better : true 14 | - metric: screenspot_ACC@0.1 15 | aggregation : !function utils_rec.screenspot_rec_acc01 16 | higher_is_better : true 17 | - metric: screenspot_ACC@0.3 18 | aggregation : !function utils_rec.screenspot_rec_acc03 19 | higher_is_better : true 20 | - metric: screenspot_ACC@0.5 21 | aggregation : !function utils_rec.screenspot_rec_acc05 22 | higher_is_better : true 23 | - metric: screenspot_ACC@0.7 24 | aggregation : !function utils_rec.screenspot_rec_acc07 25 | higher_is_better : true 26 | - metric: screenspot_ACC@0.9 27 | aggregation : !function utils_rec.screenspot_rec_acc09 28 | higher_is_better : true 29 | - metric: screenspot_Center_ACC 30 | aggregation : !function utils_rec.screenspot_rec_center_acc 31 | higher_is_better : true 32 | metadata: 33 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/screenspot/_default_template_reg_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: rootsautomation/ScreenSpot 2 | output_type: generate_until 3 | doc_to_visual: !function utils.screenspot_bbox_doc_to_visual 4 | doc_to_text: !function utils.screenspot_doc_to_text 5 | doc_to_target: "instruction" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.screenspot_process_result 10 | metric_list: 11 | - metric: screenspot_CIDEr 12 | aggregation : !function utils.screenspot_cider 13 | higher_is_better : true 14 | metadata: 15 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/screenspot/_screenspot.yaml: -------------------------------------------------------------------------------- 1 | group: screenspot 2 | task: 3 | - screenspot_reg_test 4 | - screenspot_rec_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/screenspot/screenspot_rec_test.yaml: -------------------------------------------------------------------------------- 1 | group: screenspot_rec 2 | task: screenspot_rec_test 3 | include: _default_template_rec_yaml 4 | test_split: test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/screenspot/screenspot_reg_test.yaml: -------------------------------------------------------------------------------- 1 | group: screenspot_reg 2 | task: screenspot_reg_test 3 | include: _default_template_reg_yaml 4 | test_split: test 5 | -------------------------------------------------------------------------------- 
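Several of the configs above (ScienceQA, ScreenSpot, and most other tasks in this directory) pair a `doc_to_visual` hook with a `doc_to_text` hook that wraps the question in the per-model `pre_prompt`/`post_prompt` from `model_specific_prompt_kwargs`. A hypothetical example of such a pair for a multiple-choice task follows; field and function names are illustrative, not the repo's actual utils.

```python
# Hypothetical doc_to_visual / doc_to_text pair for a multiple-choice task such
# as the ScienceQA config above; names and document fields are illustrative only.
def example_doc_to_visual(doc):
    # Return the list of images the model should see for this document.
    return [doc["image"].convert("RGB")] if doc.get("image") is not None else []


def example_doc_to_text(doc, model_specific_prompt_kwargs):
    # Render "question + lettered choices", wrapped by the per-model
    # pre_prompt/post_prompt configured in the YAML.
    letters = [chr(ord("A") + i) for i in range(len(doc["choices"]))]
    choices = "\n".join(f"{letter}. {c}" for letter, c in zip(letters, doc["choices"]))
    pre = model_specific_prompt_kwargs.get("pre_prompt", "")
    post = model_specific_prompt_kwargs.get("post_prompt", "")
    return f"{pre}{doc['question']}\n{choices}{post}"
```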
/lmms-eval/lmms_eval/tasks/seedbench/seedbench.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/SEED-Bench 2 | dataset_kwargs: 3 | token: True 4 | task: "seedbench" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.seed_doc_to_visual 8 | doc_to_text: !function utils.seed_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | image_aspect_ratio: original 14 | # The return value of process_results will be used by metrics 15 | process_results: !function utils.seed_process_result 16 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results 17 | metric_list: 18 | - metric: seed_image 19 | aggregation: !function utils.seed_aggregation_result 20 | higher_is_better: true 21 | - metric: seed_video 22 | aggregation: !function utils.seed_aggregation_result 23 | higher_is_better: true 24 | - metric: seed_all 25 | aggregation: !function utils.seed_aggregation_result 26 | higher_is_better: true 27 | metadata: 28 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/seedbench/seedbench_ppl.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/SEED-Bench 2 | dataset_kwargs: 3 | token: True 4 | task: "seedbench_ppl" 5 | test_split: test 6 | output_type: multiple_choice 7 | doc_to_visual: !function utils.seed_doc_to_visual 8 | doc_to_text: !function utils.seed_doc_to_text_mc 9 | doc_to_choice : !function utils.seed_doc_to_choice 10 | doc_to_target: !function utils.seed_doc_to_mc_target 11 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results 12 | metric_list: 13 | - metric: acc 14 | metadata: 15 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/stvqa/stvqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ST-VQA 2 | task: "stvqa" 3 | test_split: test 4 | output_type: generate_until 5 | doc_to_visual: !function utils.stvqa_doc_to_visual 6 | doc_to_text: !function utils.stvqa_doc_to_text 7 | doc_to_target: "answers" 8 | generation_kwargs: 9 | max_new_tokens: 32 10 | temperature: 0 11 | do_sample: False 12 | process_results: !function utils.stvqa_process_results 13 | metric_list: 14 | - metric: submission 15 | aggregation: !function utils.stvqa_aggregate_submissions 16 | model_specific_prompt_kwargs: 17 | default: 18 | pre_prompt: "" 19 | post_prompt: "\nAnswer the question using a single word or phrase."
20 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/stvqa/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | 5 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file 6 | 7 | 8 | def stvqa_doc_to_text(doc, model_specific_prompt_kwargs): 9 | question = doc["question"] 10 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 11 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 12 | return f"{pre_prompt}{question}{post_prompt}" 13 | 14 | 15 | def stvqa_doc_to_visual(doc): 16 | return [doc["image"].convert("RGB")] 17 | 18 | 19 | def stvqa_process_results(doc, results): 20 | answer = results[0] 21 | return {"submission": {"question_id": int(doc["question_id"]), "answer": answer}} 22 | 23 | 24 | def stvqa_aggregate_submissions(results, args): 25 | file = generate_submission_file("stvqa_test_for_submission.json", args) 26 | with open(file, "w") as f: 27 | json.dump(results, f) 28 | logging.getLogger("lmms-eval").info(f"Results saved to {file}") 29 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textcaps/_default_template_textcaps_yaml: -------------------------------------------------------------------------------- 1 | model_specific_prompt_kwargs: 2 | default: 3 | prompt: Provide a one-sentence caption for the provided image. -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textcaps/textcaps.yaml: -------------------------------------------------------------------------------- 1 | group : textcaps 2 | task: 3 | - textcaps_val 4 | - textcaps_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textcaps/textcaps_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/TextCaps 2 | dataset_kwargs: 3 | token: True 4 | task : "textcaps_test" 5 | group : "textcaps_caption" 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.textcaps_doc_to_visual 9 | doc_to_text: !function utils.textcaps_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 64 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.textcaps_test_process_result 18 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: textcaps_passthrough 21 | aggregation : !function utils.textcaps_test_aggregation_result 22 | higher_is_better : true 23 | metadata: 24 | - version: 0.0 25 | include: _default_template_textcaps_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textvqa/_default_template_textvqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/textvqa 2 | output_type: generate_until 3 | doc_to_visual: !function utils.textvqa_doc_to_visual 4 | doc_to_text: !function utils.textvqa_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.textvqa_process_results 10 | model_specific_prompt_kwargs: 11 | default: 12 | pre_prompt: "" 13 | post_prompt: "\nAnswer the question using a single
word or phrase." 14 | ocr: false 15 | qwen_vl: 16 | pre_prompt: "" 17 | post_prompt: " Answer:" 18 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textvqa/_textvqa.yaml: -------------------------------------------------------------------------------- 1 | group: textvqa 2 | task: 3 | - textvqa_val 4 | - textvqa_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textvqa/textvqa_test.yaml: -------------------------------------------------------------------------------- 1 | task: textvqa_test 2 | test_split: test 3 | metric_list: 4 | - metric: submission 5 | aggregation: !function utils.textvqa_aggreate_submissions 6 | higher_is_better: true 7 | include: _default_template_textvqa_yaml 8 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textvqa/textvqa_val.yaml: -------------------------------------------------------------------------------- 1 | task: textvqa_val 2 | test_split: validation 3 | metric_list: 4 | - metric: exact_match 5 | aggregation: mean 6 | higher_is_better: true 7 | ignore_case: true 8 | ignore_punctuation: true 9 | - metric: submission 10 | aggregation: !function utils.textvqa_aggreate_submissions 11 | higher_is_better: true 12 | include: _default_template_textvqa_yaml 13 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/_default_template_vqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/VizWiz-VQA 2 | output_type: generate_until 3 | doc_to_visual: !function utils.vizwiz_vqa_doc_to_visual 4 | doc_to_text: !function utils.vizwiz_vqa_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | metadata: 10 | - version: 0.0 11 | model_specific_prompt_kwargs: 12 | default: 13 | pre_prompt: "" 14 | post_prompt: "\nWhen the provided information is insufficient, respond with 'Unanswerable'.\nAnswer the question using a single word or phrase." 
15 | process_results: !function utils.vizwiz_vqa_process_results 16 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/_generate_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | splits = ["val", "test"] 5 | tasks = ["vqa"] 6 | 7 | if __name__ == "__main__": 8 | dump_tasks = [] 9 | for task in tasks: 10 | for split in splits: 11 | yaml_dict = {"group": f"vizwiz_{task}", "task": f"vizwiz_{task}_{split}", "include": f"_default_template_{task}_yaml", "test_split": split} 12 | if split == "train": 13 | yaml_dict.pop("group") 14 | else: 15 | dump_tasks.append(f"vizwiz_{task}_{split}") 16 | 17 | save_path = f"./vizwiz_{task}_{split}.yaml" 18 | print(f"Saving to {save_path}") 19 | with open(save_path, "w") as f: 20 | yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) 21 | 22 | group_dict = {"group": "vizwiz_vqa", "task": dump_tasks} 23 | 24 | with open("./_vizwiz_vqa.yaml", "w") as f: 25 | yaml.dump(group_dict, f, default_flow_style=False, indent=4) 26 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/_vizwiz_vqa.yaml: -------------------------------------------------------------------------------- 1 | group: vizwiz_vqa 2 | task: 3 | - vizwiz_vqa_val 4 | - vizwiz_vqa_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_test.yaml: -------------------------------------------------------------------------------- 1 | group: vizwiz_vqa 2 | task: vizwiz_vqa_test 3 | test_split: test 4 | include: _default_template_vqa_yaml 5 | process_results: !function utils.vizwiz_vqa_process_results 6 | metric_list: 7 | # - metric: exact_match 8 | # aggregation: mean 9 | # higher_is_better: true 10 | # ignore_case: true 11 | # ignore_punctuation: true 12 | - metric: submission 13 | aggregation: !function utils.vizwiz_vqa_aggreate_submissions 14 | higher_is_better: true 15 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_val.yaml: -------------------------------------------------------------------------------- 1 | group: vizwiz_vqa 2 | task: vizwiz_vqa_val 3 | test_split: val 4 | include: _default_template_vqa_yaml 5 | metric_list: 6 | - metric: exact_match 7 | aggregation: mean 8 | higher_is_better: true 9 | ignore_case: true 10 | ignore_punctuation: true 11 | # - metric: submission 12 | # aggregation: !function utils.vizwiz_vqa_aggreate_submissions 13 | # higher_is_better: true -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vqav2/_default_template_vqav2_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/VQAv2 2 | dataset_kwargs: 3 | token: True 4 | output_type: generate_until 5 | doc_to_visual: !function utils.vqav2_doc_to_visual 6 | doc_to_text: !function utils.vqav2_doc_to_text 7 | doc_to_target: "answer" 8 | generation_kwargs: 9 | max_new_tokens: 16 10 | metadata: 11 | - version: 0.0 12 | model_specific_prompt_kwargs: 13 | default: 14 | pre_prompt: "" 15 | post_prompt: "\nAnswer the question using a single word or phrase." 
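The vqav2 template above routes model answers through utils.vqav2_process_results_val, which is not part of this excerpt. For orientation, VQAv2-style scoring credits a prediction with min(matching annotators / 3, 1). The sketch below illustrates only that idea; the official metric additionally averages over annotator subsets and normalizes answers, and the doc["answers"] layout shown is an assumption:

def vqav2_soft_accuracy(prediction, gt_answers):
    # gt_answers: the list of annotator answers for one question.
    prediction = prediction.strip().lower()
    matches = sum(1 for ans in gt_answers if ans.strip().lower() == prediction)
    return min(matches / 3.0, 1.0)

def vqav2_process_results_val_sketch(doc, results):
    # Assumed layout: doc["answers"] is a list of {"answer": ...} dicts.
    answers = [a["answer"] for a in doc["answers"]]
    return {"exact_match": vqav2_soft_accuracy(results[0], answers)}

# Example: 4 of 10 annotators said "red", so the prediction gets full credit.
print(vqav2_soft_accuracy("red", ["red"] * 4 + ["maroon"] * 6))  # 1.0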
-------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vqav2/_vqav2.yaml: -------------------------------------------------------------------------------- 1 | group: vqav2 2 | task: 3 | - vqav2_val 4 | - vqav2_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vqav2/vqav2_test.yaml: -------------------------------------------------------------------------------- 1 | task: "vqav2_test" 2 | include: _default_template_vqav2_yaml 3 | test_split: test 4 | metric_list: 5 | - metric: submission 6 | aggregation: !function utils.vqav2_aggreate_submissions 7 | higher_is_better: true 8 | process_results: !function utils.vqav2_process_results_test 9 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vqav2/vqav2_val.yaml: -------------------------------------------------------------------------------- 1 | task: "vqav2_val" 2 | include: _default_template_vqav2_yaml 3 | test_split: validation 4 | metric_list: 5 | - metric: exact_match 6 | aggregation: mean 7 | higher_is_better: true 8 | ignore_case: true 9 | ignore_punctuation: true 10 | process_results: !function utils.vqav2_process_results_val 11 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/websrc/websrc.yaml: -------------------------------------------------------------------------------- 1 | group: websrc 2 | task: 3 | - websrc_val 4 | - websrc_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/websrc/websrc_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: rootsautomation/websrc-test 2 | task: "websrc_test" 3 | test_split: test 4 | output_type: generate_until 5 | doc_to_visual: !function utils.websrc_doc_to_visual 6 | doc_to_text: !function utils.websrc_doc_to_text 7 | doc_to_target: "answer" 8 | # The return value of process_results will be used by metrics 9 | process_results: !function utils.websrc_process_results 10 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | image_aspect_ratio: pad 14 | metric_list: 15 | - metric: submission 16 | aggregation: !function utils.websrc_test_aggregate_results_for_submission 17 | higher_is_better: true 18 | metadata: 19 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/websrc/websrc_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: rootsautomation/websrc 2 | task: "websrc_val" 3 | test_split: dev 4 | output_type: generate_until 5 | doc_to_visual: !function utils.websrc_doc_to_visual 6 | doc_to_text: !function utils.websrc_doc_to_text 7 | doc_to_target: "answer" 8 | # The return value of process_results will be used by metrics 9 | process_results: !function utils.websrc_process_results 10 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | image_aspect_ratio: pad 14 | metric_list: 15 | - metric: websrc_squad_f1 16 | aggregation: !function utils.websrc_aggregate_results 17 | higher_is_better: true 18 | metadata: 19 | - version: 0.0
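websrc_val scores generations with websrc_squad_f1, aggregated by utils.websrc_aggregate_results (the implementation is not shown in this excerpt). A minimal sketch of a SQuAD-style token F1, with text normalization deliberately simplified relative to whatever the task's utils.py actually does:

from collections import Counter

def squad_token_f1(prediction, ground_truth):
    # Token-level F1: overlap between prediction and reference token multisets.
    pred_tokens = prediction.lower().split()
    gt_tokens = ground_truth.lower().split()
    common = Counter(pred_tokens) & Counter(gt_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)

print(squad_token_f1("the red button", "red button"))  # 0.8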
-------------------------------------------------------------------------------- /lmms-eval/miscs/llava_result_check.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/lmms-eval/miscs/llava_result_check.md -------------------------------------------------------------------------------- /lmms-eval/miscs/repr_scripts.sh: -------------------------------------------------------------------------------- 1 | # install lmms_eval without building dependencies 2 | cd lmms_eval; 3 | pip install --no-deps -U -e . 4 | 5 | # install LLaVA without building dependencies 6 | cd LLaVA 7 | pip install --no-deps -U -e . 8 | 9 | # install all the requirements required to reproduce llava results 10 | pip install -r llava_repr_requirements.txt 11 | 12 | # Run and exactly reproduce llava_v1.5 results! 13 | # mme as an example 14 | accelerate launch --num_processes=1 -m lmms_eval --model llava --model_args pretrained="liuhaotian/llava-v1.5-7b,use_flash_attention_2=False,device_map=auto" --tasks mme --batch_size 1 --log_samples --log_samples_suffix reproduce --output_path ./logs/ -------------------------------------------------------------------------------- /lmms-eval/miscs/script.sh: -------------------------------------------------------------------------------- 1 | accelerate launch --num_processes=1 -m lmms_eval --model llava --model_args pretrained="liuhaotian/llava-v1.5-7b" --tasks mme_llava_prompt --batch_size 1 --log_samples --log_samples_suffix debug --output_path ./logs/ 2 | 3 | 4 | gpu = 8 bs 1: 5 | 6 | llava (pretrained=llava-hf/llava-1.5-7b-hf), gen_kwargs: (), limit: None, num_fewshot: None, batch_size: 1 7 | | Tasks |Version|Filter|n-shot| Metric |Value| |Stderr | 8 | |----------------|-------|------|-----:|-----------|----:|---|------:| 9 | |mme_llava_prompt|Yaml |none | 0|exact_match| 1873|± |38.4331| 10 | 11 | gpu = 8 bs 1 use_flash_attention_2=True: 12 | 13 | 14 | 15 | 16 | 17 | gpu = 4 bs 1 use_flash_attention_2=True: 18 | 19 | 20 | 21 | accelerate launch --num_processes=8 -m lmms_eval --model llava --model_args pretrained="liuhaotian/llava-v1.5-13b" --tasks scienceqa --batch_size 1 --log_samples --log_samples_suffix debug --output_path ./logs/ 22 | -------------------------------------------------------------------------------- /lmms-eval/miscs/test_llava.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from PIL import Image 3 | 4 | import torch 5 | from transformers import AutoProcessor, LlavaForConditionalGeneration 6 | 7 | model_id = "llava-hf/llava-1.5-7b-hf" 8 | 9 | prompt_1 = "USER: <image>\nWhat does this image show?\nASSISTANT:" 10 | prompt_2 = "USER: <image> <image>\nWhat is the difference between these two images?\nASSISTANT:" 11 | image_file_1 = "image1.png" 12 | image_file_2 = "image2.png" 13 | model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_flash_attention_2=True).to(0) 14 | processor = AutoProcessor.from_pretrained(model_id) 15 | raw_image_1 = Image.open(image_file_1) 16 | raw_image_2 = Image.open(image_file_2) 17 | inputs = processor([prompt_1, prompt_2], [raw_image_1, raw_image_1, raw_image_2], padding=True, return_tensors="pt").to(0, torch.float16) 18 | import pdb 19 | 20 | pdb.set_trace() 21 | output = model.generate(**inputs, max_new_tokens=200, do_sample=False) 22 | print(processor.batch_decode(output, skip_special_tokens=True)) 23 |
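The smoke test above stops at a pdb breakpoint and then decodes the full sequences, prompts included. If only the generated continuations are wanted, one common pattern is to slice off the prompt length before decoding; this sketch assumes the processor left-pads the batch, which the llava-hf checkpoints are generally configured to do for generation:

# Continuation of test_llava.py, replacing the pdb/decode tail (assumes
# `model`, `processor`, and `inputs` as defined in the script above).
output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
new_tokens = output[:, inputs["input_ids"].shape[1]:]  # drop the prompt tokens
print(processor.batch_decode(new_tokens, skip_special_tokens=True))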
-------------------------------------------------------------------------------- /lmms-eval/miscs/test_scienceqa.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | dataset = load_dataset("Otter-AI/ScienceQA", trust_remote_code=True)["test"] 4 | for doc in dataset: 5 | print(doc["id"]) 6 | -------------------------------------------------------------------------------- /lmms-eval/setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | # This is to make sure that the package supports editable installs 4 | setuptools.setup() 5 | -------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/000_caps.txt: -------------------------------------------------------------------------------- 1 | A man wearing multiple neck ties making a goofy face. 2 | A man in a white shirt wearing very many ties. 3 | a man with ties on poses for a picture 4 | A man wearing multiple ties on his neck. 5 | A young man smiles while wearing several ties. 6 | 7 | tie: [0.574, 0.298, 0.752, 0.704] 8 | tie: [0.464, 0.339, 0.639, 0.789] 9 | tie: [0.349, 0.363, 0.563, 0.732] 10 | tie: [0.259, 0.255, 0.668, 0.805] 11 | person: [0.019, 0.065, 0.962, 0.988] 12 | person: [0.0, 0.24, 0.214, 1.0] 13 | tie: [0.316, 0.778, 0.443, 0.867] 14 | tie: [0.386, 0.707, 0.496, 0.801] 15 | tie: [0.251, 0.354, 0.402, 0.747] 16 | tie: [0.44, 0.318, 0.63, 0.397] 17 | tie: [0.566, 0.33, 0.655, 0.543] 18 | tie: [0.25, 0.409, 0.359, 0.556] -------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/000_conv.txt: -------------------------------------------------------------------------------- 1 | Question: 2 | What is unusual about this photo? 3 | === 4 | Answer: 5 | In the photo, the man is wearing a total of ten ties around his neck. This is considered unusual because it is not common for someone to wear multiple ties at once. In a typical setting, a person would wear only one tie. The man's goofy facial expression also adds to the unconventional and humorous nature of the image. -------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/001_caps.txt: -------------------------------------------------------------------------------- 1 | A group of people standing outside of a black vehicle with various luggage. 2 | Luggage surrounds a vehicle in an underground parking area 3 | People try to fit all of their luggage in an SUV. 4 | The sport utility vehicle is parked in the public garage, being packed for a trip 5 | Some people with luggage near a van that is transporting it. 
6 | 7 | person: [0.681, 0.242, 0.774, 0.694] 8 | person: [0.63, 0.222, 0.686, 0.516] 9 | person: [0.444, 0.233, 0.487, 0.34] 10 | backpack: [0.384, 0.696, 0.485, 0.914] 11 | backpack: [0.755, 0.413, 0.846, 0.692] 12 | suitcase: [0.758, 0.413, 0.845, 0.69] 13 | suitcase: [0.1, 0.497, 0.173, 0.579] 14 | bicycle: [0.282, 0.363, 0.327, 0.442] 15 | car: [0.786, 0.25, 0.848, 0.322] 16 | car: [0.783, 0.27, 0.827, 0.335] 17 | car: [0.86, 0.254, 0.891, 0.3] 18 | car: [0.261, 0.101, 0.787, 0.626] -------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/001_conv.txt: -------------------------------------------------------------------------------- 1 | Question: 2 | What challenges do these people face? 3 | === 4 | Answer: 5 | In the image, a group of people is standing outside a black SUV in a parking area, surrounded by various pieces of luggage, including suitcases and backpacks. They are facing the challenge of fitting all their luggage into the black SUV. There are multiple suitcases and backpacks to be packed, which suggests that the group has a significant amount of belongings to accommodate. They might have to strategize and arrange the luggage efficiently to ensure that everything fits properly into the vehicle. Additionally, they need to consider the comfort of the passengers and visibility while driving, so the placement of the luggage must not obstruct the driver's view or make the passengers uncomfortable during the trip. -------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/002_caps.txt: -------------------------------------------------------------------------------- 1 | There is a movie theater that displays the show times above the doors. 2 | A red fire hydrant is deep in the snow. 3 | The fire hydrant is in the snow near a recently plowed sidewalk. 4 | This city has had a very hard winter with snow. 5 | A hotel for dogs in the snow in winter. 6 | 7 | fire hydrant: [0.326, 0.612, 0.426, 0.72] -------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/002_conv.txt: -------------------------------------------------------------------------------- 1 | Question: 2 | What challenges might this city face? 3 | === 4 | Answer: 5 | The city faces challenges due to the harsh winter conditions and heavy snowfall. In the image, a red fire hydrant is almost buried deep in the snow, which indicates the significant amount of snow the city has experienced. This can lead to various challenges such as difficulties in transportation, increased risk of accidents, and disruptions to daily life. For example, the recently plowed sidewalk near the fire hydrant shows that the city has to constantly clear snow from roads and sidewalks to maintain access and safety for pedestrians and vehicles. Moreover, emergency services, like firefighters, might face challenges accessing crucial equipment, such as fire hydrants, during emergencies due to the snow accumulation. This highlights the importance of effective snow management strategies and preparedness in such cities to minimize the impact of harsh winter conditions on residents and essential services. 
-------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/system_message.txt: -------------------------------------------------------------------------------- 1 | You are an AI visual assistant that can analyze a single image. You receive five sentences, each describing the same image you are observing. In addition, specific object locations within the image are given, along with detailed coordinates. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. 2 | 3 | The task is to use the provided caption and bounding box information, create a plausible question about the image, and provide the answer in detail. 4 | 5 | Create complex questions beyond describing the scene. 6 | To answer such questions, one should require first understanding the visual content, then based on the background knowledge or reasoning, either explain why the things are happening that way, or provide guides and help to user's request. Make the question challenging by not including the visual content details in the question so that the user needs to reason about that first. 7 | 8 | Instead of directly mentioning the bounding box coordinates, utilize this data to explain the scene using natural language. Include details like object counts, position of the objects, relative position between the objects. 9 | 10 | When using the information from the caption and coordinates, directly explain the scene, and do not mention that the information source is the caption or the bounding box. Always answer as if you are directly looking at the image. -------------------------------------------------------------------------------- /playground/data/prompts/conversation/000_caps.txt: -------------------------------------------------------------------------------- 1 | There is a movie theater that displays the show times above the doors. 2 | A red fire hydrant is deep in the snow. 3 | The fire hydrant is in the snow near a recently plowed sidewalk. 4 | This city has had a very hard winter with snow. 5 | A hotel for dogs in the snow in winter. -------------------------------------------------------------------------------- /playground/data/prompts/conversation/001_caps.txt: -------------------------------------------------------------------------------- 1 | A man is skiing in the open snow covered hills 2 | A skier is making his way into the snow. 3 | A skier on the foothills of a huge mountain range. 4 | A skier looks at mountains while standing near a trail sign. 5 | a single person skiing by an area with a lot of bushes -------------------------------------------------------------------------------- /playground/data/prompts/conversation/system_message.txt: -------------------------------------------------------------------------------- 1 | You are an AI visual assistant, and you are seeing a single image. What you see are provided with five sentences, describing the same image you are looking at. Answer all questions as you are seeing the image. 2 | 3 | Design a conversation between you and a person asking about this photo. The answers should be in a tone that a visual AI assistant is seeing the image and answering the question. 4 | Ask diverse questions and give corresponding answers. 
5 | 6 | Include questions asking about the visual content of the image, including the object types, counting the objects, object actions, object locations, relative positions between objects, etc. Only include questions that have definite answers: 7 | (1) one can see the content in the image that the question asks about and can answer confidently; 8 | (2) one can determine confidently from the image that it is not in the image. 9 | Do not ask any question that cannot be answered confidently. 10 | 11 | Also include complex questions that are relevant to the content in the image, for example, asking about background knowledge of the objects in the image, asking to discuss about events happening in the image, etc. Again, do not ask about uncertain details. 12 | Provide detailed answers when answering complex questions. For example, give detailed examples or reasoning steps to make the content more convincing and well-organized. You can include multiple paragraphs if necessary. -------------------------------------------------------------------------------- /playground/data/prompts/detail_description/000_caps.txt: -------------------------------------------------------------------------------- 1 | A harbor filled with lots of boats next to a building. 2 | A bicycle parked in front of several boats at a dock. 3 | A red bicycle in front of a line of docked white yachts 4 | A bike sits before boats which sit before a long building. 5 | A bicycle is a convenient means of land transportation when you live on a boat. 6 | 7 | bicycle: [0.287, 0.641, 0.507, 0.874] 8 | bicycle: [0.566, 0.667, 0.63, 0.731] 9 | boat: [0.318, 0.579, 0.575, 0.724] 10 | boat: [0.704, 0.607, 0.818, 0.727] 11 | boat: [0.818, 0.601, 0.942, 0.744] 12 | boat: [0.002, 0.53, 0.243, 0.71] 13 | boat: [0.541, 0.611, 0.668, 0.731] 14 | person: [0.778, 0.527, 0.797, 0.57] 15 | cup: [0.708, 0.733, 0.724, 0.758] 16 | boat: [0.236, 0.532, 0.404, 0.64] 17 | boat: [0.81, 0.632, 0.836, 0.676] 18 | boat: [0.957, 0.526, 1.0, 0.752] -------------------------------------------------------------------------------- /playground/data/prompts/detail_description/000_conv.txt: -------------------------------------------------------------------------------- 1 | It is a harbor filled with numerous boats of various sizes docked next to a long building. Among the boats, there are a few white yachts lined up, standing out from the rest. There is a red bicycle prominently parked in front of the line of docked boats, serving as a convenient means of land transportation for those living on the boats. Another bicycle can be seen further back in the scene, near the middle of the harbor. 2 | 3 | A person is visible near the right side of the harbor, possibly enjoying the view or attending to their boat. Additionally, there is a cup placed on a surface near the middle of the scene. -------------------------------------------------------------------------------- /playground/data/prompts/detail_description/001_caps.txt: -------------------------------------------------------------------------------- 1 | A group of people standing outside of a black vehicle with various luggage. 2 | Luggage surrounds a vehicle in an underground parking area 3 | People try to fit all of their luggage in an SUV. 4 | The sport utility vehicle is parked in the public garage, being packed for a trip 5 | Some people with luggage near a van that is transporting it. 
6 | 7 | person: [0.681, 0.242, 0.774, 0.694] 8 | person: [0.63, 0.222, 0.686, 0.516] 9 | person: [0.444, 0.233, 0.487, 0.34] 10 | backpack: [0.384, 0.696, 0.485, 0.914] 11 | backpack: [0.755, 0.413, 0.846, 0.692] 12 | suitcase: [0.758, 0.413, 0.845, 0.69] 13 | suitcase: [0.1, 0.497, 0.173, 0.579] 14 | bicycle: [0.282, 0.363, 0.327, 0.442] 15 | car: [0.786, 0.25, 0.848, 0.322] 16 | car: [0.783, 0.27, 0.827, 0.335] 17 | car: [0.86, 0.254, 0.891, 0.3] 18 | car: [0.261, 0.101, 0.787, 0.626] -------------------------------------------------------------------------------- /playground/data/prompts/detail_description/001_conv.txt: -------------------------------------------------------------------------------- 1 | The image is an underground parking area with a black sport utility vehicle (SUV) parked. There are three people in the scene, with one person standing closer to the left side of the vehicle, another person in the middle, and the third person on the right side. They are all working together to pack their luggage into the SUV for a trip. 2 | 3 | In the parking area, various luggage items are scattered around the vehicle. There are two backpacks, one located near the left rear wheel and the other closer to the right side of the vehicle. Additionally, there are two suitcases, one on the right side of the car and another further away near the center of the parking area. A bicycle can also be seen on the left side of the vehicle. 4 | 5 | Other cars are parked around the main SUV, with one car positioned behind it and slightly to the left, another behind and slightly to the right, and the third car further behind on the right side. -------------------------------------------------------------------------------- /playground/data/prompts/detail_description/002_caps.txt: -------------------------------------------------------------------------------- 1 | A man holds a Wii-mote above his head while another looks on. 2 | A guy and his friend are playing Nintendo Wii. 3 | A young man is holding a video game remote over his head. 4 | two men standing in a room while one plays with a wii mote 5 | Some guys standing and playing a video game. 6 | 7 | couch: [0.697, 0.759, 0.995, 1.0] 8 | dining table: [0.426, 0.755, 1.0, 0.987] 9 | person: [0.082, 0.252, 0.342, 1.0] 10 | person: [0.399, 0.085, 0.742, 0.982] 11 | remote: [0.477, 0.135, 0.516, 0.187] 12 | sink: [0.016, 0.501, 0.063, 0.52] 13 | potted plant: [0.798, 0.384, 0.888, 0.645] 14 | refrigerator: [0.305, 0.389, 0.414, 0.547] 15 | chair: [0.72, 0.509, 0.858, 0.725] -------------------------------------------------------------------------------- /playground/data/prompts/detail_description/002_conv.txt: -------------------------------------------------------------------------------- 1 | The image shows two men standing in a room, engaged in playing a video game on a Nintendo Wii console. One of the men is holding a Wii remote above his head with enthusiasm, while the other man looks on, likely enjoying the friendly competition. 2 | 3 | The room appears to be a living space with a couch located in the background and a dining table nearby. A potted plant can be seen placed close to the couch, and a chair is situated in the middle of the room. The room also features a kitchen area with a sink and a refrigerator visible in the background. 
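The caption files above encode object locations as normalized (x1, y1, x2, y2) corners in [0, 1], as the system messages describe. Purely for illustration (no such helper exists in this repo), mapping one of those boxes back to pixel coordinates would look like the following, assuming a 640x480 image:

def denormalize_box(box, width, height):
    # box: (x1, y1, x2, y2) in [0, 1] -> pixel-space corners.
    x1, y1, x2, y2 = box
    return (x1 * width, y1 * height, x2 * width, y2 * height)

# First "tie" box from complex_reasoning/000_caps.txt on a 640x480 image.
print(denormalize_box((0.574, 0.298, 0.752, 0.704), 640, 480))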
-------------------------------------------------------------------------------- /playground/data/prompts/detail_description/system_message.txt: -------------------------------------------------------------------------------- 1 | You are an AI visual assistant that can analyze a single image. You receive five sentences, each describing the same image you are observing. In addition, specific object locations within the image are given, along with detailed coordinates. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. 2 | 3 | Using the provided caption and bounding box information, describe the scene in a detailed manner. 4 | 5 | Instead of directly mentioning the bounding box coordinates, utilize this data to explain the scene using natural language. Include details like object counts, position of the objects, relative position between the objects. 6 | 7 | When using the information from the caption and coordinates, directly explain the scene, and do not mention that the information source is the caption or the bounding box. Always answer as if you are directly looking at the image. -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "llava" 7 | version = "1.2.2.post1" 8 | description = "Towards GPT-4 like large language and visual assistant." 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | "torch==2.1.2", "torchvision==0.16.2", 17 | "transformers==4.36.2", "tokenizers==0.15.1", "sentencepiece==0.1.99", "shortuuid", 18 | "accelerate==0.21.0", "peft", "bitsandbytes", 19 | "pydantic", "markdown2[all]", "numpy", "scikit-learn==1.2.2", 20 | "gradio==4.16.0", "gradio_client==0.8.1", 21 | "requests", "httpx==0.24.0", "uvicorn", "fastapi", 22 | "einops==0.6.1", "einops-exts==0.0.4", "timm==0.6.13", "opencv-python", "moviepy", "protobuf" 23 | ] 24 | 25 | [project.optional-dependencies] 26 | train = ["deepspeed==0.12.6", "ninja", "wandb"] 27 | build = ["build", "twine"] 28 | 29 | [project.urls] 30 | "Homepage" = "https://llava-vl.github.io" 31 | "Bug Tracker" = "https://github.com/haotian-liu/LLaVA/issues" 32 | 33 | [tool.setuptools.packages.find] 34 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 35 | 36 | [tool.wheel] 37 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 38 | -------------------------------------------------------------------------------- /scripts/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | all_answers = [] 11 | for line_idx, line in enumerate(open(args.src)): 12 | res = json.loads(line) 13 | question_id = res['question_id'] 14 | text = res['text'].rstrip('.').lower() 15 | all_answers.append({"questionId": question_id, "prediction": text}) 16 
| 17 | with open(args.dst, 'w') as f: 18 | json.dump(all_answers, f) 19 | -------------------------------------------------------------------------------- /scripts/convert_mmbench_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import pandas as pd 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str, required=True) 9 | parser.add_argument("--result-dir", type=str, required=True) 10 | parser.add_argument("--upload-dir", type=str, required=True) 11 | parser.add_argument("--experiment", type=str, required=True) 12 | 13 | return parser.parse_args() 14 | 15 | if __name__ == "__main__": 16 | args = get_args() 17 | 18 | df = pd.read_table(args.annotation_file) 19 | 20 | cur_df = df.copy() 21 | cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category']) 22 | cur_df.insert(6, 'prediction', None) 23 | for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): 24 | pred = json.loads(pred) 25 | cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text'] 26 | 27 | cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}.xlsx"), index=False, engine='openpyxl') 28 | -------------------------------------------------------------------------------- /scripts/convert_mmvet_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | cur_result = {} 11 | 12 | for line in open(args.src): 13 | data = json.loads(line) 14 | qid = data['question_id'] 15 | cur_result[f'v1_{qid}'] = data['text'] 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(cur_result, f, indent=2) 19 | -------------------------------------------------------------------------------- /scripts/convert_vizwiz_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--annotation-file', type=str, required=True) 11 | parser.add_argument('--result-file', type=str, required=True) 12 | parser.add_argument('--result-upload-file', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True) 21 | 22 | results = [] 23 | error_line = 0 24 | for line_idx, line in enumerate(open(args.result_file)): 25 | try: 26 | results.append(json.loads(line)) 27 | except: 28 | error_line += 1 29 | results = {x['question_id']: x['text'] for x in results} 30 | test_split = [json.loads(line) for line in open(args.annotation_file)] 31 | split_ids = set([x['question_id'] for x in test_split]) 32 | 33 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 34 | 35 | all_answers = [] 36 | 37 | answer_processor = EvalAIAnswerProcessor() 38 | 39 | for x in test_split: 40 | assert x['question_id'] in results 41 | all_answers.append({ 42 | 'image': x['image'], 43 | 'answer': answer_processor(results[x['question_id']]) 44 | }) 45 | 46 | with 
open(args.result_upload_file, 'w') as f: 47 | json.dump(all_answers, f) 48 | -------------------------------------------------------------------------------- /scripts/finetune_sqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | deepspeed llava/train/train_mem.py \ 6 | --deepspeed ./scripts/zero2.json \ 7 | --model_name_or_path lmsys/vicuna-13b-v1.3 \ 8 | --version $PROMPT_VERSION \ 9 | --data_path /Data/ScienceQA/data/scienceqa/llava_train_QCM-LEA.json \ 10 | --image_folder /Data/ScienceQA/data/scienceqa/images/train \ 11 | --vision_tower openai/clip-vit-large-patch14 \ 12 | --pretrain_mm_mlp_adapter ./checkpoints/huggingface/liuhaotian/llava-pretrain-vicuna-13b-v1.3/mm_projector.bin \ 13 | --mm_vision_select_layer -2 \ 14 | --mm_use_im_start_end False \ 15 | --mm_use_im_patch_token False \ 16 | --bf16 True \ 17 | --output_dir ./checkpoints/llava-vicuna-13b-v1.3-pretrain_lcs558k_plain-ScienceQA_QCM_LEA-12e \ 18 | --num_train_epochs 12 \ 19 | --per_device_train_batch_size 16 \ 20 | --per_device_eval_batch_size 4 \ 21 | --gradient_accumulation_steps 1 \ 22 | --evaluation_strategy "no" \ 23 | --save_strategy "steps" \ 24 | --save_steps 50000 \ 25 | --save_total_limit 1 \ 26 | --learning_rate 2e-5 \ 27 | --weight_decay 0. \ 28 | --warmup_ratio 0.03 \ 29 | --lr_scheduler_type "cosine" \ 30 | --logging_steps 1 \ 31 | --tf32 True \ 32 | --model_max_length 2048 \ 33 | --gradient_checkpointing True \ 34 | --dataloader_num_workers 4 \ 35 | --lazy_preprocess True \ 36 | --report_to wandb 37 | -------------------------------------------------------------------------------- /scripts/merge_lora_weights.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from llava.model.builder import load_pretrained_model 3 | from llava.mm_utils import get_model_name_from_path 4 | 5 | 6 | def merge_lora(args): 7 | model_name = get_model_name_from_path(args.model_path) 8 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, device_map='cpu') 9 | 10 | model.save_pretrained(args.save_model_path) 11 | tokenizer.save_pretrained(args.save_model_path) 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--model-path", type=str, required=True) 17 | parser.add_argument("--model-base", type=str, required=True) 18 | parser.add_argument("--save-model-path", type=str, required=True) 19 | 20 | args = parser.parse_args() 21 | 22 | merge_lora(args) 23 | -------------------------------------------------------------------------------- /scripts/pretrain_xformers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Uncomment and set the following variables correspondingly to run this script: 4 | 5 | # MODEL_VERSION=vicuna-v1-3-7b 6 | # MODEL_VERSION=llama-2-7b-chat 7 | 8 | ########### DO NOT CHANGE ########### 9 | ########### USE THIS FOR BOTH ########### 10 | PROMPT_VERSION=plain 11 | ########### DO NOT CHANGE ########### 12 | 13 | deepspeed llava/train/train_xformers.py \ 14 | --deepspeed ./scripts/zero2.json \ 15 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 16 | --version $PROMPT_VERSION \ 17 | --data_path /path/to/pretrain_data.json \ 18 | --image_folder /path/to/images \ 19 | --vision_tower openai/clip-vit-large-patch14 \ 20 | 
--tune_mm_mlp_adapter True \ 21 | --mm_vision_select_layer -2 \ 22 | --mm_use_im_start_end False \ 23 | --mm_use_im_patch_token False \ 24 | --bf16 False \ 25 | --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 4 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 4 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 2e-3 \ 35 | --weight_decay 0. \ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 False \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to wandb 45 | -------------------------------------------------------------------------------- /scripts/sqa_eval_batch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CHUNKS=8 4 | for IDX in {0..7}; do 5 | CUDA_VISIBLE_DEVICES=$IDX python -m llava.eval.model_vqa_science \ 6 | --model-path liuhaotian/llava-lcs558k-scienceqa-vicuna-13b-v1.3 \ 7 | --question-file ~/haotian/datasets/ScienceQA/data/scienceqa/llava_test_QCM-LEA.json \ 8 | --image-folder ~/haotian/datasets/ScienceQA/data/scienceqa/images/test \ 9 | --answers-file ./test_llava-13b-chunk$CHUNKS_$IDX.jsonl \ 10 | --num-chunks $CHUNKS \ 11 | --chunk-idx $IDX \ 12 | --conv-mode llava_v1 & 13 | done 14 | -------------------------------------------------------------------------------- /scripts/sqa_eval_gather.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CHUNKS=8 4 | output_file="test_llava-13b.jsonl" 5 | 6 | # Clear out the output file if it exists. 7 | > "$output_file" 8 | 9 | # Loop through the indices and concatenate each file. 
10 | for idx in $(seq 0 $((CHUNKS-1))); do 11 | cat "./test_llava-13b-chunk${idx}.jsonl" >> "$output_file" 12 | done 13 | 14 | python llava/eval/eval_science_qa.py \ 15 | --base-dir ~/haotian/datasets/ScienceQA/data/scienceqa \ 16 | --result-file ./test_llava-13b.jsonl \ 17 | --output-file ./test_llava-13b_output.json \ 18 | --output-result ./test_llava-13b_result.json 19 | -------------------------------------------------------------------------------- /scripts/upload_pypi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Step 0: Clean up 4 | rm -rf dist 5 | 6 | # Step 1: Change the package name to "llava-torch" 7 | sed -i 's/name = "llava"/name = "llava-torch"/' pyproject.toml 8 | 9 | # Step 2: Build the package 10 | python -m build 11 | 12 | # Step 3: Revert the changes in pyproject.toml to the original 13 | sed -i 's/name = "llava-torch"/name = "llava"/' pyproject.toml 14 | 15 | # Step 4: Upload to PyPI 16 | python -m twine upload dist/* 17 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/gqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | matryoshka_vis_token_scale=$1 8 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 9 | SPLIT="llava_gqa_testdev_balanced" 10 | GQADIR="./playground/data/eval/gqa/data" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 14 | --model-path mucai/llava-v1.5-7b-m3 \ 15 | --question-file ./playground/data/eval/gqa/$SPLIT.jsonl \ 16 | --image-folder ./playground/data/eval/gqa/data/images \ 17 | --answers-file ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --matryoshka_vis_token_scale $matryoshka_vis_token_scale \ 22 | --conv-mode vicuna_v1 & 23 | done 24 | 25 | wait 26 | 27 | output_file=./playground/data/eval/gqa/answers/$SPLIT/$CKPT/merge.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 
33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json 38 | 39 | cd $GQADIR 40 | python eval/eval.py --tier testdev_balanced 41 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/llavabench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 4 | 5 | python -m llava.eval.model_vqa \ 6 | --model-path mucai/llava-v1.5-7b-m3 \ 7 | --question-file ./playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 8 | --image-folder ./playground/data/eval/llava-bench-in-the-wild/images \ 9 | --answers-file ./playground/data/eval/llava-bench-in-the-wild/answers/$CKPT.jsonl \ 10 | --matryoshka_vis_token_scale $matryoshka_vis_token_scale \ 11 | --temperature 0 \ 12 | --conv-mode vicuna_v1 13 | 14 | mkdir -p playground/data/eval/llava-bench-in-the-wild/reviews 15 | 16 | python llava/eval/eval_gpt_review_bench.py \ 17 | --question playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 18 | --context playground/data/eval/llava-bench-in-the-wild/context.jsonl \ 19 | --rule llava/eval/table/rule.json \ 20 | --answer-list \ 21 | playground/data/eval/llava-bench-in-the-wild/answers_gpt4.jsonl \ 22 | playground/data/eval/llava-bench-in-the-wild/answers/$CKPT.jsonl \ 23 | --output \ 24 | playground/data/eval/llava-bench-in-the-wild/reviews/$CKPT.jsonl 25 | 26 | python llava/eval/summarize_gpt_review.py -f playground/data/eval/llava-bench-in-the-wild/reviews/$CKPT.jsonl 27 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 4 | SPLIT="mmbench_dev_20230712" 5 | 6 | python -m llava.eval.model_vqa_mmbench \ 7 | --model-path mucai/llava-v1.5-7b-m3 \ 8 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 9 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/$CKPT.jsonl \ 10 | --single-pred-prompt \ 11 | --temperature 0 \ 12 | --matryoshka_vis_token_scale $matryoshka_vis_token_scale \ 13 | --conv-mode vicuna_v1 14 | 15 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 16 | 17 | python scripts/convert_mmbench_for_submission.py \ 18 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 19 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT \ 20 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT \ 21 | --experiment $CKPT 22 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mmbench_cn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 4 | SPLIT="mmbench_dev_cn_20231003" 5 | 6 | python -m llava.eval.model_vqa_mmbench \ 7 | --model-path mucai/llava-v1.5-7b-m3 \ 8 | --question-file ./playground/data/eval/mmbench_cn/$SPLIT.tsv \ 9 | --answers-file ./playground/data/eval/mmbench_cn/answers/$SPLIT/$CKPT.jsonl \ 10 | --lang cn \ 11 | --single-pred-prompt \ 12 | --temperature 0 \ 13 | --matryoshka_vis_token_scale 
$matryoshka_vis_token_scale \ 14 | --conv-mode vicuna_v1 15 | 16 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 17 | 18 | python scripts/convert_mmbench_for_submission.py \ 19 | --annotation-file ./playground/data/eval/mmbench_cn/$SPLIT.tsv \ 20 | --result-dir ./playground/data/eval/mmbench_cn/answers/$SPLIT \ 21 | --upload-dir ./playground/data/eval/mmbench_cn/answers_upload/$SPLIT \ 22 | --experiment $CKPT 23 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mme-llava-next.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-next-vicuna-7b-m3-${matryoshka_vis_token_scale} 4 | 5 | python -m llava.eval.model_vqa_loader \ 6 | --model-path ./checkpoints/llava-next-vicuna-7b-m3 \ 7 | --question-file ./playground/data/eval/MME/llava_mme.jsonl \ 8 | --image-folder ./playground/data/eval/MME/MME_Benchmark_release_version \ 9 | --answers-file ./playground/data/eval/MME/answers/$CKPT.jsonl \ 10 | --temperature 0 \ 11 | --matryoshka_vis_token_scale $matryoshka_vis_token_scale \ 12 | --conv-mode vicuna_v1 13 | 14 | cd ./playground/data/eval/MME 15 | 16 | python convert_answer_to_mme.py --experiment $CKPT 17 | 18 | cd eval_tool 19 | 20 | python calculation.py --results_dir answers/$CKPT 21 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 4 | 5 | python -m llava.eval.model_vqa_loader \ 6 | --model-path ./checkpoints/llava-v1.5-7b-m3 \ 7 | --question-file ./playground/data/eval/MME/llava_mme.jsonl \ 8 | --image-folder ./playground/data/eval/MME/MME_Benchmark_release_version \ 9 | --answers-file ./playground/data/eval/MME/answers/$CKPT.jsonl \ 10 | --temperature 0 \ 11 | --matryoshka_vis_token_scale $matryoshka_vis_token_scale \ 12 | --conv-mode vicuna_v1 13 | 14 | cd ./playground/data/eval/MME 15 | 16 | python convert_answer_to_mme.py --experiment $CKPT 17 | 18 | cd eval_tool 19 | 20 | python calculation.py --results_dir answers/$CKPT 21 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 4 | python -m llava.eval.model_vqa \ 5 | --model-path mucai/llava-v1.5-7b-m3 \ 6 | --question-file ./playground/data/eval/mm-vet/llava-mm-vet.jsonl \ 7 | --image-folder ./playground/data/eval/mm-vet/images \ 8 | --answers-file ./playground/data/eval/mm-vet/answers/$CKPT.jsonl \ 9 | --temperature 0 \ 10 | --matryoshka_vis_token_scale $matryoshka_vis_token_scale \ 11 | --conv-mode vicuna_v1 12 | 13 | mkdir -p ./playground/data/eval/mm-vet/results 14 | 15 | python scripts/convert_mmvet_for_eval.py \ 16 | --src ./playground/data/eval/mm-vet/answers/$CKPT.jsonl \ 17 | --dst ./playground/data/eval/mm-vet/results/$CKPT.json 18 | 19 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/pope.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 4 | python -m llava.eval.model_vqa_loader \ 5 | 
--model-path mucai/llava-v1.5-7b-m3 \ 6 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 7 | --image-folder ./playground/data/eval/pope/val2014 \ 8 | --answers-file ./playground/data/eval/pope/answers/$CKPT.jsonl \ 9 | --matryoshka_vis_token_scale $matryoshka_vis_token_scale \ 10 | --temperature 0 \ 11 | --conv-mode vicuna_v1 12 | 13 | python llava/eval/eval_pope.py \ 14 | --annotation-dir ./playground/data/eval/pope/coco \ 15 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 16 | --result-file ./playground/data/eval/pope/answers/$CKPT.jsonl 17 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/qbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 4 | if [ "$1" = "dev" ]; then 5 | echo "Evaluating in 'dev' split." 6 | elif [ "$1" = "test" ]; then 7 | echo "Evaluating in 'test' split." 8 | else 9 | echo "Unknown split, please choose between 'dev' and 'test'." 10 | exit 1 11 | fi 12 | 13 | python -m llava.eval.model_vqa_qbench \ 14 | --model-path mucai/llava-v1.5-7b-m3 \ 15 | --image-folder ./playground/data/eval/qbench/images_llvisionqa/ \ 16 | --questions-file ./playground/data/eval/qbench/llvisionqa_$1.json \ 17 | --answers-file ./playground/data/eval/qbench/llvisionqa_$1_answers.jsonl \ 18 | --conv-mode llava_v1 \ 19 | --lang en 20 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/qbench_zh.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 4 | if [ "$1" = "dev" ]; then 5 | ZH_SPLIT="验证集" 6 | echo "Evaluating in 'dev' split." 7 | elif [ "$1" = "test" ]; then 8 | ZH_SPLIT="测试集" 9 | echo "Evaluating in 'test' split." 10 | else 11 | echo "Unknown split, please choose between 'dev' and 'test'." 
12 | exit 1 13 | fi 14 | 15 | python -m llava.eval.model_vqa_qbench \ 16 | --model-path mucai/llava-v1.5-7b-m3 \ 17 | --image-folder ./playground/data/eval/qbench/images_llvisionqa/ \ 18 | --questions-file ./playground/data/eval/qbench/质衡-问答-$ZH_SPLIT.json \ 19 | --answers-file ./playground/data/eval/qbench/llvisionqa_zh_$1_answers.jsonl \ 20 | --matryoshka_vis_token_scale $matryoshka_vis_token_scale \ 21 | --conv-mode llava_v1 \ 22 | --lang zh 23 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/seed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 13 | --model-path mucai/llava-v1.5-7b-m3 \ 14 | --question-file ./playground/data/eval/seed_bench/llava-seed-bench.jsonl \ 15 | --image-folder ./playground/data/eval/seed_bench \ 16 | --answers-file ./playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --matryoshka_vis_token_scale $matryoshka_vis_token_scale \ 20 | --temperature 0 \ 21 | --conv-mode vicuna_v1 & 22 | done 23 | 24 | wait 25 | 26 | output_file=./playground/data/eval/seed_bench/answers/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | # Evaluate 37 | python scripts/convert_seed_for_submission.py \ 38 | --annotation-file ./playground/data/eval/seed_bench/SEED-Bench.json \ 39 | --result-file $output_file \ 40 | --result-upload-file ./playground/data/eval/seed_bench/answers_upload/$CKPT.jsonl 41 | 42 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/sqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 4 | python -m llava.eval.model_vqa_science \ 5 | --model-path mucai/llava-v1.5-7b-m3 \ 6 | --question-file ./playground/data/eval/scienceqa/llava_test_CQM-A.json \ 7 | --image-folder ./playground/data/eval/scienceqa/images/test \ 8 | --answers-file ./playground/data/eval/scienceqa/answers/$CKPT.jsonl \ 9 | --single-pred-prompt \ 10 | --temperature 0 \ 11 | --matryoshka_vis_token_scale $matryoshka_vis_token_scale \ 12 | --conv-mode vicuna_v1 13 | 14 | python llava/eval/eval_science_qa.py \ 15 | --base-dir ./playground/data/eval/scienceqa \ 16 | --result-file ./playground/data/eval/scienceqa/answers/$CKPT.jsonl \ 17 | --output-file ./playground/data/eval/scienceqa/answers/${CKPT}_output.jsonl \ 18 | --output-result ./playground/data/eval/scienceqa/answers/${CKPT}_result.json 19 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 4 | python -m
llava.eval.model_vqa_loader \ 5 | --model-path mucai/llava-v1.5-7b-m3 \ 6 | --question-file ./playground/data/eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 7 | --image-folder ./playground/data/eval/textvqa/train_images \ 8 | --answers-file ./playground/data/eval/textvqa/answers/${CKPT}.jsonl \ 9 | --temperature 0 \ 10 | --matryoshka_vis_token_scale $matryoshka_vis_token_scale \ 11 | --conv-mode vicuna_v1 12 | 13 | python -m llava.eval.eval_textvqa \ 14 | --annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \ 15 | --result-file ./playground/data/eval/textvqa/answers/${CKPT}.jsonl 16 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/vizwiz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 4 | python -m llava.eval.model_vqa_loader \ 5 | --model-path mucai/llava-v1.5-7b-m3 \ 6 | --question-file ./playground/data/eval/vizwiz/llava_test.jsonl \ 7 | --image-folder ./playground/data/eval/vizwiz/test \ 8 | --answers-file ./playground/data/eval/vizwiz/answers/${CKPT}.jsonl \ 9 | --temperature 0 \ 10 | --matryoshka_vis_token_scale $matryoshka_vis_token_scale \ 11 | --conv-mode vicuna_v1 12 | 13 | python scripts/convert_vizwiz_for_submission.py \ 14 | --annotation-file ./playground/data/eval/vizwiz/llava_test.jsonl \ 15 | --result-file ./playground/data/eval/vizwiz/answers/${CKPT}.jsonl \ 16 | --result-upload-file ./playground/data/eval/vizwiz/answers_upload/${CKPT}.json 17 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/vqav2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | SPLIT="llava_vqav2_mscoco_test-dev2015" 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 13 | --model-path mucai/llava-v1.5-7b-m3 \ 14 | --question-file ./playground/data/eval/vqav2/$SPLIT.jsonl \ 15 | --image-folder ./playground/data/eval/vqav2/test2015 \ 16 | --answers-file ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --temperature 0 \ 20 | --matryoshka_vis_token_scale $matryoshka_vis_token_scale \ 21 | --conv-mode vicuna_v1 & 22 | done 23 | 24 | wait 25 | 26 | output_file=./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 
32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | python scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $CKPT 37 | 38 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero3.json \ 5 | --model_name_or_path liuhaotian/llava-v1.5-7b \ 6 | --version v1 \ 7 | --data_path ./playground/data/llava_v1_5_mix665k.json \ 8 | --image_folder ./playground/data \ 9 | --vision_tower openai/clip-vit-large-patch14-336 \ 10 | --mm_projector_type mlp2x_gelu \ 11 | --mm_vision_select_layer -2 \ 12 | --mm_use_im_start_end False \ 13 | --mm_use_im_patch_token False \ 14 | --image_aspect_ratio pad \ 15 | --group_by_modality_length True \ 16 | --bf16 True \ 17 | --output_dir ./checkpoints/llava-v1.5-13b-m3 \ 18 | --num_train_epochs 1 \ 19 | --per_device_train_batch_size 4 \ 20 | --per_device_eval_batch_size 4 \ 21 | --gradient_accumulation_steps 4 \ 22 | --evaluation_strategy "no" \ 23 | --save_strategy "steps" \ 24 | --save_steps 50000 \ 25 | --save_total_limit 1 \ 26 | --learning_rate 2e-5 \ 27 | --weight_decay 0. \ 28 | --warmup_ratio 0.03 \ 29 | --lr_scheduler_type "cosine" \ 30 | --logging_steps 1 \ 31 | --tf32 True \ 32 | --model_max_length 2048 \ 33 | --gradient_checkpointing True \ 34 | --dataloader_num_workers 4 \ 35 | --lazy_preprocess True \ 36 | --matryoshka_vis_token_scale "1,9,36,144,576" \ 37 | --unfreeze_mm_vision_tower True \ 38 | --mm_vision_tower_lr 2e-5 \ 39 | --report_to wandb 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 5 | --deepspeed ./scripts/zero3.json \ 6 | --model_name_or_path liuhaotian/llava-v1.5-7b \ 7 | --version v1 \ 8 | --data_path ./playground/data/llava_v1_5_mix665k.json \ 9 | --image_folder ./playground/data \ 10 | --vision_tower openai/clip-vit-large-patch14-336 \ 11 | --mm_projector_type mlp2x_gelu \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --image_aspect_ratio pad \ 16 | --group_by_modality_length True \ 17 | --bf16 True \ 18 | --output_dir ./checkpoints/llava-v1.5-7b-m3-lora \ 19 | --num_train_epochs 1 \ 20 | --per_device_train_batch_size 4 \ 21 | --per_device_eval_batch_size 4 \ 22 | --gradient_accumulation_steps 4 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 50000 \ 26 | --save_total_limit 1 \ 27 | --learning_rate 2e-4 \ 28 | --weight_decay 0. 
\ 29 | --warmup_ratio 0.03 \ 30 | --lr_scheduler_type "cosine" \ 31 | --logging_steps 1 \ 32 | --tf32 True \ 33 | --model_max_length 2048 \ 34 | --gradient_checkpointing True \ 35 | --dataloader_num_workers 4 \ 36 | --lazy_preprocess True \ 37 | --matryoshka_vis_token_scale "1,9,36,144,576" \ 38 | --unfreeze_mm_vision_tower True \ 39 | --mm_vision_tower_lr 2e-5 \ 40 | --report_to wandb 41 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune_task.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero3.json \ 5 | --model_name_or_path mucai/llava-v1.5-7b-m3 \ 6 | --version v1 \ 7 | --data_path ./playground/data/llava_v1_5_mix665k.json \ 8 | --image_folder ./playground/data \ 9 | --vision_tower openai/clip-vit-large-patch14-336 \ 10 | --mm_projector_type mlp2x_gelu \ 11 | --mm_vision_select_layer -2 \ 12 | --mm_use_im_start_end False \ 13 | --mm_use_im_patch_token False \ 14 | --image_aspect_ratio pad \ 15 | --group_by_modality_length True \ 16 | --bf16 True \ 17 | --output_dir ./checkpoints/llava-v1.5-7b-m3-task \ 18 | --num_train_epochs 1 \ 19 | --per_device_train_batch_size 4 \ 20 | --per_device_eval_batch_size 4 \ 21 | --gradient_accumulation_steps 4 \ 22 | --evaluation_strategy "no" \ 23 | --save_strategy "steps" \ 24 | --save_steps 50000 \ 25 | --save_total_limit 1 \ 26 | --learning_rate 2e-5 \ 27 | --weight_decay 0. \ 28 | --warmup_ratio 0.03 \ 29 | --lr_scheduler_type "cosine" \ 30 | --logging_steps 1 \ 31 | --tf32 True \ 32 | --model_max_length 2048 \ 33 | --gradient_checkpointing True \ 34 | --dataloader_num_workers 4 \ 35 | --lazy_preprocess True \ 36 | --matryoshka_vis_token_scale "1,9,36,144,576" \ 37 | --unfreeze_mm_vision_tower True \ 38 | --mm_vision_tower_lr 2e-5 \ 39 | --report_to wandb 40 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune_task_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 5 | --deepspeed ./scripts/zero3.json \ 6 | --model_name_or_path mucai/llava-v1.5-7b-m3 \ 7 | --version v1 \ 8 | --data_path ./playground/data/llava_v1_5_mix665k.json \ 9 | --image_folder ./playground/data \ 10 | --vision_tower openai/clip-vit-large-patch14-336 \ 11 | --mm_projector_type mlp2x_gelu \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --image_aspect_ratio pad \ 16 | --group_by_modality_length True \ 17 | --bf16 True \ 18 | --output_dir ./checkpoints/llava-v1.5-7b-m3-task-lora \ 19 | --num_train_epochs 1 \ 20 | --per_device_train_batch_size 4 \ 21 | --per_device_eval_batch_size 4 \ 22 | --gradient_accumulation_steps 4 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 50000 \ 26 | --save_total_limit 1 \ 27 | --learning_rate 2e-4 \ 28 | --weight_decay 0. 
\ 29 | --warmup_ratio 0.03 \ 30 | --lr_scheduler_type "cosine" \ 31 | --logging_steps 1 \ 32 | --tf32 True \ 33 | --model_max_length 2048 \ 34 | --gradient_checkpointing True \ 35 | --dataloader_num_workers 4 \ 36 | --lazy_preprocess True \ 37 | --matryoshka_vis_token_scale "1,9,36,144,576" \ 38 | --unfreeze_mm_vision_tower True \ 39 | --mm_vision_tower_lr 2e-5 \ 40 | --report_to wandb 41 | -------------------------------------------------------------------------------- /scripts/v1_5/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero2.json \ 5 | --model_name_or_path lmsys/vicuna-13b-v1.5 \ 6 | --version plain \ 7 | --data_path ./playground/data/LLaVA-Pretrain/blip_laion_cc_sbu_558k.json \ 8 | --image_folder ./playground/data/LLaVA-Pretrain/images \ 9 | --vision_tower openai/clip-vit-large-patch14-336 \ 10 | --mm_projector_type mlp2x_gelu \ 11 | --tune_mm_mlp_adapter True \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --bf16 True \ 16 | --output_dir ./checkpoints/llava-v1.5-13b-pretrain \ 17 | --num_train_epochs 1 \ 18 | --per_device_train_batch_size 32 \ 19 | --per_device_eval_batch_size 4 \ 20 | --gradient_accumulation_steps 1 \ 21 | --evaluation_strategy "no" \ 22 | --save_strategy "steps" \ 23 | --save_steps 24000 \ 24 | --save_total_limit 1 \ 25 | --learning_rate 1e-3 \ 26 | --weight_decay 0. \ 27 | --warmup_ratio 0.03 \ 28 | --lr_scheduler_type "cosine" \ 29 | --logging_steps 1 \ 30 | --tf32 True \ 31 | --model_max_length 2048 \ 32 | --gradient_checkpointing True \ 33 | --dataloader_num_workers 4 \ 34 | --lazy_preprocess True \ 35 | --report_to wandb 36 | -------------------------------------------------------------------------------- /scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /scripts/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- 
/scripts/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "gather_16bit_weights_on_model_save": true 49 | }, 50 | "gradient_accumulation_steps": "auto", 51 | "gradient_clipping": "auto", 52 | "train_batch_size": "auto", 53 | "train_micro_batch_size_per_gpu": "auto", 54 | "steps_per_print": 1e5, 55 | "wall_clock_breakdown": false 56 | } --------------------------------------------------------------------------------
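Usage sketch (an assumed calling convention, not a shipped entry point): most of the eval scripts above read the matryoshka visual-token scale as their first positional argument ($1) and fold it into the answer-file name via $CKPT, and the finetune scripts train with --matryoshka_vis_token_scale "1,9,36,144,576", so evaluating at one of those trained scales would look roughly like:

# hypothetical invocations: evaluate mucai/llava-v1.5-7b-m3 at 144 and 576 visual tokens
bash scripts/v1_5/eval/mme.sh 144
bash scripts/v1_5/eval/sqa.sh 576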
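Similarly, scripts/zero3_offload.json is a ZeRO-3 config with optimizer and parameter offload to CPU; none of the training scripts above reference it, so the following is only an assumed way to use it when GPU memory is tight (expect slower steps than with plain zero3.json):

# hypothetical: derive an offload variant of finetune.sh by swapping the DeepSpeed config it passes
sed 's#scripts/zero3.json#scripts/zero3_offload.json#' scripts/v1_5/finetune.sh > scripts/v1_5/finetune_offload.sh
bash scripts/v1_5/finetune_offload.sh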