├── .devcontainer ├── Dockerfile ├── devcontainer.env ├── devcontainer.json └── postCreateCommand.sh ├── .dockerignore ├── .editorconfig ├── .gitattributes ├── .github └── ISSUE_TEMPLATE │ ├── 1-usage.yaml │ ├── 2-feature-request.yaml │ ├── 3-question.yaml │ └── 4-discussion.yaml ├── .gitignore ├── IG-VLM ├── LICENSE ├── README.md ├── data │ ├── multiple_choice_qa │ │ ├── EgoSchema.csv │ │ ├── IntentQA.csv │ │ ├── NExT_QA.csv │ │ ├── STAR.csv │ │ └── TVQA.csv │ ├── open_ended_qa │ │ ├── ActivityNet_QA.csv │ │ ├── MSRVTT_QA.csv │ │ ├── MSVD_QA.csv │ │ └── TGIF_FrameQA.csv │ └── text_generation_benchmark │ │ ├── Consistency_QA1.csv │ │ ├── Consistency_QA2.csv │ │ ├── Generic_QA.csv │ │ └── Temporal_QA.csv ├── eval_gpt4v_multiplechoice.py ├── eval_gpt4v_openended.py ├── eval_gpt4v_textgeneration_openended.py ├── eval_llava_multiplechoice.py ├── eval_llava_openended.py ├── eval_llava_textgeneration_openended.py ├── evaluation │ ├── __init__.py │ ├── direct_answer_eval.py │ ├── gpt3_consistency_utils.py │ └── gpt3_evaluation_utils.py ├── example │ ├── QzulzuWr0Zw_20_45.mp4 │ ├── extraction_sample │ │ ├── rlQ2kW-FvMk_66_79 │ │ │ ├── 1.jpg │ │ │ ├── 2.jpg │ │ │ ├── 3.jpg │ │ │ ├── 4.jpg │ │ │ ├── 5.jpg │ │ │ └── 6.jpg │ │ └── ysTmUTQ5wZE_17_45 │ │ │ ├── 1.jpg │ │ │ ├── 2.jpg │ │ │ ├── 3.jpg │ │ │ ├── 4.jpg │ │ │ ├── 5.jpg │ │ │ └── 6.jpg │ ├── imagegrid_sample │ │ ├── QzulzuWr0Zw_20_45.jpg │ │ └── rlQ2kW-FvMk_66_79.jpg │ ├── rlQ2kW-FvMk_66_79.mp4 │ └── ysTmUTQ5wZE_17_45.mp4 ├── model_processor │ ├── __init__.py │ ├── base_model_inference.py │ ├── gpt4_model_processor.py │ └── llava2_model_processor.py ├── pipeline_processor │ ├── __init__.py │ ├── gpt4_pipeline.py │ ├── llava_pipeline.py │ └── record.py ├── requirements.txt ├── scripts │ └── nextqa_multi.sh └── vision_processor │ ├── __init__.py │ ├── base_frame_extractor.py │ ├── base_post_processor.py │ ├── decorator_processor.py │ ├── fps_extractor.py │ ├── fps_gridview_processor.py │ ├── gridview_generator.py │ └── video_validation.py ├── LICENSE ├── README.md ├── cog.yaml ├── docs ├── Customize_Component.md ├── Data.md ├── Evaluation.md ├── Finetune_Custom_Data.md ├── Intel.md ├── LLaVA_Bench.md ├── LLaVA_from_LLaMA2.md ├── LoRA.md ├── MODEL_ZOO.md ├── ScienceQA.md ├── Windows.md └── macOS.md ├── images ├── demo_cli.gif ├── llava_example_cmp.png ├── llava_logo.png ├── llava_v1_5_radar.jpg └── teaser.png ├── llava ├── __init__.py ├── constants.py ├── conversation.py ├── eval │ ├── eval_gpt_review.py │ ├── eval_gpt_review_bench.py │ ├── eval_gpt_review_visual.py │ ├── eval_pope.py │ ├── eval_science_qa.py │ ├── eval_science_qa_gpt4.py │ ├── eval_science_qa_gpt4_requery.py │ ├── eval_textvqa.py │ ├── generate_webpage_data_from_table.py │ ├── m4c_evaluator.py │ ├── model_qa.py │ ├── model_vqa.py │ ├── model_vqa_loader.py │ ├── model_vqa_mmbench.py │ ├── model_vqa_science.py │ ├── qa_baseline_gpt35.py │ ├── run_llava.py │ ├── summarize_gpt_review.py │ ├── table │ │ ├── answer │ │ │ ├── answer_alpaca-13b.jsonl │ │ │ ├── answer_bard.jsonl │ │ │ ├── answer_gpt35.jsonl │ │ │ ├── answer_llama-13b.jsonl │ │ │ └── answer_vicuna-13b.jsonl │ │ ├── caps_boxes_coco2014_val_80.jsonl │ │ ├── model.jsonl │ │ ├── prompt.jsonl │ │ ├── question.jsonl │ │ ├── results │ │ │ ├── test_sqa_llava_13b_v0.json │ │ │ └── test_sqa_llava_lcs_558k_sqa_12e_vicuna_v1_3_13b.json │ │ ├── review │ │ │ ├── review_alpaca-13b_vicuna-13b.jsonl │ │ │ ├── review_bard_vicuna-13b.jsonl │ │ │ ├── review_gpt35_vicuna-13b.jsonl │ │ │ └── review_llama-13b_vicuna-13b.jsonl │ │ ├── 
reviewer.jsonl │ │ └── rule.json │ └── webpage │ │ ├── figures │ │ ├── alpaca.png │ │ ├── bard.jpg │ │ ├── chatgpt.svg │ │ ├── llama.jpg │ │ ├── swords_FILL0_wght300_GRAD0_opsz48.svg │ │ └── vicuna.jpeg │ │ ├── index.html │ │ ├── script.js │ │ └── styles.css ├── mm_utils.py ├── model │ ├── __init__.py │ ├── apply_delta.py │ ├── builder.py │ ├── consolidate.py │ ├── language_model │ │ ├── llava_llama.py │ │ ├── llava_mistral.py │ │ └── llava_mpt.py │ ├── llava_arch.py │ ├── make_delta.py │ ├── multimodal_encoder │ │ ├── builder.py │ │ └── clip_encoder.py │ ├── multimodal_projector │ │ └── builder.py │ └── utils.py ├── serve │ ├── __init__.py │ ├── cli.py │ ├── controller.py │ ├── examples │ │ ├── extreme_ironing.jpg │ │ ├── m3-demo-1.jpg │ │ ├── m3-demo.jpg │ │ ├── sample_demo_1.mp4 │ │ ├── sample_demo_3.mp4 │ │ └── waterview.jpg │ ├── gradio_web_server.py │ ├── model_worker.py │ ├── register_worker.py │ ├── sglang_worker.py │ └── test_message.py ├── train │ ├── llama_flash_attn_monkey_patch.py │ ├── llama_xformers_attn_monkey_patch.py │ ├── llava_trainer.py │ ├── train.py │ ├── train_mem.py │ └── train_xformers.py └── utils.py ├── lmms-eval ├── LICENSE ├── README.md ├── docs │ ├── README.md │ ├── commands.md │ ├── model_guide.md │ └── task_guide.md ├── example_eval.yaml ├── llava_repr_requirements.txt ├── lmms_eval │ ├── __init__.py │ ├── __main__.py │ ├── api │ │ ├── __init__.py │ │ ├── filter.py │ │ ├── instance.py │ │ ├── metrics.py │ │ ├── model.py │ │ ├── registry.py │ │ ├── samplers.py │ │ └── task.py │ ├── evaluator.py │ ├── filters │ │ ├── __init__.py │ │ ├── decontamination.py │ │ ├── extraction.py │ │ ├── selection.py │ │ └── transformation.py │ ├── logging_utils.py │ ├── models │ │ ├── __init__.py │ │ ├── fuyu.py │ │ ├── gpt4v.py │ │ ├── idefics2.py │ │ ├── instructblip.py │ │ ├── llava.py │ │ ├── llava_hf.py │ │ ├── llava_sglang.py │ │ ├── minicpm_v.py │ │ ├── model_utils │ │ │ ├── __init__.py │ │ │ └── qwen │ │ │ │ └── qwen_generate_utils.py │ │ ├── phi3v.py │ │ ├── qwen_vl.py │ │ ├── qwen_vl_api.py │ │ └── xcomposer2_4KHD.py │ ├── tasks │ │ ├── __init__.py │ │ ├── _task_utils │ │ │ ├── file_utils.py │ │ │ ├── gpt_eval_utils.py │ │ │ └── vqa_eval_metric.py │ │ ├── ai2d │ │ │ ├── ai2d.yaml │ │ │ ├── upload_ai2d.py │ │ │ └── utils.py │ │ ├── chartqa │ │ │ ├── chartqa.yaml │ │ │ ├── upload_chartqa.py │ │ │ └── utils.py │ │ ├── cmmmu │ │ │ ├── _cmmmu.yaml │ │ │ ├── _default_template_cmmmu_yaml │ │ │ ├── cmmmu_test.yaml │ │ │ ├── cmmmu_val.yaml │ │ │ └── utils.py │ │ ├── coco_cap │ │ │ ├── coco2014_cap.yaml │ │ │ ├── coco2014_cap_test.yaml │ │ │ ├── coco2014_cap_val.yaml │ │ │ ├── coco2017_cap.yaml │ │ │ ├── coco2017_cap_test.yaml │ │ │ ├── coco2017_cap_val.yaml │ │ │ ├── coco_cap.yaml │ │ │ └── utils.py │ │ ├── docvqa │ │ │ ├── _default_template_docvqa_yaml │ │ │ ├── docvqa.yaml │ │ │ ├── docvqa_test.yaml │ │ │ ├── docvqa_val.yaml │ │ │ └── utils.py │ │ ├── ferret │ │ │ ├── ferret.yaml │ │ │ └── utils.py │ │ ├── flickr30k │ │ │ ├── flickr30k.yaml │ │ │ ├── flickr30k_test.yaml │ │ │ └── utils.py │ │ ├── gqa │ │ │ ├── gqa.yaml │ │ │ └── utils.py │ │ ├── hallusion_bench │ │ │ ├── evaluate_hb.py │ │ │ ├── hallusion_bench_image.yaml │ │ │ └── utils.py │ │ ├── iconqa │ │ │ ├── _default_template_docvqa_yaml │ │ │ ├── iconqa.yaml │ │ │ ├── iconqa_test.yaml │ │ │ ├── iconqa_val.yaml │ │ │ └── utils.py │ │ ├── infovqa │ │ │ ├── _default_template_infovqa_yaml │ │ │ ├── infovqa.yaml │ │ │ ├── infovqa_test.yaml │ │ │ ├── infovqa_val.yaml │ │ │ └── utils.py │ │ ├── llava-bench-coco │ │ │ ├── 
llava-bench-coco.yaml │ │ │ └── utils.py │ │ ├── llava-in-the-wild │ │ │ ├── llava-in-the-wild.yaml │ │ │ └── utils.py │ │ ├── mathverse │ │ │ ├── mathverse.yaml │ │ │ ├── mathverse_evals.py │ │ │ ├── mathverse_testmini.yaml │ │ │ ├── mathverse_testmini_text_dominant.yaml │ │ │ ├── mathverse_testmini_text_lite.yaml │ │ │ ├── mathverse_testmini_text_only.yaml │ │ │ ├── mathverse_testmini_vision_dominant.yaml │ │ │ ├── mathverse_testmini_vision_intensive.yaml │ │ │ ├── mathverse_testmini_vision_only.yaml │ │ │ └── utils.py │ │ ├── mathvista │ │ │ ├── mathvista.yaml │ │ │ ├── mathvista_evals.py │ │ │ ├── mathvista_test.yaml │ │ │ ├── mathvista_testmini.yaml │ │ │ └── utils.py │ │ ├── mmbench │ │ │ ├── _default_template_mmbench_cn_yaml │ │ │ ├── _default_template_mmbench_en_yaml │ │ │ ├── cc_utils.py │ │ │ ├── cn_utils.py │ │ │ ├── en_utils.py │ │ │ ├── mmbench.yaml │ │ │ ├── mmbench_cc.yaml │ │ │ ├── mmbench_cn.yaml │ │ │ ├── mmbench_cn_dev.yaml │ │ │ ├── mmbench_cn_test.yaml │ │ │ ├── mmbench_en.yaml │ │ │ ├── mmbench_en_dev.yaml │ │ │ ├── mmbench_en_test.yaml │ │ │ └── mmbench_evals.py │ │ ├── mme │ │ │ ├── mme.yaml │ │ │ └── utils.py │ │ ├── mmmu │ │ │ ├── mmmu.yaml │ │ │ ├── mmmu_group_img.yaml │ │ │ ├── mmmu_group_img_test.yaml │ │ │ ├── mmmu_group_img_val.yaml │ │ │ ├── mmmu_test.yaml │ │ │ ├── mmmu_val.yaml │ │ │ ├── utils.py │ │ │ └── utils_group_img.py │ │ ├── mmvet │ │ │ ├── mmvet.yaml │ │ │ └── utils.py │ │ ├── multidocvqa │ │ │ ├── multidocvqa.yaml │ │ │ ├── multidocvqa_test.yaml │ │ │ ├── multidocvqa_val.yaml │ │ │ └── utils.py │ │ ├── multilingual-llava-bench-in-the-wild │ │ │ ├── README.md │ │ │ ├── _default_template.yaml │ │ │ ├── arabic_llava_in_the_wild.yaml │ │ │ ├── bengali_llava_in_the_wild.yaml │ │ │ ├── chinese_llava_in_the_wild.yaml │ │ │ ├── french_llava_in_the_wild.yaml │ │ │ ├── hindi_llava_in_the_wild.yaml │ │ │ ├── japanese_llava_in_the_wild.yaml │ │ │ ├── russian_llava_in_the_wild.yaml │ │ │ ├── spanish_llava_in_the_wild.yaml │ │ │ ├── urdu_llava_in_the_wild.yaml │ │ │ └── utils.py │ │ ├── nocaps │ │ │ ├── _default_template_nocaps_yaml │ │ │ ├── nocaps.yaml │ │ │ ├── nocaps_test.yaml │ │ │ ├── nocaps_val.yaml │ │ │ └── utils.py │ │ ├── ocrbench │ │ │ ├── ocrbench.yaml │ │ │ ├── upload_ocrbench.py │ │ │ └── utils.py │ │ ├── ok_vqa │ │ │ ├── _default_template_vqa_yaml │ │ │ ├── _generate_config.py │ │ │ ├── _ok_vqa.yaml │ │ │ ├── ok_vqa_val2014.yaml │ │ │ └── utils.py │ │ ├── olympiadbench │ │ │ ├── cn_utils.py │ │ │ ├── en_utils.py │ │ │ ├── olympiadbench.yaml │ │ │ ├── olympiadbench_evals.py │ │ │ ├── olympiadbench_test_cn.yaml │ │ │ └── olympiadbench_test_en.yaml │ │ ├── pope │ │ │ ├── pope.yaml │ │ │ ├── pope_adv.yaml │ │ │ ├── pope_full.yaml │ │ │ ├── pope_pop.yaml │ │ │ ├── pope_random.yaml │ │ │ └── utils.py │ │ ├── realworldqa │ │ │ ├── realworldqa.yaml │ │ │ └── utils.py │ │ ├── refcoco+ │ │ │ ├── _default_template_bbox_rec_yaml │ │ │ ├── _default_template_bbox_yaml │ │ │ ├── _default_template_seg_yaml │ │ │ ├── _generate_config.py │ │ │ ├── _refcoco.yaml │ │ │ ├── refcoco+_bbox_rec_testA.yaml │ │ │ ├── refcoco+_bbox_rec_testB.yaml │ │ │ ├── refcoco+_bbox_rec_val.yaml │ │ │ ├── refcoco+_bbox_testA.yaml │ │ │ ├── refcoco+_bbox_testB.yaml │ │ │ ├── refcoco+_bbox_val.yaml │ │ │ ├── refcoco+_seg_testA.yaml │ │ │ ├── refcoco+_seg_testB.yaml │ │ │ ├── refcoco+_seg_val.yaml │ │ │ ├── utils.py │ │ │ └── utils_rec.py │ │ ├── refcoco │ │ │ ├── _default_template_bbox_rec_yaml │ │ │ ├── _default_template_bbox_yaml │ │ │ ├── _default_template_seg_yaml │ │ │ ├── 
_generate_config.py │ │ │ ├── _refcoco.yaml │ │ │ ├── refcoco_bbox_rec_test.yaml │ │ │ ├── refcoco_bbox_rec_testA.yaml │ │ │ ├── refcoco_bbox_rec_testB.yaml │ │ │ ├── refcoco_bbox_rec_val.yaml │ │ │ ├── refcoco_bbox_test.yaml │ │ │ ├── refcoco_bbox_testA.yaml │ │ │ ├── refcoco_bbox_testB.yaml │ │ │ ├── refcoco_bbox_val.yaml │ │ │ ├── refcoco_seg_test.yaml │ │ │ ├── refcoco_seg_testA.yaml │ │ │ ├── refcoco_seg_testB.yaml │ │ │ ├── refcoco_seg_val.yaml │ │ │ ├── utils.py │ │ │ └── utils_rec.py │ │ ├── refcocog │ │ │ ├── _default_template_bbox_rec_yaml │ │ │ ├── _default_template_bbox_yaml │ │ │ ├── _default_template_seg_yaml │ │ │ ├── _generate_config.py │ │ │ ├── _refcoco.yaml │ │ │ ├── refcocog_bbox_rec_test.yaml │ │ │ ├── refcocog_bbox_rec_val.yaml │ │ │ ├── refcocog_bbox_test.yaml │ │ │ ├── refcocog_bbox_val.yaml │ │ │ ├── refcocog_seg_test.yaml │ │ │ ├── refcocog_seg_val.yaml │ │ │ ├── utils.py │ │ │ └── utils_rec.py │ │ ├── scienceqa │ │ │ ├── scienceqa.yaml │ │ │ ├── scienceqa_full.yaml │ │ │ ├── scienceqa_img.yaml │ │ │ └── utils.py │ │ ├── screenspot │ │ │ ├── README.md │ │ │ ├── _default_template_rec_yaml │ │ │ ├── _default_template_reg_yaml │ │ │ ├── _screenspot.yaml │ │ │ ├── screenspot_rec_test.yaml │ │ │ ├── screenspot_reg_test.yaml │ │ │ ├── utils.py │ │ │ └── utils_rec.py │ │ ├── seedbench │ │ │ ├── seedbench.yaml │ │ │ ├── seedbench_ppl.yaml │ │ │ └── utils.py │ │ ├── seedbench_2 │ │ │ ├── seedbench_2.yaml │ │ │ └── utils.py │ │ ├── stvqa │ │ │ ├── stvqa.yaml │ │ │ └── utils.py │ │ ├── textcaps │ │ │ ├── _default_template_textcaps_yaml │ │ │ ├── textcaps.yaml │ │ │ ├── textcaps_test.yaml │ │ │ ├── textcaps_train.yaml │ │ │ ├── textcaps_val.yaml │ │ │ └── utils.py │ │ ├── textvqa │ │ │ ├── _default_template_textvqa_yaml │ │ │ ├── _textvqa.yaml │ │ │ ├── textvqa_test.yaml │ │ │ ├── textvqa_val.yaml │ │ │ └── utils.py │ │ ├── vizwiz_vqa │ │ │ ├── _default_template_vqa_yaml │ │ │ ├── _generate_config.py │ │ │ ├── _vizwiz_vqa.yaml │ │ │ ├── utils.py │ │ │ ├── vizwiz_vqa_test.yaml │ │ │ └── vizwiz_vqa_val.yaml │ │ ├── vqav2 │ │ │ ├── _default_template_vqav2_yaml │ │ │ ├── _vqav2.yaml │ │ │ ├── utils.py │ │ │ ├── vqav2_test.yaml │ │ │ └── vqav2_val.yaml │ │ └── websrc │ │ │ ├── README.md │ │ │ ├── utils.py │ │ │ ├── websrc.yaml │ │ │ ├── websrc_test.yaml │ │ │ └── websrc_val.yaml │ └── utils.py ├── miscs │ ├── llava_result_check.md │ ├── repr_scripts.sh │ ├── repr_torch_envs.txt │ ├── scienceqa_id.txt │ ├── script.sh │ ├── test_llava.py │ └── test_scienceqa.py ├── pyproject.toml └── setup.py ├── playground └── data │ ├── coco2014_val_gpt4_qa_30x3.jsonl │ ├── coco2014_val_qa_eval │ ├── qa90_gpt4_answer.jsonl │ └── qa90_questions.jsonl │ └── prompts │ ├── complex_reasoning │ ├── 000_caps.txt │ ├── 000_conv.txt │ ├── 001_caps.txt │ ├── 001_conv.txt │ ├── 002_caps.txt │ ├── 002_conv.txt │ └── system_message.txt │ ├── conversation │ ├── 000_caps.txt │ ├── 000_conv.txt │ ├── 001_caps.txt │ ├── 001_conv.txt │ └── system_message.txt │ └── detail_description │ ├── 000_caps.txt │ ├── 000_conv.txt │ ├── 001_caps.txt │ ├── 001_conv.txt │ ├── 002_caps.txt │ ├── 002_conv.txt │ └── system_message.txt ├── predict.py ├── pyproject.toml └── scripts ├── convert_gqa_for_eval.py ├── convert_mmbench_for_submission.py ├── convert_mmvet_for_eval.py ├── convert_seed_for_submission.py ├── convert_sqa_to_llava.py ├── convert_sqa_to_llava_base_prompt.py ├── convert_vizwiz_for_submission.py ├── convert_vqav2_for_submission.py ├── extract_mm_projector.py ├── finetune.sh ├── finetune_full_schedule.sh ├── 
finetune_lora.sh ├── finetune_qlora.sh ├── finetune_sqa.sh ├── merge_lora_weights.py ├── pretrain.sh ├── pretrain_xformers.sh ├── sqa_eval_batch.sh ├── sqa_eval_gather.sh ├── upload_pypi.sh ├── v1_5 ├── eval │ ├── gqa.sh │ ├── llavabench.sh │ ├── mmbench.sh │ ├── mmbench_cn.sh │ ├── mme-llava-next.sh │ ├── mme.sh │ ├── mmvet.sh │ ├── pope.sh │ ├── qbench.sh │ ├── qbench_zh.sh │ ├── seed.sh │ ├── sqa.sh │ ├── textvqa.sh │ ├── vizwiz.sh │ └── vqav2.sh ├── finetune.sh ├── finetune_lora.sh ├── finetune_task.sh ├── finetune_task_lora.sh └── pretrain.sh ├── zero2.json ├── zero3.json └── zero3_offload.json /.devcontainer/devcontainer.env: -------------------------------------------------------------------------------- 1 | SAMPLE_ENV_VAR1="Sample Value" 2 | SAMPLE_ENV_VAR2=332431bf-68bf -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # The .dockerignore file excludes files from the container build process. 2 | # 3 | # https://docs.docker.com/engine/reference/builder/#dockerignore-file 4 | 5 | # Exclude Git files 6 | .git 7 | .github 8 | .gitignore 9 | 10 | # Exclude Python cache files 11 | __pycache__ 12 | .mypy_cache 13 | .pytest_cache 14 | .ruff_cache 15 | 16 | # Exclude Python virtual environment 17 | /venv 18 | 19 | # Exclude some weights 20 | /openai 21 | /liuhaotian 22 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | # Unix-style newlines with a newline ending every file 4 | [*] 5 | end_of_line = lf 6 | insert_final_newline = true 7 | trim_trailing_whitespace = true 8 | charset = utf-8 9 | 10 | # 4 space indentation 11 | [*.{py,json}] 12 | indent_style = space 13 | indent_size = 4 14 | 15 | # 2 space indentation 16 | [*.{md,sh,yaml,yml}] 17 | indent_style = space 18 | indent_size = 2 -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # https://git-scm.com/docs/gitattributes 2 | 3 | # Set the default behavior, in case people don't have core.autocrlf set. 4 | # https://git-scm.com/docs/gitattributes#_end_of_line_conversion 5 | * text=auto 6 | 7 | # common python attributes, taken from https://github.com/alexkaratarakis/gitattributes/blob/710900479a2bedeec7003d381719521ffbb18bf8/Python.gitattributes 8 | # Source files 9 | # ============ 10 | *.pxd text diff=python 11 | *.py text diff=python 12 | *.py3 text diff=python 13 | *.pyw text diff=python 14 | *.pyx text diff=python 15 | *.pyz text diff=python 16 | *.pyi text diff=python 17 | 18 | # Binary files 19 | # ============ 20 | *.db binary 21 | *.p binary 22 | *.pkl binary 23 | *.pickle binary 24 | *.pyc binary export-ignore 25 | *.pyo binary export-ignore 26 | *.pyd binary 27 | 28 | # Jupyter notebook 29 | *.ipynb text eol=lf 30 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/1-usage.yaml: -------------------------------------------------------------------------------- 1 | name: Usage issues 2 | description: Report issues in usage. 3 | title: "[Usage] " 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Thanks for taking the time to fill out this form. 
Please give as detailed a description as possible for us to better assist with the issue :) 9 | - type: textarea 10 | id: what-happened 11 | attributes: 12 | label: Describe the issue 13 | description: Please give as detailed a description as possible for us to better assist with the issue. Please paste the **FULL** error log here, so that we can better understand the issue. Wrap the log with ``` for better readability in GitHub. 14 | placeholder: Issue 15 | value: | 16 | Issue: 17 | 18 | Command: 19 | ``` 20 | PASTE THE COMMANDS HERE. 21 | ``` 22 | 23 | Log: 24 | ``` 25 | PASTE THE LOGS HERE. 26 | ``` 27 | 28 | Screenshots: 29 | You may attach screenshots if it better explains the issue. 30 | validations: 31 | required: true 32 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/2-feature-request.yaml: -------------------------------------------------------------------------------- 1 | name: Feature Request 2 | description: Request for a new feature 3 | title: "[Feature request] " 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Thanks for your interest in our work. Please share your thoughts on the new feature below. 9 | - type: textarea 10 | id: feature 11 | attributes: 12 | label: feature 13 | placeholder: Start your thoughts here... -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/3-question.yaml: -------------------------------------------------------------------------------- 1 | name: Questions 2 | description: General questions about the work 3 | title: "[Question] " 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Thanks for your interest in our work. For this type of question, it may be more suitable to go to the [discussions](https://github.com/haotian-liu/LLaVA/discussions) section. If you believe an issue would be better for your request, please continue your post below :) 9 | - type: textarea 10 | id: question 11 | attributes: 12 | label: Question 13 | placeholder: Start question here... -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/4-discussion.yaml: -------------------------------------------------------------------------------- 1 | name: Discussions 2 | description: General discussions about the work 3 | title: "[Discussion] " 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Thanks for your interest in our work. For this type of discussion, it may be more suitable to go to the [discussions](https://github.com/haotian-liu/LLaVA/discussions) section. If you believe an issue would be better for your request, please continue your post below :) 9 | - type: textarea 10 | id: discussion 11 | attributes: 12 | label: Discussion 13 | placeholder: Start discussion here...
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__ 3 | *.pyc 4 | *.egg-info 5 | dist 6 | 7 | # Log 8 | *.log 9 | *.log.* 10 | *.json 11 | *.jsonl 12 | 13 | # Data 14 | !**/alpaca-data-conversation.json 15 | 16 | # Editor 17 | .idea 18 | *.swp 19 | 20 | # Other 21 | .DS_Store 22 | wandb 23 | output 24 | 25 | checkpoints 26 | ckpts* 27 | 28 | .ipynb_checkpoints 29 | *.ipynb 30 | 31 | # DevContainer 32 | !.devcontainer/* 33 | 34 | # Demo 35 | serve_images/ 36 | llava/eval/table/results/test_sqa_llava_lcs_558k_sqa_12e_vicuna_v1_3_13b.json 37 | IG-VLM/.git_org 38 | lmms-eval/.git_org 39 | llava/eval/table/results/test_sqa_llava_13b_v0.json 40 | -------------------------------------------------------------------------------- /IG-VLM/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright <2024> 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 4 | 5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | 7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | 9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
12 | 13 | -------------------------------------------------------------------------------- /IG-VLM/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/evaluation/__init__.py -------------------------------------------------------------------------------- /IG-VLM/example/QzulzuWr0Zw_20_45.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/QzulzuWr0Zw_20_45.mp4 -------------------------------------------------------------------------------- /IG-VLM/example/extraction_sample/rlQ2kW-FvMk_66_79/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/extraction_sample/rlQ2kW-FvMk_66_79/1.jpg -------------------------------------------------------------------------------- /IG-VLM/example/extraction_sample/rlQ2kW-FvMk_66_79/2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/extraction_sample/rlQ2kW-FvMk_66_79/2.jpg -------------------------------------------------------------------------------- /IG-VLM/example/extraction_sample/rlQ2kW-FvMk_66_79/3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/extraction_sample/rlQ2kW-FvMk_66_79/3.jpg -------------------------------------------------------------------------------- /IG-VLM/example/extraction_sample/rlQ2kW-FvMk_66_79/4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/extraction_sample/rlQ2kW-FvMk_66_79/4.jpg -------------------------------------------------------------------------------- /IG-VLM/example/extraction_sample/rlQ2kW-FvMk_66_79/5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/extraction_sample/rlQ2kW-FvMk_66_79/5.jpg -------------------------------------------------------------------------------- /IG-VLM/example/extraction_sample/rlQ2kW-FvMk_66_79/6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/extraction_sample/rlQ2kW-FvMk_66_79/6.jpg -------------------------------------------------------------------------------- /IG-VLM/example/extraction_sample/ysTmUTQ5wZE_17_45/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/extraction_sample/ysTmUTQ5wZE_17_45/1.jpg -------------------------------------------------------------------------------- /IG-VLM/example/extraction_sample/ysTmUTQ5wZE_17_45/2.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/extraction_sample/ysTmUTQ5wZE_17_45/2.jpg -------------------------------------------------------------------------------- /IG-VLM/example/extraction_sample/ysTmUTQ5wZE_17_45/3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/extraction_sample/ysTmUTQ5wZE_17_45/3.jpg -------------------------------------------------------------------------------- /IG-VLM/example/extraction_sample/ysTmUTQ5wZE_17_45/4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/extraction_sample/ysTmUTQ5wZE_17_45/4.jpg -------------------------------------------------------------------------------- /IG-VLM/example/extraction_sample/ysTmUTQ5wZE_17_45/5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/extraction_sample/ysTmUTQ5wZE_17_45/5.jpg -------------------------------------------------------------------------------- /IG-VLM/example/extraction_sample/ysTmUTQ5wZE_17_45/6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/extraction_sample/ysTmUTQ5wZE_17_45/6.jpg -------------------------------------------------------------------------------- /IG-VLM/example/imagegrid_sample/QzulzuWr0Zw_20_45.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/imagegrid_sample/QzulzuWr0Zw_20_45.jpg -------------------------------------------------------------------------------- /IG-VLM/example/imagegrid_sample/rlQ2kW-FvMk_66_79.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/imagegrid_sample/rlQ2kW-FvMk_66_79.jpg -------------------------------------------------------------------------------- /IG-VLM/example/rlQ2kW-FvMk_66_79.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/rlQ2kW-FvMk_66_79.mp4 -------------------------------------------------------------------------------- /IG-VLM/example/ysTmUTQ5wZE_17_45.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/example/ysTmUTQ5wZE_17_45.mp4 -------------------------------------------------------------------------------- /IG-VLM/model_processor/__init__.py: -------------------------------------------------------------------------------- 1 | # from llava_model_processor import LlavaProcessor 2 | # from base_model_inference import BaseModelInference 3 | # from videollava_model_processor import VideoLLavaProcessor 4 | -------------------------------------------------------------------------------- 
/IG-VLM/model_processor/base_model_inference.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024, Deep Representation Learning Research Group, Seoul National University. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | from abc import ABC, abstractmethod 10 | from enum import Enum 11 | 12 | 13 | class BaseModelInference(ABC): 14 | def __init__(self, model_name, local_save_path): 15 | self.model_name = model_name 16 | self.local_save_path = local_save_path 17 | self.error_list = [] 18 | 19 | @abstractmethod 20 | def load_model(self, **kwargs): 21 | pass 22 | 23 | @abstractmethod 24 | def inference(self, **kwargs): 25 | pass 26 | 27 | @abstractmethod 28 | def extract_answers(self): 29 | pass 30 | 31 | def save_local_file(self, answer): 32 | directory = os.path.dirname(self.local_save_path) 33 | if not os.path.exists(directory): 34 | os.makedirs(directory) 35 | 36 | with open(self.local_save_path, "w") as file: 37 | file.write(answer) 38 | 39 | def infer_and_save(self, **kwargs): 40 | try: 41 | self.inference(**kwargs) 42 | answer = self.extract_answers() 43 | return answer 44 | except Exception as e: 45 | self.error_list.append(e) 46 | print(e) 47 | return -1 48 | -------------------------------------------------------------------------------- /IG-VLM/pipeline_processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/pipeline_processor/__init__.py -------------------------------------------------------------------------------- /IG-VLM/pipeline_processor/record.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024, Deep Representation Learning Research Group, Seoul National University. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from enum import Enum 9 | 10 | 11 | class EvaluationType(Enum): 12 | DEFAULT = 0 13 | CORRECTNESS = 1 14 | DETAILED_ORIENTATION = 2 15 | CONTEXT = 3 16 | TEMPORAL = 4 17 | -------------------------------------------------------------------------------- /IG-VLM/requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | moviepy 3 | opencv-python 4 | openai==1.14.0 5 | torch==2.2.0 6 | bitsandbytes==0.42.0 7 | flash_attn==2.5.3 8 | transformers==4.36.2 9 | transformers-stream-generator==0.0.4 10 | torchvision==0.17.0 11 | pytorchvideo @ git+https://github.com/facebookresearch/pytorchvideo.git@28fe037d212663c6a24f373b94cc5d478c8c1a1d -------------------------------------------------------------------------------- /IG-VLM/scripts/nextqa_multi.sh: -------------------------------------------------------------------------------- 1 | matryoshka_vis_token_scale=1 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | echo 'CHUNKS NUM:' $CHUNKS 9 | for IDX in $(seq 0 $((CHUNKS-1))); do 10 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python eval_llava_multiplechoice.py --path_qa_pair_csv ./data/multiple_choice_qa/NExT_QA.csv --path_video /data/NExTVideo_all/%s.mp4 --path_result ./result_nextqa-$matryoshka_vis_token_scale/ --llm_size 7b --matryoshka_vis_token_scale=$matryoshka_vis_token_scale \ 11 | --num_chunks $CHUNKS \ 12 | --chunk_idx $IDX & 13 | done 14 | 15 | wait 16 | 17 | -------------------------------------------------------------------------------- /IG-VLM/vision_processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/IG-VLM/vision_processor/__init__.py -------------------------------------------------------------------------------- /IG-VLM/vision_processor/base_post_processor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | from abc import ABC, abstractmethod 4 | import numpy as np 5 | from enum import Enum 6 | 7 | from PIL import Image 8 | 9 | import os 10 | import sys 11 | 12 | sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | from .decorator_processor import * 14 | from .base_frame_extractor import * 15 | 16 | 17 | class BasePostProcessor(ABC): 18 | def __init__(self): 19 | pass 20 | 21 | @abstractmethod 22 | def post_process(self, **kwargs): 23 | pass 24 | 25 | def _post_process_on_option(self, *args, **kwargs): 26 | data = self.post_process(*args, **kwargs) 27 | return data 28 | 29 | def post_process_based_on_options(self, option, *args, **kwargs): 30 | self.option = option 31 | data = self._post_process_on_option(*args, **kwargs) 32 | if option == SaveOption.BASE64: 33 | return self._save_data_to_base64(data, quality=self.quality) 34 | elif option == SaveOption.IMAGE: 35 | return Image.fromarray(data) 36 | else: 37 | raise ValueError("Invalid option: {}".format(option)) 38 | 39 | @save_to_base64 40 | def _save_data_to_base64(self, data, quality=95): 41 | return data 42 | 43 | @save_to_file 44 | def _save_data_to_file(self, data, filename=None, quality=95): 45 | cv2.imwrite(filename, data, [int(cv2.IMWRITE_JPEG_QUALITY), quality]) 46 | pass 47 |
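# --- Editor's note (illustrative, not part of the original file) ---------------------
# BasePostProcessor is a small template-method base class: a concrete subclass implements
# `post_process` to produce one numpy image (e.g. an image grid built from sampled video
# frames) and sets `self.quality` for JPEG encoding; callers then choose the output form via
# `post_process_based_on_options(SaveOption.BASE64, ...)` (a base64-encoded JPEG through the
# `save_to_base64` decorator) or `post_process_based_on_options(SaveOption.IMAGE, ...)`
# (a PIL.Image). A minimal hypothetical subclass:
#
#     class HorizontalGridProcessor(BasePostProcessor):
#         def __init__(self, quality=95):
#             super().__init__()
#             self.quality = quality
#
#         def post_process(self, frames):
#             return np.concatenate(frames, axis=1)  # stitch a frame list into one wide image
#
#     encoded = HorizontalGridProcessor().post_process_based_on_options(SaveOption.BASE64, frames)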
-------------------------------------------------------------------------------- /IG-VLM/vision_processor/decorator_processor.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import base64 3 | import os 4 | from io import BytesIO 5 | from PIL import Image 6 | 7 | 8 | def save_to_bytes(func): 9 | def wrapper(self, data): 10 | bytes_data = pickle.dumps(data) 11 | return func(self, bytes_data) 12 | 13 | return wrapper 14 | 15 | 16 | def save_to_one_file(func): 17 | def wrapper(self, data, filename): 18 | with open(filename, "wb") as file: 19 | func(self, data, file) 20 | 21 | return wrapper 22 | 23 | 24 | def save_to_file(func): 25 | def wrapper(self, data, filename, quality): 26 | os.makedirs(filename, exist_ok=True) 27 | for i, image_data in enumerate(data): 28 | file_path = os.path.join(filename, f"{i+1}.jpg") 29 | 30 | func(self, image_data, file_path, quality) 31 | 32 | return wrapper 33 | 34 | 35 | def save_to_base64(func): 36 | def wrapper(self, data, quality=95): 37 | rlt = Image.fromarray(data) 38 | 39 | with BytesIO() as byte_output: 40 | rlt.save(byte_output, format="JPEG", quality=quality) 41 | byte_output.seek(0) 42 | byte_data = byte_output.read() 43 | base64_data = base64.b64encode(byte_data).decode("utf-8") 44 | return func(self, base64_data) 45 | 46 | return wrapper 47 | -------------------------------------------------------------------------------- /cog.yaml: -------------------------------------------------------------------------------- 1 | # Configuration for Cog ⚙️ 2 | # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md 3 | 4 | build: 5 | gpu: true 6 | 7 | python_version: "3.11" 8 | 9 | python_packages: 10 | - "torch==2.0.1" 11 | - "accelerate==0.21.0" 12 | - "bitsandbytes==0.41.0" 13 | - "deepspeed==0.9.5" 14 | - "einops-exts==0.0.4" 15 | - "einops==0.6.1" 16 | - "gradio==3.35.2" 17 | - "gradio_client==0.2.9" 18 | - "httpx==0.24.0" 19 | - "markdown2==2.4.10" 20 | - "numpy==1.26.0" 21 | - "peft==0.4.0" 22 | - "scikit-learn==1.2.2" 23 | - "sentencepiece==0.1.99" 24 | - "shortuuid==1.0.11" 25 | - "timm==0.6.13" 26 | - "tokenizers==0.13.3" 27 | - "torch==2.0.1" 28 | - "torchvision==0.15.2" 29 | - "transformers==4.31.0" 30 | - "wandb==0.15.12" 31 | - "wavedrom==2.0.3.post3" 32 | - "Pygments==2.16.1" 33 | run: 34 | - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.0.3/pget" && chmod +x /usr/local/bin/pget 35 | 36 | # predict.py defines how predictions are run on your model 37 | predict: "predict.py:Predictor" 38 | -------------------------------------------------------------------------------- /docs/Customize_Component.md: -------------------------------------------------------------------------------- 1 | # Customize Components in M3 2 | 3 | This is an initial guide on how to replace the LLMs, visual encoders, etc. with your choice of components. 4 | 5 | ## LLM 6 | 7 | It is quite simple to swap out LLaMA to any other LLMs. You can refer to our implementation of [`llava_llama.py`](https://raw.githubusercontent.com/mu-cai/matryoshka-mm/main/llava/model/language_model/llava_llama.py) for an example of how to replace the LLM. 8 | 9 | Although it may seem that it still needs ~100 lines of code, most of them are copied from the original `llama.py` from HF. The only part that is different is to insert some lines for processing the multimodal inputs. 
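In skeleton form, those extra lines look roughly like the sketch below. This is an editor's illustration only, with an invented class name and simplified signatures; the actual implementation is `llava/model/language_model/llava_llama.py`, and the two hooks are walked through in the next two paragraphs.

```python
# Illustrative sketch: "MyLm" stands in for whichever HF causal LM you are wrapping.
from transformers import MistralForCausalLM  # e.g. your chosen base LLM

from llava.model.llava_arch import LlavaMetaForCausalLM


class LlavaMyLmForCausalLM(MistralForCausalLM, LlavaMetaForCausalLM):
    def forward(self, input_ids=None, attention_mask=None, labels=None,
                images=None, inputs_embeds=None, **kwargs):
        if inputs_embeds is None:
            # Hook 1: fold the image features into the token embeddings before the LLM runs.
            (input_ids, _, attention_mask, _, inputs_embeds, labels) = \
                self.prepare_inputs_labels_for_multimodal(
                    input_ids, None, attention_mask, None, labels, images)
        return super().forward(input_ids=input_ids, attention_mask=attention_mask,
                               inputs_embeds=inputs_embeds, labels=labels, **kwargs)

    def prepare_inputs_for_generation(self, input_ids, images=None, **kwargs):
        # Hook 2: keep `images` in model_inputs so they stay available during generation.
        model_inputs = super().prepare_inputs_for_generation(input_ids, **kwargs)
        if images is not None:
            model_inputs["images"] = images
        return model_inputs
```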
10 | 11 | In the `forward` function, you can see that we call `self.prepare_inputs_labels_for_multimodal` to process the multimodal inputs. This function is defined in `LlavaMetaForCausalLM` and you just need to insert it into the `forward` function of your LLM. 12 | 13 | In the `prepare_inputs_for_generation` function, you can see that we add `images` to the `model_inputs`. This is because we need to pass the images to the LLM during generation. 14 | 15 | These are basically all the changes you need to make to replace the LLM. 16 | 17 | ## Visual Encoder 18 | 19 | You can check out [`clip_encoder.py`](https://github.com/mu-cai/matryoshka-mm/blob/main/llava/model/multimodal_encoder/clip_encoder.py) to see how we implement the CLIP visual encoder. 20 | 21 | -------------------------------------------------------------------------------- /docs/Intel.md: -------------------------------------------------------------------------------- 1 | # Intel Platforms 2 | 3 | * Support [Intel GPU Max Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/max-series.html) 4 | * Support [Intel CPU Sapphire Rapids](https://ark.intel.com/content/www/us/en/ark/products/codename/126212/products-formerly-sapphire-rapids.html) 5 | * Based on [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch) 6 | 7 | More details are in the [**intel branch**](https://github.com/mu-cai/matryoshka-mm/tree/intel/docs/intel) 8 | -------------------------------------------------------------------------------- /docs/MODEL_ZOO.md: -------------------------------------------------------------------------------- 1 | # Model Zoo 2 | 3 | 4 | If you are interested in including any other details in the Model Zoo, please open an issue :) 5 | 6 | The usage of LLaVA checkpoints should comply with the base LLM's model license. 7 | 8 | ## LLaVA-NeXT-M3 9 | 10 | 11 | Checkpoint is [here](https://huggingface.co/mucai/llava-next-vicuna-7b-m3). 12 | 13 | 14 | Base model: Vicuna v1.5-7B 15 | 16 | 17 | ## LLaVA-v1.5-M3 18 | 19 | 20 | Checkpoint is [here](https://huggingface.co/mucai/llava-v1.5-7b-m3). 21 | 22 | Base model: Vicuna v1.5-7B 23 | -------------------------------------------------------------------------------- /docs/Windows.md: -------------------------------------------------------------------------------- 1 | # Run LLaVA on Windows 2 | 3 | *NOTE: LLaVA on Windows is not fully supported. Currently we only support 16-bit inference. For more complete support, please use [WSL2](https://learn.microsoft.com/en-us/windows/wsl/install) for now. More functionality on Windows will be added soon, stay tuned.* 4 | 5 | ## Installation 6 | 7 | 1. Clone this repository and navigate to the LLaVA folder 8 | ```bash 9 | git clone https://github.com/mu-cai/matryoshka-mm.git 10 | cd LLaVA 11 | ``` 12 | 13 | 2. Install Package 14 | ```Shell 15 | conda create -n matryoshka-mm python=3.10 -y 16 | conda activate matryoshka-mm 17 | python -m pip install --upgrade pip # enable PEP 660 support 18 | pip install torch==2.0.1+cu117 torchvision==0.15.2+cu117 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu117 19 | pip install -e . 20 | pip uninstall bitsandbytes 21 | ``` 22 | 23 | ## Run demo 24 | 25 | See instructions [here](https://github.com/mu-cai/matryoshka-mm#demo). 26 | 27 | Note that quantization (4-bit, 8-bit) is *NOT* supported on Windows. Stay tuned for 4-bit support on Windows!
28 | -------------------------------------------------------------------------------- /docs/macOS.md: -------------------------------------------------------------------------------- 1 | # Run LLaVA on macOS 2 | 3 | *NOTE: LLaVA on macOS is not fully supported. Currently we only support 16-bit inference. More functionality on macOS will be added soon, stay tuned.* 4 | 5 | ## Installation 6 | 7 | 1. Clone this repository and navigate to the LLaVA folder 8 | ```bash 9 | git clone https://github.com/mu-cai/matryoshka-mm.git 10 | cd LLaVA 11 | ``` 12 | 13 | 2. Install Package 14 | ```Shell 15 | conda create -n llava python=3.10 -y 16 | conda activate llava 17 | python -m pip install --upgrade pip # enable PEP 660 support 18 | pip install -e . 19 | pip install torch==2.1.0 torchvision==0.16.0 20 | pip uninstall bitsandbytes 21 | ``` 22 | 23 | ## Run demo 24 | 25 | Specify `--device mps` when launching model worker or CLI. 26 | 27 | See instructions [here](https://github.com/mu-cai/matryoshka-mm#demo). 28 | 29 | Note that quantization (4-bit, 8-bit) is *NOT* supported on macOS. Stay tuned for 4-bit support on macOS! 30 | -------------------------------------------------------------------------------- /images/demo_cli.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/images/demo_cli.gif -------------------------------------------------------------------------------- /images/llava_example_cmp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/images/llava_example_cmp.png -------------------------------------------------------------------------------- /images/llava_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/images/llava_logo.png -------------------------------------------------------------------------------- /images/llava_v1_5_radar.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/images/llava_v1_5_radar.jpg -------------------------------------------------------------------------------- /images/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/images/teaser.png -------------------------------------------------------------------------------- /llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /llava/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "."
5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "" 11 | DEFAULT_IM_START_TOKEN = "" 12 | DEFAULT_IM_END_TOKEN = "" 13 | IMAGE_PLACEHOLDER = "" 14 | -------------------------------------------------------------------------------- /llava/eval/table/model.jsonl: -------------------------------------------------------------------------------- 1 | {"model_id": "vicuna-13b:20230322-clean-lang", "model_name": "vicuna-13b", "model_version": "20230322-clean-lang", "model_metadata": "vicuna-13b-20230322-clean-lang"} 2 | {"model_id": "alpaca-13b:v1", "model_name": "alpaca-13b", "model_version": "v1", "model_metadata": "alpaca-13b"} 3 | {"model_id": "llama-13b:v1", "model_name": "llama-13b", "model_version": "v1", "model_metadata": "hf-llama-13b"} 4 | {"model_id": "bard:20230327", "model_name": "bard", "model_version": "20230327", "model_metadata": "Google Bard 20230327"} 5 | {"model_id": "gpt-3.5-turbo:20230327", "model_name": "gpt-3.5-turbo", "model_version": "20230327", "model_metadata": "OpenAI ChatGPT gpt-3.5-turbo Chat Completion"} 6 | -------------------------------------------------------------------------------- /llava/eval/table/reviewer.jsonl: -------------------------------------------------------------------------------- 1 | {"reviewer_id": "gpt-4-0328-default", "prompt_id": 1, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for general questions"} 2 | {"reviewer_id": "gpt-4-0328-coding", "prompt_id": 2, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for coding questions"} 3 | {"reviewer_id": "gpt-4-0328-math", "prompt_id": 3, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 4 | {"reviewer_id": "gpt-4-0417-visual", "prompt_id": 4, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 5 | -------------------------------------------------------------------------------- /llava/eval/webpage/figures/alpaca.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/llava/eval/webpage/figures/alpaca.png -------------------------------------------------------------------------------- /llava/eval/webpage/figures/bard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/llava/eval/webpage/figures/bard.jpg -------------------------------------------------------------------------------- /llava/eval/webpage/figures/llama.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/llava/eval/webpage/figures/llama.jpg -------------------------------------------------------------------------------- /llava/eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llava/eval/webpage/figures/vicuna.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/llava/eval/webpage/figures/vicuna.jpeg 
-------------------------------------------------------------------------------- /llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 3 | from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig 4 | from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig 5 | except: 6 | pass 7 | -------------------------------------------------------------------------------- /llava/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava.model import * 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | use_s2 = getattr(vision_tower_cfg, 's2', False) 9 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 10 | if use_s2: 11 | return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) 12 | else: 13 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 14 | 15 | raise ValueError(f'Unknown vision tower: {vision_tower}') 16 | -------------------------------------------------------------------------------- /llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. 
[Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /llava/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/llava/serve/__init__.py -------------------------------------------------------------------------------- /llava/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/llava/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /llava/serve/examples/m3-demo-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/llava/serve/examples/m3-demo-1.jpg -------------------------------------------------------------------------------- /llava/serve/examples/m3-demo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/llava/serve/examples/m3-demo.jpg -------------------------------------------------------------------------------- /llava/serve/examples/sample_demo_1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/llava/serve/examples/sample_demo_1.mp4 -------------------------------------------------------------------------------- /llava/serve/examples/sample_demo_3.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/llava/serve/examples/sample_demo_3.mp4 -------------------------------------------------------------------------------- /llava/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/llava/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /llava/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 
3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /llava/train/train_mem.py: -------------------------------------------------------------------------------- 1 | from llava.train.train import train 2 | 3 | if __name__ == "__main__": 4 | train(attn_implementation="flash_attention_2") 5 | -------------------------------------------------------------------------------- /llava/train/train_xformers.py: -------------------------------------------------------------------------------- 1 | # Make it more memory efficient by monkey patching the LLaMA model with xformers attention. 2 | 3 | # Need to call this before importing transformers. 4 | from llava.train.llama_xformers_attn_monkey_patch import ( 5 | replace_llama_attn_with_xformers_attn, 6 | ) 7 | 8 | replace_llama_attn_with_xformers_attn() 9 | 10 | from llava.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /lmms-eval/docs/README.md: -------------------------------------------------------------------------------- 1 | # LMMs Eval Documentation 2 | 3 | Welcome to the docs for `lmms-eval`! 4 | 5 | The majority of this documentation is adapted from [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness/) 6 | 7 | ## Table of Contents 8 | 9 | * To learn about the command-line flags, see the [commands](commands.md) guide. 10 | * To learn how to add a new model, see the [Model Guide](model_guide.md). 11 | * For a crash course on adding new tasks to the library, see our [Task Guide](task_guide.md).
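## Quick start (illustrative)

For orientation, an evaluation can be launched from the command line or through a YAML config such as `example_eval.yaml`. The invocation below is an editor's sketch only: the flag names mirror the keys used in `example_eval.yaml`, and [commands](commands.md) remains the authoritative reference for the options supported by your version.

```bash
# Hypothetical single-process run; model, checkpoint, and task are taken from example_eval.yaml.
python3 -m lmms_eval \
    --model llava \
    --model_args pretrained=liuhaotian/llava-v1.5-7b \
    --tasks ai2d \
    --batch_size 1 \
    --log_samples \
    --output_path ./logs/
```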
-------------------------------------------------------------------------------- /lmms-eval/example_eval.yaml: -------------------------------------------------------------------------------- 1 | - model: llava 2 | model_args: pretrained=liuhaotian/llava-v1.5-7b 3 | tasks: ai2d 4 | batch_size: 1 5 | log_samples: true 6 | log_samples_suffix: eval_vizwiz_vqa 7 | output_path: "./logs/" 8 | 9 | - model: llava 10 | model_args: pretrained=liuhaotian/llava-v1.5-13b 11 | tasks: mme 12 | batch_size: 1 13 | log_samples: true 14 | log_samples_suffix: mme 15 | output_path: "./logs/" 16 | -------------------------------------------------------------------------------- /lmms-eval/llava_repr_requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.21.0 2 | datasets==2.16.1 3 | evaluate==0.4.1 4 | hf_transfer==0.1.6 5 | Jinja2==3.1.3 6 | numpy==1.26.4 7 | openai==1.13.3 8 | packaging==23.2 9 | pandas==2.2.1 10 | Pillow==10.2.0 11 | protobuf==4.25.3 12 | pycocoevalcap==1.2 13 | pycocotools==2.0.7 14 | pytablewriter==1.2.0 15 | pytest==8.0.2 16 | python_Levenshtein==0.25.0 17 | pytz==2024.1 18 | PyYAML==6.0.1 19 | PyYAML==6.0.1 20 | Requests==2.31.0 21 | sacrebleu==2.4.0 22 | scikit_learn==1.2.2 23 | sentencepiece==0.1.99 24 | setuptools==68.2.2 25 | sglang==0.1.12 26 | shortuuid==1.0.12 27 | sqlitedict==2.1.0 28 | tenacity==8.2.3 29 | torch==2.0.1 30 | openai>=1.0.0 31 | pycocoevalcap 32 | tokenizers==0.15.2 33 | tqdm==4.66.2 34 | tqdm-multiprocess 35 | transformers==4.37.2 36 | zstandard 37 | pillow 38 | pyyaml 39 | sympy 40 | mpmath 41 | Jinja2 42 | openpyxl 43 | Levenshtein 44 | hf_transfer 45 | tenacity 46 | wandb>=0.16.0 47 | transformers-stream-generator 48 | tiktoken 49 | pre-commit -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/lmms-eval/lmms_eval/__init__.py -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/lmms-eval/lmms_eval/api/__init__.py -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/api/instance.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Literal, Tuple 3 | 4 | 5 | @dataclass 6 | class Instance: 7 | request_type: Literal["loglikelihood", "generate_until"] 8 | arguments: tuple 9 | idx: int 10 | metadata: Tuple[str, int, int] = field(default_factory=lambda: (None, None, None)) # TODO: better typehints here 11 | resps: list = field(default_factory=list) 12 | filtered_resps: dict = field(default_factory=dict) 13 | 14 | # initialized after init 15 | task_name: str = None 16 | doc_id: str = None 17 | repeats: str = None 18 | doc: dict = None 19 | 20 | def __post_init__(self) -> None: 21 | # unpack metadata field 22 | self.task_name, self.doc_id, self.repeats = self.metadata 23 | 24 | @property 25 | def args(self): 26 | """ 27 | Returns (string,) where `string` is the string to calculate loglikelihood over 28 | """ 29 | return self.arguments if isinstance(self.arguments, tuple) else 
(self.arguments,) 30 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/filters/decontamination.py: -------------------------------------------------------------------------------- 1 | from lmms_eval.api.filter import Filter 2 | 3 | 4 | class DecontaminationFilter(Filter): 5 | """ 6 | A filter which evaluates 7 | """ 8 | 9 | name = "track_decontamination" 10 | 11 | def __init__(self, path) -> None: 12 | """ 13 | 14 | TODO: make sure only ever run one time on the train set (should this be cached as a class var? keyed by value for "path"). 15 | should further cache result on a given (task_name, doc_id) 16 | """ 17 | self._decontam_results = None 18 | 19 | def apply(self, resps, docs) -> None: 20 | """ 21 | Return {"no_contamination", "only_contamination"} keys for the 2 different subsets 22 | """ 23 | pass 24 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/models/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | AVAILABLE_MODELS = { 4 | "llava": "Llava", 5 | "llava_hf": "LlavaHf", 6 | "llava_sglang": "LlavaSglang", 7 | "qwen_vl": "Qwen_VL", 8 | "fuyu": "Fuyu", 9 | "gpt4v": "GPT4V", 10 | "instructblip": "InstructBLIP", 11 | "minicpm_v": "MiniCPM_V", 12 | "idefics2": "Idefics2", 13 | "qwen_vl_api": "Qwen_VL_API", 14 | "phi3v": "Phi3v", 15 | } 16 | 17 | for model_name, model_class in AVAILABLE_MODELS.items(): 18 | try: 19 | exec(f"from .{model_name} import {model_class}") 20 | except ImportError: 21 | pass 22 | 23 | 24 | import hf_transfer 25 | 26 | os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" 27 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/models/model_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/lmms-eval/lmms_eval/models/model_utils/__init__.py -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/_task_utils/file_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def generate_submission_file(file_name, args, subpath="submissions"): 5 | path = os.path.join(args.output_path, subpath) 6 | os.makedirs(path, exist_ok=True) 7 | path = os.path.join(path, file_name) 8 | return os.path.abspath(path) 9 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/_task_utils/gpt_eval_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/lmms-eval/lmms_eval/tasks/_task_utils/gpt_eval_utils.py -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ai2d/ai2d.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ai2d 2 | task: "ai2d" 3 | dataset_kwargs: 4 | token: True 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.ai2d_doc_to_visual 8 | doc_to_text: !function utils.ai2d_doc_to_text 9 | doc_to_target: !function utils.ai2d_doc_to_target 10 | generation_kwargs: 11 | max_new_tokens: 16 12 | temperature: 0 13 | do_sample: False 14 | metric_list: 15 | - metric: exact_match 16 | 
aggregation: mean 17 | higher_is_better: true 18 | ignore_case: true 19 | ignore_punctuation: true 20 | metadata: 21 | - version: 0.0 22 | 23 | model_specific_prompt_kwargs: 24 | default: 25 | prompt_format: mcq 26 | pre_prompt: "" 27 | post_prompt: "\nAnswer with the option's letter from the given choices directly." 28 | # qwen formulate ai2d as question answering instead of mcq 29 | qwen_vl: 30 | prompt_format: qa 31 | pre_prompt: "" 32 | post_prompt: " Answer:" 33 | 34 | model_specific_target_kwargs: 35 | default: "mcq" 36 | qwen_vl: "qa" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ai2d/utils.py: -------------------------------------------------------------------------------- 1 | def ai2d_doc_to_text(doc, model_specific_prompt_kwargs=None): 2 | question, choices = doc["question"], doc["options"] 3 | len_choices = len(choices) 4 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 5 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 6 | if model_specific_prompt_kwargs["prompt_format"] == "mcq": 7 | options = [chr(ord("A") + i) for i in range(len_choices)] 8 | choices_str = "\n".join([f"{option}. {choice}" for option, choice in zip(options, choices)]) 9 | return f"{pre_prompt}{question}\n{choices_str}{post_prompt}" 10 | elif model_specific_prompt_kwargs["prompt_format"] == "qa": 11 | options = "\n".join(choices) 12 | return f"{pre_prompt}{question}{options}{post_prompt}" 13 | else: 14 | raise ValueError(f"Unknown prompt format: {model_specific_prompt_kwargs['prompt_format']}") 15 | 16 | 17 | def ai2d_doc_to_visual(doc): 18 | return [doc["image"].convert("RGB")] 19 | 20 | 21 | def ai2d_doc_to_target(doc, model_specific_target_kwargs): 22 | if model_specific_target_kwargs == "mcq": 23 | len_choices = len(doc["options"]) 24 | options = [chr(ord("A") + i) for i in range(len_choices)] 25 | return options[int(doc["answer"])] 26 | elif model_specific_target_kwargs == "qa": 27 | return doc["options"][int(doc["answer"])] 28 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/chartqa/chartqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ChartQA 2 | dataset_kwargs: 3 | token: True 4 | task: "chartqa" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.chartqa_doc_to_visual 8 | doc_to_text: !function utils.chartqa_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | max_new_tokens: 16 12 | temperature: 0 13 | do_sample: False 14 | process_results: !function utils.chartqa_process_results 15 | metric_list: 16 | - metric: relaxed_overall 17 | aggregation: mean 18 | higher_is_better: true 19 | - metric: relaxed_human_split 20 | aggregation: mean 21 | higher_is_better: true 22 | - metric: relaxed_augmented_split 23 | aggregation: mean 24 | higher_is_better: true 25 | metadata: 26 | - version: 0.0 27 | model_specific_prompt_kwargs: 28 | default: 29 | pre_prompt: "" 30 | post_prompt: "\nAnswer the question with a single word." 
31 | qwen_vl: 32 | pre_prompt: "" 33 | post_prompt: " Answer:" 34 | 35 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/cmmmu/_cmmmu.yaml: -------------------------------------------------------------------------------- 1 | group: cmmmu 2 | task: 3 | - cmmmu_val 4 | - cmmmu_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/cmmmu/_default_template_cmmmu_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/CMMMU 2 | output_type: generate_until 3 | doc_to_visual: !function utils.cmmmu_doc_to_visual 4 | doc_to_text: !function utils.cmmmu_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | max_new_tokens: 16 8 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/cmmmu/cmmmu_test.yaml: -------------------------------------------------------------------------------- 1 | task: "cmmmu_test" 2 | test_split: test 3 | # The return value of process_results will be used by metrics 4 | process_results: !function utils.cmmmu_process_test_results_for_submission 5 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 6 | metric_list: 7 | - metric: submission 8 | aggregation: !function utils.cmmmu_test_aggregate_results_for_submission 9 | higher_is_better: false 10 | metadata: 11 | - version: 0.0 12 | include: _default_template_cmmmu_yaml 13 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/cmmmu/cmmmu_val.yaml: -------------------------------------------------------------------------------- 1 | task: "cmmmu_val" 2 | test_split: val 3 | # The return value of process_results will be used by metrics 4 | process_results: !function utils.cmmmu_process_results 5 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 6 | generation_kwargs: 7 | max_new_tokens: 16 8 | image_aspect_ratio: original 9 | metric_list: 10 | - metric: cmmmu_acc 11 | aggregation: !function utils.cmmmu_aggregate_results 12 | higher_is_better: true 13 | metadata: 14 | - version: 0.0 15 | include: _default_template_cmmmu_yaml 16 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2014_cap.yaml: -------------------------------------------------------------------------------- 1 | group : coco2014_cap 2 | task: 3 | - coco2014_cap_val 4 | - coco2014_cap_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2014_cap_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/COCO-Caption 2 | dataset_kwargs: 3 | token: True 4 | task : "coco2014_cap_test" 5 | group : "coco_caption" 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.coco_doc_to_visual 9 | doc_to_text: "Provide a one-sentence caption for the provided image." 
10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 128 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.coco_test_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: coco_passthrough 21 | aggregation : !function utils.coco_test_aggregation_result 22 | higher_is_better : true 23 | metadata: 24 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2017_cap.yaml: -------------------------------------------------------------------------------- 1 | group : coco2017_cap 2 | task: 3 | - coco2017_cap_val 4 | - coco2017_cap_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2017_cap_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/COCO-Caption2017 2 | dataset_kwargs: 3 | token: True 4 | task : "coco2017_cap_test" 5 | group : "coco_caption2017" 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.coco_doc_to_visual 9 | doc_to_text: !function utils.coco_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 128 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.coco_test_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: coco_passthrough 21 | aggregation : !function utils.coco_test_aggregation_result 22 | higher_is_better : true 23 | metadata: 24 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2017_cap_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/COCO-Caption2017 2 | dataset_kwargs: 3 | token: True 4 | task: "coco2017_cap_val" 5 | group : "coco_caption2017" 6 | test_split: val 7 | output_type: generate_until 8 | doc_to_visual: !function utils.coco_doc_to_visual 9 | doc_to_text: !function utils.coco_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 64 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.coco_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: coco_Bleu_4 21 | aggregation : !function utils.coco_bleu4 22 | higher_is_better : true 23 | - metric: coco_Bleu_3 24 | aggregation : !function utils.coco_bleu3 25 | higher_is_better : true 26 | - metric: coco_Bleu_2 27 | aggregation : !function utils.coco_bleu2 28 | higher_is_better : true 29 | - metric: coco_Bleu_1 30 | aggregation : !function utils.coco_bleu1 31 | higher_is_better : true 32 | - metric: coco_METEOR 33 | aggregation : !function utils.coco_meteor 34 | higher_is_better : true 35 | - metric: coco_ROUGE_L 36 | aggregation : !function utils.coco_rougel 37 | higher_is_better : true 38 | - metric: coco_CIDEr 39 | aggregation : !function utils.coco_cider 40 | higher_is_better : true 41 | #- metric: coco_SPICE 42 | # aggregation : !function 
utils.coco_spice 43 | # higher_is_better : true 44 | metadata: 45 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco_cap.yaml: -------------------------------------------------------------------------------- 1 | group : coco_cap 2 | task: 3 | - coco2014_cap_val 4 | - coco2014_cap_test 5 | - coco2017_cap_val 6 | - coco2017_cap_test 7 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/docvqa/_default_template_docvqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/DocVQA 2 | dataset_name: DocVQA 3 | dataset_kwargs: 4 | token: True 5 | output_type: generate_until 6 | doc_to_visual: !function utils.docvqa_doc_to_visual 7 | doc_to_text: !function utils.docvqa_doc_to_text 8 | doc_to_target: "answers" 9 | generation_kwargs: 10 | max_new_tokens: 32 11 | temperature: 0 12 | do_sample: False 13 | model_specific_prompt_kwargs: 14 | default: 15 | pre_prompt: "" 16 | post_prompt: "\nAnswer the question using a single word or phrase." 17 | qwen_vl: 18 | pre_prompt: "" 19 | post_prompt: " Answer:" 20 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/docvqa/docvqa.yaml: -------------------------------------------------------------------------------- 1 | group: docvqa 2 | task: 3 | - docvqa_val 4 | - docvqa_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/docvqa/docvqa_test.yaml: -------------------------------------------------------------------------------- 1 | task: "docvqa_test" 2 | test_split: test 3 | process_results: !function utils.docvqa_test_process_results 4 | metric_list: 5 | - metric: submission 6 | aggregation: !function utils.docvqa_test_aggregate_results 7 | higher_is_better: true 8 | include: _default_template_docvqa_yaml 9 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/docvqa/docvqa_val.yaml: -------------------------------------------------------------------------------- 1 | task: "docvqa_val" 2 | test_split: validation 3 | metric_list: 4 | - metric: anls 5 | aggregation: mean 6 | higher_is_better: true 7 | include: _default_template_docvqa_yaml 8 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/docvqa/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import logging 4 | 5 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file 6 | 7 | logger = logging.getLogger("lmms-eval") 8 | 9 | 10 | def docvqa_doc_to_visual(doc): 11 | return [doc["image"].convert("RGB")] 12 | 13 | 14 | def docvqa_doc_to_text(doc, model_specific_prompt_kwargs): 15 | question = doc["question"] 16 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 17 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 18 | return f"{pre_prompt}{question}{post_prompt}" 19 | 20 | 21 | def docvqa_test_process_results(doc, results): 22 | pred = results[0] 23 | questionId = doc["questionId"] 24 | return {"anls": {"questionId": int(questionId), "answer": pred}, "submission": {"questionId": int(questionId), "answer": pred}} 25 | 26 | 27 | def docvqa_test_aggregate_results(results, args): 28 | # save results as json 29 | path = 
generate_submission_file("docvqa_test_for_submission.json", args) 30 | with open(path, "w") as f: 31 | json.dump(results, f) 32 | logger.info(f"Results saved to {path}") 33 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ferret/ferret.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/Ferret-Bench 2 | dataset_kwargs: 3 | token: True 4 | task: "ferret" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.ferret_doc_to_visual 8 | doc_to_text: !function utils.ferret_doc_to_text 9 | doc_to_target: "gpt_answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | image_aspect_ratio: original 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.ferret_process_results 20 | metric_list: 21 | - metric: gpt_eval_ferret_all 22 | aggregation: !function utils.ferret_all_aggregation 23 | higher_is_better: true 24 | - metric: gpt_eval_ferret_refer_desc 25 | aggregation: !function utils.ferret_refer_desc_aggregation 26 | higher_is_better: true 27 | - metric: gpt_eval_ferret_refer_reason 28 | aggregation: !function utils.ferret_refer_reason_aggregation 29 | higher_is_better: true 30 | - metric: gpt_eval_ferret_ground_conv 31 | aggregation: !function utils.ferret_ground_conv_aggregation 32 | higher_is_better: true 33 | metadata: 34 | version: 0.0 35 | gpt_eval_model_name: "gpt-4-0314" 36 | model_specific_prompt_kwargs: 37 | default: 38 | pre_prompt: "" 39 | post_prompt: "" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/flickr30k/flickr30k.yaml: -------------------------------------------------------------------------------- 1 | group: flickr30k 2 | task: 3 | - flickr30k_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/gqa/gqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/GQA 2 | dataset_name: testdev_balanced_instructions 3 | dataset_kwargs: 4 | token: True 5 | task: "gqa" 6 | test_split: testdev 7 | output_type: generate_until 8 | doc_to_visual: !function utils.gqa_doc_to_visual 9 | doc_to_text: !function utils.gqa_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | metric_list: 18 | - metric: exact_match 19 | aggregation: mean 20 | higher_is_better: true 21 | ignore_case: true 22 | ignore_punctuation: true 23 | metadata: 24 | - version: 0.0 25 | 26 | model_specific_prompt_kwargs: 27 | default: 28 | pre_prompt: "" 29 | post_prompt: "\nAnswer the question using a single word or phrase." 
30 | qwen_vl: 31 | pre_prompt: "" 32 | post_prompt: " Answer:" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/gqa/utils.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | GQA_RAW_IMAGE_DATASET = None 4 | GQA_ID2IMAGE = None 5 | 6 | 7 | def gqa_doc_to_visual(doc): 8 | global GQA_RAW_IMAGE_DATASET 9 | global GQA_ID2IMAGE 10 | if GQA_RAW_IMAGE_DATASET is None: 11 | GQA_RAW_IMAGE_DATASET = load_dataset("lmms-lab/GQA", "testdev_balanced_images", split="testdev", token=True) 12 | GQA_ID2IMAGE = {} 13 | for row in GQA_RAW_IMAGE_DATASET: 14 | GQA_ID2IMAGE[row["id"]] = row["image"].convert("RGB") 15 | image = GQA_ID2IMAGE[doc["imageId"]] 16 | return [image] 17 | 18 | 19 | def gqa_doc_to_text(doc, model_specific_prompt_kwargs): 20 | question = doc["question"] 21 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 22 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 23 | return f"{pre_prompt}{question}{post_prompt}" 24 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/hallusion_bench/hallusion_bench_image.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/HallusionBench 2 | dataset_kwargs: 3 | token: True 4 | task: "hallusion_bench_image" 5 | test_split: image 6 | output_type: generate_until 7 | doc_to_visual: !function evaluate_hb.hb_doc_to_visual 8 | doc_to_text: !function evaluate_hb.hb_doc_to_text 9 | doc_to_target: "gt_answer_details" 10 | process_results: !function evaluate_hb.hb_process_results 11 | model_specific_prompt_kwargs: 12 | default: 13 | pre_prompt: "" 14 | post_prompt: "" 15 | generation_kwargs: 16 | max_new_tokens: 128 17 | temperature: 0 18 | top_p: 0 19 | num_beams: 1 20 | do_sample: false 21 | metric_list: 22 | - metric: aAcc 23 | aggregation: !function evaluate_hb.hb_aggregation_result_aAcc 24 | higher_is_better: true 25 | - metric: qAcc 26 | aggregation: !function evaluate_hb.hb_aggregation_result_qAcc 27 | higher_is_better: true 28 | - metric: fAcc 29 | aggregation: !function evaluate_hb.hb_aggregation_result_fAcc 30 | higher_is_better: true 31 | # - metric: aAcc 32 | # aggregation: !function evaluate_hb.hb_aggregation_result_aAcc_intern 33 | # higher_is_better: true 34 | # - metric: qAcc 35 | # aggregation: !function evaluate_hb.hb_aggregation_result_qAcc_intern 36 | # higher_is_better: true 37 | # - metric: fAcc 38 | # aggregation: !function evaluate_hb.hb_aggregation_result_fAcc_intern 39 | # higher_is_better: true 40 | metadata: 41 | - version: 0.0 42 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/iconqa/_default_template_docvqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ICON-QA 2 | dataset_kwargs: 3 | token: True 4 | output_type: generate_until 5 | doc_to_visual: !function utils.doc_to_visual 6 | doc_to_text: !function utils.doc_to_text 7 | doc_to_target: "answers" 8 | # process_results: !function utils.test_process_results 9 | generation_kwargs: 10 | max_new_tokens: 32 11 | temperature: 0 12 | do_sample: False 13 | model_specific_prompt_kwargs: 14 | default: 15 | pre_prompt: "" 16 | statement: "Given a set of images and a question, please provide the answer to the question.\n" 17 | options_statement: "Question: {question}.\nOptions:\n{options}\nPlease answer with 
the option letter from the given choices directly." 18 | freeform_statement: "Question: {question}.\nPlease answer the question using a single word or phrase." 19 | metric_list: 20 | - metric: anls 21 | aggregation: mean 22 | higher_is_better: true -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/iconqa/iconqa.yaml: -------------------------------------------------------------------------------- 1 | group: iconqa 2 | task: 3 | - iconqa_val 4 | - iconqa_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/iconqa/iconqa_test.yaml: -------------------------------------------------------------------------------- 1 | task: "iconqa_test" 2 | test_split: test 3 | include: _default_template_docvqa_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/iconqa/iconqa_val.yaml: -------------------------------------------------------------------------------- 1 | task: "iconqa_val" 2 | test_split: val 3 | include: _default_template_docvqa_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/infovqa/_default_template_infovqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/DocVQA 2 | dataset_name: InfographicVQA 3 | dataset_kwargs: 4 | token: True 5 | doc_to_target: "answers" 6 | doc_to_visual: !function utils.infovqa_doc_to_visual 7 | doc_to_text: !function utils.infovqa_doc_to_text 8 | generation_kwargs: 9 | max_new_tokens: 32 10 | temperature: 0 11 | do_sample: False 12 | model_specific_prompt_kwargs: 13 | default: 14 | pre_prompt: "" 15 | post_prompt: "\nAnswer the question using a single word or phrase." 
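The DocVQA, InfoVQA, IconQA, and MP-DocVQA configs in this tree all score their validation splits with the `anls` metric (Average Normalized Levenshtein Similarity). As a reference for what that number means, here is a minimal sketch of the standard ANLS computation with the usual 0.5 threshold, using the `Levenshtein` package pinned in `llava_repr_requirements.txt`; it illustrates the metric itself, not the harness's implementation.

```python
# Minimal ANLS sketch (not the lmms-eval implementation): score a prediction
# against every reference answer with 1 - normalized edit distance, zero out
# scores below the usual 0.5 threshold, keep the best reference, then average
# over examples.
import Levenshtein  # pinned as python_Levenshtein / Levenshtein in the requirements


def anls_score(prediction: str, references: list[str], threshold: float = 0.5) -> float:
    best = 0.0
    for ref in references:
        pred, gold = prediction.strip().lower(), ref.strip().lower()
        if len(pred) == 0 and len(gold) == 0:
            similarity = 1.0
        else:
            dist = Levenshtein.distance(pred, gold)
            similarity = 1.0 - dist / max(len(pred), len(gold))
        best = max(best, similarity if similarity >= threshold else 0.0)
    return best


# Example: mean ANLS over a tiny batch of (prediction, references) pairs.
batch = [("the eiffel tower", ["Eiffel Tower"]), ("42", ["forty two", "42"])]
print(sum(anls_score(p, r) for p, r in batch) / len(batch))
```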
-------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/infovqa/infovqa.yaml: -------------------------------------------------------------------------------- 1 | group: infovqa 2 | task: 3 | - infovqa_val 4 | - infovqa_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/infovqa/infovqa_test.yaml: -------------------------------------------------------------------------------- 1 | task: "infovqa_test" 2 | test_split: test 3 | output_type: generate_until 4 | process_results: !function utils.infovqa_test_process_results 5 | metric_list: 6 | - metric: submission 7 | aggregation: !function utils.infovqa_test_aggregate_results 8 | higher_is_better: true 9 | include: _default_template_infovqa_yaml 10 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/infovqa/infovqa_val.yaml: -------------------------------------------------------------------------------- 1 | task: "infovqa_val" 2 | test_split: validation 3 | output_type: generate_until 4 | metric_list: 5 | - metric: anls 6 | aggregation: mean 7 | higher_is_better: true 8 | include: _default_template_infovqa_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/infovqa/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import logging 4 | 5 | 6 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file 7 | 8 | lmms_logger = logging.getLogger("lmms-eval") 9 | 10 | 11 | def infovqa_doc_to_visual(doc): 12 | return [doc["image"].convert("RGB")] 13 | 14 | 15 | def infovqa_doc_to_text(doc, model_specific_prompt_kwargs): 16 | question = doc["question"] 17 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 18 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 19 | return f"{pre_prompt}{question}{post_prompt}" 20 | 21 | 22 | def infovqa_test_process_results(doc, results): 23 | pred = results[0] 24 | questionId = doc["questionId"] 25 | return {"submission": {"questionId": int(questionId), "answer": pred}} 26 | 27 | 28 | def infovqa_test_aggregate_results(results, args): 29 | # save results as json 30 | file = generate_submission_file("infovqa_test_for_submission.json", args) 31 | with open(file, "w") as f: 32 | json.dump(results, f) 33 | lmms_logger.info(f"Results saved to {file}") 34 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/llava-bench-coco/llava-bench-coco.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/llava-bench-coco 2 | dataset_kwargs: 3 | token: True 4 | task: "llava_bench_coco" 5 | test_split: train 6 | output_type: generate_until 7 | doc_to_visual: !function utils.llava_doc_to_visual 8 | doc_to_text: !function utils.llava_doc_to_text 9 | doc_to_target: "gpt_answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | image_aspect_ratio: original 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | process_results: !function utils.llava_process_results 19 | metric_list: 20 | - metric: gpt_eval_llava_all 21 | aggregation: !function utils.llava_all_aggregation 22 | higher_is_better: true 23 | - metric: gpt_eval_llava_conv 24 | aggregation: !function utils.llava_conv_aggregation 25 | higher_is_better: true 26 | - metric: gpt_eval_llava_detail 
27 | aggregation: !function utils.llava_detail_aggregation 28 | higher_is_better: true 29 | - metric: gpt_eval_llava_complex 30 | aggregation: !function utils.llava_complex_aggregation 31 | higher_is_better: true 32 | metadata: 33 | version: 0.0 34 | gpt_eval_model_name: "gpt-4-0314" 35 | model_specific_prompt_kwargs: 36 | default: 37 | pre_prompt: "" 38 | post_prompt: "" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/llava-in-the-wild/llava-in-the-wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/llava-bench-in-the-wild 2 | dataset_kwargs: 3 | token: True 4 | task: "llava_in_the_wild" 5 | test_split: train 6 | output_type: generate_until 7 | doc_to_visual: !function utils.llava_doc_to_visual 8 | doc_to_text: !function utils.llava_doc_to_text 9 | doc_to_target: "gpt_answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | image_aspect_ratio: original 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.llava_process_results 20 | metric_list: 21 | - metric: gpt_eval_llava_all 22 | aggregation: !function utils.llava_all_aggregation 23 | higher_is_better: true 24 | - metric: gpt_eval_llava_conv 25 | aggregation: !function utils.llava_conv_aggregation 26 | higher_is_better: true 27 | - metric: gpt_eval_llava_detail 28 | aggregation: !function utils.llava_detail_aggregation 29 | higher_is_better: true 30 | - metric: gpt_eval_llava_complex 31 | aggregation: !function utils.llava_complex_aggregation 32 | higher_is_better: true 33 | metadata: 34 | version: 0.0 35 | gpt_eval_model_name: "gpt-4-0613" 36 | model_specific_prompt_kwargs: 37 | default: 38 | pre_prompt: "" 39 | post_prompt: "" 40 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse.yaml: -------------------------------------------------------------------------------- 1 | group: mathverse 2 | task: 3 | - mathverse_testmini 4 | - mathverse_testmini_text_only 5 | - mathverse_testmini_text_lite 6 | - mathverse_testmini_text_dominant 7 | - mathverse_testmini_vision_intensive 8 | - mathverse_testmini_vision_dominant 9 | - mathverse_testmini_vision_only 10 | metadata: 11 | version: 0.0 12 | gpt_eval_model_name: "gpt-3.5-turbo" 13 | trunk_response: 30 14 | quick_match: false -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini" 6 | test_split: testmini 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 
29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_text_dominant.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_version_split 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_text_dominant" 6 | test_split: text_dominant 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_text_lite.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_version_split 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_text_lite" 6 | test_split: text_lite 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_text_only.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_text_only 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_text_only" 6 | test_split: text_only 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - 
"ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_vision_dominant.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_version_split 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_vision_dominant" 6 | test_split: vision_dominant 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_vision_intensive.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_version_split 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_vision_intensive" 6 | test_split: vision_intensive 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- 
/lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_vision_only.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_version_split 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_vision_only" 6 | test_split: vision_only 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathvista/mathvista.yaml: -------------------------------------------------------------------------------- 1 | group: mathvista 2 | task: 3 | - mathvista_testmini 4 | - mathvista_test 5 | metadata: 6 | version: 0.0 7 | gpt_eval_model_name: "gpt-4-0613" 8 | quick_extract: false -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathvista/mathvista_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: AI4Math/MathVista 2 | dataset_kwargs: 3 | token: True 4 | task: "mathvista_test" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.mathvista_doc_to_visual 8 | doc_to_text: !function utils.mathvista_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 1024 14 | temperature: 0 15 | top_p: 0 16 | num_beams: 1 17 | do_sample: false 18 | process_results: !function utils.mathvista_process_results 19 | metric_list: 20 | - metric: submission 21 | aggregation: !function utils.mathvista_aggregate_results 22 | higher_is_better: true 23 | 24 | model_specific_prompt_kwargs: 25 | default: 26 | shot_type: "format-prompt" # can be "reason-first", "solution", "step-by-step" 27 | model_specific_generation_kwargs: 28 | llava: 29 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathvista/mathvista_testmini.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: AI4Math/MathVista 2 | dataset_kwargs: 3 | token: True 4 | task: "mathvista_testmini" 5 | test_split: testmini 6 | output_type: generate_until 7 | doc_to_visual: !function utils.mathvista_doc_to_visual 8 | doc_to_text: !function utils.mathvista_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 1024 14 | temperature: 0 15 | top_p: 0 16 | num_beams: 1 17 | do_sample: false 18 | process_results: !function utils.mathvista_process_results 19 | metric_list: 20 | - metric: gpt_eval_score 21 | 
aggregation: !function utils.mathvista_aggregate_results 22 | higher_is_better: true 23 | 24 | model_specific_prompt_kwargs: 25 | default: 26 | shot_type: "format-prompt" # can be "reason-first", "solution", "step-by-step" 27 | phi3v: 28 | shot_type: "solution" 29 | model_specific_generation_kwargs: 30 | llava: 31 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/_default_template_mmbench_cn_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMBench 2 | dataset_kwargs: 3 | token: True 4 | doc_to_target: "answer" 5 | dataset_name: "cn" 6 | output_type: generate_until 7 | doc_to_visual: !function cn_utils.mmbench_doc_to_visual 8 | doc_to_text: !function cn_utils.mmbench_doc_to_text 9 | generation_kwargs: 10 | max_new_tokens: 256 11 | temperature: 0 12 | top_p: 0 13 | num_beams: 1 14 | do_sample: false 15 | process_results: !function cn_utils.mmbench_process_results 16 | model_specific_prompt_kwargs: 17 | default: 18 | pre_prompt: "" 19 | post_prompt: "\n请直接使用所提供的选项字母作为答案回答。" 20 | model_specific_generation_kwargs: 21 | llava: 22 | image_aspect_ratio: original 23 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/_default_template_mmbench_en_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMBench 2 | dataset_kwargs: 3 | token: True 4 | doc_to_target: "answer" 5 | model_specific_prompt_kwargs: 6 | default: 7 | pre_prompt: "" 8 | post_prompt: "\nAnswer with the option's letter from the given choices directly." 9 | doc_to_visual: !function en_utils.mmbench_doc_to_visual 10 | doc_to_text: !function en_utils.mmbench_doc_to_text 11 | doc_to_target: "answer" 12 | process_results: !function en_utils.mmbench_process_results 13 | model_specific_generation_kwargs: 14 | llava: 15 | image_aspect_ratio: original 16 | output_type: generate_until 17 | dataset_name: "en" 18 | generation_kwargs: 19 | until: 20 | - "ASSISTANT:" 21 | max_new_tokens: 1024 22 | temperature: 0 23 | top_p: 0 24 | num_beams: 1 25 | do_sample: false 26 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench.yaml: -------------------------------------------------------------------------------- 1 | group: mmbench 2 | task: 3 | - mmbench_en_dev 4 | - mmbench_en_test 5 | - mmbench_cn_dev 6 | - mmbench_cn_test 7 | - mmbench_cn_cc 8 | metadata: 9 | version: 0.0 10 | sys_prompt: "There are several options:" 11 | gpt_eval_model_name: "gpt-3.5-turbo-0613" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_cc.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMBench 2 | dataset_name: cc 3 | dataset_kwargs: 4 | token: True 5 | task: "mmbench_cn_cc" 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function cc_utils.mmbench_doc_to_visual 9 | doc_to_text: !function cc_utils.mmbench_cn_cc_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 256 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function cc_utils.mmbench_cn_cc_process_results 18 | metric_list: 19 | - metric: gpt_eval_score 20 | aggregation: !function 
cc_utils.mmbench_cn_cc_aggregate_dev_results_eval 21 | higher_is_better: true 22 | - metric: submission 23 | aggregation: !function cc_utils.mmbench_cn_cc_aggregate_results 24 | metadata: 25 | version: 0.0 26 | gpt_eval_model_name: "gpt-3.5-turbo-0613" 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | pre_prompt: "" 31 | post_prompt: "\n请直接使用所提供的选项字母作为答案回答。" 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_cn.yaml: -------------------------------------------------------------------------------- 1 | group: mmbench_cn 2 | task: 3 | - mmbench_cn_dev 4 | - mmbench_cn_test 5 | - mmbench_cn_cc 6 | metadata: 7 | version: 0.0 8 | gpt_eval_model_name: "gpt-3.5-turbo-0613" 9 | sys_prompt: "有如下几个选项:" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml: -------------------------------------------------------------------------------- 1 | task: "mmbench_cn_dev" 2 | test_split: "dev" 3 | metric_list: 4 | - metric: gpt_eval_score 5 | aggregation: !function cn_utils.mmbench_aggregate_dev_results_eval 6 | higher_is_better: true 7 | - metric: submission 8 | higher_is_better: true 9 | aggregation: !function cn_utils.mmbench_aggregate_dev_results 10 | include: _default_template_mmbench_cn_yaml 11 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_cn_test.yaml: -------------------------------------------------------------------------------- 1 | task: mmbench_cn_test 2 | test_split: test 3 | metric_list: 4 | - metric: submission 5 | aggregation: !function cn_utils.mmbench_aggregate_test_results 6 | higher_is_better: true 7 | include: _default_template_mmbench_cn_yaml 8 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_en.yaml: -------------------------------------------------------------------------------- 1 | group: mmbench_en 2 | task: 3 | - mmbench_en_dev 4 | - mmbench_en_test 5 | metadata: 6 | version: 0.0 7 | sys_prompt: "There are several options:" 8 | gpt_eval_model_name: "gpt-3.5-turbo-0613" 9 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_en_dev.yaml: -------------------------------------------------------------------------------- 1 | task: "mmbench_en_dev" 2 | test_split: dev 3 | include: _default_template_mmbench_en_yaml 4 | metric_list: 5 | - metric: gpt_eval_score 6 | aggregation: !function en_utils.mmbench_aggregate_dev_results_eval 7 | higher_is_better: true 8 | - metric: submission 9 | aggregation: !function en_utils.mmbench_aggregate_dev_results_submission 10 | higher_is_better: true -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_en_test.yaml: -------------------------------------------------------------------------------- 1 | task: "mmbench_en_test" 2 | test_split: test 3 | include: _default_template_mmbench_en_yaml 4 | metric_list: 5 | - metric: submission 6 | aggregation: !function en_utils.mmbench_aggregate_test_results 7 | higher_is_better: true 8 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mme/mme.yaml: 
-------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MME 2 | dataset_kwargs: 3 | token: True 4 | task: "mme" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.mme_doc_to_visual 8 | doc_to_text: !function utils.mme_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | max_new_tokens: 16 12 | temperature: 0 13 | top_p: 0 14 | num_beams: 1 15 | do_sample: false 16 | # The return value of process_results will be used by metrics 17 | process_results: !function utils.mme_process_results 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: mme_percetion_score 21 | aggregation: !function utils.mme_aggregate_results 22 | higher_is_better: true 23 | - metric: mme_cognition_score 24 | aggregation: !function utils.mme_aggregate_results 25 | higher_is_better: true 26 | model_specific_prompt_kwargs: 27 | default: 28 | pre_prompt: "" 29 | post_prompt: "\nAnswer the question using a single word or phrase." 30 | qwen_vl: 31 | pre_prompt: "" 32 | post_prompt: " Answer:" 33 | otterhd: 34 | pre_prompt: "" 35 | post_prompt: " Answer:" 36 | metadata: 37 | - version: 0.0 38 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmmu/mmmu.yaml: -------------------------------------------------------------------------------- 1 | group: mmmu 2 | task: 3 | - mmmu_val 4 | - mmmu_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmmu/mmmu_group_img.yaml: -------------------------------------------------------------------------------- 1 | group: mmmu_group_img 2 | task: 3 | - mmmu_val_group_img 4 | - mmmu_test_group_img 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmmu/mmmu_group_img_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMMU 2 | task: "mmmu_test_group_img" 3 | test_split: test 4 | output_type: generate_until 5 | doc_to_visual: !function utils_group_img.mmmu_doc_to_visual 6 | doc_to_text: !function utils_group_img.mmmu_doc_to_text 7 | doc_to_target: "answer" 8 | # The return value of process_results will be used by metrics 9 | process_results: !function utils_group_img.mmmu_process_results 10 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | image_aspect_ratio: original 14 | metric_list: 15 | - metric: submission 16 | aggregation: !function utils_group_img.mmmu_test_aggregate_results_for_submission 17 | higher_is_better: true 18 | metadata: 19 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmmu/mmmu_group_img_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMMU 2 | task: "mmmu_val_group_img" 3 | test_split: validation 4 | output_type: generate_until 5 | doc_to_visual: !function utils_group_img.mmmu_doc_to_visual 6 | doc_to_text: !function utils_group_img.mmmu_doc_to_text 7 | doc_to_target: "answer" 8 | # The return value of process_results will be used by metrics 9 | process_results: !function utils_group_img.mmmu_process_results 10 | # Note 
that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | model_specific_generation_kwargs: 14 | llava: 15 | image_aspect_ratio: original 16 | metric_list: 17 | - metric: mmmu_acc 18 | aggregation: !function utils_group_img.mmmu_aggregate_results 19 | higher_is_better: true 20 | metadata: 21 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmmu/mmmu_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMMU 2 | task: "mmmu_test" 3 | test_split: test 4 | output_type: generate_until 5 | doc_to_visual: !function utils.mmmu_doc_to_visual 6 | doc_to_text: !function utils.mmmu_doc_to_text 7 | doc_to_target: "answer" 8 | # The return value of process_results will be used by metrics 9 | process_results: !function utils.mmmu_process_results 10 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | image_aspect_ratio: original 14 | metric_list: 15 | - metric: submission 16 | aggregation: !function utils.mmmu_test_aggregate_results_for_submission 17 | higher_is_better: true 18 | metadata: 19 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmmu/mmmu_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMMU 2 | task: "mmmu_val" 3 | test_split: validation 4 | output_type: generate_until 5 | doc_to_visual: !function utils.mmmu_doc_to_visual 6 | doc_to_text: !function utils.mmmu_doc_to_text 7 | doc_to_target: "answer" 8 | # The return value of process_results will be used by metrics 9 | process_results: !function utils.mmmu_process_results 10 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | model_specific_generation_kwargs: 14 | llava: 15 | image_aspect_ratio: original 16 | metric_list: 17 | - metric: mmmu_acc 18 | aggregation: !function utils.mmmu_aggregate_results 19 | higher_is_better: true 20 | metadata: 21 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmvet/mmvet.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMVet 2 | dataset_kwargs: 3 | token: True 4 | task: "mmvet" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.mmvet_doc_to_visual 8 | doc_to_text: !function utils.doc_to_text # Such that {{question}} will be replaced by doc["question"] 9 | doc_to_target: "{{answer}}" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 1024 14 | temperature: 0 15 | top_p: 0 16 | num_beams: 1 17 | do_sample: false 18 | process_results: !function utils.mmvet_process_results # apply gpt eval here 19 | metric_list: 20 | - metric: gpt_eval_score 21 | aggregation: !function utils.mmvet_aggregate_results 22 | higher_is_better: true 23 | metadata: 24 | version: 0.0 25 | gpt_eval_model_name: "gpt-4" 26 | model_specific_prompt_kwargs: 27 | default: 28 | pre_prompt: "" 29 | post_prompt: "" 30 |
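The `!function utils.<name>` hooks referenced in the task YAMLs above resolve to Python callables in each task's `utils.py`. As a rough sketch of the contract these configs rely on (function names and result fields here are hypothetical, not the actual MME/MMMU/MM-Vet implementations): `process_results` returns a dict keyed by the metric names listed under `metric_list`, and the paired aggregation function reduces those per-document entries to a single reported score.

```python
# Illustrative sketch of the process_results / aggregation contract assumed by
# the task YAMLs above. Names and fields are hypothetical; the real hooks live
# in each task's utils.py.
def example_process_results(doc, results):
    # `results` holds the model's generations for this document; return a dict
    # keyed by the metric name(s) declared under metric_list in the YAML.
    prediction = results[0].strip() if results else ""
    return {"example_score": {"question_id": doc.get("question_id"), "prediction": prediction}}


def example_aggregate_results(per_doc_entries):
    # Receives the list of per-document values produced above and reduces them
    # to the scalar reported for the metric (here: fraction of non-empty answers).
    if not per_doc_entries:
        return 0.0
    return sum(1 for e in per_doc_entries if e["prediction"]) / len(per_doc_entries)
```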
-------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multidocvqa/multidocvqa.yaml: -------------------------------------------------------------------------------- 1 | group: multidocvqa 2 | task: 3 | - multidocvqa_val 4 | - multidocvqa_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multidocvqa/multidocvqa_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MP-DocVQA 2 | task: "multidocvqa_test" 3 | test_split: test 4 | output_type: generate_until 5 | doc_to_visual: !function utils.multidocvqa_doc_to_visual 6 | doc_to_text: !function utils.multidocvqa_doc_to_text 7 | doc_to_target: "answers" 8 | generation_kwargs: 9 | max_new_tokens: 32 10 | temperature: 0 11 | do_sample: False 12 | process_results: !function utils.multidocvqa_process_test_results_for_submission 13 | metric_list: 14 | - metric: submission 15 | aggregation: !function utils.multidocvqa_test_aggregate_results_for_submission 16 | model_specific_prompt_kwargs: 17 | default: 18 | pre_prompt: "" 19 | post_prompt: "\nAnswer the question using a single word or phrase." 20 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multidocvqa/multidocvqa_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MP-DocVQA 2 | task: "multidocvqa_val" 3 | test_split: val 4 | output_type: generate_until 5 | doc_to_visual: !function utils.multidocvqa_doc_to_visual 6 | doc_to_text: !function utils.multidocvqa_doc_to_text 7 | doc_to_target: "answers" 8 | generation_kwargs: 9 | max_new_tokens: 32 10 | temperature: 0 11 | do_sample: False 12 | process_results: !function utils.multidocvqa_process_results 13 | metric_list: 14 | - metric: anls 15 | aggregation: !function utils.multidocvqa_aggregate_results_anls 16 | higher_is_better: true 17 | - metric: accuracy 18 | aggregation: !function utils.multidocvqa_aggregate_results_accuracy 19 | higher_is_better: true 20 | model_specific_prompt_kwargs: 21 | default: 22 | pre_prompt: "" 23 | post_prompt: "\nAnswer the question using a single word or phrase." 
24 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/_default_template.yaml: -------------------------------------------------------------------------------- 1 | test_split: train 2 | output_type: generate_until 3 | doc_to_visual: !function utils.llava_doc_to_visual 4 | doc_to_text: !function utils.llava_doc_to_text 5 | doc_to_target: "gpt_answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | image_aspect_ratio: original 10 | max_new_tokens: 1024 11 | temperature: 0 12 | top_p: 0 13 | num_beams: 1 14 | do_sample: false 15 | process_results: !function utils.llava_process_results 16 | metric_list: 17 | - metric: gpt_eval_llava_all 18 | aggregation: !function utils.llava_all_aggregation 19 | higher_is_better: true 20 | - metric: gpt_eval_llava_conv 21 | aggregation: !function utils.llava_conv_aggregation 22 | higher_is_better: true 23 | - metric: gpt_eval_llava_detail 24 | aggregation: !function utils.llava_detail_aggregation 25 | higher_is_better: true 26 | - metric: gpt_eval_llava_complex 27 | aggregation: !function utils.llava_complex_aggregation 28 | higher_is_better: true 29 | metadata: 30 | version: 0.0 31 | gpt_eval_model_name: "gpt-4-0613" 32 | model_specific_prompt_kwargs: 33 | default: 34 | pre_prompt: "" 35 | post_prompt: "" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/arabic_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: arabic 4 | token: True 5 | task: "llava_in_the_wild_arabic" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/bengali_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: bengali 4 | token: True 5 | task: "llava_in_the_wild_bengali" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/chinese_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: chinese 4 | token: True 5 | task: "llava_in_the_wild_chinese" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/french_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: french 4 | token: True 5 | task: "llava_in_the_wild_french" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/hindi_llava_in_the_wild.yaml: 
-------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: hindi 4 | token: True 5 | task: "llava_in_the_wild_hindi" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/japanese_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: japanese 4 | token: True 5 | task: "llava_in_the_wild_japanese" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/russian_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: russian 4 | token: True 5 | task: "llava_in_the_wild_russian" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/spanish_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: spanish 4 | token: True 5 | task: "llava_in_the_wild_spanish" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/urdu_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: "gagan3012/multilingual-llava-bench" 2 | dataset_kwargs: 3 | config: urdu 4 | token: True 5 | task: "llava_in_the_wild_urdu" 6 | include: _default_template.yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/nocaps/_default_template_nocaps_yaml: -------------------------------------------------------------------------------- 1 | model_specific_prompt_kwargs: 2 | default: 3 | prompt: "Provide a one-sentence caption for the provided image." 
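Each per-language config above (arabic through urdu) supplies only its dataset config name and task id and pulls everything else from `_default_template.yaml` via `include:`. A minimal sketch of that override semantics follows; this is not lmms-eval's actual config loader, only an illustration under simplified assumptions.

```python
# Minimal sketch of `include:`-style config merging, as used by the per-language
# llava-in-the-wild YAMLs above. NOT lmms-eval's actual loader; it only shows
# that task-specific keys override the shared template. (The real task files
# also use custom !function tags, which yaml.safe_load would not accept.)
import yaml

def load_task_config(path):
    with open(path) as f:
        cfg = yaml.safe_load(f) or {}
    template = cfg.pop("include", None)
    if template:
        merged = load_task_config(template)  # resolve the shared template first
        merged.update(cfg)                   # then let task-specific keys win
        cfg = merged
    return cfg
```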
-------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/nocaps/nocaps.yaml: -------------------------------------------------------------------------------- 1 | group : nocaps 2 | task: 3 | - nocaps_test 4 | - nocaps_val -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/nocaps/nocaps_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/NoCaps 2 | dataset_kwargs: 3 | token: True 4 | task : "nocaps_test" 5 | group : "nocaps_caption" 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.nocaps_doc_to_visual 9 | doc_to_text: !function utils.nocaps_doc_to_text 10 | doc_to_target: "annotations_captions" 11 | generation_kwargs: 12 | max_new_tokens: 64 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.nocaps_test_process_result 18 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: nocaps_passthrough 21 | aggregation : !function utils.nocaps_test_aggregation_result 22 | higher_is_better : true 23 | metadata: 24 | - version: 0.0 25 | include: _default_template_nocaps_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ocrbench/ocrbench.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: echo840/OCRBench 2 | dataset_kwargs: 3 | token: True 4 | task: "ocrbench" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.ocrbench_doc_to_visual 8 | doc_to_text: !function utils.ocrbench_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | max_new_tokens: 128 12 | temperature: 0 13 | top_p: 0 14 | num_beams: 1 15 | do_sample: false 16 | process_results: !function utils.ocrbench_process_results 17 | metric_list: 18 | - metric: ocrbench_accuracy 19 | aggregation: !function utils.ocrbench_aggregate_accuracy 20 | higher_is_better: true 21 | metadata: 22 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ok_vqa/_default_template_vqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/OK-VQA 2 | output_type: generate_until 3 | doc_to_visual: !function utils.ok_vqa_doc_to_visual 4 | doc_to_text: !function utils.ok_vqa_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | metric_list: 10 | - metric: exact_match 11 | aggregation: mean 12 | higher_is_better: true 13 | ignore_case: true 14 | ignore_punctuation: true 15 | - metric: submission 16 | aggregation: !function utils.ok_vqa_aggreate_submissions 17 | higher_is_better: true 18 | process_results: !function utils.ok_vqa_process_results 19 | model_specific_prompt_kwargs: 20 | default: 21 | pre_prompt: "" 22 | post_prompt: "\nWhen the provided information is insufficient, respond with 'Unanswerable'.\nAnswer the question using a single word or phrase."
23 | metadata: 24 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ok_vqa/_generate_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | splits = ["val2014"] 5 | tasks = ["vqa"] 6 | 7 | if __name__ == "__main__": 8 | dump_tasks = [] 9 | for task in tasks: 10 | for split in splits: 11 | yaml_dict = {"group": f"ok_vqa", "task": f"ok_vqa_{split}", "include": f"_default_template_{task}_yaml", "test_split": split} 12 | if split == "train": 13 | yaml_dict.pop("group") 14 | else: 15 | dump_tasks.append(f"ok_vqa_{split}") 16 | 17 | save_path = f"./ok_vqa_{split}.yaml" 18 | print(f"Saving to {save_path}") 19 | with open(save_path, "w") as f: 20 | yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) 21 | 22 | group_dict = {"group": "ok_vqa", "task": dump_tasks} 23 | 24 | with open("./_ok_vqa.yaml", "w") as f: 25 | yaml.dump(group_dict, f, default_flow_style=False, indent=4) 26 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ok_vqa/_ok_vqa.yaml: -------------------------------------------------------------------------------- 1 | group: ok_vqa 2 | task: 3 | - ok_vqa_val2014 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ok_vqa/ok_vqa_val2014.yaml: -------------------------------------------------------------------------------- 1 | group: ok_vqa 2 | task: ok_vqa_val2014 3 | test_split: val2014 4 | include: _default_template_vqa_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/olympiadbench/olympiadbench.yaml: -------------------------------------------------------------------------------- 1 | group: olympiadbench 2 | task: 3 | - olympiadbench_test_en 4 | - olympiadbench_test_cn 5 | metadata: 6 | - version: 0.0 7 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/olympiadbench/olympiadbench_test_cn.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/OlympiadBench 2 | dataset_kwargs: 3 | token: True 4 | task : "olympiadbench_test_cn" 5 | test_split: test_cn 6 | output_type: generate_until 7 | doc_to_visual: !function cn_utils.olympiadbench_doc_to_visual 8 | doc_to_text: !function cn_utils.olympiadbench_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 1024 14 | temperature: 0 15 | top_p: 0 16 | num_beams: 1 17 | do_sample: false 18 | process_results: !function cn_utils.olympiadbench_process_results 19 | metric_list: 20 | - metric: submission 21 | aggregation: !function cn_utils.olympiadbench_aggregate_results 22 | higher_is_better: true 23 | - metric: exact_match 24 | aggregation: mean 25 | higher_is_better: true -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/olympiadbench/olympiadbench_test_en.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/OlympiadBench 2 | dataset_kwargs: 3 | token: True 4 | task : "olympiadbench_test_en" 5 | test_split: test_en 6 | output_type: generate_until 7 | doc_to_visual: !function en_utils.olympiadbench_doc_to_visual 8 | doc_to_text: !function en_utils.olympiadbench_doc_to_text 9 | doc_to_target: "answer" 10 | 
generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 1024 14 | temperature: 0 15 | top_p: 0 16 | num_beams: 1 17 | do_sample: false 18 | process_results: !function en_utils.olympiadbench_process_results 19 | metric_list: 20 | - metric: submission 21 | aggregation: !function en_utils.olympiadbench_aggregate_results 22 | higher_is_better: true 23 | - metric: exact_match 24 | aggregation: mean 25 | higher_is_better: true -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/pope/pope.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/POPE 2 | dataset_kwargs: 3 | token: True 4 | task: "pope" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.pope_doc_to_visual 8 | doc_to_text: !function utils.pope_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | max_new_tokens: 128 12 | temperature: 0 13 | top_p: 0 14 | num_beams: 1 15 | do_sample: false 16 | process_results: !function utils.pope_process_results 17 | metric_list: 18 | - metric: pope_accuracy 19 | aggregation: !function utils.pope_aggregate_accuracy 20 | higher_is_better: true 21 | - metric: pope_precision 22 | aggregation: !function utils.pope_aggregate_precision 23 | higher_is_better: true 24 | - metric: pope_recall 25 | aggregation: !function utils.pope_aggregate_recall 26 | higher_is_better: true 27 | - metric: pope_f1_score 28 | aggregation: !function utils.pope_aggregate_f1_score 29 | higher_is_better: true 30 | - metric: pope_yes_ratio 31 | aggregation: !function utils.pope_aggregate_yes_ratio 32 | higher_is_better: true 33 | metadata: 34 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/pope/pope_adv.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/POPE 2 | dataset_name: Full 3 | dataset_kwargs: 4 | token: True 5 | task: "pope_adv" 6 | test_split: adversarial 7 | output_type: generate_until 8 | doc_to_visual: !function utils.pope_doc_to_visual 9 | doc_to_text: !function utils.pope_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 128 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.pope_process_results 18 | metric_list: 19 | - metric: pope_accuracy 20 | aggregation: !function utils.pope_aggregate_accuracy 21 | higher_is_better: true 22 | - metric: pope_precision 23 | aggregation: !function utils.pope_aggregate_precision 24 | higher_is_better: true 25 | - metric: pope_recall 26 | aggregation: !function utils.pope_aggregate_recall 27 | higher_is_better: true 28 | - metric: pope_f1_score 29 | aggregation: !function utils.pope_aggregate_f1_score 30 | higher_is_better: true 31 | - metric: pope_yes_ratio 32 | aggregation: !function utils.pope_aggregate_yes_ratio 33 | higher_is_better: true 34 | metadata: 35 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/pope/pope_full.yaml: -------------------------------------------------------------------------------- 1 | group : pope_full 2 | task: 3 | - pope_adv 4 | - pope_pop 5 | - pope_random -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/pope/pope_pop.yaml: -------------------------------------------------------------------------------- 1 | 
dataset_path: lmms-lab/POPE 2 | dataset_name: Full 3 | dataset_kwargs: 4 | token: True 5 | task: "pope_pop" 6 | test_split: popular 7 | output_type: generate_until 8 | doc_to_visual: !function utils.pope_doc_to_visual 9 | doc_to_text: !function utils.pope_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 128 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.pope_process_results 18 | metric_list: 19 | - metric: pope_accuracy 20 | aggregation: !function utils.pope_aggregate_accuracy 21 | higher_is_better: true 22 | - metric: pope_precision 23 | aggregation: !function utils.pope_aggregate_precision 24 | higher_is_better: true 25 | - metric: pope_recall 26 | aggregation: !function utils.pope_aggregate_recall 27 | higher_is_better: true 28 | - metric: pope_f1_score 29 | aggregation: !function utils.pope_aggregate_f1_score 30 | higher_is_better: true 31 | - metric: pope_yes_ratio 32 | aggregation: !function utils.pope_aggregate_yes_ratio 33 | higher_is_better: true 34 | metadata: 35 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/pope/pope_random.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/POPE 2 | dataset_name: Full 3 | dataset_kwargs: 4 | token: True 5 | task: "pope_random" 6 | test_split: random 7 | output_type: generate_until 8 | doc_to_visual: !function utils.pope_doc_to_visual 9 | doc_to_text: !function utils.pope_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 128 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.pope_process_results 18 | metric_list: 19 | - metric: pope_accuracy 20 | aggregation: !function utils.pope_aggregate_accuracy 21 | higher_is_better: true 22 | - metric: pope_precision 23 | aggregation: !function utils.pope_aggregate_precision 24 | higher_is_better: true 25 | - metric: pope_recall 26 | aggregation: !function utils.pope_aggregate_recall 27 | higher_is_better: true 28 | - metric: pope_f1_score 29 | aggregation: !function utils.pope_aggregate_f1_score 30 | higher_is_better: true 31 | - metric: pope_yes_ratio 32 | aggregation: !function utils.pope_aggregate_yes_ratio 33 | higher_is_better: true 34 | metadata: 35 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/realworldqa/realworldqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RealWorldQA 2 | dataset_kwargs: 3 | token: True 4 | task: "realworldqa" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.realworldqa_doc_to_visual 8 | doc_to_text: !function utils.realworldqa_doc_to_text 9 | doc_to_target: "answer" 10 | 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | 18 | filter_list: 19 | - name: "flexible-extract" 20 | filter: 21 | - function: !function utils.NumberWordsToDigitsFilter 22 | - function: !function utils.MultiChoiceRegexFilter 23 | group_select: 0 24 | ignore_case: true 25 | ignore_punctuation: true 26 | regex_pattern: "(\\([A-Z]\\))" 27 | 28 | metric_list: 29 | - metric: exact_match 30 | aggregation: mean 31 | higher_is_better: true 32 | ignore_case: true 33 | ignore_punctuation: true 34 | 35 | 
model_specific_prompt_kwargs: 36 | default: 37 | pre_prompt: "" 38 | post_prompt: "" 39 | gpt4v: 40 | pre_prompt: "" 41 | post_prompt: "" 42 | metadata: 43 | - version: 0.0 44 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/_default_template_bbox_rec_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOPlus 2 | output_type: generate_until 3 | process_docs: !function utils_rec.refcoco_bbox_rec_preprocess_dataset 4 | doc_to_visual: !function utils_rec.refcoco_bbox_rec_doc_to_visual 5 | doc_to_text: !function utils_rec.refcoco_bbox_rec_doc_to_text 6 | doc_to_target: "bbox" 7 | generation_kwargs: 8 | until: 9 | - "ASSISTANT:" 10 | process_results: !function utils_rec.refcoco_bbox_rec_process_result 11 | metric_list: 12 | - metric: refcoco_IoU 13 | aggregation : !function utils_rec.refcoco_bbox_rec_iou 14 | higher_is_better : true 15 | - metric: refcoco_ACC@0.1 16 | aggregation : !function utils_rec.refcoco_bbox_rec_acc01 17 | higher_is_better : true 18 | - metric: refcoco_ACC@0.3 19 | aggregation : !function utils_rec.refcoco_bbox_rec_acc03 20 | higher_is_better : true 21 | - metric: refcoco_ACC@0.5 22 | aggregation : !function utils_rec.refcoco_bbox_rec_acc05 23 | higher_is_better : true 24 | - metric: refcoco_ACC@0.7 25 | aggregation : !function utils_rec.refcoco_bbox_rec_acc07 26 | higher_is_better : true 27 | - metric: refcoco_ACC@0.9 28 | aggregation : !function utils_rec.refcoco_bbox_rec_acc09 29 | higher_is_better : true 30 | - metric: refcoco_Center_ACC 31 | aggregation : !function utils_rec.refcoco_bbox_rec_center_acc 32 | higher_is_better : true 33 | metadata: 34 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/_default_template_bbox_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOplus 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_bbox_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/_default_template_seg_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOplus 2 | output_type: generate_until 3 | doc_to_visual: !function 
utils.refcoco_seg_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/_generate_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | # splits = ["train", "val", "testA", "testB"] 5 | splits = ["val", "testA", "testB"] 6 | tasks = ["seg", "bbox"] 7 | 8 | if __name__ == "__main__": 9 | dump_tasks = [] 10 | for task in tasks: 11 | for split in splits: 12 | yaml_dict = {"group": f"refcoco+_{task}", "task": f"refcoco+_{task}_{split}", "include": f"_default_template_{task}_yaml", "test_split": split} 13 | if split == "train": 14 | yaml_dict.pop("group") 15 | else: 16 | dump_tasks.append(f"refcoco+_{task}_{split}") 17 | 18 | save_path = f"./refcoco+_{task}_{split}.yaml" 19 | print(f"Saving to {save_path}") 20 | with open(save_path, "w") as f: 21 | yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) 22 | 23 | group_dict = {"group": "refcoco+", "task": dump_tasks} 24 | 25 | with open("./_refcoco.yaml", "w") as f: 26 | yaml.dump(group_dict, f, default_flow_style=False, indent=4) 27 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/_refcoco.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+ 2 | task: 3 | - refcoco+_seg_val 4 | - refcoco+_seg_testA 5 | - refcoco+_seg_testB 6 | - refcoco+_bbox_val 7 | - refcoco+_bbox_testA 8 | - refcoco+_bbox_testB 9 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_rec_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox_rec 2 | task: refcoco+_bbox_rec_testA 3 | include: _default_template_bbox_rec_yaml 4 | test_split: testA 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_rec_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox_rec 2 | task: refcoco+_bbox_rec_testB 3 | include: _default_template_bbox_rec_yaml 4 | test_split: testB 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_rec_val.yaml:
-------------------------------------------------------------------------------- 1 | group: refcoco+_bbox_rec 2 | task: refcoco+_bbox_rec_val 3 | include: _default_template_bbox_rec_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox 2 | task: refcoco+_bbox_testA 3 | include: _default_template_bbox_yaml 4 | test_split: testA 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox 2 | task: refcoco+_bbox_testB 3 | include: _default_template_bbox_yaml 4 | test_split: testB 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox 2 | task: refcoco+_bbox_val 3 | include: _default_template_bbox_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_seg_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_seg 2 | task: refcoco+_seg_testA 3 | include: _default_template_seg_yaml 4 | test_split: testA 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_seg_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_seg 2 | task: refcoco+_seg_testB 3 | include: _default_template_seg_yaml 4 | test_split: testB 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_seg_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_seg 2 | task: refcoco+_seg_val 3 | include: _default_template_seg_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/_default_template_bbox_rec_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCO 2 | output_type: generate_until 3 | process_docs: !function utils_rec.refcoco_bbox_rec_preprocess_dataset 4 | doc_to_visual: !function utils_rec.refcoco_bbox_rec_doc_to_visual 5 | doc_to_text: !function utils_rec.refcoco_bbox_rec_doc_to_text 6 | doc_to_target: "bbox" 7 | generation_kwargs: 8 | until: 9 | - "ASSISTANT:" 10 | process_results: !function utils_rec.refcoco_bbox_rec_process_result 11 | metric_list: 12 | - metric: refcoco_IoU 13 | aggregation : !function utils_rec.refcoco_bbox_rec_iou 14 | higher_is_better : true 15 | - metric: refcoco_ACC@0.1 16 | aggregation : !function utils_rec.refcoco_bbox_rec_acc01 17 | higher_is_better : true 18 | - metric: refcoco_ACC@0.3 19 | aggregation : !function utils_rec.refcoco_bbox_rec_acc03 20 | higher_is_better : true 21 | - metric: refcoco_ACC@0.5 22 | aggregation : !function utils_rec.refcoco_bbox_rec_acc05 23 | higher_is_better : true 24 | - metric: refcoco_ACC@0.7 25 | aggregation : !function utils_rec.refcoco_bbox_rec_acc07 26 | higher_is_better : true 27 | - 
metric: refcoco_ACC@0.9 28 | aggregation : !function utils_rec.refcoco_bbox_rec_acc09 29 | higher_is_better : true 30 | - metric: refcoco_Center_ACC 31 | aggregation : !function utils_rec.refcoco_bbox_rec_center_acc 32 | higher_is_better : true 33 | metadata: 34 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/_default_template_bbox_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCO 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_bbox_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/_default_template_seg_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCO 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_seg_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/_generate_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | # splits = ["train", "test", "val", "testA", "testB"] 5 | splits = ["test", 
"val", "testA", "testB"] 6 | tasks = ["seg", "bbox"] 7 | 8 | if __name__ == "__main__": 9 | dump_tasks = [] 10 | for task in tasks: 11 | for split in splits: 12 | yaml_dict = {"group": f"refcoco_{task}", "task": f"refcoco_{task}_{split}", "test_split": split, "include": f"_default_template_{task}_yaml"} 13 | if split == "train": 14 | yaml_dict.pop("group") 15 | else: 16 | dump_tasks.append(f"refcoco_{task}_{split}") 17 | 18 | save_path = f"./refcoco_{task}_{split}.yaml" 19 | print(f"Saving to {save_path}") 20 | with open(save_path, "w") as f: 21 | yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) 22 | 23 | group_dict = {"group": "refcoco", "task": dump_tasks} 24 | 25 | with open("./_refcoco.yaml", "w") as f: 26 | yaml.dump(group_dict, f, default_flow_style=False, indent=4) 27 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/_refcoco.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco 2 | task: 3 | - refcoco_seg_test 4 | - refcoco_seg_val 5 | - refcoco_seg_testA 6 | - refcoco_seg_testB 7 | - refcoco_bbox_test 8 | - refcoco_bbox_val 9 | - refcoco_bbox_testA 10 | - refcoco_bbox_testB 11 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_rec_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox_rec 2 | task: refcoco_bbox_rec_test 3 | test_split: test 4 | include: _default_template_bbox_rec_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_rec_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox_rec 2 | task: refcoco_bbox_rec_testA 3 | test_split: testA 4 | include: _default_template_bbox_rec_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_rec_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox_rec 2 | task: refcoco_bbox_rec_testB 3 | test_split: testB 4 | include: _default_template_bbox_rec_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_rec_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox_rec 2 | task: refcoco_bbox_rec_val 3 | test_split: val 4 | include: _default_template_bbox_rec_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox 2 | task: refcoco_bbox_test 3 | test_split: test 4 | include: _default_template_bbox_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox 2 | task: refcoco_bbox_testA 3 | test_split: testA 4 | include: _default_template_bbox_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_testB.yaml: 
-------------------------------------------------------------------------------- 1 | group: refcoco_bbox 2 | task: refcoco_bbox_testB 3 | test_split: testB 4 | include: _default_template_bbox_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox 2 | task: refcoco_bbox_val 3 | test_split: val 4 | include: _default_template_bbox_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_seg 2 | task: refcoco_seg_test 3 | test_split: test 4 | include: _default_template_seg_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_seg 2 | task: refcoco_seg_testA 3 | test_split: testA 4 | include: _default_template_seg_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_seg 2 | task: refcoco_seg_testB 3 | test_split: testB 4 | include: _default_template_seg_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_seg 2 | task: refcoco_seg_val 3 | test_split: val 4 | include: _default_template_seg_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/_default_template_bbox_rec_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOg 2 | output_type: generate_until 3 | process_docs: !function utils_rec.refcoco_bbox_rec_preprocess_dataset 4 | doc_to_visual: !function utils_rec.refcoco_bbox_rec_doc_to_visual 5 | doc_to_text: !function utils_rec.refcoco_bbox_rec_doc_to_text 6 | doc_to_target: "bbox" 7 | generation_kwargs: 8 | until: 9 | - "ASSISTANT:" 10 | process_results: !function utils_rec.refcoco_bbox_rec_process_result 11 | metric_list: 12 | - metric: refcoco_IoU 13 | aggregation : !function utils_rec.refcoco_bbox_rec_iou 14 | higher_is_better : true 15 | - metric: refcoco_ACC@0.1 16 | aggregation : !function utils_rec.refcoco_bbox_rec_acc01 17 | higher_is_better : true 18 | - metric: refcoco_ACC@0.3 19 | aggregation : !function utils_rec.refcoco_bbox_rec_acc03 20 | higher_is_better : true 21 | - metric: refcoco_ACC@0.5 22 | aggregation : !function utils_rec.refcoco_bbox_rec_acc05 23 | higher_is_better : true 24 | - metric: refcoco_ACC@0.7 25 | aggregation : !function utils_rec.refcoco_bbox_rec_acc07 26 | higher_is_better : true 27 | - metric: refcoco_ACC@0.9 28 | aggregation : !function utils_rec.refcoco_bbox_rec_acc09 29 | higher_is_better : true 30 | - metric: refcoco_Center_ACC 31 | aggregation : !function utils_rec.refcoco_bbox_rec_center_acc 32 | higher_is_better : true 33 | metadata: 34 | version: '0.0' -------------------------------------------------------------------------------- 
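The `_default_template_bbox_rec_yaml` files above (for refcoco, refcoco+, and refcocog) all score referring-expression grounding with an IoU metric, ACC@{0.1..0.9} thresholds, and a center-hit accuracy. A rough sketch of what such scoring looks like is given below; it is illustrative only, not the exact `utils_rec.py` implementation the configs bind to.

```python
# Illustrative IoU / ACC@threshold scoring for the bbox REC metrics configured
# above; not the exact utils_rec.py implementation.
def box_iou(pred, gold):
    # Boxes are (x1, y1, x2, y2) in the same coordinate system.
    ix1, iy1 = max(pred[0], gold[0]), max(pred[1], gold[1])
    ix2, iy2 = min(pred[2], gold[2]), min(pred[3], gold[3])
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    union = ((pred[2] - pred[0]) * (pred[3] - pred[1])
             + (gold[2] - gold[0]) * (gold[3] - gold[1]) - inter)
    return inter / union if union > 0 else 0.0


def acc_at_threshold(pairs, threshold):
    # Fraction of (pred, gold) box pairs whose IoU meets the threshold,
    # e.g. threshold=0.5 corresponds to the ACC@0.5 metric.
    return sum(box_iou(p, g) >= threshold for p, g in pairs) / max(len(pairs), 1)
```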
/lmms-eval/lmms_eval/tasks/refcocog/_default_template_bbox_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOg 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_bbox_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/_default_template_seg_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOg 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_seg_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/_generate_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | # splits = ["train", "test", "val"] 5 | splits = ["test", "val"] 6 | tasks = ["seg", "bbox"] 7 | 8 | if __name__ == "__main__": 9 | dump_tasks = [] 10 | for task in tasks: 11 | for split in splits: 12 | yaml_dict = {"group": f"refcocog_{task}", "task": f"refcocog_{task}_{split}", "include": f"_default_template_{task}_yaml", "test_split": split} 13 | if split == "train": 14 | yaml_dict.pop("group") 15 | else: 16 | 
dump_tasks.append(f"refcocog_{task}_{split}") 17 | 18 | save_path = f"./refcocog_{task}_{split}.yaml" 19 | print(f"Saving to {save_path}") 20 | with open(save_path, "w") as f: 21 | yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) 22 | 23 | group_dict = {"group": "refcocog", "task": dump_tasks} 24 | 25 | with open("./_refcoco.yaml", "w") as f: 26 | yaml.dump(group_dict, f, default_flow_style=False, indent=4) 27 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/_refcoco.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog 2 | task: 3 | - refcocog_seg_test 4 | - refcocog_seg_val 5 | - refcocog_bbox_test 6 | - refcocog_bbox_val 7 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_bbox_rec_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_bbox_rec 2 | task: refcocog_bbox_rec_test 3 | include: _default_template_bbox_rec_yaml 4 | test_split: test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_bbox_rec_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_bbox_rec 2 | task: refcocog_bbox_rec_val 3 | include: _default_template_bbox_rec_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_bbox_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_bbox 2 | task: refcocog_bbox_test 3 | include: _default_template_bbox_yaml 4 | test_split: test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_bbox_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_bbox 2 | task: refcocog_bbox_val 3 | include: _default_template_bbox_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_seg_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_seg 2 | task: refcocog_seg_test 3 | include: _default_template_seg_yaml 4 | test_split: test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_seg_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_seg 2 | task: refcocog_seg_val 3 | include: _default_template_seg_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/scienceqa/scienceqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ScienceQA 2 | dataset_name: ScienceQA-FULL 3 | task: "scienceqa" 4 | dataset_kwargs: 5 | token: True 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.sqa_doc_to_visual 9 | doc_to_text: !function utils.sqa_doc_to_text 10 | doc_to_target: !function utils.sqa_doc_to_target 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | temperature: 0 14 | do_sample: False 15 | metric_list: 16 | - metric: exact_match 17 | aggregation:
mean 18 | higher_is_better: true 19 | ignore_case: true 20 | ignore_punctuation: true 21 | process_results: !function utils.sqa_process_results 22 | metadata: 23 | - version: 0.0 24 | 25 | model_specific_prompt_kwargs: 26 | default: 27 | format: default 28 | pre_prompt: "" 29 | post_prompt: "\nAnswer with the option's letter from the given choices directly." 30 | qwen_vl: 31 | format: qwen_vl 32 | 33 | model_specific_generation_kwargs: 34 | llava: 35 | image_aspect_ratio: original 36 | 37 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/scienceqa/scienceqa_full.yaml: -------------------------------------------------------------------------------- 1 | group: scienceqa_full 2 | task: 3 | - scienceqa 4 | - scienceqa_img -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/scienceqa/scienceqa_img.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ScienceQA 2 | dataset_name: ScienceQA-IMG 3 | task: "scienceqa_img" 4 | dataset_kwargs: 5 | token: True 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.sqa_doc_to_visual 9 | doc_to_text: !function utils.sqa_doc_to_text 10 | doc_to_target: !function utils.sqa_doc_to_target 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | temperature: 0 14 | do_sample: False 15 | metric_list: 16 | - metric: exact_match 17 | aggregation: mean 18 | higher_is_better: true 19 | ignore_case: true 20 | ignore_punctuation: true 21 | process_results: !function utils.sqa_process_results 22 | metadata: 23 | - version: 0.0 24 | 25 | model_specific_prompt_kwargs: 26 | default: 27 | format: default 28 | pre_prompt: "" 29 | post_prompt: "\nAnswer with the option's letter from the given choices directly." 
30 | qwen_vl: 31 | format: qwen_vl 32 | idefics2: 33 | format: default 34 | pre_prompt: "" 35 | post_prompt: "\nAnswer:" 36 | model_specific_generation_kwargs: 37 | llava: 38 | image_aspect_ratio: original 39 | 40 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/screenspot/_default_template_rec_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: rootsautomation/ScreenSpot 2 | output_type: generate_until 3 | doc_to_visual: !function utils_rec.screenspot_rec_doc_to_visual 4 | doc_to_text: !function utils_rec.screenspot_rec_doc_to_text 5 | doc_to_target: "bbox" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils_rec.screenspot_rec_process_result 10 | metric_list: 11 | - metric: screenspot_IoU 12 | aggregation : !function utils_rec.screenspot_rec_iou 13 | higher_is_better : true 14 | - metric: screenspot_ACC@0.1 15 | aggregation : !function utils_rec.screenspot_rec_acc01 16 | higher_is_better : true 17 | - metric: screenspot_ACC@0.3 18 | aggregation : !function utils_rec.screenspot_rec_acc03 19 | higher_is_better : true 20 | - metric: screenspot_ACC@0.5 21 | aggregation : !function utils_rec.screenspot_rec_acc05 22 | higher_is_better : true 23 | - metric: screenspot_ACC@0.7 24 | aggregation : !function utils_rec.screenspot_rec_acc07 25 | higher_is_better : true 26 | - metric: screenspot_ACC@0.9 27 | aggregation : !function utils_rec.screenspot_rec_acc09 28 | higher_is_better : true 29 | - metric: screenspot_Center_ACC 30 | aggregation : !function utils_rec.screenspot_rec_center_acc 31 | higher_is_better : true 32 | metadata: 33 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/screenspot/_default_template_reg_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: rootsautomation/ScreenSpot 2 | output_type: generate_until 3 | doc_to_visual: !function utils.screenspot_bbox_doc_to_visual 4 | doc_to_text: !function utils.screenspot_doc_to_text 5 | doc_to_target: "instruction" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.screenspot_process_result 10 | metric_list: 11 | - metric: screenspot_CIDEr 12 | aggregation : !function utils.screenspot_cider 13 | higher_is_better : true 14 | metadata: 15 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/screenspot/_screenspot.yaml: -------------------------------------------------------------------------------- 1 | group: screenspot 2 | task: 3 | - screenspot_reg_test 4 | - screenspot_rec_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/screenspot/screenspot_rec_test.yaml: -------------------------------------------------------------------------------- 1 | group: screenspot_rec 2 | task: screenspot_rec_test 3 | include: _default_template_rec_yaml 4 | test_split: test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/screenspot/screenspot_reg_test.yaml: -------------------------------------------------------------------------------- 1 | group: screenspot_reg 2 | task: screenspot_reg_test 3 | include: _default_template_reg_yaml 4 | test_split: test 5 | -------------------------------------------------------------------------------- 
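Several of the configs above (ScienceQA, ScreenSpot, and most other tasks in this directory) pair a `doc_to_visual` hook with a `doc_to_text` hook that wraps the question in the per-model `pre_prompt`/`post_prompt` from `model_specific_prompt_kwargs`. A hypothetical example of such a pair for a multiple-choice task follows; field and function names are illustrative, not the repo's actual utils.

```python
# Hypothetical doc_to_visual / doc_to_text pair for a multiple-choice task such
# as the ScienceQA config above; names and document fields are illustrative only.
def example_doc_to_visual(doc):
    # Return the list of images the model should see for this document.
    return [doc["image"].convert("RGB")] if doc.get("image") is not None else []


def example_doc_to_text(doc, model_specific_prompt_kwargs):
    # Render "question + lettered choices", wrapped by the per-model
    # pre_prompt/post_prompt configured in the YAML.
    letters = [chr(ord("A") + i) for i in range(len(doc["choices"]))]
    choices = "\n".join(f"{letter}. {c}" for letter, c in zip(letters, doc["choices"]))
    pre = model_specific_prompt_kwargs.get("pre_prompt", "")
    post = model_specific_prompt_kwargs.get("post_prompt", "")
    return f"{pre}{doc['question']}\n{choices}{post}"
```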
/lmms-eval/lmms_eval/tasks/seedbench/seedbench.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/SEED-Bench 2 | dataset_kwargs: 3 | token: True 4 | task: "seedbench" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.seed_doc_to_visual 8 | doc_to_text: !function utils.seed_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | image_aspect_ratio: original 14 | # The return value of process_results will be used by metrics 15 | process_results: !function utils.seed_process_result 16 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results 17 | metric_list: 18 | - metric: seed_image 19 | aggregation: !function utils.seed_aggregation_result 20 | higher_is_better: true 21 | - metric: seed_video 22 | aggregation: !function utils.seed_aggregation_result 23 | higher_is_better: true 24 | - metric: seed_all 25 | aggregation: !function utils.seed_aggregation_result 26 | higher_is_better: true 27 | metadata: 28 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/seedbench/seedbench_ppl.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/SEED-Bench 2 | dataset_kwargs: 3 | token: True 4 | task: "seedbench_ppl" 5 | test_split: test 6 | output_type: multiple_choice 7 | doc_to_visual: !function utils.seed_doc_to_visual 8 | doc_to_text: !function utils.seed_doc_to_text_mc 9 | doc_to_choice : !function utils.seed_doc_to_choice 10 | doc_to_target: !function utils.seed_doc_to_mc_target 11 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results 12 | metric_list: 13 | - metric: acc 14 | metadata: 15 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/stvqa/stvqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ST-VQA 2 | task: "stvqa" 3 | test_split: test 4 | output_type: generate_until 5 | doc_to_visual: !function utils.stvqa_doc_to_visual 6 | doc_to_text: !function utils.stvqa_doc_to_text 7 | doc_to_target: "answers" 8 | generation_kwargs: 9 | max_new_tokens: 32 10 | temperature: 0 11 | do_sample: False 12 | process_results: !function utils.stvqa_process_results 13 | metric_list: 14 | - metric: submission 15 | aggregation: !function utils.stvqa_aggregate_submissions 16 | model_specific_prompt_kwargs: 17 | default: 18 | pre_prompt: "" 19 | post_prompt: "\nAnswer the question using a single word or phrase."
20 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/stvqa/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | 5 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file 6 | 7 | 8 | def stvqa_doc_to_text(doc, model_specific_prompt_kwargs): 9 | question = doc["question"] 10 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 11 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 12 | return f"{pre_prompt}{question}{post_prompt}" 13 | 14 | 15 | def stvqa_doc_to_visual(doc): 16 | return [doc["image"].convert("RGB")] 17 | 18 | 19 | def stvqa_process_results(doc, results): 20 | answer = results[0] 21 | return {"submission": {"question_id": int(doc["question_id"]), "answer": answer}} 22 | 23 | 24 | def stvqa_aggregate_submissions(results, args): 25 | file = generate_submission_file("stvqa_test_for_submission.json", args) 26 | with open(file, "w") as f: 27 | json.dump(results, f) 28 | logging.getLogger("lmms-eval").info(f"Results saved to {file}") 29 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textcaps/_default_template_textcaps_yaml: -------------------------------------------------------------------------------- 1 | model_specific_prompt_kwargs: 2 | default: 3 | prompt: Provide a one-sentence caption for the provided image. -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textcaps/textcaps.yaml: -------------------------------------------------------------------------------- 1 | group : textcaps 2 | task: 3 | - textcaps_val 4 | - textcaps_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textcaps/textcaps_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/TextCaps 2 | dataset_kwargs: 3 | token: True 4 | task : "textcaps_test" 5 | group : "textcaps_caption" 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.textcaps_doc_to_visual 9 | doc_to_text: !function utils.textcaps_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 64 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.textcaps_test_process_result 18 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: textcaps_passthrough 21 | aggregation : !function utils.textcaps_test_aggregation_result 22 | higher_is_better : true 23 | metadata: 24 | - version: 0.0 25 | include: _default_template_textcaps_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textvqa/_default_template_textvqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/textvqa 2 | output_type: generate_until 3 | doc_to_visual: !function utils.textvqa_doc_to_visual 4 | doc_to_text: !function utils.textvqa_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.textvqa_process_results 10 | model_specific_prompt_kwargs: 11 | default: 12 | pre_prompt: "" 13 | post_prompt: "\nAnswer the question using a single
word or phrase." 14 | ocr: false 15 | qwen_vl: 16 | pre_prompt: "" 17 | post_prompt: " Answer:" 18 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textvqa/_textvqa.yaml: -------------------------------------------------------------------------------- 1 | group: textvqa 2 | task: 3 | - textvqa_val 4 | - textvqa_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textvqa/textvqa_test.yaml: -------------------------------------------------------------------------------- 1 | task: textvqa_test 2 | test_split: test 3 | metric_list: 4 | - metric: submission 5 | aggregation: !function utils.textvqa_aggreate_submissions 6 | higher_is_better: true 7 | include: _default_template_textvqa_yaml 8 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textvqa/textvqa_val.yaml: -------------------------------------------------------------------------------- 1 | task: textvqa_val 2 | test_split: validation 3 | metric_list: 4 | - metric: exact_match 5 | aggregation: mean 6 | higher_is_better: true 7 | ignore_case: true 8 | ignore_punctuation: true 9 | - metric: submission 10 | aggregation: !function utils.textvqa_aggreate_submissions 11 | higher_is_better: true 12 | include: _default_template_textvqa_yaml 13 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/_default_template_vqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/VizWiz-VQA 2 | output_type: generate_until 3 | doc_to_visual: !function utils.vizwiz_vqa_doc_to_visual 4 | doc_to_text: !function utils.vizwiz_vqa_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | metadata: 10 | - version: 0.0 11 | model_specific_prompt_kwargs: 12 | default: 13 | pre_prompt: "" 14 | post_prompt: "\nWhen the provided information is insufficient, respond with 'Unanswerable'.\nAnswer the question using a single word or phrase." 
15 | process_results: !function utils.vizwiz_vqa_process_results 16 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/_generate_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | splits = ["val", "test"] 5 | tasks = ["vqa"] 6 | 7 | if __name__ == "__main__": 8 | dump_tasks = [] 9 | for task in tasks: 10 | for split in splits: 11 | yaml_dict = {"group": f"vizwiz_{task}", "task": f"vizwiz_{task}_{split}", "include": f"_default_template_{task}_yaml", "test_split": split} 12 | if split == "train": 13 | yaml_dict.pop("group") 14 | else: 15 | dump_tasks.append(f"vizwiz_{task}_{split}") 16 | 17 | save_path = f"./vizwiz_{task}_{split}.yaml" 18 | print(f"Saving to {save_path}") 19 | with open(save_path, "w") as f: 20 | yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) 21 | 22 | group_dict = {"group": "vizwiz_vqa", "task": dump_tasks} 23 | 24 | with open("./_vizwiz_vqa.yaml", "w") as f: 25 | yaml.dump(group_dict, f, default_flow_style=False, indent=4) 26 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/_vizwiz_vqa.yaml: -------------------------------------------------------------------------------- 1 | group: vizwiz_vqa 2 | task: 3 | - vizwiz_vqa_val 4 | - vizwiz_vqa_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_test.yaml: -------------------------------------------------------------------------------- 1 | group: vizwiz_vqa 2 | task: vizwiz_vqa_test 3 | test_split: test 4 | include: _default_template_vqa_yaml 5 | process_results: !function utils.vizwiz_vqa_process_results 6 | metric_list: 7 | # - metric: exact_match 8 | # aggregation: mean 9 | # higher_is_better: true 10 | # ignore_case: true 11 | # ignore_punctuation: true 12 | - metric: submission 13 | aggregation: !function utils.vizwiz_vqa_aggreate_submissions 14 | higher_is_better: true 15 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_val.yaml: -------------------------------------------------------------------------------- 1 | group: vizwiz_vqa 2 | task: vizwiz_vqa_val 3 | test_split: val 4 | include: _default_template_vqa_yaml 5 | metric_list: 6 | - metric: exact_match 7 | aggregation: mean 8 | higher_is_better: true 9 | ignore_case: true 10 | ignore_punctuation: true 11 | # - metric: submission 12 | # aggregation: !function utils.vizwiz_vqa_aggreate_submissions 13 | # higher_is_better: true -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vqav2/_default_template_vqav2_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/VQAv2 2 | dataset_kwargs: 3 | token: True 4 | output_type: generate_until 5 | doc_to_visual: !function utils.vqav2_doc_to_visual 6 | doc_to_text: !function utils.vqav2_doc_to_text 7 | doc_to_target: "answer" 8 | generation_kwargs: 9 | max_new_tokens: 16 10 | metadata: 11 | - version: 0.0 12 | model_specific_prompt_kwargs: 13 | default: 14 | pre_prompt: "" 15 | post_prompt: "\nAnswer the question using a single word or phrase." 
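The vqav2 template above routes model answers through utils.vqav2_process_results_val, which is not part of this excerpt. For orientation, VQAv2-style scoring credits a prediction with min(matching annotators / 3, 1). The sketch below illustrates only that idea; the official metric additionally averages over annotator subsets and normalizes answers, and the doc["answers"] layout shown is an assumption:

def vqav2_soft_accuracy(prediction, gt_answers):
    # gt_answers: the list of annotator answers for one question.
    prediction = prediction.strip().lower()
    matches = sum(1 for ans in gt_answers if ans.strip().lower() == prediction)
    return min(matches / 3.0, 1.0)

def vqav2_process_results_val_sketch(doc, results):
    # Assumed layout: doc["answers"] is a list of {"answer": ...} dicts.
    answers = [a["answer"] for a in doc["answers"]]
    return {"exact_match": vqav2_soft_accuracy(results[0], answers)}

# Example: 4 of 10 annotators said "red", so the prediction gets full credit.
print(vqav2_soft_accuracy("red", ["red"] * 4 + ["maroon"] * 6))  # 1.0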
-------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vqav2/_vqav2.yaml: -------------------------------------------------------------------------------- 1 | group: vqav2 2 | task: 3 | - vqav2_val 4 | - vqav2_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vqav2/vqav2_test.yaml: -------------------------------------------------------------------------------- 1 | task: "vqav2_test" 2 | include: _default_template_vqav2_yaml 3 | test_split: test 4 | metric_list: 5 | - metric: submission 6 | aggregation: !function utils.vqav2_aggreate_submissions 7 | higher_is_better: true 8 | process_results: !function utils.vqav2_process_results_test 9 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vqav2/vqav2_val.yaml: -------------------------------------------------------------------------------- 1 | task: "vqav2_val" 2 | include: _default_template_vqav2_yaml 3 | test_split: validation 4 | metric_list: 5 | - metric: exact_match 6 | aggregation: mean 7 | higher_is_better: true 8 | ignore_case: true 9 | ignore_punctuation: true 10 | process_results: !function utils.vqav2_process_results_val 11 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/websrc/websrc.yaml: -------------------------------------------------------------------------------- 1 | group: websrc 2 | task: 3 | - websrc_val 4 | - websrc_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/websrc/websrc_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: rootsautomation/websrc-test 2 | task: "websrc_test" 3 | test_split: test 4 | output_type: generate_until 5 | doc_to_visual: !function utils.websrc_doc_to_visual 6 | doc_to_text: !function utils.websrc_doc_to_text 7 | doc_to_target: "answer" 8 | # The return value of process_results will be used by metrics 9 | process_results: !function utils.websrc_process_results 10 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | image_aspect_ratio: pad 14 | metric_list: 15 | - metric: submission 16 | aggregation: !function utils.websrc_test_aggregate_results_for_submission 17 | higher_is_better: true 18 | metadata: 19 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/websrc/websrc_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: rootsautomation/websrc 2 | task: "websrc_val" 3 | test_split: dev 4 | output_type: generate_until 5 | doc_to_visual: !function utils.websrc_doc_to_visual 6 | doc_to_text: !function utils.websrc_doc_to_text 7 | doc_to_target: "answer" 8 | # The return value of process_results will be used by metrics 9 | process_results: !function utils.websrc_process_results 10 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | image_aspect_ratio: pad 14 | metric_list: 15 | - metric: websrc_squad_f1 16 | aggregation: !function utils.websrc_aggregate_results 17 | higher_is_better: true 18 | metadata: 19 | - version: 0.0
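websrc_val scores generations with websrc_squad_f1, aggregated by utils.websrc_aggregate_results (the implementation is not shown in this excerpt). A minimal sketch of a SQuAD-style token F1, with text normalization deliberately simplified relative to whatever the task's utils.py actually does:

from collections import Counter

def squad_token_f1(prediction, ground_truth):
    # Token-level F1: overlap between prediction and reference token multisets.
    pred_tokens = prediction.lower().split()
    gt_tokens = ground_truth.lower().split()
    common = Counter(pred_tokens) & Counter(gt_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)

print(squad_token_f1("the red button", "red button"))  # 0.8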
-------------------------------------------------------------------------------- /lmms-eval/miscs/llava_result_check.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mu-cai/matryoshka-mm/ec30cd37f54934789501c6bf78ddc59e950b8d9a/lmms-eval/miscs/llava_result_check.md -------------------------------------------------------------------------------- /lmms-eval/miscs/repr_scripts.sh: -------------------------------------------------------------------------------- 1 | # install lmms_eval without building dependencies 2 | cd lmms_eval; 3 | pip install --no-deps -U -e . 4 | 5 | # install LLaVA without building dependencies 6 | cd LLaVA 7 | pip install --no-deps -U -e . 8 | 9 | # install all the requirements required to reproduce llava results 10 | pip install -r llava_repr_requirements.txt 11 | 12 | # Run and exactly reproduce llava_v1.5 results! 13 | # mme as an example 14 | accelerate launch --num_processes=1 -m lmms_eval --model llava --model_args pretrained="liuhaotian/llava-v1.5-7b,use_flash_attention_2=False,device_map=auto" --tasks mme --batch_size 1 --log_samples --log_samples_suffix reproduce --output_path ./logs/ -------------------------------------------------------------------------------- /lmms-eval/miscs/script.sh: -------------------------------------------------------------------------------- 1 | accelerate launch --num_processes=1 -m lmms_eval --model llava --model_args pretrained="liuhaotian/llava-v1.5-7b" --tasks mme_llava_prompt --batch_size 1 --log_samples --log_samples_suffix debug --output_path ./logs/ 2 | 3 | 4 | gpu = 8 bs 1: 5 | 6 | llava (pretrained=llava-hf/llava-1.5-7b-hf), gen_kwargs: (), limit: None, num_fewshot: None, batch_size: 1 7 | | Tasks |Version|Filter|n-shot| Metric |Value| |Stderr | 8 | |----------------|-------|------|-----:|-----------|----:|---|------:| 9 | |mme_llava_prompt|Yaml |none | 0|exact_match| 1873|± |38.4331| 10 | 11 | gpu = 8 bs 1 use_flash_attention_2=True: 12 | 13 | 14 | 15 | 16 | 17 | gpu = 4 bs 1 use_flash_attention_2=True: 18 | 19 | 20 | 21 | accelerate launch --num_processes=8 -m lmms_eval --model llava --model_args pretrained="liuhaotian/llava-v1.5-13b" --tasks scienceqa --batch_size 1 --log_samples --log_samples_suffix debug --output_path ./logs/ 22 | -------------------------------------------------------------------------------- /lmms-eval/miscs/test_llava.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from PIL import Image 3 | 4 | import torch 5 | from transformers import AutoProcessor, LlavaForConditionalGeneration 6 | 7 | model_id = "llava-hf/llava-1.5-7b-hf" 8 | 9 | prompt_1 = "USER: <image>\nWhat does this image show?\nASSISTANT:" 10 | prompt_2 = "USER: <image> <image>\nWhat is the difference between these two images?\nASSISTANT:" 11 | image_file_1 = "image1.png" 12 | image_file_2 = "image2.png" 13 | model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_flash_attention_2=True).to(0) 14 | processor = AutoProcessor.from_pretrained(model_id) 15 | raw_image_1 = Image.open(image_file_1) 16 | raw_image_2 = Image.open(image_file_2) 17 | inputs = processor([prompt_1, prompt_2], [raw_image_1, raw_image_1, raw_image_2], padding=True, return_tensors="pt").to(0, torch.float16) 18 | import pdb 19 | 20 | pdb.set_trace() 21 | output = model.generate(**inputs, max_new_tokens=200, do_sample=False) 22 | print(processor.batch_decode(output, skip_special_tokens=True)) 23 |
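The smoke test above stops at a pdb breakpoint and then decodes the full sequences, prompts included. If only the generated continuations are wanted, one common pattern is to slice off the prompt length before decoding; this sketch assumes the processor left-pads the batch, which the llava-hf checkpoints are generally configured to do for generation:

# Continuation of test_llava.py, replacing the pdb/decode tail (assumes
# `model`, `processor`, and `inputs` as defined in the script above).
output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
new_tokens = output[:, inputs["input_ids"].shape[1]:]  # drop the prompt tokens
print(processor.batch_decode(new_tokens, skip_special_tokens=True))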
-------------------------------------------------------------------------------- /lmms-eval/miscs/test_scienceqa.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | dataset = load_dataset("Otter-AI/ScienceQA", trust_remote_code=True)["test"] 4 | for doc in dataset: 5 | print(doc["id"]) 6 | -------------------------------------------------------------------------------- /lmms-eval/setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | # This is to make sure that the package supports editable installs 4 | setuptools.setup() 5 | -------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/000_caps.txt: -------------------------------------------------------------------------------- 1 | A man wearing multiple neck ties making a goofy face. 2 | A man in a white shirt wearing very many ties. 3 | a man with ties on poses for a picture 4 | A man wearing multiple ties on his neck. 5 | A young man smiles while wearing several ties. 6 | 7 | tie: [0.574, 0.298, 0.752, 0.704] 8 | tie: [0.464, 0.339, 0.639, 0.789] 9 | tie: [0.349, 0.363, 0.563, 0.732] 10 | tie: [0.259, 0.255, 0.668, 0.805] 11 | person: [0.019, 0.065, 0.962, 0.988] 12 | person: [0.0, 0.24, 0.214, 1.0] 13 | tie: [0.316, 0.778, 0.443, 0.867] 14 | tie: [0.386, 0.707, 0.496, 0.801] 15 | tie: [0.251, 0.354, 0.402, 0.747] 16 | tie: [0.44, 0.318, 0.63, 0.397] 17 | tie: [0.566, 0.33, 0.655, 0.543] 18 | tie: [0.25, 0.409, 0.359, 0.556] -------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/000_conv.txt: -------------------------------------------------------------------------------- 1 | Question: 2 | What is unusual about this photo? 3 | === 4 | Answer: 5 | In the photo, the man is wearing a total of ten ties around his neck. This is considered unusual because it is not common for someone to wear multiple ties at once. In a typical setting, a person would wear only one tie. The man's goofy facial expression also adds to the unconventional and humorous nature of the image. -------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/001_caps.txt: -------------------------------------------------------------------------------- 1 | A group of people standing outside of a black vehicle with various luggage. 2 | Luggage surrounds a vehicle in an underground parking area 3 | People try to fit all of their luggage in an SUV. 4 | The sport utility vehicle is parked in the public garage, being packed for a trip 5 | Some people with luggage near a van that is transporting it. 
6 | 7 | person: [0.681, 0.242, 0.774, 0.694] 8 | person: [0.63, 0.222, 0.686, 0.516] 9 | person: [0.444, 0.233, 0.487, 0.34] 10 | backpack: [0.384, 0.696, 0.485, 0.914] 11 | backpack: [0.755, 0.413, 0.846, 0.692] 12 | suitcase: [0.758, 0.413, 0.845, 0.69] 13 | suitcase: [0.1, 0.497, 0.173, 0.579] 14 | bicycle: [0.282, 0.363, 0.327, 0.442] 15 | car: [0.786, 0.25, 0.848, 0.322] 16 | car: [0.783, 0.27, 0.827, 0.335] 17 | car: [0.86, 0.254, 0.891, 0.3] 18 | car: [0.261, 0.101, 0.787, 0.626] -------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/001_conv.txt: -------------------------------------------------------------------------------- 1 | Question: 2 | What challenges do these people face? 3 | === 4 | Answer: 5 | In the image, a group of people is standing outside a black SUV in a parking area, surrounded by various pieces of luggage, including suitcases and backpacks. They are facing the challenge of fitting all their luggage into the black SUV. There are multiple suitcases and backpacks to be packed, which suggests that the group has a significant amount of belongings to accommodate. They might have to strategize and arrange the luggage efficiently to ensure that everything fits properly into the vehicle. Additionally, they need to consider the comfort of the passengers and visibility while driving, so the placement of the luggage must not obstruct the driver's view or make the passengers uncomfortable during the trip. -------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/002_caps.txt: -------------------------------------------------------------------------------- 1 | There is a movie theater that displays the show times above the doors. 2 | A red fire hydrant is deep in the snow. 3 | The fire hydrant is in the snow near a recently plowed sidewalk. 4 | This city has had a very hard winter with snow. 5 | A hotel for dogs in the snow in winter. 6 | 7 | fire hydrant: [0.326, 0.612, 0.426, 0.72] -------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/002_conv.txt: -------------------------------------------------------------------------------- 1 | Question: 2 | What challenges might this city face? 3 | === 4 | Answer: 5 | The city faces challenges due to the harsh winter conditions and heavy snowfall. In the image, a red fire hydrant is almost buried deep in the snow, which indicates the significant amount of snow the city has experienced. This can lead to various challenges such as difficulties in transportation, increased risk of accidents, and disruptions to daily life. For example, the recently plowed sidewalk near the fire hydrant shows that the city has to constantly clear snow from roads and sidewalks to maintain access and safety for pedestrians and vehicles. Moreover, emergency services, like firefighters, might face challenges accessing crucial equipment, such as fire hydrants, during emergencies due to the snow accumulation. This highlights the importance of effective snow management strategies and preparedness in such cities to minimize the impact of harsh winter conditions on residents and essential services. 
-------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/system_message.txt: -------------------------------------------------------------------------------- 1 | You are an AI visual assistant that can analyze a single image. You receive five sentences, each describing the same image you are observing. In addition, specific object locations within the image are given, along with detailed coordinates. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. 2 | 3 | The task is to use the provided caption and bounding box information, create a plausible question about the image, and provide the answer in detail. 4 | 5 | Create complex questions beyond describing the scene. 6 | To answer such questions, one should require first understanding the visual content, then based on the background knowledge or reasoning, either explain why the things are happening that way, or provide guides and help to user's request. Make the question challenging by not including the visual content details in the question so that the user needs to reason about that first. 7 | 8 | Instead of directly mentioning the bounding box coordinates, utilize this data to explain the scene using natural language. Include details like object counts, position of the objects, relative position between the objects. 9 | 10 | When using the information from the caption and coordinates, directly explain the scene, and do not mention that the information source is the caption or the bounding box. Always answer as if you are directly looking at the image. -------------------------------------------------------------------------------- /playground/data/prompts/conversation/000_caps.txt: -------------------------------------------------------------------------------- 1 | There is a movie theater that displays the show times above the doors. 2 | A red fire hydrant is deep in the snow. 3 | The fire hydrant is in the snow near a recently plowed sidewalk. 4 | This city has had a very hard winter with snow. 5 | A hotel for dogs in the snow in winter. -------------------------------------------------------------------------------- /playground/data/prompts/conversation/001_caps.txt: -------------------------------------------------------------------------------- 1 | A man is skiing in the open snow covered hills 2 | A skier is making his way into the snow. 3 | A skier on the foothills of a huge mountain range. 4 | A skier looks at mountains while standing near a trail sign. 5 | a single person skiing by an area with a lot of bushes -------------------------------------------------------------------------------- /playground/data/prompts/conversation/system_message.txt: -------------------------------------------------------------------------------- 1 | You are an AI visual assistant, and you are seeing a single image. What you see are provided with five sentences, describing the same image you are looking at. Answer all questions as you are seeing the image. 2 | 3 | Design a conversation between you and a person asking about this photo. The answers should be in a tone that a visual AI assistant is seeing the image and answering the question. 4 | Ask diverse questions and give corresponding answers. 
5 | 6 | Include questions asking about the visual content of the image, including the object types, counting the objects, object actions, object locations, relative positions between objects, etc. Only include questions that have definite answers: 7 | (1) one can see the content in the image that the question asks about and can answer confidently; 8 | (2) one can determine confidently from the image that it is not in the image. 9 | Do not ask any question that cannot be answered confidently. 10 | 11 | Also include complex questions that are relevant to the content in the image, for example, asking about background knowledge of the objects in the image, asking to discuss about events happening in the image, etc. Again, do not ask about uncertain details. 12 | Provide detailed answers when answering complex questions. For example, give detailed examples or reasoning steps to make the content more convincing and well-organized. You can include multiple paragraphs if necessary. -------------------------------------------------------------------------------- /playground/data/prompts/detail_description/000_caps.txt: -------------------------------------------------------------------------------- 1 | A harbor filled with lots of boats next to a building. 2 | A bicycle parked in front of several boats at a dock. 3 | A red bicycle in front of a line of docked white yachts 4 | A bike sits before boats which sit before a long building. 5 | A bicycle is a convenient means of land transportation when you live on a boat. 6 | 7 | bicycle: [0.287, 0.641, 0.507, 0.874] 8 | bicycle: [0.566, 0.667, 0.63, 0.731] 9 | boat: [0.318, 0.579, 0.575, 0.724] 10 | boat: [0.704, 0.607, 0.818, 0.727] 11 | boat: [0.818, 0.601, 0.942, 0.744] 12 | boat: [0.002, 0.53, 0.243, 0.71] 13 | boat: [0.541, 0.611, 0.668, 0.731] 14 | person: [0.778, 0.527, 0.797, 0.57] 15 | cup: [0.708, 0.733, 0.724, 0.758] 16 | boat: [0.236, 0.532, 0.404, 0.64] 17 | boat: [0.81, 0.632, 0.836, 0.676] 18 | boat: [0.957, 0.526, 1.0, 0.752] -------------------------------------------------------------------------------- /playground/data/prompts/detail_description/000_conv.txt: -------------------------------------------------------------------------------- 1 | It is a harbor filled with numerous boats of various sizes docked next to a long building. Among the boats, there are a few white yachts lined up, standing out from the rest. There is a red bicycle prominently parked in front of the line of docked boats, serving as a convenient means of land transportation for those living on the boats. Another bicycle can be seen further back in the scene, near the middle of the harbor. 2 | 3 | A person is visible near the right side of the harbor, possibly enjoying the view or attending to their boat. Additionally, there is a cup placed on a surface near the middle of the scene. -------------------------------------------------------------------------------- /playground/data/prompts/detail_description/001_caps.txt: -------------------------------------------------------------------------------- 1 | A group of people standing outside of a black vehicle with various luggage. 2 | Luggage surrounds a vehicle in an underground parking area 3 | People try to fit all of their luggage in an SUV. 4 | The sport utility vehicle is parked in the public garage, being packed for a trip 5 | Some people with luggage near a van that is transporting it. 
6 | 7 | person: [0.681, 0.242, 0.774, 0.694] 8 | person: [0.63, 0.222, 0.686, 0.516] 9 | person: [0.444, 0.233, 0.487, 0.34] 10 | backpack: [0.384, 0.696, 0.485, 0.914] 11 | backpack: [0.755, 0.413, 0.846, 0.692] 12 | suitcase: [0.758, 0.413, 0.845, 0.69] 13 | suitcase: [0.1, 0.497, 0.173, 0.579] 14 | bicycle: [0.282, 0.363, 0.327, 0.442] 15 | car: [0.786, 0.25, 0.848, 0.322] 16 | car: [0.783, 0.27, 0.827, 0.335] 17 | car: [0.86, 0.254, 0.891, 0.3] 18 | car: [0.261, 0.101, 0.787, 0.626] -------------------------------------------------------------------------------- /playground/data/prompts/detail_description/001_conv.txt: -------------------------------------------------------------------------------- 1 | The image is an underground parking area with a black sport utility vehicle (SUV) parked. There are three people in the scene, with one person standing closer to the left side of the vehicle, another person in the middle, and the third person on the right side. They are all working together to pack their luggage into the SUV for a trip. 2 | 3 | In the parking area, various luggage items are scattered around the vehicle. There are two backpacks, one located near the left rear wheel and the other closer to the right side of the vehicle. Additionally, there are two suitcases, one on the right side of the car and another further away near the center of the parking area. A bicycle can also be seen on the left side of the vehicle. 4 | 5 | Other cars are parked around the main SUV, with one car positioned behind it and slightly to the left, another behind and slightly to the right, and the third car further behind on the right side. -------------------------------------------------------------------------------- /playground/data/prompts/detail_description/002_caps.txt: -------------------------------------------------------------------------------- 1 | A man holds a Wii-mote above his head while another looks on. 2 | A guy and his friend are playing Nintendo Wii. 3 | A young man is holding a video game remote over his head. 4 | two men standing in a room while one plays with a wii mote 5 | Some guys standing and playing a video game. 6 | 7 | couch: [0.697, 0.759, 0.995, 1.0] 8 | dining table: [0.426, 0.755, 1.0, 0.987] 9 | person: [0.082, 0.252, 0.342, 1.0] 10 | person: [0.399, 0.085, 0.742, 0.982] 11 | remote: [0.477, 0.135, 0.516, 0.187] 12 | sink: [0.016, 0.501, 0.063, 0.52] 13 | potted plant: [0.798, 0.384, 0.888, 0.645] 14 | refrigerator: [0.305, 0.389, 0.414, 0.547] 15 | chair: [0.72, 0.509, 0.858, 0.725] -------------------------------------------------------------------------------- /playground/data/prompts/detail_description/002_conv.txt: -------------------------------------------------------------------------------- 1 | The image shows two men standing in a room, engaged in playing a video game on a Nintendo Wii console. One of the men is holding a Wii remote above his head with enthusiasm, while the other man looks on, likely enjoying the friendly competition. 2 | 3 | The room appears to be a living space with a couch located in the background and a dining table nearby. A potted plant can be seen placed close to the couch, and a chair is situated in the middle of the room. The room also features a kitchen area with a sink and a refrigerator visible in the background. 
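The caption files above encode object locations as normalized (x1, y1, x2, y2) corners in [0, 1], as the system messages describe. Purely for illustration (no such helper exists in this repo), mapping one of those boxes back to pixel coordinates would look like the following, assuming a 640x480 image:

def denormalize_box(box, width, height):
    # box: (x1, y1, x2, y2) in [0, 1] -> pixel-space corners.
    x1, y1, x2, y2 = box
    return (x1 * width, y1 * height, x2 * width, y2 * height)

# First "tie" box from complex_reasoning/000_caps.txt on a 640x480 image.
print(denormalize_box((0.574, 0.298, 0.752, 0.704), 640, 480))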
-------------------------------------------------------------------------------- /playground/data/prompts/detail_description/system_message.txt: -------------------------------------------------------------------------------- 1 | You are an AI visual assistant that can analyze a single image. You receive five sentences, each describing the same image you are observing. In addition, specific object locations within the image are given, along with detailed coordinates. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. 2 | 3 | Using the provided caption and bounding box information, describe the scene in a detailed manner. 4 | 5 | Instead of directly mentioning the bounding box coordinates, utilize this data to explain the scene using natural language. Include details like object counts, position of the objects, relative position between the objects. 6 | 7 | When using the information from the caption and coordinates, directly explain the scene, and do not mention that the information source is the caption or the bounding box. Always answer as if you are directly looking at the image. -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "llava" 7 | version = "1.2.2.post1" 8 | description = "Towards GPT-4 like large language and visual assistant." 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | "torch==2.1.2", "torchvision==0.16.2", 17 | "transformers==4.36.2", "tokenizers==0.15.1", "sentencepiece==0.1.99", "shortuuid", 18 | "accelerate==0.21.0", "peft", "bitsandbytes", 19 | "pydantic", "markdown2[all]", "numpy", "scikit-learn==1.2.2", 20 | "gradio==4.16.0", "gradio_client==0.8.1", 21 | "requests", "httpx==0.24.0", "uvicorn", "fastapi", 22 | "einops==0.6.1", "einops-exts==0.0.4", "timm==0.6.13", "opencv-python", "moviepy", "protobuf" 23 | ] 24 | 25 | [project.optional-dependencies] 26 | train = ["deepspeed==0.12.6", "ninja", "wandb"] 27 | build = ["build", "twine"] 28 | 29 | [project.urls] 30 | "Homepage" = "https://llava-vl.github.io" 31 | "Bug Tracker" = "https://github.com/haotian-liu/LLaVA/issues" 32 | 33 | [tool.setuptools.packages.find] 34 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 35 | 36 | [tool.wheel] 37 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 38 | -------------------------------------------------------------------------------- /scripts/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | all_answers = [] 11 | for line_idx, line in enumerate(open(args.src)): 12 | res = json.loads(line) 13 | question_id = res['question_id'] 14 | text = res['text'].rstrip('.').lower() 15 | all_answers.append({"questionId": question_id, "prediction": text}) 16 
| 17 | with open(args.dst, 'w') as f: 18 | json.dump(all_answers, f) 19 | -------------------------------------------------------------------------------- /scripts/convert_mmbench_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import pandas as pd 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str, required=True) 9 | parser.add_argument("--result-dir", type=str, required=True) 10 | parser.add_argument("--upload-dir", type=str, required=True) 11 | parser.add_argument("--experiment", type=str, required=True) 12 | 13 | return parser.parse_args() 14 | 15 | if __name__ == "__main__": 16 | args = get_args() 17 | 18 | df = pd.read_table(args.annotation_file) 19 | 20 | cur_df = df.copy() 21 | cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category']) 22 | cur_df.insert(6, 'prediction', None) 23 | for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): 24 | pred = json.loads(pred) 25 | cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text'] 26 | 27 | cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}.xlsx"), index=False, engine='openpyxl') 28 | -------------------------------------------------------------------------------- /scripts/convert_mmvet_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | cur_result = {} 11 | 12 | for line in open(args.src): 13 | data = json.loads(line) 14 | qid = data['question_id'] 15 | cur_result[f'v1_{qid}'] = data['text'] 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(cur_result, f, indent=2) 19 | -------------------------------------------------------------------------------- /scripts/convert_vizwiz_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--annotation-file', type=str, required=True) 11 | parser.add_argument('--result-file', type=str, required=True) 12 | parser.add_argument('--result-upload-file', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True) 21 | 22 | results = [] 23 | error_line = 0 24 | for line_idx, line in enumerate(open(args.result_file)): 25 | try: 26 | results.append(json.loads(line)) 27 | except: 28 | error_line += 1 29 | results = {x['question_id']: x['text'] for x in results} 30 | test_split = [json.loads(line) for line in open(args.annotation_file)] 31 | split_ids = set([x['question_id'] for x in test_split]) 32 | 33 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 34 | 35 | all_answers = [] 36 | 37 | answer_processor = EvalAIAnswerProcessor() 38 | 39 | for x in test_split: 40 | assert x['question_id'] in results 41 | all_answers.append({ 42 | 'image': x['image'], 43 | 'answer': answer_processor(results[x['question_id']]) 44 | }) 45 | 46 | with 
open(args.result_upload_file, 'w') as f: 47 | json.dump(all_answers, f) 48 | -------------------------------------------------------------------------------- /scripts/finetune_sqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | deepspeed llava/train/train_mem.py \ 6 | --deepspeed ./scripts/zero2.json \ 7 | --model_name_or_path lmsys/vicuna-13b-v1.3 \ 8 | --version $PROMPT_VERSION \ 9 | --data_path /Data/ScienceQA/data/scienceqa/llava_train_QCM-LEA.json \ 10 | --image_folder /Data/ScienceQA/data/scienceqa/images/train \ 11 | --vision_tower openai/clip-vit-large-patch14 \ 12 | --pretrain_mm_mlp_adapter ./checkpoints/huggingface/liuhaotian/llava-pretrain-vicuna-13b-v1.3/mm_projector.bin \ 13 | --mm_vision_select_layer -2 \ 14 | --mm_use_im_start_end False \ 15 | --mm_use_im_patch_token False \ 16 | --bf16 True \ 17 | --output_dir ./checkpoints/llava-vicuna-13b-v1.3-pretrain_lcs558k_plain-ScienceQA_QCM_LEA-12e \ 18 | --num_train_epochs 12 \ 19 | --per_device_train_batch_size 16 \ 20 | --per_device_eval_batch_size 4 \ 21 | --gradient_accumulation_steps 1 \ 22 | --evaluation_strategy "no" \ 23 | --save_strategy "steps" \ 24 | --save_steps 50000 \ 25 | --save_total_limit 1 \ 26 | --learning_rate 2e-5 \ 27 | --weight_decay 0. \ 28 | --warmup_ratio 0.03 \ 29 | --lr_scheduler_type "cosine" \ 30 | --logging_steps 1 \ 31 | --tf32 True \ 32 | --model_max_length 2048 \ 33 | --gradient_checkpointing True \ 34 | --dataloader_num_workers 4 \ 35 | --lazy_preprocess True \ 36 | --report_to wandb 37 | -------------------------------------------------------------------------------- /scripts/merge_lora_weights.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from llava.model.builder import load_pretrained_model 3 | from llava.mm_utils import get_model_name_from_path 4 | 5 | 6 | def merge_lora(args): 7 | model_name = get_model_name_from_path(args.model_path) 8 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, device_map='cpu') 9 | 10 | model.save_pretrained(args.save_model_path) 11 | tokenizer.save_pretrained(args.save_model_path) 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--model-path", type=str, required=True) 17 | parser.add_argument("--model-base", type=str, required=True) 18 | parser.add_argument("--save-model-path", type=str, required=True) 19 | 20 | args = parser.parse_args() 21 | 22 | merge_lora(args) 23 | -------------------------------------------------------------------------------- /scripts/pretrain_xformers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Uncomment and set the following variables correspondingly to run this script: 4 | 5 | # MODEL_VERSION=vicuna-v1-3-7b 6 | # MODEL_VERSION=llama-2-7b-chat 7 | 8 | ########### DO NOT CHANGE ########### 9 | ########### USE THIS FOR BOTH ########### 10 | PROMPT_VERSION=plain 11 | ########### DO NOT CHANGE ########### 12 | 13 | deepspeed llava/train/train_xformers.py \ 14 | --deepspeed ./scripts/zero2.json \ 15 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 16 | --version $PROMPT_VERSION \ 17 | --data_path /path/to/pretrain_data.json \ 18 | --image_folder /path/to/images \ 19 | --vision_tower openai/clip-vit-large-patch14 \ 20 | 
--tune_mm_mlp_adapter True \ 21 | --mm_vision_select_layer -2 \ 22 | --mm_use_im_start_end False \ 23 | --mm_use_im_patch_token False \ 24 | --bf16 False \ 25 | --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 4 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 4 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 2e-3 \ 35 | --weight_decay 0. \ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 False \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to wandb 45 | -------------------------------------------------------------------------------- /scripts/sqa_eval_batch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CHUNKS=8 4 | for IDX in {0..7}; do 5 | CUDA_VISIBLE_DEVICES=$IDX python -m llava.eval.model_vqa_science \ 6 | --model-path liuhaotian/llava-lcs558k-scienceqa-vicuna-13b-v1.3 \ 7 | --question-file ~/haotian/datasets/ScienceQA/data/scienceqa/llava_test_QCM-LEA.json \ 8 | --image-folder ~/haotian/datasets/ScienceQA/data/scienceqa/images/test \ 9 | --answers-file ./test_llava-13b-chunk$CHUNKS_$IDX.jsonl \ 10 | --num-chunks $CHUNKS \ 11 | --chunk-idx $IDX \ 12 | --conv-mode llava_v1 & 13 | done 14 | -------------------------------------------------------------------------------- /scripts/sqa_eval_gather.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CHUNKS=8 4 | output_file="test_llava-13b.jsonl" 5 | 6 | # Clear out the output file if it exists. 7 | > "$output_file" 8 | 9 | # Loop through the indices and concatenate each file. 
10 | for idx in $(seq 0 $((CHUNKS-1))); do 11 | cat "./test_llava-13b-chunk${idx}.jsonl" >> "$output_file" 12 | done 13 | 14 | python llava/eval/eval_science_qa.py \ 15 | --base-dir ~/haotian/datasets/ScienceQA/data/scienceqa \ 16 | --result-file ./test_llava-13b.jsonl \ 17 | --output-file ./test_llava-13b_output.json \ 18 | --output-result ./test_llava-13b_result.json 19 | -------------------------------------------------------------------------------- /scripts/upload_pypi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Step 0: Clean up 4 | rm -rf dist 5 | 6 | # Step 1: Change the package name to "llava-torch" 7 | sed -i 's/name = "llava"/name = "llava-torch"/' pyproject.toml 8 | 9 | # Step 2: Build the package 10 | python -m build 11 | 12 | # Step 3: Revert the changes in pyproject.toml to the original 13 | sed -i 's/name = "llava-torch"/name = "llava"/' pyproject.toml 14 | 15 | # Step 4: Upload to PyPI 16 | python -m twine upload dist/* 17 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/gqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | matryoshka_vis_token_scale=$1 8 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 9 | SPLIT="llava_gqa_testdev_balanced" 10 | GQADIR="./playground/data/eval/gqa/data" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 14 | --model-path mucai/llava-v1.5-7b-m3 \ 15 | --question-file ./playground/data/eval/gqa/$SPLIT.jsonl \ 16 | --image-folder ./playground/data/eval/gqa/data/images \ 17 | --answers-file ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --matryoshka_vis_token_scale $matryoshka_vis_token_scale \ 22 | --conv-mode vicuna_v1 & 23 | done 24 | 25 | wait 26 | 27 | output_file=./playground/data/eval/gqa/answers/$SPLIT/$CKPT/merge.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 
33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json 38 | 39 | cd $GQADIR 40 | python eval/eval.py --tier testdev_balanced 41 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/llavabench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 4 | 5 | python -m llava.eval.model_vqa \ 6 | --model-path mucai/llava-v1.5-7b-m3 \ 7 | --question-file ./playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 8 | --image-folder ./playground/data/eval/llava-bench-in-the-wild/images \ 9 | --answers-file ./playground/data/eval/llava-bench-in-the-wild/answers/$CKPT.jsonl \ 10 | --matryoshka_vis_token_scale $matryoshka_vis_token_scale \ 11 | --temperature 0 \ 12 | --conv-mode vicuna_v1 13 | 14 | mkdir -p playground/data/eval/llava-bench-in-the-wild/reviews 15 | 16 | python llava/eval/eval_gpt_review_bench.py \ 17 | --question playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 18 | --context playground/data/eval/llava-bench-in-the-wild/context.jsonl \ 19 | --rule llava/eval/table/rule.json \ 20 | --answer-list \ 21 | playground/data/eval/llava-bench-in-the-wild/answers_gpt4.jsonl \ 22 | playground/data/eval/llava-bench-in-the-wild/answers/$CKPT.jsonl \ 23 | --output \ 24 | playground/data/eval/llava-bench-in-the-wild/reviews/$CKPT.jsonl 25 | 26 | python llava/eval/summarize_gpt_review.py -f playground/data/eval/llava-bench-in-the-wild/reviews/$CKPT.jsonl 27 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 4 | SPLIT="mmbench_dev_20230712" 5 | 6 | python -m llava.eval.model_vqa_mmbench \ 7 | --model-path mucai/llava-v1.5-7b-m3 \ 8 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 9 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/$CKPT.jsonl \ 10 | --single-pred-prompt \ 11 | --temperature 0 \ 12 | --matryoshka_vis_token_scale $matryoshka_vis_token_scale \ 13 | --conv-mode vicuna_v1 14 | 15 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 16 | 17 | python scripts/convert_mmbench_for_submission.py \ 18 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 19 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT \ 20 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT \ 21 | --experiment $CKPT 22 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mmbench_cn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 4 | SPLIT="mmbench_dev_cn_20231003" 5 | 6 | python -m llava.eval.model_vqa_mmbench \ 7 | --model-path mucai/llava-v1.5-7b-m3 \ 8 | --question-file ./playground/data/eval/mmbench_cn/$SPLIT.tsv \ 9 | --answers-file ./playground/data/eval/mmbench_cn/answers/$SPLIT/$CKPT.jsonl \ 10 | --lang cn \ 11 | --single-pred-prompt \ 12 | --temperature 0 \ 13 | --matryoshka_vis_token_scale 
$matryoshka_vis_token_scale \ 14 | --conv-mode vicuna_v1 15 | 16 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 17 | 18 | python scripts/convert_mmbench_for_submission.py \ 19 | --annotation-file ./playground/data/eval/mmbench_cn/$SPLIT.tsv \ 20 | --result-dir ./playground/data/eval/mmbench_cn/answers/$SPLIT \ 21 | --upload-dir ./playground/data/eval/mmbench_cn/answers_upload/$SPLIT \ 22 | --experiment $CKPT 23 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mme-llava-next.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-next-vicuna-7b-m3-${matryoshka_vis_token_scale} 4 | 5 | python -m llava.eval.model_vqa_loader \ 6 | --model-path ./checkpoints/llava-next-vicuna-7b-m3 \ 7 | --question-file ./playground/data/eval/MME/llava_mme.jsonl \ 8 | --image-folder ./playground/data/eval/MME/MME_Benchmark_release_version \ 9 | --answers-file ./playground/data/eval/MME/answers/$CKPT.jsonl \ 10 | --temperature 0 \ 11 | --matryoshka_vis_token_scale $matryoshka_vis_token_scale \ 12 | --conv-mode vicuna_v1 13 | 14 | cd ./playground/data/eval/MME 15 | 16 | python convert_answer_to_mme.py --experiment $CKPT 17 | 18 | cd eval_tool 19 | 20 | python calculation.py --results_dir answers/$CKPT 21 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 4 | 5 | python -m llava.eval.model_vqa_loader \ 6 | --model-path ./checkpoints/llava-v1.5-7b-m3 \ 7 | --question-file ./playground/data/eval/MME/llava_mme.jsonl \ 8 | --image-folder ./playground/data/eval/MME/MME_Benchmark_release_version \ 9 | --answers-file ./playground/data/eval/MME/answers/$CKPT.jsonl \ 10 | --temperature 0 \ 11 | --matryoshka_vis_token_scale $matryoshka_vis_token_scale \ 12 | --conv-mode vicuna_v1 13 | 14 | cd ./playground/data/eval/MME 15 | 16 | python convert_answer_to_mme.py --experiment $CKPT 17 | 18 | cd eval_tool 19 | 20 | python calculation.py --results_dir answers/$CKPT 21 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 4 | python -m llava.eval.model_vqa \ 5 | --model-path mucai/llava-v1.5-7b-m3 \ 6 | --question-file ./playground/data/eval/mm-vet/llava-mm-vet.jsonl \ 7 | --image-folder ./playground/data/eval/mm-vet/images \ 8 | --answers-file ./playground/data/eval/mm-vet/answers/$CKPT.jsonl \ 9 | --temperature 0 \ 10 | --matryoshka_vis_token_scale $matryoshka_vis_token_scale \ 11 | --conv-mode vicuna_v1 12 | 13 | mkdir -p ./playground/data/eval/mm-vet/results 14 | 15 | python scripts/convert_mmvet_for_eval.py \ 16 | --src ./playground/data/eval/mm-vet/answers/$CKPT.jsonl \ 17 | --dst ./playground/data/eval/mm-vet/results/$CKPT.json 18 | 19 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/pope.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 4 | python -m llava.eval.model_vqa_loader \ 5 | 
--model-path mucai/llava-v1.5-7b-m3 \ 6 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 7 | --image-folder ./playground/data/eval/pope/val2014 \ 8 | --answers-file ./playground/data/eval/pope/answers/$CKPT.jsonl \ 9 | --matryoshka_vis_token_scale $matryoshka_vis_token_scale \ 10 | --temperature 0 \ 11 | --conv-mode vicuna_v1 12 | 13 | python llava/eval/eval_pope.py \ 14 | --annotation-dir ./playground/data/eval/pope/coco \ 15 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 16 | --result-file ./playground/data/eval/pope/answers/$CKPT.jsonl 17 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/qbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 4 | if [ "$1" = "dev" ]; then 5 | echo "Evaluating in 'dev' split." 6 | elif [ "$1" = "test" ]; then 7 | echo "Evaluating in 'test' split." 8 | else 9 | echo "Unknown split, please choose between 'dev' and 'test'." 10 | exit 1 11 | fi 12 | 13 | python -m llava.eval.model_vqa_qbench \ 14 | --model-path mucai/llava-v1.5-7b-m3 \ 15 | --image-folder ./playground/data/eval/qbench/images_llvisionqa/ \ 16 | --questions-file ./playground/data/eval/qbench/llvisionqa_$1.json \ 17 | --answers-file ./playground/data/eval/qbench/llvisionqa_$1_answers.jsonl \ 18 | --conv-mode llava_v1 \ 19 | --lang en 20 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/qbench_zh.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 4 | if [ "$1" = "dev" ]; then 5 | ZH_SPLIT="验证集" 6 | echo "Evaluating in 'dev' split." 7 | elif [ "$1" = "test" ]; then 8 | ZH_SPLIT="测试集" 9 | echo "Evaluating in 'test' split." 10 | else 11 | echo "Unknown split, please choose between 'dev' and 'test'." 
12 | exit 1 13 | fi 14 | 15 | python -m llava.eval.model_vqa_qbench \ 16 | --model-path mucai/llava-v1.5-7b-m3 \ 17 | --image-folder ./playground/data/eval/qbench/images_llvisionqa/ \ 18 | --questions-file ./playground/data/eval/qbench/质衡-问答-$ZH_SPLIT.json \ 19 | --answers-file ./playground/data/eval/qbench/llvisionqa_zh_$1_answers.jsonl \ 20 | --matryoshka_vis_token_scale $matryoshka_vis_token_scale \ 21 | --conv-mode llava_v1 \ 22 | --lang zh 23 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/seed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 13 | --model-path mucai/llava-v1.5-7b-m3 \ 14 | --question-file ./playground/data/eval/seed_bench/llava-seed-bench.jsonl \ 15 | --image-folder ./playground/data/eval/seed_bench \ 16 | --answers-file ./playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --matryoshka_vis_token_scale $matryoshka_vis_token_scale \ 20 | --temperature 0 \ 21 | --conv-mode vicuna_v1 & 22 | done 23 | 24 | wait 25 | 26 | output_file=./playground/data/eval/seed_bench/answers/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | # Evaluate 37 | python scripts/convert_seed_for_submission.py \ 38 | --annotation-file ./playground/data/eval/seed_bench/SEED-Bench.json \ 39 | --result-file $output_file \ 40 | --result-upload-file ./playground/data/eval/seed_bench/answers_upload/$CKPT.jsonl 41 | 42 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/sqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 4 | python -m llava.eval.model_vqa_science \ 5 | --model-path mucai/llava-v1.5-7b-m3 \ 6 | --question-file ./playground/data/eval/scienceqa/llava_test_CQM-A.json \ 7 | --image-folder ./playground/data/eval/scienceqa/images/test \ 8 | --answers-file ./playground/data/eval/scienceqa/answers/$CKPT.jsonl \ 9 | --single-pred-prompt \ 10 | --temperature 0 \ 11 | --matryoshka_vis_token_scale $matryoshka_vis_token_scale \ 12 | --conv-mode vicuna_v1 13 | 14 | python llava/eval/eval_science_qa.py \ 15 | --base-dir ./playground/data/eval/scienceqa \ 16 | --result-file ./playground/data/eval/scienceqa/answers/$CKPT.jsonl \ 17 | --output-file ./playground/data/eval/scienceqa/answers/${CKPT}_output.jsonl \ 18 | --output-result ./playground/data/eval/scienceqa/answers/${CKPT}_result.json 19 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 4 | python -m
llava.eval.model_vqa_loader \ 5 | --model-path mucai/llava-v1.5-7b-m3 \ 6 | --question-file ./playground/data/eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 7 | --image-folder ./playground/data/eval/textvqa/train_images \ 8 | --answers-file ./playground/data/eval/textvqa/answers/${CKPT}.jsonl \ 9 | --temperature 0 \ 10 | --matryoshka_vis_token_scale $matryoshka_vis_token_scale \ 11 | --conv-mode vicuna_v1 12 | 13 | python -m llava.eval.eval_textvqa \ 14 | --annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \ 15 | --result-file ./playground/data/eval/textvqa/answers/${CKPT}.jsonl 16 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/vizwiz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 4 | python -m llava.eval.model_vqa_loader \ 5 | --model-path mucai/llava-v1.5-7b-m3 \ 6 | --question-file ./playground/data/eval/vizwiz/llava_test.jsonl \ 7 | --image-folder ./playground/data/eval/vizwiz/test \ 8 | --answers-file ./playground/data/eval/vizwiz/answers/${CKPT}.jsonl \ 9 | --temperature 0 \ 10 | --matryoshka_vis_token_scale $matryoshka_vis_token_scale \ 11 | --conv-mode vicuna_v1 12 | 13 | python scripts/convert_vizwiz_for_submission.py \ 14 | --annotation-file ./playground/data/eval/vizwiz/llava_test.jsonl \ 15 | --result-file ./playground/data/eval/vizwiz/answers/${CKPT}.jsonl \ 16 | --result-upload-file ./playground/data/eval/vizwiz/answers_upload/${CKPT}.json 17 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/vqav2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | matryoshka_vis_token_scale=$1 3 | CKPT=llava-v1.5-7b-m3-${matryoshka_vis_token_scale} 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | SPLIT="llava_vqav2_mscoco_test-dev2015" 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 13 | --model-path mucai/llava-v1.5-7b-m3 \ 14 | --question-file ./playground/data/eval/vqav2/$SPLIT.jsonl \ 15 | --image-folder ./playground/data/eval/vqav2/test2015 \ 16 | --answers-file ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --temperature 0 \ 20 | --matryoshka_vis_token_scale $matryoshka_vis_token_scale \ 21 | --conv-mode vicuna_v1 & 22 | done 23 | 24 | wait 25 | 26 | output_file=./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 
32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | python scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $CKPT 37 | 38 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero3.json \ 5 | --model_name_or_path liuhaotian/llava-v1.5-7b \ 6 | --version v1 \ 7 | --data_path ./playground/data/llava_v1_5_mix665k.json \ 8 | --image_folder ./playground/data \ 9 | --vision_tower openai/clip-vit-large-patch14-336 \ 10 | --mm_projector_type mlp2x_gelu \ 11 | --mm_vision_select_layer -2 \ 12 | --mm_use_im_start_end False \ 13 | --mm_use_im_patch_token False \ 14 | --image_aspect_ratio pad \ 15 | --group_by_modality_length True \ 16 | --bf16 True \ 17 | --output_dir ./checkpoints/llava-v1.5-13b-m3 \ 18 | --num_train_epochs 1 \ 19 | --per_device_train_batch_size 4 \ 20 | --per_device_eval_batch_size 4 \ 21 | --gradient_accumulation_steps 4 \ 22 | --evaluation_strategy "no" \ 23 | --save_strategy "steps" \ 24 | --save_steps 50000 \ 25 | --save_total_limit 1 \ 26 | --learning_rate 2e-5 \ 27 | --weight_decay 0. \ 28 | --warmup_ratio 0.03 \ 29 | --lr_scheduler_type "cosine" \ 30 | --logging_steps 1 \ 31 | --tf32 True \ 32 | --model_max_length 2048 \ 33 | --gradient_checkpointing True \ 34 | --dataloader_num_workers 4 \ 35 | --lazy_preprocess True \ 36 | --matryoshka_vis_token_scale "1,9,36,144,576" \ 37 | --unfreeze_mm_vision_tower True \ 38 | --mm_vision_tower_lr 2e-5 \ 39 | --report_to wandb 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 5 | --deepspeed ./scripts/zero3.json \ 6 | --model_name_or_path liuhaotian/llava-v1.5-7b \ 7 | --version v1 \ 8 | --data_path ./playground/data/llava_v1_5_mix665k.json \ 9 | --image_folder ./playground/data \ 10 | --vision_tower openai/clip-vit-large-patch14-336 \ 11 | --mm_projector_type mlp2x_gelu \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --image_aspect_ratio pad \ 16 | --group_by_modality_length True \ 17 | --bf16 True \ 18 | --output_dir ./checkpoints/llava-v1.5-7b-m3-lora \ 19 | --num_train_epochs 1 \ 20 | --per_device_train_batch_size 4 \ 21 | --per_device_eval_batch_size 4 \ 22 | --gradient_accumulation_steps 4 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 50000 \ 26 | --save_total_limit 1 \ 27 | --learning_rate 2e-4 \ 28 | --weight_decay 0. 
\ 29 | --warmup_ratio 0.03 \ 30 | --lr_scheduler_type "cosine" \ 31 | --logging_steps 1 \ 32 | --tf32 True \ 33 | --model_max_length 2048 \ 34 | --gradient_checkpointing True \ 35 | --dataloader_num_workers 4 \ 36 | --lazy_preprocess True \ 37 | --matryoshka_vis_token_scale "1,9,36,144,576" \ 38 | --unfreeze_mm_vision_tower True \ 39 | --mm_vision_tower_lr 2e-5 \ 40 | --report_to wandb 41 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune_task.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero3.json \ 5 | --model_name_or_path mucai/llava-v1.5-7b-m3 \ 6 | --version v1 \ 7 | --data_path ./playground/data/llava_v1_5_mix665k.json \ 8 | --image_folder ./playground/data \ 9 | --vision_tower openai/clip-vit-large-patch14-336 \ 10 | --mm_projector_type mlp2x_gelu \ 11 | --mm_vision_select_layer -2 \ 12 | --mm_use_im_start_end False \ 13 | --mm_use_im_patch_token False \ 14 | --image_aspect_ratio pad \ 15 | --group_by_modality_length True \ 16 | --bf16 True \ 17 | --output_dir ./checkpoints/llava-v1.5-7b-m3-task \ 18 | --num_train_epochs 1 \ 19 | --per_device_train_batch_size 4 \ 20 | --per_device_eval_batch_size 4 \ 21 | --gradient_accumulation_steps 4 \ 22 | --evaluation_strategy "no" \ 23 | --save_strategy "steps" \ 24 | --save_steps 50000 \ 25 | --save_total_limit 1 \ 26 | --learning_rate 2e-5 \ 27 | --weight_decay 0. \ 28 | --warmup_ratio 0.03 \ 29 | --lr_scheduler_type "cosine" \ 30 | --logging_steps 1 \ 31 | --tf32 True \ 32 | --model_max_length 2048 \ 33 | --gradient_checkpointing True \ 34 | --dataloader_num_workers 4 \ 35 | --lazy_preprocess True \ 36 | --matryoshka_vis_token_scale "1,9,36,144,576" \ 37 | --unfreeze_mm_vision_tower True \ 38 | --mm_vision_tower_lr 2e-5 \ 39 | --report_to wandb 40 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune_task_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 5 | --deepspeed ./scripts/zero3.json \ 6 | --model_name_or_path mucai/llava-v1.5-7b-m3 \ 7 | --version v1 \ 8 | --data_path ./playground/data/llava_v1_5_mix665k.json \ 9 | --image_folder ./playground/data \ 10 | --vision_tower openai/clip-vit-large-patch14-336 \ 11 | --mm_projector_type mlp2x_gelu \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --image_aspect_ratio pad \ 16 | --group_by_modality_length True \ 17 | --bf16 True \ 18 | --output_dir ./checkpoints/llava-v1.5-7b-m3-task-lora \ 19 | --num_train_epochs 1 \ 20 | --per_device_train_batch_size 4 \ 21 | --per_device_eval_batch_size 4 \ 22 | --gradient_accumulation_steps 4 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 50000 \ 26 | --save_total_limit 1 \ 27 | --learning_rate 2e-4 \ 28 | --weight_decay 0. 
\ 29 | --warmup_ratio 0.03 \ 30 | --lr_scheduler_type "cosine" \ 31 | --logging_steps 1 \ 32 | --tf32 True \ 33 | --model_max_length 2048 \ 34 | --gradient_checkpointing True \ 35 | --dataloader_num_workers 4 \ 36 | --lazy_preprocess True \ 37 | --matryoshka_vis_token_scale "1,9,36,144,576" \ 38 | --unfreeze_mm_vision_tower True \ 39 | --mm_vision_tower_lr 2e-5 \ 40 | --report_to wandb 41 | -------------------------------------------------------------------------------- /scripts/v1_5/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero2.json \ 5 | --model_name_or_path lmsys/vicuna-13b-v1.5 \ 6 | --version plain \ 7 | --data_path ./playground/data/LLaVA-Pretrain/blip_laion_cc_sbu_558k.json \ 8 | --image_folder ./playground/data/LLaVA-Pretrain/images \ 9 | --vision_tower openai/clip-vit-large-patch14-336 \ 10 | --mm_projector_type mlp2x_gelu \ 11 | --tune_mm_mlp_adapter True \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --bf16 True \ 16 | --output_dir ./checkpoints/llava-v1.5-13b-pretrain \ 17 | --num_train_epochs 1 \ 18 | --per_device_train_batch_size 32 \ 19 | --per_device_eval_batch_size 4 \ 20 | --gradient_accumulation_steps 1 \ 21 | --evaluation_strategy "no" \ 22 | --save_strategy "steps" \ 23 | --save_steps 24000 \ 24 | --save_total_limit 1 \ 25 | --learning_rate 1e-3 \ 26 | --weight_decay 0. \ 27 | --warmup_ratio 0.03 \ 28 | --lr_scheduler_type "cosine" \ 29 | --logging_steps 1 \ 30 | --tf32 True \ 31 | --model_max_length 2048 \ 32 | --gradient_checkpointing True \ 33 | --dataloader_num_workers 4 \ 34 | --lazy_preprocess True \ 35 | --report_to wandb 36 | -------------------------------------------------------------------------------- /scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /scripts/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- 
/scripts/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "gather_16bit_weights_on_model_save": true 49 | }, 50 | "gradient_accumulation_steps": "auto", 51 | "gradient_clipping": "auto", 52 | "train_batch_size": "auto", 53 | "train_micro_batch_size_per_gpu": "auto", 54 | "steps_per_print": 1e5, 55 | "wall_clock_breakdown": false 56 | } --------------------------------------------------------------------------------
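Usage sketch (an assumed calling convention, not a shipped entry point): most of the eval scripts above read the matryoshka visual-token scale as their first positional argument ($1) and fold it into the answer-file name via $CKPT, and the finetune scripts train with --matryoshka_vis_token_scale "1,9,36,144,576", so evaluating at one of those trained scales would look roughly like:

# hypothetical invocations: evaluate mucai/llava-v1.5-7b-m3 at 144 and 576 visual tokens
bash scripts/v1_5/eval/mme.sh 144
bash scripts/v1_5/eval/sqa.sh 576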
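Similarly, scripts/zero3_offload.json is a ZeRO-3 config with optimizer and parameter offload to CPU; none of the training scripts above reference it, so the following is only an assumed way to use it when GPU memory is tight (expect slower steps than with plain zero3.json):

# hypothetical: derive an offload variant of finetune.sh by swapping the DeepSpeed config it passes
sed 's#scripts/zero3.json#scripts/zero3_offload.json#' scripts/v1_5/finetune.sh > scripts/v1_5/finetune_offload.sh
bash scripts/v1_5/finetune_offload.sh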