├── .dockerignore ├── .editorconfig ├── .gitattributes ├── .github └── ISSUE_TEMPLATE │ ├── 1-usage.yaml │ ├── 2-feature-request.yaml │ ├── 3-question.yaml │ └── 4-discussion.yaml ├── .gitignore ├── LICENSE ├── README.md ├── img ├── p-mod.png ├── table1.png └── table2.png ├── llava ├── __init__.py ├── constants.py ├── conversation.py ├── eval │ ├── eval_gpt_review.py │ ├── eval_gpt_review_bench.py │ ├── eval_gpt_review_visual.py │ ├── eval_pope.py │ ├── eval_science_qa.py │ ├── eval_science_qa_gpt4.py │ ├── eval_science_qa_gpt4_requery.py │ ├── eval_textvqa.py │ ├── generate_webpage_data_from_table.py │ ├── m4c_evaluator.py │ ├── model_qa.py │ ├── model_vqa.py │ ├── model_vqa_loader.py │ ├── model_vqa_mmbench.py │ ├── model_vqa_science.py │ ├── qa_baseline_gpt35.py │ ├── run_llava.py │ └── summarize_gpt_review.py ├── mm_utils.py ├── model │ ├── __init__.py │ ├── apply_delta.py │ ├── builder.py │ ├── consolidate.py │ ├── language_model │ │ ├── llava_mistral.py │ │ ├── llava_mpt.py │ │ ├── modeling_llama_pmod.py │ │ └── pmod_llava_llama.py │ ├── llava_arch.py │ ├── make_delta.py │ ├── multimodal_encoder │ │ ├── builder.py │ │ └── clip_encoder.py │ ├── multimodal_projector │ │ └── builder.py │ └── utils.py ├── serve │ ├── __init__.py │ ├── cli.py │ ├── controller.py │ ├── examples │ │ ├── extreme_ironing.jpg │ │ └── waterview.jpg │ ├── gradio_web_server.py │ ├── model_worker.py │ ├── register_worker.py │ ├── sglang_worker.py │ └── test_message.py ├── train │ ├── llama_flash_attn_monkey_patch.py │ ├── llama_xformers_attn_monkey_patch.py │ ├── llava_trainer.py │ ├── train.py │ ├── train_mem.py │ └── train_xformers.py └── utils.py ├── lmms-eval ├── .github │ ├── issue_template.md │ ├── pull_request_template.md │ └── workflows │ │ └── black.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── docs │ ├── README.md │ ├── commands.md │ ├── current_tasks.md │ ├── model_guide.md │ └── task_guide.md ├── lmms_eval │ ├── __init__.py │ ├── __main__.py │ ├── api │ │ ├── __init__.py │ │ ├── filter.py │ │ ├── instance.py │ │ ├── metrics.py │ │ ├── model.py │ │ ├── registry.py │ │ ├── samplers.py │ │ └── task.py │ ├── evaluator.py │ ├── filters │ │ ├── __init__.py │ │ ├── decontamination.py │ │ ├── extraction.py │ │ ├── selection.py │ │ └── transformation.py │ ├── logging_utils.py │ ├── models │ │ ├── __init__.py │ │ ├── batch_gpt4.py │ │ ├── claude.py │ │ ├── from_log.py │ │ ├── fuyu.py │ │ ├── gemini_api.py │ │ ├── gpt4v.py │ │ ├── idefics2.py │ │ ├── instructblip.py │ │ ├── internvl.py │ │ ├── llama_vid.py │ │ ├── llava.py │ │ ├── llava_hf.py │ │ ├── llava_sglang.py │ │ ├── llava_vid.py │ │ ├── longva.py │ │ ├── minicpm_v.py │ │ ├── model_utils │ │ │ ├── __init__.py │ │ │ ├── load_video.py │ │ │ └── qwen │ │ │ │ └── qwen_generate_utils.py │ │ ├── mplug_owl_video.py │ │ ├── mplug_owl_video │ │ │ ├── __init__.py │ │ │ ├── configuration_mplug_owl.py │ │ │ ├── modeling_mplug_owl.py │ │ │ ├── processing_mplug_owl.py │ │ │ └── tokenization_mplug_owl.py │ │ ├── phi3v.py │ │ ├── qwen_vl.py │ │ ├── qwen_vl_api.py │ │ ├── reka.py │ │ ├── tinyllava.py │ │ ├── video_chatgpt.py │ │ ├── video_chatgpt │ │ │ ├── __init__.py │ │ │ ├── constants.py │ │ │ ├── eval │ │ │ │ ├── __init__.py │ │ │ │ └── model_utils.py │ │ │ ├── inference.py │ │ │ ├── model │ │ │ │ ├── __init__.py │ │ │ │ ├── consolidate.py │ │ │ │ ├── make_delta.py │ │ │ │ ├── utils.py │ │ │ │ └── video_chatgpt.py │ │ │ ├── single_video_inference.py │ │ │ ├── utils.py │ │ │ └── video_conversation.py │ │ ├── video_llava.py │ │ ├── xcomposer2_4KHD.py │ │ └── xcomposer2_4khd.py │ ├── tasks │ │ ├── __init__.py │ │ ├── _task_utils │ │ │ ├── file_utils.py │ │ │ ├── gpt_eval_utils.py │ │ │ ├── video_loader.py │ │ │ └── vqa_eval_metric.py │ │ ├── activitynetqa │ │ │ ├── _default_template_yaml │ │ │ ├── activitynetqa_generation.yaml │ │ │ └── utils.py │ │ ├── ai2d │ │ │ ├── ai2d.yaml │ │ │ ├── upload_ai2d.py │ │ │ └── utils.py │ │ ├── chartqa │ │ │ ├── chartqa.yaml │ │ │ ├── upload_chartqa.py │ │ │ └── utils.py │ │ ├── cmmmu │ │ │ ├── _cmmmu.yaml │ │ │ ├── _default_template_cmmmu_yaml │ │ │ ├── cmmmu_test.yaml │ │ │ ├── cmmmu_val.yaml │ │ │ └── utils.py │ │ ├── coco_cap │ │ │ ├── coco2014_cap.yaml │ │ │ ├── coco2014_cap_test.yaml │ │ │ ├── coco2014_cap_val.yaml │ │ │ ├── coco2017_cap.yaml │ │ │ ├── coco2017_cap_test.yaml │ │ │ ├── coco2017_cap_val.yaml │ │ │ ├── coco_cap.yaml │ │ │ └── utils.py │ │ ├── conbench │ │ │ ├── conbench.yaml │ │ │ └── utils.py │ │ ├── cvrr │ │ │ ├── _cvrr.yaml │ │ │ ├── _default_template_yaml │ │ │ ├── cvrr_fine_grained_action_understanding.yaml │ │ │ ├── cvrr_interpretation_of_social_context.yaml │ │ │ ├── cvrr_interpretation_of_visual_context.yaml │ │ │ ├── cvrr_multiple_actions_in_a_single_video.yaml │ │ │ ├── cvrr_non_existent_actions_with_existent_scene_depictions.yaml │ │ │ ├── cvrr_non_existent_actions_with_non_existent_scene_depictions.yaml │ │ │ ├── cvrr_object_instance_count.yaml │ │ │ ├── cvrr_partial_actions.yaml │ │ │ ├── cvrr_time_order_understanding.yaml │ │ │ ├── cvrr_understanding_emotional_context.yaml │ │ │ ├── cvrr_unusual_and_physically_anomalous_activities.yaml │ │ │ └── utils.py │ │ ├── docvqa │ │ │ ├── _default_template_docvqa_yaml │ │ │ ├── docvqa.yaml │ │ │ ├── docvqa_test.yaml │ │ │ ├── docvqa_val.yaml │ │ │ └── utils.py │ │ ├── egoschema │ │ │ ├── README.md │ │ │ ├── _default_template_yaml │ │ │ ├── egoschema.yaml │ │ │ ├── egoschema_mcppl.yaml │ │ │ ├── egoschema_subset.yaml │ │ │ ├── egoschema_subset_mcppl.yaml │ │ │ └── utils.py │ │ ├── ferret │ │ │ ├── ferret.yaml │ │ │ └── utils.py │ │ ├── flickr30k │ │ │ ├── flickr30k.yaml │ │ │ ├── flickr30k_test.yaml │ │ │ └── utils.py │ │ ├── gqa │ │ │ ├── gqa.yaml │ │ │ └── utils.py │ │ ├── gqa_ru │ │ │ ├── gqa_ru.yaml │ │ │ └── utils.py │ │ ├── hallusion_bench │ │ │ ├── evaluate_hb.py │ │ │ ├── hallusion_bench_image.yaml │ │ │ └── utils.py │ │ ├── iconqa │ │ │ ├── _default_template_docvqa_yaml │ │ │ ├── iconqa.yaml │ │ │ ├── iconqa_test.yaml │ │ │ ├── iconqa_val.yaml │ │ │ └── utils.py │ │ ├── ii_bench │ │ │ ├── ii_bench.yaml │ │ │ └── utils.py │ │ ├── infovqa │ │ │ ├── _default_template_infovqa_yaml │ │ │ ├── infovqa.yaml │ │ │ ├── infovqa_test.yaml │ │ │ ├── infovqa_val.yaml │ │ │ └── utils.py │ │ ├── internal_eval │ │ │ ├── _default_template_internal_eval_yaml │ │ │ ├── d170_cn.yaml │ │ │ ├── d170_cn_utils.py │ │ │ ├── d170_en.yaml │ │ │ ├── d170_en_utils.py │ │ │ ├── dc100_en.yaml │ │ │ ├── dc100_en_utils.py │ │ │ ├── dc200_cn.yaml │ │ │ ├── dc200_cn_utils.py │ │ │ ├── internal_eval.yaml │ │ │ └── utils.py │ │ ├── llava-bench-coco │ │ │ ├── llava-bench-coco.yaml │ │ │ └── utils.py │ │ ├── llava-in-the-wild │ │ │ ├── llava-in-the-wild.yaml │ │ │ └── utils.py │ │ ├── llava_wilder │ │ │ ├── _default_template_wilder_yaml │ │ │ ├── llava_wilder_small.yaml │ │ │ └── utils.py │ │ ├── longvideobench │ │ │ ├── longvideobench_val_i.yaml │ │ │ ├── longvideobench_val_v.yaml │ │ │ └── utils.py │ │ ├── mathverse │ │ │ ├── mathverse.yaml │ │ │ ├── mathverse_evals.py │ │ │ ├── mathverse_testmini.yaml │ │ │ ├── mathverse_testmini_text_dominant.yaml │ │ │ ├── mathverse_testmini_text_lite.yaml │ │ │ ├── mathverse_testmini_text_only.yaml │ │ │ ├── mathverse_testmini_vision_dominant.yaml │ │ │ ├── mathverse_testmini_vision_intensive.yaml │ │ │ ├── mathverse_testmini_vision_only.yaml │ │ │ └── utils.py │ │ ├── mathvista │ │ │ ├── mathvista.yaml │ │ │ ├── mathvista_evals.py │ │ │ ├── mathvista_test.yaml │ │ │ ├── mathvista_testmini.yaml │ │ │ └── utils.py │ │ ├── mmbench │ │ │ ├── _default_template_mmbench_cn_yaml │ │ │ ├── _default_template_mmbench_en_yaml │ │ │ ├── _default_template_mmbench_ru_yaml │ │ │ ├── cc_utils.py │ │ │ ├── cn_utils.py │ │ │ ├── en_utils.py │ │ │ ├── mmbench.yaml │ │ │ ├── mmbench_cc.yaml │ │ │ ├── mmbench_cn.yaml │ │ │ ├── mmbench_cn_dev.yaml │ │ │ ├── mmbench_cn_test.yaml │ │ │ ├── mmbench_en.yaml │ │ │ ├── mmbench_en_dev.yaml │ │ │ ├── mmbench_en_test.yaml │ │ │ ├── mmbench_evals.py │ │ │ ├── mmbench_ru_dev.yaml │ │ │ └── ru_utils.py │ │ ├── mme │ │ │ ├── mme.yaml │ │ │ └── utils.py │ │ ├── mmmu │ │ │ ├── arial.ttf │ │ │ ├── mmmu.yaml │ │ │ ├── mmmu_group_img.yaml │ │ │ ├── mmmu_group_img_test.yaml │ │ │ ├── mmmu_group_img_val.yaml │ │ │ ├── mmmu_test.yaml │ │ │ ├── mmmu_val.yaml │ │ │ ├── utils.py │ │ │ └── utils_group_img.py │ │ ├── mmupd │ │ │ ├── _default_template_mmupd_yaml │ │ │ ├── mmaad_base.yaml │ │ │ ├── mmaad_instruction.yaml │ │ │ ├── mmaad_option.yaml │ │ │ ├── mmiasd_base.yaml │ │ │ ├── mmiasd_instruction.yaml │ │ │ ├── mmiasd_option.yaml │ │ │ ├── mmivqd_base.yaml │ │ │ ├── mmivqd_instruction.yaml │ │ │ ├── mmivqd_option.yaml │ │ │ ├── mmupd.yaml │ │ │ ├── mmupd_base.yaml │ │ │ ├── mmupd_evals.py │ │ │ ├── mmupd_instruction.yaml │ │ │ ├── mmupd_option.yaml │ │ │ └── utils.py │ │ ├── mmvet │ │ │ ├── mmvet.yaml │ │ │ └── utils.py │ │ ├── multidocvqa │ │ │ ├── multidocvqa.yaml │ │ │ ├── multidocvqa_test.yaml │ │ │ ├── multidocvqa_val.yaml │ │ │ └── utils.py │ │ ├── multilingual-llava-bench-in-the-wild │ │ │ ├── README.md │ │ │ ├── _default_template.yaml │ │ │ ├── arabic_llava_in_the_wild.yaml │ │ │ ├── bengali_llava_in_the_wild.yaml │ │ │ ├── chinese_llava_in_the_wild.yaml │ │ │ ├── french_llava_in_the_wild.yaml │ │ │ ├── hindi_llava_in_the_wild.yaml │ │ │ ├── japanese_llava_in_the_wild.yaml │ │ │ ├── russian_llava_in_the_wild.yaml │ │ │ ├── spanish_llava_in_the_wild.yaml │ │ │ ├── urdu_llava_in_the_wild.yaml │ │ │ └── utils.py │ │ ├── nextqa │ │ │ ├── _default_template_yaml │ │ │ ├── nextqa.yaml │ │ │ ├── nextqa_mc_test.yaml │ │ │ ├── nextqa_oe_test.yaml │ │ │ ├── nextqa_oe_val.yaml │ │ │ ├── stopwords.csv │ │ │ └── utils.py │ │ ├── nocaps │ │ │ ├── _default_template_nocaps_yaml │ │ │ ├── nocaps.yaml │ │ │ ├── nocaps_test.yaml │ │ │ ├── nocaps_val.yaml │ │ │ └── utils.py │ │ ├── ocrbench │ │ │ ├── ocrbench.yaml │ │ │ ├── upload_ocrbench.py │ │ │ └── utils.py │ │ ├── ok_vqa │ │ │ ├── _default_template_vqa_yaml │ │ │ ├── _generate_config.py │ │ │ ├── _ok_vqa.yaml │ │ │ ├── ok_vqa_val2014.yaml │ │ │ └── utils.py │ │ ├── olympiadbench │ │ │ ├── cn_utils.py │ │ │ ├── en_utils.py │ │ │ ├── olympiadbench.yaml │ │ │ ├── olympiadbench_evals.py │ │ │ ├── olympiadbench_test_cn.yaml │ │ │ └── olympiadbench_test_en.yaml │ │ ├── perceptiontest │ │ │ ├── test │ │ │ │ ├── _default_template_yaml │ │ │ │ ├── perceptiontest_mc.yaml │ │ │ │ ├── perceptiontest_mcppl.yaml │ │ │ │ └── utils.py │ │ │ └── val │ │ │ │ ├── _default_template_yaml │ │ │ │ ├── perceptiontest_mc.yaml │ │ │ │ ├── perceptiontest_mcppl.yaml │ │ │ │ └── utils.py │ │ ├── pope │ │ │ ├── pope.yaml │ │ │ ├── pope_adv.yaml │ │ │ ├── pope_full.yaml │ │ │ ├── pope_pop.yaml │ │ │ ├── pope_random.yaml │ │ │ └── utils.py │ │ ├── qbench │ │ │ ├── abench_dev.yaml │ │ │ ├── qbench2_dev.yaml │ │ │ ├── qbench_dev.yaml │ │ │ ├── qbenchs_dev.yaml │ │ │ └── utils.py │ │ ├── realworldqa │ │ │ ├── realworldqa.yaml │ │ │ └── utils.py │ │ ├── refcoco+ │ │ │ ├── _default_template_bbox_yaml │ │ │ ├── _default_template_seg_yaml │ │ │ ├── _generate_config.py │ │ │ ├── _refcoco.yaml │ │ │ ├── refcoco+_bbox_testA.yaml │ │ │ ├── refcoco+_bbox_testB.yaml │ │ │ ├── refcoco+_bbox_val.yaml │ │ │ ├── refcoco+_seg_testA.yaml │ │ │ ├── refcoco+_seg_testB.yaml │ │ │ ├── refcoco+_seg_val.yaml │ │ │ └── utils.py │ │ ├── refcoco │ │ │ ├── _default_template_bbox_yaml │ │ │ ├── _default_template_seg_yaml │ │ │ ├── _generate_config.py │ │ │ ├── _refcoco.yaml │ │ │ ├── refcoco_bbox_test.yaml │ │ │ ├── refcoco_bbox_testA.yaml │ │ │ ├── refcoco_bbox_testB.yaml │ │ │ ├── refcoco_bbox_val.yaml │ │ │ ├── refcoco_seg_test.yaml │ │ │ ├── refcoco_seg_testA.yaml │ │ │ ├── refcoco_seg_testB.yaml │ │ │ ├── refcoco_seg_val.yaml │ │ │ └── utils.py │ │ ├── refcocog │ │ │ ├── _default_template_bbox_yaml │ │ │ ├── _default_template_seg_yaml │ │ │ ├── _generate_config.py │ │ │ ├── _refcoco.yaml │ │ │ ├── refcocog_bbox_test.yaml │ │ │ ├── refcocog_bbox_val.yaml │ │ │ ├── refcocog_seg_test.yaml │ │ │ ├── refcocog_seg_val.yaml │ │ │ └── utils.py │ │ ├── scienceqa │ │ │ ├── scienceqa.yaml │ │ │ ├── scienceqa_full.yaml │ │ │ ├── scienceqa_img.yaml │ │ │ └── utils.py │ │ ├── screenspot │ │ │ ├── README.md │ │ │ ├── _default_template_rec_yaml │ │ │ ├── _default_template_reg_yaml │ │ │ ├── _screenspot.yaml │ │ │ ├── screenspot_rec_test.yaml │ │ │ ├── screenspot_reg_test.yaml │ │ │ ├── utils.py │ │ │ └── utils_rec.py │ │ ├── seedbench │ │ │ ├── seedbench.yaml │ │ │ ├── seedbench_ppl.yaml │ │ │ └── utils.py │ │ ├── seedbench_2 │ │ │ ├── seedbench_2.yaml │ │ │ └── utils.py │ │ ├── stvqa │ │ │ ├── stvqa.yaml │ │ │ └── utils.py │ │ ├── synthdog │ │ │ ├── donut_evaluator.py │ │ │ ├── synthdog.yaml │ │ │ ├── synthdog_en.yaml │ │ │ ├── synthdog_zh.yaml │ │ │ └── utils.py │ │ ├── tempcompass │ │ │ ├── _default_template_yaml │ │ │ ├── _tempcompass.yaml │ │ │ ├── tempcompass_caption_matching.yaml │ │ │ ├── tempcompass_captioning.yaml │ │ │ ├── tempcompass_mc.yaml │ │ │ ├── tempcompass_yes_no.yaml │ │ │ └── utils.py │ │ ├── textcaps │ │ │ ├── _default_template_textcaps_yaml │ │ │ ├── textcaps.yaml │ │ │ ├── textcaps_test.yaml │ │ │ ├── textcaps_train.yaml │ │ │ ├── textcaps_val.yaml │ │ │ └── utils.py │ │ ├── textvqa │ │ │ ├── _default_template_textvqa_yaml │ │ │ ├── _textvqa.yaml │ │ │ ├── textvqa_test.yaml │ │ │ ├── textvqa_val.yaml │ │ │ └── utils.py │ │ ├── vatex │ │ │ ├── _vatex.yaml │ │ │ ├── utils.py │ │ │ ├── vatex_test.yaml │ │ │ └── vatex_val_zh.yaml │ │ ├── vcr_wiki │ │ │ ├── _default_template_vcr_yaml │ │ │ ├── utils.py │ │ │ ├── vcr_wiki_en_easy.yaml │ │ │ ├── vcr_wiki_en_easy_100.yaml │ │ │ ├── vcr_wiki_en_easy_500.yaml │ │ │ ├── vcr_wiki_en_hard.yaml │ │ │ ├── vcr_wiki_en_hard_100.yaml │ │ │ ├── vcr_wiki_en_hard_500.yaml │ │ │ ├── vcr_wiki_zh_easy.yaml │ │ │ ├── vcr_wiki_zh_easy_100.yaml │ │ │ ├── vcr_wiki_zh_easy_500.yaml │ │ │ ├── vcr_wiki_zh_hard.yaml │ │ │ ├── vcr_wiki_zh_hard_100.yaml │ │ │ └── vcr_wiki_zh_hard_500.yaml │ │ ├── video_detail_description │ │ │ ├── README.md │ │ │ ├── _default_template_yaml │ │ │ ├── utils.py │ │ │ └── video_detail_description.yaml │ │ ├── videochatgpt │ │ │ ├── _default_template_yaml │ │ │ ├── _videochatgpt.yaml │ │ │ ├── utils.py │ │ │ ├── videochatgpt_consistency.yaml │ │ │ ├── videochatgpt_generic.yaml │ │ │ └── videochatgpt_temporal.yaml │ │ ├── videomme │ │ │ ├── utils.py │ │ │ ├── videomme.yaml │ │ │ └── videomme_w_subtitle.yaml │ │ ├── vitatecs │ │ │ ├── _default_template_yaml │ │ │ ├── _vitatecs.yaml │ │ │ ├── utils.py │ │ │ ├── vitatecs_compositionality.yaml │ │ │ ├── vitatecs_direction.yaml │ │ │ ├── vitatecs_intensity.yaml │ │ │ ├── vitatecs_localization.yaml │ │ │ ├── vitatecs_sequence.yaml │ │ │ └── vitatecs_type.yaml │ │ ├── vizwiz_vqa │ │ │ ├── _default_template_vqa_yaml │ │ │ ├── _generate_config.py │ │ │ ├── _vizwiz_vqa.yaml │ │ │ ├── utils.py │ │ │ ├── vizwiz_vqa_test.yaml │ │ │ └── vizwiz_vqa_val.yaml │ │ ├── vqav2 │ │ │ ├── _default_template_vqav2_yaml │ │ │ ├── _vqav2.yaml │ │ │ ├── utils.py │ │ │ ├── vqav2_test.yaml │ │ │ └── vqav2_val.yaml │ │ ├── websrc │ │ │ ├── README.md │ │ │ ├── utils.py │ │ │ ├── websrc.yaml │ │ │ ├── websrc_test.yaml │ │ │ └── websrc_val.yaml │ │ ├── wild_vision_bench │ │ │ ├── _default_template_yaml │ │ │ ├── utils.py │ │ │ └── wild_vision_bench0617.yaml │ │ ├── worldqa │ │ │ ├── _default_template_yaml │ │ │ ├── utils.py │ │ │ ├── worldqa.yaml │ │ │ ├── worldqa_generation.yaml │ │ │ ├── worldqa_mc.yaml │ │ │ ├── worldqa_mc_evaluator.py │ │ │ └── worldqa_mcppl.yaml │ │ └── youcook2 │ │ │ ├── _default_template_yaml │ │ │ ├── utils.py │ │ │ └── youcook2_val.yaml │ └── utils.py ├── miscs │ ├── example_eval.yaml │ ├── llava_repr_requirements.txt │ ├── llava_result_check.md │ ├── llava_sglang_result_check.md │ ├── repr_scripts.sh │ ├── repr_torch_envs.txt │ ├── scienceqa_id.txt │ ├── script.sh │ ├── test_llava.py │ ├── test_scienceqa.py │ ├── tinyllava_repr_requirements.txt │ └── tinyllava_repr_scripts.sh ├── pyproject.toml ├── setup.py └── tools │ ├── get_video_avg_time.py │ ├── make_image_hf_dataset.ipynb │ ├── make_video_hf_dataset.ipynb │ └── makecvrr.ipynb ├── pyproject.toml ├── scripts ├── lmms-eval │ └── eval.sh ├── train │ ├── finetune_eval_7b_pmod_llava_1_5.sh │ └── finetune_eval_7b_pmod_llava_next.sh ├── upload_pypi.sh ├── zero1.json ├── zero2.json ├── zero3.json └── zero3_offload.json └── util_scripts ├── clean_data_json.py ├── demo.py ├── download_llava-next_data.py └── setup_env.sh /.dockerignore: -------------------------------------------------------------------------------- 1 | # The .dockerignore file excludes files from the container build process. 2 | # 3 | # https://docs.docker.com/engine/reference/builder/#dockerignore-file 4 | 5 | # Exclude Git files 6 | .git 7 | .github 8 | .gitignore 9 | 10 | # Exclude Python cache files 11 | __pycache__ 12 | .mypy_cache 13 | .pytest_cache 14 | .ruff_cache 15 | 16 | # Exclude Python virtual environment 17 | /venv 18 | 19 | # Exclude some weights 20 | /openai 21 | /liuhaotian 22 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | # Unix-style newlines with a newline ending every file 4 | [*] 5 | end_of_line = lf 6 | insert_final_newline = true 7 | trim_trailing_whitespace = true 8 | charset = utf-8 9 | 10 | # 4 space indentation 11 | [*.{py,json}] 12 | indent_style = space 13 | indent_size = 4 14 | 15 | # 2 space indentation 16 | [*.{md,sh,yaml,yml}] 17 | indent_style = space 18 | indent_size = 2 -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # https://git-scm.com/docs/gitattributes 2 | 3 | # Set the default behavior, in case people don't have core.autocrlf set. 4 | # https://git-scm.com/docs/gitattributes#_end_of_line_conversion 5 | * text=auto 6 | 7 | # common python attributes, taken from https://github.com/alexkaratarakis/gitattributes/blob/710900479a2bedeec7003d381719521ffbb18bf8/Python.gitattributes 8 | # Source files 9 | # ============ 10 | *.pxd text diff=python 11 | *.py text diff=python 12 | *.py3 text diff=python 13 | *.pyw text diff=python 14 | *.pyx text diff=python 15 | *.pyz text diff=python 16 | *.pyi text diff=python 17 | 18 | # Binary files 19 | # ============ 20 | *.db binary 21 | *.p binary 22 | *.pkl binary 23 | *.pickle binary 24 | *.pyc binary export-ignore 25 | *.pyo binary export-ignore 26 | *.pyd binary 27 | 28 | # Jupyter notebook 29 | *.ipynb text eol=lf 30 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/1-usage.yaml: -------------------------------------------------------------------------------- 1 | name: Usage issues 2 | description: Report issues in usage. 3 | title: "[Usage] " 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Thanks for taking the time to fill out this form. Please give as detailed description as possible for us to better assist with the issue :) 9 | - type: textarea 10 | id: what-happened 11 | attributes: 12 | label: Describe the issue 13 | description: Please give as detailed description as possible for us to better assist with the issue. Please paste the **FULL** error log here, so that we can better understand the issue. Wrap the log with ``` for better readability in GitHub. 14 | placeholder: Issue 15 | value: | 16 | Issue: 17 | 18 | Command: 19 | ``` 20 | PASTE THE COMMANDS HERE. 21 | ``` 22 | 23 | Log: 24 | ``` 25 | PASTE THE LOGS HERE. 26 | ``` 27 | 28 | Screenshots: 29 | You may attach screenshots if it better explains the issue. 30 | validations: 31 | required: true 32 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/2-feature-request.yaml: -------------------------------------------------------------------------------- 1 | name: Feature Request 2 | description: Request for a new feature 3 | title: "[Feature request] " 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Thanks for your interest in our work. Please share your thoughts of the new features below. 9 | - type: textarea 10 | id: feature 11 | attributes: 12 | label: feature 13 | placeholder: Start your thoughts here... -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/3-question.yaml: -------------------------------------------------------------------------------- 1 | name: Questions 2 | description: General questions about the work 3 | title: "[Question] " 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Thanks for your interest in our work. For this type of question, it may be more suitable to go to [discussion](https://github.com/haotian-liu/LLaVA/discussions) sections. If you believe an issue would be better for your request, please continue your post below :) 9 | - type: textarea 10 | id: question 11 | attributes: 12 | label: Question 13 | placeholder: Start question here... -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/4-discussion.yaml: -------------------------------------------------------------------------------- 1 | name: Discussions 2 | description: General discussions about the work 3 | title: "[Discussion] " 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Thanks for your interest in our work. For this type of question, it may be more suitable to go to [discussion](https://github.com/haotian-liu/LLaVA/discussions) sections. If you believe an issue would be better for your request, please continue your post below :) 9 | - type: textarea 10 | id: discussion 11 | attributes: 12 | label: Discussion 13 | placeholder: Start discussion here... -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__ 3 | *.pyc 4 | *.egg-info 5 | dist 6 | 7 | # Log 8 | *.log 9 | *.log.* 10 | *.json 11 | *.jsonl 12 | 13 | # Data 14 | !**/alpaca-data-conversation.json 15 | 16 | # Editor 17 | .idea 18 | *.swp 19 | 20 | # Other 21 | .DS_Store 22 | wandb 23 | output 24 | 25 | checkpoints 26 | ckpts* 27 | 28 | .ipynb_checkpoints 29 | # *.ipynb 30 | 31 | # DevContainer 32 | !.devcontainer/* 33 | 34 | # Demo 35 | serve_images/ 36 | 37 | # Ignore training and evaluation data 38 | playground/data/* 39 | !playground/data/coco2014_val_qa_eval 40 | !playground/data/prompts 41 | !playground/data/coco2014_val_gpt4_qa_30x3.jsonl 42 | 43 | # VSCode jsons 44 | #!.vscode/launch.json 45 | #!.vscode/settings.json 46 | 47 | # deepspeed zero{1,2,3}.json config files 48 | !scripts/*.json 49 | -------------------------------------------------------------------------------- /img/p-mod.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCG-NJU/p-MoD/30c16ecf45e119142a703958c8b1ec349e3891f2/img/p-mod.png -------------------------------------------------------------------------------- /img/table1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCG-NJU/p-MoD/30c16ecf45e119142a703958c8b1ec349e3891f2/img/table1.png -------------------------------------------------------------------------------- /img/table2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCG-NJU/p-MoD/30c16ecf45e119142a703958c8b1ec349e3891f2/img/table2.png -------------------------------------------------------------------------------- /llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import PmodLlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /llava/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "" 11 | DEFAULT_IM_START_TOKEN = "" 12 | DEFAULT_IM_END_TOKEN = "" 13 | IMAGE_PLACEHOLDER = "" 14 | -------------------------------------------------------------------------------- /llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.pmod_llava_llama import PmodLlavaLlamaForCausalLM, PmodLlavaConfig 2 | -------------------------------------------------------------------------------- /llava/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava.model import * 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | use_s2 = getattr(vision_tower_cfg, 's2', False) 9 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 10 | if use_s2: 11 | return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) 12 | else: 13 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 14 | 15 | raise ValueError(f'Unknown vision tower: {vision_tower}') 16 | -------------------------------------------------------------------------------- /llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /llava/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCG-NJU/p-MoD/30c16ecf45e119142a703958c8b1ec349e3891f2/llava/serve/__init__.py -------------------------------------------------------------------------------- /llava/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCG-NJU/p-MoD/30c16ecf45e119142a703958c8b1ec349e3891f2/llava/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /llava/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCG-NJU/p-MoD/30c16ecf45e119142a703958c8b1ec349e3891f2/llava/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /llava/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /llava/train/train_mem.py: -------------------------------------------------------------------------------- 1 | from llava.train.train import train 2 | 3 | if __name__ == "__main__": 4 | train(attn_implementation="flash_attention_2") 5 | -------------------------------------------------------------------------------- /llava/train/train_xformers.py: -------------------------------------------------------------------------------- 1 | # Make it more memory efficient by monkey patching the LLaMA model with xformers attention. 2 | 3 | # Need to call this before importing transformers. 4 | from llava.train.llama_xformers_attn_monkey_patch import ( 5 | replace_llama_attn_with_xformers_attn, 6 | ) 7 | 8 | replace_llama_attn_with_xformers_attn() 9 | 10 | from llava.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /lmms-eval/.github/issue_template.md: -------------------------------------------------------------------------------- 1 | Before you open an issue, please check if a similar issue already exists or has been closed before. 2 | 3 | ### When you open an issue, please be sure to include the following 4 | 5 | - [ ] A descriptive title: [xxx] XXXX 6 | - [ ] A detailed description 7 | 8 | Thank you for your contributions! 9 | -------------------------------------------------------------------------------- /lmms-eval/.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | Before you open a pull-request, please check if a similar issue already exists or has been closed before. 2 | 3 | ### When you open a pull-request, please be sure to include the following 4 | 5 | - [ ] A descriptive title: [xxx] XXXX 6 | - [ ] A detailed description 7 | 8 | Thank you for your contributions! 9 | -------------------------------------------------------------------------------- /lmms-eval/.github/workflows/black.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | lint: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v3 10 | - name: Set up Python 11 | uses: actions/setup-python@v4 12 | with: 13 | python-version: '3.9' 14 | - name: Install specific version of Black 15 | run: pip install black==23.9.1 16 | - name: Run Black 17 | run: black --line-length=240 ./ -------------------------------------------------------------------------------- /lmms-eval/.gitignore: -------------------------------------------------------------------------------- 1 | env 2 | *.pyc 3 | output/ 4 | data/ 5 | lm_cache 6 | .idea 7 | build 8 | dist 9 | *.egg-info 10 | venv 11 | .vscode/ 12 | temp 13 | __pycache__ 14 | .ipynb_checkpoints 15 | temp 16 | # IPython 17 | profile_default/ 18 | ipython_config.py 19 | logs/ 20 | scripts/ 21 | wandb/ 22 | SimSun.ttf 23 | submissions/ 24 | lmms_eval/tasks/hallusion_bench/hallusion_output_vs_model.json 25 | lmms_eval/tasks/hallusion_bench/hallusion_output_vd_model.json 26 | zk.log 27 | cache_dir 28 | ckpt 29 | pretrained/ 30 | LLaVA/ 31 | *logs 32 | temp/ 33 | InternVL/ 34 | logs/ 35 | data/ 36 | llava-video/ 37 | Video-MME/ 38 | VATEX/ 39 | lmms_eval/tasks/vatex/__pycache__/utils.cpython-310.pyc 40 | -------------------------------------------------------------------------------- /lmms-eval/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 23.12.1 4 | hooks: 5 | - id: black 6 | language_version: python3 -------------------------------------------------------------------------------- /lmms-eval/docs/README.md: -------------------------------------------------------------------------------- 1 | # LMMs Eval Documentation 2 | 3 | Welcome to the docs for `lmms-eval`! 4 | 5 | Majority of this documentation is adapted from [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness/) 6 | 7 | ## Table of Contents 8 | 9 | * To learn about the command line flags, see the [commands](commands.md) 10 | * To learn how to add a new moddel, see the [Model Guide](model_guide.md). 11 | * For a crash course on adding new tasks to the library, see our [Task Guide](task_guide.md). 12 | * If you need to upload your datasets into correct HF format with viewer supported, please refer to [tools](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/pufanyi/hf_dataset_docs/tools) 13 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCG-NJU/p-MoD/30c16ecf45e119142a703958c8b1ec349e3891f2/lmms-eval/lmms_eval/__init__.py -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCG-NJU/p-MoD/30c16ecf45e119142a703958c8b1ec349e3891f2/lmms-eval/lmms_eval/api/__init__.py -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/api/instance.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Literal, Tuple 3 | 4 | 5 | @dataclass 6 | class Instance: 7 | request_type: Literal["loglikelihood", "generate_until"] 8 | arguments: tuple 9 | idx: int 10 | metadata: Tuple[str, int, int] = field(default_factory=lambda: (None, None, None)) # TODO: better typehints here 11 | resps: list = field(default_factory=list) 12 | filtered_resps: dict = field(default_factory=dict) 13 | 14 | # initialized after init 15 | task_name: str = None 16 | doc_id: str = None 17 | repeats: str = None 18 | doc: dict = None 19 | 20 | def __post_init__(self) -> None: 21 | # unpack metadata field 22 | self.task_name, self.doc_id, self.repeats = self.metadata 23 | 24 | @property 25 | def args(self): 26 | """ 27 | Returns (string,) where `string` is the string to calculate loglikelihood over 28 | """ 29 | return self.arguments if isinstance(self.arguments, tuple) else (self.arguments,) 30 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/filters/decontamination.py: -------------------------------------------------------------------------------- 1 | from lmms_eval.api.filter import Filter 2 | 3 | 4 | class DecontaminationFilter(Filter): 5 | """ 6 | A filter which evaluates 7 | """ 8 | 9 | name = "track_decontamination" 10 | 11 | def __init__(self, path) -> None: 12 | """ 13 | 14 | TODO: make sure only ever run one time on the train set (should this be cached as a class var? keyed by value for "path"). 15 | should further cache result on a given (task_name, doc_id) 16 | """ 17 | self._decontam_results = None 18 | 19 | def apply(self, resps, docs) -> None: 20 | """ 21 | Return {"no_contamination", "only_contamination"} keys for the 2 different subsets 22 | """ 23 | pass 24 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/models/model_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCG-NJU/p-MoD/30c16ecf45e119142a703958c8b1ec349e3891f2/lmms-eval/lmms_eval/models/model_utils/__init__.py -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/models/video_chatgpt/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import VideoChatGPTLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/models/video_chatgpt/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 5 | 6 | 7 | # Defining model 8 | DEFAULT_VIDEO_TOKEN = "