├── .devcontainer ├── Dockerfile ├── devcontainer.env ├── devcontainer.json └── postCreateCommand.sh ├── .dockerignore ├── .editorconfig ├── .gitattributes ├── .github └── ISSUE_TEMPLATE │ ├── 1-usage.yaml │ ├── 2-feature-request.yaml │ ├── 3-question.yaml │ └── 4-discussion.yaml ├── .gitignore ├── LICENSE ├── README.md ├── assets ├── deepstack_teaser.png ├── deepstack_vl.png ├── logo.png └── visualization.png ├── cog.yaml ├── docs └── MODEL_ZOO.md ├── images ├── demo_cli.gif ├── llava_example_cmp.png ├── llava_logo.png └── llava_v1_5_radar.jpg ├── llava ├── __init__.py ├── constants.py ├── conversation.py ├── eval │ ├── eval_gpt_review.py │ ├── eval_gpt_review_bench.py │ ├── eval_gpt_review_visual.py │ ├── eval_pope.py │ ├── eval_science_qa.py │ ├── eval_science_qa_gpt4.py │ ├── eval_science_qa_gpt4_requery.py │ ├── eval_textvqa.py │ ├── generate_webpage_data_from_table.py │ ├── m4c_evaluator.py │ ├── model_qa.py │ ├── model_vqa.py │ ├── model_vqa_loader.py │ ├── model_vqa_mmbench.py │ ├── model_vqa_science.py │ ├── qa_baseline_gpt35.py │ ├── run_llava.py │ ├── summarize_gpt_review.py │ └── webpage │ │ ├── figures │ │ ├── alpaca.png │ │ ├── bard.jpg │ │ ├── chatgpt.svg │ │ ├── llama.jpg │ │ ├── swords_FILL0_wght300_GRAD0_opsz48.svg │ │ └── vicuna.jpeg │ │ ├── index.html │ │ ├── script.js │ │ └── styles.css ├── mm_utils.py ├── model │ ├── __init__.py │ ├── apply_delta.py │ ├── builder.py │ ├── consolidate.py │ ├── deepstack_arch.py │ ├── language_model │ │ ├── custom_llama.py │ │ ├── custom_phi.py │ │ ├── deepstack_llama.py │ │ ├── deepstack_phi.py │ │ ├── llava_llama.py │ │ ├── llava_mistral.py │ │ ├── llava_mpt.py │ │ └── phi3 │ │ │ ├── configuration_phi3.py │ │ │ └── modeling_phi3.py │ ├── llava_arch.py │ ├── make_delta.py │ ├── multimodal_encoder │ │ ├── builder.py │ │ └── clip_encoder.py │ ├── multimodal_projector │ │ └── builder.py │ └── utils.py ├── serve │ ├── __init__.py │ ├── cli.py │ ├── controller.py │ ├── examples │ │ ├── extreme_ironing.jpg │ │ └── waterview.jpg │ ├── gradio_web_server.py │ ├── model_worker.py │ ├── register_worker.py │ ├── sglang_worker.py │ └── test_message.py ├── train │ ├── llama_flash_attn_monkey_patch.py │ ├── llama_xformers_attn_monkey_patch.py │ ├── llava_trainer.py │ ├── train.py │ ├── train_mem.py │ └── train_xformers.py └── utils.py ├── lmms-eval ├── .gitignore ├── .pre-commit-config.yaml ├── README.md ├── docs │ ├── README.md │ ├── commands.md │ ├── model_guide.md │ └── task_guide.md ├── example_eval.yaml ├── llava_repr_requirements.txt ├── lmms_eval │ ├── __init__.py │ ├── __main__.py │ ├── api │ │ ├── __init__.py │ │ ├── filter.py │ │ ├── instance.py │ │ ├── metrics.py │ │ ├── model.py │ │ ├── registry.py │ │ ├── samplers.py │ │ └── task.py │ ├── evaluator.py │ ├── filters │ │ ├── __init__.py │ │ ├── decontamination.py │ │ ├── extraction.py │ │ ├── selection.py │ │ └── transformation.py │ ├── logging_utils.py │ ├── models │ │ ├── __init__.py │ │ ├── fuyu.py │ │ ├── gpt4v.py │ │ ├── instructblip.py │ │ ├── llava.py │ │ ├── llava_hf.py │ │ ├── llava_sglang.py │ │ ├── minicpm_v.py │ │ ├── model_utils │ │ │ ├── __init__.py │ │ │ └── qwen │ │ │ │ └── qwen_generate_utils.py │ │ └── qwen_vl.py │ ├── tasks │ │ ├── __init__.py │ │ ├── _task_utils │ │ │ ├── file_utils.py │ │ │ ├── gpt_eval_utils.py │ │ │ └── vqa_eval_metric.py │ │ ├── ai2d │ │ │ ├── ai2d.yaml │ │ │ ├── upload_ai2d.py │ │ │ └── utils.py │ │ ├── chartqa │ │ │ ├── chartqa.yaml │ │ │ ├── upload_chartqa.py │ │ │ └── utils.py │ │ ├── cmmmu │ │ │ ├── _cmmmu.yaml │ │ │ ├── 
_default_template_cmmmu_yaml │ │ │ ├── cmmmu_test.yaml │ │ │ ├── cmmmu_val.yaml │ │ │ └── utils.py │ │ ├── coco_cap │ │ │ ├── coco2014_cap.yaml │ │ │ ├── coco2014_cap_test.yaml │ │ │ ├── coco2014_cap_val.yaml │ │ │ ├── coco2017_cap.yaml │ │ │ ├── coco2017_cap_test.yaml │ │ │ ├── coco2017_cap_val.yaml │ │ │ ├── coco_cap.yaml │ │ │ └── utils.py │ │ ├── docvqa │ │ │ ├── _default_template_docvqa_yaml │ │ │ ├── docvqa.yaml │ │ │ ├── docvqa_test.yaml │ │ │ ├── docvqa_val.yaml │ │ │ └── utils.py │ │ ├── ferret │ │ │ ├── ferret.yaml │ │ │ └── utils.py │ │ ├── flickr30k │ │ │ ├── flickr30k.yaml │ │ │ ├── flickr30k_test.yaml │ │ │ └── utils.py │ │ ├── gqa │ │ │ ├── gqa.yaml │ │ │ └── utils.py │ │ ├── hallusion_bench │ │ │ ├── evaluate_hb.py │ │ │ ├── hallusion_bench_image.yaml │ │ │ └── utils.py │ │ ├── iconqa │ │ │ ├── _default_template_docvqa_yaml │ │ │ ├── iconqa.yaml │ │ │ ├── iconqa_test.yaml │ │ │ ├── iconqa_val.yaml │ │ │ └── utils.py │ │ ├── infovqa │ │ │ ├── _default_template_infovqa_yaml │ │ │ ├── infovqa.yaml │ │ │ ├── infovqa_test.yaml │ │ │ ├── infovqa_val.yaml │ │ │ └── utils.py │ │ ├── llava-bench-coco │ │ │ ├── llava-bench-coco.yaml │ │ │ └── utils.py │ │ ├── llava-in-the-wild │ │ │ ├── llava-in-the-wild.yaml │ │ │ └── utils.py │ │ ├── mathverse │ │ │ ├── mathverse.yaml │ │ │ ├── mathverse_evals.py │ │ │ ├── mathverse_testmini.yaml │ │ │ ├── mathverse_testmini_text_dominant.yaml │ │ │ ├── mathverse_testmini_text_lite.yaml │ │ │ ├── mathverse_testmini_text_only.yaml │ │ │ ├── mathverse_testmini_vision_dominant.yaml │ │ │ ├── mathverse_testmini_vision_intensive.yaml │ │ │ ├── mathverse_testmini_vision_only.yaml │ │ │ └── utils.py │ │ ├── mathvista │ │ │ ├── mathvista.yaml │ │ │ ├── mathvista_evals.py │ │ │ ├── mathvista_test.yaml │ │ │ ├── mathvista_testmini.yaml │ │ │ └── utils.py │ │ ├── mmbench │ │ │ ├── _default_template_mmbench_cn_yaml │ │ │ ├── _default_template_mmbench_en_yaml │ │ │ ├── cc_utils.py │ │ │ ├── cn_utils.py │ │ │ ├── en_utils.py │ │ │ ├── mmbench.yaml │ │ │ ├── mmbench_cc.yaml │ │ │ ├── mmbench_cn.yaml │ │ │ ├── mmbench_cn_dev.yaml │ │ │ ├── mmbench_cn_test.yaml │ │ │ ├── mmbench_en.yaml │ │ │ ├── mmbench_en_dev.yaml │ │ │ ├── mmbench_en_test.yaml │ │ │ └── mmbench_evals.py │ │ ├── mme │ │ │ ├── mme.yaml │ │ │ └── utils.py │ │ ├── mmmu │ │ │ ├── mmmu.yaml │ │ │ ├── mmmu_test.yaml │ │ │ ├── mmmu_val.yaml │ │ │ └── utils.py │ │ ├── mmvet │ │ │ ├── mmvet.yaml │ │ │ └── utils.py │ │ ├── multidocvqa │ │ │ ├── multidocvqa.yaml │ │ │ ├── multidocvqa_test.yaml │ │ │ ├── multidocvqa_val.yaml │ │ │ └── utils.py │ │ ├── multilingual-llava-bench-in-the-wild │ │ │ ├── README.md │ │ │ ├── _generate_configs.py │ │ │ ├── arabic_llava_in_the_wild.yaml │ │ │ ├── bengali_llava_in_the_wild.yaml │ │ │ ├── chinese_llava_in_the_wild.yaml │ │ │ ├── french_llava_in_the_wild.yaml │ │ │ ├── hindi_llava_in_the_wild.yaml │ │ │ ├── japanese_llava_in_the_wild.yaml │ │ │ ├── russian_llava_in_the_wild.yaml │ │ │ ├── spanish_llava_in_the_wild.yaml │ │ │ ├── urdu_llava_in_the_wild.yaml │ │ │ └── utils.py │ │ ├── nocaps │ │ │ ├── _default_template_nocaps_yaml │ │ │ ├── nocaps.yaml │ │ │ ├── nocaps_test.yaml │ │ │ ├── nocaps_val.yaml │ │ │ └── utils.py │ │ ├── ocrbench │ │ │ ├── ocrbench.yaml │ │ │ ├── upload_ocrbench.py │ │ │ └── utils.py │ │ ├── ok_vqa │ │ │ ├── _default_template_vqa_yaml │ │ │ ├── _generate_config.py │ │ │ ├── _ok_vqa.yaml │ │ │ ├── ok_vqa_val2014.yaml │ │ │ └── utils.py │ │ ├── olympiadbench │ │ │ ├── cn_utils.py │ │ │ ├── en_utils.py │ │ │ ├── olympiadbench.yaml │ │ │ ├── 
olympiadbench_evals.py │ │ │ ├── olympiadbench_test_cn.yaml │ │ │ └── olympiadbench_test_en.yaml │ │ ├── pope │ │ │ ├── pope.yaml │ │ │ └── utils.py │ │ ├── realworldqa │ │ │ ├── realworldqa.yaml │ │ │ └── utils.py │ │ ├── refcoco+ │ │ │ ├── _default_template_bbox_rec_yaml │ │ │ ├── _default_template_bbox_yaml │ │ │ ├── _default_template_seg_yaml │ │ │ ├── _generate_config.py │ │ │ ├── _refcoco.yaml │ │ │ ├── refcoco+_bbox_rec_testA.yaml │ │ │ ├── refcoco+_bbox_rec_testB.yaml │ │ │ ├── refcoco+_bbox_rec_val.yaml │ │ │ ├── refcoco+_bbox_testA.yaml │ │ │ ├── refcoco+_bbox_testB.yaml │ │ │ ├── refcoco+_bbox_val.yaml │ │ │ ├── refcoco+_seg_testA.yaml │ │ │ ├── refcoco+_seg_testB.yaml │ │ │ ├── refcoco+_seg_val.yaml │ │ │ ├── utils.py │ │ │ └── utils_rec.py │ │ ├── refcoco │ │ │ ├── _default_template_bbox_rec_yaml │ │ │ ├── _default_template_bbox_yaml │ │ │ ├── _default_template_seg_yaml │ │ │ ├── _generate_config.py │ │ │ ├── _refcoco.yaml │ │ │ ├── refcoco_bbox_rec_test.yaml │ │ │ ├── refcoco_bbox_rec_testA.yaml │ │ │ ├── refcoco_bbox_rec_testB.yaml │ │ │ ├── refcoco_bbox_rec_val.yaml │ │ │ ├── refcoco_bbox_test.yaml │ │ │ ├── refcoco_bbox_testA.yaml │ │ │ ├── refcoco_bbox_testB.yaml │ │ │ ├── refcoco_bbox_val.yaml │ │ │ ├── refcoco_seg_test.yaml │ │ │ ├── refcoco_seg_testA.yaml │ │ │ ├── refcoco_seg_testB.yaml │ │ │ ├── refcoco_seg_val.yaml │ │ │ ├── utils.py │ │ │ └── utils_rec.py │ │ ├── refcocog │ │ │ ├── _default_template_bbox_rec_yaml │ │ │ ├── _default_template_bbox_yaml │ │ │ ├── _default_template_seg_yaml │ │ │ ├── _generate_config.py │ │ │ ├── _refcoco.yaml │ │ │ ├── refcocog_bbox_rec_test.yaml │ │ │ ├── refcocog_bbox_rec_val.yaml │ │ │ ├── refcocog_bbox_test.yaml │ │ │ ├── refcocog_bbox_val.yaml │ │ │ ├── refcocog_seg_test.yaml │ │ │ ├── refcocog_seg_val.yaml │ │ │ ├── utils.py │ │ │ └── utils_rec.py │ │ ├── scienceqa │ │ │ ├── scienceqa.yaml │ │ │ ├── scienceqa_full.yaml │ │ │ ├── scienceqa_img.yaml │ │ │ └── utils.py │ │ ├── screenspot │ │ │ ├── README.md │ │ │ ├── _default_template_rec_yaml │ │ │ ├── _default_template_reg_yaml │ │ │ ├── _screenspot.yaml │ │ │ ├── screenspot_rec_test.yaml │ │ │ ├── screenspot_reg_test.yaml │ │ │ ├── utils.py │ │ │ └── utils_rec.py │ │ ├── seedbench │ │ │ ├── seedbench.yaml │ │ │ ├── seedbench_ppl.yaml │ │ │ └── utils.py │ │ ├── seedbench_2 │ │ │ ├── seedbench_2.yaml │ │ │ └── utils.py │ │ ├── stvqa │ │ │ ├── stvqa.yaml │ │ │ └── utils.py │ │ ├── textcaps │ │ │ ├── _default_template_textcaps_yaml │ │ │ ├── textcaps.yaml │ │ │ ├── textcaps_test.yaml │ │ │ ├── textcaps_train.yaml │ │ │ ├── textcaps_val.yaml │ │ │ └── utils.py │ │ ├── textvqa │ │ │ ├── _default_template_textvqa_yaml │ │ │ ├── _textvqa.yaml │ │ │ ├── textvqa_test.yaml │ │ │ ├── textvqa_val.yaml │ │ │ ├── textvqa_val_noocr.yaml │ │ │ └── utils.py │ │ ├── vizwiz_vqa │ │ │ ├── _default_template_vqa_yaml │ │ │ ├── _generate_config.py │ │ │ ├── _vizwiz_vqa.yaml │ │ │ ├── utils.py │ │ │ ├── vizwiz_vqa_test.yaml │ │ │ └── vizwiz_vqa_val.yaml │ │ ├── vqav2 │ │ │ ├── _default_template_vqav2_yaml │ │ │ ├── _vqav2.yaml │ │ │ ├── utils.py │ │ │ ├── vqav2_test.yaml │ │ │ └── vqav2_val.yaml │ │ └── websrc │ │ │ ├── README.md │ │ │ ├── utils.py │ │ │ ├── websrc.yaml │ │ │ ├── websrc_test.yaml │ │ │ └── websrc_val.yaml │ └── utils.py ├── miscs │ ├── llava_result_check.md │ ├── repr_scripts.sh │ ├── repr_torch_envs.txt │ ├── scienceqa_id.txt │ ├── script.sh │ ├── test_llava.py │ └── test_scienceqa.py ├── pyproject.toml └── setup.py ├── predict.py ├── pyproject.toml └── scripts ├── eval_lmms.sh ├── 
extract_mm_projector.py ├── merge_lora_weights.py ├── zero2.json ├── zero3.json └── zero3_offload.json /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM mcr.microsoft.com/devcontainers/base:ubuntu-20.04 2 | 3 | SHELL [ "bash", "-c" ] 4 | 5 | # update apt and install packages 6 | RUN apt update && \ 7 | apt install -yq \ 8 | ffmpeg \ 9 | dkms \ 10 | build-essential 11 | 12 | # add user tools 13 | RUN sudo apt install -yq \ 14 | jq \ 15 | jp \ 16 | tree \ 17 | tldr 18 | 19 | # add git-lfs and install 20 | RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash && \ 21 | sudo apt-get install -yq git-lfs && \ 22 | git lfs install 23 | 24 | ############################################ 25 | # Setup user 26 | ############################################ 27 | 28 | USER vscode 29 | 30 | # install azcopy, a tool to copy to/from blob storage 31 | # for more info: https://learn.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-blobs-upload#upload-a-file 32 | RUN cd /tmp && \ 33 | wget https://azcopyvnext.azureedge.net/release20230123/azcopy_linux_amd64_10.17.0.tar.gz && \ 34 | tar xvf azcopy_linux_amd64_10.17.0.tar.gz && \ 35 | mkdir -p ~/.local/bin && \ 36 | mv azcopy_linux_amd64_10.17.0/azcopy ~/.local/bin && \ 37 | chmod +x ~/.local/bin/azcopy && \ 38 | rm -rf azcopy_linux_amd64* 39 | 40 | # Setup conda 41 | RUN cd /tmp && \ 42 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ 43 | bash ./Miniconda3-latest-Linux-x86_64.sh -b && \ 44 | rm ./Miniconda3-latest-Linux-x86_64.sh 45 | 46 | # Install dotnet 47 | RUN cd /tmp && \ 48 | wget https://dot.net/v1/dotnet-install.sh && \ 49 | chmod +x dotnet-install.sh && \ 50 | ./dotnet-install.sh --channel 7.0 && \ 51 | ./dotnet-install.sh --channel 3.1 && \ 52 | rm ./dotnet-install.sh 53 | 54 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.env: -------------------------------------------------------------------------------- 1 | SAMPLE_ENV_VAR1="Sample Value" 2 | SAMPLE_ENV_VAR2=332431bf-68bf -------------------------------------------------------------------------------- /.devcontainer/postCreateCommand.sh: -------------------------------------------------------------------------------- 1 | git config --global safe.directory '*' 2 | git config --global core.editor "code --wait" 3 | git config --global pager.branch false 4 | 5 | # Set AZCOPY concurrency to auto 6 | echo "export AZCOPY_CONCURRENCY_VALUE=AUTO" >> ~/.zshrc 7 | echo "export AZCOPY_CONCURRENCY_VALUE=AUTO" >> ~/.bashrc 8 | 9 | # Activate conda by default 10 | echo ". /home/vscode/miniconda3/bin/activate" >> ~/.zshrc 11 | echo ". /home/vscode/miniconda3/bin/activate" >> ~/.bashrc 12 | 13 | # Use llava environment by default 14 | echo "conda activate llava" >> ~/.zshrc 15 | echo "conda activate llava" >> ~/.bashrc 16 | 17 | # Add dotnet to PATH 18 | echo 'export PATH="$PATH:$HOME/.dotnet"' >> ~/.bashrc 19 | echo 'export PATH="$PATH:$HOME/.dotnet"' >> ~/.zshrc 20 | 21 | # Create and activate llava environment 22 | source /home/vscode/miniconda3/bin/activate 23 | conda create -y -q -n llava python=3.10 24 | conda activate llava 25 | 26 | # Install Nvidia Cuda Compiler 27 | conda install -y -c nvidia cuda-compiler 28 | 29 | pip install pre-commit==3.0.2 30 | 31 | # Install package locally 32 | pip install --upgrade pip # enable PEP 660 support 33 | pip install -e . 
34 | 35 | # Install additional packages for training 36 | pip install -e ".[train]" 37 | pip install flash-attn --no-build-isolation 38 | 39 | # Download checkpoints to location outside of the repo 40 | git clone https://huggingface.co/liuhaotian/llava-v1.5-7b ~/llava-v1.5-7b 41 | 42 | # Commented because it is unlikely for users to have enough local GPU memory to load the model 43 | # git clone https://huggingface.co/liuhaotian/llava-v1.5-13b ~/llava-v1.5-13b 44 | 45 | echo "postCreateCommand.sh COMPLETE!" 46 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # The .dockerignore file excludes files from the container build process. 2 | # 3 | # https://docs.docker.com/engine/reference/builder/#dockerignore-file 4 | 5 | # Exclude Git files 6 | .git 7 | .github 8 | .gitignore 9 | 10 | # Exclude Python cache files 11 | __pycache__ 12 | .mypy_cache 13 | .pytest_cache 14 | .ruff_cache 15 | 16 | # Exclude Python virtual environment 17 | /venv 18 | 19 | # Exclude some weights 20 | /openai 21 | /liuhaotian 22 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | # Unix-style newlines with a newline ending every file 4 | [*] 5 | end_of_line = lf 6 | insert_final_newline = true 7 | trim_trailing_whitespace = true 8 | charset = utf-8 9 | 10 | # 4 space indentation 11 | [*.{py,json}] 12 | indent_style = space 13 | indent_size = 4 14 | 15 | # 2 space indentation 16 | [*.{md,sh,yaml,yml}] 17 | indent_style = space 18 | indent_size = 2 -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # https://git-scm.com/docs/gitattributes 2 | 3 | # Set the default behavior, in case people don't have core.autocrlf set. 4 | # https://git-scm.com/docs/gitattributes#_end_of_line_conversion 5 | * text=auto 6 | 7 | # common python attributes, taken from https://github.com/alexkaratarakis/gitattributes/blob/710900479a2bedeec7003d381719521ffbb18bf8/Python.gitattributes 8 | # Source files 9 | # ============ 10 | *.pxd text diff=python 11 | *.py text diff=python 12 | *.py3 text diff=python 13 | *.pyw text diff=python 14 | *.pyx text diff=python 15 | *.pyz text diff=python 16 | *.pyi text diff=python 17 | 18 | # Binary files 19 | # ============ 20 | *.db binary 21 | *.p binary 22 | *.pkl binary 23 | *.pickle binary 24 | *.pyc binary export-ignore 25 | *.pyo binary export-ignore 26 | *.pyd binary 27 | 28 | # Jupyter notebook 29 | *.ipynb text eol=lf 30 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/1-usage.yaml: -------------------------------------------------------------------------------- 1 | name: Usage issues 2 | description: Report issues in usage. 3 | title: "[Usage] " 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Thanks for taking the time to fill out this form. Please give as detailed description as possible for us to better assist with the issue :) 9 | - type: textarea 10 | id: what-happened 11 | attributes: 12 | label: Describe the issue 13 | description: Please give as detailed description as possible for us to better assist with the issue. Please paste the **FULL** error log here, so that we can better understand the issue. 
Wrap the log with ``` for better readability in GitHub.
14 |     placeholder: Issue
15 |     value: |
16 |       Issue:
17 | 
18 |       Command:
19 |       ```
20 |       PASTE THE COMMANDS HERE.
21 |       ```
22 | 
23 |       Log:
24 |       ```
25 |       PASTE THE LOGS HERE.
26 |       ```
27 | 
28 |       Screenshots:
29 |       You may attach screenshots if it better explains the issue.
30 |   validations:
31 |     required: true
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/2-feature-request.yaml:
--------------------------------------------------------------------------------
1 | name: Feature Request
2 | description: Request for a new feature
3 | title: "[Feature request] "
4 | body:
5 | - type: markdown
6 |   attributes:
7 |     value: |
8 |       Thanks for your interest in our work. Please share your thoughts on the new feature below.
9 | - type: textarea
10 |   id: feature
11 |   attributes:
12 |     label: feature
13 |     placeholder: Start your thoughts here...
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/3-question.yaml:
--------------------------------------------------------------------------------
1 | name: Questions
2 | description: General questions about the work
3 | title: "[Question] "
4 | body:
5 | - type: markdown
6 |   attributes:
7 |     value: |
8 |       Thanks for your interest in our work. For this type of question, it may be more suitable to use the [Discussions](https://github.com/haotian-liu/LLaVA/discussions) section. If you believe an issue is a better fit for your request, please continue your post below :)
9 | - type: textarea
10 |   id: question
11 |   attributes:
12 |     label: Question
13 |     placeholder: Start question here...
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/4-discussion.yaml:
--------------------------------------------------------------------------------
1 | name: Discussions
2 | description: General discussions about the work
3 | title: "[Discussion] "
4 | body:
5 | - type: markdown
6 |   attributes:
7 |     value: |
8 |       Thanks for your interest in our work. For this type of discussion, it may be more suitable to use the [Discussions](https://github.com/haotian-liu/LLaVA/discussions) section. If you believe an issue is a better fit for your request, please continue your post below :)
9 | - type: textarea
10 |   id: discussion
11 |   attributes:
12 |     label: Discussion
13 |     placeholder: Start discussion here...
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | data
2 | debug.sh
3 | eval_lmms.sh
4 | 
5 | # Python
6 | __pycache__
7 | *.pyc
8 | *.egg-info
9 | dist
10 | 
11 | # Log
12 | *.log
13 | *.log.*
14 | *.json
15 | *.jsonl
16 | 
17 | # Data
18 | !**/alpaca-data-conversation.json
19 | 
20 | # Editor
21 | .idea
22 | *.swp
23 | 
24 | # Other
25 | .DS_Store
26 | wandb
27 | output
28 | 
29 | checkpoints
30 | ckpts*
31 | 
32 | .ipynb_checkpoints
33 | *.ipynb
34 | 
35 | # DevContainer
36 | !.devcontainer/*
37 | 
38 | # Demo
39 | serve_images/
--------------------------------------------------------------------------------
/assets/deepstack_teaser.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/assets/deepstack_teaser.png
--------------------------------------------------------------------------------
/assets/deepstack_vl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/assets/deepstack_vl.png
--------------------------------------------------------------------------------
/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/assets/logo.png
--------------------------------------------------------------------------------
/assets/visualization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/assets/visualization.png
--------------------------------------------------------------------------------
/cog.yaml:
--------------------------------------------------------------------------------
1 | # Configuration for Cog ⚙️
2 | # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
3 | 
4 | build:
5 |   gpu: true
6 | 
7 |   python_version: "3.11"
8 | 
9 |   python_packages:
10 |     - "torch==2.0.1"
11 |     - "accelerate==0.21.0"
12 |     - "bitsandbytes==0.41.0"
13 |     - "deepspeed==0.9.5"
14 |     - "einops-exts==0.0.4"
15 |     - "einops==0.6.1"
16 |     - "gradio==3.35.2"
17 |     - "gradio_client==0.2.9"
18 |     - "httpx==0.24.0"
19 |     - "markdown2==2.4.10"
20 |     - "numpy==1.26.0"
21 |     - "peft==0.4.0"
22 |     - "scikit-learn==1.2.2"
23 |     - "sentencepiece==0.1.99"
24 |     - "shortuuid==1.0.11"
25 |     - "timm==0.6.13"
26 |     - "tokenizers==0.13.3"
27 |     - "torch==2.0.1"
28 |     - "torchvision==0.15.2"
29 |     - "transformers==4.31.0"
30 |     - "wandb==0.15.12"
31 |     - "wavedrom==2.0.3.post3"
32 |     - "Pygments==2.16.1"
33 |   run:
34 |     - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.0.3/pget" && chmod +x /usr/local/bin/pget
35 | 
36 | # predict.py defines how predictions are run on your model
37 | predict: "predict.py:Predictor"
--------------------------------------------------------------------------------
/docs/MODEL_ZOO.md:
--------------------------------------------------------------------------------
1 | # Model Zoo
2 | 
3 | If you would like any other details included in the Model Zoo, please open an issue :)
4 | 
5 | ## DeepStack
6 | 
7 | | Version | LLM | Schedule | Checkpoint | VQAv2 | GQA | TextVQA | DocVQA | InfoVQA | SEED | POPE |
8 | |---------|-----|----------|------------|-------|-----|---------|--------|---------|------|------|
9 | | DeepStack-L | Vicuna-7B | full_ft-1e | [menglc/deepstack-l-vicuna-7b](https://huggingface.co/menglc/deepstack-l-vicuna-7b) | 79.5 | 63.1 | 62.4 | 39.1 | 29.8 | 60.6 | 86.7 |
10 | | DeepStack-L-HD | Vicuna-7B | full_ft-1e | [menglc/deepstack-l-hd-vicuna-7b](https://huggingface.co/menglc/deepstack-l-hd-vicuna-7b) | 82.0 | 65.2 | 66.7 | 78.8 | 41.2 | 63.6 | 86.5 |
--------------------------------------------------------------------------------
/images/demo_cli.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/images/demo_cli.gif
--------------------------------------------------------------------------------
/images/llava_example_cmp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/images/llava_example_cmp.png
--------------------------------------------------------------------------------
/images/llava_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/images/llava_logo.png
--------------------------------------------------------------------------------
/images/llava_v1_5_radar.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/images/llava_v1_5_radar.jpg
--------------------------------------------------------------------------------
/llava/__init__.py:
--------------------------------------------------------------------------------
1 | from .model import ModelSelect
--------------------------------------------------------------------------------
/llava/constants.py:
--------------------------------------------------------------------------------
1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30
2 | WORKER_HEART_BEAT_INTERVAL = 15
3 | 
4 | LOGDIR = "."
5 | 
6 | # Model Constants
7 | IGNORE_INDEX = -100
8 | IMAGE_TOKEN_INDEX = -200
9 | DEFAULT_IMAGE_TOKEN = "<image>"
10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
11 | DEFAULT_IM_START_TOKEN = "<im_start>"
12 | DEFAULT_IM_END_TOKEN = "<im_end>"
13 | IMAGE_PLACEHOLDER = "<image-placeholder>"
--------------------------------------------------------------------------------
/llava/eval/eval_textvqa.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import json
4 | import re
5 | 
6 | from llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator
7 | 
8 | 
9 | def get_args():
10 |     parser = argparse.ArgumentParser()
11 |     parser.add_argument('--annotation-file', type=str)
12 |     parser.add_argument('--result-file', type=str)
13 |     parser.add_argument('--result-dir', type=str)
14 |     return parser.parse_args()
15 | 
16 | 
17 | def prompt_processor(prompt):
18 |     if prompt.startswith('OCR tokens: '):
19 |         pattern = r"Question: (.*?)
Short answer:" 20 | match = re.search(pattern, prompt, re.DOTALL) 21 | question = match.group(1) 22 | elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: 23 | if prompt.startswith('Reference OCR token:'): 24 | question = prompt.split('\n')[1] 25 | else: 26 | question = prompt.split('\n')[0] 27 | elif len(prompt.split('\n')) == 2: 28 | question = prompt.split('\n')[0] 29 | else: 30 | assert False 31 | 32 | return question.lower() 33 | 34 | 35 | def eval_single(annotation_file, result_file): 36 | experiment_name = os.path.splitext(os.path.basename(result_file))[0] 37 | print(experiment_name) 38 | annotations = json.load(open(annotation_file))['data'] 39 | annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations} 40 | results = [json.loads(line) for line in open(result_file)] 41 | 42 | pred_list = [] 43 | for result in results: 44 | annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))] 45 | pred_list.append({ 46 | "pred_answer": result['text'], 47 | "gt_answers": annotation['answers'], 48 | }) 49 | 50 | evaluator = TextVQAAccuracyEvaluator() 51 | print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list))) 52 | 53 | 54 | if __name__ == "__main__": 55 | args = get_args() 56 | 57 | if args.result_file is not None: 58 | eval_single(args.annotation_file, args.result_file) 59 | 60 | if args.result_dir is not None: 61 | for result_file in sorted(os.listdir(args.result_dir)): 62 | if not result_file.endswith('.jsonl'): 63 | print(f'Skipping {result_file}') 64 | continue 65 | eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) 66 | -------------------------------------------------------------------------------- /llava/eval/webpage/figures/alpaca.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/llava/eval/webpage/figures/alpaca.png -------------------------------------------------------------------------------- /llava/eval/webpage/figures/bard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/llava/eval/webpage/figures/bard.jpg -------------------------------------------------------------------------------- /llava/eval/webpage/figures/chatgpt.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llava/eval/webpage/figures/llama.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/llava/eval/webpage/figures/llama.jpg -------------------------------------------------------------------------------- /llava/eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llava/eval/webpage/figures/vicuna.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/llava/eval/webpage/figures/vicuna.jpeg 
-------------------------------------------------------------------------------- /llava/eval/webpage/styles.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; 3 | background-color: #f8f9fa; 4 | } 5 | 6 | .navbar-dark .navbar-nav .nav-link { 7 | color: #f1cf68; 8 | font-size: 1.1rem; 9 | padding: 0.5rem 0.6rem; 10 | } 11 | 12 | .card-header { 13 | font-weight: bold; 14 | } 15 | 16 | .card { 17 | box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); 18 | transition: 0.3s; 19 | } 20 | 21 | .card:hover { 22 | box-shadow: 0 8px 16px rgba(0, 0, 0, 0.2); 23 | } 24 | 25 | button { 26 | transition: background-color 0.3s; 27 | } 28 | 29 | button:hover { 30 | background-color: #007bff; 31 | } 32 | 33 | @media (max-width: 767px) { 34 | .form-row .form-group { 35 | margin-bottom: 10px; 36 | } 37 | } 38 | 39 | /* Extra styles */ 40 | 41 | .expandable-card .card-text-container { 42 | max-height: 200px; 43 | overflow-y: hidden; 44 | position: relative; 45 | } 46 | 47 | .expandable-card.expanded .card-text-container { 48 | max-height: none; 49 | } 50 | 51 | .expand-btn { 52 | position: relative; 53 | display: none; 54 | background-color: rgba(255, 255, 255, 0.8); 55 | color: #510c75; 56 | border-color: transparent; 57 | } 58 | 59 | .expand-btn:hover { 60 | background-color: rgba(200, 200, 200, 0.8); 61 | text-decoration: none; 62 | border-color: transparent; 63 | color: #510c75; 64 | } 65 | 66 | .expand-btn:focus { 67 | outline: none; 68 | text-decoration: none; 69 | } 70 | 71 | .expandable-card:not(.expanded) .card-text-container:after { 72 | content: ""; 73 | position: absolute; 74 | bottom: 0; 75 | left: 0; 76 | width: 100%; 77 | height: 90px; 78 | background: linear-gradient(rgba(255, 255, 255, 0.2), rgba(255, 255, 255, 1)); 79 | } 80 | 81 | .expandable-card:not(.expanded) .expand-btn { 82 | margin-top: -40px; 83 | } 84 | 85 | .card-body { 86 | padding-bottom: 5px; 87 | } 88 | 89 | .vertical-flex-layout { 90 | justify-content: center; 91 | align-items: center; 92 | height: 100%; 93 | display: flex; 94 | flex-direction: column; 95 | gap: 5px; 96 | } 97 | 98 | .figure-img { 99 | max-width: 100%; 100 | height: auto; 101 | } 102 | 103 | .adjustable-font-size { 104 | font-size: calc(0.5rem + 2vw); 105 | } 106 | -------------------------------------------------------------------------------- /llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | # try: 2 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 3 | from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig 4 | from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig 5 | 6 | from .language_model.deepstack_llama import DeepstackLlamaForCausalLM, DeepstackConfig 7 | from .language_model.deepstack_phi import DeepstackPhiForCausalLM, DeepstackPhiConfig 8 | 9 | 10 | MODEL_REGISTRY = { 11 | 'llama': DeepstackLlamaForCausalLM, 12 | 'phi-3': DeepstackPhiForCausalLM, 13 | 'phi3': DeepstackPhiForCausalLM, 14 | } 15 | 16 | LLAVA_MODEL_REGISTRY = { 17 | 'llama': LlavaLlamaForCausalLM, 18 | 'mpt': LlavaMptForCausalLM, 19 | 'mistral': LlavaMistralForCausalLM, 20 | } 21 | 22 | def ModelSelect(model_name_or_path): 23 | model = None 24 | 25 | registry = MODEL_REGISTRY if not 'llava' in model_name_or_path.lower() else LLAVA_MODEL_REGISTRY 26 | for name in registry.keys(): 27 | if name.lower() in model_name_or_path.lower(): 28 | model = 
registry[name] 29 | if model is None: 30 | model = registry['llama'] 31 | return model -------------------------------------------------------------------------------- /llava/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava import LlavaLlamaForCausalLM 11 | 12 | 13 | def apply_delta(base_model_path, target_model_path, delta_path): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading delta") 19 | delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 21 | 22 | print("Applying delta") 23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data += base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \ 31 | f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 32 | bparam = base.state_dict()[name] 33 | param.data[:bparam.shape[0], :bparam.shape[1]] += bparam 34 | 35 | print("Saving target model") 36 | delta.save_pretrained(target_model_path) 37 | delta_tokenizer.save_pretrained(target_model_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--base-model-path", type=str, required=True) 43 | parser.add_argument("--target-model-path", type=str, required=True) 44 | parser.add_argument("--delta-path", type=str, required=True) 45 | 46 | args = parser.parse_args() 47 | 48 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 49 | -------------------------------------------------------------------------------- /llava/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava.model import * 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- 
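
For reference, here is a minimal usage sketch — not a file from the repository — of how `ModelSelect` from `llava/model/__init__.py` above resolves a checkpoint name to a model class. The checkpoint names are taken from elsewhere in this dump; the commented-out `from_pretrained` arguments are assumptions.

```python
from llava.model import ModelSelect

# "menglc/deepstack-l-vicuna-7b" does not contain "llava", so the DeepStack registry is
# used; none of its keys ("llama", "phi-3", "phi3") appear in the name, so ModelSelect
# falls back to the default entry, DeepstackLlamaForCausalLM.
deepstack_cls = ModelSelect("menglc/deepstack-l-vicuna-7b")

# "liuhaotian/llava-v1.5-7b" contains "llava", so the LLaVA registry is used; again no
# key ("llama", "mpt", "mistral") appears in the name, so the default
# LlavaLlamaForCausalLM is returned.
llava_cls = ModelSelect("liuhaotian/llava-v1.5-7b")

# The returned class is a regular transformers causal-LM class, so loading would follow
# the usual pattern (the dtype argument here is an assumption, not a repo default):
# model = deepstack_cls.from_pretrained("menglc/deepstack-l-vicuna-7b", torch_dtype="auto")
```
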
/llava/model/make_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading target model") 19 | auto_upgrade(target_model_path) 20 | target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 21 | 22 | print("Calculating delta") 23 | for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data -= base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 31 | bparam = base.state_dict()[name] 32 | param.data[:bparam.shape[0], :bparam.shape[1]] -= bparam 33 | 34 | print("Saving delta") 35 | if hub_repo_id: 36 | kwargs = {"push_to_hub": True, "repo_id": hub_repo_id} 37 | else: 38 | kwargs = {} 39 | target.save_pretrained(delta_path, **kwargs) 40 | target_tokenizer = AutoTokenizer.from_pretrained(target_model_path) 41 | target_tokenizer.save_pretrained(delta_path, **kwargs) 42 | 43 | 44 | if __name__ == "__main__": 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument("--base-model-path", type=str, required=True) 47 | parser.add_argument("--target-model-path", type=str, required=True) 48 | parser.add_argument("--delta-path", type=str, required=True) 49 | parser.add_argument("--hub-repo-id", type=str, default=None) 50 | args = parser.parse_args() 51 | 52 | make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id) 53 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | use_s2 = getattr(vision_tower_cfg, 's2', False) 9 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 10 | if use_s2: 11 | return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) 12 | else: 13 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 14 | 15 | raise ValueError(f'Unknown vision tower: {vision_tower}') 16 | -------------------------------------------------------------------------------- /llava/model/multimodal_projector/builder.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- /llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. 
[Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /llava/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/llava/serve/__init__.py -------------------------------------------------------------------------------- /llava/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/llava/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /llava/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/llava/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /llava/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /llava/serve/test_message.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import requests 5 | 6 | from llava.conversation import default_conversation 7 | 8 | 9 | def main(): 10 | if args.worker_address: 11 | worker_addr = args.worker_address 12 | else: 13 | controller_addr = args.controller_address 14 | ret = requests.post(controller_addr + "/refresh_all_workers") 15 | ret = requests.post(controller_addr + "/list_models") 16 | models = ret.json()["models"] 17 | models.sort() 18 | print(f"Models: {models}") 19 | 20 | ret = requests.post(controller_addr + "/get_worker_address", 21 | json={"model": args.model_name}) 22 | worker_addr = ret.json()["address"] 23 | print(f"worker_addr: {worker_addr}") 24 | 25 | if worker_addr == "": 26 | return 27 | 28 | conv = default_conversation.copy() 29 | conv.append_message(conv.roles[0], args.message) 30 | prompt = conv.get_prompt() 31 | 32 | headers = {"User-Agent": "LLaVA Client"} 33 | pload = { 34 | "model": args.model_name, 35 | "prompt": prompt, 36 | "max_new_tokens": args.max_new_tokens, 37 | 
"temperature": 0.7, 38 | "stop": conv.sep, 39 | } 40 | response = requests.post(worker_addr + "/worker_generate_stream", headers=headers, 41 | json=pload, stream=True) 42 | 43 | print(prompt.replace(conv.sep, "\n"), end="") 44 | for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): 45 | if chunk: 46 | data = json.loads(chunk.decode("utf-8")) 47 | output = data["text"].split(conv.sep)[-1] 48 | print(output, end="\r") 49 | print("") 50 | 51 | 52 | if __name__ == "__main__": 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument("--controller-address", type=str, default="http://localhost:21001") 55 | parser.add_argument("--worker-address", type=str) 56 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 57 | parser.add_argument("--max-new-tokens", type=int, default=32) 58 | parser.add_argument("--message", type=str, default= 59 | "Tell me a story with more than 1000 words.") 60 | args = parser.parse_args() 61 | 62 | main() 63 | -------------------------------------------------------------------------------- /llava/train/train_mem.py: -------------------------------------------------------------------------------- 1 | from llava.train.train import train 2 | 3 | if __name__ == "__main__": 4 | train(attn_implementation="flash_attention_2") 5 | -------------------------------------------------------------------------------- /llava/train/train_xformers.py: -------------------------------------------------------------------------------- 1 | # Make it more memory efficient by monkey patching the LLaMA model with xformers attention. 2 | 3 | # Need to call this before importing transformers. 4 | from llava.train.llama_xformers_attn_monkey_patch import ( 5 | replace_llama_attn_with_xformers_attn, 6 | ) 7 | 8 | replace_llama_attn_with_xformers_attn() 9 | 10 | from llava.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /lmms-eval/.gitignore: -------------------------------------------------------------------------------- 1 | env 2 | *.pyc 3 | output/ 4 | data/ 5 | lm_cache 6 | .idea 7 | build 8 | dist 9 | *.egg-info 10 | venv 11 | .vscode/ 12 | temp 13 | __pycache__ 14 | .ipynb_checkpoints 15 | temp 16 | # IPython 17 | profile_default/ 18 | ipython_config.py 19 | logs/ 20 | scripts/ 21 | wandb/ 22 | SimSun.ttf 23 | submissions/ 24 | lmms_eval/tasks/hallusion_bench/hallusion_output_vs_model.json 25 | lmms_eval/tasks/hallusion_bench/hallusion_output_vd_model.json 26 | zk.log 27 | -------------------------------------------------------------------------------- /lmms-eval/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 23.12.1 4 | hooks: 5 | - id: black 6 | language_version: python3 -------------------------------------------------------------------------------- /lmms-eval/docs/README.md: -------------------------------------------------------------------------------- 1 | # LMMs Eval Documentation 2 | 3 | Welcome to the docs for `lmms-eval`! 4 | 5 | Majority of this documentation is adapted from [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness/) 6 | 7 | ## Table of Contents 8 | 9 | * To learn about the command line flags, see the [commands](commands.md) 10 | * To learn how to add a new moddel, see the [Model Guide](model_guide.md). 
11 | * For a crash course on adding new tasks to the library, see our [Task Guide](task_guide.md). -------------------------------------------------------------------------------- /lmms-eval/example_eval.yaml: -------------------------------------------------------------------------------- 1 | - model: llava 2 | model_args: pretrained=liuhaotian/llava-v1.5-7b 3 | tasks: ai2d 4 | batch_size: 1 5 | log_samples: true 6 | log_samples_suffix: eval_vizwiz_vqa 7 | output_path: "./logs/" 8 | 9 | - model: llava 10 | model_args: pretrained=liuhaotian/llava-v1.5-13b 11 | tasks: mme 12 | batch_size: 1 13 | log_samples: true 14 | log_samples_suffix: mme 15 | output_path: "./logs/" 16 | -------------------------------------------------------------------------------- /lmms-eval/llava_repr_requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.21.0 2 | datasets==2.16.1 3 | evaluate==0.4.1 4 | hf_transfer==0.1.6 5 | Jinja2==3.1.3 6 | numpy==1.26.4 7 | openai==1.13.3 8 | packaging==23.2 9 | pandas==2.2.1 10 | Pillow==10.2.0 11 | protobuf==4.25.3 12 | pycocoevalcap==1.2 13 | pycocotools==2.0.7 14 | pytablewriter==1.2.0 15 | pytest==8.0.2 16 | python_Levenshtein==0.25.0 17 | pytz==2024.1 18 | PyYAML==6.0.1 19 | PyYAML==6.0.1 20 | Requests==2.31.0 21 | sacrebleu==2.4.0 22 | scikit_learn==1.2.2 23 | sentencepiece==0.1.99 24 | setuptools==68.2.2 25 | sglang==0.1.12 26 | shortuuid==1.0.12 27 | sqlitedict==2.1.0 28 | tenacity==8.2.3 29 | torch==2.0.1 30 | openai>=1.0.0 31 | pycocoevalcap 32 | tokenizers==0.15.2 33 | tqdm==4.66.2 34 | tqdm-multiprocess 35 | transformers==4.37.2 36 | zstandard 37 | pillow 38 | pyyaml 39 | sympy 40 | mpmath 41 | Jinja2 42 | openpyxl 43 | Levenshtein 44 | hf_transfer 45 | tenacity 46 | wandb>=0.16.0 47 | transformers-stream-generator 48 | tiktoken 49 | pre-commit -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/lmms-eval/lmms_eval/__init__.py -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/lmms-eval/lmms_eval/api/__init__.py -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/api/filter.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List 3 | 4 | from lmms_eval.api.instance import Instance 5 | from datasets import Dataset 6 | 7 | 8 | class Filter: 9 | """ 10 | Filter classes operate on a per-task level. 11 | They take all model outputs (`instance.resps` for all `task.instances`) 12 | across all instances of a task, and perform operations. 13 | In a single run, one can configure any number of separate filters or lists of filters. 14 | 15 | """ 16 | 17 | def __init__(self, *args, **kwargs) -> None: 18 | """ 19 | Can define custom behavior here, if an individual instantiation of a Filter class should have state. 20 | """ 21 | 22 | def apply(self, resps, docs): 23 | """ 24 | Defines the operation to perform on a list of the `inst.resps` properties of `Instance` objects. 
25 |         Should return the list of (filtered) response lists *in the same order as they were input*, e.g.
26 |         if passed [<inst.resps for instance 0>, <inst.resps for instance 1>], it should return
27 |         [<filtered resps for instance 0>, <filtered resps for instance 1>].
28 |         """
29 |         return resps
30 | 
31 | 
32 | @dataclass
33 | class FilterEnsemble:
34 |     """
35 |     FilterEnsemble creates a pipeline applying multiple filters.
36 |     Its intended usage is to stack multiple post-processing steps in order.
37 |     `task.apply_filters` should use a list of FilterEnsemble classes that it stores, to apply each
38 |     pipeline separately.
39 |     """
40 | 
41 |     name: str
42 |     filters: List[Filter]
43 | 
44 |     def apply(self, instances: List[Instance], docs: List[Dataset]) -> None:
45 |         resps = [inst.resps for inst in instances]  # operate just on the model responses
46 |         for f in self.filters:
47 |             # apply filters in sequence
48 |             resps = f.apply(resps, docs)
49 | 
50 |         # add the end results after filtering to filtered_requests of their respective source instances.
51 |         # has key `self.name`: each FilterEnsemble applied in a given run should use a different name.
52 |         for inst, resp in zip(instances, resps):
53 |             inst.filtered_resps[self.name] = resp
--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/api/instance.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 | from typing import Literal, Tuple
3 | 
4 | 
5 | @dataclass
6 | class Instance:
7 |     request_type: Literal["loglikelihood", "generate_until"]
8 |     arguments: tuple
9 |     idx: int
10 |     metadata: Tuple[str, int, int] = field(default_factory=lambda: (None, None, None))  # TODO: better typehints here
11 |     resps: list = field(default_factory=list)
12 |     filtered_resps: dict = field(default_factory=dict)
13 | 
14 |     # initialized after init
15 |     task_name: str = None
16 |     doc_id: str = None
17 |     repeats: str = None
18 |     doc: dict = None
19 | 
20 |     def __post_init__(self) -> None:
21 |         # unpack metadata field
22 |         self.task_name, self.doc_id, self.repeats = self.metadata
23 | 
24 |     @property
25 |     def args(self):
26 |         """
27 |         Returns (string,) where `string` is the string to calculate loglikelihood over
28 |         """
29 |         return self.arguments if isinstance(self.arguments, tuple) else (self.arguments,)
--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/filters/__init__.py:
--------------------------------------------------------------------------------
1 | from lmms_eval.api.filter import FilterEnsemble, Filter
2 | from . import selection
3 | from . import extraction
4 | from . import transformation
5 | 
6 | 
7 | FILTER_REGISTRY = {
8 |     "take_first": selection.TakeFirstFilter,
9 |     "regex": extraction.RegexFilter,
10 |     "majority_vote": selection.MajorityVoteFilter,
11 |     "take_first_k": selection.TakeKFilter,
12 |     "remove_whitespace": extraction.WhitespaceFilter,
13 |     "lowercase": transformation.LowercaseFilter,
14 |     "uppercase": transformation.UppercaseFilter,
15 |     "map": transformation.MapFilter,
16 |     "multi_choice_regex": extraction.MultiChoiceRegexFilter,
17 |     # TODO: implement this filter. either it should take in an arbitrary "scoring"/reward function
18 |     # that takes an input and returns a scalar and then should select the max reward,
19 |     # or should implement different filters for different ways of handling a reward model's inference.
20 | # "arg_max": selection.ArgMaxFilter, 21 | } 22 | 23 | 24 | def get_filter(filter_name): 25 | if filter_name in FILTER_REGISTRY: 26 | return FILTER_REGISTRY[filter_name] 27 | else: 28 | return filter_name 29 | 30 | 31 | def build_filter_ensemble(filter_name, components): 32 | """ 33 | Create a filtering pipeline. 34 | """ 35 | filters = [] 36 | for function, kwargs in components: 37 | if kwargs is None: 38 | f = get_filter(function)() 39 | else: 40 | # create a filter given its name in the registry 41 | f = get_filter(function)(**kwargs) # TODO: pass kwargs to filters properly 42 | # add the filter as a pipeline step 43 | filters.append(f) 44 | 45 | return FilterEnsemble(name=filter_name, filters=filters) 46 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/filters/decontamination.py: -------------------------------------------------------------------------------- 1 | from lmms_eval.api.filter import Filter 2 | 3 | 4 | class DecontaminationFilter(Filter): 5 | """ 6 | A filter which evaluates 7 | """ 8 | 9 | name = "track_decontamination" 10 | 11 | def __init__(self, path) -> None: 12 | """ 13 | 14 | TODO: make sure only ever run one time on the train set (should this be cached as a class var? keyed by value for "path"). 15 | should further cache result on a given (task_name, doc_id) 16 | """ 17 | self._decontam_results = None 18 | 19 | def apply(self, resps, docs) -> None: 20 | """ 21 | Return {"no_contamination", "only_contamination"} keys for the 2 different subsets 22 | """ 23 | pass 24 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/filters/selection.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | from lmms_eval.api.filter import Filter 4 | 5 | 6 | class TakeFirstFilter(Filter): 7 | def __init__(self) -> None: 8 | """ 9 | Can define custom behavior here, if an individual instantiation of a Filter class should have state. 10 | """ 11 | 12 | def apply(self, resps, docs): 13 | """ 14 | Assuming each entry of `resps` is a list of model responses, we discard all but the first response. 15 | """ 16 | return map(lambda r: r[0], resps) 17 | 18 | 19 | class TakeKFilter(Filter): 20 | def __init__(self, *args, **kwargs) -> None: 21 | self.k = kwargs.pop("k") 22 | 23 | super().__init__(*args, **kwargs) 24 | 25 | def apply(self, resps, docs): 26 | # check we have at least k responses per doc, else we can't take the first k 27 | assert len(resps[0]) >= self.k, f"Need at least {self.k} responses per doc to take first {self.k}, but got {len(resps[0])} only! Please increase TaskConfig.repeats ." 28 | return map(lambda r: r[: self.k], resps) 29 | 30 | 31 | class MajorityVoteFilter(Filter): 32 | def __init__(self) -> None: 33 | """ 34 | Can define custom behavior here, if an individual instantiation of a Filter class should have state. 35 | """ 36 | 37 | def apply(self, resps, docs): 38 | """ 39 | Each entry of `resps` is a list of model responses. 40 | We select the response that occurs most frequently in each entry of `resps`. 
41 | """ 42 | 43 | def select_majority(resp): 44 | counts = Counter(resp) 45 | vote = counts.most_common(1)[0][0] 46 | return vote 47 | 48 | return map(lambda r: [select_majority(r)], resps) 49 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/filters/transformation.py: -------------------------------------------------------------------------------- 1 | from lmms_eval.api.filter import Filter 2 | 3 | 4 | class LowercaseFilter(Filter): 5 | def __init__(self) -> None: 6 | pass 7 | 8 | def apply(self, resps, docs): 9 | def filter_set(inst): 10 | return [resp.lower() for resp in inst] 11 | 12 | return [filter_set(resp) for resp in resps] 13 | 14 | 15 | class UppercaseFilter(Filter): 16 | def __init__(self) -> None: 17 | pass 18 | 19 | def apply(self, resps, docs): 20 | def filter_set(inst): 21 | return [resp.upper() for resp in inst] 22 | 23 | return [filter_set(resp) for resp in resps] 24 | 25 | 26 | class MapFilter(Filter): 27 | def __init__(self, mapping_dict: dict = {}, default_value=None) -> None: 28 | """ 29 | Initializes the MapFilter with a given mapping dictionary and default value. 30 | 31 | Args: 32 | - mapping_dict (dict): A dictionary containing the key-value mappings. 33 | Default is an empty dictionary. 34 | - default_value (Any): The value to be returned when a key is not found in the mapping_dict. 35 | Default is None. 36 | 37 | Example: 38 | mapper = MapFilter({'A': 1, 'B': 2}, default_value=0) 39 | """ 40 | assert isinstance(mapping_dict, dict), "Provided mapping_dict is not a dictionary" 41 | self.mapping_dict = mapping_dict 42 | self.default_value = default_value 43 | 44 | def apply(self, resps, docs): 45 | def filter_set(inst): 46 | return [self.mapping_dict.get(resp, self.default_value) for resp in inst] 47 | 48 | return [filter_set(resp) for resp in resps] 49 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/models/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | AVAILABLE_MODELS = { 4 | "llava": "Llava", 5 | "llava_hf": "LlavaHf", 6 | "llava_sglang": "LlavaSglang", 7 | "qwen_vl": "Qwen_VL", 8 | "fuyu": "Fuyu", 9 | "gpt4v": "GPT4V", 10 | "instructblip": "InstructBLIP", 11 | "minicpm_v": "MiniCPM_V", 12 | } 13 | 14 | for model_name, model_class in AVAILABLE_MODELS.items(): 15 | try: 16 | exec(f"from .{model_name} import {model_class}") 17 | except ImportError: 18 | pass 19 | 20 | 21 | import hf_transfer 22 | 23 | os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" 24 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/models/model_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/lmms-eval/lmms_eval/models/model_utils/__init__.py -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/_task_utils/file_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def generate_submission_file(file_name, args, subpath="submissions"): 5 | path = os.path.join(args.output_path, subpath) 6 | os.makedirs(path, exist_ok=True) 7 | path = os.path.join(path, file_name) 8 | return os.path.abspath(path) 9 | -------------------------------------------------------------------------------- 
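The selection and transformation filters listed above compose through `FilterEnsemble` and `build_filter_ensemble`: each filter's `apply` receives the list of per-instance response lists and passes its output to the next filter, and the ensemble finally stores the result in `inst.filtered_resps` under the ensemble's name. A minimal sketch of that flow (illustrative only; it assumes lmms-eval is installed so the modules listed above import as shown, and the prompts and responses are made up):

```python
# Minimal sketch (not repo code): chaining the filters defined above by hand.
from lmms_eval.api.instance import Instance
from lmms_eval.filters import build_filter_ensemble

# Two fake generation requests, each with three sampled responses.
instances = [
    Instance(request_type="generate_until", arguments=("prompt A",), idx=0,
             resps=["  Cat ", "dog", "Cat"]),
    Instance(request_type="generate_until", arguments=("prompt B",), idx=1,
             resps=["Blue", "blue", "RED"]),
]

# Lowercase every response, then keep only the first response per instance.
ensemble = build_filter_ensemble("lower_then_first", [["lowercase", None], ["take_first", None]])
ensemble.apply(instances, docs=None)  # the filters used here ignore `docs`

for inst in instances:
    print(inst.filtered_resps["lower_then_first"])  # "  cat " and "blue"
```

Note that order matters: `take_first` collapses each per-instance list to a single string, so it is placed last, exactly as the registry-based pipelines built by `build_filter_ensemble` would do.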
/lmms-eval/lmms_eval/tasks/_task_utils/gpt_eval_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/lmms-eval/lmms_eval/tasks/_task_utils/gpt_eval_utils.py -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ai2d/ai2d.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ai2d 2 | task: "ai2d" 3 | dataset_kwargs: 4 | token: True 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.ai2d_doc_to_visual 8 | doc_to_text: !function utils.ai2d_doc_to_text 9 | doc_to_target: !function utils.ai2d_doc_to_target 10 | generation_kwargs: 11 | max_new_tokens: 16 12 | temperature: 0 13 | do_sample: False 14 | metric_list: 15 | - metric: exact_match 16 | aggregation: mean 17 | higher_is_better: true 18 | ignore_case: true 19 | ignore_punctuation: true 20 | metadata: 21 | - version: 0.0 22 | 23 | model_specific_prompt_kwargs: 24 | default: 25 | prompt_format: mcq 26 | pre_prompt: "" 27 | post_prompt: "\nAnswer with the option's letter from the given choices directly." 28 | # qwen formulate ai2d as question answering instead of mcq 29 | qwen_vl: 30 | prompt_format: qa 31 | pre_prompt: "" 32 | post_prompt: " Answer:" 33 | 34 | model_specific_target_kwargs: 35 | default: "mcq" 36 | qwen_vl: "qa" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ai2d/utils.py: -------------------------------------------------------------------------------- 1 | def ai2d_doc_to_text(doc, model_specific_prompt_kwargs=None): 2 | question, choices = doc["question"], doc["options"] 3 | len_choices = len(choices) 4 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 5 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 6 | if model_specific_prompt_kwargs["prompt_format"] == "mcq": 7 | options = [chr(ord("A") + i) for i in range(len_choices)] 8 | choices_str = "\n".join([f"{option}. 
{choice}" for option, choice in zip(options, choices)]) 9 | return f"{pre_prompt}{question}\n{choices_str}{post_prompt}" 10 | elif model_specific_prompt_kwargs["prompt_format"] == "qa": 11 | options = "\n".join(choices) 12 | return f"{pre_prompt}{question}{options}{post_prompt}" 13 | else: 14 | raise ValueError(f"Unknown prompt format: {model_specific_prompt_kwargs['prompt_format']}") 15 | 16 | 17 | def ai2d_doc_to_visual(doc): 18 | return [doc["image"].convert("RGB")] 19 | 20 | 21 | def ai2d_doc_to_target(doc, model_specific_target_kwargs): 22 | if model_specific_target_kwargs == "mcq": 23 | len_choices = len(doc["options"]) 24 | options = [chr(ord("A") + i) for i in range(len_choices)] 25 | return options[int(doc["answer"])] 26 | elif model_specific_target_kwargs == "qa": 27 | return doc["options"][int(doc["answer"])] 28 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/chartqa/chartqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ChartQA 2 | dataset_kwargs: 3 | token: True 4 | task: "chartqa" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.chartqa_doc_to_visual 8 | doc_to_text: !function utils.chartqa_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | max_new_tokens: 16 12 | temperature: 0 13 | do_sample: False 14 | process_results: !function utils.chartqa_process_results 15 | metric_list: 16 | - metric: relaxed_overall 17 | aggregation: mean 18 | higher_is_better: true 19 | - metric: relaxed_human_split 20 | aggregation: mean 21 | higher_is_better: true 22 | - metric: relaxed_augmented_split 23 | aggregation: mean 24 | higher_is_better: true 25 | metadata: 26 | - version: 0.0 27 | model_specific_prompt_kwargs: 28 | default: 29 | pre_prompt: "" 30 | post_prompt: "\nAnswer the question with a single word." 
31 | qwen_vl: 32 | pre_prompt: "" 33 | post_prompt: " Answer:" 34 | 35 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/cmmmu/_cmmmu.yaml: -------------------------------------------------------------------------------- 1 | group: cmmmu 2 | task: 3 | - cmmmu_val 4 | - cmmmu_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/cmmmu/_default_template_cmmmu_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/CMMMU 2 | output_type: generate_until 3 | doc_to_visual: !function utils.cmmmu_doc_to_visual 4 | doc_to_text: !function utils.cmmmu_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | max_new_tokens: 16 8 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/cmmmu/cmmmu_test.yaml: -------------------------------------------------------------------------------- 1 | task: "cmmmu_test" 2 | test_split: test 3 | # The return value of process_results will be used by metrics 4 | process_results: !function utils.cmmmu_process_test_results_for_submission 5 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 6 | metric_list: 7 | - metric: submission 8 | aggregation: !function utils.cmmmu_test_aggregate_results_for_submission 9 | higher_is_better: false 10 | metadata: 11 | - version: 0.0 12 | include: _default_template_cmmmu_yaml 13 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/cmmmu/cmmmu_val.yaml: -------------------------------------------------------------------------------- 1 | task: "cmmmu_val" 2 | test_split: val 3 | # The return value of process_results will be used by metrics 4 | process_results: !function utils.cmmmu_process_results 5 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 6 | generation_kwargs: 7 | max_new_tokens: 16 8 | image_aspect_ratio: original 9 | metric_list: 10 | - metric: cmmmu_acc 11 | aggregation: !function utils.cmmmu_aggregate_results 12 | higher_is_better: true 13 | metadata: 14 | - version: 0.0 15 | include: _default_template_cmmmu_yaml 16 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2014_cap.yaml: -------------------------------------------------------------------------------- 1 | group : coco2014_cap 2 | task: 3 | - coco2014_cap_val 4 | - coco2014_cap_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2014_cap_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/COCO-Caption 2 | dataset_kwargs: 3 | token: True 4 | task : "coco2014_cap_test" 5 | group : "coco_caption" 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.coco_doc_to_visual 9 | doc_to_text: "Provide a one-sentence caption for the provided image." 
10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 128 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.coco_test_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: coco_passthrough 21 | aggregation : !function utils.coco_test_aggregation_result 22 | higher_is_better : true 23 | metadata: 24 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2014_cap_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/COCO-Caption 2 | dataset_kwargs: 3 | token: True 4 | task: "coco2014_cap_val" 5 | group : "coco_caption" 6 | test_split: val 7 | output_type: generate_until 8 | doc_to_visual: !function utils.coco_doc_to_visual 9 | doc_to_text: "Provide a one-sentence caption for the provided image." 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 64 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.coco_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: coco_Bleu_4 21 | aggregation : !function utils.coco_bleu4 22 | higher_is_better : true 23 | - metric: coco_Bleu_3 24 | aggregation : !function utils.coco_bleu3 25 | higher_is_better : true 26 | - metric: coco_Bleu_2 27 | aggregation : !function utils.coco_bleu2 28 | higher_is_better : true 29 | - metric: coco_Bleu_1 30 | aggregation : !function utils.coco_bleu1 31 | higher_is_better : true 32 | - metric: coco_METEOR 33 | aggregation : !function utils.coco_meteor 34 | higher_is_better : true 35 | - metric: coco_ROUGE_L 36 | aggregation : !function utils.coco_rougel 37 | higher_is_better : true 38 | - metric: coco_CIDEr 39 | aggregation : !function utils.coco_cider 40 | higher_is_better : true 41 | #- metric: coco_SPICE 42 | # aggregation : !function utils.coco_spice 43 | # higher_is_better : true 44 | metadata: 45 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2017_cap.yaml: -------------------------------------------------------------------------------- 1 | group : coco2017_cap 2 | task: 3 | - coco2017_cap_val 4 | - coco2017_cap_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2017_cap_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/COCO-Caption2017 2 | dataset_kwargs: 3 | token: True 4 | task : "coco2017_cap_test" 5 | group : "coco_caption2017" 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.coco_doc_to_visual 9 | doc_to_text: !function utils.coco_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 128 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.coco_test_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: coco_passthrough 21 | aggregation : !function 
utils.coco_test_aggregation_result 22 | higher_is_better : true 23 | metadata: 24 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco2017_cap_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/COCO-Caption2017 2 | dataset_kwargs: 3 | token: True 4 | task: "coco2017_cap_val" 5 | group : "coco_caption2017" 6 | test_split: val 7 | output_type: generate_until 8 | doc_to_visual: !function utils.coco_doc_to_visual 9 | doc_to_text: !function utils.coco_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 64 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.coco_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: coco_Bleu_4 21 | aggregation : !function utils.coco_bleu4 22 | higher_is_better : true 23 | - metric: coco_Bleu_3 24 | aggregation : !function utils.coco_bleu3 25 | higher_is_better : true 26 | - metric: coco_Bleu_2 27 | aggregation : !function utils.coco_bleu2 28 | higher_is_better : true 29 | - metric: coco_Bleu_1 30 | aggregation : !function utils.coco_bleu1 31 | higher_is_better : true 32 | - metric: coco_METEOR 33 | aggregation : !function utils.coco_meteor 34 | higher_is_better : true 35 | - metric: coco_ROUGE_L 36 | aggregation : !function utils.coco_rougel 37 | higher_is_better : true 38 | - metric: coco_CIDEr 39 | aggregation : !function utils.coco_cider 40 | higher_is_better : true 41 | #- metric: coco_SPICE 42 | # aggregation : !function utils.coco_spice 43 | # higher_is_better : true 44 | metadata: 45 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/coco_cap/coco_cap.yaml: -------------------------------------------------------------------------------- 1 | group : coco_cap 2 | task: 3 | - coco2014_cap_val 4 | - coco2014_cap_test 5 | - coco2017_cap_val 6 | - coco2017_cap_test 7 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/docvqa/_default_template_docvqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/DocVQA 2 | dataset_name: DocVQA 3 | dataset_kwargs: 4 | token: True 5 | output_type: generate_until 6 | doc_to_visual: !function utils.docvqa_doc_to_visual 7 | doc_to_text: !function utils.docvqa_doc_to_text 8 | doc_to_target: "answers" 9 | generation_kwargs: 10 | max_new_tokens: 32 11 | temperature: 0 12 | do_sample: False 13 | model_specific_prompt_kwargs: 14 | default: 15 | pre_prompt: "" 16 | post_prompt: "\nAnswer the question using a single word or phrase." 
17 | qwen_vl: 18 | pre_prompt: "" 19 | post_prompt: " Answer:" 20 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/docvqa/docvqa.yaml: -------------------------------------------------------------------------------- 1 | group: docvqa 2 | task: 3 | - docvqa_val 4 | - docvqa_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/docvqa/docvqa_test.yaml: -------------------------------------------------------------------------------- 1 | task: "docvqa_test" 2 | test_split: test 3 | process_results: !function utils.docvqa_test_process_results 4 | metric_list: 5 | - metric: submission 6 | aggregation: !function utils.docvqa_test_aggregate_results 7 | higher_is_better: true 8 | include: _default_template_docvqa_yaml 9 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/docvqa/docvqa_val.yaml: -------------------------------------------------------------------------------- 1 | task: "docvqa_val" 2 | test_split: validation 3 | metric_list: 4 | - metric: anls 5 | aggregation: mean 6 | higher_is_better: true 7 | include: _default_template_docvqa_yaml 8 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/docvqa/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import logging 4 | 5 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file 6 | 7 | logger = logging.getLogger("lmms-eval") 8 | 9 | 10 | def docvqa_doc_to_visual(doc): 11 | return [doc["image"].convert("RGB")] 12 | 13 | 14 | def docvqa_doc_to_text(doc, model_specific_prompt_kwargs): 15 | question = doc["question"] 16 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 17 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 18 | return f"{pre_prompt}{question}{post_prompt}" 19 | 20 | 21 | def docvqa_test_process_results(doc, results): 22 | pred = results[0] 23 | questionId = doc["questionId"] 24 | return {"anls": {"questionId": int(questionId), "answer": pred}, "submission": {"questionId": int(questionId), "answer": pred}} 25 | 26 | 27 | def docvqa_test_aggregate_results(results, args): 28 | # save results as json 29 | path = generate_submission_file("docvqa_test_for_submission.json", args) 30 | with open(path, "w") as f: 31 | json.dump(results, f) 32 | logger.info(f"Results saved to {path}") 33 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ferret/ferret.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/Ferret-Bench 2 | dataset_kwargs: 3 | token: True 4 | task: "ferret" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.ferret_doc_to_visual 8 | doc_to_text: !function utils.ferret_doc_to_text 9 | doc_to_target: "gpt_answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | image_aspect_ratio: original 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.ferret_process_results 20 | metric_list: 21 | - metric: gpt_eval_ferret_all 22 | aggregation: !function utils.ferret_all_aggregation 23 | higher_is_better: true 24 | - metric: gpt_eval_ferret_refer_desc 25 | aggregation: !function utils.ferret_refer_desc_aggregation 26 | higher_is_better: true 27 | - 
metric: gpt_eval_ferret_refer_reason 28 | aggregation: !function utils.ferret_refer_reason_aggregation 29 | higher_is_better: true 30 | - metric: gpt_eval_ferret_ground_conv 31 | aggregation: !function utils.ferret_ground_conv_aggregation 32 | higher_is_better: true 33 | metadata: 34 | version: 0.0 35 | gpt_eval_model_name: "gpt-4-0314" 36 | model_specific_prompt_kwargs: 37 | default: 38 | pre_prompt: "" 39 | post_prompt: "" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/flickr30k/flickr30k.yaml: -------------------------------------------------------------------------------- 1 | group: flickr30k 2 | task: 3 | - flickr30k_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/flickr30k/flickr30k_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/flickr30k 2 | dataset_kwargs: 3 | token: True 4 | task : "flickr30k_test" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.flickr_doc_to_visual 8 | doc_to_text: !function utils.flickr_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | max_new_tokens: 64 12 | temperature: 0 13 | top_p: 0 14 | num_beams: 1 15 | do_sample: false 16 | process_results: !function utils.flickr_process_result 17 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 18 | metric_list: 19 | - metric: flickr_Bleu_4 20 | aggregation : !function utils.flickr_bleu4 21 | higher_is_better : true 22 | - metric: flickr_Bleu_3 23 | aggregation : !function utils.flickr_bleu3 24 | higher_is_better : true 25 | - metric: flickr_Bleu_2 26 | aggregation : !function utils.flickr_bleu2 27 | higher_is_better : true 28 | - metric: flickr_Bleu_1 29 | aggregation : !function utils.flickr_bleu1 30 | higher_is_better : true 31 | - metric: flickr_METEOR 32 | aggregation : !function utils.flickr_meteor 33 | higher_is_better : true 34 | - metric: flickr_ROUGE_L 35 | aggregation : !function utils.flickr_rougel 36 | higher_is_better : true 37 | - metric: flickr_CIDEr 38 | aggregation : !function utils.flickr_cider 39 | higher_is_better : true 40 | #- metric: flickr_SPICE 41 | # aggregation : !function utils.flickr_spice 42 | # higher_is_better : true 43 | metadata: 44 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/gqa/gqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/GQA 2 | dataset_name: testdev_balanced_instructions 3 | dataset_kwargs: 4 | token: True 5 | task: "gqa" 6 | test_split: testdev 7 | output_type: generate_until 8 | doc_to_visual: !function utils.gqa_doc_to_visual 9 | doc_to_text: !function utils.gqa_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | metric_list: 18 | - metric: exact_match 19 | aggregation: mean 20 | higher_is_better: true 21 | ignore_case: true 22 | ignore_punctuation: true 23 | metadata: 24 | - version: 0.0 25 | 26 | model_specific_prompt_kwargs: 27 | default: 28 | pre_prompt: "" 29 | post_prompt: "\nAnswer the question using a single word or phrase." 
30 | qwen_vl: 31 | pre_prompt: "" 32 | post_prompt: " Answer:" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/gqa/utils.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | GQA_RAW_IMAGE_DATASET = None 4 | GQA_ID2IMAGE = None 5 | 6 | 7 | def gqa_doc_to_visual(doc): 8 | global GQA_RAW_IMAGE_DATASET 9 | global GQA_ID2IMAGE 10 | if GQA_RAW_IMAGE_DATASET is None: 11 | GQA_RAW_IMAGE_DATASET = load_dataset("lmms-lab/GQA", "testdev_balanced_images", split="testdev", token=True) 12 | GQA_ID2IMAGE = {} 13 | for row in GQA_RAW_IMAGE_DATASET: 14 | GQA_ID2IMAGE[row["id"]] = row["image"].convert("RGB") 15 | image = GQA_ID2IMAGE[doc["imageId"]] 16 | return [image] 17 | 18 | 19 | def gqa_doc_to_text(doc, model_specific_prompt_kwargs): 20 | question = doc["question"] 21 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 22 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 23 | return f"{pre_prompt}{question}{post_prompt}" 24 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/hallusion_bench/hallusion_bench_image.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/HallusionBench 2 | dataset_kwargs: 3 | token: True 4 | task: "hallusion_bench_image" 5 | test_split: image 6 | output_type: generate_until 7 | doc_to_visual: !function evaluate_hb.hb_doc_to_visual 8 | doc_to_text: !function evaluate_hb.hb_doc_to_text 9 | doc_to_target: "gt_answer_details" 10 | process_results: !function evaluate_hb.hb_process_results 11 | model_specific_prompt_kwargs: 12 | default: 13 | pre_prompt: "" 14 | post_prompt: "" 15 | generation_kwargs: 16 | max_new_tokens: 128 17 | temperature: 0 18 | top_p: 0 19 | num_beams: 1 20 | do_sample: false 21 | metric_list: 22 | - metric: aAcc 23 | aggregation: !function evaluate_hb.hb_aggregation_result_aAcc 24 | higher_is_better: true 25 | - metric: qAcc 26 | aggregation: !function evaluate_hb.hb_aggregation_result_qAcc 27 | higher_is_better: true 28 | - metric: fAcc 29 | aggregation: !function evaluate_hb.hb_aggregation_result_fAcc 30 | higher_is_better: true 31 | # - metric: aAcc 32 | # aggregation: !function evaluate_hb.hb_aggregation_result_aAcc_intern 33 | # higher_is_better: true 34 | # - metric: qAcc 35 | # aggregation: !function evaluate_hb.hb_aggregation_result_qAcc_intern 36 | # higher_is_better: true 37 | # - metric: fAcc 38 | # aggregation: !function evaluate_hb.hb_aggregation_result_fAcc_intern 39 | # higher_is_better: true 40 | metadata: 41 | - version: 0.0 42 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/iconqa/_default_template_docvqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ICON-QA 2 | dataset_kwargs: 3 | token: True 4 | output_type: generate_until 5 | doc_to_visual: !function utils.doc_to_visual 6 | doc_to_text: !function utils.doc_to_text 7 | doc_to_target: "answers" 8 | # process_results: !function utils.test_process_results 9 | generation_kwargs: 10 | max_new_tokens: 32 11 | temperature: 0 12 | do_sample: False 13 | model_specific_prompt_kwargs: 14 | default: 15 | pre_prompt: "" 16 | statement: "Given a set of images and a question, please provide the answer to the question.\n" 17 | options_statement: "Question: {question}.\nOptions:\n{options}\nPlease answer with 
the option letter from the given choices directly." 18 | freeform_statement: "Question: {question}.\nPlease answer the question using a single word or phrase." 19 | metric_list: 20 | - metric: anls 21 | aggregation: mean 22 | higher_is_better: true -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/iconqa/iconqa.yaml: -------------------------------------------------------------------------------- 1 | group: iconqa 2 | task: 3 | - iconqa_val 4 | - iconqa_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/iconqa/iconqa_test.yaml: -------------------------------------------------------------------------------- 1 | task: "iconqa_test" 2 | test_split: test 3 | include: _default_template_docvqa_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/iconqa/iconqa_val.yaml: -------------------------------------------------------------------------------- 1 | task: "iconqa_val" 2 | test_split: val 3 | include: _default_template_docvqa_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/iconqa/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | 5 | def options_to_str(options_prompt): 6 | option_prompt_str = "" 7 | for i, option in enumerate(options_prompt): 8 | option_choice = chr(ord("A") + i) 9 | option_prompt_str += f"{option_choice}. {option}\n" 10 | 11 | option_prompt_str = option_prompt_str.rstrip("\n") 12 | return option_prompt_str 13 | 14 | 15 | def doc_to_visual(doc): 16 | image_list = [] 17 | if "query_image" in doc: 18 | image_list.append(doc["query_image"].convert("RGB")) 19 | for i in range(5): 20 | id = f"choice_image_{i}" 21 | if id in doc and doc[id] is not None: 22 | image_list.append(doc[id].convert("RGB")) 23 | assert len(image_list) < 6, "Maximum 5 images allowed for ICON-QA" 24 | return image_list 25 | 26 | 27 | def doc_to_text(doc, model_specific_prompt_kwargs): 28 | question = doc["question"] 29 | ques_type = doc["ques_type"] 30 | options_prompt = [] 31 | 32 | if ques_type == "choose_img": 33 | options_prompt.append("The first image.") 34 | options_prompt.append("The second image.") 35 | 36 | options_str = options_to_str(options_prompt) 37 | full_prompt = f"{model_specific_prompt_kwargs['pre_prompt']}{model_specific_prompt_kwargs['statement']}{model_specific_prompt_kwargs['options_statement'].format(question=question, options=options_str)}" 38 | 39 | elif ques_type == "choose_txt": 40 | choices = doc["choices"].split(",") 41 | for i, choice in enumerate(choices): 42 | options_prompt.append(f"{choice}") 43 | 44 | options_str = options_to_str(options_prompt) 45 | full_prompt = f"{model_specific_prompt_kwargs['pre_prompt']}{model_specific_prompt_kwargs['statement']}{model_specific_prompt_kwargs['options_statement'].format(question=question, options=options_str)}" 46 | 47 | elif ques_type == "fill_in_blank": 48 | full_prompt = f"{model_specific_prompt_kwargs['pre_prompt']}{model_specific_prompt_kwargs['statement']}{model_specific_prompt_kwargs['freeform_statement'].format(question=question)}" 49 | 50 | return full_prompt 51 | 52 | 53 | def test_process_results(doc, results): 54 | pred = results[0] 55 | questionId = doc["question_id"] 56 | answer = doc["answer"] 57 | return {"anls": {"questionId": int(questionId), "answer": answer, "pred_answer": pred}} 58 | 
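To make the prompt construction above concrete, here is what `doc_to_text` produces for a hypothetical `choose_txt` document when given the default `model_specific_prompt_kwargs` from `_default_template_docvqa_yaml`. The document fields are invented for illustration; only the templates come from the config above, and the import assumes lmms-eval is installed:

```python
# Sketch (illustrative, not repo code): exercising the ICON-QA prompt builder above.
from lmms_eval.tasks.iconqa.utils import doc_to_text

prompt_kwargs = {
    "pre_prompt": "",
    "statement": "Given a set of images and a question, please provide the answer to the question.\n",
    "options_statement": "Question: {question}.\nOptions:\n{options}\nPlease answer with the option letter from the given choices directly.",
    "freeform_statement": "Question: {question}.\nPlease answer the question using a single word or phrase.",
}
# Hypothetical document with textual choices.
doc = {"question": "Which fruit is shown", "ques_type": "choose_txt", "choices": "apple,banana"}
print(doc_to_text(doc, prompt_kwargs))
# Given a set of images and a question, please provide the answer to the question.
# Question: Which fruit is shown.
# Options:
# A. apple
# B. banana
# Please answer with the option letter from the given choices directly.
```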
-------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/infovqa/_default_template_infovqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/DocVQA 2 | dataset_name: InfographicVQA 3 | dataset_kwargs: 4 | token: True 5 | doc_to_target: "answers" 6 | doc_to_visual: !function utils.infovqa_doc_to_visual 7 | doc_to_text: !function utils.infovqa_doc_to_text 8 | generation_kwargs: 9 | max_new_tokens: 32 10 | temperature: 0 11 | do_sample: False 12 | model_specific_prompt_kwargs: 13 | default: 14 | pre_prompt: "" 15 | post_prompt: "\nAnswer the question using a single word or phrase." -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/infovqa/infovqa.yaml: -------------------------------------------------------------------------------- 1 | group: infovqa 2 | task: 3 | - infovqa_val 4 | - infovqa_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/infovqa/infovqa_test.yaml: -------------------------------------------------------------------------------- 1 | task: "infovqa_test" 2 | test_split: test 3 | output_type: generate_until 4 | process_results: !function utils.infovqa_test_process_results 5 | metric_list: 6 | - metric: submission 7 | aggregation: !function utils.infovqa_test_aggregate_results 8 | higher_is_better: true 9 | include: _default_template_infovqa_yaml 10 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/infovqa/infovqa_val.yaml: -------------------------------------------------------------------------------- 1 | task: "infovqa_val" 2 | test_split: validation 3 | output_type: generate_until 4 | metric_list: 5 | - metric: anls 6 | aggregation: mean 7 | higher_is_better: true 8 | include: _default_template_infovqa_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/infovqa/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import logging 4 | 5 | 6 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file 7 | 8 | lmms_logger = logging.getLogger("lmms-eval") 9 | 10 | 11 | def infovqa_doc_to_visual(doc): 12 | return [doc["image"].convert("RGB")] 13 | 14 | 15 | def infovqa_doc_to_text(doc, model_specific_prompt_kwargs): 16 | question = doc["question"] 17 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 18 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 19 | return f"{pre_prompt}{question}{post_prompt}" 20 | 21 | 22 | def infovqa_test_process_results(doc, results): 23 | pred = results[0] 24 | questionId = doc["questionId"] 25 | return {"submission": {"questionId": int(questionId), "answer": pred}} 26 | 27 | 28 | def infovqa_test_aggregate_results(results, args): 29 | # save results as json 30 | file = generate_submission_file("infovqa_test_for_submission.json", args) 31 | with open(file, "w") as f: 32 | json.dump(results, f) 33 | lmms_logger.info(f"Results saved to {file}") 34 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/llava-bench-coco/llava-bench-coco.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/llava-bench-coco 2 | dataset_kwargs: 3 | token: True 4 | task: "llava_bench_coco" 5 | 
test_split: train 6 | output_type: generate_until 7 | doc_to_visual: !function utils.llava_doc_to_visual 8 | doc_to_text: !function utils.llava_doc_to_text 9 | doc_to_target: "gpt_answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | image_aspect_ratio: original 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | process_results: !function utils.llava_process_results 19 | metric_list: 20 | - metric: gpt_eval_llava_all 21 | aggregation: !function utils.llava_all_aggregation 22 | higher_is_better: true 23 | - metric: gpt_eval_llava_conv 24 | aggregation: !function utils.llava_conv_aggregation 25 | higher_is_better: true 26 | - metric: gpt_eval_llava_detail 27 | aggregation: !function utils.llava_detail_aggregation 28 | higher_is_better: true 29 | - metric: gpt_eval_llava_complex 30 | aggregation: !function utils.llava_complex_aggregation 31 | higher_is_better: true 32 | metadata: 33 | version: 0.0 34 | gpt_eval_model_name: "gpt-4-0314" 35 | model_specific_prompt_kwargs: 36 | default: 37 | pre_prompt: "" 38 | post_prompt: "" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/llava-in-the-wild/llava-in-the-wild.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/llava-bench-in-the-wild 2 | dataset_kwargs: 3 | token: True 4 | task: "llava_in_the_wild" 5 | test_split: train 6 | output_type: generate_until 7 | doc_to_visual: !function utils.llava_doc_to_visual 8 | doc_to_text: !function utils.llava_doc_to_text 9 | doc_to_target: "gpt_answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | image_aspect_ratio: original 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.llava_process_results 20 | metric_list: 21 | - metric: gpt_eval_llava_all 22 | aggregation: !function utils.llava_all_aggregation 23 | higher_is_better: true 24 | - metric: gpt_eval_llava_conv 25 | aggregation: !function utils.llava_conv_aggregation 26 | higher_is_better: true 27 | - metric: gpt_eval_llava_detail 28 | aggregation: !function utils.llava_detail_aggregation 29 | higher_is_better: true 30 | - metric: gpt_eval_llava_complex 31 | aggregation: !function utils.llava_complex_aggregation 32 | higher_is_better: true 33 | metadata: 34 | version: 0.0 35 | gpt_eval_model_name: "gpt-4-0613" 36 | model_specific_prompt_kwargs: 37 | default: 38 | pre_prompt: "" 39 | post_prompt: "" 40 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse.yaml: -------------------------------------------------------------------------------- 1 | group: mathverse 2 | task: 3 | - mathverse_testmini 4 | - mathverse_testmini_text_only 5 | - mathverse_testmini_text_lite 6 | - mathverse_testmini_text_dominant 7 | - mathverse_testmini_vision_intensive 8 | - mathverse_testmini_vision_dominant 9 | - mathverse_testmini_vision_only 10 | metadata: 11 | version: 0.0 12 | gpt_eval_model_name: "gpt-3.5-turbo" 13 | trunk_response: 30 14 | quick_match: false -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini" 6 | test_split: 
testmini 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_text_dominant.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_version_split 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_text_dominant" 6 | test_split: text_dominant 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_text_lite.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_version_split 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_text_lite" 6 | test_split: text_lite 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | 
model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_text_only.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_text_only 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_text_only" 6 | test_split: text_only 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_vision_dominant.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_version_split 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_vision_dominant" 6 | test_split: vision_dominant 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_vision_intensive.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_version_split 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_vision_intensive" 6 | test_split: vision_intensive 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: 
false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_vision_only.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: CaraJ/MathVerse-lmmseval 2 | dataset_name: testmini_version_split 3 | dataset_kwargs: 4 | token: False 5 | task: "mathverse_testmini_vision_only" 6 | test_split: vision_only 7 | output_type: generate_until 8 | doc_to_visual: !function utils.mathverse_doc_to_visual 9 | doc_to_text: !function utils.mathverse_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.mathverse_process_results 20 | metric_list: 21 | - metric: gpt_eval_score 22 | aggregation: !function utils.mathverse_aggregate_results_eval 23 | higher_is_better: true 24 | - metric: submission 25 | aggregation: !function utils.mathverse_aggregate_results_submission 26 | higher_is_better: true 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | shot_type: "format-prompt" # can also be "custom-prompt" 31 | query_type: "query_wo" # now only support query_wo 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathvista/mathvista.yaml: -------------------------------------------------------------------------------- 1 | group: mathvista 2 | task: 3 | - mathvista_testmini 4 | - mathvista_test 5 | metadata: 6 | version: 0.0 7 | gpt_eval_model_name: "gpt-4-0613" 8 | quick_extract: false -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mathvista/mathvista_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: AI4Math/MathVista 2 | dataset_kwargs: 3 | token: True 4 | task: "mathvista_test" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.mathvista_doc_to_visual 8 | doc_to_text: !function utils.mathvista_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 1024 14 | temperature: 0 15 | top_p: 0 16 | num_beams: 1 17 | do_sample: false 18 | process_results: !function utils.mathvista_process_results 19 | metric_list: 20 | - metric: submission 21 | aggregation: !function utils.mathvista_aggregate_results 22 | higher_is_better: true 23 | 24 | model_specific_prompt_kwargs: 25 | default: 26 | shot_type: "format-prompt" # can be "reason-first", "solution", "step-by-step" 27 | model_specific_generation_kwargs: 28 | llava: 29 | image_aspect_ratio: original -------------------------------------------------------------------------------- 
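The MathVerse and MathVista configs above follow the same "submission" pattern used by DocVQA and InfographicVQA earlier in this tree: `process_results` returns a per-document record under the `submission` key, and the metric's `aggregation` callback collects those records and writes a JSON file for the external scoring server via `generate_submission_file`. A minimal sketch of that pair for a hypothetical task (the task name and document fields are placeholders, not repo code):

```python
import json

# Helper shown earlier in lmms_eval/tasks/_task_utils/file_utils.py.
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file


def mytask_process_results(doc, results):
    # `results[0]` is the model's generated answer for this document.
    return {"submission": {"questionId": int(doc["questionId"]), "answer": results[0]}}


def mytask_aggregate_results_for_submission(results, args):
    # `results` is the list of per-document "submission" records gathered by the harness.
    path = generate_submission_file("mytask_test_for_submission.json", args)
    with open(path, "w") as f:
        json.dump(results, f)
```

In the task YAML these would be referenced as `process_results: !function utils.mytask_process_results` and `aggregation: !function utils.mytask_aggregate_results_for_submission`, mirroring the configs above.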
/lmms-eval/lmms_eval/tasks/mathvista/mathvista_testmini.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: AI4Math/MathVista 2 | dataset_kwargs: 3 | token: True 4 | task: "mathvista_testmini" 5 | test_split: testmini 6 | output_type: generate_until 7 | doc_to_visual: !function utils.mathvista_doc_to_visual 8 | doc_to_text: !function utils.mathvista_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 1024 14 | temperature: 0 15 | top_p: 0 16 | num_beams: 1 17 | do_sample: false 18 | process_results: !function utils.mathvista_process_results 19 | metric_list: 20 | - metric: gpt_eval_score 21 | aggregation: !function utils.mathvista_aggregate_results 22 | higher_is_better: true 23 | 24 | model_specific_prompt_kwargs: 25 | default: 26 | shot_type: "format-prompt" # can be "reason-first", "solution", "step-by-step" 27 | model_specific_generation_kwargs: 28 | llava: 29 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/_default_template_mmbench_cn_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMBench 2 | dataset_kwargs: 3 | token: True 4 | doc_to_target: "answer" 5 | dataset_name: "cn" 6 | output_type: generate_until 7 | doc_to_visual: !function cn_utils.mmbench_doc_to_visual 8 | doc_to_text: !function cn_utils.mmbench_doc_to_text 9 | generation_kwargs: 10 | max_new_tokens: 256 11 | temperature: 0 12 | top_p: 0 13 | num_beams: 1 14 | do_sample: false 15 | process_results: !function cn_utils.mmbench_process_results 16 | model_specific_prompt_kwargs: 17 | default: 18 | pre_prompt: "" 19 | post_prompt: "\n请直接使用所提供的选项字母作为答案回答。" 20 | model_specific_generation_kwargs: 21 | llava: 22 | image_aspect_ratio: original 23 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/_default_template_mmbench_en_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMBench 2 | dataset_kwargs: 3 | token: True 4 | doc_to_target: "answer" 5 | model_specific_prompt_kwargs: 6 | default: 7 | pre_prompt: "" 8 | post_prompt: "\nAnswer with the option's letter from the given choices directly." 
9 | doc_to_visual: !function en_utils.mmbench_doc_to_visual 10 | doc_to_text: !function en_utils.mmbench_doc_to_text 11 | doc_to_target: "answer" 12 | process_results: !function en_utils.mmbench_process_results 13 | model_specific_generation_kwargs: 14 | llava: 15 | image_aspect_ratio: original 16 | output_type: generate_until 17 | dataset_name: "en" 18 | generation_kwargs: 19 | until: 20 | - "ASSISTANT:" 21 | max_new_tokens: 1024 22 | temperature: 0 23 | top_p: 0 24 | num_beams: 1 25 | do_sample: false 26 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench.yaml: -------------------------------------------------------------------------------- 1 | group: mmbench 2 | task: 3 | - mmbench_en_dev 4 | - mmbench_en_test 5 | - mmbench_cn_dev 6 | - mmbench_cn_test 7 | - mmbench_cn_cc 8 | metadata: 9 | version: 0.0 10 | sys_prompt: "There are several options:" 11 | gpt_eval_model_name: "gpt-3.5-turbo-0613" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_cc.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMBench 2 | dataset_name: cc 3 | dataset_kwargs: 4 | token: True 5 | task: "mmbench_cn_cc" 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function cc_utils.mmbench_doc_to_visual 9 | doc_to_text: !function cc_utils.mmbench_cn_cc_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 256 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function cc_utils.mmbench_cn_cc_process_results 18 | metric_list: 19 | - metric: gpt_eval_score 20 | aggregation: !function cc_utils.mmbench_cn_cc_aggregate_dev_results_eval 21 | higher_is_better: true 22 | - metric: submission 23 | aggregation: !function cc_utils.mmbench_cn_cc_aggregate_results 24 | metadata: 25 | version: 0.0 26 | gpt_eval_model_name: "gpt-3.5-turbo-0613" 27 | 28 | model_specific_prompt_kwargs: 29 | default: 30 | pre_prompt: "" 31 | post_prompt: "\n请直接使用所提供的选项字母作为答案回答。" 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_cn.yaml: -------------------------------------------------------------------------------- 1 | group: mmbench_cn 2 | task: 3 | - mmbench_cn_dev 4 | - mmbench_cn_test 5 | - mmbench_cn_cc 6 | metadata: 7 | version: 0.0 8 | gpt_eval_model_name: "gpt-3.5-turbo-0613" 9 | sys_prompt: "有如下几个选项:" -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml: -------------------------------------------------------------------------------- 1 | task: "mmbench_cn_dev" 2 | test_split: "dev" 3 | metric_list: 4 | - metric: gpt_eval_score 5 | aggregation: !function cn_utils.mmbench_aggregate_dev_results_eval 6 | higher_is_better: true 7 | - metric: submission 8 | higher_is_better: true 9 | aggregation: !function cn_utils.mmbench_aggregate_dev_results 10 | include: _default_template_mmbench_cn_yaml 11 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_cn_test.yaml: -------------------------------------------------------------------------------- 1 | task: mmbench_cn_test 2 | test_split: test 3 | metric_list: 4 | - metric: submission 5 | 
aggregation: !function cn_utils.mmbench_aggregate_test_results 6 | higher_is_better: true 7 | include: _default_template_mmbench_cn_yaml 8 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_en.yaml: -------------------------------------------------------------------------------- 1 | group: mmbench_en 2 | task: 3 | - mmbench_en_dev 4 | - mmbench_en_test 5 | metadata: 6 | version: 0.0 7 | sys_prompt: "There are several options:" 8 | gpt_eval_model_name: "gpt-3.5-turbo-0613" 9 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_en_dev.yaml: -------------------------------------------------------------------------------- 1 | task: "mmbench_en_dev" 2 | test_split: dev 3 | include: _default_template_mmbench_en_yaml 4 | metric_list: 5 | - metric: gpt_eval_score 6 | aggregation: !function en_utils.mmbench_aggregate_dev_results_eval 7 | higher_is_better: true 8 | - metric: submission 9 | aggregation: !function en_utils.mmbench_aggregate_dev_results_submission 10 | higher_is_better: true -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmbench/mmbench_en_test.yaml: -------------------------------------------------------------------------------- 1 | task: "mmbench_en_test" 2 | test_split: test 3 | include: _default_template_mmbench_en_yaml 4 | metric_list: 5 | - metric: submission 6 | aggregation: !function en_utils.mmbench_aggregate_test_results 7 | higher_is_better: true 8 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mme/mme.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MME 2 | dataset_kwargs: 3 | token: True 4 | task: "mme" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.mme_doc_to_visual 8 | doc_to_text: !function utils.mme_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | max_new_tokens: 16 12 | temperature: 0 13 | top_p: 0 14 | num_beams: 1 15 | do_sample: false 16 | # The return value of process_results will be used by metrics 17 | process_results: !function utils.mme_process_results 18 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: mme_percetion_score 21 | aggregation: !function utils.mme_aggregate_results 22 | higher_is_better: true 23 | - metric: mme_cognition_score 24 | aggregation: !function utils.mme_aggregate_results 25 | higher_is_better: true 26 | model_specific_prompt_kwargs: 27 | default: 28 | pre_prompt: "" 29 | post_prompt: "\nAnswer the question using a single word or phrase."
30 | qwen_vl: 31 | pre_prompt: "" 32 | post_prompt: " Answer:" 33 | otterhd: 34 | pre_prompt: "" 35 | post_prompt: " Answer:" 36 | metadata: 37 | - version: 0.0 38 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmmu/mmmu.yaml: -------------------------------------------------------------------------------- 1 | group: mmmu 2 | task: 3 | - mmmu_val 4 | - mmmu_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmmu/mmmu_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMMU 2 | task: "mmmu_test" 3 | test_split: test 4 | output_type: generate_until 5 | doc_to_visual: !function utils.mmmu_doc_to_visual 6 | doc_to_text: !function utils.mmmu_doc_to_text 7 | doc_to_target: "answer" 8 | # The return value of process_results will be used by metrics 9 | process_results: !function utils.mmmu_process_results 10 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | image_aspect_ratio: original 14 | metric_list: 15 | - metric: submission 16 | aggregation: !function utils.mmmu_test_aggregate_results_for_submission 17 | higher_is_better: true 18 | metadata: 19 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmmu/mmmu_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMMU 2 | task: "mmmu_val" 3 | test_split: validation 4 | output_type: generate_until 5 | doc_to_visual: !function utils.mmmu_doc_to_visual 6 | doc_to_text: !function utils.mmmu_doc_to_text 7 | doc_to_target: "answer" 8 | # The return value of process_results will be used by metrics 9 | process_results: !function utils.mmmu_process_results 10 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | image_aspect_ratio: original 14 | metric_list: 15 | - metric: mmmu_acc 16 | aggregation: !function utils.mmmu_aggregate_results 17 | higher_is_better: true 18 | metadata: 19 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/mmvet/mmvet.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MMVet 2 | dataset_kwargs: 3 | token: True 4 | task: "mmvet" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.mmvet_doc_to_visual 8 | doc_to_text: !function utils.doc_to_text # Such that {{question}} will be replaced by doc["question"] 9 | doc_to_target: "{{answer}}" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 1024 14 | temperature: 0 15 | top_p: 0 16 | num_beams: 1 17 | do_sample: false 18 | process_results: !function utils.mmvet_process_results # apply gpt eval here 19 | metric_list: 20 | - metric: gpt_eval_score 21 | aggregation: !function utils.mmvet_aggregate_results 22 | higher_is_better: true 23 | metadata: 24 | version: 0.0 25 | gpt_eval_model_name: "gpt-4" 26 | model_specific_prompt_kwargs: 27 | default: 28 | pre_prompt: "" 29 | post_prompt: "" 30 | --------------------------------------------------------------------------------
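The mme, mmmu, and mmvet configs above all follow the same pattern: doc_to_text builds the prompt (optionally wrapping it with the per-model pre_prompt/post_prompt from model_specific_prompt_kwargs), process_results turns one document's generation into a dict keyed by metric name, and each metric_list entry's aggregation reduces those per-document values. As a minimal, illustrative sketch of that contract — the function names below are hypothetical, and the real hooks live in each task's utils.py (e.g. utils.mmvet_process_results):

def example_doc_to_text(doc, model_specific_prompt_kwargs=None):
    # Wrap the raw question with the optional pre/post prompts from the YAML.
    kwargs = model_specific_prompt_kwargs or {}
    return f"{kwargs.get('pre_prompt', '')}{doc['question']}{kwargs.get('post_prompt', '')}"

def example_process_results(doc, results):
    # results[0] is the model's generated answer for this document.
    prediction = results[0].strip()
    score = 1.0 if prediction.lower() == str(doc["answer"]).lower() else 0.0
    # The key must match a `metric:` entry in metric_list.
    return {"gpt_eval_score": score}

def example_aggregate_results(per_doc_scores):
    # Receives the per-document values collected for one metric key.
    return sum(per_doc_scores) / max(len(per_doc_scores), 1)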
/lmms-eval/lmms_eval/tasks/multidocvqa/multidocvqa.yaml: -------------------------------------------------------------------------------- 1 | group: multidocvqa 2 | task: 3 | - multidocvqa_val 4 | - multidocvqa_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multidocvqa/multidocvqa_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MP-DocVQA 2 | task: "multidocvqa_test" 3 | test_split: test 4 | output_type: generate_until 5 | doc_to_visual: !function utils.multidocvqa_doc_to_visual 6 | doc_to_text: !function utils.multidocvqa_doc_to_text 7 | doc_to_target: "answers" 8 | generation_kwargs: 9 | max_new_tokens: 32 10 | temperature: 0 11 | do_sample: False 12 | process_results: !function utils.multidocvqa_process_test_results_for_submission 13 | metric_list: 14 | - metric: submission 15 | aggregation: !function utils.multidocvqa_test_aggregate_results_for_submission 16 | model_specific_prompt_kwargs: 17 | default: 18 | pre_prompt: "" 19 | post_prompt: "\nAnswer the question using a single word or phrase." 20 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multidocvqa/multidocvqa_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/MP-DocVQA 2 | task: "multidocvqa_val" 3 | test_split: val 4 | output_type: generate_until 5 | doc_to_visual: !function utils.multidocvqa_doc_to_visual 6 | doc_to_text: !function utils.multidocvqa_doc_to_text 7 | doc_to_target: "answers" 8 | generation_kwargs: 9 | max_new_tokens: 32 10 | temperature: 0 11 | do_sample: False 12 | process_results: !function utils.multidocvqa_process_results 13 | metric_list: 14 | - metric: anls 15 | aggregation: !function utils.multidocvqa_aggregate_results_anls 16 | higher_is_better: true 17 | - metric: accuracy 18 | aggregation: !function utils.multidocvqa_aggregate_results_accuracy 19 | higher_is_better: true 20 | model_specific_prompt_kwargs: 21 | default: 22 | pre_prompt: "" 23 | post_prompt: "\nAnswer the question using a single word or phrase." 
24 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/_generate_configs.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | # dataset = load_dataset("gagan3012/multilingual-llava-bench") 4 | 5 | configs = ['arabic', 'bengali', 'chinese', 'french', 'hindi', 'japanese', 'russian', 'spanish', 'urdu'] 6 | 7 | for config in configs: 8 | yaml_output = f""" 9 | dataset_path: "gagan3012/multilingual-llava-bench" 10 | dataset_kwargs: 11 | config: {config} 12 | token: True 13 | task: "llava_in_the_wild_{config}" 14 | test_split: train 15 | output_type: generate_until 16 | doc_to_visual: !function utils.llava_doc_to_visual 17 | doc_to_text: !function utils.llava_doc_to_text 18 | doc_to_target: "gpt_answer" 19 | generation_kwargs: 20 | until: 21 | - "ASSISTANT:" 22 | image_aspect_ratio: original 23 | max_new_tokens: 1024 24 | temperature: 0 25 | top_p: 0 26 | num_beams: 1 27 | do_sample: false 28 | process_results: !function utils.llava_process_results 29 | metric_list: 30 | - metric: gpt_eval_llava_all 31 | aggregation: !function utils.llava_all_aggregation 32 | higher_is_better: true 33 | - metric: gpt_eval_llava_conv 34 | aggregation: !function utils.llava_conv_aggregation 35 | higher_is_better: true 36 | - metric: gpt_eval_llava_detail 37 | aggregation: !function utils.llava_detail_aggregation 38 | higher_is_better: true 39 | - metric: gpt_eval_llava_complex 40 | aggregation: !function utils.llava_complex_aggregation 41 | higher_is_better: true 42 | metadata: 43 | version: 0.0 44 | gpt_eval_model_name: "gpt-4-0613" 45 | model_specific_prompt_kwargs: 46 | default: 47 | pre_prompt: "" 48 | post_prompt: "" 49 | """ 50 | 51 | with open(f"{config}_llava_in_the_wild.yaml", "w") as f: 52 | f.write(yaml_output) 53 | 54 | # Path: _generate_configs.py -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/arabic_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | 2 | dataset_path: "gagan3012/multilingual-llava-bench" 3 | dataset_kwargs: 4 | config: arabic 5 | token: True 6 | task: "llava_in_the_wild_arabic" 7 | test_split: train 8 | output_type: generate_until 9 | doc_to_visual: !function utils.llava_doc_to_visual 10 | doc_to_text: !function utils.llava_doc_to_text 11 | doc_to_target: "gpt_answer" 12 | generation_kwargs: 13 | until: 14 | - "ASSISTANT:" 15 | image_aspect_ratio: original 16 | max_new_tokens: 1024 17 | temperature: 0 18 | top_p: 0 19 | num_beams: 1 20 | do_sample: false 21 | process_results: !function utils.llava_process_results 22 | metric_list: 23 | - metric: gpt_eval_llava_all 24 | aggregation: !function utils.llava_all_aggregation 25 | higher_is_better: true 26 | - metric: gpt_eval_llava_conv 27 | aggregation: !function utils.llava_conv_aggregation 28 | higher_is_better: true 29 | - metric: gpt_eval_llava_detail 30 | aggregation: !function utils.llava_detail_aggregation 31 | higher_is_better: true 32 | - metric: gpt_eval_llava_complex 33 | aggregation: !function utils.llava_complex_aggregation 34 | higher_is_better: true 35 | 
metadata: 36 | version: 0.0 37 | gpt_eval_model_name: "gpt-4-0613" 38 | model_specific_prompt_kwargs: 39 | default: 40 | pre_prompt: "" 41 | post_prompt: "" 42 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/bengali_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | 2 | dataset_path: "gagan3012/multilingual-llava-bench" 3 | dataset_kwargs: 4 | config: bengali 5 | token: True 6 | task: "llava_in_the_wild_bengali" 7 | test_split: train 8 | output_type: generate_until 9 | doc_to_visual: !function utils.llava_doc_to_visual 10 | doc_to_text: !function utils.llava_doc_to_text 11 | doc_to_target: "gpt_answer" 12 | generation_kwargs: 13 | until: 14 | - "ASSISTANT:" 15 | image_aspect_ratio: original 16 | max_new_tokens: 1024 17 | temperature: 0 18 | top_p: 0 19 | num_beams: 1 20 | do_sample: false 21 | process_results: !function utils.llava_process_results 22 | metric_list: 23 | - metric: gpt_eval_llava_all 24 | aggregation: !function utils.llava_all_aggregation 25 | higher_is_better: true 26 | - metric: gpt_eval_llava_conv 27 | aggregation: !function utils.llava_conv_aggregation 28 | higher_is_better: true 29 | - metric: gpt_eval_llava_detail 30 | aggregation: !function utils.llava_detail_aggregation 31 | higher_is_better: true 32 | - metric: gpt_eval_llava_complex 33 | aggregation: !function utils.llava_complex_aggregation 34 | higher_is_better: true 35 | metadata: 36 | version: 0.0 37 | gpt_eval_model_name: "gpt-4-0613" 38 | model_specific_prompt_kwargs: 39 | default: 40 | pre_prompt: "" 41 | post_prompt: "" 42 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/chinese_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | 2 | dataset_path: "gagan3012/multilingual-llava-bench" 3 | dataset_kwargs: 4 | config: chinese 5 | token: True 6 | task: "llava_in_the_wild_chinese" 7 | test_split: train 8 | output_type: generate_until 9 | doc_to_visual: !function utils.llava_doc_to_visual 10 | doc_to_text: !function utils.llava_doc_to_text 11 | doc_to_target: "gpt_answer" 12 | generation_kwargs: 13 | until: 14 | - "ASSISTANT:" 15 | image_aspect_ratio: original 16 | max_new_tokens: 1024 17 | temperature: 0 18 | top_p: 0 19 | num_beams: 1 20 | do_sample: false 21 | process_results: !function utils.llava_process_results 22 | metric_list: 23 | - metric: gpt_eval_llava_all 24 | aggregation: !function utils.llava_all_aggregation 25 | higher_is_better: true 26 | - metric: gpt_eval_llava_conv 27 | aggregation: !function utils.llava_conv_aggregation 28 | higher_is_better: true 29 | - metric: gpt_eval_llava_detail 30 | aggregation: !function utils.llava_detail_aggregation 31 | higher_is_better: true 32 | - metric: gpt_eval_llava_complex 33 | aggregation: !function utils.llava_complex_aggregation 34 | higher_is_better: true 35 | metadata: 36 | version: 0.0 37 | gpt_eval_model_name: "gpt-4-0613" 38 | model_specific_prompt_kwargs: 39 | default: 40 | pre_prompt: "" 41 | post_prompt: "" 42 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/french_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | 2 | dataset_path: "gagan3012/multilingual-llava-bench" 3 | dataset_kwargs: 
4 | config: french 5 | token: True 6 | task: "llava_in_the_wild_french" 7 | test_split: train 8 | output_type: generate_until 9 | doc_to_visual: !function utils.llava_doc_to_visual 10 | doc_to_text: !function utils.llava_doc_to_text 11 | doc_to_target: "gpt_answer" 12 | generation_kwargs: 13 | until: 14 | - "ASSISTANT:" 15 | image_aspect_ratio: original 16 | max_new_tokens: 1024 17 | temperature: 0 18 | top_p: 0 19 | num_beams: 1 20 | do_sample: false 21 | process_results: !function utils.llava_process_results 22 | metric_list: 23 | - metric: gpt_eval_llava_all 24 | aggregation: !function utils.llava_all_aggregation 25 | higher_is_better: true 26 | - metric: gpt_eval_llava_conv 27 | aggregation: !function utils.llava_conv_aggregation 28 | higher_is_better: true 29 | - metric: gpt_eval_llava_detail 30 | aggregation: !function utils.llava_detail_aggregation 31 | higher_is_better: true 32 | - metric: gpt_eval_llava_complex 33 | aggregation: !function utils.llava_complex_aggregation 34 | higher_is_better: true 35 | metadata: 36 | version: 0.0 37 | gpt_eval_model_name: "gpt-4-0613" 38 | model_specific_prompt_kwargs: 39 | default: 40 | pre_prompt: "" 41 | post_prompt: "" 42 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/hindi_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | 2 | dataset_path: "gagan3012/multilingual-llava-bench" 3 | dataset_kwargs: 4 | config: hindi 5 | token: True 6 | task: "llava_in_the_wild_hindi" 7 | test_split: train 8 | output_type: generate_until 9 | doc_to_visual: !function utils.llava_doc_to_visual 10 | doc_to_text: !function utils.llava_doc_to_text 11 | doc_to_target: "gpt_answer" 12 | generation_kwargs: 13 | until: 14 | - "ASSISTANT:" 15 | image_aspect_ratio: original 16 | max_new_tokens: 1024 17 | temperature: 0 18 | top_p: 0 19 | num_beams: 1 20 | do_sample: false 21 | process_results: !function utils.llava_process_results 22 | metric_list: 23 | - metric: gpt_eval_llava_all 24 | aggregation: !function utils.llava_all_aggregation 25 | higher_is_better: true 26 | - metric: gpt_eval_llava_conv 27 | aggregation: !function utils.llava_conv_aggregation 28 | higher_is_better: true 29 | - metric: gpt_eval_llava_detail 30 | aggregation: !function utils.llava_detail_aggregation 31 | higher_is_better: true 32 | - metric: gpt_eval_llava_complex 33 | aggregation: !function utils.llava_complex_aggregation 34 | higher_is_better: true 35 | metadata: 36 | version: 0.0 37 | gpt_eval_model_name: "gpt-4-0613" 38 | model_specific_prompt_kwargs: 39 | default: 40 | pre_prompt: "" 41 | post_prompt: "" 42 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/japanese_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | 2 | dataset_path: "gagan3012/multilingual-llava-bench" 3 | dataset_kwargs: 4 | config: japanese 5 | token: True 6 | task: "llava_in_the_wild_japanese" 7 | test_split: train 8 | output_type: generate_until 9 | doc_to_visual: !function utils.llava_doc_to_visual 10 | doc_to_text: !function utils.llava_doc_to_text 11 | doc_to_target: "gpt_answer" 12 | generation_kwargs: 13 | until: 14 | - "ASSISTANT:" 15 | image_aspect_ratio: original 16 | max_new_tokens: 1024 17 | temperature: 0 18 | top_p: 0 19 | num_beams: 1 20 | do_sample: false 21 | process_results: !function 
utils.llava_process_results 22 | metric_list: 23 | - metric: gpt_eval_llava_all 24 | aggregation: !function utils.llava_all_aggregation 25 | higher_is_better: true 26 | - metric: gpt_eval_llava_conv 27 | aggregation: !function utils.llava_conv_aggregation 28 | higher_is_better: true 29 | - metric: gpt_eval_llava_detail 30 | aggregation: !function utils.llava_detail_aggregation 31 | higher_is_better: true 32 | - metric: gpt_eval_llava_complex 33 | aggregation: !function utils.llava_complex_aggregation 34 | higher_is_better: true 35 | metadata: 36 | version: 0.0 37 | gpt_eval_model_name: "gpt-4-0613" 38 | model_specific_prompt_kwargs: 39 | default: 40 | pre_prompt: "" 41 | post_prompt: "" 42 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/russian_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | 2 | dataset_path: "gagan3012/multilingual-llava-bench" 3 | dataset_kwargs: 4 | config: russian 5 | token: True 6 | task: "llava_in_the_wild_russian" 7 | test_split: train 8 | output_type: generate_until 9 | doc_to_visual: !function utils.llava_doc_to_visual 10 | doc_to_text: !function utils.llava_doc_to_text 11 | doc_to_target: "gpt_answer" 12 | generation_kwargs: 13 | until: 14 | - "ASSISTANT:" 15 | image_aspect_ratio: original 16 | max_new_tokens: 1024 17 | temperature: 0 18 | top_p: 0 19 | num_beams: 1 20 | do_sample: false 21 | process_results: !function utils.llava_process_results 22 | metric_list: 23 | - metric: gpt_eval_llava_all 24 | aggregation: !function utils.llava_all_aggregation 25 | higher_is_better: true 26 | - metric: gpt_eval_llava_conv 27 | aggregation: !function utils.llava_conv_aggregation 28 | higher_is_better: true 29 | - metric: gpt_eval_llava_detail 30 | aggregation: !function utils.llava_detail_aggregation 31 | higher_is_better: true 32 | - metric: gpt_eval_llava_complex 33 | aggregation: !function utils.llava_complex_aggregation 34 | higher_is_better: true 35 | metadata: 36 | version: 0.0 37 | gpt_eval_model_name: "gpt-4-0613" 38 | model_specific_prompt_kwargs: 39 | default: 40 | pre_prompt: "" 41 | post_prompt: "" 42 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/spanish_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | 2 | dataset_path: "gagan3012/multilingual-llava-bench" 3 | dataset_kwargs: 4 | config: spanish 5 | token: True 6 | task: "llava_in_the_wild_spanish" 7 | test_split: train 8 | output_type: generate_until 9 | doc_to_visual: !function utils.llava_doc_to_visual 10 | doc_to_text: !function utils.llava_doc_to_text 11 | doc_to_target: "gpt_answer" 12 | generation_kwargs: 13 | until: 14 | - "ASSISTANT:" 15 | image_aspect_ratio: original 16 | max_new_tokens: 1024 17 | temperature: 0 18 | top_p: 0 19 | num_beams: 1 20 | do_sample: false 21 | process_results: !function utils.llava_process_results 22 | metric_list: 23 | - metric: gpt_eval_llava_all 24 | aggregation: !function utils.llava_all_aggregation 25 | higher_is_better: true 26 | - metric: gpt_eval_llava_conv 27 | aggregation: !function utils.llava_conv_aggregation 28 | higher_is_better: true 29 | - metric: gpt_eval_llava_detail 30 | aggregation: !function utils.llava_detail_aggregation 31 | higher_is_better: true 32 | - metric: gpt_eval_llava_complex 33 | aggregation: !function 
utils.llava_complex_aggregation 34 | higher_is_better: true 35 | metadata: 36 | version: 0.0 37 | gpt_eval_model_name: "gpt-4-0613" 38 | model_specific_prompt_kwargs: 39 | default: 40 | pre_prompt: "" 41 | post_prompt: "" 42 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/urdu_llava_in_the_wild.yaml: -------------------------------------------------------------------------------- 1 | 2 | dataset_path: "gagan3012/multilingual-llava-bench" 3 | dataset_kwargs: 4 | config: urdu 5 | token: True 6 | task: "llava_in_the_wild_urdu" 7 | test_split: train 8 | output_type: generate_until 9 | doc_to_visual: !function utils.llava_doc_to_visual 10 | doc_to_text: !function utils.llava_doc_to_text 11 | doc_to_target: "gpt_answer" 12 | generation_kwargs: 13 | until: 14 | - "ASSISTANT:" 15 | image_aspect_ratio: original 16 | max_new_tokens: 1024 17 | temperature: 0 18 | top_p: 0 19 | num_beams: 1 20 | do_sample: false 21 | process_results: !function utils.llava_process_results 22 | metric_list: 23 | - metric: gpt_eval_llava_all 24 | aggregation: !function utils.llava_all_aggregation 25 | higher_is_better: true 26 | - metric: gpt_eval_llava_conv 27 | aggregation: !function utils.llava_conv_aggregation 28 | higher_is_better: true 29 | - metric: gpt_eval_llava_detail 30 | aggregation: !function utils.llava_detail_aggregation 31 | higher_is_better: true 32 | - metric: gpt_eval_llava_complex 33 | aggregation: !function utils.llava_complex_aggregation 34 | higher_is_better: true 35 | metadata: 36 | version: 0.0 37 | gpt_eval_model_name: "gpt-4-0613" 38 | model_specific_prompt_kwargs: 39 | default: 40 | pre_prompt: "" 41 | post_prompt: "" 42 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/nocaps/_default_template_nocaps_yaml: -------------------------------------------------------------------------------- 1 | model_specific_prompt_kwargs: 2 | default: 3 | prompt: "Provide a one-sentence caption for the provided image." 
-------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/nocaps/nocaps.yaml: -------------------------------------------------------------------------------- 1 | group : nocaps 2 | task: 3 | - nocaps_test 4 | - nocaps_val -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/nocaps/nocaps_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/NoCaps 2 | dataset_kwargs: 3 | token: True 4 | task : "nocaps_test" 5 | group : "nocaps_caption" 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.nocaps_doc_to_visual 9 | doc_to_text: !function utils.nocaps_doc_to_text 10 | doc_to_target: "annotations_captions" 11 | generation_kwargs: 12 | max_new_tokens: 64 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.nocaps_test_process_result 18 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: nocaps_passthrough 21 | aggregation : !function utils.nocaps_test_aggregation_result 22 | higher_is_better : true 23 | metadata: 24 | - version: 0.0 25 | include: _default_template_nocaps_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/nocaps/nocaps_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/NoCaps 2 | dataset_kwargs: 3 | token: True 4 | task: "nocaps_val" 5 | group : "nocaps_caption" 6 | test_split: validation 7 | output_type: generate_until 8 | doc_to_visual: !function utils.nocaps_doc_to_visual 9 | doc_to_text: !function utils.nocaps_doc_to_text 10 | doc_to_target: "annotations_captions" 11 | generation_kwargs: 12 | max_new_tokens: 64 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.nocaps_process_result 18 | # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: nocaps_Bleu_4 21 | aggregation : !function utils.nocaps_bleu4 22 | higher_is_better : true 23 | - metric: nocaps_Bleu_3 24 | aggregation : !function utils.nocaps_bleu3 25 | higher_is_better : true 26 | - metric: nocaps_Bleu_2 27 | aggregation : !function utils.nocaps_bleu2 28 | higher_is_better : true 29 | - metric: nocaps_Bleu_1 30 | aggregation : !function utils.nocaps_bleu1 31 | higher_is_better : true 32 | - metric: nocaps_METEOR 33 | aggregation : !function utils.nocaps_meteor 34 | higher_is_better : true 35 | - metric: nocaps_ROUGE_L 36 | aggregation : !function utils.nocaps_rougel 37 | higher_is_better : true 38 | - metric: nocaps_CIDEr 39 | aggregation : !function utils.nocaps_cider 40 | higher_is_better : true 41 | #- metric: nocaps_SPICE 42 | # aggregation : !function utils.nocaps_spice 43 | # higher_is_better : true 44 | metadata: 45 | - version: 0.0 46 | include: _default_template_nocaps_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ocrbench/ocrbench.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: echo840/OCRBench 2 | dataset_kwargs: 3 | token: True 4 | task: "ocrbench" 5 | test_split: test 6 | output_type: generate_until 7 |
doc_to_visual: !function utils.ocrbench_doc_to_visual 8 | doc_to_text: !function utils.ocrbench_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | max_new_tokens: 128 12 | temperature: 0 13 | top_p: 0 14 | num_beams: 1 15 | do_sample: false 16 | process_results: !function utils.ocrbench_process_results 17 | metric_list: 18 | - metric: ocrbench_accuracy 19 | aggregation: !function utils.ocrbench_aggregate_accuracy 20 | higher_is_better: true 21 | metadata: 22 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ok_vqa/_default_template_vqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/OK-VQA 2 | output_type: generate_until 3 | doc_to_visual: !function utils.ok_vqa_doc_to_visual 4 | doc_to_text: !function utils.ok_vqa_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | metric_list: 10 | - metric: exact_match 11 | aggregation: mean 12 | higher_is_better: true 13 | ignore_case: true 14 | ignore_punctuation: true 15 | - metric: submission 16 | aggregation: !function utils.ok_vqa_aggreate_submissions 17 | higher_is_better: true 18 | process_results: !function utils.ok_vqa_process_results 19 | model_specific_prompt_kwargs: 20 | default: 21 | pre_prompt: "" 22 | post_prompt: "\nWhen the provided information is insufficient, respond with 'Unanswerable'.\nAnswer the question using a single word or phrase." 23 | metadata: 24 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ok_vqa/_generate_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | splits = ["val2014"] 5 | tasks = ["vqa"] 6 | 7 | if __name__ == "__main__": 8 | dump_tasks = [] 9 | for task in tasks: 10 | for split in splits: 11 | yaml_dict = {"group": f"ok_vqa", "task": f"ok_vqa_{split}", "include": f"_default_template_{task}_yaml", "test_split": split} 12 | if split == "train": 13 | yaml_dict.pop("group") 14 | else: 15 | dump_tasks.append(f"ok_vqa_{split}") 16 | 17 | save_path = f"./ok_vqa_{split}.yaml" 18 | print(f"Saving to {save_path}") 19 | with open(save_path, "w") as f: 20 | yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) 21 | 22 | group_dict = {"group": "ok_vqa", "task": dump_tasks} 23 | 24 | with open("./_ok_vqa.yaml", "w") as f: 25 | yaml.dump(group_dict, f, default_flow_style=False, indent=4) 26 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ok_vqa/_ok_vqa.yaml: -------------------------------------------------------------------------------- 1 | group: ok_vqa 2 | task: 3 | - ok_vqa_val2014 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ok_vqa/ok_vqa_val2014.yaml: -------------------------------------------------------------------------------- 1 | group: ok_vqa 2 | task: ok_vqa_val2014 3 | test_split: val2014 4 | include: _default_template_vqa_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/ok_vqa/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import json 4 | import yaml 5 | import pathlib 6 | import logging 7 | import datetime 8 | import statistics 9 | 10 | from lmms_eval.tasks._task_utils.file_utils 
import generate_submission_file 11 | from lmms_eval.tasks._task_utils.vqa_eval_metric import EvalAIAnswerProcessor 12 | 13 | eval_logger = logging.getLogger("lmms-eval") 14 | 15 | 16 | def ok_vqa_doc_to_visual(doc): 17 | return [doc["image"].convert("RGB")] 18 | 19 | 20 | def ok_vqa_process_results(doc, result): 21 | eval_ai_processor = EvalAIAnswerProcessor() 22 | assert len(result) == 1, f"The result should be a list of length 1, but got {len(result)}." 23 | resAns = eval_ai_processor(result[0]) 24 | accuracy = 0 25 | 26 | if "answers" in doc and doc["answers"] is not None: 27 | gtAcc = [] 28 | 29 | for i in range(len(doc["answers"])): 30 | doc["answers"][i] = eval_ai_processor(doc["answers"][i]) 31 | 32 | for i in range(len(doc["answers"])): 33 | otherGTAns = [doc["answers"][j] for j in range(len(doc["answers"])) if i != j] 34 | matchingAns = [item for item in otherGTAns if item == resAns] 35 | acc = min(1, float(len(matchingAns)) / 3) 36 | gtAcc.append(acc) 37 | if gtAcc: 38 | accuracy = statistics.mean(gtAcc) 39 | else: 40 | accuracy = 0 41 | 42 | return { 43 | "exact_match": accuracy, 44 | "submission": { 45 | "image": f"{doc['question_id']}.jpg", 46 | "answer": resAns, 47 | }, 48 | } 49 | 50 | 51 | def ok_vqa_doc_to_text(doc, model_specific_prompt_kwargs=None): 52 | question = doc["question"] 53 | if model_specific_prompt_kwargs is None: 54 | model_specific_prompt_kwargs = {} 55 | pre_prompt = "" 56 | post_prompt = "" 57 | if "pre_prompt" in model_specific_prompt_kwargs: 58 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 59 | if "post_prompt" in model_specific_prompt_kwargs: 60 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 61 | return f"{pre_prompt}{question}{post_prompt}" 62 | 63 | 64 | def ok_vqa_aggreate_submissions(results, args): 65 | now_date_time = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S") 66 | file = f"ok_vqa-test-submission-{now_date_time}.json" 67 | path = generate_submission_file(file, args) 68 | with open(path, "w") as f: 69 | json.dump(results, f) 70 | print(f"Submission file saved to {path}") 71 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/olympiadbench/cn_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import datetime 4 | from lmms_eval.tasks.olympiadbench.olympiadbench_evals import OlympiadBenchEvaluator 5 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file 6 | 7 | import logging 8 | 9 | eval_logger = logging.getLogger("lmms-eval") 10 | dir_name = os.path.dirname(os.path.abspath(__file__)) 11 | 12 | olympiadbench_evaluator = OlympiadBenchEvaluator() 13 | 14 | 15 | def olympiadbench_doc_to_visual(doc): 16 | return [image.convert("RGB") for image in doc["images"]] 17 | 18 | 19 | def olympiadbench_doc_to_text(doc): 20 | question = doc["question"] 21 | subject = doc["subfield"] 22 | mul_ans = doc["is_multiple_answer"] 23 | if mul_ans is None: 24 | mul_ans = False 25 | ans_type = doc["answer_type"] 26 | if ans_type == "Need_human_evaluate": 27 | ans_type = "proof based" 28 | 29 | pre_prompt = f"以下是中国{subject}竞赛中的解答题。\n" 30 | 31 | post_prompt = "" 32 | if not mul_ans: 33 | post_prompt += f"答案类型为{ans_type}。\n" 34 | else: 35 | post_prompt += f"题目有多个答案,答案类型均为{ans_type}。\n" 36 | post_prompt += "请根据题目的要求和所提供的信息计算得出答案。解答过程和结果中使用的变量和公式请使用LaTeX格式表示。请在最后以" 37 | if not mul_ans: 38 | post_prompt += '"所以最终答案是\\boxed{答案}。"\n' 39 | else: 40 | post_prompt += 
'"所以最终答案是\\boxed{用英⽂逗号连接的多个答案}。"\n' 41 | 42 | final_question = pre_prompt + question + "\n" + post_prompt 43 | return final_question 44 | 45 | 46 | def olympiadbench_process_results(doc, results): 47 | precision = doc["error"] 48 | is_proving = "TP" in doc["source"] 49 | if precision is None: 50 | precision = 0 51 | prediction = results[0].strip() 52 | 53 | if is_proving: 54 | return {"submission": prediction} 55 | else: 56 | prediction = prediction.split("所以最终答案是")[-1] 57 | prediction = prediction.replace('"', "").replace("\n", "").replace(" ", "").strip(".").strip("。") 58 | accuracy = olympiadbench_evaluator.judge(prediction, doc["final_answer"][0], precision) 59 | accuracy = int(accuracy) 60 | return {"exact_match": accuracy} 61 | 62 | 63 | def olympiadbench_aggregate_results(results, args): 64 | now_date_time = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S") 65 | submission_file_name = f"olympiadbench-test-cn-submission-{now_date_time}.json" 66 | path = generate_submission_file(submission_file_name, args) 67 | with open(path, "w") as f: 68 | json.dump(results, f, ensure_ascii=False) 69 | print(f"Submission file saved to {path}") 70 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/olympiadbench/olympiadbench.yaml: -------------------------------------------------------------------------------- 1 | group: olympiadbench 2 | task: 3 | - olympiadbench_test_en 4 | - olympiadbench_test_cn 5 | metadata: 6 | - version: 0.0 7 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/olympiadbench/olympiadbench_test_cn.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/OlympiadBench 2 | dataset_kwargs: 3 | token: True 4 | task : "olympiadbench_test_cn" 5 | test_split: test_cn 6 | output_type: generate_until 7 | doc_to_visual: !function cn_utils.olympiadbench_doc_to_visual 8 | doc_to_text: !function cn_utils.olympiadbench_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 1024 14 | temperature: 0 15 | top_p: 0 16 | num_beams: 1 17 | do_sample: false 18 | process_results: !function cn_utils.olympiadbench_process_results 19 | metric_list: 20 | - metric: submission 21 | aggregation: !function cn_utils.olympiadbench_aggregate_results 22 | higher_is_better: true 23 | - metric: exact_match 24 | aggregation: mean 25 | higher_is_better: true -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/olympiadbench/olympiadbench_test_en.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/OlympiadBench 2 | dataset_kwargs: 3 | token: True 4 | task : "olympiadbench_test_en" 5 | test_split: test_en 6 | output_type: generate_until 7 | doc_to_visual: !function en_utils.olympiadbench_doc_to_visual 8 | doc_to_text: !function en_utils.olympiadbench_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 1024 14 | temperature: 0 15 | top_p: 0 16 | num_beams: 1 17 | do_sample: false 18 | process_results: !function en_utils.olympiadbench_process_results 19 | metric_list: 20 | - metric: submission 21 | aggregation: !function en_utils.olympiadbench_aggregate_results 22 | higher_is_better: true 23 | - metric: exact_match 24 | aggregation: mean 25 | higher_is_better: true 
-------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/pope/pope.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/POPE 2 | dataset_kwargs: 3 | token: True 4 | task: "pope" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.pope_doc_to_visual 8 | doc_to_text: !function utils.pope_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | max_new_tokens: 128 12 | temperature: 0 13 | top_p: 0 14 | num_beams: 1 15 | do_sample: false 16 | process_results: !function utils.pope_process_results 17 | metric_list: 18 | - metric: pope_accuracy 19 | aggregation: !function utils.pope_aggregate_accuracy 20 | higher_is_better: true 21 | - metric: pope_precision 22 | aggregation: !function utils.pope_aggregate_precision 23 | higher_is_better: true 24 | - metric: pope_recall 25 | aggregation: !function utils.pope_aggregate_recall 26 | higher_is_better: true 27 | - metric: pope_f1_score 28 | aggregation: !function utils.pope_aggregate_f1_score 29 | higher_is_better: true 30 | - metric: pope_yes_ratio 31 | aggregation: !function utils.pope_aggregate_yes_ratio 32 | higher_is_better: true 33 | metadata: 34 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/realworldqa/realworldqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RealWorldQA 2 | dataset_kwargs: 3 | token: True 4 | task: "realworldqa" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.realworldqa_doc_to_visual 8 | doc_to_text: !function utils.realworldqa_doc_to_text 9 | doc_to_target: "answer" 10 | 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | 18 | filter_list: 19 | - name: "flexible-extract" 20 | filter: 21 | - function: !function utils.NumberWordsToDigitsFilter 22 | - function: !function utils.MultiChoiceRegexFilter 23 | group_select: 0 24 | ignore_case: true 25 | ignore_punctuation: true 26 | regex_pattern: "(\\([A-Z]\\))" 27 | 28 | metric_list: 29 | - metric: exact_match 30 | aggregation: mean 31 | higher_is_better: true 32 | ignore_case: true 33 | ignore_punctuation: true 34 | 35 | model_specific_prompt_kwargs: 36 | default: 37 | pre_prompt: "" 38 | post_prompt: "" 39 | gpt4v: 40 | pre_prompt: "" 41 | post_prompt: "" 42 | metadata: 43 | - version: 0.0 44 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/_default_template_bbox_rec_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOPlus 2 | output_type: generate_until 3 | process_docs: !function utils_rec.refcoco_bbox_rec_preprocess_dataset 4 | doc_to_visual: !function utils_rec.refcoco_bbox_rec_doc_to_visual 5 | doc_to_text: !function utils_rec.refcoco_bbox_rec_doc_to_text 6 | doc_to_target: "bbox" 7 | generation_kwargs: 8 | until: 9 | - "ASSISTANT:" 10 | process_results: !function utils_rec.refcoco_bbox_rec_process_result 11 | metric_list: 12 | - metric: refcoco_IoU 13 | aggregation : !function utils_rec.refcoco_bbox_rec_iou 14 | higher_is_better : true 15 | - metric: refcoco_ACC@0.1 16 | aggregation : !function utils_rec.refcoco_bbox_rec_acc01 17 | higher_is_better : true 18 | - metric: refcoco_ACC@0.3 19 | aggregation : !function 
utils_rec.refcoco_bbox_rec_acc03 20 | higher_is_better : true 21 | - metric: refcoco_ACC@0.5 22 | aggregation : !function utils_rec.refcoco_bbox_rec_acc05 23 | higher_is_better : true 24 | - metric: refcoco_ACC@0.7 25 | aggregation : !function utils_rec.refcoco_bbox_rec_acc07 26 | higher_is_better : true 27 | - metric: refcoco_ACC@0.9 28 | aggregation : !function utils_rec.refcoco_bbox_rec_acc09 29 | higher_is_better : true 30 | - metric: refcoco_Center_ACC 31 | aggregation : !function utils_rec.refcoco_bbox_rec_center_acc 32 | higher_is_better : true 33 | metadata: 34 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/_default_template_bbox_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOplus 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_bbox_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/_default_template_seg_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOplus 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_seg_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' 
-------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/_generate_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | # splits = ["train", "val", "testA", "testB"] 5 | splits = ["val", "testA", "testB"] 6 | tasks = ["seg", "bbox"] 7 | 8 | if __name__ == "__main__": 9 | dump_tasks = [] 10 | for task in tasks: 11 | for split in splits: 12 | yaml_dict = {"group": f"refcoco+_{task}", "task": f"refcoco+_{task}_{split}", "include": f"_default_template_{task}_yaml", "test_split": split} 13 | if split == "train": 14 | yaml_dict.pop("group") 15 | else: 16 | dump_tasks.append(f"refcoco+_{task}_{split}") 17 | 18 | save_path = f"./refcoco+_{task}_{split}.yaml" 19 | print(f"Saving to {save_path}") 20 | with open(save_path, "w") as f: 21 | yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) 22 | 23 | group_dict = {"group": "refcoco+", "task": dump_tasks} 24 | 25 | with open("./_refcoco.yaml", "w") as f: 26 | yaml.dump(group_dict, f, default_flow_style=False, indent=4) 27 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/_refcoco.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+ 2 | task: 3 | - refcoco+_seg_val 4 | - refcoco+_seg_testA 5 | - refcoco+_seg_testB 6 | - refcoco+_bbox_val 7 | - refcoco+_bbox_testA 8 | - refcoco+_bbox_testB 9 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_rec_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox_rec 2 | task: refcoco+_bbox_rec_testA 3 | include: _default_template_bbox_rec_yaml 4 | test_split: testA 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_rec_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox_rec 2 | task: refcoco+_bbox_rec_testB 3 | include: _default_template_bbox_rec_yaml 4 | test_split: testB 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_rec_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox_rec 2 | task: refcoco+_bbox_rec_val 3 | include: _default_template_bbox_rec_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox 2 | task: refcoco+_bbox_testA 3 | include: _default_template_bbox_yaml 4 | test_split: testA 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox 2 | task: refcoco+_bbox_testB 3 | include: _default_template_bbox_yaml 4 | test_split: testB 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_bbox 2 | task:
refcoco+_bbox_val 3 | include: _default_template_bbox_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_seg_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_seg 2 | task: refcoco+_seg_testA 3 | include: _default_template_seg_yaml 4 | test_split: testA 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_seg_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_seg 2 | task: refcoco+_seg_testB 3 | include: _default_template_seg_yaml 4 | test_split: testB 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_seg_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco+_seg 2 | task: refcoco+_seg_val 3 | include: _default_template_seg_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/_default_template_bbox_rec_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCO 2 | output_type: generate_until 3 | process_docs: !function utils_rec.refcoco_bbox_rec_preprocess_dataset 4 | doc_to_visual: !function utils_rec.refcoco_bbox_rec_doc_to_visual 5 | doc_to_text: !function utils_rec.refcoco_bbox_rec_doc_to_text 6 | doc_to_target: "bbox" 7 | generation_kwargs: 8 | until: 9 | - "ASSISTANT:" 10 | process_results: !function utils_rec.refcoco_bbox_rec_process_result 11 | metric_list: 12 | - metric: refcoco_IoU 13 | aggregation : !function utils_rec.refcoco_bbox_rec_iou 14 | higher_is_better : true 15 | - metric: refcoco_ACC@0.1 16 | aggregation : !function utils_rec.refcoco_bbox_rec_acc01 17 | higher_is_better : true 18 | - metric: refcoco_ACC@0.3 19 | aggregation : !function utils_rec.refcoco_bbox_rec_acc03 20 | higher_is_better : true 21 | - metric: refcoco_ACC@0.5 22 | aggregation : !function utils_rec.refcoco_bbox_rec_acc05 23 | higher_is_better : true 24 | - metric: refcoco_ACC@0.7 25 | aggregation : !function utils_rec.refcoco_bbox_rec_acc07 26 | higher_is_better : true 27 | - metric: refcoco_ACC@0.9 28 | aggregation : !function utils_rec.refcoco_bbox_rec_acc09 29 | higher_is_better : true 30 | - metric: refcoco_Center_ACC 31 | aggregation : !function utils_rec.refcoco_bbox_rec_center_acc 32 | higher_is_better : true 33 | metadata: 34 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/_default_template_bbox_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCO 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_bbox_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | 
higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/_default_template_seg_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCO 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_seg_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/_generate_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | # splits = ["train", "test", "val", "testA", "testB"] 5 | splits = ["test", "val", "testA", "testB"] 6 | tasks = ["seg", "bbox"] 7 | 8 | if __name__ == "__main__": 9 | dump_tasks = [] 10 | for task in tasks: 11 | for split in splits: 12 | yaml_dict = {"group": f"refcoco_{task}", "task": f"refcoco_{task}_{split}", "test_split": split, "include": f"_default_template_{task}_yaml"} 13 | if split == "train": 14 | yaml_dict.pop("group") 15 | else: 16 | dump_tasks.append(f"refcoco_{task}_{split}") 17 | 18 | save_path = f"./refcoco_{task}_{split}.yaml" 19 | print(f"Saving to {save_path}") 20 | with open(save_path, "w") as f: 21 | yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) 22 | 23 | group_dict = {"group": "refcoco", "task": dump_tasks} 24 | 25 | with open("./_refcoco.yaml", "w") as f: 26 | yaml.dump(group_dict, f, default_flow_style=False, indent=4) 27 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/_refcoco.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco 2 | task: 3 | - refcoco_seg_test 4 | - refcoco_seg_val 5 | - refcoco_seg_testA 6 | - refcoco_seg_testB 7 | - 
refcoco_bbox_test 8 | - refcoco_bbox_val 9 | - refcoco_bbox_testA 10 | - refcoco_bbox_testB 11 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_rec_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox_rec 2 | task: refcoco_bbox_rec_test 3 | test_split: test 4 | include: _default_template_bbox_rec_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_rec_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox_rec 2 | task: refcoco_bbox_rec_testA 3 | test_split: testA 4 | include: _default_template_bbox_rec_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_rec_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox_rec 2 | task: refcoco_bbox_rec_testB 3 | test_split: testB 4 | include: _default_template_bbox_rec_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_rec_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox_rec 2 | task: refcoco_bbox_rec_val 3 | test_split: val 4 | include: _default_template_bbox_rec_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox 2 | task: refcoco_bbox_test 3 | test_split: test 4 | include: _default_template_bbox_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox 2 | task: refcoco_bbox_testA 3 | test_split: testA 4 | include: _default_template_bbox_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox 2 | task: refcoco_bbox_testB 3 | test_split: testB 4 | include: _default_template_bbox_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_bbox 2 | task: refcoco_bbox_val 3 | test_split: val 4 | include: _default_template_bbox_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_seg 2 | task: refcoco_seg_test 3 | test_split: test 4 | include: _default_template_seg_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_testA.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_seg 2 | task: refcoco_seg_testA 3 | test_split: testA 4 | include: _default_template_seg_yaml 5 | 
-------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_testB.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_seg 2 | task: refcoco_seg_testB 3 | test_split: testB 4 | include: _default_template_seg_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcoco_seg 2 | task: refcoco_seg_val 3 | test_split: val 4 | include: _default_template_seg_yaml 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/_default_template_bbox_rec_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOg 2 | output_type: generate_until 3 | process_docs: !function utils_rec.refcoco_bbox_rec_preprocess_dataset 4 | doc_to_visual: !function utils_rec.refcoco_bbox_rec_doc_to_visual 5 | doc_to_text: !function utils_rec.refcoco_bbox_rec_doc_to_text 6 | doc_to_target: "bbox" 7 | generation_kwargs: 8 | until: 9 | - "ASSISTANT:" 10 | process_results: !function utils_rec.refcoco_bbox_rec_process_result 11 | metric_list: 12 | - metric: refcoco_IoU 13 | aggregation : !function utils_rec.refcoco_bbox_rec_iou 14 | higher_is_better : true 15 | - metric: refcoco_ACC@0.1 16 | aggregation : !function utils_rec.refcoco_bbox_rec_acc01 17 | higher_is_better : true 18 | - metric: refcoco_ACC@0.3 19 | aggregation : !function utils_rec.refcoco_bbox_rec_acc03 20 | higher_is_better : true 21 | - metric: refcoco_ACC@0.5 22 | aggregation : !function utils_rec.refcoco_bbox_rec_acc05 23 | higher_is_better : true 24 | - metric: refcoco_ACC@0.7 25 | aggregation : !function utils_rec.refcoco_bbox_rec_acc07 26 | higher_is_better : true 27 | - metric: refcoco_ACC@0.9 28 | aggregation : !function utils_rec.refcoco_bbox_rec_acc09 29 | higher_is_better : true 30 | - metric: refcoco_Center_ACC 31 | aggregation : !function utils_rec.refcoco_bbox_rec_center_acc 32 | higher_is_better : true 33 | metadata: 34 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/_default_template_bbox_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOg 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_bbox_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | 
higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/_default_template_seg_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/RefCOCOg 2 | output_type: generate_until 3 | doc_to_visual: !function utils.refcoco_seg_doc_to_visual 4 | doc_to_text: !function utils.refcoco_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.refcoco_process_result 10 | metric_list: 11 | - metric: refcoco_Bleu_4 12 | aggregation : !function utils.refcoco_bleu4 13 | higher_is_better : true 14 | - metric: refcoco_Bleu_3 15 | aggregation : !function utils.refcoco_bleu3 16 | higher_is_better : true 17 | - metric: refcoco_Bleu_2 18 | aggregation : !function utils.refcoco_bleu2 19 | higher_is_better : true 20 | - metric: refcoco_Bleu_1 21 | aggregation : !function utils.refcoco_bleu1 22 | higher_is_better : true 23 | - metric: refcoco_METEOR 24 | aggregation : !function utils.refcoco_meteor 25 | higher_is_better : true 26 | - metric: refcoco_ROUGE_L 27 | aggregation : !function utils.refcoco_rougel 28 | higher_is_better : true 29 | - metric: refcoco_CIDEr 30 | aggregation : !function utils.refcoco_cider 31 | higher_is_better : true 32 | #- metric: refcoco_SPICE 33 | # aggregation : !function utils.refcoco_spice 34 | # higher_is_better : true 35 | metadata: 36 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/_generate_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | # splits = ["train", "test", "val"] 5 | splits = ["test", "val"] 6 | tasks = ["seg", "bbox"] 7 | 8 | if __name__ == "__main__": 9 | dump_tasks = [] 10 | for task in tasks: 11 | for split in splits: 12 | yaml_dict = {"group": f"refcocog_{task}", "task": f"refcocog_{task}_{split}", "include": f"_default_template_{task}_yaml", "test_split": split} 13 | if split == "train": 14 | yaml_dict.pop("group") 15 | else: 16 | dump_tasks.append(f"refcocog_{task}_{split}") 17 | 18 | save_path = f"./refcocog_{task}_{split}.yaml" 19 | print(f"Saving to {save_path}") 20 | with open(save_path, "w") as f: 21 | yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) 22 | 23 | group_dict = {"group": "refcocog", "task": dump_tasks} 24 | 25 | with open("./_refcoco.yaml", "w") as f: 26 | yaml.dump(group_dict, f, default_flow_style=False, indent=4) 27 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/_refcoco.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog 2 | task: 3 | - refcocog_seg_test 4 | - refcocog_seg_val 5 | - refcocog_bbox_test 6 | - refcocog_bbox_val 7 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_bbox_rec_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_bbox_rec 2 | task: refcocog_bbox_rec_test 3 | include: _default_template_bbox_rec_yaml 4 | test_split: test 5 | --------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcocog/refcocog_bbox_rec_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_bbox_rec 2 | task: refcocog_bbox_rec_val 3 | include: _default_template_bbox_rec_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_bbox_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_bbox 2 | task: refcocog_bbox_test 3 | include: _default_template_bbox_yaml 4 | test_split: test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_bbox_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_bbox 2 | task: refcocog_bbox_val 3 | include: _default_template_bbox_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_seg_test.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_seg 2 | task: refcocog_seg_test 3 | include: _default_template_seg_yaml 4 | test_split: test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/refcocog/refcocog_seg_val.yaml: -------------------------------------------------------------------------------- 1 | group: refcocog_seg 2 | task: refcocog_seg_val 3 | include: _default_template_seg_yaml 4 | test_split: val 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/scienceqa/scienceqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ScienceQA 2 | dataset_name: ScienceQA-FULL 3 | task: "scienceqa" 4 | dataset_kwargs: 5 | token: True 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.sqa_doc_to_visual 9 | doc_to_text: !function utils.sqa_doc_to_text 10 | doc_to_target: !function utils.sqa_doc_to_target 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | temperature: 0 14 | do_sample: False 15 | metric_list: 16 | - metric: exact_match 17 | aggregation: mean 18 | higher_is_better: true 19 | ignore_case: true 20 | ignore_punctuation: true 21 | process_results: !function utils.sqa_process_results 22 | metadata: 23 | - version: 0.0 24 | 25 | model_specific_prompt_kwargs: 26 | default: 27 | format: default 28 | pre_prompt: "" 29 | post_prompt: "\nAnswer with the option's letter from the given choices directly." 
30 | qwen_vl: 31 | format: qwen_vl 32 | 33 | model_specific_generation_kwargs: 34 | llava: 35 | image_aspect_ratio: original 36 | 37 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/scienceqa/scienceqa_full.yaml: -------------------------------------------------------------------------------- 1 | group: scienceqa_full 2 | task: 3 | - scienceqa 4 | - scienceqa_img -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/scienceqa/scienceqa_img.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ScienceQA 2 | dataset_name: ScienceQA-IMG 3 | task: "scienceqa_img" 4 | dataset_kwargs: 5 | token: True 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.sqa_doc_to_visual 9 | doc_to_text: !function utils.sqa_doc_to_text 10 | doc_to_target: !function utils.sqa_doc_to_target 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | temperature: 0 14 | do_sample: False 15 | metric_list: 16 | - metric: exact_match 17 | aggregation: mean 18 | higher_is_better: true 19 | ignore_case: true 20 | ignore_punctuation: true 21 | process_results: !function utils.sqa_process_results 22 | metadata: 23 | - version: 0.0 24 | 25 | model_specific_prompt_kwargs: 26 | default: 27 | format: default 28 | pre_prompt: "" 29 | post_prompt: "\nAnswer with the option's letter from the given choices directly." 30 | qwen_vl: 31 | format: qwen_vl 32 | model_specific_generation_kwargs: 33 | llava: 34 | image_aspect_ratio: original 35 | 36 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/scienceqa/utils.py: -------------------------------------------------------------------------------- 1 | def sqa_doc_to_text(doc, model_specific_prompt_kwargs=None): 2 | context, question, choices = doc["hint"], doc["question"], doc["choices"] 3 | len_choices = len(choices) 4 | options = [chr(ord("A") + i) for i in range(len_choices)] 5 | choices_str = "\n".join([f"{option}. {choice}" for option, choice in zip(options, choices)]) 6 | if model_specific_prompt_kwargs["format"] == "default": 7 | if context: 8 | context = f"Context: {context}\n" 9 | 10 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 11 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 12 | return f"{pre_prompt}{context}{question}\n{choices_str}{post_prompt}" 13 | elif model_specific_prompt_kwargs["format"] == "qwen_vl": 14 | prompt = "Context: {}\nQuestion: {}\nOptions: {}\nAnswer:" 15 | context = context if context else "N/A" 16 | prompt = prompt.format(context, question, choices_str) 17 | return prompt 18 | else: 19 | raise ValueError(f"Unknown prompt format: {model_specific_prompt_kwargs}") 20 | 21 | 22 | def sqa_doc_to_visual(doc): 23 | if doc["image"] is None: 24 | return [] 25 | return [doc["image"].convert("RGB")] 26 | 27 | 28 | def sqa_doc_to_target(doc): 29 | len_choices = len(doc["choices"]) 30 | options = [chr(ord("A") + i) for i in range(len_choices)] 31 | return options[doc["answer"]] 32 | 33 | 34 | def sqa_process_results(doc, results): 35 | # I know this is weird, but it's how llava parse it. 36 | target = sqa_doc_to_target(doc) 37 | pred = results[0] 38 | if pred == target: 39 | return {"exact_match": 1.0} 40 | # pattern: ^[A-Z]\. 
.* 41 | if len(pred) >= 2 and pred[0].isupper() and pred[1] == ".": 42 | result = 1.0 if pred[0] == target else 0.0 43 | return {"exact_match": result} 44 | return {"exact_match": 0.0} 45 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/screenspot/_default_template_rec_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: rootsautomation/ScreenSpot 2 | output_type: generate_until 3 | doc_to_visual: !function utils_rec.screenspot_rec_doc_to_visual 4 | doc_to_text: !function utils_rec.screenspot_rec_doc_to_text 5 | doc_to_target: "bbox" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils_rec.screenspot_rec_process_result 10 | metric_list: 11 | - metric: screenspot_IoU 12 | aggregation : !function utils_rec.screenspot_rec_iou 13 | higher_is_better : true 14 | - metric: screenspot_ACC@0.1 15 | aggregation : !function utils_rec.screenspot_rec_acc01 16 | higher_is_better : true 17 | - metric: screenspot_ACC@0.3 18 | aggregation : !function utils_rec.screenspot_rec_acc03 19 | higher_is_better : true 20 | - metric: screenspot_ACC@0.5 21 | aggregation : !function utils_rec.screenspot_rec_acc05 22 | higher_is_better : true 23 | - metric: screenspot_ACC@0.7 24 | aggregation : !function utils_rec.screenspot_rec_acc07 25 | higher_is_better : true 26 | - metric: screenspot_ACC@0.9 27 | aggregation : !function utils_rec.screenspot_rec_acc09 28 | higher_is_better : true 29 | - metric: screenspot_Center_ACC 30 | aggregation : !function utils_rec.screenspot_rec_center_acc 31 | higher_is_better : true 32 | metadata: 33 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/screenspot/_default_template_reg_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: rootsautomation/ScreenSpot 2 | output_type: generate_until 3 | doc_to_visual: !function utils.screenspot_bbox_doc_to_visual 4 | doc_to_text: !function utils.screenspot_doc_to_text 5 | doc_to_target: "instruction" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | process_results: !function utils.screenspot_process_result 10 | metric_list: 11 | - metric: screenspot_CIDEr 12 | aggregation : !function utils.screenspot_cider 13 | higher_is_better : true 14 | metadata: 15 | version: '0.0' -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/screenspot/_screenspot.yaml: -------------------------------------------------------------------------------- 1 | group: screenspot 2 | task: 3 | - screenspot_reg_test 4 | - screenspot_rec_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/screenspot/screenspot_rec_test.yaml: -------------------------------------------------------------------------------- 1 | group: screenspot_rec 2 | task: screenspot_rec_test 3 | include: _default_template_rec_yaml 4 | test_split: test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/screenspot/screenspot_reg_test.yaml: -------------------------------------------------------------------------------- 1 | group: screenspot_reg 2 | task: screenspot_reg_test 3 | include: _default_template_reg_yaml 4 | test_split: test 5 | -------------------------------------------------------------------------------- 
/lmms-eval/lmms_eval/tasks/seedbench/seedbench.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/SEED-Bench 2 | dataset_kwargs: 3 | token: True 4 | task: "seedbench" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.seed_doc_to_visual 8 | doc_to_text: !function utils.seed_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | image_aspect_ratio: original 14 | # The return value of process_results will be used by metrics 15 | process_results: !function utils.seed_process_result 16 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 17 | metric_list: 18 | - metric: seed_image 19 | aggregation: !function utils.seed_aggregation_result 20 | higher_is_better: true 21 | - metric: seed_video 22 | aggregation: !function utils.seed_aggregation_result 23 | higher_is_better: true 24 | - metric: seed_all 25 | aggregation: !function utils.seed_aggregation_result 26 | higher_is_better: true 27 | metadata: 28 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/seedbench/seedbench_ppl.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/SEED-Bench 2 | dataset_kwargs: 3 | token: True 4 | task: "seedbench_ppl" 5 | test_split: test 6 | output_type: multiple_choice 7 | doc_to_visual: !function utils.seed_doc_to_visual 8 | doc_to_text: !function utils.seed_doc_to_text_mc 9 | doc_to_choice : !function utils.seed_doc_to_choice 10 | doc_to_target: !function utils.seed_doc_to_mc_target 11 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 12 | metric_list: 13 | - metric: acc 14 | metadata: 15 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/seedbench/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | def seed_doc_to_visual(doc): 5 | return [image.convert("RGB") for image in doc["image"]] 6 | 7 | 8 | def seed_doc_to_text(doc): 9 | question = doc["question"] 10 | question += "\n" + f"A. {doc['choice_a']}\n" 11 | question += f"B. {doc['choice_b']}\n" 12 | question += f"C. {doc['choice_c']}\n" 13 | question += f"D. {doc['choice_d']}" 14 | return f"{question}\nAnswer with the option's letter from the given choices directly." 
15 | 16 | 17 | def seed_process_result(doc, result): 18 | pred = result[0].strip() 19 | if len(pred) > 1: 20 | pred = pred[0] 21 | answer = doc["answer"] 22 | data_type = doc["data_type"] 23 | 24 | return {f"seed_{data_type}": {"pred": pred, "answer": answer, "question_id": doc["question_id"]}, f"seed_all": {"pred": pred, "answer": answer, "question_id": doc["question_id"]}} 25 | 26 | 27 | def seed_aggregation_result(results): 28 | total_count = 0 29 | total_correct = 0 30 | for result in results: 31 | if result["pred"] == result["answer"]: 32 | total_correct += 1 33 | total_count += 1 34 | return total_correct / total_count 35 | 36 | 37 | def seed_aggregation_result_all(results): 38 | score = seed_aggregation_result(results) 39 | stored_results = [] 40 | for result in results: 41 | stored_results.append({"question_id": result["question_id"], "prediction": result["pred"]}) 42 | with open("./seed_submission.json", "w") as f: 43 | json.dump(stored_results, f, indent=4) 44 | print("Storing files for seed_submission ...") 45 | 46 | return score 47 | 48 | 49 | def seed_doc_to_text_mc(doc): 50 | question = doc["question"] 51 | return f"{question} Answer :" 52 | 53 | 54 | def seed_doc_to_choice(doc): 55 | return [doc["choice_a"], doc["choice_b"], doc["choice_c"], doc["choice_d"]] 56 | 57 | 58 | def seed_doc_to_mc_target(doc): 59 | answer2choice = {"A": "choice_a", "B": "choice_b", "C": "choice_c", "D": "choice_d"} 60 | return doc[answer2choice[doc["answer"]]] 61 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/seedbench_2/seedbench_2.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/SEED-Bench-2 2 | dataset_kwargs: 3 | token: True 4 | task: "seedbench-2" 5 | test_split: test 6 | output_type: generate_until 7 | doc_to_visual: !function utils.seed_doc_to_visual 8 | doc_to_text: !function utils.seed_doc_to_text 9 | doc_to_target: "answer" 10 | generation_kwargs: 11 | until: 12 | - "ASSISTANT:" 13 | max_new_tokens: 16 14 | image_aspect_ratio: original 15 | # The return value of process_results will be used by metrics 16 | process_results: !function utils.seed_process_result 17 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 18 | metric_list: 19 | - metric: seed_Video 20 | aggregation: !function utils.seed_aggregation_result 21 | higher_is_better: true 22 | - metric: seed_Multiple_Images 23 | aggregation: !function utils.seed_aggregation_result 24 | higher_is_better: true 25 | - metric: seed_Image_&_Text_Generation 26 | aggregation: !function utils.seed_aggregation_result 27 | higher_is_better: true 28 | - metric: seed_Single_Image 29 | aggregation: !function utils.seed_aggregation_result 30 | higher_is_better: true 31 | - metric: seed_Image_Generation 32 | aggregation: !function utils.seed_aggregation_result 33 | higher_is_better: true 34 | - metric: seed_Interleaved_Image 35 | aggregation: !function utils.seed_aggregation_result 36 | higher_is_better: true 37 | - metric: seed_all 38 | aggregation: !function utils.seed_aggregation_result 39 | higher_is_better: true 40 | metadata: 41 | - version: 0.0 42 | 43 | model_specific_prompt_kwargs: 44 | llava : 45 | img_token : 46 | post_prompt : "Answer with the option's letter from the given choices directly." 47 | gpt4V : 48 | img_token : 49 | post_prompt : "Answer with the option's letter from the given choices directly." 
-------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/seedbench_2/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | def seed_doc_to_visual(doc): 5 | return [image.convert("RGB") for image in doc["image"]] 6 | 7 | 8 | def parse_choice_img(choice: str, img_token: str): 9 | if "jpg" in choice or "png" in choice: 10 | return img_token 11 | return choice 12 | 13 | 14 | def seed_doc_to_text(doc, model_specific_kwargs=None): 15 | question = doc["question"] 16 | question.replace("", model_specific_kwargs["img_token"]) 17 | question += "\n" + f"A. {parse_choice_img(doc['choice_a'], model_specific_kwargs['img_token'])}\n" 18 | question += f"B. {parse_choice_img(doc['choice_b'], model_specific_kwargs['img_token'])}\n" 19 | question += f"C. {parse_choice_img(doc['choice_c'], model_specific_kwargs['img_token'])}\n" 20 | question += f"D. {parse_choice_img(doc['choice_d'], model_specific_kwargs['img_token'])}" 21 | if doc["data_type"] == "Image Generation": 22 | num_img_in_question = len(doc["data_id"]) - 4 23 | prepend_tokens = [model_specific_kwargs["img_token"]] * num_img_in_question 24 | question = " ".join(prepend_tokens) + "\n" + question 25 | return f"{question}\n{model_specific_kwargs['post_prompt']}" 26 | 27 | 28 | def seed_process_result(doc, result): 29 | pred = result[0].strip() 30 | if len(pred) > 1: 31 | pred = pred[0] 32 | answer = doc["answer"] 33 | data_type = doc["data_type"].split(" ") 34 | data_type = "_".join(data_type) 35 | 36 | return {f"seed_{data_type}": {"pred": pred, "answer": answer, "question_id": doc["question_id"]}, f"seed_all": {"pred": pred, "answer": answer, "question_id": doc["question_id"]}} 37 | 38 | 39 | def seed_aggregation_result(results): 40 | total_count = 0 41 | total_correct = 0 42 | for result in results: 43 | if result["pred"] == result["answer"]: 44 | total_correct += 1 45 | total_count += 1 46 | return total_correct / total_count if total_count != 0 else 0 47 | 48 | 49 | def seed_aggregation_result_all(results): 50 | score = seed_aggregation_result(results) 51 | stored_results = [] 52 | for result in results: 53 | stored_results.append({"question_id": result["question_id"], "prediction": result["pred"]}) 54 | with open("./seed_submission.json", "w") as f: 55 | json.dump(stored_results, f, indent=4) 56 | print("Storing files for seed_submission ...") 57 | 58 | return score 59 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/stvqa/stvqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/ST-VQA 2 | task: "stvqa" 3 | test_split: test 4 | output_type: generate_until 5 | doc_to_visual: !function utils.stvqa_doc_to_visual 6 | doc_to_text: !function utils.stvqa_doc_to_text 7 | doc_to_target: "answers" 8 | generation_kwargs: 9 | max_new_tokens: 32 10 | temperature: 0 11 | do_sample: False 12 | process_results: !function utils.stvqa_process_results 13 | metric_list: 14 | - metric: submission 15 | aggregation: !function utils.stvqa_aggregate_submissions 16 | model_specific_prompt_kwargs: 17 | default: 18 | pre_prompt: "" 19 | post_prompt: "\nAnswer the question using a single word or phrase." 
20 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/stvqa/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | 5 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file 6 | 7 | 8 | def stvqa_doc_to_text(doc, model_specific_prompt_kwargs): 9 | question = doc["question"] 10 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 11 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 12 | return f"{pre_prompt}{question}{post_prompt}" 13 | 14 | 15 | def stvqa_doc_to_visual(doc): 16 | return [doc["image"].convert("RGB")] 17 | 18 | 19 | def stvqa_process_results(doc, results): 20 | answer = results[0] 21 | return {"submission": {"question_id": int(doc["question_id"]), "answer": answer}} 22 | 23 | 24 | def stvqa_aggregate_submissions(results, args): 25 | file = generate_submission_file("stvqa_test_for_submission.json", args) 26 | with open(file, "w") as f: 27 | json.dump(results, f) 28 | logging.getLogger("lmms-eval").info(f"Results saved to {file}") 29 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textcaps/_default_template_textcaps_yaml: -------------------------------------------------------------------------------- 1 | model_specific_prompt_kwargs: 2 | default: 3 | prompt: Provide a one-sentence caption for the provided image. -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textcaps/textcaps.yaml: -------------------------------------------------------------------------------- 1 | group : textcaps 2 | task: 3 | - textcaps_val 4 | - textcaps_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textcaps/textcaps_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/TextCaps 2 | dataset_kwargs: 3 | token: True 4 | task : "textcaps_test" 5 | group : "textcaps_caption" 6 | test_split: test 7 | output_type: generate_until 8 | doc_to_visual: !function utils.textcaps_doc_to_visual 9 | doc_to_text: !function utils.textcaps_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 64 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.textcaps_test_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: textcaps_passthrough 21 | aggregation : !function utils.textcaps_test_aggregation_result 22 | higher_is_better : true 23 | metadata: 24 | - version: 0.0 25 | include: _default_template_textcaps_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textcaps/textcaps_train.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/TextCaps 2 | dataset_kwargs: 3 | token: True 4 | task : "textcaps_train" 5 | group : "textcaps_caption" 6 | test_split: train 7 | output_type: generate_until 8 | doc_to_visual: !function utils.textcaps_doc_to_visual 9 | doc_to_text: !function utils.textcaps_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | until: 13 | - "ASSISTANT:" 14 | max_new_tokens: 1024 15 | temperature: 0 16 | top_p: 0 17 | 
num_beams: 1 18 | do_sample: false 19 | process_results: !function utils.textcaps_process_result 20 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 21 | metric_list: 22 | - metric: textcaps_Bleu_4 23 | aggregation : !function utils.textcaps_bleu4 24 | higher_is_better : true 25 | - metric: textcaps_Bleu_3 26 | aggregation : !function utils.textcaps_bleu3 27 | higher_is_better : true 28 | - metric: textcaps_Bleu_2 29 | aggregation : !function utils.textcaps_bleu2 30 | higher_is_better : true 31 | - metric: textcaps_Bleu_1 32 | aggregation : !function utils.textcaps_bleu1 33 | higher_is_better : true 34 | - metric: textcaps_METEOR 35 | aggregation : !function utils.textcaps_meteor 36 | higher_is_better : true 37 | - metric: textcaps_ROUGE_L 38 | aggregation : !function utils.textcaps_rougel 39 | higher_is_better : true 40 | - metric: textcaps_CIDEr 41 | aggregation : !function utils.textcaps_cider 42 | higher_is_better : true 43 | #- metric: textcaps_SPICE 44 | # aggregation : !function utils.textcaps_spice 45 | # higher_is_better : true 46 | metadata: 47 | - version: 0.0 48 | include: _default_template_textcaps_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textcaps/textcaps_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/TextCaps 2 | dataset_kwargs: 3 | token: True 4 | task: "textcaps_val" 5 | group : "textcaps_caption" 6 | test_split: val 7 | output_type: generate_until 8 | doc_to_visual: !function utils.textcaps_doc_to_visual 9 | doc_to_text: !function utils.textcaps_doc_to_text 10 | doc_to_target: "answer" 11 | generation_kwargs: 12 | max_new_tokens: 64 13 | temperature: 0 14 | top_p: 0 15 | num_beams: 1 16 | do_sample: false 17 | process_results: !function utils.textcaps_process_result 18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 19 | metric_list: 20 | - metric: textcaps_Bleu_4 21 | aggregation : !function utils.textcaps_bleu4 22 | higher_is_better : true 23 | - metric: textcaps_Bleu_3 24 | aggregation : !function utils.textcaps_bleu3 25 | higher_is_better : true 26 | - metric: textcaps_Bleu_2 27 | aggregation : !function utils.textcaps_bleu2 28 | higher_is_better : true 29 | - metric: textcaps_Bleu_1 30 | aggregation : !function utils.textcaps_bleu1 31 | higher_is_better : true 32 | - metric: textcaps_METEOR 33 | aggregation : !function utils.textcaps_meteor 34 | higher_is_better : true 35 | - metric: textcaps_ROUGE_L 36 | aggregation : !function utils.textcaps_rougel 37 | higher_is_better : true 38 | - metric: textcaps_CIDEr 39 | aggregation : !function utils.textcaps_cider 40 | higher_is_better : true 41 | #- metric: textcaps_SPICE 42 | # aggregation : !function utils.textcaps_spice 43 | # higher_is_better : true 44 | metadata: 45 | - version: 0.0 46 | include: _default_template_textcaps_yaml -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textvqa/_default_template_textvqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/textvqa 2 | output_type: generate_until 3 | doc_to_visual: !function utils.textvqa_doc_to_visual 4 | doc_to_text: !function utils.textvqa_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - 
"ASSISTANT:" 9 | process_results: !function utils.textvqa_process_results 10 | model_specific_prompt_kwargs: 11 | default: 12 | pre_prompt: "" 13 | post_prompt: "\nAnswer the question using a single word or phrase." 14 | ocr: true 15 | qwen_vl: 16 | pre_prompt: "" 17 | post_prompt: " Answer:" 18 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textvqa/_textvqa.yaml: -------------------------------------------------------------------------------- 1 | group: textvqa 2 | task: 3 | - textvqa_val 4 | - textvqa_test 5 | - textvqa_val_noocr -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textvqa/textvqa_test.yaml: -------------------------------------------------------------------------------- 1 | task: textvqa_test 2 | test_split: test 3 | metric_list: 4 | - metric: submission 5 | aggregation: !function utils.textvqa_aggreate_submissions 6 | higher_is_better: true 7 | include: _default_template_textvqa_yaml 8 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textvqa/textvqa_val.yaml: -------------------------------------------------------------------------------- 1 | task: textvqa_val 2 | test_split: validation 3 | metric_list: 4 | - metric: exact_match 5 | aggregation: mean 6 | higher_is_better: true 7 | ignore_case: true 8 | ignore_punctuation: true 9 | - metric: submission 10 | aggregation: !function utils.textvqa_aggreate_submissions 11 | higher_is_better: true 12 | include: _default_template_textvqa_yaml 13 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/textvqa/textvqa_val_noocr.yaml: -------------------------------------------------------------------------------- 1 | task: textvqa_val_noocr 2 | test_split: validation 3 | metric_list: 4 | - metric: exact_match 5 | aggregation: mean 6 | higher_is_better: true 7 | ignore_case: true 8 | ignore_punctuation: true 9 | - metric: submission 10 | aggregation: !function utils.textvqa_aggreate_submissions 11 | higher_is_better: true 12 | include: _default_template_textvqa_yaml 13 | model_specific_prompt_kwargs: 14 | default: 15 | pre_prompt: "" 16 | post_prompt: "\nAnswer the question using a single word or phrase." 17 | ocr: false 18 | qwen_vl: 19 | pre_prompt: "" 20 | post_prompt: " Answer:" 21 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/_default_template_vqa_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/VizWiz-VQA 2 | output_type: generate_until 3 | doc_to_visual: !function utils.vizwiz_vqa_doc_to_visual 4 | doc_to_text: !function utils.vizwiz_vqa_doc_to_text 5 | doc_to_target: "answer" 6 | generation_kwargs: 7 | until: 8 | - "ASSISTANT:" 9 | metadata: 10 | - version: 0.0 11 | model_specific_prompt_kwargs: 12 | default: 13 | pre_prompt: "" 14 | post_prompt: "\nWhen the provided information is insufficient, respond with 'Unanswerable'.\nAnswer the question using a single word or phrase." 
15 | process_results: !function utils.vizwiz_vqa_process_results 16 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/_generate_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | splits = ["val", "test"] 5 | tasks = ["vqa"] 6 | 7 | if __name__ == "__main__": 8 | dump_tasks = [] 9 | for task in tasks: 10 | for split in splits: 11 | yaml_dict = {"group": f"vizwiz_{task}", "task": f"vizwiz_{task}_{split}", "include": f"_default_template_{task}_yaml", "test_split": split} 12 | if split == "train": 13 | yaml_dict.pop("group") 14 | else: 15 | dump_tasks.append(f"vizwiz_{task}_{split}") 16 | 17 | save_path = f"./vizwiz_{task}_{split}.yaml" 18 | print(f"Saving to {save_path}") 19 | with open(save_path, "w") as f: 20 | yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) 21 | 22 | group_dict = {"group": "vizwiz_vqa", "task": dump_tasks} 23 | 24 | with open("./_vizwiz_vqa.yaml", "w") as f: 25 | yaml.dump(group_dict, f, default_flow_style=False, indent=4) 26 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/_vizwiz_vqa.yaml: -------------------------------------------------------------------------------- 1 | group: vizwiz_vqa 2 | task: 3 | - vizwiz_vqa_val 4 | - vizwiz_vqa_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import json 4 | import yaml 5 | import pathlib 6 | import logging 7 | import datetime 8 | import statistics 9 | 10 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file 11 | from lmms_eval.tasks._task_utils.vqa_eval_metric import EvalAIAnswerProcessor 12 | 13 | eval_logger = logging.getLogger("lmms-eval") 14 | 15 | 16 | def vizwiz_vqa_doc_to_visual(doc): 17 | return [doc["image"].convert("RGB")] 18 | 19 | 20 | def vizwiz_vqa_process_results(doc, result): 21 | eval_ai_processor = EvalAIAnswerProcessor() 22 | assert len(result) == 1, f"The result should be a list of length 1, but got {len(result)}." 
23 | resAns = eval_ai_processor(result[0]) 24 | accuracy = 0 25 | 26 | if "answers" in doc and doc["answers"] is not None: 27 | gtAcc = [] 28 | 29 | for i in range(len(doc["answers"])): 30 | doc["answers"][i] = eval_ai_processor(doc["answers"][i]) 31 | 32 | for i in range(len(doc["answers"])): 33 | otherGTAns = [doc["answers"][j] for j in range(len(doc["answers"])) if i != j] 34 | matchingAns = [item for item in otherGTAns if item == resAns] 35 | acc = min(1, float(len(matchingAns)) / 3) 36 | gtAcc.append(acc) 37 | if gtAcc: 38 | accuracy = statistics.mean(gtAcc) 39 | else: 40 | accuracy = 0 41 | 42 | return { 43 | "exact_match": accuracy, 44 | "submission": { 45 | "image": f"{doc['question_id']}.jpg", 46 | "answer": resAns, 47 | }, 48 | } 49 | 50 | 51 | def vizwiz_vqa_doc_to_text(doc, model_specific_prompt_kwargs=None): 52 | if model_specific_prompt_kwargs is None: 53 | model_specific_prompt_kwargs = {} 54 | pre_prompt = "" 55 | post_prompt = "" 56 | if "pre_prompt" in model_specific_prompt_kwargs: 57 | pre_prompt = model_specific_prompt_kwargs["pre_prompt"] 58 | if "post_prompt" in model_specific_prompt_kwargs: 59 | post_prompt = model_specific_prompt_kwargs["post_prompt"] 60 | text = f"{pre_prompt}{doc['question'].capitalize()}{post_prompt}" 61 | return text 62 | 63 | 64 | def vizwiz_vqa_aggreate_submissions(results, args): 65 | now_date_time = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S") 66 | submission_file_name = f"vizwiz_vqa-test-submission-{now_date_time}.json" 67 | path = generate_submission_file(submission_file_name, args) 68 | with open(path, "w") as f: 69 | json.dump(results, f) 70 | print(f"Submission file saved to {path}") 71 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_test.yaml: -------------------------------------------------------------------------------- 1 | group: vizwiz_vqa 2 | task: vizwiz_vqa_test 3 | test_split: test 4 | include: _default_template_vqa_yaml 5 | process_results: !function utils.vizwiz_vqa_process_results 6 | metric_list: 7 | # - metric: exact_match 8 | # aggregation: mean 9 | # higher_is_better: true 10 | # ignore_case: true 11 | # ignore_punctuation: true 12 | - metric: submission 13 | aggregation: !function utils.vizwiz_vqa_aggreate_submissions 14 | higher_is_better: true 15 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_val.yaml: -------------------------------------------------------------------------------- 1 | group: vizwiz_vqa 2 | task: vizwiz_vqa_val 3 | test_split: val 4 | include: _default_template_vqa_yaml 5 | metric_list: 6 | - metric: exact_match 7 | aggregation: mean 8 | higher_is_better: true 9 | ignore_case: true 10 | ignore_punctuation: true 11 | # - metric: submission 12 | # aggregation: !function utils.vizwiz_vqa_aggreate_submissions 13 | # higher_is_better: true -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vqav2/_default_template_vqav2_yaml: -------------------------------------------------------------------------------- 1 | dataset_path: lmms-lab/VQAv2 2 | dataset_kwargs: 3 | token: True 4 | output_type: generate_until 5 | doc_to_visual: !function utils.vqav2_doc_to_visual 6 | doc_to_text: !function utils.vqav2_doc_to_text 7 | doc_to_target: "answer" 8 | generation_kwargs: 9 | max_new_tokens: 16 10 | metadata: 11 | - version: 0.0 12 | model_specific_prompt_kwargs: 13 | default: 14 | 
pre_prompt: "" 15 | post_prompt: "\nAnswer the question using a single word or phrase." -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vqav2/_vqav2.yaml: -------------------------------------------------------------------------------- 1 | group: vqav2 2 | task: 3 | - vqav2_val 4 | - vqav2_test -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vqav2/vqav2_test.yaml: -------------------------------------------------------------------------------- 1 | task: "vqav2_test" 2 | include: _default_template_vqav2_yaml 3 | test_split: test 4 | metric_list: 5 | - metric: submission 6 | aggregation: !function utils.vqav2_aggreate_submissions 7 | higher_is_better: true 8 | process_results: !function utils.vqav2_process_results_test 9 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/vqav2/vqav2_val.yaml: -------------------------------------------------------------------------------- 1 | task: "vqav2_val" 2 | include: _default_template_vqav2_yaml 3 | test_split: validation 4 | metric_list: 5 | - metric: exact_match 6 | aggregation: mean 7 | higher_is_better: true 8 | ignore_case: true 9 | ignore_punctuation: true 10 | process_results: !function utils.vqav2_process_results_val 11 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/websrc/README.md: -------------------------------------------------------------------------------- 1 | # WebSRC 2 | 3 | ## Paper 4 | 5 | Title: WebSRC: A Dataset for Web-Based Structural Reading Comprehension 6 | 7 | Abstract: https://arxiv.org/abs/2101.09465 8 | 9 | Homepage: https://x-lance.github.io/WebSRC/# 10 | 11 | WebSRC is a dataset for web-based structural reading comprehension. 12 | Its full train/dev/test split contains over 400k questions across 6.4k webpages. 13 | This version of the dataset does not contain OCR or original HTML, it simply treats WebSRC as a image-and-text-based multimodal Q&A benchmark on webpage screenshots. 14 | 15 | ## Citation 16 | 17 | ```bibtex 18 | @inproceedings{chen2021websrc, 19 | title={WebSRC: A Dataset for Web-Based Structural Reading Comprehension}, 20 | author={Chen, Xingyu and Zhao, Zihan and Chen, Lu and Ji, Jiabao and Zhang, Danyang and Luo, Ao and Xiong, Yuxuan and Yu, Kai}, 21 | booktitle={Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing}, 22 | pages={4173--4185}, 23 | year={2021} 24 | } 25 | ``` 26 | 27 | ## Groups & Tasks 28 | 29 | ### Groups 30 | 31 | - `websrc`: Evaluates `websrc-val` and generates a submission file for `websrc-test`. 32 | 33 | ### Tasks 34 | 35 | - `websrc-val`: Given a question and a web page, predict the answer. 36 | - `websrc-test`: Given a question and a web page, predict the answer. Ground truth is not provided for this task. 37 | 38 | ## Metrics 39 | 40 | This task uses SQUAD-style evaluation metrics, of which F1 score over tokens is used. 41 | The orignal paper also uses Exact Match (EM) score, but this is not implemented here as that metric is more conducive for Encoder-only extraction models. 42 | 43 | ### F1 Score 44 | 45 | F1 Score is the harmonic mean of precision and recall. 46 | We calculate precision and recall at the token level, then compute the F1 score as normal using these values. 47 | 48 | ### Test Submission 49 | 50 | When evaluaing on the test split, a prediction JSON will be compiled instead of metrics computed. 
51 | Instructions for submission are available on the [WebSRC homepage](https://x-lance.github.io/WebSRC/#) and in their [Original GitHub Repo](https://github.com/X-LANCE/WebSRC-Baseline#obtain-test-result). -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/websrc/websrc.yaml: -------------------------------------------------------------------------------- 1 | group: websrc 2 | task: 3 | - websrc_val 4 | - websrc_test 5 | -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/websrc/websrc_test.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: rootsautomation/websrc-test 2 | task: "websrc_test" 3 | test_split: test 4 | output_type: generate_until 5 | doc_to_visual: !function utils.websrc_doc_to_visual 6 | doc_to_text: !function utils.websrc_doc_to_text 7 | doc_to_target: "answer" 8 | # The return value of process_results will be used by metrics 9 | process_results: !function utils.websrc_process_results 10 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | image_aspect_ratio: pad 14 | metric_list: 15 | - metric: submission 16 | aggregation: !function utils.websrc_test_aggregate_results_for_submission 17 | higher_is_better: true 18 | metadata: 19 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/lmms_eval/tasks/websrc/websrc_val.yaml: -------------------------------------------------------------------------------- 1 | dataset_path: rootsautomation/websrc 2 | task: "websrc_val" 3 | test_split: dev 4 | output_type: generate_until 5 | doc_to_visual: !function utils.websrc_doc_to_visual 6 | doc_to_text: !function utils.websrc_doc_to_text 7 | doc_to_target: "answer" 8 | # The return value of process_results will be used by metrics 9 | process_results: !function utils.websrc_process_results 10 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 11 | generation_kwargs: 12 | max_new_tokens: 16 13 | image_aspect_ratio: pad 14 | metric_list: 15 | - metric: websrc_squad_f1 16 | aggregation: !function utils.websrc_aggregate_results 17 | higher_is_better: true 18 | metadata: 19 | - version: 0.0 -------------------------------------------------------------------------------- /lmms-eval/miscs/llava_result_check.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/lmms-eval/miscs/llava_result_check.md -------------------------------------------------------------------------------- /lmms-eval/miscs/repr_scripts.sh: -------------------------------------------------------------------------------- 1 | # install lmms_eval without building dependencies 2 | cd lmms_eval; 3 | pip install --no-deps -U -e . 4 | 5 | # install LLaVA without building dependencies 6 | cd LLaVA 7 | pip install --no-deps -U -e . 8 | 9 | # install all the requirements that require for reproduce llava results 10 | pip install -r llava_repr_requirements.txt 11 | 12 | # Run and exactly reproduce llava_v1.5 results! 
13 | # mme as an example 14 | accelerate launch --num_processes=1 -m lmms_eval --model llava --model_args pretrained="liuhaotian/llava-v1.5-7b,use_flash_attention_2=False,device_map=auto" --tasks mme --batch_size 1 --log_samples --log_samples_suffix reproduce --output_path ./logs/ -------------------------------------------------------------------------------- /lmms-eval/miscs/script.sh: -------------------------------------------------------------------------------- 1 | accelerate launch --num_processes=1 -m lmms_eval --model llava --model_args pretrained="liuhaotian/llava-v1.5-7b" --tasks mme_llava_prompt --batch_size 1 --log_samples --log_samples_suffix debug --output_path ./logs/ 2 | 3 | 4 | gpu = 8 bs 1: 5 | 6 | llava (pretrained=llava-hf/llava-1.5-7b-hf), gen_kwargs: (), limit: None, num_fewshot: None, batch_size: 1 7 | | Tasks |Version|Filter|n-shot| Metric |Value| |Stderr | 8 | |----------------|-------|------|-----:|-----------|----:|---|------:| 9 | |mme_llava_prompt|Yaml |none | 0|exact_match| 1873|± |38.4331| 10 | 11 | gpu = 8 bs 1 use_flash_attention_2=True: 12 | 13 | 14 | 15 | 16 | 17 | gpu = 4 bs 1 use_flash_attention_2=True: 18 | 19 | 20 | 21 | accelerate launch --num_processes=8 -m lmms_eval --model llava --model_args pretrained="liuhaotian/llava-v1.5-13b" --tasks scienceqa --batch_size 1 --log_samples --log_samples_suffix debug --output_path ./logs/ 22 | -------------------------------------------------------------------------------- /lmms-eval/miscs/test_llava.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from PIL import Image 3 | 4 | import torch 5 | from transformers import AutoProcessor, LlavaForConditionalGeneration 6 | 7 | model_id = "llava-hf/llava-1.5-7b-hf" 8 | 9 | prompt_1 = "USER: \nWhat does this image show?\nASSISTANT:" 10 | prompt_2 = "USER: \nWhat is the difference between these two images?\nASSISTANT:" 11 | image_file_1 = "image1.png" 12 | image_file_2 = "image2.png" 13 | model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_flash_attention_2=True).to(0) 14 | processor = AutoProcessor.from_pretrained(model_id) 15 | raw_image_1 = Image.open(image_file_1) 16 | raw_image_2 = Image.open(image_file_2) 17 | inputs = processor([prompt_1, prompt_2], [raw_image_1, raw_image_1, raw_image_2], padding=True, return_tensors="pt").to(0, torch.float16) 18 | import pdb 19 | 20 | pdb.set_trace() 21 | output = model.generate(**inputs, max_new_tokens=200, do_sample=False) 22 | print(processor.batch_decode(output, skip_special_tokens=True)) 23 | -------------------------------------------------------------------------------- /lmms-eval/miscs/test_scienceqa.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | dataset = load_dataset("Otter-AI/ScienceQA", trust_remote_code=True)["test"] 4 | for doc in dataset: 5 | print(doc["id"]) 6 | -------------------------------------------------------------------------------- /lmms-eval/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 240 3 | 4 | [build-system] 5 | requires = ["setuptools>=42", "wheel", "setuptools_scm[tomli]>=6.3"] 6 | build-backend = "setuptools.build_meta" 7 | 8 | [project] 9 | name = "lmms_eval" 10 | version = "0.1.2" 11 | authors = [ 12 | { name = "LMMMs-Lab Evaluation Team", email = "lmms_eval@outlook.com" }, 13 | ] 14 | description = "A 
framework for evaluating large multi-modality language models" 15 | readme = "README.md" 16 | classifiers = [ 17 | "Programming Language :: Python :: 3", 18 | "License :: OSI Approved :: MIT License", 19 | "Operating System :: OS Independent", 20 | ] 21 | requires-python = ">=3.8" 22 | license = { text = "MIT" } 23 | dependencies = [ 24 | "accelerate>=0.21.0", 25 | "black==24.1.0", 26 | "datasets==2.16.1", 27 | "evaluate>=0.4.0", 28 | "jsonlines", 29 | "numexpr", 30 | "peft>=0.2.0", 31 | "pybind11>=2.6.2", 32 | "pytablewriter", 33 | "rouge-score>=0.0.4", 34 | "sacrebleu>=1.5.0", 35 | "scikit-learn>=0.24.1", 36 | "sqlitedict", 37 | "torch>=1.8", 38 | "openai>=1.0.0", 39 | "pycocoevalcap", 40 | "tqdm-multiprocess", 41 | "transformers", 42 | "zstandard", 43 | "pillow", 44 | "pyyaml", 45 | "sympy", 46 | "mpmath", 47 | "Jinja2", 48 | "openpyxl", 49 | "Levenshtein", 50 | "hf_transfer", 51 | "tenacity", 52 | "wandb>=0.16.0", 53 | "transformers-stream-generator", 54 | "tiktoken", 55 | "pre-commit", 56 | "pydantic", 57 | ] 58 | 59 | [tool.setuptools.packages.find] 60 | include = ["lmms_eval*"] 61 | 62 | [tool.setuptools.package-data] 63 | lmms_eval = ["**/*.yaml", "tasks/**/*"] 64 | 65 | [project.scripts] 66 | lmms-eval = "lmms_eval.__main__:cli_evaluate" 67 | lmms_eval = "lmms_eval.__main__:cli_evaluate" 68 | 69 | [project.urls] 70 | Homepage = "https://lmms-lab.github.io/lmms-eval-blog/" 71 | Repository = "https://github.com/EvolvingLMMs-Lab/lmms-eval" 72 | -------------------------------------------------------------------------------- /lmms-eval/setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | # This is to make sure that the package supports editable installs 4 | setuptools.setup() 5 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "llava" 7 | version = "1.2.2.post1" 8 | description = "Towards GPT-4 like large language and visual assistant." 
9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | "torch==2.1.2", "torchvision==0.16.2", 17 | "transformers==4.39.3", "tokenizers==0.15.1", "sentencepiece==0.1.99", "shortuuid", 18 | "accelerate==0.27.2", "peft", "bitsandbytes", 19 | "pydantic", "markdown2[all]", "numpy", "scikit-learn==1.2.2", 20 | "gradio==4.16.0", "gradio_client==0.8.1", 21 | "requests", "httpx==0.24.0", "uvicorn", "fastapi", 22 | "einops==0.6.1", "einops-exts==0.0.4", "timm==0.6.13", 23 | ] 24 | 25 | [project.optional-dependencies] 26 | train = ["deepspeed==0.12.6", "ninja", "wandb"] 27 | build = ["build", "twine"] 28 | 29 | [project.urls] 30 | "Homepage" = "https://llava-vl.github.io" 31 | "Bug Tracker" = "https://github.com/haotian-liu/LLaVA/issues" 32 | 33 | [tool.setuptools.packages.find] 34 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 35 | 36 | [tool.wheel] 37 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 38 | -------------------------------------------------------------------------------- /scripts/eval_lmms.sh: -------------------------------------------------------------------------------- 1 | # login to huggingface 2 | 3 | CKPT=${1:-"None"} 4 | conv_template=${2:-"vicuna_v1"} 5 | vistoken_patch_size=${3:-"None"} 6 | 7 | eval_tasks=${eval_tasks:-"textvqa,chartqa,docvqa"} 8 | master_port=${master_port:-"12345"} 9 | GPUS=`nvidia-smi -L | wc -l` 10 | 11 | echo $CKPT, $conv_template 12 | 13 | accelerate launch --num_processes=$GPUS --main_process_port=${master_port} -m lmms_eval --model llava \ 14 | --model_args pretrained=$CKPT,conv_template=${conv_template} \ 15 | --tasks $eval_tasks --batch_size 1 --log_samples --log_samples_suffix lmms_eval --output_path $CKPT/logs/ 16 | 17 | -------------------------------------------------------------------------------- /scripts/extract_mm_projector.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is just a utility that I use to extract the projector for quantized models. 3 | It is NOT necessary at all to train, or run inference/serve demos. 4 | Use this script ONLY if you fully understand its implications. 5 | """ 6 | 7 | 8 | import os 9 | import argparse 10 | import torch 11 | import json 12 | from collections import defaultdict 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser(description='Extract MMProjector weights') 17 | parser.add_argument('--model-path', type=str, help='model folder') 18 | parser.add_argument('--output', type=str, help='output file') 19 | args = parser.parse_args() 20 | return args 21 | 22 | 23 | if __name__ == '__main__': 24 | args = parse_args() 25 | 26 | keys_to_match = ['mm_projector'] 27 | ckpt_to_key = defaultdict(list) 28 | try: 29 | model_indices = json.load(open(os.path.join(args.model_path, 'pytorch_model.bin.index.json'))) 30 | for k, v in model_indices['weight_map'].items(): 31 | if any(key_match in k for key_match in keys_to_match): 32 | ckpt_to_key[v].append(k) 33 | except FileNotFoundError: 34 | # Smaller models or model checkpoints saved by DeepSpeed. 
--------------------------------------------------------------------------------
/scripts/extract_mm_projector.py:
--------------------------------------------------------------------------------
1 | """
2 | This is just a utility that I use to extract the projector for quantized models.
3 | It is NOT necessary at all to train, or run inference/serve demos.
4 | Use this script ONLY if you fully understand its implications.
5 | """
6 | 
7 | 
8 | import os
9 | import argparse
10 | import torch
11 | import json
12 | from collections import defaultdict
13 | 
14 | 
15 | def parse_args():
16 |     parser = argparse.ArgumentParser(description='Extract MMProjector weights')
17 |     parser.add_argument('--model-path', type=str, help='model folder')
18 |     parser.add_argument('--output', type=str, help='output file')
19 |     args = parser.parse_args()
20 |     return args
21 | 
22 | 
23 | if __name__ == '__main__':
24 |     args = parse_args()
25 | 
26 |     keys_to_match = ['mm_projector']
27 |     ckpt_to_key = defaultdict(list)
28 |     try:
29 |         model_indices = json.load(open(os.path.join(args.model_path, 'pytorch_model.bin.index.json')))
30 |         for k, v in model_indices['weight_map'].items():
31 |             if any(key_match in k for key_match in keys_to_match):
32 |                 ckpt_to_key[v].append(k)
33 |     except FileNotFoundError:
34 |         # Smaller models or model checkpoints saved by DeepSpeed.
35 |         v = 'pytorch_model.bin'
36 |         for k in torch.load(os.path.join(args.model_path, v), map_location='cpu').keys():
37 |             if any(key_match in k for key_match in keys_to_match):
38 |                 ckpt_to_key[v].append(k)
39 | 
40 |     loaded_weights = {}
41 | 
42 |     for ckpt_name, weight_keys in ckpt_to_key.items():
43 |         ckpt = torch.load(os.path.join(args.model_path, ckpt_name), map_location='cpu')
44 |         for k in weight_keys:
45 |             loaded_weights[k] = ckpt[k]
46 | 
47 |     torch.save(loaded_weights, args.output)
48 | 
--------------------------------------------------------------------------------
/scripts/merge_lora_weights.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from llava.model.builder import load_pretrained_model
3 | from llava.mm_utils import get_model_name_from_path
4 | 
5 | 
6 | def merge_lora(args):
7 |     model_name = get_model_name_from_path(args.model_path)
8 |     tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, device_map='cpu')
9 | 
10 |     model.save_pretrained(args.save_model_path)
11 |     tokenizer.save_pretrained(args.save_model_path)
12 | 
13 | 
14 | if __name__ == "__main__":
15 |     parser = argparse.ArgumentParser()
16 |     parser.add_argument("--model-path", type=str, required=True)
17 |     parser.add_argument("--model-base", type=str, required=True)
18 |     parser.add_argument("--save-model-path", type=str, required=True)
19 | 
20 |     args = parser.parse_args()
21 | 
22 |     merge_lora(args)
23 | 
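A hedged usage sketch for the two helper scripts above; all paths are placeholders. merge_lora_weights.py leaves the actual merging to load_pretrained_model (which is why --model-base is required) and then writes the consolidated model, while extract_mm_projector.py only needs the checkpoint folder and an output file for the projector weights.

# Placeholder paths throughout.
python scripts/merge_lora_weights.py \
    --model-path ./checkpoints/llava-lora-checkpoint \
    --model-base /path/to/base_model \
    --save-model-path ./checkpoints/merged_model
python scripts/extract_mm_projector.py --model-path ./checkpoints/merged_model --output ./mm_projector.bin
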
"bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "gather_16bit_weights_on_model_save": true 49 | }, 50 | "gradient_accumulation_steps": "auto", 51 | "gradient_clipping": "auto", 52 | "train_batch_size": "auto", 53 | "train_micro_batch_size_per_gpu": "auto", 54 | "steps_per_print": 1e5, 55 | "wall_clock_breakdown": false 56 | } --------------------------------------------------------------------------------