├── .devcontainer
    ├── Dockerfile
    ├── devcontainer.env
    ├── devcontainer.json
    └── postCreateCommand.sh
├── .dockerignore
├── .editorconfig
├── .gitattributes
├── .github
    └── ISSUE_TEMPLATE
    │   ├── 1-usage.yaml
    │   ├── 2-feature-request.yaml
    │   ├── 3-question.yaml
    │   └── 4-discussion.yaml
├── .gitignore
├── LICENSE
├── README.md
├── assets
    ├── deepstack_teaser.png
    ├── deepstack_vl.png
    ├── logo.png
    └── visualization.png
├── cog.yaml
├── docs
    └── MODEL_ZOO.md
├── images
    ├── demo_cli.gif
    ├── llava_example_cmp.png
    ├── llava_logo.png
    └── llava_v1_5_radar.jpg
├── llava
    ├── __init__.py
    ├── constants.py
    ├── conversation.py
    ├── eval
    │   ├── eval_gpt_review.py
    │   ├── eval_gpt_review_bench.py
    │   ├── eval_gpt_review_visual.py
    │   ├── eval_pope.py
    │   ├── eval_science_qa.py
    │   ├── eval_science_qa_gpt4.py
    │   ├── eval_science_qa_gpt4_requery.py
    │   ├── eval_textvqa.py
    │   ├── generate_webpage_data_from_table.py
    │   ├── m4c_evaluator.py
    │   ├── model_qa.py
    │   ├── model_vqa.py
    │   ├── model_vqa_loader.py
    │   ├── model_vqa_mmbench.py
    │   ├── model_vqa_science.py
    │   ├── qa_baseline_gpt35.py
    │   ├── run_llava.py
    │   ├── summarize_gpt_review.py
    │   └── webpage
    │   │   ├── figures
    │   │       ├── alpaca.png
    │   │       ├── bard.jpg
    │   │       ├── chatgpt.svg
    │   │       ├── llama.jpg
    │   │       ├── swords_FILL0_wght300_GRAD0_opsz48.svg
    │   │       └── vicuna.jpeg
    │   │   ├── index.html
    │   │   ├── script.js
    │   │   └── styles.css
    ├── mm_utils.py
    ├── model
    │   ├── __init__.py
    │   ├── apply_delta.py
    │   ├── builder.py
    │   ├── consolidate.py
    │   ├── deepstack_arch.py
    │   ├── language_model
    │   │   ├── custom_llama.py
    │   │   ├── custom_phi.py
    │   │   ├── deepstack_llama.py
    │   │   ├── deepstack_phi.py
    │   │   ├── llava_llama.py
    │   │   ├── llava_mistral.py
    │   │   ├── llava_mpt.py
    │   │   └── phi3
    │   │   │   ├── configuration_phi3.py
    │   │   │   └── modeling_phi3.py
    │   ├── llava_arch.py
    │   ├── make_delta.py
    │   ├── multimodal_encoder
    │   │   ├── builder.py
    │   │   └── clip_encoder.py
    │   ├── multimodal_projector
    │   │   └── builder.py
    │   └── utils.py
    ├── serve
    │   ├── __init__.py
    │   ├── cli.py
    │   ├── controller.py
    │   ├── examples
    │   │   ├── extreme_ironing.jpg
    │   │   └── waterview.jpg
    │   ├── gradio_web_server.py
    │   ├── model_worker.py
    │   ├── register_worker.py
    │   ├── sglang_worker.py
    │   └── test_message.py
    ├── train
    │   ├── llama_flash_attn_monkey_patch.py
    │   ├── llama_xformers_attn_monkey_patch.py
    │   ├── llava_trainer.py
    │   ├── train.py
    │   ├── train_mem.py
    │   └── train_xformers.py
    └── utils.py
├── lmms-eval
    ├── .gitignore
    ├── .pre-commit-config.yaml
    ├── README.md
    ├── docs
    │   ├── README.md
    │   ├── commands.md
    │   ├── model_guide.md
    │   └── task_guide.md
    ├── example_eval.yaml
    ├── llava_repr_requirements.txt
    ├── lmms_eval
    │   ├── __init__.py
    │   ├── __main__.py
    │   ├── api
    │   │   ├── __init__.py
    │   │   ├── filter.py
    │   │   ├── instance.py
    │   │   ├── metrics.py
    │   │   ├── model.py
    │   │   ├── registry.py
    │   │   ├── samplers.py
    │   │   └── task.py
    │   ├── evaluator.py
    │   ├── filters
    │   │   ├── __init__.py
    │   │   ├── decontamination.py
    │   │   ├── extraction.py
    │   │   ├── selection.py
    │   │   └── transformation.py
    │   ├── logging_utils.py
    │   ├── models
    │   │   ├── __init__.py
    │   │   ├── fuyu.py
    │   │   ├── gpt4v.py
    │   │   ├── instructblip.py
    │   │   ├── llava.py
    │   │   ├── llava_hf.py
    │   │   ├── llava_sglang.py
    │   │   ├── minicpm_v.py
    │   │   ├── model_utils
    │   │   │   ├── __init__.py
    │   │   │   └── qwen
    │   │   │   │   └── qwen_generate_utils.py
    │   │   └── qwen_vl.py
    │   ├── tasks
    │   │   ├── __init__.py
    │   │   ├── _task_utils
    │   │   │   ├── file_utils.py
    │   │   │   ├── gpt_eval_utils.py
    │   │   │   └── vqa_eval_metric.py
    │   │   ├── ai2d
    │   │   │   ├── ai2d.yaml
    │   │   │   ├── upload_ai2d.py
    │   │   │   └── utils.py
    │   │   ├── chartqa
    │   │   │   ├── chartqa.yaml
    │   │   │   ├── upload_chartqa.py
    │   │   │   └── utils.py
    │   │   ├── cmmmu
    │   │   │   ├── _cmmmu.yaml
    │   │   │   ├── _default_template_cmmmu_yaml
    │   │   │   ├── cmmmu_test.yaml
    │   │   │   ├── cmmmu_val.yaml
    │   │   │   └── utils.py
    │   │   ├── coco_cap
    │   │   │   ├── coco2014_cap.yaml
    │   │   │   ├── coco2014_cap_test.yaml
    │   │   │   ├── coco2014_cap_val.yaml
    │   │   │   ├── coco2017_cap.yaml
    │   │   │   ├── coco2017_cap_test.yaml
    │   │   │   ├── coco2017_cap_val.yaml
    │   │   │   ├── coco_cap.yaml
    │   │   │   └── utils.py
    │   │   ├── docvqa
    │   │   │   ├── _default_template_docvqa_yaml
    │   │   │   ├── docvqa.yaml
    │   │   │   ├── docvqa_test.yaml
    │   │   │   ├── docvqa_val.yaml
    │   │   │   └── utils.py
    │   │   ├── ferret
    │   │   │   ├── ferret.yaml
    │   │   │   └── utils.py
    │   │   ├── flickr30k
    │   │   │   ├── flickr30k.yaml
    │   │   │   ├── flickr30k_test.yaml
    │   │   │   └── utils.py
    │   │   ├── gqa
    │   │   │   ├── gqa.yaml
    │   │   │   └── utils.py
    │   │   ├── hallusion_bench
    │   │   │   ├── evaluate_hb.py
    │   │   │   ├── hallusion_bench_image.yaml
    │   │   │   └── utils.py
    │   │   ├── iconqa
    │   │   │   ├── _default_template_docvqa_yaml
    │   │   │   ├── iconqa.yaml
    │   │   │   ├── iconqa_test.yaml
    │   │   │   ├── iconqa_val.yaml
    │   │   │   └── utils.py
    │   │   ├── infovqa
    │   │   │   ├── _default_template_infovqa_yaml
    │   │   │   ├── infovqa.yaml
    │   │   │   ├── infovqa_test.yaml
    │   │   │   ├── infovqa_val.yaml
    │   │   │   └── utils.py
    │   │   ├── llava-bench-coco
    │   │   │   ├── llava-bench-coco.yaml
    │   │   │   └── utils.py
    │   │   ├── llava-in-the-wild
    │   │   │   ├── llava-in-the-wild.yaml
    │   │   │   └── utils.py
    │   │   ├── mathverse
    │   │   │   ├── mathverse.yaml
    │   │   │   ├── mathverse_evals.py
    │   │   │   ├── mathverse_testmini.yaml
    │   │   │   ├── mathverse_testmini_text_dominant.yaml
    │   │   │   ├── mathverse_testmini_text_lite.yaml
    │   │   │   ├── mathverse_testmini_text_only.yaml
    │   │   │   ├── mathverse_testmini_vision_dominant.yaml
    │   │   │   ├── mathverse_testmini_vision_intensive.yaml
    │   │   │   ├── mathverse_testmini_vision_only.yaml
    │   │   │   └── utils.py
    │   │   ├── mathvista
    │   │   │   ├── mathvista.yaml
    │   │   │   ├── mathvista_evals.py
    │   │   │   ├── mathvista_test.yaml
    │   │   │   ├── mathvista_testmini.yaml
    │   │   │   └── utils.py
    │   │   ├── mmbench
    │   │   │   ├── _default_template_mmbench_cn_yaml
    │   │   │   ├── _default_template_mmbench_en_yaml
    │   │   │   ├── cc_utils.py
    │   │   │   ├── cn_utils.py
    │   │   │   ├── en_utils.py
    │   │   │   ├── mmbench.yaml
    │   │   │   ├── mmbench_cc.yaml
    │   │   │   ├── mmbench_cn.yaml
    │   │   │   ├── mmbench_cn_dev.yaml
    │   │   │   ├── mmbench_cn_test.yaml
    │   │   │   ├── mmbench_en.yaml
    │   │   │   ├── mmbench_en_dev.yaml
    │   │   │   ├── mmbench_en_test.yaml
    │   │   │   └── mmbench_evals.py
    │   │   ├── mme
    │   │   │   ├── mme.yaml
    │   │   │   └── utils.py
    │   │   ├── mmmu
    │   │   │   ├── mmmu.yaml
    │   │   │   ├── mmmu_test.yaml
    │   │   │   ├── mmmu_val.yaml
    │   │   │   └── utils.py
    │   │   ├── mmvet
    │   │   │   ├── mmvet.yaml
    │   │   │   └── utils.py
    │   │   ├── multidocvqa
    │   │   │   ├── multidocvqa.yaml
    │   │   │   ├── multidocvqa_test.yaml
    │   │   │   ├── multidocvqa_val.yaml
    │   │   │   └── utils.py
    │   │   ├── multilingual-llava-bench-in-the-wild
    │   │   │   ├── README.md
    │   │   │   ├── _generate_configs.py
    │   │   │   ├── arabic_llava_in_the_wild.yaml
    │   │   │   ├── bengali_llava_in_the_wild.yaml
    │   │   │   ├── chinese_llava_in_the_wild.yaml
    │   │   │   ├── french_llava_in_the_wild.yaml
    │   │   │   ├── hindi_llava_in_the_wild.yaml
    │   │   │   ├── japanese_llava_in_the_wild.yaml
    │   │   │   ├── russian_llava_in_the_wild.yaml
    │   │   │   ├── spanish_llava_in_the_wild.yaml
    │   │   │   ├── urdu_llava_in_the_wild.yaml
    │   │   │   └── utils.py
    │   │   ├── nocaps
    │   │   │   ├── _default_template_nocaps_yaml
    │   │   │   ├── nocaps.yaml
    │   │   │   ├── nocaps_test.yaml
    │   │   │   ├── nocaps_val.yaml
    │   │   │   └── utils.py
    │   │   ├── ocrbench
    │   │   │   ├── ocrbench.yaml
    │   │   │   ├── upload_ocrbench.py
    │   │   │   └── utils.py
    │   │   ├── ok_vqa
    │   │   │   ├── _default_template_vqa_yaml
    │   │   │   ├── _generate_config.py
    │   │   │   ├── _ok_vqa.yaml
    │   │   │   ├── ok_vqa_val2014.yaml
    │   │   │   └── utils.py
    │   │   ├── olympiadbench
    │   │   │   ├── cn_utils.py
    │   │   │   ├── en_utils.py
    │   │   │   ├── olympiadbench.yaml
    │   │   │   ├── olympiadbench_evals.py
    │   │   │   ├── olympiadbench_test_cn.yaml
    │   │   │   └── olympiadbench_test_en.yaml
    │   │   ├── pope
    │   │   │   ├── pope.yaml
    │   │   │   └── utils.py
    │   │   ├── realworldqa
    │   │   │   ├── realworldqa.yaml
    │   │   │   └── utils.py
    │   │   ├── refcoco+
    │   │   │   ├── _default_template_bbox_rec_yaml
    │   │   │   ├── _default_template_bbox_yaml
    │   │   │   ├── _default_template_seg_yaml
    │   │   │   ├── _generate_config.py
    │   │   │   ├── _refcoco.yaml
    │   │   │   ├── refcoco+_bbox_rec_testA.yaml
    │   │   │   ├── refcoco+_bbox_rec_testB.yaml
    │   │   │   ├── refcoco+_bbox_rec_val.yaml
    │   │   │   ├── refcoco+_bbox_testA.yaml
    │   │   │   ├── refcoco+_bbox_testB.yaml
    │   │   │   ├── refcoco+_bbox_val.yaml
    │   │   │   ├── refcoco+_seg_testA.yaml
    │   │   │   ├── refcoco+_seg_testB.yaml
    │   │   │   ├── refcoco+_seg_val.yaml
    │   │   │   ├── utils.py
    │   │   │   └── utils_rec.py
    │   │   ├── refcoco
    │   │   │   ├── _default_template_bbox_rec_yaml
    │   │   │   ├── _default_template_bbox_yaml
    │   │   │   ├── _default_template_seg_yaml
    │   │   │   ├── _generate_config.py
    │   │   │   ├── _refcoco.yaml
    │   │   │   ├── refcoco_bbox_rec_test.yaml
    │   │   │   ├── refcoco_bbox_rec_testA.yaml
    │   │   │   ├── refcoco_bbox_rec_testB.yaml
    │   │   │   ├── refcoco_bbox_rec_val.yaml
    │   │   │   ├── refcoco_bbox_test.yaml
    │   │   │   ├── refcoco_bbox_testA.yaml
    │   │   │   ├── refcoco_bbox_testB.yaml
    │   │   │   ├── refcoco_bbox_val.yaml
    │   │   │   ├── refcoco_seg_test.yaml
    │   │   │   ├── refcoco_seg_testA.yaml
    │   │   │   ├── refcoco_seg_testB.yaml
    │   │   │   ├── refcoco_seg_val.yaml
    │   │   │   ├── utils.py
    │   │   │   └── utils_rec.py
    │   │   ├── refcocog
    │   │   │   ├── _default_template_bbox_rec_yaml
    │   │   │   ├── _default_template_bbox_yaml
    │   │   │   ├── _default_template_seg_yaml
    │   │   │   ├── _generate_config.py
    │   │   │   ├── _refcoco.yaml
    │   │   │   ├── refcocog_bbox_rec_test.yaml
    │   │   │   ├── refcocog_bbox_rec_val.yaml
    │   │   │   ├── refcocog_bbox_test.yaml
    │   │   │   ├── refcocog_bbox_val.yaml
    │   │   │   ├── refcocog_seg_test.yaml
    │   │   │   ├── refcocog_seg_val.yaml
    │   │   │   ├── utils.py
    │   │   │   └── utils_rec.py
    │   │   ├── scienceqa
    │   │   │   ├── scienceqa.yaml
    │   │   │   ├── scienceqa_full.yaml
    │   │   │   ├── scienceqa_img.yaml
    │   │   │   └── utils.py
    │   │   ├── screenspot
    │   │   │   ├── README.md
    │   │   │   ├── _default_template_rec_yaml
    │   │   │   ├── _default_template_reg_yaml
    │   │   │   ├── _screenspot.yaml
    │   │   │   ├── screenspot_rec_test.yaml
    │   │   │   ├── screenspot_reg_test.yaml
    │   │   │   ├── utils.py
    │   │   │   └── utils_rec.py
    │   │   ├── seedbench
    │   │   │   ├── seedbench.yaml
    │   │   │   ├── seedbench_ppl.yaml
    │   │   │   └── utils.py
    │   │   ├── seedbench_2
    │   │   │   ├── seedbench_2.yaml
    │   │   │   └── utils.py
    │   │   ├── stvqa
    │   │   │   ├── stvqa.yaml
    │   │   │   └── utils.py
    │   │   ├── textcaps
    │   │   │   ├── _default_template_textcaps_yaml
    │   │   │   ├── textcaps.yaml
    │   │   │   ├── textcaps_test.yaml
    │   │   │   ├── textcaps_train.yaml
    │   │   │   ├── textcaps_val.yaml
    │   │   │   └── utils.py
    │   │   ├── textvqa
    │   │   │   ├── _default_template_textvqa_yaml
    │   │   │   ├── _textvqa.yaml
    │   │   │   ├── textvqa_test.yaml
    │   │   │   ├── textvqa_val.yaml
    │   │   │   ├── textvqa_val_noocr.yaml
    │   │   │   └── utils.py
    │   │   ├── vizwiz_vqa
    │   │   │   ├── _default_template_vqa_yaml
    │   │   │   ├── _generate_config.py
    │   │   │   ├── _vizwiz_vqa.yaml
    │   │   │   ├── utils.py
    │   │   │   ├── vizwiz_vqa_test.yaml
    │   │   │   └── vizwiz_vqa_val.yaml
    │   │   ├── vqav2
    │   │   │   ├── _default_template_vqav2_yaml
    │   │   │   ├── _vqav2.yaml
    │   │   │   ├── utils.py
    │   │   │   ├── vqav2_test.yaml
    │   │   │   └── vqav2_val.yaml
    │   │   └── websrc
    │   │   │   ├── README.md
    │   │   │   ├── utils.py
    │   │   │   ├── websrc.yaml
    │   │   │   ├── websrc_test.yaml
    │   │   │   └── websrc_val.yaml
    │   └── utils.py
    ├── miscs
    │   ├── llava_result_check.md
    │   ├── repr_scripts.sh
    │   ├── repr_torch_envs.txt
    │   ├── scienceqa_id.txt
    │   ├── script.sh
    │   ├── test_llava.py
    │   └── test_scienceqa.py
    ├── pyproject.toml
    └── setup.py
├── predict.py
├── pyproject.toml
└── scripts
    ├── eval_lmms.sh
    ├── extract_mm_projector.py
    ├── merge_lora_weights.py
    ├── zero2.json
    ├── zero3.json
    └── zero3_offload.json


/.devcontainer/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM mcr.microsoft.com/devcontainers/base:ubuntu-20.04
 2 | 
 3 | SHELL [ "bash", "-c" ]
 4 | 
 5 | # update apt and install packages
 6 | RUN apt update && \
 7 |     apt install -yq \
 8 |         ffmpeg \
 9 |         dkms \
10 |         build-essential
11 | 
12 | # add user tools
13 | RUN sudo apt install -yq \
14 |         jq \
15 |         jp \
16 |         tree \
17 |         tldr
18 | 
19 | # add git-lfs and install
20 | RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash && \
21 |     sudo apt-get install -yq git-lfs && \
22 |     git lfs install
23 | 
24 | ############################################
25 | # Setup user
26 | ############################################
27 | 
28 | USER vscode
29 | 
30 | # install azcopy, a tool to copy to/from blob storage
31 | # for more info: https://learn.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-blobs-upload#upload-a-file
32 | RUN cd /tmp && \
33 |     wget https://azcopyvnext.azureedge.net/release20230123/azcopy_linux_amd64_10.17.0.tar.gz && \
34 |     tar xvf azcopy_linux_amd64_10.17.0.tar.gz && \
35 |     mkdir -p ~/.local/bin && \
36 |     mv azcopy_linux_amd64_10.17.0/azcopy ~/.local/bin && \
37 |     chmod +x ~/.local/bin/azcopy && \
38 |     rm -rf azcopy_linux_amd64*
39 | 
40 | # Setup conda
41 | RUN cd /tmp && \
42 |     wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
43 |     bash ./Miniconda3-latest-Linux-x86_64.sh -b && \
44 |     rm ./Miniconda3-latest-Linux-x86_64.sh
45 | 
46 | # Install dotnet
47 | RUN cd /tmp && \
48 |     wget https://dot.net/v1/dotnet-install.sh && \
49 |     chmod +x dotnet-install.sh && \
50 |     ./dotnet-install.sh --channel 7.0 && \
51 |     ./dotnet-install.sh --channel 3.1 && \
52 |     rm ./dotnet-install.sh
53 | 
54 | 


--------------------------------------------------------------------------------
/.devcontainer/devcontainer.env:
--------------------------------------------------------------------------------
1 | SAMPLE_ENV_VAR1="Sample Value"
2 | SAMPLE_ENV_VAR2=332431bf-68bf


--------------------------------------------------------------------------------
/.devcontainer/postCreateCommand.sh:
--------------------------------------------------------------------------------
 1 | git config --global safe.directory '*'
 2 | git config --global core.editor "code --wait"
 3 | git config --global pager.branch false
 4 | 
 5 | # Set AZCOPY concurrency to auto
 6 | echo "export AZCOPY_CONCURRENCY_VALUE=AUTO" >> ~/.zshrc
 7 | echo "export AZCOPY_CONCURRENCY_VALUE=AUTO" >> ~/.bashrc
 8 | 
 9 | # Activate conda by default
10 | echo ". /home/vscode/miniconda3/bin/activate" >> ~/.zshrc
11 | echo ". /home/vscode/miniconda3/bin/activate" >> ~/.bashrc
12 | 
13 | # Use llava environment by default
14 | echo "conda activate llava" >> ~/.zshrc
15 | echo "conda activate llava" >> ~/.bashrc
16 | 
17 | # Add dotnet to PATH
18 | echo 'export PATH="$PATH:$HOME/.dotnet"' >> ~/.bashrc
19 | echo 'export PATH="$PATH:$HOME/.dotnet"' >> ~/.zshrc
20 | 
21 | # Create and activate llava environment
22 | source /home/vscode/miniconda3/bin/activate
23 | conda create -y -q -n llava python=3.10
24 | conda activate llava
25 | 
26 | # Install Nvidia Cuda Compiler
27 | conda install -y -c nvidia cuda-compiler
28 | 
29 | pip install pre-commit==3.0.2
30 | 
31 | # Install package locally
32 | pip install --upgrade pip  # enable PEP 660 support
33 | pip install -e .
34 | 
35 | # Install additional packages for training
36 | pip install -e ".[train]"
37 | pip install flash-attn --no-build-isolation
38 | 
39 | # Download checkpoints to location outside of the repo
40 | git clone https://huggingface.co/liuhaotian/llava-v1.5-7b ~/llava-v1.5-7b
41 | 
42 | # Commented because it is unlikely for users to have enough local GPU memory to load the model
43 | # git clone https://huggingface.co/liuhaotian/llava-v1.5-13b ~/llava-v1.5-13b
44 | 
45 | echo "postCreateCommand.sh COMPLETE!"
46 | 


--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
 1 | # The .dockerignore file excludes files from the container build process.
 2 | #
 3 | # https://docs.docker.com/engine/reference/builder/#dockerignore-file
 4 | 
 5 | # Exclude Git files
 6 | .git
 7 | .github
 8 | .gitignore
 9 | 
10 | # Exclude Python cache files
11 | __pycache__
12 | .mypy_cache
13 | .pytest_cache
14 | .ruff_cache
15 | 
16 | # Exclude Python virtual environment
17 | /venv
18 | 
19 | # Exclude some weights
20 | /openai
21 | /liuhaotian
22 | 


--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
 1 | root = true
 2 | 
 3 | # Unix-style newlines with a newline ending every file
 4 | [*]
 5 | end_of_line = lf
 6 | insert_final_newline = true
 7 | trim_trailing_whitespace = true
 8 | charset = utf-8
 9 | 
10 | # 4 space indentation
11 | [*.{py,json}]
12 | indent_style = space
13 | indent_size = 4
14 | 
15 | # 2 space indentation
16 | [*.{md,sh,yaml,yml}]
17 | indent_style = space
18 | indent_size = 2


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
 1 | # https://git-scm.com/docs/gitattributes
 2 | 
 3 | # Set the default behavior, in case people don't have core.autocrlf set.
 4 | # https://git-scm.com/docs/gitattributes#_end_of_line_conversion
 5 | * text=auto
 6 | 
 7 | # common python attributes, taken from https://github.com/alexkaratarakis/gitattributes/blob/710900479a2bedeec7003d381719521ffbb18bf8/Python.gitattributes
 8 | # Source files
 9 | # ============
10 | *.pxd    text diff=python
11 | *.py     text diff=python
12 | *.py3    text diff=python
13 | *.pyw    text diff=python
14 | *.pyx    text diff=python
15 | *.pyz    text diff=python
16 | *.pyi    text diff=python
17 | 
18 | # Binary files
19 | # ============
20 | *.db     binary
21 | *.p      binary
22 | *.pkl    binary
23 | *.pickle binary
24 | *.pyc    binary export-ignore
25 | *.pyo    binary export-ignore
26 | *.pyd    binary
27 | 
28 | # Jupyter notebook
29 | *.ipynb  text eol=lf
30 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/1-usage.yaml:
--------------------------------------------------------------------------------
 1 | name: Usage issues
 2 | description: Report issues in usage.
 3 | title: "[Usage] "
 4 | body:
 5 |   - type: markdown
 6 |     attributes:
 7 |       value: |
 8 |         Thanks for taking the time to fill out this form.  Please give as detailed description as possible for us to better assist with the issue :)
 9 |   - type: textarea
10 |     id: what-happened
11 |     attributes:
12 |       label: Describe the issue
13 |       description: Please give as detailed description as possible for us to better assist with the issue.  Please paste the **FULL** error log here, so that we can better understand the issue. Wrap the log with ``` for better readability in GitHub.
14 |       placeholder: Issue
15 |       value: |
16 |         Issue:
17 |         
18 |         Command:
19 |         ```
20 |         PASTE THE COMMANDS HERE.
21 |         ```
22 |         
23 |         Log: 
24 |         ```
25 |         PASTE THE LOGS HERE.
26 |         ```
27 |         
28 |         Screenshots:
29 |         You may attach screenshots if it better explains the issue.
30 |     validations:
31 |       required: true
32 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/2-feature-request.yaml:
--------------------------------------------------------------------------------
 1 | name: Feature Request
 2 | description: Request for a new feature
 3 | title: "[Feature request] "
 4 | body:
 5 |   - type: markdown
 6 |     attributes:
 7 |       value: |
 8 |         Thanks for your interest in our work.  Please share your thoughts of the new features below.
 9 |   - type: textarea
10 |     id: feature
11 |     attributes:
12 |       label: feature
13 |       placeholder: Start your thoughts here...


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/3-question.yaml:
--------------------------------------------------------------------------------
 1 | name: Questions
 2 | description: General questions about the work
 3 | title: "[Question] "
 4 | body:
 5 |   - type: markdown
 6 |     attributes:
 7 |       value: |
 8 |         Thanks for your interest in our work.  For this type of question, it may be more suitable to go to [discussion](https://github.com/haotian-liu/LLaVA/discussions) sections.  If you believe an issue would be better for your request, please continue your post below :)
 9 |   - type: textarea
10 |     id: question
11 |     attributes:
12 |       label: Question
13 |       placeholder: Start question here...


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/4-discussion.yaml:
--------------------------------------------------------------------------------
 1 | name: Discussions
 2 | description: General discussions about the work
 3 | title: "[Discussion] "
 4 | body:
 5 |   - type: markdown
 6 |     attributes:
 7 |       value: |
 8 |         Thanks for your interest in our work.  For this type of question, it may be more suitable to go to [discussion](https://github.com/haotian-liu/LLaVA/discussions) sections.  If you believe an issue would be better for your request, please continue your post below :)
 9 |   - type: textarea
10 |     id: discussion
11 |     attributes:
12 |       label: Discussion
13 |       placeholder: Start discussion here...


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | data
 2 | debug.sh
 3 | eval_lmms.sh
 4 | 
 5 | # Python
 6 | __pycache__
 7 | *.pyc
 8 | *.egg-info
 9 | dist
10 | 
11 | # Log
12 | *.log
13 | *.log.*
14 | *.json
15 | *.jsonl
16 | 
17 | # Data
18 | !**/alpaca-data-conversation.json
19 | 
20 | # Editor
21 | .idea
22 | *.swp
23 | 
24 | # Other
25 | .DS_Store
26 | wandb
27 | output
28 | 
29 | checkpoints
30 | ckpts*
31 | 
32 | .ipynb_checkpoints
33 | *.ipynb
34 | 
35 | # DevContainer
36 | !.devcontainer/*
37 | 
38 | # Demo
39 | serve_images/
40 | 


--------------------------------------------------------------------------------
/assets/deepstack_teaser.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/assets/deepstack_teaser.png


--------------------------------------------------------------------------------
/assets/deepstack_vl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/assets/deepstack_vl.png


--------------------------------------------------------------------------------
/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/assets/logo.png


--------------------------------------------------------------------------------
/assets/visualization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/assets/visualization.png


--------------------------------------------------------------------------------
/cog.yaml:
--------------------------------------------------------------------------------
 1 | # Configuration for Cog ⚙️
 2 | # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
 3 | 
 4 | build:
 5 |   gpu: true
 6 | 
 7 |   python_version: "3.11"
 8 | 
 9 |   python_packages:
10 |     - "torch==2.0.1"
11 |     - "accelerate==0.21.0"
12 |     - "bitsandbytes==0.41.0"
13 |     - "deepspeed==0.9.5"
14 |     - "einops-exts==0.0.4"
15 |     - "einops==0.6.1"
16 |     - "gradio==3.35.2"
17 |     - "gradio_client==0.2.9"
18 |     - "httpx==0.24.0"
19 |     - "markdown2==2.4.10"
20 |     - "numpy==1.26.0"
21 |     - "peft==0.4.0"
22 |     - "scikit-learn==1.2.2"
23 |     - "sentencepiece==0.1.99"
24 |     - "shortuuid==1.0.11"
25 |     - "timm==0.6.13"
26 |     - "tokenizers==0.13.3"
27 |     - "torch==2.0.1"
28 |     - "torchvision==0.15.2"
29 |     - "transformers==4.31.0"
30 |     - "wandb==0.15.12"
31 |     - "wavedrom==2.0.3.post3"
32 |     - "Pygments==2.16.1"
33 |   run:
34 |     - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.0.3/pget" && chmod +x /usr/local/bin/pget
35 | 
36 | # predict.py defines how predictions are run on your model
37 | predict: "predict.py:Predictor"
38 | 


--------------------------------------------------------------------------------
/docs/MODEL_ZOO.md:
--------------------------------------------------------------------------------
 1 | # Model Zoo
 2 | 
 3 | If you are interested in including any other details in Model Zoo, please open an issue :)
 4 | 
 5 | ## DeepStack
 6 | 
 7 | | Version | LLM | Schedule | Checkpoint | VQAv2 | GQA | TextVQA | DocVQA | InfoVQA | SEED | POPE |
 8 | |----------|----------|-----------|-----------|---|---|---|---|---|---|---|
 9 | | DeepStack-L | Vicuna-7B | full_ft-1e | [menglc/deepstack-l-vicuna-7b](https://huggingface.co/menglc/deepstack-l-vicuna-7b) | 79.5| 63.1| 62.4| 39.1| 29.8| 60.6| 86.7 
10 | | DeepStack-L-HD | Vicuna-7B | full_ft-1e | [menglc/deepstack-l-hd-vicuna-7b](https://huggingface.co/menglc/deepstack-l-hd-vicuna-7b) | 82.0| 65.2| 66.7 | 78.8| 41.2| 63.6| 86.5
11 | 


--------------------------------------------------------------------------------
/images/demo_cli.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/images/demo_cli.gif


--------------------------------------------------------------------------------
/images/llava_example_cmp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/images/llava_example_cmp.png


--------------------------------------------------------------------------------
/images/llava_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/images/llava_logo.png


--------------------------------------------------------------------------------
/images/llava_v1_5_radar.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/images/llava_v1_5_radar.jpg


--------------------------------------------------------------------------------
/llava/__init__.py:
--------------------------------------------------------------------------------
1 | from .model import ModelSelect
2 | 


--------------------------------------------------------------------------------
/llava/constants.py:
--------------------------------------------------------------------------------
 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30
 2 | WORKER_HEART_BEAT_INTERVAL = 15
 3 | 
 4 | LOGDIR = "."
 5 | 
 6 | # Model Constants
 7 | IGNORE_INDEX = -100
 8 | IMAGE_TOKEN_INDEX = -200
 9 | DEFAULT_IMAGE_TOKEN = "<image>"
10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
11 | DEFAULT_IM_START_TOKEN = "<im_start>"
12 | DEFAULT_IM_END_TOKEN = "<im_end>"
13 | IMAGE_PLACEHOLDER = "<image-placeholder>"
14 | 


--------------------------------------------------------------------------------
/llava/eval/eval_textvqa.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | import json
 4 | import re
 5 | 
 6 | from llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator
 7 | 
 8 | 
 9 | def get_args():
10 |     parser = argparse.ArgumentParser()
11 |     parser.add_argument('--annotation-file', type=str)
12 |     parser.add_argument('--result-file', type=str)
13 |     parser.add_argument('--result-dir', type=str)
14 |     return parser.parse_args()
15 | 
16 | 
17 | def prompt_processor(prompt):
18 |     if prompt.startswith('OCR tokens: '):
19 |         pattern = r"Question: (.*?) Short answer:"
20 |         match = re.search(pattern, prompt, re.DOTALL)
21 |         question = match.group(1)
22 |     elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3:
23 |         if prompt.startswith('Reference OCR token:'):
24 |             question = prompt.split('\n')[1]
25 |         else:
26 |             question = prompt.split('\n')[0]
27 |     elif len(prompt.split('\n')) == 2:
28 |         question = prompt.split('\n')[0]
29 |     else:
30 |         assert False
31 | 
32 |     return question.lower()
33 | 
34 | 
35 | def eval_single(annotation_file, result_file):
36 |     experiment_name = os.path.splitext(os.path.basename(result_file))[0]
37 |     print(experiment_name)
38 |     annotations = json.load(open(annotation_file))['data']
39 |     annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations}
40 |     results = [json.loads(line) for line in open(result_file)]
41 | 
42 |     pred_list = []
43 |     for result in results:
44 |         annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))]
45 |         pred_list.append({
46 |             "pred_answer": result['text'],
47 |             "gt_answers": annotation['answers'],
48 |         })
49 | 
50 |     evaluator = TextVQAAccuracyEvaluator()
51 |     print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list)))
52 | 
53 | 
54 | if __name__ == "__main__":
55 |     args = get_args()
56 | 
57 |     if args.result_file is not None:
58 |         eval_single(args.annotation_file, args.result_file)
59 | 
60 |     if args.result_dir is not None:
61 |         for result_file in sorted(os.listdir(args.result_dir)):
62 |             if not result_file.endswith('.jsonl'):
63 |                 print(f'Skipping {result_file}')
64 |                 continue
65 |             eval_single(args.annotation_file, os.path.join(args.result_dir, result_file))
66 | 


--------------------------------------------------------------------------------
/llava/eval/webpage/figures/alpaca.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/llava/eval/webpage/figures/alpaca.png


--------------------------------------------------------------------------------
/llava/eval/webpage/figures/bard.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/llava/eval/webpage/figures/bard.jpg


--------------------------------------------------------------------------------
/llava/eval/webpage/figures/chatgpt.svg:
--------------------------------------------------------------------------------
1 | <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 2406 2406"><path d="M1 578.4C1 259.5 259.5 1 578.4 1h1249.1c319 0 577.5 258.5 577.5 577.4V2406H578.4C259.5 2406 1 2147.5 1 1828.6V578.4z" fill="#74aa9c"/><path d="M1107.3 299.1c-198 0-373.9 127.3-435.2 315.3C544.8 640.6 434.9 720.2 370.5 833c-99.3 171.4-76.6 386.9 56.4 533.8-41.1 123.1-27 257.7 38.6 369.2 98.7 172 297.3 260.2 491.6 219.2 86.1 97 209.8 152.3 339.6 151.8 198 0 373.9-127.3 435.3-315.3 127.5-26.3 237.2-105.9 301-218.5 99.9-171.4 77.2-386.9-55.8-533.9v-.6c41.1-123.1 27-257.8-38.6-369.8-98.7-171.4-297.3-259.6-491-218.6-86.6-96.8-210.5-151.8-340.3-151.2zm0 117.5-.6.6c79.7 0 156.3 27.5 217.6 78.4-2.5 1.2-7.4 4.3-11 6.1L952.8 709.3c-18.4 10.4-29.4 30-29.4 51.4V1248l-155.1-89.4V755.8c-.1-187.1 151.6-338.9 339-339.2zm434.2 141.9c121.6-.2 234 64.5 294.7 169.8 39.2 68.6 53.9 148.8 40.4 226.5-2.5-1.8-7.3-4.3-10.4-6.1l-360.4-208.2c-18.4-10.4-41-10.4-59.4 0L1024 984.2V805.4L1372.7 604c51.3-29.7 109.5-45.4 168.8-45.5zM650 743.5v427.9c0 21.4 11 40.4 29.4 51.4l421.7 243-155.7 90L597.2 1355c-162-93.8-217.4-300.9-123.8-462.8C513.1 823.6 575.5 771 650 743.5zm807.9 106 348.8 200.8c162.5 93.7 217.6 300.6 123.8 462.8l.6.6c-39.8 68.6-102.4 121.2-176.5 148.2v-428c0-21.4-11-41-29.4-51.4l-422.3-243.7 155-89.3zM1201.7 997l177.8 102.8v205.1l-177.8 102.8-177.8-102.8v-205.1L1201.7 997zm279.5 161.6 155.1 89.4v402.2c0 187.3-152 339.2-339 339.2v-.6c-79.1 0-156.3-27.6-217-78.4 2.5-1.2 8-4.3 11-6.1l360.4-207.5c18.4-10.4 30-30 29.4-51.4l.1-486.8zM1380 1421.9v178.8l-348.8 200.8c-162.5 93.1-369.6 38-463.4-123.7h.6c-39.8-68-54-148.8-40.5-226.5 2.5 1.8 7.4 4.3 10.4 6.1l360.4 208.2c18.4 10.4 41 10.4 59.4 0l421.9-243.7z" fill="white"/></svg>


--------------------------------------------------------------------------------
/llava/eval/webpage/figures/llama.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/llava/eval/webpage/figures/llama.jpg


--------------------------------------------------------------------------------
/llava/eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg:
--------------------------------------------------------------------------------
1 | <svg xmlns="http://www.w3.org/2000/svg" height="48" viewBox="0 96 960 960" width="48"><path d="m762.846 947.614-124.77-124.769-88 88-30.306-30.692q-16.616-16.231-16.616-40.077 0-23.846 16.616-40.461L708 611.385q16.23-16.231 40.076-16.231t40.462 16.231l30.307 30.691-88 88 124.154 124.77q8.615 8.615 8.615 20.23 0 11.616-8.615 20.231l-51.692 52.307q-8.615 9-20.231 9-11.615 0-20.23-9Zm97.153-624.076L412.768 771.153l27.847 28.077q16.231 16.616 16.231 40.462 0 23.846-16.231 40.077l-30.691 30.691-88-88-124.77 124.769q-8.615 9-20.23 9-11.616 0-20.231-9l-52.307-52.307q-9-8.615-9-20.23 0-11.616 9-20.231l124.769-124.769-88-88L171.847 611q16.231-16.23 40.077-16.23 23.846 0 40.461 16.23l28.462 28.232 447.615-447.231h131.537v131.537ZM323.846 483.769l33.769-34.154 34.154-34.153-34.154 34.153-33.769 34.154Zm-31.999 31.999-191.846-192.23V192.001h131.537l191.461 191.846-31.23 31.615-179.077-178.077h-67.307v67.307l178.461 179.077-31.999 31.999Zm87.691 222.77 435.077-433.846v-67.307h-67.307L312.231 670.846l67.307 67.692Zm0 0L346.385 704l-34.154-33.154L346.385 704l33.153 34.538Z"/></svg>


--------------------------------------------------------------------------------
/llava/eval/webpage/figures/vicuna.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/llava/eval/webpage/figures/vicuna.jpeg


--------------------------------------------------------------------------------
/llava/eval/webpage/styles.css:
--------------------------------------------------------------------------------
  1 | body {
  2 |     font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
  3 |     background-color: #f8f9fa;
  4 | }
  5 | 
  6 | .navbar-dark .navbar-nav .nav-link {
  7 |     color: #f1cf68;
  8 |     font-size: 1.1rem;
  9 |     padding: 0.5rem 0.6rem;
 10 | }
 11 | 
 12 | .card-header {
 13 |     font-weight: bold;
 14 | }
 15 | 
 16 | .card {
 17 |     box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
 18 |     transition: 0.3s;
 19 | }
 20 | 
 21 | .card:hover {
 22 |     box-shadow: 0 8px 16px rgba(0, 0, 0, 0.2);
 23 | }
 24 | 
 25 | button {
 26 |     transition: background-color 0.3s;
 27 | }
 28 | 
 29 | button:hover {
 30 |     background-color: #007bff;
 31 | }
 32 | 
 33 | @media (max-width: 767px) {
 34 |     .form-row .form-group {
 35 |         margin-bottom: 10px;
 36 |     }
 37 | }
 38 | 
 39 | /* Extra styles */
 40 | 
 41 | .expandable-card .card-text-container {
 42 |     max-height: 200px;
 43 |     overflow-y: hidden;
 44 |     position: relative;
 45 | }
 46 | 
 47 | .expandable-card.expanded .card-text-container {
 48 |     max-height: none;
 49 | }
 50 | 
 51 | .expand-btn {
 52 |     position: relative;
 53 |     display: none;
 54 |     background-color: rgba(255, 255, 255, 0.8);
 55 |     color: #510c75;
 56 |     border-color: transparent;
 57 | }
 58 | 
 59 | .expand-btn:hover {
 60 |     background-color: rgba(200, 200, 200, 0.8);
 61 |     text-decoration: none;
 62 |     border-color: transparent;
 63 |     color: #510c75;
 64 | }
 65 | 
 66 | .expand-btn:focus {
 67 |     outline: none;
 68 |     text-decoration: none;
 69 | }
 70 | 
 71 | .expandable-card:not(.expanded) .card-text-container:after {
 72 |     content: "";
 73 |     position: absolute;
 74 |     bottom: 0;
 75 |     left: 0;
 76 |     width: 100%;
 77 |     height: 90px;
 78 |     background: linear-gradient(rgba(255, 255, 255, 0.2), rgba(255, 255, 255, 1));
 79 | }
 80 | 
 81 | .expandable-card:not(.expanded) .expand-btn {
 82 |     margin-top: -40px;
 83 | }
 84 | 
 85 | .card-body {
 86 |     padding-bottom: 5px;
 87 | }
 88 | 
 89 | .vertical-flex-layout {
 90 |     justify-content: center;
 91 |     align-items: center;
 92 |     height: 100%;
 93 |     display: flex;
 94 |     flex-direction: column;
 95 |     gap: 5px;
 96 | }
 97 | 
 98 | .figure-img {
 99 |     max-width: 100%;
100 |     height: auto;
101 | }
102 | 
103 | .adjustable-font-size {
104 |     font-size: calc(0.5rem + 2vw);
105 | }
106 | 


--------------------------------------------------------------------------------
/llava/model/__init__.py:
--------------------------------------------------------------------------------
 1 | # try:
 2 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig
 3 | from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig
 4 | from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig
 5 | 
 6 | from .language_model.deepstack_llama import DeepstackLlamaForCausalLM, DeepstackConfig
 7 | from .language_model.deepstack_phi import DeepstackPhiForCausalLM, DeepstackPhiConfig
 8 | 
 9 | 
10 | MODEL_REGISTRY = {
11 |     'llama': DeepstackLlamaForCausalLM,
12 |     'phi-3': DeepstackPhiForCausalLM,
13 |     'phi3': DeepstackPhiForCausalLM,
14 | }
15 | 
16 | LLAVA_MODEL_REGISTRY = {
17 |     'llama': LlavaLlamaForCausalLM,
18 |     'mpt': LlavaMptForCausalLM,
19 |     'mistral': LlavaMistralForCausalLM,
20 | }
21 | 
22 | def ModelSelect(model_name_or_path):
23 |     model = None
24 | 
25 |     registry = MODEL_REGISTRY if not 'llava' in model_name_or_path.lower() else LLAVA_MODEL_REGISTRY
26 |     for name in registry.keys():
27 |         if name.lower() in model_name_or_path.lower():
28 |             model = registry[name]
29 |     if model is None:
30 |         model = registry['llama']
31 |     return model


--------------------------------------------------------------------------------
/llava/model/apply_delta.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Usage:
 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta
 4 | """
 5 | import argparse
 6 | 
 7 | import torch
 8 | from tqdm import tqdm
 9 | from transformers import AutoTokenizer, AutoModelForCausalLM
10 | from llava import LlavaLlamaForCausalLM
11 | 
12 | 
13 | def apply_delta(base_model_path, target_model_path, delta_path):
14 |     print("Loading base model")
15 |     base = AutoModelForCausalLM.from_pretrained(
16 |         base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
17 | 
18 |     print("Loading delta")
19 |     delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
20 |     delta_tokenizer = AutoTokenizer.from_pretrained(delta_path)
21 | 
22 |     print("Applying delta")
23 |     for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"):
24 |         if name not in base.state_dict():
25 |             assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model'
26 |             continue
27 |         if param.data.shape == base.state_dict()[name].shape:
28 |             param.data += base.state_dict()[name]
29 |         else:
30 |             assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \
31 |                 f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}'
32 |             bparam = base.state_dict()[name]
33 |             param.data[:bparam.shape[0], :bparam.shape[1]] += bparam
34 | 
35 |     print("Saving target model")
36 |     delta.save_pretrained(target_model_path)
37 |     delta_tokenizer.save_pretrained(target_model_path)
38 | 
39 | 
40 | if __name__ == "__main__":
41 |     parser = argparse.ArgumentParser()
42 |     parser.add_argument("--base-model-path", type=str, required=True)
43 |     parser.add_argument("--target-model-path", type=str, required=True)
44 |     parser.add_argument("--delta-path", type=str, required=True)
45 | 
46 |     args = parser.parse_args()
47 | 
48 |     apply_delta(args.base_model_path, args.target_model_path, args.delta_path)
49 | 


--------------------------------------------------------------------------------
/llava/model/consolidate.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Usage:
 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate
 4 | """
 5 | import argparse
 6 | 
 7 | import torch
 8 | from transformers import AutoTokenizer, AutoModelForCausalLM
 9 | from llava.model import *
10 | from llava.model.utils import auto_upgrade
11 | 
12 | 
13 | def consolidate_ckpt(src_path, dst_path):
14 |     print("Loading model")
15 |     auto_upgrade(src_path)
16 |     src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
17 |     src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False)
18 |     src_model.save_pretrained(dst_path)
19 |     src_tokenizer.save_pretrained(dst_path)
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     parser = argparse.ArgumentParser()
24 |     parser.add_argument("--src", type=str, required=True)
25 |     parser.add_argument("--dst", type=str, required=True)
26 | 
27 |     args = parser.parse_args()
28 | 
29 |     consolidate_ckpt(args.src, args.dst)
30 | 


--------------------------------------------------------------------------------
/llava/model/make_delta.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Usage:
 3 | python3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta
 4 | """
 5 | import argparse
 6 | 
 7 | import torch
 8 | from tqdm import tqdm
 9 | from transformers import AutoTokenizer, AutoModelForCausalLM
10 | from llava.model.utils import auto_upgrade
11 | 
12 | 
13 | def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id):
14 |     print("Loading base model")
15 |     base = AutoModelForCausalLM.from_pretrained(
16 |         base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
17 | 
18 |     print("Loading target model")
19 |     auto_upgrade(target_model_path)
20 |     target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
21 | 
22 |     print("Calculating delta")
23 |     for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"):
24 |         if name not in base.state_dict():
25 |             assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model'
26 |             continue
27 |         if param.data.shape == base.state_dict()[name].shape:
28 |             param.data -= base.state_dict()[name]
29 |         else:
30 |             assert name in ['model.embed_tokens.weight', 'lm_head.weight'], f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}'
31 |             bparam = base.state_dict()[name]
32 |             param.data[:bparam.shape[0], :bparam.shape[1]] -= bparam
33 | 
34 |     print("Saving delta")
35 |     if hub_repo_id:
36 |         kwargs = {"push_to_hub": True, "repo_id": hub_repo_id}
37 |     else:
38 |         kwargs = {}
39 |     target.save_pretrained(delta_path, **kwargs)
40 |     target_tokenizer = AutoTokenizer.from_pretrained(target_model_path)
41 |     target_tokenizer.save_pretrained(delta_path, **kwargs)
42 | 
43 | 
44 | if __name__ == "__main__":
45 |     parser = argparse.ArgumentParser()
46 |     parser.add_argument("--base-model-path", type=str, required=True)
47 |     parser.add_argument("--target-model-path", type=str, required=True)
48 |     parser.add_argument("--delta-path", type=str, required=True)
49 |     parser.add_argument("--hub-repo-id", type=str, default=None)
50 |     args = parser.parse_args()
51 | 
52 |     make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id)
53 | 


--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/builder.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2
 3 | 
 4 | 
 5 | def build_vision_tower(vision_tower_cfg, **kwargs):
 6 |     vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
 7 |     is_absolute_path_exists = os.path.exists(vision_tower)
 8 |     use_s2 = getattr(vision_tower_cfg, 's2', False)
 9 |     if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower:
10 |         if use_s2:
11 |             return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs)
12 |         else:
13 |             return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
14 | 
15 |     raise ValueError(f'Unknown vision tower: {vision_tower}')
16 | 


--------------------------------------------------------------------------------
/llava/model/multimodal_projector/builder.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | import re
 4 | 
 5 | 
 6 | class IdentityMap(nn.Module):
 7 |     def __init__(self):
 8 |         super().__init__()
 9 | 
10 |     def forward(self, x, *args, **kwargs):
11 |         return x
12 | 
13 |     @property
14 |     def config(self):
15 |         return {"mm_projector_type": 'identity'}
16 | 
17 | 
18 | class SimpleResBlock(nn.Module):
19 |     def __init__(self, channels):
20 |         super().__init__()
21 |         self.pre_norm = nn.LayerNorm(channels)
22 | 
23 |         self.proj = nn.Sequential(
24 |             nn.Linear(channels, channels),
25 |             nn.GELU(),
26 |             nn.Linear(channels, channels)
27 |         )
28 |     def forward(self, x):
29 |         x = self.pre_norm(x)
30 |         return x + self.proj(x)
31 | 
32 | 
33 | def build_vision_projector(config, delay_load=False, **kwargs):
34 |     projector_type = getattr(config, 'mm_projector_type', 'linear')
35 | 
36 |     if projector_type == 'linear':
37 |         return nn.Linear(config.mm_hidden_size, config.hidden_size)
38 | 
39 |     mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
40 |     if mlp_gelu_match:
41 |         mlp_depth = int(mlp_gelu_match.group(1))
42 |         modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
43 |         for _ in range(1, mlp_depth):
44 |             modules.append(nn.GELU())
45 |             modules.append(nn.Linear(config.hidden_size, config.hidden_size))
46 |         return nn.Sequential(*modules)
47 | 
48 |     if projector_type == 'identity':
49 |         return IdentityMap()
50 | 
51 |     raise ValueError(f'Unknown projector type: {projector_type}')
52 | 


--------------------------------------------------------------------------------
/llava/model/utils.py:
--------------------------------------------------------------------------------
 1 | from transformers import AutoConfig
 2 | 
 3 | 
 4 | def auto_upgrade(config):
 5 |     cfg = AutoConfig.from_pretrained(config)
 6 |     if 'llava' in config and 'llava' not in cfg.model_type:
 7 |         assert cfg.model_type == 'llama'
 8 |         print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.")
 9 |         print("You must upgrade the checkpoint to the new code base (this can be done automatically).")
10 |         confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]")
11 |         if confirm.lower() in ["y", "yes"]:
12 |             print("Upgrading checkpoint...")
13 |             assert len(cfg.architectures) == 1
14 |             setattr(cfg.__class__, "model_type", "llava")
15 |             cfg.architectures[0] = 'LlavaLlamaForCausalLM'
16 |             cfg.save_pretrained(config)
17 |             print("Checkpoint upgraded.")
18 |         else:
19 |             print("Checkpoint upgrade aborted.")
20 |             exit(1)
21 | 


--------------------------------------------------------------------------------
/llava/serve/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/llava/serve/__init__.py


--------------------------------------------------------------------------------
/llava/serve/examples/extreme_ironing.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/llava/serve/examples/extreme_ironing.jpg


--------------------------------------------------------------------------------
/llava/serve/examples/waterview.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/llava/serve/examples/waterview.jpg


--------------------------------------------------------------------------------
/llava/serve/register_worker.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Manually register workers.
 3 | 
 4 | Usage:
 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002
 6 | """
 7 | 
 8 | import argparse
 9 | 
10 | import requests
11 | 
12 | if __name__ == "__main__":
13 |     parser = argparse.ArgumentParser()
14 |     parser.add_argument("--controller-address", type=str)
15 |     parser.add_argument("--worker-name", type=str)
16 |     parser.add_argument("--check-heart-beat", action="store_true")
17 |     args = parser.parse_args()
18 | 
19 |     url = args.controller_address + "/register_worker"
20 |     data = {
21 |         "worker_name": args.worker_name,
22 |         "check_heart_beat": args.check_heart_beat,
23 |         "worker_status": None,
24 |     }
25 |     r = requests.post(url, json=data)
26 |     assert r.status_code == 200
27 | 


--------------------------------------------------------------------------------
/llava/serve/test_message.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import json
 3 | 
 4 | import requests
 5 | 
 6 | from llava.conversation import default_conversation
 7 | 
 8 | 
 9 | def main():
10 |     if args.worker_address:
11 |         worker_addr = args.worker_address
12 |     else:
13 |         controller_addr = args.controller_address
14 |         ret = requests.post(controller_addr + "/refresh_all_workers")
15 |         ret = requests.post(controller_addr + "/list_models")
16 |         models = ret.json()["models"]
17 |         models.sort()
18 |         print(f"Models: {models}")
19 | 
20 |         ret = requests.post(controller_addr + "/get_worker_address",
21 |             json={"model": args.model_name})
22 |         worker_addr = ret.json()["address"]
23 |         print(f"worker_addr: {worker_addr}")
24 | 
25 |     if worker_addr == "":
26 |         return
27 | 
28 |     conv = default_conversation.copy()
29 |     conv.append_message(conv.roles[0], args.message)
30 |     prompt = conv.get_prompt()
31 | 
32 |     headers = {"User-Agent": "LLaVA Client"}
33 |     pload = {
34 |         "model": args.model_name,
35 |         "prompt": prompt,
36 |         "max_new_tokens": args.max_new_tokens,
37 |         "temperature": 0.7,
38 |         "stop": conv.sep,
39 |     }
40 |     response = requests.post(worker_addr + "/worker_generate_stream", headers=headers,
41 |             json=pload, stream=True)
42 | 
43 |     print(prompt.replace(conv.sep, "\n"), end="")
44 |     for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"):
45 |         if chunk:
46 |             data = json.loads(chunk.decode("utf-8"))
47 |             output = data["text"].split(conv.sep)[-1]
48 |             print(output, end="\r")
49 |     print("")
50 | 
51 | 
52 | if __name__ == "__main__":
53 |     parser = argparse.ArgumentParser()
54 |     parser.add_argument("--controller-address", type=str, default="http://localhost:21001")
55 |     parser.add_argument("--worker-address", type=str)
56 |     parser.add_argument("--model-name", type=str, default="facebook/opt-350m")
57 |     parser.add_argument("--max-new-tokens", type=int, default=32)
58 |     parser.add_argument("--message", type=str, default=
59 |         "Tell me a story with more than 1000 words.")
60 |     args = parser.parse_args()
61 | 
62 |     main()
63 | 


--------------------------------------------------------------------------------
/llava/train/train_mem.py:
--------------------------------------------------------------------------------
1 | from llava.train.train import train
2 | 
3 | if __name__ == "__main__":
4 |     train(attn_implementation="flash_attention_2")
5 | 


--------------------------------------------------------------------------------
/llava/train/train_xformers.py:
--------------------------------------------------------------------------------
 1 | # Make it more memory efficient by monkey patching the LLaMA model with xformers attention.
 2 | 
 3 | # Need to call this before importing transformers.
 4 | from llava.train.llama_xformers_attn_monkey_patch import (
 5 |     replace_llama_attn_with_xformers_attn,
 6 | )
 7 | 
 8 | replace_llama_attn_with_xformers_attn()
 9 | 
10 | from llava.train.train import train
11 | 
12 | if __name__ == "__main__":
13 |     train()
14 | 


--------------------------------------------------------------------------------
/lmms-eval/.gitignore:
--------------------------------------------------------------------------------
 1 | env
 2 | *.pyc
 3 | output/
 4 | data/
 5 | lm_cache
 6 | .idea
 7 | build
 8 | dist
 9 | *.egg-info
10 | venv
11 | .vscode/
12 | temp
13 | __pycache__
14 | .ipynb_checkpoints
15 | temp
16 | # IPython
17 | profile_default/
18 | ipython_config.py
19 | logs/
20 | scripts/
21 | wandb/
22 | SimSun.ttf
23 | submissions/
24 | lmms_eval/tasks/hallusion_bench/hallusion_output_vs_model.json
25 | lmms_eval/tasks/hallusion_bench/hallusion_output_vd_model.json
26 | zk.log
27 | 


--------------------------------------------------------------------------------
/lmms-eval/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 |   - repo: https://github.com/psf/black
3 |     rev: 23.12.1
4 |     hooks:
5 |       - id: black
6 |         language_version: python3


--------------------------------------------------------------------------------
/lmms-eval/docs/README.md:
--------------------------------------------------------------------------------
 1 | # LMMs Eval Documentation
 2 | 
 3 | Welcome to the docs for `lmms-eval`!
 4 | 
 5 | Majority of this documentation is adapted from [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness/)
 6 | 
 7 | ## Table of Contents
 8 | 
 9 | * To learn about the command line flags, see the [commands](commands.md)
10 | * To learn how to add a new moddel,  see the [Model Guide](model_guide.md).
11 | * For a crash course on adding new tasks to the library, see our [Task Guide](task_guide.md).


--------------------------------------------------------------------------------
/lmms-eval/example_eval.yaml:
--------------------------------------------------------------------------------
 1 | - model: llava
 2 |   model_args: pretrained=liuhaotian/llava-v1.5-7b
 3 |   tasks: ai2d
 4 |   batch_size: 1
 5 |   log_samples: true
 6 |   log_samples_suffix: eval_vizwiz_vqa
 7 |   output_path: "./logs/"
 8 | 
 9 | - model: llava
10 |   model_args: pretrained=liuhaotian/llava-v1.5-13b
11 |   tasks: mme
12 |   batch_size: 1
13 |   log_samples: true
14 |   log_samples_suffix: mme
15 |   output_path: "./logs/"
16 | 


--------------------------------------------------------------------------------
/lmms-eval/llava_repr_requirements.txt:
--------------------------------------------------------------------------------
 1 | accelerate==0.21.0
 2 | datasets==2.16.1
 3 | evaluate==0.4.1
 4 | hf_transfer==0.1.6
 5 | Jinja2==3.1.3
 6 | numpy==1.26.4
 7 | openai==1.13.3
 8 | packaging==23.2
 9 | pandas==2.2.1
10 | Pillow==10.2.0
11 | protobuf==4.25.3
12 | pycocoevalcap==1.2
13 | pycocotools==2.0.7
14 | pytablewriter==1.2.0
15 | pytest==8.0.2
16 | python_Levenshtein==0.25.0
17 | pytz==2024.1
18 | PyYAML==6.0.1
19 | PyYAML==6.0.1
20 | Requests==2.31.0
21 | sacrebleu==2.4.0
22 | scikit_learn==1.2.2
23 | sentencepiece==0.1.99
24 | setuptools==68.2.2
25 | sglang==0.1.12
26 | shortuuid==1.0.12
27 | sqlitedict==2.1.0
28 | tenacity==8.2.3
29 | torch==2.0.1
30 | openai>=1.0.0
31 | pycocoevalcap
32 | tokenizers==0.15.2
33 | tqdm==4.66.2
34 | tqdm-multiprocess
35 | transformers==4.37.2
36 | zstandard
37 | pillow
38 | pyyaml
39 | sympy
40 | mpmath
41 | Jinja2
42 | openpyxl
43 | Levenshtein
44 | hf_transfer
45 | tenacity
46 | wandb>=0.16.0
47 | transformers-stream-generator
48 | tiktoken
49 | pre-commit


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/lmms-eval/lmms_eval/__init__.py


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/api/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/lmms-eval/lmms_eval/api/__init__.py


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/api/filter.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | from typing import List
 3 | 
 4 | from lmms_eval.api.instance import Instance
 5 | from datasets import Dataset
 6 | 
 7 | 
 8 | class Filter:
 9 |     """
10 |     Filter classes operate on a per-task level.
11 |     They take all model outputs (`instance.resps` for all `task.instances`)
12 |     across all instances of a task, and perform operations.
13 |     In a single run, one can configure any number of separate filters or lists of filters.
14 | 
15 |     """
16 | 
17 |     def __init__(self, *args, **kwargs) -> None:
18 |         """
19 |         Can define custom behavior here, if an individual instantiation of a Filter class should have state.
20 |         """
21 | 
22 |     def apply(self, resps, docs):
23 |         """
24 |         Defines the operation to perform on a list of the `inst.resps` properties of `Instance` objects.
25 |         Should return the list of (filtered) response lists *in the same order as they were input*, e.g.
26 |         if pass in [<inst.resps for instance 0>, <inst.resps for instance 1>] should return
27 |         [<filtered resps for instance 0>, <filtered resps for instance 1>]
28 |         """
29 |         return resps
30 | 
31 | 
32 | @dataclass
33 | class FilterEnsemble:
34 |     """
35 |     FilterEnsemble creates a pipeline applying multiple filters.
36 |     Its intended usage is to stack multiple post-processing steps in order.
37 |     `task.apply_filters` should use a list of FilterEnsemble classes that it stores, to apply each
38 |     pipeline separately.
39 |     """
40 | 
41 |     name: str
42 |     filters: List[Filter]
43 | 
44 |     def apply(self, instances: List[Instance], docs: List[Dataset]) -> None:
45 |         resps = [inst.resps for inst in instances]  # operate just on the model responses
46 |         for f in self.filters:
47 |             # apply filters in sequence
48 |             resps = f.apply(resps, docs)
49 | 
50 |         # add the end results after filtering to filtered_requests of their respective source instances.
51 |         # has key `self.name`: each FilterEnsemble applied in a given run should use a different name.
52 |         for inst, resp in zip(instances, resps):
53 |             inst.filtered_resps[self.name] = resp
54 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/api/instance.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass, field
 2 | from typing import Literal, Tuple
 3 | 
 4 | 
 5 | @dataclass
 6 | class Instance:
 7 |     request_type: Literal["loglikelihood", "generate_until"]
 8 |     arguments: tuple
 9 |     idx: int
10 |     metadata: Tuple[str, int, int] = field(default_factory=lambda: (None, None, None))  # TODO: better typehints here
11 |     resps: list = field(default_factory=list)
12 |     filtered_resps: dict = field(default_factory=dict)
13 | 
14 |     # initialized after init
15 |     task_name: str = None
16 |     doc_id: str = None
17 |     repeats: str = None
18 |     doc: dict = None
19 | 
20 |     def __post_init__(self) -> None:
21 |         # unpack metadata field
22 |         self.task_name, self.doc_id, self.repeats = self.metadata
23 | 
24 |     @property
25 |     def args(self):
26 |         """
27 |         Returns (string,) where `string` is the string to calculate loglikelihood over
28 |         """
29 |         return self.arguments if isinstance(self.arguments, tuple) else (self.arguments,)
30 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/filters/__init__.py:
--------------------------------------------------------------------------------
 1 | from lmms_eval.api.filter import FilterEnsemble, Filter
 2 | from . import selection
 3 | from . import extraction
 4 | from . import transformation
 5 | 
 6 | 
 7 | FILTER_REGISTRY = {
 8 |     "take_first": selection.TakeFirstFilter,
 9 |     "regex": extraction.RegexFilter,
10 |     "majority_vote": selection.MajorityVoteFilter,
11 |     "take_first_k": selection.TakeKFilter,
12 |     "remove_whitespace": extraction.WhitespaceFilter,
13 |     "lowercase": transformation.LowercaseFilter,
14 |     "uppercase": transformation.UppercaseFilter,
15 |     "map": transformation.MapFilter,
16 |     "multi_choice_regex": extraction.MultiChoiceRegexFilter,
17 |     # TODO: implement this filter. either it should take in an arbitrary "scoring"/reward function
18 |     # that takes an input and returns a scalar and then should select the max reward,
19 |     # or should implement different filters for different ways of handling a reward model's inference.
20 |     # "arg_max": selection.ArgMaxFilter,
21 | }
22 | 
23 | 
24 | def get_filter(filter_name):
25 |     if filter_name in FILTER_REGISTRY:
26 |         return FILTER_REGISTRY[filter_name]
27 |     else:
28 |         return filter_name
29 | 
30 | 
31 | def build_filter_ensemble(filter_name, components):
32 |     """
33 |     Create a filtering pipeline.
34 |     """
35 |     filters = []
36 |     for function, kwargs in components:
37 |         if kwargs is None:
38 |             f = get_filter(function)()
39 |         else:
40 |             # create a filter given its name in the registry
41 |             f = get_filter(function)(**kwargs)  # TODO: pass kwargs to filters properly
42 |         # add the filter as a pipeline step
43 |         filters.append(f)
44 | 
45 |     return FilterEnsemble(name=filter_name, filters=filters)
46 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/filters/decontamination.py:
--------------------------------------------------------------------------------
 1 | from lmms_eval.api.filter import Filter
 2 | 
 3 | 
 4 | class DecontaminationFilter(Filter):
 5 |     """
 6 |     A filter which evaluates
 7 |     """
 8 | 
 9 |     name = "track_decontamination"
10 | 
11 |     def __init__(self, path) -> None:
12 |         """
13 | 
14 |         TODO: make sure only ever run one time on the train set (should this be cached as a class var? keyed by value for "path").
15 |         should further cache result on a given (task_name, doc_id)
16 |         """
17 |         self._decontam_results = None
18 | 
19 |     def apply(self, resps, docs) -> None:
20 |         """
21 |         Return {"no_contamination", "only_contamination"} keys for the 2 different subsets
22 |         """
23 |         pass
24 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/filters/selection.py:
--------------------------------------------------------------------------------
 1 | from collections import Counter
 2 | 
 3 | from lmms_eval.api.filter import Filter
 4 | 
 5 | 
 6 | class TakeFirstFilter(Filter):
 7 |     def __init__(self) -> None:
 8 |         """
 9 |         Can define custom behavior here, if an individual instantiation of a Filter class should have state.
10 |         """
11 | 
12 |     def apply(self, resps, docs):
13 |         """
14 |         Assuming each entry of `resps` is a list of model responses, we discard all but the first response.
15 |         """
16 |         return map(lambda r: r[0], resps)
17 | 
18 | 
19 | class TakeKFilter(Filter):
20 |     def __init__(self, *args, **kwargs) -> None:
21 |         self.k = kwargs.pop("k")
22 | 
23 |         super().__init__(*args, **kwargs)
24 | 
25 |     def apply(self, resps, docs):
26 |         # check we have at least k responses per doc, else we can't take the first k
27 |         assert len(resps[0]) >= self.k, f"Need at least {self.k} responses per doc to take first {self.k}, but got {len(resps[0])} only! Please increase TaskConfig.repeats ."
28 |         return map(lambda r: r[: self.k], resps)
29 | 
30 | 
31 | class MajorityVoteFilter(Filter):
32 |     def __init__(self) -> None:
33 |         """
34 |         Can define custom behavior here, if an individual instantiation of a Filter class should have state.
35 |         """
36 | 
37 |     def apply(self, resps, docs):
38 |         """
39 |         Each entry of `resps` is a list of model responses.
40 |         We select the response that occurs most frequently in each entry of `resps`.
41 |         """
42 | 
43 |         def select_majority(resp):
44 |             counts = Counter(resp)
45 |             vote = counts.most_common(1)[0][0]
46 |             return vote
47 | 
48 |         return map(lambda r: [select_majority(r)], resps)
49 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/filters/transformation.py:
--------------------------------------------------------------------------------
 1 | from lmms_eval.api.filter import Filter
 2 | 
 3 | 
 4 | class LowercaseFilter(Filter):
 5 |     def __init__(self) -> None:
 6 |         pass
 7 | 
 8 |     def apply(self, resps, docs):
 9 |         def filter_set(inst):
10 |             return [resp.lower() for resp in inst]
11 | 
12 |         return [filter_set(resp) for resp in resps]
13 | 
14 | 
15 | class UppercaseFilter(Filter):
16 |     def __init__(self) -> None:
17 |         pass
18 | 
19 |     def apply(self, resps, docs):
20 |         def filter_set(inst):
21 |             return [resp.upper() for resp in inst]
22 | 
23 |         return [filter_set(resp) for resp in resps]
24 | 
25 | 
26 | class MapFilter(Filter):
27 |     def __init__(self, mapping_dict: dict = {}, default_value=None) -> None:
28 |         """
29 |         Initializes the MapFilter with a given mapping dictionary and default value.
30 | 
31 |         Args:
32 |         - mapping_dict (dict): A dictionary containing the key-value mappings.
33 |                                Default is an empty dictionary.
34 |         - default_value (Any): The value to be returned when a key is not found in the mapping_dict.
35 |                                Default is None.
36 | 
37 |         Example:
38 |         mapper = MapFilter({'A': 1, 'B': 2}, default_value=0)
39 |         """
40 |         assert isinstance(mapping_dict, dict), "Provided mapping_dict is not a dictionary"
41 |         self.mapping_dict = mapping_dict
42 |         self.default_value = default_value
43 | 
44 |     def apply(self, resps, docs):
45 |         def filter_set(inst):
46 |             return [self.mapping_dict.get(resp, self.default_value) for resp in inst]
47 | 
48 |         return [filter_set(resp) for resp in resps]
49 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/models/__init__.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | AVAILABLE_MODELS = {
 4 |     "llava": "Llava",
 5 |     "llava_hf": "LlavaHf",
 6 |     "llava_sglang": "LlavaSglang",
 7 |     "qwen_vl": "Qwen_VL",
 8 |     "fuyu": "Fuyu",
 9 |     "gpt4v": "GPT4V",
10 |     "instructblip": "InstructBLIP",
11 |     "minicpm_v": "MiniCPM_V",
12 | }
13 | 
14 | for model_name, model_class in AVAILABLE_MODELS.items():
15 |     try:
16 |         exec(f"from .{model_name} import {model_class}")
17 |     except ImportError:
18 |         pass
19 | 
20 | 
21 | import hf_transfer
22 | 
23 | os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
24 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/models/model_utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/lmms-eval/lmms_eval/models/model_utils/__init__.py


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/_task_utils/file_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | 
4 | def generate_submission_file(file_name, args, subpath="submissions"):
5 |     path = os.path.join(args.output_path, subpath)
6 |     os.makedirs(path, exist_ok=True)
7 |     path = os.path.join(path, file_name)
8 |     return os.path.abspath(path)
9 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/_task_utils/gpt_eval_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/lmms-eval/lmms_eval/tasks/_task_utils/gpt_eval_utils.py


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/ai2d/ai2d.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/ai2d
 2 | task: "ai2d"
 3 | dataset_kwargs:
 4 |   token: True
 5 | test_split: test
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.ai2d_doc_to_visual
 8 | doc_to_text: !function utils.ai2d_doc_to_text
 9 | doc_to_target: !function utils.ai2d_doc_to_target
10 | generation_kwargs:
11 |   max_new_tokens: 16
12 |   temperature: 0
13 |   do_sample: False
14 | metric_list:
15 |   - metric: exact_match
16 |     aggregation: mean
17 |     higher_is_better: true
18 |     ignore_case: true
19 |     ignore_punctuation: true
20 | metadata:
21 |   - version: 0.0
22 | 
23 | model_specific_prompt_kwargs:
24 |   default:
25 |     prompt_format: mcq
26 |     pre_prompt: ""
27 |     post_prompt: "\nAnswer with the option's letter from the given choices directly."
28 |   # qwen formulate ai2d as question answering instead of mcq
29 |   qwen_vl:
30 |     prompt_format: qa
31 |     pre_prompt: ""
32 |     post_prompt: " Answer:"
33 | 
34 | model_specific_target_kwargs:
35 |   default: "mcq"
36 |   qwen_vl: "qa"


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/ai2d/utils.py:
--------------------------------------------------------------------------------
 1 | def ai2d_doc_to_text(doc, model_specific_prompt_kwargs=None):
 2 |     question, choices = doc["question"], doc["options"]
 3 |     len_choices = len(choices)
 4 |     post_prompt = model_specific_prompt_kwargs["post_prompt"]
 5 |     pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
 6 |     if model_specific_prompt_kwargs["prompt_format"] == "mcq":
 7 |         options = [chr(ord("A") + i) for i in range(len_choices)]
 8 |         choices_str = "\n".join([f"{option}. {choice}" for option, choice in zip(options, choices)])
 9 |         return f"{pre_prompt}{question}\n{choices_str}{post_prompt}"
10 |     elif model_specific_prompt_kwargs["prompt_format"] == "qa":
11 |         options = "\n".join(choices)
12 |         return f"{pre_prompt}{question}{options}{post_prompt}"
13 |     else:
14 |         raise ValueError(f"Unknown prompt format: {model_specific_prompt_kwargs['prompt_format']}")
15 | 
16 | 
17 | def ai2d_doc_to_visual(doc):
18 |     return [doc["image"].convert("RGB")]
19 | 
20 | 
21 | def ai2d_doc_to_target(doc, model_specific_target_kwargs):
22 |     if model_specific_target_kwargs == "mcq":
23 |         len_choices = len(doc["options"])
24 |         options = [chr(ord("A") + i) for i in range(len_choices)]
25 |         return options[int(doc["answer"])]
26 |     elif model_specific_target_kwargs == "qa":
27 |         return doc["options"][int(doc["answer"])]
28 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/chartqa/chartqa.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/ChartQA
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "chartqa"
 5 | test_split: test
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.chartqa_doc_to_visual
 8 | doc_to_text: !function utils.chartqa_doc_to_text
 9 | doc_to_target: "answer"
10 | generation_kwargs:
11 |   max_new_tokens: 16
12 |   temperature: 0
13 |   do_sample: False
14 | process_results: !function utils.chartqa_process_results
15 | metric_list:
16 |   - metric: relaxed_overall
17 |     aggregation: mean
18 |     higher_is_better: true
19 |   - metric: relaxed_human_split
20 |     aggregation: mean
21 |     higher_is_better: true
22 |   - metric: relaxed_augmented_split
23 |     aggregation: mean
24 |     higher_is_better: true
25 | metadata:
26 |   - version: 0.0
27 | model_specific_prompt_kwargs:
28 |   default:
29 |     pre_prompt: ""
30 |     post_prompt: "\nAnswer the question with a single word."
31 |   qwen_vl:
32 |     pre_prompt: ""
33 |     post_prompt: " Answer:"
34 | 
35 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/cmmmu/_cmmmu.yaml:
--------------------------------------------------------------------------------
1 | group: cmmmu
2 | task:
3 | - cmmmu_val
4 | - cmmmu_test
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/cmmmu/_default_template_cmmmu_yaml:
--------------------------------------------------------------------------------
1 | dataset_path: lmms-lab/CMMMU
2 | output_type: generate_until
3 | doc_to_visual: !function utils.cmmmu_doc_to_visual
4 | doc_to_text: !function utils.cmmmu_doc_to_text
5 | doc_to_target: "answer"
6 | generation_kwargs:
7 |   max_new_tokens: 16
8 |   image_aspect_ratio: original


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/cmmmu/cmmmu_test.yaml:
--------------------------------------------------------------------------------
 1 | task: "cmmmu_test"
 2 | test_split: test
 3 | # The return value of process_results will be used by metrics
 4 | process_results: !function utils.cmmmu_process_test_results_for_submission
 5 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
 6 | metric_list:
 7 |   - metric: submission
 8 |     aggregation: !function utils.cmmmu_test_aggregate_results_for_submission
 9 |     higher_is_better: false
10 | metadata:
11 |   - version: 0.0
12 | include: _default_template_cmmmu_yaml
13 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/cmmmu/cmmmu_val.yaml:
--------------------------------------------------------------------------------
 1 | task: "cmmmu_val"
 2 | test_split: val
 3 | # The return value of process_results will be used by metrics
 4 | process_results: !function utils.cmmmu_process_results
 5 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
 6 | generation_kwargs:
 7 |   max_new_tokens: 16
 8 |   image_aspect_ratio: original
 9 | metric_list:
10 |   - metric: cmmmu_acc
11 |     aggregation: !function utils.cmmmu_aggregate_results
12 |     higher_is_better: true
13 | metadata:
14 |   - version: 0.0
15 | include: _default_template_cmmmu_yaml
16 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/coco_cap/coco2014_cap.yaml:
--------------------------------------------------------------------------------
1 | group : coco2014_cap
2 | task:
3 |   - coco2014_cap_val
4 |   - coco2014_cap_test


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/coco_cap/coco2014_cap_test.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/COCO-Caption
 2 | dataset_kwargs:
 3 |   token: True
 4 | task : "coco2014_cap_test"
 5 | group : "coco_caption"
 6 | test_split: test
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.coco_doc_to_visual
 9 | doc_to_text: "Provide a one-sentence caption for the provided image."
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   max_new_tokens: 128
13 |   temperature: 0
14 |   top_p: 0
15 |   num_beams: 1
16 |   do_sample: false
17 | process_results: !function utils.coco_test_process_result
18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
19 | metric_list:
20 |   - metric: coco_passthrough 
21 |     aggregation : !function utils.coco_test_aggregation_result
22 |     higher_is_better : true
23 | metadata:
24 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/coco_cap/coco2014_cap_val.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/COCO-Caption
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "coco2014_cap_val"
 5 | group : "coco_caption"
 6 | test_split: val
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.coco_doc_to_visual
 9 | doc_to_text: "Provide a one-sentence caption for the provided image."
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   max_new_tokens: 64
13 |   temperature: 0
14 |   top_p: 0
15 |   num_beams: 1
16 |   do_sample: false
17 | process_results: !function utils.coco_process_result
18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
19 | metric_list:
20 |   - metric: coco_Bleu_4 
21 |     aggregation : !function utils.coco_bleu4
22 |     higher_is_better : true
23 |   - metric: coco_Bleu_3
24 |     aggregation : !function utils.coco_bleu3
25 |     higher_is_better : true
26 |   - metric: coco_Bleu_2
27 |     aggregation : !function utils.coco_bleu2
28 |     higher_is_better : true
29 |   - metric: coco_Bleu_1
30 |     aggregation : !function utils.coco_bleu1
31 |     higher_is_better : true
32 |   - metric: coco_METEOR
33 |     aggregation : !function utils.coco_meteor
34 |     higher_is_better : true
35 |   - metric: coco_ROUGE_L
36 |     aggregation : !function utils.coco_rougel
37 |     higher_is_better : true
38 |   - metric: coco_CIDEr
39 |     aggregation : !function utils.coco_cider
40 |     higher_is_better : true
41 |   #- metric: coco_SPICE
42 |   #  aggregation : !function utils.coco_spice
43 |   #  higher_is_better : true
44 | metadata:
45 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/coco_cap/coco2017_cap.yaml:
--------------------------------------------------------------------------------
1 | group : coco2017_cap
2 | task:
3 |   - coco2017_cap_val
4 |   - coco2017_cap_test


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/coco_cap/coco2017_cap_test.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/COCO-Caption2017
 2 | dataset_kwargs:
 3 |   token: True
 4 | task : "coco2017_cap_test"
 5 | group : "coco_caption2017"
 6 | test_split: test
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.coco_doc_to_visual
 9 | doc_to_text: !function utils.coco_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   max_new_tokens: 128
13 |   temperature: 0
14 |   top_p: 0
15 |   num_beams: 1
16 |   do_sample: false
17 | process_results: !function utils.coco_test_process_result
18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
19 | metric_list:
20 |   - metric: coco_passthrough 
21 |     aggregation : !function utils.coco_test_aggregation_result
22 |     higher_is_better : true
23 | metadata:
24 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/coco_cap/coco2017_cap_val.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/COCO-Caption2017
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "coco2017_cap_val"
 5 | group : "coco_caption2017"
 6 | test_split: val
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.coco_doc_to_visual
 9 | doc_to_text: !function utils.coco_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   max_new_tokens: 64
13 |   temperature: 0
14 |   top_p: 0
15 |   num_beams: 1
16 |   do_sample: false
17 | process_results: !function utils.coco_process_result
18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
19 | metric_list:
20 |   - metric: coco_Bleu_4 
21 |     aggregation : !function utils.coco_bleu4
22 |     higher_is_better : true
23 |   - metric: coco_Bleu_3
24 |     aggregation : !function utils.coco_bleu3
25 |     higher_is_better : true
26 |   - metric: coco_Bleu_2
27 |     aggregation : !function utils.coco_bleu2
28 |     higher_is_better : true
29 |   - metric: coco_Bleu_1
30 |     aggregation : !function utils.coco_bleu1
31 |     higher_is_better : true
32 |   - metric: coco_METEOR
33 |     aggregation : !function utils.coco_meteor
34 |     higher_is_better : true
35 |   - metric: coco_ROUGE_L
36 |     aggregation : !function utils.coco_rougel
37 |     higher_is_better : true
38 |   - metric: coco_CIDEr
39 |     aggregation : !function utils.coco_cider
40 |     higher_is_better : true
41 |   #- metric: coco_SPICE
42 |   #  aggregation : !function utils.coco_spice
43 |   #  higher_is_better : true
44 | metadata:
45 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/coco_cap/coco_cap.yaml:
--------------------------------------------------------------------------------
1 | group : coco_cap
2 | task:
3 |   - coco2014_cap_val
4 |   - coco2014_cap_test
5 |   - coco2017_cap_val
6 |   - coco2017_cap_test
7 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/docvqa/_default_template_docvqa_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/DocVQA
 2 | dataset_name: DocVQA
 3 | dataset_kwargs:
 4 |   token: True
 5 | output_type: generate_until
 6 | doc_to_visual: !function utils.docvqa_doc_to_visual
 7 | doc_to_text: !function utils.docvqa_doc_to_text
 8 | doc_to_target: "answers"
 9 | generation_kwargs:
10 |   max_new_tokens: 32
11 |   temperature: 0
12 |   do_sample: False
13 | model_specific_prompt_kwargs:
14 |   default:
15 |     pre_prompt: ""
16 |     post_prompt: "\nAnswer the question using a single word or phrase."
17 |   qwen_vl:
18 |     pre_prompt: ""
19 |     post_prompt: " Answer:"
20 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/docvqa/docvqa.yaml:
--------------------------------------------------------------------------------
1 | group: docvqa
2 | task:
3 | - docvqa_val
4 | - docvqa_test


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/docvqa/docvqa_test.yaml:
--------------------------------------------------------------------------------
1 | task: "docvqa_test"
2 | test_split: test
3 | process_results: !function utils.docvqa_test_process_results
4 | metric_list:
5 |   - metric: submission
6 |     aggregation: !function utils.docvqa_test_aggregate_results
7 |     higher_is_better: true
8 | include: _default_template_docvqa_yaml
9 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/docvqa/docvqa_val.yaml:
--------------------------------------------------------------------------------
1 | task: "docvqa_val"
2 | test_split: validation
3 | metric_list:
4 |   - metric: anls
5 |     aggregation: mean
6 |     higher_is_better: true
7 | include: _default_template_docvqa_yaml
8 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/docvqa/utils.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import logging
 4 | 
 5 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
 6 | 
 7 | logger = logging.getLogger("lmms-eval")
 8 | 
 9 | 
10 | def docvqa_doc_to_visual(doc):
11 |     return [doc["image"].convert("RGB")]
12 | 
13 | 
14 | def docvqa_doc_to_text(doc, model_specific_prompt_kwargs):
15 |     question = doc["question"]
16 |     pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
17 |     post_prompt = model_specific_prompt_kwargs["post_prompt"]
18 |     return f"{pre_prompt}{question}{post_prompt}"
19 | 
20 | 
21 | def docvqa_test_process_results(doc, results):
22 |     pred = results[0]
23 |     questionId = doc["questionId"]
24 |     return {"anls": {"questionId": int(questionId), "answer": pred}, "submission": {"questionId": int(questionId), "answer": pred}}
25 | 
26 | 
27 | def docvqa_test_aggregate_results(results, args):
28 |     # save results as json
29 |     path = generate_submission_file("docvqa_test_for_submission.json", args)
30 |     with open(path, "w") as f:
31 |         json.dump(results, f)
32 |     logger.info(f"Results saved to {path}")
33 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/ferret/ferret.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/Ferret-Bench
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "ferret"
 5 | test_split: test
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.ferret_doc_to_visual
 8 | doc_to_text: !function utils.ferret_doc_to_text
 9 | doc_to_target: "gpt_answer"
10 | generation_kwargs:
11 |   until:
12 |     - "ASSISTANT:"
13 |   image_aspect_ratio: original
14 |   max_new_tokens: 1024
15 |   temperature: 0
16 |   top_p: 0
17 |   num_beams: 1
18 |   do_sample: false
19 | process_results: !function utils.ferret_process_results
20 | metric_list:
21 |   - metric: gpt_eval_ferret_all
22 |     aggregation: !function utils.ferret_all_aggregation
23 |     higher_is_better: true
24 |   - metric: gpt_eval_ferret_refer_desc
25 |     aggregation: !function utils.ferret_refer_desc_aggregation
26 |     higher_is_better: true
27 |   - metric: gpt_eval_ferret_refer_reason
28 |     aggregation: !function utils.ferret_refer_reason_aggregation
29 |     higher_is_better: true
30 |   - metric: gpt_eval_ferret_ground_conv
31 |     aggregation: !function utils.ferret_ground_conv_aggregation
32 |     higher_is_better: true
33 | metadata:
34 |   version: 0.0
35 |   gpt_eval_model_name: "gpt-4-0314"
36 | model_specific_prompt_kwargs:
37 |   default:
38 |     pre_prompt: ""
39 |     post_prompt: ""


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/flickr30k/flickr30k.yaml:
--------------------------------------------------------------------------------
1 | group: flickr30k
2 | task:
3 | - flickr30k_test


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/flickr30k/flickr30k_test.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/flickr30k
 2 | dataset_kwargs:
 3 |   token: True
 4 | task : "flickr30k_test"
 5 | test_split: test
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.flickr_doc_to_visual
 8 | doc_to_text: !function utils.flickr_doc_to_text
 9 | doc_to_target: "answer"
10 | generation_kwargs:
11 |   max_new_tokens: 64
12 |   temperature: 0
13 |   top_p: 0
14 |   num_beams: 1
15 |   do_sample: false
16 | process_results: !function utils.flickr_process_result
17 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
18 | metric_list:
19 |   - metric: flickr_Bleu_4 
20 |     aggregation : !function utils.flickr_bleu4
21 |     higher_is_better : true
22 |   - metric: flickr_Bleu_3
23 |     aggregation : !function utils.flickr_bleu3
24 |     higher_is_better : true
25 |   - metric: flickr_Bleu_2
26 |     aggregation : !function utils.flickr_bleu2
27 |     higher_is_better : true
28 |   - metric: flickr_Bleu_1
29 |     aggregation : !function utils.flickr_bleu1
30 |     higher_is_better : true
31 |   - metric: flickr_METEOR
32 |     aggregation : !function utils.flickr_meteor
33 |     higher_is_better : true
34 |   - metric: flickr_ROUGE_L
35 |     aggregation : !function utils.flickr_rougel
36 |     higher_is_better : true
37 |   - metric: flickr_CIDEr
38 |     aggregation : !function utils.flickr_cider
39 |     higher_is_better : true
40 |   #- metric: flickr_SPICE
41 |   #  aggregation : !function utils.flickr_spice
42 |   #  higher_is_better : true
43 | metadata:
44 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/gqa/gqa.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/GQA
 2 | dataset_name: testdev_balanced_instructions
 3 | dataset_kwargs:
 4 |   token: True
 5 | task: "gqa"
 6 | test_split: testdev
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.gqa_doc_to_visual
 9 | doc_to_text: !function utils.gqa_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   max_new_tokens: 16
13 |   temperature: 0
14 |   top_p: 0
15 |   num_beams: 1
16 |   do_sample: false
17 | metric_list:
18 |   - metric: exact_match
19 |     aggregation: mean
20 |     higher_is_better: true
21 |     ignore_case: true
22 |     ignore_punctuation: true
23 | metadata:
24 |   - version: 0.0
25 |   
26 | model_specific_prompt_kwargs:
27 |   default:
28 |     pre_prompt: ""
29 |     post_prompt: "\nAnswer the question using a single word or phrase."
30 |   qwen_vl:
31 |     pre_prompt: ""
32 |     post_prompt: " Answer:"


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/gqa/utils.py:
--------------------------------------------------------------------------------
 1 | from datasets import load_dataset
 2 | 
 3 | GQA_RAW_IMAGE_DATASET = None
 4 | GQA_ID2IMAGE = None
 5 | 
 6 | 
 7 | def gqa_doc_to_visual(doc):
 8 |     global GQA_RAW_IMAGE_DATASET
 9 |     global GQA_ID2IMAGE
10 |     if GQA_RAW_IMAGE_DATASET is None:
11 |         GQA_RAW_IMAGE_DATASET = load_dataset("lmms-lab/GQA", "testdev_balanced_images", split="testdev", token=True)
12 |         GQA_ID2IMAGE = {}
13 |         for row in GQA_RAW_IMAGE_DATASET:
14 |             GQA_ID2IMAGE[row["id"]] = row["image"].convert("RGB")
15 |     image = GQA_ID2IMAGE[doc["imageId"]]
16 |     return [image]
17 | 
18 | 
19 | def gqa_doc_to_text(doc, model_specific_prompt_kwargs):
20 |     question = doc["question"]
21 |     pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
22 |     post_prompt = model_specific_prompt_kwargs["post_prompt"]
23 |     return f"{pre_prompt}{question}{post_prompt}"
24 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/hallusion_bench/hallusion_bench_image.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/HallusionBench
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "hallusion_bench_image"
 5 | test_split: image
 6 | output_type: generate_until
 7 | doc_to_visual: !function evaluate_hb.hb_doc_to_visual
 8 | doc_to_text: !function evaluate_hb.hb_doc_to_text
 9 | doc_to_target: "gt_answer_details"
10 | process_results: !function evaluate_hb.hb_process_results
11 | model_specific_prompt_kwargs:
12 |   default:
13 |     pre_prompt: ""
14 |     post_prompt: ""
15 | generation_kwargs:
16 |   max_new_tokens: 128
17 |   temperature: 0
18 |   top_p: 0
19 |   num_beams: 1
20 |   do_sample: false
21 | metric_list:
22 |   - metric: aAcc
23 |     aggregation: !function evaluate_hb.hb_aggregation_result_aAcc
24 |     higher_is_better: true
25 |   - metric: qAcc
26 |     aggregation: !function evaluate_hb.hb_aggregation_result_qAcc
27 |     higher_is_better: true
28 |   - metric: fAcc
29 |     aggregation: !function evaluate_hb.hb_aggregation_result_fAcc
30 |     higher_is_better: true
31 |   # - metric: aAcc
32 |   #  aggregation: !function evaluate_hb.hb_aggregation_result_aAcc_intern
33 |   #  higher_is_better: true
34 |   # - metric: qAcc
35 |   #  aggregation: !function evaluate_hb.hb_aggregation_result_qAcc_intern
36 |   #  higher_is_better: true
37 |   # - metric: fAcc
38 |   #  aggregation: !function evaluate_hb.hb_aggregation_result_fAcc_intern
39 |   #  higher_is_better: true
40 | metadata:
41 |   - version: 0.0
42 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/iconqa/_default_template_docvqa_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/ICON-QA
 2 | dataset_kwargs:
 3 |   token: True
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.doc_to_visual
 6 | doc_to_text: !function utils.doc_to_text
 7 | doc_to_target: "answers"
 8 | # process_results: !function utils.test_process_results
 9 | generation_kwargs:
10 |   max_new_tokens: 32
11 |   temperature: 0
12 |   do_sample: False
13 | model_specific_prompt_kwargs:
14 |   default:
15 |     pre_prompt: ""
16 |     statement: "Given a set of images and a question, please provide the answer to the question.\n"
17 |     options_statement: "Question: {question}.\nOptions:\n{options}\nPlease answer with the option letter from the given choices directly."
18 |     freeform_statement: "Question: {question}.\nPlease answer the question using a single word or phrase."
19 | metric_list:
20 |   - metric: anls
21 |     aggregation: mean
22 |     higher_is_better: true


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/iconqa/iconqa.yaml:
--------------------------------------------------------------------------------
1 | group: iconqa
2 | task:
3 | - iconqa_val
4 | - iconqa_test
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/iconqa/iconqa_test.yaml:
--------------------------------------------------------------------------------
1 | task: "iconqa_test"
2 | test_split: test
3 | include: _default_template_docvqa_yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/iconqa/iconqa_val.yaml:
--------------------------------------------------------------------------------
1 | task: "iconqa_val"
2 | test_split: val
3 | include: _default_template_docvqa_yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/iconqa/utils.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | 
 4 | 
 5 | def options_to_str(options_prompt):
 6 |     option_prompt_str = ""
 7 |     for i, option in enumerate(options_prompt):
 8 |         option_choice = chr(ord("A") + i)
 9 |         option_prompt_str += f"{option_choice}. {option}\n"
10 | 
11 |     option_prompt_str = option_prompt_str.rstrip("\n")
12 |     return option_prompt_str
13 | 
14 | 
15 | def doc_to_visual(doc):
16 |     image_list = []
17 |     if "query_image" in doc:
18 |         image_list.append(doc["query_image"].convert("RGB"))
19 |     for i in range(5):
20 |         id = f"choice_image_{i}"
21 |         if id in doc and doc[id] is not None:
22 |             image_list.append(doc[id].convert("RGB"))
23 |     assert len(image_list) < 6, "Maximum 5 images allowed for ICON-QA"
24 |     return image_list
25 | 
26 | 
27 | def doc_to_text(doc, model_specific_prompt_kwargs):
28 |     question = doc["question"]
29 |     ques_type = doc["ques_type"]
30 |     options_prompt = []
31 | 
32 |     if ques_type == "choose_img":
33 |         options_prompt.append("The first image.")
34 |         options_prompt.append("The second image.")
35 | 
36 |         options_str = options_to_str(options_prompt)
37 |         full_prompt = f"{model_specific_prompt_kwargs['pre_prompt']}{model_specific_prompt_kwargs['statement']}{model_specific_prompt_kwargs['options_statement'].format(question=question, options=options_str)}"
38 | 
39 |     elif ques_type == "choose_txt":
40 |         choices = doc["choices"].split(",")
41 |         for i, choice in enumerate(choices):
42 |             options_prompt.append(f"{choice}")
43 | 
44 |         options_str = options_to_str(options_prompt)
45 |         full_prompt = f"{model_specific_prompt_kwargs['pre_prompt']}{model_specific_prompt_kwargs['statement']}{model_specific_prompt_kwargs['options_statement'].format(question=question, options=options_str)}"
46 | 
47 |     elif ques_type == "fill_in_blank":
48 |         full_prompt = f"{model_specific_prompt_kwargs['pre_prompt']}{model_specific_prompt_kwargs['statement']}{model_specific_prompt_kwargs['freeform_statement'].format(question=question)}"
49 | 
50 |     return full_prompt
51 | 
52 | 
53 | def test_process_results(doc, results):
54 |     pred = results[0]
55 |     questionId = doc["question_id"]
56 |     answer = doc["answer"]
57 |     return {"anls": {"questionId": int(questionId), "answer": answer, "pred_answer": pred}}
58 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/infovqa/_default_template_infovqa_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/DocVQA
 2 | dataset_name: InfographicVQA
 3 | dataset_kwargs:
 4 |   token: True
 5 | doc_to_target: "answers"
 6 | doc_to_visual: !function utils.infovqa_doc_to_visual
 7 | doc_to_text: !function utils.infovqa_doc_to_text
 8 | generation_kwargs:
 9 |   max_new_tokens: 32
10 |   temperature: 0
11 |   do_sample: False
12 | model_specific_prompt_kwargs:
13 |   default:
14 |     pre_prompt: ""
15 |     post_prompt: "\nAnswer the question using a single word or phrase."


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/infovqa/infovqa.yaml:
--------------------------------------------------------------------------------
1 | group: infovqa
2 | task:
3 | - infovqa_val
4 | - infovqa_test
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/infovqa/infovqa_test.yaml:
--------------------------------------------------------------------------------
 1 | task: "infovqa_test"
 2 | test_split: test
 3 | output_type: generate_until
 4 | process_results: !function utils.infovqa_test_process_results
 5 | metric_list:
 6 |   - metric: submission
 7 |     aggregation: !function utils.infovqa_test_aggregate_results
 8 |     higher_is_better: true
 9 | include: _default_template_infovqa_yaml
10 |   


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/infovqa/infovqa_val.yaml:
--------------------------------------------------------------------------------
1 | task: "infovqa_val"
2 | test_split: validation
3 | output_type: generate_until
4 | metric_list:
5 |   - metric: anls
6 |     aggregation: mean
7 |     higher_is_better: true
8 | include: _default_template_infovqa_yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/infovqa/utils.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import logging
 4 | 
 5 | 
 6 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
 7 | 
 8 | lmms_logger = logging.getLogger("lmms-eval")
 9 | 
10 | 
11 | def infovqa_doc_to_visual(doc):
12 |     return [doc["image"].convert("RGB")]
13 | 
14 | 
15 | def infovqa_doc_to_text(doc, model_specific_prompt_kwargs):
16 |     question = doc["question"]
17 |     pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
18 |     post_prompt = model_specific_prompt_kwargs["post_prompt"]
19 |     return f"{pre_prompt}{question}{post_prompt}"
20 | 
21 | 
22 | def infovqa_test_process_results(doc, results):
23 |     pred = results[0]
24 |     questionId = doc["questionId"]
25 |     return {"submission": {"questionId": int(questionId), "answer": pred}}
26 | 
27 | 
28 | def infovqa_test_aggregate_results(results, args):
29 |     # save results as json
30 |     file = generate_submission_file("infovqa_test_for_submission.json", args)
31 |     with open(file, "w") as f:
32 |         json.dump(results, f)
33 |     lmms_logger.info(f"Results saved to {file}")
34 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/llava-bench-coco/llava-bench-coco.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/llava-bench-coco
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "llava_bench_coco"
 5 | test_split: train
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.llava_doc_to_visual
 8 | doc_to_text: !function utils.llava_doc_to_text
 9 | doc_to_target: "gpt_answer"
10 | generation_kwargs:
11 |   until:
12 |     - "ASSISTANT:"
13 |   image_aspect_ratio: original
14 |   max_new_tokens: 1024
15 |   temperature: 0
16 |   top_p: 0
17 |   num_beams: 1
18 | process_results: !function utils.llava_process_results
19 | metric_list:
20 |   - metric: gpt_eval_llava_all
21 |     aggregation: !function utils.llava_all_aggregation
22 |     higher_is_better: true
23 |   - metric: gpt_eval_llava_conv
24 |     aggregation: !function utils.llava_conv_aggregation
25 |     higher_is_better: true
26 |   - metric: gpt_eval_llava_detail
27 |     aggregation: !function utils.llava_detail_aggregation
28 |     higher_is_better: true
29 |   - metric: gpt_eval_llava_complex
30 |     aggregation: !function utils.llava_complex_aggregation
31 |     higher_is_better: true
32 | metadata:
33 |   version: 0.0
34 |   gpt_eval_model_name: "gpt-4-0314"
35 | model_specific_prompt_kwargs:
36 |   default:
37 |     pre_prompt: ""
38 |     post_prompt: ""


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/llava-in-the-wild/llava-in-the-wild.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/llava-bench-in-the-wild
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "llava_in_the_wild"
 5 | test_split: train
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.llava_doc_to_visual
 8 | doc_to_text: !function utils.llava_doc_to_text
 9 | doc_to_target: "gpt_answer"
10 | generation_kwargs:
11 |   until:
12 |     - "ASSISTANT:"
13 |   image_aspect_ratio: original
14 |   max_new_tokens: 1024
15 |   temperature: 0
16 |   top_p: 0
17 |   num_beams: 1
18 |   do_sample: false
19 | process_results: !function utils.llava_process_results
20 | metric_list:
21 |   - metric: gpt_eval_llava_all
22 |     aggregation: !function utils.llava_all_aggregation
23 |     higher_is_better: true
24 |   - metric: gpt_eval_llava_conv
25 |     aggregation: !function utils.llava_conv_aggregation
26 |     higher_is_better: true
27 |   - metric: gpt_eval_llava_detail
28 |     aggregation: !function utils.llava_detail_aggregation
29 |     higher_is_better: true
30 |   - metric: gpt_eval_llava_complex
31 |     aggregation: !function utils.llava_complex_aggregation
32 |     higher_is_better: true
33 | metadata:
34 |   version: 0.0
35 |   gpt_eval_model_name: "gpt-4-0613"
36 | model_specific_prompt_kwargs:
37 |   default:
38 |     pre_prompt: ""
39 |     post_prompt: ""
40 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mathverse/mathverse.yaml:
--------------------------------------------------------------------------------
 1 | group: mathverse
 2 | task:
 3 |   - mathverse_testmini
 4 |   - mathverse_testmini_text_only
 5 |   - mathverse_testmini_text_lite
 6 |   - mathverse_testmini_text_dominant
 7 |   - mathverse_testmini_vision_intensive
 8 |   - mathverse_testmini_vision_dominant
 9 |   - mathverse_testmini_vision_only
10 | metadata:
11 |   version: 0.0
12 |   gpt_eval_model_name: "gpt-3.5-turbo"
13 |   trunk_response: 30
14 |   quick_match: false


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: CaraJ/MathVerse-lmmseval
 2 | dataset_name: testmini
 3 | dataset_kwargs:
 4 |   token: False
 5 | task: "mathverse_testmini"
 6 | test_split: testmini
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.mathverse_doc_to_visual
 9 | doc_to_text: !function utils.mathverse_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   until:
13 |     - "ASSISTANT:"
14 |   max_new_tokens: 1024
15 |   temperature: 0
16 |   top_p: 0
17 |   num_beams: 1
18 |   do_sample: false
19 | process_results: !function utils.mathverse_process_results
20 | metric_list:
21 |   - metric: gpt_eval_score
22 |     aggregation: !function utils.mathverse_aggregate_results_eval
23 |     higher_is_better: true
24 |   - metric: submission
25 |     aggregation: !function utils.mathverse_aggregate_results_submission
26 |     higher_is_better: true
27 |   
28 | model_specific_prompt_kwargs:
29 |   default:
30 |     shot_type: "format-prompt" # can also be "custom-prompt"
31 |     query_type: "query_wo" # now only support query_wo
32 | model_specific_generation_kwargs:
33 |   llava:
34 |     image_aspect_ratio: original


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_text_dominant.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: CaraJ/MathVerse-lmmseval
 2 | dataset_name: testmini_version_split
 3 | dataset_kwargs:
 4 |   token: False
 5 | task: "mathverse_testmini_text_dominant"
 6 | test_split: text_dominant
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.mathverse_doc_to_visual
 9 | doc_to_text: !function utils.mathverse_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   until:
13 |     - "ASSISTANT:"
14 |   max_new_tokens: 1024
15 |   temperature: 0
16 |   top_p: 0
17 |   num_beams: 1
18 |   do_sample: false
19 | process_results: !function utils.mathverse_process_results
20 | metric_list:
21 |   - metric: gpt_eval_score
22 |     aggregation: !function utils.mathverse_aggregate_results_eval
23 |     higher_is_better: true
24 |   - metric: submission
25 |     aggregation: !function utils.mathverse_aggregate_results_submission
26 |     higher_is_better: true
27 | 
28 | model_specific_prompt_kwargs:
29 |   default:
30 |     shot_type: "format-prompt" # can also be "custom-prompt"
31 |     query_type: "query_wo" # now only support query_wo
32 | model_specific_generation_kwargs:
33 |   llava:
34 |     image_aspect_ratio: original


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_text_lite.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: CaraJ/MathVerse-lmmseval
 2 | dataset_name: testmini_version_split
 3 | dataset_kwargs:
 4 |   token: False
 5 | task: "mathverse_testmini_text_lite"
 6 | test_split: text_lite
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.mathverse_doc_to_visual
 9 | doc_to_text: !function utils.mathverse_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   until:
13 |     - "ASSISTANT:"
14 |   max_new_tokens: 1024
15 |   temperature: 0
16 |   top_p: 0
17 |   num_beams: 1
18 |   do_sample: false
19 | process_results: !function utils.mathverse_process_results
20 | metric_list:
21 |   - metric: gpt_eval_score
22 |     aggregation: !function utils.mathverse_aggregate_results_eval
23 |     higher_is_better: true
24 |   - metric: submission
25 |     aggregation: !function utils.mathverse_aggregate_results_submission
26 |     higher_is_better: true
27 | 
28 | model_specific_prompt_kwargs:
29 |   default:
30 |     shot_type: "format-prompt" # can also be "custom-prompt"
31 |     query_type: "query_wo" # now only support query_wo
32 | model_specific_generation_kwargs:
33 |   llava:
34 |     image_aspect_ratio: original


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_text_only.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: CaraJ/MathVerse-lmmseval
 2 | dataset_name: testmini_text_only
 3 | dataset_kwargs:
 4 |   token: False
 5 | task: "mathverse_testmini_text_only"
 6 | test_split: text_only
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.mathverse_doc_to_visual
 9 | doc_to_text: !function utils.mathverse_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   until:
13 |     - "ASSISTANT:"
14 |   max_new_tokens: 1024
15 |   temperature: 0
16 |   top_p: 0
17 |   num_beams: 1
18 |   do_sample: false
19 | process_results: !function utils.mathverse_process_results
20 | metric_list:
21 |   - metric: gpt_eval_score
22 |     aggregation: !function utils.mathverse_aggregate_results_eval
23 |     higher_is_better: true
24 |   - metric: submission
25 |     aggregation: !function utils.mathverse_aggregate_results_submission
26 |     higher_is_better: true
27 |   
28 | model_specific_prompt_kwargs:
29 |   default:
30 |     shot_type: "format-prompt" # can also be "custom-prompt"
31 |     query_type: "query_wo" # now only support query_wo
32 | model_specific_generation_kwargs:
33 |   llava:
34 |     image_aspect_ratio: original


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_vision_dominant.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: CaraJ/MathVerse-lmmseval
 2 | dataset_name: testmini_version_split
 3 | dataset_kwargs:
 4 |   token: False
 5 | task: "mathverse_testmini_vision_dominant"
 6 | test_split: vision_dominant
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.mathverse_doc_to_visual
 9 | doc_to_text: !function utils.mathverse_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   until:
13 |     - "ASSISTANT:"
14 |   max_new_tokens: 1024
15 |   temperature: 0
16 |   top_p: 0
17 |   num_beams: 1
18 |   do_sample: false
19 | process_results: !function utils.mathverse_process_results
20 | metric_list:
21 |   - metric: gpt_eval_score
22 |     aggregation: !function utils.mathverse_aggregate_results_eval
23 |     higher_is_better: true
24 |   - metric: submission
25 |     aggregation: !function utils.mathverse_aggregate_results_submission
26 |     higher_is_better: true
27 |   
28 | model_specific_prompt_kwargs:
29 |   default:
30 |     shot_type: "format-prompt" # can also be "custom-prompt"
31 |     query_type: "query_wo" # now only support query_wo
32 | model_specific_generation_kwargs:
33 |   llava:
34 |     image_aspect_ratio: original


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_vision_intensive.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: CaraJ/MathVerse-lmmseval
 2 | dataset_name: testmini_version_split
 3 | dataset_kwargs:
 4 |   token: False
 5 | task: "mathverse_testmini_vision_intensive"
 6 | test_split: vision_intensive
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.mathverse_doc_to_visual
 9 | doc_to_text: !function utils.mathverse_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   until:
13 |     - "ASSISTANT:"
14 |   max_new_tokens: 1024
15 |   temperature: 0
16 |   top_p: 0
17 |   num_beams: 1
18 |   do_sample: false
19 | process_results: !function utils.mathverse_process_results
20 | metric_list:
21 |   - metric: gpt_eval_score
22 |     aggregation: !function utils.mathverse_aggregate_results_eval
23 |     higher_is_better: true
24 |   - metric: submission
25 |     aggregation: !function utils.mathverse_aggregate_results_submission
26 |     higher_is_better: true
27 |   
28 | model_specific_prompt_kwargs:
29 |   default:
30 |     shot_type: "format-prompt" # can also be "custom-prompt"
31 |     query_type: "query_wo" # now only support query_wo
32 | model_specific_generation_kwargs:
33 |   llava:
34 |     image_aspect_ratio: original


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mathverse/mathverse_testmini_vision_only.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: CaraJ/MathVerse-lmmseval
 2 | dataset_name: testmini_version_split
 3 | dataset_kwargs:
 4 |   token: False
 5 | task: "mathverse_testmini_vision_only"
 6 | test_split: vision_only
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.mathverse_doc_to_visual
 9 | doc_to_text: !function utils.mathverse_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   until:
13 |     - "ASSISTANT:"
14 |   max_new_tokens: 1024
15 |   temperature: 0
16 |   top_p: 0
17 |   num_beams: 1
18 |   do_sample: false
19 | process_results: !function utils.mathverse_process_results
20 | metric_list:
21 |   - metric: gpt_eval_score
22 |     aggregation: !function utils.mathverse_aggregate_results_eval
23 |     higher_is_better: true
24 |   - metric: submission
25 |     aggregation: !function utils.mathverse_aggregate_results_submission
26 |     higher_is_better: true
27 |   
28 | model_specific_prompt_kwargs:
29 |   default:
30 |     shot_type: "format-prompt" # can also be "custom-prompt"
31 |     query_type: "query_wo" # now only support query_wo
32 | model_specific_generation_kwargs:
33 |   llava:
34 |     image_aspect_ratio: original


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mathvista/mathvista.yaml:
--------------------------------------------------------------------------------
1 | group: mathvista
2 | task:
3 |   - mathvista_testmini
4 |   - mathvista_test
5 | metadata:
6 |   version: 0.0
7 |   gpt_eval_model_name: "gpt-4-0613"
8 |   quick_extract: false


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mathvista/mathvista_test.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: AI4Math/MathVista
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "mathvista_test"
 5 | test_split: test
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.mathvista_doc_to_visual
 8 | doc_to_text: !function utils.mathvista_doc_to_text
 9 | doc_to_target: "answer"
10 | generation_kwargs:
11 |   until:
12 |     - "ASSISTANT:"
13 |   max_new_tokens: 1024
14 |   temperature: 0
15 |   top_p: 0
16 |   num_beams: 1
17 |   do_sample: false
18 | process_results: !function utils.mathvista_process_results
19 | metric_list:
20 |   - metric: submission
21 |     aggregation: !function utils.mathvista_aggregate_results
22 |     higher_is_better: true
23 | 
24 | model_specific_prompt_kwargs:
25 |   default:
26 |     shot_type: "format-prompt" # can be "reason-first", "solution", "step-by-step"
27 | model_specific_generation_kwargs:
28 |   llava:
29 |     image_aspect_ratio: original


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mathvista/mathvista_testmini.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: AI4Math/MathVista
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "mathvista_testmini"
 5 | test_split: testmini
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.mathvista_doc_to_visual
 8 | doc_to_text: !function utils.mathvista_doc_to_text
 9 | doc_to_target: "answer"
10 | generation_kwargs:
11 |   until:
12 |     - "ASSISTANT:"
13 |   max_new_tokens: 1024
14 |   temperature: 0
15 |   top_p: 0
16 |   num_beams: 1
17 |   do_sample: false
18 | process_results: !function utils.mathvista_process_results
19 | metric_list:
20 |   - metric: gpt_eval_score
21 |     aggregation: !function utils.mathvista_aggregate_results
22 |     higher_is_better: true
23 |   
24 | model_specific_prompt_kwargs:
25 |   default:
26 |     shot_type: "format-prompt" # can be "reason-first", "solution", "step-by-step"
27 | model_specific_generation_kwargs:
28 |   llava:
29 |     image_aspect_ratio: original


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmbench/_default_template_mmbench_cn_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/MMBench
 2 | dataset_kwargs:
 3 |   token: True
 4 | doc_to_target: "answer"
 5 | dataset_name: "cn"
 6 | output_type: generate_until
 7 | doc_to_visual: !function cn_utils.mmbench_doc_to_visual
 8 | doc_to_text: !function cn_utils.mmbench_doc_to_text
 9 | generation_kwargs:
10 |   max_new_tokens: 256
11 |   temperature: 0
12 |   top_p: 0
13 |   num_beams: 1
14 |   do_sample: false
15 | process_results: !function cn_utils.mmbench_process_results
16 | model_specific_prompt_kwargs:
17 |   default:
18 |     pre_prompt: ""
19 |     post_prompt: "\n请直接使用所提供的选项字母作为答案回答。"
20 | model_specific_generation_kwargs:
21 |   llava:
22 |     image_aspect_ratio: original
23 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmbench/_default_template_mmbench_en_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/MMBench
 2 | dataset_kwargs:
 3 |   token: True
 4 | doc_to_target: "answer"
 5 | model_specific_prompt_kwargs:
 6 |   default:
 7 |     pre_prompt: ""
 8 |     post_prompt: "\nAnswer with the option's letter from the given choices directly."
 9 | doc_to_visual: !function en_utils.mmbench_doc_to_visual
10 | doc_to_text: !function en_utils.mmbench_doc_to_text
11 | doc_to_target: "answer"
12 | process_results: !function en_utils.mmbench_process_results
13 | model_specific_generation_kwargs:
14 |   llava:
15 |     image_aspect_ratio: original
16 | output_type: generate_until
17 | dataset_name: "en"
18 | generation_kwargs:
19 |   until:
20 |     - "ASSISTANT:"
21 |   max_new_tokens: 1024
22 |   temperature: 0
23 |   top_p: 0
24 |   num_beams: 1
25 |   do_sample: false
26 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmbench/mmbench.yaml:
--------------------------------------------------------------------------------
 1 | group: mmbench
 2 | task:
 3 |   - mmbench_en_dev
 4 |   - mmbench_en_test
 5 |   - mmbench_cn_dev
 6 |   - mmbench_cn_test
 7 |   - mmbench_cn_cc
 8 | metadata:
 9 |   version: 0.0
10 |   sys_prompt: "There are several options:"
11 |   gpt_eval_model_name: "gpt-3.5-turbo-0613"


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmbench/mmbench_cc.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/MMBench
 2 | dataset_name: cc
 3 | dataset_kwargs:
 4 |   token: True
 5 | task: "mmbench_cn_cc"
 6 | test_split: test
 7 | output_type: generate_until
 8 | doc_to_visual: !function cc_utils.mmbench_doc_to_visual
 9 | doc_to_text: !function cc_utils.mmbench_cn_cc_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   max_new_tokens: 256
13 |   temperature: 0
14 |   top_p: 0
15 |   num_beams: 1
16 |   do_sample: false
17 | process_results: !function cc_utils.mmbench_cn_cc_process_results
18 | metric_list:
19 |   - metric: gpt_eval_score
20 |     aggregation: !function cc_utils.mmbench_cn_cc_aggregate_dev_results_eval
21 |     higher_is_better: true
22 |   - metric: submission
23 |     aggregation: !function cc_utils.mmbench_cn_cc_aggregate_results
24 | metadata:
25 |   version: 0.0
26 |   gpt_eval_model_name: "gpt-3.5-turbo-0613"
27 | 
28 | model_specific_prompt_kwargs:
29 |   default:
30 |     pre_prompt: ""
31 |     post_prompt: "\n请直接使用所提供的选项字母作为答案回答。"
32 | model_specific_generation_kwargs:
33 |   llava:
34 |     image_aspect_ratio: original


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmbench/mmbench_cn.yaml:
--------------------------------------------------------------------------------
1 | group: mmbench_cn
2 | task:
3 |   - mmbench_cn_dev
4 |   - mmbench_cn_test
5 |   - mmbench_cn_cc
6 | metadata:
7 |   version: 0.0
8 |   gpt_eval_model_name: "gpt-3.5-turbo-0613"
9 |   sys_prompt: "有如下几个选项："


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml:
--------------------------------------------------------------------------------
 1 | task: "mmbench_cn_dev"
 2 | test_split: "dev"
 3 | metric_list:
 4 |   - metric: gpt_eval_score
 5 |     aggregation: !function cn_utils.mmbench_aggregate_dev_results_eval
 6 |     higher_is_better: true
 7 |   - metric: submission
 8 |     higher_is_better: true
 9 |     aggregation: !function cn_utils.mmbench_aggregate_dev_results
10 | include: _default_template_mmbench_cn_yaml
11 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmbench/mmbench_cn_test.yaml:
--------------------------------------------------------------------------------
1 | task: mmbench_cn_test
2 | test_split: test
3 | metric_list:
4 |   - metric: submission
5 |     aggregation: !function cn_utils.mmbench_aggregate_test_results
6 |     higher_is_better: true
7 | include: _default_template_mmbench_cn_yaml
8 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmbench/mmbench_en.yaml:
--------------------------------------------------------------------------------
1 | group: mmbench_en
2 | task:
3 |   - mmbench_en_dev
4 |   - mmbench_en_test
5 | metadata:
6 |   version: 0.0
7 |   sys_prompt: "There are several options:"
8 |   gpt_eval_model_name: "gpt-3.5-turbo-0613"
9 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmbench/mmbench_en_dev.yaml:
--------------------------------------------------------------------------------
 1 | task: "mmbench_en_dev"
 2 | test_split: dev
 3 | include: _default_template_mmbench_en_yaml
 4 | metric_list:
 5 |   - metric: gpt_eval_score
 6 |     aggregation: !function en_utils.mmbench_aggregate_dev_results_eval
 7 |     higher_is_better: true
 8 |   - metric: submission
 9 |     aggregation: !function en_utils.mmbench_aggregate_dev_results_submission
10 |     higher_is_better: true


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmbench/mmbench_en_test.yaml:
--------------------------------------------------------------------------------
1 | task: "mmbench_en_test"
2 | test_split: test
3 | include: _default_template_mmbench_en_yaml
4 | metric_list:
5 |   - metric: submission
6 |     aggregation: !function en_utils.mmbench_aggregate_test_results
7 |     higher_is_better: true
8 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mme/mme.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/MME
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "mme"
 5 | test_split: test
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.mme_doc_to_visual
 8 | doc_to_text: !function utils.mme_doc_to_text
 9 | doc_to_target: "answer"
10 | generation_kwargs:
11 |   max_new_tokens: 16
12 |   temperature: 0
13 |   top_p: 0
14 |   num_beams: 1
15 |   do_sample: false
16 | # The return value of process_results will be used by metrics
17 | process_results: !function utils.mme_process_results
18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
19 | metric_list:
20 |   - metric: mme_percetion_score
21 |     aggregation: !function utils.mme_aggregate_results
22 |     higher_is_better: true
23 |   - metric: mme_cognition_score
24 |     aggregation: !function utils.mme_aggregate_results
25 |     higher_is_better: true
26 | model_specific_prompt_kwargs:
27 |   default:
28 |     pre_prompt: ""
29 |     post_prompt: "\nAnswer the question using a single word or phrase."
30 |   qwen_vl:  
31 |     pre_prompt: ""
32 |     post_prompt: " Answer:"
33 |   otterhd:
34 |     pre_prompt: ""
35 |     post_prompt: " Answer:"
36 | metadata:
37 |   - version: 0.0
38 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmmu/mmmu.yaml:
--------------------------------------------------------------------------------
1 | group: mmmu
2 | task:
3 | - mmmu_val
4 | - mmmu_test
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmmu/mmmu_test.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/MMMU
 2 | task: "mmmu_test"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.mmmu_doc_to_visual
 6 | doc_to_text: !function utils.mmmu_doc_to_text
 7 | doc_to_target: "answer"
 8 | # The return value of process_results will be used by metrics
 9 | process_results: !function utils.mmmu_process_results
10 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
11 | generation_kwargs:
12 |   max_new_tokens: 16
13 |   image_aspect_ratio: original
14 | metric_list:
15 |   - metric: submission
16 |     aggregation: !function utils.mmmu_test_aggregate_results_for_submission
17 |     higher_is_better: true
18 | metadata:
19 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmmu/mmmu_val.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/MMMU
 2 | task: "mmmu_val"
 3 | test_split: validation
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.mmmu_doc_to_visual
 6 | doc_to_text: !function utils.mmmu_doc_to_text
 7 | doc_to_target: "answer"
 8 | # The return value of process_results will be used by metrics
 9 | process_results: !function utils.mmmu_process_results
10 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
11 | generation_kwargs:
12 |   max_new_tokens: 16
13 |   image_aspect_ratio: original
14 | metric_list:
15 |   - metric: mmmu_acc
16 |     aggregation: !function utils.mmmu_aggregate_results
17 |     higher_is_better: true
18 | metadata:
19 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/mmvet/mmvet.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/MMVet
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "mmvet"
 5 | test_split: test
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.mmvet_doc_to_visual
 8 | doc_to_text: !function utils.doc_to_text # Such that {{question}} will be replaced by doc["question"]
 9 | doc_to_target: "{{answer}}"
10 | generation_kwargs:
11 |   until:
12 |     - "ASSISTANT:"
13 |   max_new_tokens: 1024
14 |   temperature: 0
15 |   top_p: 0
16 |   num_beams: 1
17 |   do_sample: false
18 | process_results: !function utils.mmvet_process_results # apply gpt eval here
19 | metric_list:
20 |   - metric: gpt_eval_score
21 |     aggregation: !function utils.mmvet_aggregate_results
22 |     higher_is_better: true
23 | metadata:
24 |   version: 0.0
25 |   gpt_eval_model_name: "gpt-4"
26 | model_specific_prompt_kwargs:
27 |   default:
28 |     pre_prompt: ""
29 |     post_prompt: ""
30 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multidocvqa/multidocvqa.yaml:
--------------------------------------------------------------------------------
1 | group: multidocvqa
2 | task:
3 | - multidocvqa_val
4 | - multidocvqa_test
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multidocvqa/multidocvqa_test.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/MP-DocVQA
 2 | task: "multidocvqa_test"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.multidocvqa_doc_to_visual
 6 | doc_to_text: !function utils.multidocvqa_doc_to_text
 7 | doc_to_target: "answers"
 8 | generation_kwargs:
 9 |   max_new_tokens: 32
10 |   temperature: 0
11 |   do_sample: False
12 | process_results: !function utils.multidocvqa_process_test_results_for_submission
13 | metric_list:
14 |   - metric: submission
15 |     aggregation: !function utils.multidocvqa_test_aggregate_results_for_submission
16 | model_specific_prompt_kwargs:
17 |   default:
18 |     pre_prompt: ""
19 |     post_prompt: "\nAnswer the question using a single word or phrase."
20 |   


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multidocvqa/multidocvqa_val.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/MP-DocVQA
 2 | task: "multidocvqa_val"
 3 | test_split: val
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.multidocvqa_doc_to_visual
 6 | doc_to_text: !function utils.multidocvqa_doc_to_text
 7 | doc_to_target: "answers"
 8 | generation_kwargs:
 9 |   max_new_tokens: 32
10 |   temperature: 0
11 |   do_sample: False
12 | process_results: !function utils.multidocvqa_process_results
13 | metric_list:
14 |   - metric: anls
15 |     aggregation: !function utils.multidocvqa_aggregate_results_anls
16 |     higher_is_better: true
17 |   - metric: accuracy
18 |     aggregation: !function utils.multidocvqa_aggregate_results_accuracy
19 |     higher_is_better: true
20 | model_specific_prompt_kwargs:
21 |   default:
22 |     pre_prompt: ""
23 |     post_prompt: "\nAnswer the question using a single word or phrase."
24 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/README.md:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/_generate_configs.py:
--------------------------------------------------------------------------------
 1 | from datasets import load_dataset
 2 | 
 3 | # dataset = load_dataset("gagan3012/multilingual-llava-bench")
 4 | 
 5 | configs = ['arabic', 'bengali', 'chinese', 'french', 'hindi', 'japanese', 'russian', 'spanish', 'urdu']
 6 | 
 7 | for config in configs:
 8 |     yaml_output = f"""
 9 |     dataset_path: "gagan3012/multilingual-llava-bench"
10 |     dataset_kwargs:
11 |         config: {config}
12 |         token: True
13 |     task: "llava_in_the_wild_{config}"
14 |     test_split: train
15 |     output_type: generate_until
16 |     doc_to_visual: !function utils.llava_doc_to_visual
17 |     doc_to_text: !function utils.llava_doc_to_text
18 |     doc_to_target: "gpt_answer"
19 |     generation_kwargs:
20 |     until:
21 |         - "ASSISTANT:"
22 |     image_aspect_ratio: original
23 |     max_new_tokens: 1024
24 |     temperature: 0
25 |     top_p: 0
26 |     num_beams: 1
27 |     do_sample: false
28 |     process_results: !function utils.llava_process_results
29 |     metric_list:
30 |     - metric: gpt_eval_llava_all
31 |         aggregation: !function utils.llava_all_aggregation
32 |         higher_is_better: true
33 |     - metric: gpt_eval_llava_conv
34 |         aggregation: !function utils.llava_conv_aggregation
35 |         higher_is_better: true
36 |     - metric: gpt_eval_llava_detail
37 |         aggregation: !function utils.llava_detail_aggregation
38 |         higher_is_better: true
39 |     - metric: gpt_eval_llava_complex
40 |         aggregation: !function utils.llava_complex_aggregation
41 |         higher_is_better: true
42 |     metadata:
43 |     version: 0.0
44 |     gpt_eval_model_name: "gpt-4-0613"
45 |     model_specific_prompt_kwargs:
46 |     default:
47 |         pre_prompt: ""
48 |         post_prompt: ""
49 |     """
50 | 
51 |     with open(f"{config}_llava_in_the_wild.yaml", "w") as f:
52 |         f.write(yaml_output)
53 | 
54 | # Path: _generate_configs.py


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/arabic_llava_in_the_wild.yaml:
--------------------------------------------------------------------------------
 1 | 
 2 |     dataset_path: "gagan3012/multilingual-llava-bench"
 3 |     dataset_kwargs:
 4 |         config: arabic
 5 |         token: True
 6 |     task: "llava_in_the_wild_arabic"
 7 |     test_split: train
 8 |     output_type: generate_until
 9 |     doc_to_visual: !function utils.llava_doc_to_visual
10 |     doc_to_text: !function utils.llava_doc_to_text
11 |     doc_to_target: "gpt_answer"
12 |     generation_kwargs:
13 |     until:
14 |         - "ASSISTANT:"
15 |     image_aspect_ratio: original
16 |     max_new_tokens: 1024
17 |     temperature: 0
18 |     top_p: 0
19 |     num_beams: 1
20 |     do_sample: false
21 |     process_results: !function utils.llava_process_results
22 |     metric_list:
23 |     - metric: gpt_eval_llava_all
24 |         aggregation: !function utils.llava_all_aggregation
25 |         higher_is_better: true
26 |     - metric: gpt_eval_llava_conv
27 |         aggregation: !function utils.llava_conv_aggregation
28 |         higher_is_better: true
29 |     - metric: gpt_eval_llava_detail
30 |         aggregation: !function utils.llava_detail_aggregation
31 |         higher_is_better: true
32 |     - metric: gpt_eval_llava_complex
33 |         aggregation: !function utils.llava_complex_aggregation
34 |         higher_is_better: true
35 |     metadata:
36 |     version: 0.0
37 |     gpt_eval_model_name: "gpt-4-0613"
38 |     model_specific_prompt_kwargs:
39 |     default:
40 |         pre_prompt: ""
41 |         post_prompt: ""
42 |     


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/bengali_llava_in_the_wild.yaml:
--------------------------------------------------------------------------------
 1 | 
 2 |     dataset_path: "gagan3012/multilingual-llava-bench"
 3 |     dataset_kwargs:
 4 |         config: bengali
 5 |         token: True
 6 |     task: "llava_in_the_wild_bengali"
 7 |     test_split: train
 8 |     output_type: generate_until
 9 |     doc_to_visual: !function utils.llava_doc_to_visual
10 |     doc_to_text: !function utils.llava_doc_to_text
11 |     doc_to_target: "gpt_answer"
12 |     generation_kwargs:
13 |     until:
14 |         - "ASSISTANT:"
15 |     image_aspect_ratio: original
16 |     max_new_tokens: 1024
17 |     temperature: 0
18 |     top_p: 0
19 |     num_beams: 1
20 |     do_sample: false
21 |     process_results: !function utils.llava_process_results
22 |     metric_list:
23 |     - metric: gpt_eval_llava_all
24 |         aggregation: !function utils.llava_all_aggregation
25 |         higher_is_better: true
26 |     - metric: gpt_eval_llava_conv
27 |         aggregation: !function utils.llava_conv_aggregation
28 |         higher_is_better: true
29 |     - metric: gpt_eval_llava_detail
30 |         aggregation: !function utils.llava_detail_aggregation
31 |         higher_is_better: true
32 |     - metric: gpt_eval_llava_complex
33 |         aggregation: !function utils.llava_complex_aggregation
34 |         higher_is_better: true
35 |     metadata:
36 |     version: 0.0
37 |     gpt_eval_model_name: "gpt-4-0613"
38 |     model_specific_prompt_kwargs:
39 |     default:
40 |         pre_prompt: ""
41 |         post_prompt: ""
42 |     


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/chinese_llava_in_the_wild.yaml:
--------------------------------------------------------------------------------
 1 | 
 2 |     dataset_path: "gagan3012/multilingual-llava-bench"
 3 |     dataset_kwargs:
 4 |         config: chinese
 5 |         token: True
 6 |     task: "llava_in_the_wild_chinese"
 7 |     test_split: train
 8 |     output_type: generate_until
 9 |     doc_to_visual: !function utils.llava_doc_to_visual
10 |     doc_to_text: !function utils.llava_doc_to_text
11 |     doc_to_target: "gpt_answer"
12 |     generation_kwargs:
13 |     until:
14 |         - "ASSISTANT:"
15 |     image_aspect_ratio: original
16 |     max_new_tokens: 1024
17 |     temperature: 0
18 |     top_p: 0
19 |     num_beams: 1
20 |     do_sample: false
21 |     process_results: !function utils.llava_process_results
22 |     metric_list:
23 |     - metric: gpt_eval_llava_all
24 |         aggregation: !function utils.llava_all_aggregation
25 |         higher_is_better: true
26 |     - metric: gpt_eval_llava_conv
27 |         aggregation: !function utils.llava_conv_aggregation
28 |         higher_is_better: true
29 |     - metric: gpt_eval_llava_detail
30 |         aggregation: !function utils.llava_detail_aggregation
31 |         higher_is_better: true
32 |     - metric: gpt_eval_llava_complex
33 |         aggregation: !function utils.llava_complex_aggregation
34 |         higher_is_better: true
35 |     metadata:
36 |     version: 0.0
37 |     gpt_eval_model_name: "gpt-4-0613"
38 |     model_specific_prompt_kwargs:
39 |     default:
40 |         pre_prompt: ""
41 |         post_prompt: ""
42 |     


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/french_llava_in_the_wild.yaml:
--------------------------------------------------------------------------------
 1 | 
 2 |     dataset_path: "gagan3012/multilingual-llava-bench"
 3 |     dataset_kwargs:
 4 |         config: french
 5 |         token: True
 6 |     task: "llava_in_the_wild_french"
 7 |     test_split: train
 8 |     output_type: generate_until
 9 |     doc_to_visual: !function utils.llava_doc_to_visual
10 |     doc_to_text: !function utils.llava_doc_to_text
11 |     doc_to_target: "gpt_answer"
12 |     generation_kwargs:
13 |     until:
14 |         - "ASSISTANT:"
15 |     image_aspect_ratio: original
16 |     max_new_tokens: 1024
17 |     temperature: 0
18 |     top_p: 0
19 |     num_beams: 1
20 |     do_sample: false
21 |     process_results: !function utils.llava_process_results
22 |     metric_list:
23 |     - metric: gpt_eval_llava_all
24 |         aggregation: !function utils.llava_all_aggregation
25 |         higher_is_better: true
26 |     - metric: gpt_eval_llava_conv
27 |         aggregation: !function utils.llava_conv_aggregation
28 |         higher_is_better: true
29 |     - metric: gpt_eval_llava_detail
30 |         aggregation: !function utils.llava_detail_aggregation
31 |         higher_is_better: true
32 |     - metric: gpt_eval_llava_complex
33 |         aggregation: !function utils.llava_complex_aggregation
34 |         higher_is_better: true
35 |     metadata:
36 |     version: 0.0
37 |     gpt_eval_model_name: "gpt-4-0613"
38 |     model_specific_prompt_kwargs:
39 |     default:
40 |         pre_prompt: ""
41 |         post_prompt: ""
42 |     


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/hindi_llava_in_the_wild.yaml:
--------------------------------------------------------------------------------
 1 | 
 2 |     dataset_path: "gagan3012/multilingual-llava-bench"
 3 |     dataset_kwargs:
 4 |         config: hindi
 5 |         token: True
 6 |     task: "llava_in_the_wild_hindi"
 7 |     test_split: train
 8 |     output_type: generate_until
 9 |     doc_to_visual: !function utils.llava_doc_to_visual
10 |     doc_to_text: !function utils.llava_doc_to_text
11 |     doc_to_target: "gpt_answer"
12 |     generation_kwargs:
13 |     until:
14 |         - "ASSISTANT:"
15 |     image_aspect_ratio: original
16 |     max_new_tokens: 1024
17 |     temperature: 0
18 |     top_p: 0
19 |     num_beams: 1
20 |     do_sample: false
21 |     process_results: !function utils.llava_process_results
22 |     metric_list:
23 |     - metric: gpt_eval_llava_all
24 |         aggregation: !function utils.llava_all_aggregation
25 |         higher_is_better: true
26 |     - metric: gpt_eval_llava_conv
27 |         aggregation: !function utils.llava_conv_aggregation
28 |         higher_is_better: true
29 |     - metric: gpt_eval_llava_detail
30 |         aggregation: !function utils.llava_detail_aggregation
31 |         higher_is_better: true
32 |     - metric: gpt_eval_llava_complex
33 |         aggregation: !function utils.llava_complex_aggregation
34 |         higher_is_better: true
35 |     metadata:
36 |     version: 0.0
37 |     gpt_eval_model_name: "gpt-4-0613"
38 |     model_specific_prompt_kwargs:
39 |     default:
40 |         pre_prompt: ""
41 |         post_prompt: ""
42 |     


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/japanese_llava_in_the_wild.yaml:
--------------------------------------------------------------------------------
 1 | 
 2 |     dataset_path: "gagan3012/multilingual-llava-bench"
 3 |     dataset_kwargs:
 4 |         config: japanese
 5 |         token: True
 6 |     task: "llava_in_the_wild_japanese"
 7 |     test_split: train
 8 |     output_type: generate_until
 9 |     doc_to_visual: !function utils.llava_doc_to_visual
10 |     doc_to_text: !function utils.llava_doc_to_text
11 |     doc_to_target: "gpt_answer"
12 |     generation_kwargs:
13 |     until:
14 |         - "ASSISTANT:"
15 |     image_aspect_ratio: original
16 |     max_new_tokens: 1024
17 |     temperature: 0
18 |     top_p: 0
19 |     num_beams: 1
20 |     do_sample: false
21 |     process_results: !function utils.llava_process_results
22 |     metric_list:
23 |     - metric: gpt_eval_llava_all
24 |         aggregation: !function utils.llava_all_aggregation
25 |         higher_is_better: true
26 |     - metric: gpt_eval_llava_conv
27 |         aggregation: !function utils.llava_conv_aggregation
28 |         higher_is_better: true
29 |     - metric: gpt_eval_llava_detail
30 |         aggregation: !function utils.llava_detail_aggregation
31 |         higher_is_better: true
32 |     - metric: gpt_eval_llava_complex
33 |         aggregation: !function utils.llava_complex_aggregation
34 |         higher_is_better: true
35 |     metadata:
36 |     version: 0.0
37 |     gpt_eval_model_name: "gpt-4-0613"
38 |     model_specific_prompt_kwargs:
39 |     default:
40 |         pre_prompt: ""
41 |         post_prompt: ""
42 |     


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/russian_llava_in_the_wild.yaml:
--------------------------------------------------------------------------------
 1 | 
 2 |     dataset_path: "gagan3012/multilingual-llava-bench"
 3 |     dataset_kwargs:
 4 |         config: russian
 5 |         token: True
 6 |     task: "llava_in_the_wild_russian"
 7 |     test_split: train
 8 |     output_type: generate_until
 9 |     doc_to_visual: !function utils.llava_doc_to_visual
10 |     doc_to_text: !function utils.llava_doc_to_text
11 |     doc_to_target: "gpt_answer"
12 |     generation_kwargs:
13 |     until:
14 |         - "ASSISTANT:"
15 |     image_aspect_ratio: original
16 |     max_new_tokens: 1024
17 |     temperature: 0
18 |     top_p: 0
19 |     num_beams: 1
20 |     do_sample: false
21 |     process_results: !function utils.llava_process_results
22 |     metric_list:
23 |     - metric: gpt_eval_llava_all
24 |         aggregation: !function utils.llava_all_aggregation
25 |         higher_is_better: true
26 |     - metric: gpt_eval_llava_conv
27 |         aggregation: !function utils.llava_conv_aggregation
28 |         higher_is_better: true
29 |     - metric: gpt_eval_llava_detail
30 |         aggregation: !function utils.llava_detail_aggregation
31 |         higher_is_better: true
32 |     - metric: gpt_eval_llava_complex
33 |         aggregation: !function utils.llava_complex_aggregation
34 |         higher_is_better: true
35 |     metadata:
36 |     version: 0.0
37 |     gpt_eval_model_name: "gpt-4-0613"
38 |     model_specific_prompt_kwargs:
39 |     default:
40 |         pre_prompt: ""
41 |         post_prompt: ""
42 |     


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/spanish_llava_in_the_wild.yaml:
--------------------------------------------------------------------------------
 1 | 
 2 |     dataset_path: "gagan3012/multilingual-llava-bench"
 3 |     dataset_kwargs:
 4 |         config: spanish
 5 |         token: True
 6 |     task: "llava_in_the_wild_spanish"
 7 |     test_split: train
 8 |     output_type: generate_until
 9 |     doc_to_visual: !function utils.llava_doc_to_visual
10 |     doc_to_text: !function utils.llava_doc_to_text
11 |     doc_to_target: "gpt_answer"
12 |     generation_kwargs:
13 |     until:
14 |         - "ASSISTANT:"
15 |     image_aspect_ratio: original
16 |     max_new_tokens: 1024
17 |     temperature: 0
18 |     top_p: 0
19 |     num_beams: 1
20 |     do_sample: false
21 |     process_results: !function utils.llava_process_results
22 |     metric_list:
23 |     - metric: gpt_eval_llava_all
24 |         aggregation: !function utils.llava_all_aggregation
25 |         higher_is_better: true
26 |     - metric: gpt_eval_llava_conv
27 |         aggregation: !function utils.llava_conv_aggregation
28 |         higher_is_better: true
29 |     - metric: gpt_eval_llava_detail
30 |         aggregation: !function utils.llava_detail_aggregation
31 |         higher_is_better: true
32 |     - metric: gpt_eval_llava_complex
33 |         aggregation: !function utils.llava_complex_aggregation
34 |         higher_is_better: true
35 |     metadata:
36 |     version: 0.0
37 |     gpt_eval_model_name: "gpt-4-0613"
38 |     model_specific_prompt_kwargs:
39 |     default:
40 |         pre_prompt: ""
41 |         post_prompt: ""
42 |     


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/urdu_llava_in_the_wild.yaml:
--------------------------------------------------------------------------------
 1 | 
 2 |     dataset_path: "gagan3012/multilingual-llava-bench"
 3 |     dataset_kwargs:
 4 |         config: urdu
 5 |         token: True
 6 |     task: "llava_in_the_wild_urdu"
 7 |     test_split: train
 8 |     output_type: generate_until
 9 |     doc_to_visual: !function utils.llava_doc_to_visual
10 |     doc_to_text: !function utils.llava_doc_to_text
11 |     doc_to_target: "gpt_answer"
12 |     generation_kwargs:
13 |     until:
14 |         - "ASSISTANT:"
15 |     image_aspect_ratio: original
16 |     max_new_tokens: 1024
17 |     temperature: 0
18 |     top_p: 0
19 |     num_beams: 1
20 |     do_sample: false
21 |     process_results: !function utils.llava_process_results
22 |     metric_list:
23 |     - metric: gpt_eval_llava_all
24 |         aggregation: !function utils.llava_all_aggregation
25 |         higher_is_better: true
26 |     - metric: gpt_eval_llava_conv
27 |         aggregation: !function utils.llava_conv_aggregation
28 |         higher_is_better: true
29 |     - metric: gpt_eval_llava_detail
30 |         aggregation: !function utils.llava_detail_aggregation
31 |         higher_is_better: true
32 |     - metric: gpt_eval_llava_complex
33 |         aggregation: !function utils.llava_complex_aggregation
34 |         higher_is_better: true
35 |     metadata:
36 |     version: 0.0
37 |     gpt_eval_model_name: "gpt-4-0613"
38 |     model_specific_prompt_kwargs:
39 |     default:
40 |         pre_prompt: ""
41 |         post_prompt: ""
42 |     


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/nocaps/_default_template_nocaps_yaml:
--------------------------------------------------------------------------------
1 | model_specific_prompt_kwargs:
2 |   default:
3 |     prompt: "Provide a one-sentence caption for the provided image."


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/nocaps/nocaps.yaml:
--------------------------------------------------------------------------------
1 | group : nocaps
2 | task:
3 |   - nocaps_test
4 |   - nocaps_val


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/nocaps/nocaps_test.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/NoCaps
 2 | dataset_kwargs:
 3 |   token: True
 4 | task : "nocaps_test"
 5 | group : "nocaps_caption"
 6 | test_split: test
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.nocaps_doc_to_visual
 9 | doc_to_text: !function utils.nocaps_doc_to_text
10 | doc_to_target: "annotations_captions"
11 | generation_kwargs:
12 |   max_new_tokens: 64
13 |   temperature: 0
14 |   top_p: 0
15 |   num_beams: 1
16 |   do_sample: false
17 | process_results: !function utils.nocaps_test_process_result
18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
19 | metric_list:
20 |   - metric: nocaps_passthrough 
21 |     aggregation : !function utils.nocaps_test_aggregation_result
22 |     higher_is_better : true
23 | metadata:
24 |   - version: 0.0
25 | include: _default_template_nocaps_yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/nocaps/nocaps_val.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/NoCaps
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "nocaps_val"
 5 | group : "nocaps_caption"
 6 | test_split: validation
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.nocaps_doc_to_visual
 9 | doc_to_text: !function utils.nocaps_doc_to_text
10 | doc_to_target: "annotations_captions"
11 | generation_kwargs:
12 |   max_new_tokens: 64
13 |   temperature: 0
14 |   top_p: 0
15 |   num_beams: 1
16 |   do_sample: false
17 | process_results: !function utils.nocaps_process_result
18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
19 | metric_list:
20 |   - metric: nocaps_Bleu_4 
21 |     aggregation : !function utils.nocaps_bleu4
22 |     higher_is_better : true
23 |   - metric: nocaps_Bleu_3
24 |     aggregation : !function utils.nocaps_bleu3
25 |     higher_is_better : true
26 |   - metric: nocaps_Bleu_2
27 |     aggregation : !function utils.nocaps_bleu2
28 |     higher_is_better : true
29 |   - metric: nocaps_Bleu_1
30 |     aggregation : !function utils.nocaps_bleu1
31 |     higher_is_better : true
32 |   - metric: nocaps_METEOR
33 |     aggregation : !function utils.nocaps_meteor
34 |     higher_is_better : true
35 |   - metric: nocaps_ROUGE_L
36 |     aggregation : !function utils.nocaps_rougel
37 |     higher_is_better : true
38 |   - metric: nocaps_CIDEr
39 |     aggregation : !function utils.nocaps_cider
40 |     higher_is_better : true
41 |   #- metric: nocaps_SPICE
42 |   #  aggregation : !function utils.nocaps_spice
43 |   #  higher_is_better : true
44 | metadata:
45 |   - version: 0.0
46 | include: _default_template_nocaps_yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/ocrbench/ocrbench.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: echo840/OCRBench
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "ocrbench"
 5 | test_split: test
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.ocrbench_doc_to_visual
 8 | doc_to_text: !function utils.ocrbench_doc_to_text
 9 | doc_to_target: "answer"
10 | generation_kwargs:
11 |   max_new_tokens: 128
12 |   temperature: 0
13 |   top_p: 0
14 |   num_beams: 1
15 |   do_sample: false
16 | process_results: !function utils.ocrbench_process_results
17 | metric_list:
18 |   - metric: ocrbench_accuracy
19 |     aggregation: !function utils.ocrbench_aggregate_accuracy
20 |     higher_is_better: true
21 | metadata:
22 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/ok_vqa/_default_template_vqa_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/OK-VQA
 2 | output_type: generate_until
 3 | doc_to_visual: !function utils.ok_vqa_doc_to_visual
 4 | doc_to_text: !function utils.ok_vqa_doc_to_text
 5 | doc_to_target: "answer"
 6 | generation_kwargs:
 7 |   until:
 8 |     - "ASSISTANT:"
 9 | metric_list:
10 |   - metric: exact_match
11 |     aggregation: mean
12 |     higher_is_better: true
13 |     ignore_case: true
14 |     ignore_punctuation: true
15 |   - metric: submission
16 |     aggregation: !function utils.ok_vqa_aggreate_submissions
17 |     higher_is_better: true
18 | process_results: !function utils.ok_vqa_process_results
19 | model_specific_prompt_kwargs:
20 |   default:
21 |     pre_prompt: ""
22 |     post_prompt: "\nWhen the provided information is insufficient, respond with 'Unanswerable'.\nAnswer the question using a single word or phrase."
23 | metadata:
24 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/ok_vqa/_generate_config.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import yaml
 3 | 
 4 | splits = ["val2014"]
 5 | tasks = ["vqa"]
 6 | 
 7 | if __name__ == "__main__":
 8 |     dump_tasks = []
 9 |     for task in tasks:
10 |         for split in splits:
11 |             yaml_dict = {"group": f"ok_vqa", "task": f"ok_vqa_{split}", "include": f"_default_template_{task}_yaml", "test_split": split}
12 |             if split == "train":
13 |                 yaml_dict.pop("group")
14 |             else:
15 |                 dump_tasks.append(f"ok_vqa_{split}")
16 | 
17 |             save_path = f"./ok_vqa_{split}.yaml"
18 |             print(f"Saving to {save_path}")
19 |             with open(save_path, "w") as f:
20 |                 yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False)
21 | 
22 |     group_dict = {"group": "ok_vqa", "task": dump_tasks}
23 | 
24 |     with open("./_ok_vqa.yaml", "w") as f:
25 |         yaml.dump(group_dict, f, default_flow_style=False, indent=4)
26 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/ok_vqa/_ok_vqa.yaml:
--------------------------------------------------------------------------------
1 | group: ok_vqa
2 | task:
3 | - ok_vqa_val2014


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/ok_vqa/ok_vqa_val2014.yaml:
--------------------------------------------------------------------------------
1 | group: ok_vqa
2 | task: ok_vqa_val2014
3 | test_split: val2014
4 | include: _default_template_vqa_yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/ok_vqa/utils.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import os
 3 | import json
 4 | import yaml
 5 | import pathlib
 6 | import logging
 7 | import datetime
 8 | import statistics
 9 | 
10 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
11 | from lmms_eval.tasks._task_utils.vqa_eval_metric import EvalAIAnswerProcessor
12 | 
13 | eval_logger = logging.getLogger("lmms-eval")
14 | 
15 | 
16 | def ok_vqa_doc_to_visual(doc):
17 |     return [doc["image"].convert("RGB")]
18 | 
19 | 
20 | def ok_vqa_process_results(doc, result):
21 |     eval_ai_processor = EvalAIAnswerProcessor()
22 |     assert len(result) == 1, f"The result should be a list of length 1, but got {len(result)}."
23 |     resAns = eval_ai_processor(result[0])
24 |     accuracy = 0
25 | 
26 |     if "answers" in doc and doc["answers"] is not None:
27 |         gtAcc = []
28 | 
29 |         for i in range(len(doc["answers"])):
30 |             doc["answers"][i] = eval_ai_processor(doc["answers"][i])
31 | 
32 |         for i in range(len(doc["answers"])):
33 |             otherGTAns = [doc["answers"][j] for j in range(len(doc["answers"])) if i != j]
34 |             matchingAns = [item for item in otherGTAns if item == resAns]
35 |             acc = min(1, float(len(matchingAns)) / 3)
36 |             gtAcc.append(acc)
37 |         if gtAcc:
38 |             accuracy = statistics.mean(gtAcc)
39 |         else:
40 |             accuracy = 0
41 | 
42 |     return {
43 |         "exact_match": accuracy,
44 |         "submission": {
45 |             "image": f"{doc['question_id']}.jpg",
46 |             "answer": resAns,
47 |         },
48 |     }
49 | 
50 | 
51 | def ok_vqa_doc_to_text(doc, model_specific_prompt_kwargs=None):
52 |     question = doc["question"]
53 |     if model_specific_prompt_kwargs is None:
54 |         model_specific_prompt_kwargs = {}
55 |     pre_prompt = ""
56 |     post_prompt = ""
57 |     if "pre_prompt" in model_specific_prompt_kwargs:
58 |         pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
59 |     if "post_prompt" in model_specific_prompt_kwargs:
60 |         post_prompt = model_specific_prompt_kwargs["post_prompt"]
61 |     return f"{pre_prompt}{question}{post_prompt}"
62 | 
63 | 
64 | def ok_vqa_aggreate_submissions(results, args):
65 |     now_date_time = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S")
66 |     file = f"ok_vqa-test-submission-{now_date_time}.json"
67 |     path = generate_submission_file(file, args)
68 |     with open(path, "w") as f:
69 |         json.dump(results, f)
70 |     print(f"Submission file saved to {path}")
71 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/olympiadbench/cn_utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json
 3 | import datetime
 4 | from lmms_eval.tasks.olympiadbench.olympiadbench_evals import OlympiadBenchEvaluator
 5 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
 6 | 
 7 | import logging
 8 | 
 9 | eval_logger = logging.getLogger("lmms-eval")
10 | dir_name = os.path.dirname(os.path.abspath(__file__))
11 | 
12 | olympiadbench_evaluator = OlympiadBenchEvaluator()
13 | 
14 | 
15 | def olympiadbench_doc_to_visual(doc):
16 |     return [image.convert("RGB") for image in doc["images"]]
17 | 
18 | 
19 | def olympiadbench_doc_to_text(doc):
20 |     question = doc["question"]
21 |     subject = doc["subfield"]
22 |     mul_ans = doc["is_multiple_answer"]
23 |     if mul_ans is None:
24 |         mul_ans = False
25 |     ans_type = doc["answer_type"]
26 |     if ans_type == "Need_human_evaluate":
27 |         ans_type = "proof based"
28 | 
29 |     pre_prompt = f"以下是中国{subject}竞赛中的解答题。\n"
30 | 
31 |     post_prompt = ""
32 |     if not mul_ans:
33 |         post_prompt += f"答案类型为{ans_type}。\n"
34 |     else:
35 |         post_prompt += f"题目有多个答案，答案类型均为{ans_type}。\n"
36 |     post_prompt += "请根据题目的要求和所提供的信息计算得出答案。解答过程和结果中使用的变量和公式请使用LaTeX格式表示。请在最后以"
37 |     if not mul_ans:
38 |         post_prompt += '"所以最终答案是\\boxed{答案}。"\n'
39 |     else:
40 |         post_prompt += '"所以最终答案是\\boxed{用英⽂逗号连接的多个答案}。"\n'
41 | 
42 |     final_question = pre_prompt + question + "\n" + post_prompt
43 |     return final_question
44 | 
45 | 
46 | def olympiadbench_process_results(doc, results):
47 |     precision = doc["error"]
48 |     is_proving = "TP" in doc["source"]
49 |     if precision is None:
50 |         precision = 0
51 |     prediction = results[0].strip()
52 | 
53 |     if is_proving:
54 |         return {"submission": prediction}
55 |     else:
56 |         prediction = prediction.split("所以最终答案是")[-1]
57 |         prediction = prediction.replace('"', "").replace("\n", "").replace(" ", "").strip(".").strip("。")
58 |         accuracy = olympiadbench_evaluator.judge(prediction, doc["final_answer"][0], precision)
59 |         accuracy = int(accuracy)
60 |         return {"exact_match": accuracy}
61 | 
62 | 
63 | def olympiadbench_aggregate_results(results, args):
64 |     now_date_time = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S")
65 |     submission_file_name = f"olympiadbench-test-cn-submission-{now_date_time}.json"
66 |     path = generate_submission_file(submission_file_name, args)
67 |     with open(path, "w") as f:
68 |         json.dump(results, f, ensure_ascii=False)
69 |     print(f"Submission file saved to {path}")
70 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/olympiadbench/olympiadbench.yaml:
--------------------------------------------------------------------------------
1 | group: olympiadbench
2 | task:
3 | - olympiadbench_test_en
4 | - olympiadbench_test_cn
5 | metadata:
6 |   - version: 0.0
7 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/olympiadbench/olympiadbench_test_cn.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/OlympiadBench
 2 | dataset_kwargs:
 3 |   token: True
 4 | task : "olympiadbench_test_cn"
 5 | test_split: test_cn
 6 | output_type: generate_until
 7 | doc_to_visual: !function cn_utils.olympiadbench_doc_to_visual
 8 | doc_to_text: !function cn_utils.olympiadbench_doc_to_text
 9 | doc_to_target: "answer"
10 | generation_kwargs:
11 |   until:
12 |     - "ASSISTANT:"
13 |   max_new_tokens: 1024
14 |   temperature: 0
15 |   top_p: 0
16 |   num_beams: 1
17 |   do_sample: false
18 | process_results: !function cn_utils.olympiadbench_process_results
19 | metric_list:
20 |   - metric: submission
21 |     aggregation: !function cn_utils.olympiadbench_aggregate_results
22 |     higher_is_better: true
23 |   - metric: exact_match
24 |     aggregation: mean
25 |     higher_is_better: true


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/olympiadbench/olympiadbench_test_en.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/OlympiadBench
 2 | dataset_kwargs:
 3 |   token: True
 4 | task : "olympiadbench_test_en"
 5 | test_split: test_en
 6 | output_type: generate_until
 7 | doc_to_visual: !function en_utils.olympiadbench_doc_to_visual
 8 | doc_to_text: !function en_utils.olympiadbench_doc_to_text
 9 | doc_to_target: "answer"
10 | generation_kwargs:
11 |   until:
12 |     - "ASSISTANT:"
13 |   max_new_tokens: 1024
14 |   temperature: 0
15 |   top_p: 0
16 |   num_beams: 1
17 |   do_sample: false
18 | process_results: !function en_utils.olympiadbench_process_results
19 | metric_list:
20 |   - metric: submission
21 |     aggregation: !function en_utils.olympiadbench_aggregate_results
22 |     higher_is_better: true
23 |   - metric: exact_match
24 |     aggregation: mean
25 |     higher_is_better: true


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/pope/pope.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/POPE
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "pope"
 5 | test_split: test
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.pope_doc_to_visual
 8 | doc_to_text: !function utils.pope_doc_to_text
 9 | doc_to_target: "answer"
10 | generation_kwargs:
11 |   max_new_tokens: 128
12 |   temperature: 0
13 |   top_p: 0
14 |   num_beams: 1
15 |   do_sample: false
16 | process_results: !function utils.pope_process_results
17 | metric_list:
18 |   - metric: pope_accuracy
19 |     aggregation: !function utils.pope_aggregate_accuracy
20 |     higher_is_better: true
21 |   - metric: pope_precision
22 |     aggregation: !function utils.pope_aggregate_precision
23 |     higher_is_better: true
24 |   - metric: pope_recall
25 |     aggregation: !function utils.pope_aggregate_recall
26 |     higher_is_better: true
27 |   - metric: pope_f1_score
28 |     aggregation: !function utils.pope_aggregate_f1_score
29 |     higher_is_better: true
30 |   - metric: pope_yes_ratio
31 |     aggregation: !function utils.pope_aggregate_yes_ratio
32 |     higher_is_better: true
33 | metadata:
34 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/realworldqa/realworldqa.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/RealWorldQA
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "realworldqa"
 5 | test_split: test
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.realworldqa_doc_to_visual
 8 | doc_to_text: !function utils.realworldqa_doc_to_text
 9 | doc_to_target: "answer"
10 | 
11 | generation_kwargs:
12 |   max_new_tokens: 16
13 |   temperature: 0
14 |   top_p: 0
15 |   num_beams: 1
16 |   do_sample: false
17 | 
18 | filter_list:
19 |   - name: "flexible-extract"
20 |     filter:
21 |       - function: !function utils.NumberWordsToDigitsFilter
22 |       - function: !function utils.MultiChoiceRegexFilter
23 |         group_select: 0
24 |         ignore_case: true
25 |         ignore_punctuation: true
26 |         regex_pattern: "(\\([A-Z]\\))"
27 | 
28 | metric_list:
29 |   - metric: exact_match
30 |     aggregation: mean
31 |     higher_is_better: true
32 |     ignore_case: true
33 |     ignore_punctuation: true
34 |       
35 | model_specific_prompt_kwargs:
36 |   default:
37 |     pre_prompt: ""
38 |     post_prompt: ""
39 |   gpt4v:
40 |     pre_prompt: ""
41 |     post_prompt: ""
42 | metadata:
43 |   - version: 0.0
44 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco+/_default_template_bbox_rec_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/RefCOCOPlus
 2 | output_type: generate_until
 3 | process_docs: !function utils_rec.refcoco_bbox_rec_preprocess_dataset
 4 | doc_to_visual: !function utils_rec.refcoco_bbox_rec_doc_to_visual
 5 | doc_to_text: !function utils_rec.refcoco_bbox_rec_doc_to_text
 6 | doc_to_target: "bbox"
 7 | generation_kwargs:
 8 |   until:
 9 |     - "ASSISTANT:"
10 | process_results: !function utils_rec.refcoco_bbox_rec_process_result
11 | metric_list:
12 |   - metric: refcoco_IoU
13 |     aggregation : !function utils_rec.refcoco_bbox_rec_iou
14 |     higher_is_better : true
15 |   - metric: refcoco_ACC@0.1
16 |     aggregation : !function utils_rec.refcoco_bbox_rec_acc01
17 |     higher_is_better : true
18 |   - metric: refcoco_ACC@0.3
19 |     aggregation : !function utils_rec.refcoco_bbox_rec_acc03
20 |     higher_is_better : true
21 |   - metric: refcoco_ACC@0.5
22 |     aggregation : !function utils_rec.refcoco_bbox_rec_acc05
23 |     higher_is_better : true
24 |   - metric: refcoco_ACC@0.7
25 |     aggregation : !function utils_rec.refcoco_bbox_rec_acc07
26 |     higher_is_better : true
27 |   - metric: refcoco_ACC@0.9
28 |     aggregation : !function utils_rec.refcoco_bbox_rec_acc09
29 |     higher_is_better : true
30 |   - metric: refcoco_Center_ACC
31 |     aggregation : !function utils_rec.refcoco_bbox_rec_center_acc
32 |     higher_is_better : true
33 | metadata:
34 |   version: '0.0'


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco+/_default_template_bbox_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/RefCOCOplus
 2 | output_type: generate_until
 3 | doc_to_visual: !function utils.refcoco_bbox_doc_to_visual
 4 | doc_to_text: !function utils.refcoco_doc_to_text
 5 | doc_to_target: "answer"
 6 | generation_kwargs:
 7 |   until:
 8 |     - "ASSISTANT:"
 9 | process_results: !function utils.refcoco_process_result
10 | metric_list:
11 |   - metric: refcoco_Bleu_4 
12 |     aggregation : !function utils.refcoco_bleu4
13 |     higher_is_better : true
14 |   - metric: refcoco_Bleu_3
15 |     aggregation : !function utils.refcoco_bleu3
16 |     higher_is_better : true
17 |   - metric: refcoco_Bleu_2
18 |     aggregation : !function utils.refcoco_bleu2
19 |     higher_is_better : true
20 |   - metric: refcoco_Bleu_1
21 |     aggregation : !function utils.refcoco_bleu1
22 |     higher_is_better : true
23 |   - metric: refcoco_METEOR
24 |     aggregation : !function utils.refcoco_meteor
25 |     higher_is_better : true
26 |   - metric: refcoco_ROUGE_L
27 |     aggregation : !function utils.refcoco_rougel
28 |     higher_is_better : true
29 |   - metric: refcoco_CIDEr
30 |     aggregation : !function utils.refcoco_cider
31 |     higher_is_better : true
32 |   #- metric: refcoco_SPICE
33 |   #  aggregation : !function utils.refcoco_spice
34 |   #  higher_is_better : true
35 | metadata:
36 |   version: '0.0'


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco+/_default_template_seg_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/RefCOCOplus
 2 | output_type: generate_until
 3 | doc_to_visual: !function utils.refcoco_seg_doc_to_visual
 4 | doc_to_text: !function utils.refcoco_doc_to_text
 5 | doc_to_target: "answer"
 6 | generation_kwargs:
 7 |   until:
 8 |     - "ASSISTANT:"
 9 | process_results: !function utils.refcoco_process_result
10 | metric_list:
11 |   - metric: refcoco_Bleu_4 
12 |     aggregation : !function utils.refcoco_bleu4
13 |     higher_is_better : true
14 |   - metric: refcoco_Bleu_3
15 |     aggregation : !function utils.refcoco_bleu3
16 |     higher_is_better : true
17 |   - metric: refcoco_Bleu_2
18 |     aggregation : !function utils.refcoco_bleu2
19 |     higher_is_better : true
20 |   - metric: refcoco_Bleu_1
21 |     aggregation : !function utils.refcoco_bleu1
22 |     higher_is_better : true
23 |   - metric: refcoco_METEOR
24 |     aggregation : !function utils.refcoco_meteor
25 |     higher_is_better : true
26 |   - metric: refcoco_ROUGE_L
27 |     aggregation : !function utils.refcoco_rougel
28 |     higher_is_better : true
29 |   - metric: refcoco_CIDEr
30 |     aggregation : !function utils.refcoco_cider
31 |     higher_is_better : true
32 |   #- metric: refcoco_SPICE
33 |   #  aggregation : !function utils.refcoco_spice
34 |   #  higher_is_better : true
35 | metadata:
36 |   version: '0.0'


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco+/_generate_config.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import yaml
 3 | 
 4 | # splits = ["train", "val", "testA", "testB"]
 5 | splits = ["val", "testA", "testB"]
 6 | tasks = ["seg", "bbox"]
 7 | 
 8 | if __name__ == "__main__":
 9 |     dump_tasks = []
10 |     for task in tasks:
11 |         for split in splits:
12 |             yaml_dict = {"group": f"refcoco+_{task}", "task": f"refcoco+_{task}_{split}", "include": f"_default_template_{task}_yaml", "test_split": split}
13 |             if split == "train":
14 |                 yaml_dict.pop("group")
15 |             else:
16 |                 dump_tasks.append(f"refcoco_{task}_{split}")
17 | 
18 |             save_path = f"./refcoco+_{task}_{split}.yaml"
19 |             print(f"Saving to {save_path}")
20 |             with open(save_path, "w") as f:
21 |                 yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False)
22 | 
23 |     group_dict = {"group": "refcoco+", "task": dump_tasks}
24 | 
25 |     with open("./_refcoco.yaml", "w") as f:
26 |         yaml.dump(group_dict, f, default_flow_style=False, indent=4)
27 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco+/_refcoco.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco+
2 | task:
3 | - refcoco+_seg_val
4 | - refcoco+_seg_testA
5 | - refcoco+_seg_testB
6 | - refcoco+_bbox_val
7 | - refcoco+_bbox_testA
8 | - refcoco+_bbox_testB
9 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_rec_testA.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco+_bbox_rec
2 | task: refcoco+_bbox_rec_testA
3 | include: _default_template_bbox_rec_yaml
4 | test_split: testA
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_rec_testB.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco+_bbox_rec
2 | task: refcoco+_bbox_rec_testB
3 | include: _default_template_bbox_rec_yaml
4 | test_split: testB
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_rec_val.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco+_bbox_rec
2 | task: refcoco+_bbox_rec_val
3 | include: _default_template_bbox_rec_yaml
4 | test_split: val
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_testA.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco+_bbox
2 | task: refcoco+_bbox_testA
3 | include: _default_template_bbox_yaml
4 | test_split: testA
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_testB.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco+_bbox
2 | task: refcoco+_bbox_testB
3 | include: _default_template_bbox_yaml
4 | test_split: testB
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_bbox_val.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco+_bbox
2 | task: refcoco+_bbox_val
3 | include: _default_template_bbox_yaml
4 | test_split: val
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_seg_testA.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco+_seg
2 | task: refcoco+_seg_testA
3 | include: _default_template_seg_yaml
4 | test_split: testA
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_seg_testB.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco+_seg
2 | task: refcoco+_seg_testB
3 | include: _default_template_seg_yaml
4 | test_split: testB
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco+/refcoco+_seg_val.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco+_seg
2 | task: refcoco+_seg_val
3 | include: _default_template_seg_yaml
4 | test_split: val
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/_default_template_bbox_rec_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/RefCOCO
 2 | output_type: generate_until
 3 | process_docs: !function utils_rec.refcoco_bbox_rec_preprocess_dataset
 4 | doc_to_visual: !function utils_rec.refcoco_bbox_rec_doc_to_visual
 5 | doc_to_text: !function utils_rec.refcoco_bbox_rec_doc_to_text
 6 | doc_to_target: "bbox"
 7 | generation_kwargs:
 8 |   until:
 9 |     - "ASSISTANT:"
10 | process_results: !function utils_rec.refcoco_bbox_rec_process_result
11 | metric_list:
12 |   - metric: refcoco_IoU
13 |     aggregation : !function utils_rec.refcoco_bbox_rec_iou
14 |     higher_is_better : true
15 |   - metric: refcoco_ACC@0.1
16 |     aggregation : !function utils_rec.refcoco_bbox_rec_acc01
17 |     higher_is_better : true
18 |   - metric: refcoco_ACC@0.3
19 |     aggregation : !function utils_rec.refcoco_bbox_rec_acc03
20 |     higher_is_better : true
21 |   - metric: refcoco_ACC@0.5
22 |     aggregation : !function utils_rec.refcoco_bbox_rec_acc05
23 |     higher_is_better : true
24 |   - metric: refcoco_ACC@0.7
25 |     aggregation : !function utils_rec.refcoco_bbox_rec_acc07
26 |     higher_is_better : true
27 |   - metric: refcoco_ACC@0.9
28 |     aggregation : !function utils_rec.refcoco_bbox_rec_acc09
29 |     higher_is_better : true
30 |   - metric: refcoco_Center_ACC
31 |     aggregation : !function utils_rec.refcoco_bbox_rec_center_acc
32 |     higher_is_better : true
33 | metadata:
34 |   version: '0.0'


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/_default_template_bbox_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/RefCOCO
 2 | output_type: generate_until
 3 | doc_to_visual: !function utils.refcoco_bbox_doc_to_visual
 4 | doc_to_text: !function utils.refcoco_doc_to_text
 5 | doc_to_target: "answer"
 6 | generation_kwargs:
 7 |   until:
 8 |     - "ASSISTANT:"
 9 | process_results: !function utils.refcoco_process_result
10 | metric_list:
11 |   - metric: refcoco_Bleu_4 
12 |     aggregation : !function utils.refcoco_bleu4
13 |     higher_is_better : true
14 |   - metric: refcoco_Bleu_3
15 |     aggregation : !function utils.refcoco_bleu3
16 |     higher_is_better : true
17 |   - metric: refcoco_Bleu_2
18 |     aggregation : !function utils.refcoco_bleu2
19 |     higher_is_better : true
20 |   - metric: refcoco_Bleu_1
21 |     aggregation : !function utils.refcoco_bleu1
22 |     higher_is_better : true
23 |   - metric: refcoco_METEOR
24 |     aggregation : !function utils.refcoco_meteor
25 |     higher_is_better : true
26 |   - metric: refcoco_ROUGE_L
27 |     aggregation : !function utils.refcoco_rougel
28 |     higher_is_better : true
29 |   - metric: refcoco_CIDEr
30 |     aggregation : !function utils.refcoco_cider
31 |     higher_is_better : true
32 |   #- metric: refcoco_SPICE
33 |   #  aggregation : !function utils.refcoco_spice
34 |   #  higher_is_better : true
35 | metadata:
36 |   version: '0.0'


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/_default_template_seg_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/RefCOCO
 2 | output_type: generate_until
 3 | doc_to_visual: !function utils.refcoco_seg_doc_to_visual
 4 | doc_to_text: !function utils.refcoco_doc_to_text
 5 | doc_to_target: "answer"
 6 | generation_kwargs:
 7 |   until:
 8 |     - "ASSISTANT:"
 9 | process_results: !function utils.refcoco_process_result
10 | metric_list:
11 |   - metric: refcoco_Bleu_4 
12 |     aggregation : !function utils.refcoco_bleu4
13 |     higher_is_better : true
14 |   - metric: refcoco_Bleu_3
15 |     aggregation : !function utils.refcoco_bleu3
16 |     higher_is_better : true
17 |   - metric: refcoco_Bleu_2
18 |     aggregation : !function utils.refcoco_bleu2
19 |     higher_is_better : true
20 |   - metric: refcoco_Bleu_1
21 |     aggregation : !function utils.refcoco_bleu1
22 |     higher_is_better : true
23 |   - metric: refcoco_METEOR
24 |     aggregation : !function utils.refcoco_meteor
25 |     higher_is_better : true
26 |   - metric: refcoco_ROUGE_L
27 |     aggregation : !function utils.refcoco_rougel
28 |     higher_is_better : true
29 |   - metric: refcoco_CIDEr
30 |     aggregation : !function utils.refcoco_cider
31 |     higher_is_better : true
32 |   #- metric: refcoco_SPICE
33 |   #  aggregation : !function utils.refcoco_spice
34 |   #  higher_is_better : true
35 | metadata:
36 |   version: '0.0'


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/_generate_config.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import yaml
 3 | 
 4 | # splits = ["train", "test", "val", "testA", "testB"]
 5 | splits = ["test", "val", "testA", "testB"]
 6 | tasks = ["seg", "bbox"]
 7 | 
 8 | if __name__ == "__main__":
 9 |     dump_tasks = []
10 |     for task in tasks:
11 |         for split in splits:
12 |             yaml_dict = {"group": f"refcoco_{task}", "task": f"refcoco_{task}_{split}", "test_split": split, "include": f"_default_template_{task}_yaml"}
13 |             if split == "train":
14 |                 yaml_dict.pop("group")
15 |             else:
16 |                 dump_tasks.append(f"refcoco_{task}_{split}")
17 | 
18 |             save_path = f"./refcoco_{task}_{split}.yaml"
19 |             print(f"Saving to {save_path}")
20 |             with open(save_path, "w") as f:
21 |                 yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False)
22 | 
23 |     group_dict = {"group": "refcoco", "task": dump_tasks}
24 | 
25 |     with open("./_refcoco.yaml", "w") as f:
26 |         yaml.dump(group_dict, f, default_flow_style=False, indent=4)
27 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/_refcoco.yaml:
--------------------------------------------------------------------------------
 1 | group: refcoco
 2 | task:
 3 | - refcoco_seg_test
 4 | - refcoco_seg_val
 5 | - refcoco_seg_testA
 6 | - refcoco_seg_testB
 7 | - refcoco_bbox_test
 8 | - refcoco_bbox_val
 9 | - refcoco_bbox_testA
10 | - refcoco_bbox_testB
11 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_rec_test.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco_bbox_rec
2 | task: refcoco_bbox_rec_test
3 | test_split: test
4 | include: _default_template_bbox_rec_yaml
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_rec_testA.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco_bbox_rec
2 | task: refcoco_bbox_rec_testA
3 | test_split: testA
4 | include: _default_template_bbox_rec_yaml
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_rec_testB.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco_bbox_rec
2 | task: refcoco_bbox_rec_testB
3 | test_split: testB
4 | include: _default_template_bbox_rec_yaml
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_rec_val.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco_bbox_rec
2 | task: refcoco_bbox_rec_val
3 | test_split: val
4 | include: _default_template_bbox_rec_yaml
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_test.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco_bbox
2 | task: refcoco_bbox_test
3 | test_split: test
4 | include: _default_template_bbox_yaml
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_testA.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco_bbox
2 | task: refcoco_bbox_testA
3 | test_split: testA
4 | include: _default_template_bbox_yaml
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_testB.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco_bbox
2 | task: refcoco_bbox_testB
3 | test_split: testB
4 | include: _default_template_bbox_yaml
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/refcoco_bbox_val.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco_bbox
2 | task: refcoco_bbox_val
3 | test_split: val
4 | include: _default_template_bbox_yaml
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_test.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco_seg
2 | task: refcoco_seg_test
3 | test_split: test
4 | include: _default_template_seg_yaml
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_testA.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco_seg
2 | task: refcoco_seg_testA
3 | test_split: testA
4 | include: _default_template_seg_yaml
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_testB.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco_seg
2 | task: refcoco_seg_testB
3 | test_split: testB
4 | include: _default_template_seg_yaml
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcoco/refcoco_seg_val.yaml:
--------------------------------------------------------------------------------
1 | group: refcoco_seg
2 | task: refcoco_seg_val
3 | test_split: val
4 | include: _default_template_seg_yaml
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcocog/_default_template_bbox_rec_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/RefCOCOg
 2 | output_type: generate_until
 3 | process_docs: !function utils_rec.refcoco_bbox_rec_preprocess_dataset
 4 | doc_to_visual: !function utils_rec.refcoco_bbox_rec_doc_to_visual
 5 | doc_to_text: !function utils_rec.refcoco_bbox_rec_doc_to_text
 6 | doc_to_target: "bbox"
 7 | generation_kwargs:
 8 |   until:
 9 |     - "ASSISTANT:"
10 | process_results: !function utils_rec.refcoco_bbox_rec_process_result
11 | metric_list:
12 |   - metric: refcoco_IoU
13 |     aggregation : !function utils_rec.refcoco_bbox_rec_iou
14 |     higher_is_better : true
15 |   - metric: refcoco_ACC@0.1
16 |     aggregation : !function utils_rec.refcoco_bbox_rec_acc01
17 |     higher_is_better : true
18 |   - metric: refcoco_ACC@0.3
19 |     aggregation : !function utils_rec.refcoco_bbox_rec_acc03
20 |     higher_is_better : true
21 |   - metric: refcoco_ACC@0.5
22 |     aggregation : !function utils_rec.refcoco_bbox_rec_acc05
23 |     higher_is_better : true
24 |   - metric: refcoco_ACC@0.7
25 |     aggregation : !function utils_rec.refcoco_bbox_rec_acc07
26 |     higher_is_better : true
27 |   - metric: refcoco_ACC@0.9
28 |     aggregation : !function utils_rec.refcoco_bbox_rec_acc09
29 |     higher_is_better : true
30 |   - metric: refcoco_Center_ACC
31 |     aggregation : !function utils_rec.refcoco_bbox_rec_center_acc
32 |     higher_is_better : true
33 | metadata:
34 |   version: '0.0'


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcocog/_default_template_bbox_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/RefCOCOg
 2 | output_type: generate_until
 3 | doc_to_visual: !function utils.refcoco_bbox_doc_to_visual
 4 | doc_to_text: !function utils.refcoco_doc_to_text
 5 | doc_to_target: "answer"
 6 | generation_kwargs:
 7 |   until:
 8 |     - "ASSISTANT:"
 9 | process_results: !function utils.refcoco_process_result
10 | metric_list:
11 |   - metric: refcoco_Bleu_4 
12 |     aggregation : !function utils.refcoco_bleu4
13 |     higher_is_better : true
14 |   - metric: refcoco_Bleu_3
15 |     aggregation : !function utils.refcoco_bleu3
16 |     higher_is_better : true
17 |   - metric: refcoco_Bleu_2
18 |     aggregation : !function utils.refcoco_bleu2
19 |     higher_is_better : true
20 |   - metric: refcoco_Bleu_1
21 |     aggregation : !function utils.refcoco_bleu1
22 |     higher_is_better : true
23 |   - metric: refcoco_METEOR
24 |     aggregation : !function utils.refcoco_meteor
25 |     higher_is_better : true
26 |   - metric: refcoco_ROUGE_L
27 |     aggregation : !function utils.refcoco_rougel
28 |     higher_is_better : true
29 |   - metric: refcoco_CIDEr
30 |     aggregation : !function utils.refcoco_cider
31 |     higher_is_better : true
32 |   #- metric: refcoco_SPICE
33 |   #  aggregation : !function utils.refcoco_spice
34 |   #  higher_is_better : true
35 | metadata:
36 |   version: '0.0'


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcocog/_default_template_seg_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/RefCOCOg
 2 | output_type: generate_until
 3 | doc_to_visual: !function utils.refcoco_seg_doc_to_visual
 4 | doc_to_text: !function utils.refcoco_doc_to_text
 5 | doc_to_target: "answer"
 6 | generation_kwargs:
 7 |   until:
 8 |     - "ASSISTANT:"
 9 | process_results: !function utils.refcoco_process_result
10 | metric_list:
11 |   - metric: refcoco_Bleu_4 
12 |     aggregation : !function utils.refcoco_bleu4
13 |     higher_is_better : true
14 |   - metric: refcoco_Bleu_3
15 |     aggregation : !function utils.refcoco_bleu3
16 |     higher_is_better : true
17 |   - metric: refcoco_Bleu_2
18 |     aggregation : !function utils.refcoco_bleu2
19 |     higher_is_better : true
20 |   - metric: refcoco_Bleu_1
21 |     aggregation : !function utils.refcoco_bleu1
22 |     higher_is_better : true
23 |   - metric: refcoco_METEOR
24 |     aggregation : !function utils.refcoco_meteor
25 |     higher_is_better : true
26 |   - metric: refcoco_ROUGE_L
27 |     aggregation : !function utils.refcoco_rougel
28 |     higher_is_better : true
29 |   - metric: refcoco_CIDEr
30 |     aggregation : !function utils.refcoco_cider
31 |     higher_is_better : true
32 |   #- metric: refcoco_SPICE
33 |   #  aggregation : !function utils.refcoco_spice
34 |   #  higher_is_better : true
35 | metadata:
36 |   version: '0.0'


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcocog/_generate_config.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import yaml
 3 | 
 4 | # splits = ["train", "test", "val"]
 5 | splits = ["test", "val"]
 6 | tasks = ["seg", "bbox"]
 7 | 
 8 | if __name__ == "__main__":
 9 |     dump_tasks = []
10 |     for task in tasks:
11 |         for split in splits:
12 |             yaml_dict = {"group": f"refcocog_{task}", "task": f"refcocog_{task}_{split}", "include": f"_default_template_{task}_yaml", "test_split": split}
13 |             if split == "train":
14 |                 yaml_dict.pop("group")
15 |             else:
16 |                 dump_tasks.append(f"refcoco_{task}_{split}")
17 | 
18 |             save_path = f"./refcocog_{task}_{split}.yaml"
19 |             print(f"Saving to {save_path}")
20 |             with open(save_path, "w") as f:
21 |                 yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False)
22 | 
23 |     group_dict = {"group": "refcocog", "task": dump_tasks}
24 | 
25 |     with open("./_refcoco.yaml", "w") as f:
26 |         yaml.dump(group_dict, f, default_flow_style=False, indent=4)
27 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcocog/_refcoco.yaml:
--------------------------------------------------------------------------------
1 | group: refcocog
2 | task:
3 | - refcocog_seg_test
4 | - refcocog_seg_val
5 | - refcocog_bbox_test
6 | - refcocog_bbox_val
7 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcocog/refcocog_bbox_rec_test.yaml:
--------------------------------------------------------------------------------
1 | group: refcocog_bbox_rec
2 | task: refcocog_bbox_rec_test
3 | include: _default_template_bbox_rec_yaml
4 | test_split: test
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcocog/refcocog_bbox_rec_val.yaml:
--------------------------------------------------------------------------------
1 | group: refcocog_bbox_rec
2 | task: refcocog_bbox_rec_val
3 | include: _default_template_bbox_rec_yaml
4 | test_split: val
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcocog/refcocog_bbox_test.yaml:
--------------------------------------------------------------------------------
1 | group: refcocog_bbox
2 | task: refcocog_bbox_test
3 | include: _default_template_bbox_yaml
4 | test_split: test
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcocog/refcocog_bbox_val.yaml:
--------------------------------------------------------------------------------
1 | group: refcocog_bbox
2 | task: refcocog_bbox_val
3 | include: _default_template_bbox_yaml
4 | test_split: val
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcocog/refcocog_seg_test.yaml:
--------------------------------------------------------------------------------
1 | group: refcocog_seg
2 | task: refcocog_seg_test
3 | include: _default_template_seg_yaml
4 | test_split: test
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/refcocog/refcocog_seg_val.yaml:
--------------------------------------------------------------------------------
1 | group: refcocog_seg
2 | task: refcocog_seg_val
3 | include: _default_template_seg_yaml
4 | test_split: val
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/scienceqa/scienceqa.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/ScienceQA
 2 | dataset_name: ScienceQA-FULL
 3 | task: "scienceqa"
 4 | dataset_kwargs:
 5 |   token: True
 6 | test_split: test
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.sqa_doc_to_visual
 9 | doc_to_text: !function utils.sqa_doc_to_text
10 | doc_to_target: !function utils.sqa_doc_to_target
11 | generation_kwargs:
12 |   max_new_tokens: 16
13 |   temperature: 0
14 |   do_sample: False
15 | metric_list:
16 |   - metric: exact_match
17 |     aggregation: mean
18 |     higher_is_better: true
19 |     ignore_case: true
20 |     ignore_punctuation: true
21 | process_results: !function utils.sqa_process_results
22 | metadata:
23 |   - version: 0.0
24 | 
25 | model_specific_prompt_kwargs:
26 |   default:
27 |     format: default
28 |     pre_prompt: ""
29 |     post_prompt: "\nAnswer with the option's letter from the given choices directly."
30 |   qwen_vl:
31 |     format: qwen_vl
32 |   
33 | model_specific_generation_kwargs:
34 |   llava:
35 |     image_aspect_ratio: original
36 |   
37 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/scienceqa/scienceqa_full.yaml:
--------------------------------------------------------------------------------
1 | group: scienceqa_full
2 | task:
3 |   - scienceqa
4 |   - scienceqa_img


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/scienceqa/scienceqa_img.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/ScienceQA
 2 | dataset_name: ScienceQA-IMG
 3 | task: "scienceqa_img"
 4 | dataset_kwargs:
 5 |   token: True
 6 | test_split: test
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.sqa_doc_to_visual
 9 | doc_to_text: !function utils.sqa_doc_to_text
10 | doc_to_target: !function utils.sqa_doc_to_target
11 | generation_kwargs:
12 |   max_new_tokens: 16
13 |   temperature: 0
14 |   do_sample: False
15 | metric_list:
16 |   - metric: exact_match
17 |     aggregation: mean
18 |     higher_is_better: true
19 |     ignore_case: true
20 |     ignore_punctuation: true
21 | process_results: !function utils.sqa_process_results
22 | metadata:
23 |   - version: 0.0
24 | 
25 | model_specific_prompt_kwargs:
26 |   default:
27 |     format: default
28 |     pre_prompt: ""
29 |     post_prompt: "\nAnswer with the option's letter from the given choices directly."
30 |   qwen_vl:
31 |     format: qwen_vl
32 | model_specific_generation_kwargs:
33 |   llava:
34 |     image_aspect_ratio: original
35 |   
36 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/scienceqa/utils.py:
--------------------------------------------------------------------------------
 1 | def sqa_doc_to_text(doc, model_specific_prompt_kwargs=None):
 2 |     context, question, choices = doc["hint"], doc["question"], doc["choices"]
 3 |     len_choices = len(choices)
 4 |     options = [chr(ord("A") + i) for i in range(len_choices)]
 5 |     choices_str = "\n".join([f"{option}. {choice}" for option, choice in zip(options, choices)])
 6 |     if model_specific_prompt_kwargs["format"] == "default":
 7 |         if context:
 8 |             context = f"Context: {context}\n"
 9 | 
10 |         post_prompt = model_specific_prompt_kwargs["post_prompt"]
11 |         pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
12 |         return f"{pre_prompt}{context}{question}\n{choices_str}{post_prompt}"
13 |     elif model_specific_prompt_kwargs["format"] == "qwen_vl":
14 |         prompt = "Context: {}\nQuestion: {}\nOptions: {}\nAnswer:"
15 |         context = context if context else "N/A"
16 |         prompt = prompt.format(context, question, choices_str)
17 |         return prompt
18 |     else:
19 |         raise ValueError(f"Unknown prompt format: {model_specific_prompt_kwargs}")
20 | 
21 | 
22 | def sqa_doc_to_visual(doc):
23 |     if doc["image"] is None:
24 |         return []
25 |     return [doc["image"].convert("RGB")]
26 | 
27 | 
28 | def sqa_doc_to_target(doc):
29 |     len_choices = len(doc["choices"])
30 |     options = [chr(ord("A") + i) for i in range(len_choices)]
31 |     return options[doc["answer"]]
32 | 
33 | 
34 | def sqa_process_results(doc, results):
35 |     # I know this is weird, but it's how llava parse it.
36 |     target = sqa_doc_to_target(doc)
37 |     pred = results[0]
38 |     if pred == target:
39 |         return {"exact_match": 1.0}
40 |     # pattern: ^[A-Z]\. .*
41 |     if len(pred) >= 2 and pred[0].isupper() and pred[1] == ".":
42 |         result = 1.0 if pred[0] == target else 0.0
43 |         return {"exact_match": result}
44 |     return {"exact_match": 0.0}
45 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/screenspot/_default_template_rec_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: rootsautomation/ScreenSpot
 2 | output_type: generate_until
 3 | doc_to_visual: !function utils_rec.screenspot_rec_doc_to_visual
 4 | doc_to_text: !function utils_rec.screenspot_rec_doc_to_text
 5 | doc_to_target: "bbox"
 6 | generation_kwargs:
 7 |   until:
 8 |     - "ASSISTANT:"
 9 | process_results: !function utils_rec.screenspot_rec_process_result
10 | metric_list:
11 |   - metric: screenspot_IoU
12 |     aggregation : !function utils_rec.screenspot_rec_iou
13 |     higher_is_better : true
14 |   - metric: screenspot_ACC@0.1
15 |     aggregation : !function utils_rec.screenspot_rec_acc01
16 |     higher_is_better : true
17 |   - metric: screenspot_ACC@0.3
18 |     aggregation : !function utils_rec.screenspot_rec_acc03
19 |     higher_is_better : true
20 |   - metric: screenspot_ACC@0.5
21 |     aggregation : !function utils_rec.screenspot_rec_acc05
22 |     higher_is_better : true
23 |   - metric: screenspot_ACC@0.7
24 |     aggregation : !function utils_rec.screenspot_rec_acc07
25 |     higher_is_better : true
26 |   - metric: screenspot_ACC@0.9
27 |     aggregation : !function utils_rec.screenspot_rec_acc09
28 |     higher_is_better : true
29 |   - metric: screenspot_Center_ACC
30 |     aggregation : !function utils_rec.screenspot_rec_center_acc
31 |     higher_is_better : true
32 | metadata:
33 |   version: '0.0'


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/screenspot/_default_template_reg_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: rootsautomation/ScreenSpot
 2 | output_type: generate_until
 3 | doc_to_visual: !function utils.screenspot_bbox_doc_to_visual
 4 | doc_to_text: !function utils.screenspot_doc_to_text
 5 | doc_to_target: "instruction"
 6 | generation_kwargs:
 7 |   until:
 8 |     - "ASSISTANT:"
 9 | process_results: !function utils.screenspot_process_result
10 | metric_list:
11 |   - metric: screenspot_CIDEr
12 |     aggregation : !function utils.screenspot_cider
13 |     higher_is_better : true
14 | metadata:
15 |   version: '0.0'


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/screenspot/_screenspot.yaml:
--------------------------------------------------------------------------------
1 | group: screenspot
2 | task:
3 | - screenspot_reg_test
4 | - screenspot_rec_test


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/screenspot/screenspot_rec_test.yaml:
--------------------------------------------------------------------------------
1 | group: screenspot_rec
2 | task: screenspot_rec_test
3 | include: _default_template_rec_yaml
4 | test_split: test
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/screenspot/screenspot_reg_test.yaml:
--------------------------------------------------------------------------------
1 | group: screenspot_reg
2 | task: screenspot_reg_test
3 | include: _default_template_reg_yaml
4 | test_split: test
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/seedbench/seedbench.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/SEED-Bench
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "seedbench"
 5 | test_split: test
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.seed_doc_to_visual
 8 | doc_to_text: !function utils.seed_doc_to_text
 9 | doc_to_target: "answer"
10 | generation_kwargs:
11 |   until:
12 |     - "ASSISTANT:"
13 |   image_aspect_ratio: original
14 | # The return value of process_results will be used by metrics
15 | process_results: !function utils.seed_process_result
16 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
17 | metric_list:
18 |   - metric: seed_image
19 |     aggregation: !function utils.seed_aggregation_result
20 |     higher_is_better: true
21 |   - metric: seed_video
22 |     aggregation: !function utils.seed_aggregation_result
23 |     higher_is_better: true
24 |   - metric: seed_all
25 |     aggregation: !function utils.seed_aggregation_result
26 |     higher_is_better: true
27 | metadata:
28 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/seedbench/seedbench_ppl.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/SEED-Bench
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "seedbench_ppl"
 5 | test_split: test
 6 | output_type: multiple_choice
 7 | doc_to_visual: !function utils.seed_doc_to_visual
 8 | doc_to_text: !function utils.seed_doc_to_text_mc
 9 | doc_to_choice : !function utils.seed_doc_to_choice
10 | doc_to_target: !function utils.seed_doc_to_mc_target
11 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
12 | metric_list:
13 |   - metric: acc
14 | metadata:
15 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/seedbench/utils.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | 
 4 | def seed_doc_to_visual(doc):
 5 |     return [image.convert("RGB") for image in doc["image"]]
 6 | 
 7 | 
 8 | def seed_doc_to_text(doc):
 9 |     question = doc["question"]
10 |     question += "\n" + f"A. {doc['choice_a']}\n"
11 |     question += f"B. {doc['choice_b']}\n"
12 |     question += f"C. {doc['choice_c']}\n"
13 |     question += f"D. {doc['choice_d']}"
14 |     return f"{question}\nAnswer with the option's letter from the given choices directly."
15 | 
16 | 
17 | def seed_process_result(doc, result):
18 |     pred = result[0].strip()
19 |     if len(pred) > 1:
20 |         pred = pred[0]
21 |     answer = doc["answer"]
22 |     data_type = doc["data_type"]
23 | 
24 |     return {f"seed_{data_type}": {"pred": pred, "answer": answer, "question_id": doc["question_id"]}, f"seed_all": {"pred": pred, "answer": answer, "question_id": doc["question_id"]}}
25 | 
26 | 
27 | def seed_aggregation_result(results):
28 |     total_count = 0
29 |     total_correct = 0
30 |     for result in results:
31 |         if result["pred"] == result["answer"]:
32 |             total_correct += 1
33 |         total_count += 1
34 |     return total_correct / total_count
35 | 
36 | 
37 | def seed_aggregation_result_all(results):
38 |     score = seed_aggregation_result(results)
39 |     stored_results = []
40 |     for result in results:
41 |         stored_results.append({"question_id": result["question_id"], "prediction": result["pred"]})
42 |     with open("./seed_submission.json", "w") as f:
43 |         json.dump(stored_results, f, indent=4)
44 |     print("Storing files for seed_submission ...")
45 | 
46 |     return score
47 | 
48 | 
49 | def seed_doc_to_text_mc(doc):
50 |     question = doc["question"]
51 |     return f"{question} Answer :"
52 | 
53 | 
54 | def seed_doc_to_choice(doc):
55 |     return [doc["choice_a"], doc["choice_b"], doc["choice_c"], doc["choice_d"]]
56 | 
57 | 
58 | def seed_doc_to_mc_target(doc):
59 |     answer2choice = {"A": "choice_a", "B": "choice_b", "C": "choice_c", "D": "choice_d"}
60 |     return doc[answer2choice[doc["answer"]]]
61 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/seedbench_2/seedbench_2.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/SEED-Bench-2
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "seedbench-2"
 5 | test_split: test
 6 | output_type: generate_until
 7 | doc_to_visual: !function utils.seed_doc_to_visual
 8 | doc_to_text: !function utils.seed_doc_to_text
 9 | doc_to_target: "answer"
10 | generation_kwargs:
11 |   until:
12 |     - "ASSISTANT:"
13 |   max_new_tokens: 16
14 |   image_aspect_ratio: original
15 | # The return value of process_results will be used by metrics
16 | process_results: !function utils.seed_process_result
17 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
18 | metric_list:
19 |   - metric: seed_Video
20 |     aggregation: !function utils.seed_aggregation_result
21 |     higher_is_better: true
22 |   - metric: seed_Multiple_Images
23 |     aggregation: !function utils.seed_aggregation_result
24 |     higher_is_better: true
25 |   - metric: seed_Image_&_Text_Generation
26 |     aggregation: !function utils.seed_aggregation_result
27 |     higher_is_better: true
28 |   - metric: seed_Single_Image
29 |     aggregation: !function utils.seed_aggregation_result
30 |     higher_is_better: true
31 |   - metric: seed_Image_Generation
32 |     aggregation: !function utils.seed_aggregation_result
33 |     higher_is_better: true
34 |   - metric: seed_Interleaved_Image
35 |     aggregation: !function utils.seed_aggregation_result
36 |     higher_is_better: true
37 |   - metric: seed_all
38 |     aggregation: !function utils.seed_aggregation_result
39 |     higher_is_better: true
40 | metadata:
41 |   - version: 0.0
42 | 
43 | model_specific_prompt_kwargs:
44 |   llava :
45 |     img_token : <image>
46 |     post_prompt : "Answer with the option's letter from the given choices directly."
47 |   gpt4V :
48 |     img_token : <image>
49 |     post_prompt : "Answer with the option's letter from the given choices directly."


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/seedbench_2/utils.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | 
 4 | def seed_doc_to_visual(doc):
 5 |     return [image.convert("RGB") for image in doc["image"]]
 6 | 
 7 | 
 8 | def parse_choice_img(choice: str, img_token: str):
 9 |     if "jpg" in choice or "png" in choice:
10 |         return img_token
11 |     return choice
12 | 
13 | 
14 | def seed_doc_to_text(doc, model_specific_kwargs=None):
15 |     question = doc["question"]
16 |     question.replace("<img>", model_specific_kwargs["img_token"])
17 |     question += "\n" + f"A. {parse_choice_img(doc['choice_a'], model_specific_kwargs['img_token'])}\n"
18 |     question += f"B. {parse_choice_img(doc['choice_b'], model_specific_kwargs['img_token'])}\n"
19 |     question += f"C. {parse_choice_img(doc['choice_c'], model_specific_kwargs['img_token'])}\n"
20 |     question += f"D. {parse_choice_img(doc['choice_d'], model_specific_kwargs['img_token'])}"
21 |     if doc["data_type"] == "Image Generation":
22 |         num_img_in_question = len(doc["data_id"]) - 4
23 |         prepend_tokens = [model_specific_kwargs["img_token"]] * num_img_in_question
24 |         question = " ".join(prepend_tokens) + "\n" + question
25 |     return f"{question}\n{model_specific_kwargs['post_prompt']}"
26 | 
27 | 
28 | def seed_process_result(doc, result):
29 |     pred = result[0].strip()
30 |     if len(pred) > 1:
31 |         pred = pred[0]
32 |     answer = doc["answer"]
33 |     data_type = doc["data_type"].split(" ")
34 |     data_type = "_".join(data_type)
35 | 
36 |     return {f"seed_{data_type}": {"pred": pred, "answer": answer, "question_id": doc["question_id"]}, f"seed_all": {"pred": pred, "answer": answer, "question_id": doc["question_id"]}}
37 | 
38 | 
39 | def seed_aggregation_result(results):
40 |     total_count = 0
41 |     total_correct = 0
42 |     for result in results:
43 |         if result["pred"] == result["answer"]:
44 |             total_correct += 1
45 |         total_count += 1
46 |     return total_correct / total_count if total_count != 0 else 0
47 | 
48 | 
49 | def seed_aggregation_result_all(results):
50 |     score = seed_aggregation_result(results)
51 |     stored_results = []
52 |     for result in results:
53 |         stored_results.append({"question_id": result["question_id"], "prediction": result["pred"]})
54 |     with open("./seed_submission.json", "w") as f:
55 |         json.dump(stored_results, f, indent=4)
56 |     print("Storing files for seed_submission ...")
57 | 
58 |     return score
59 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/stvqa/stvqa.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/ST-VQA
 2 | task: "stvqa"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.stvqa_doc_to_visual
 6 | doc_to_text: !function utils.stvqa_doc_to_text
 7 | doc_to_target: "answers"
 8 | generation_kwargs:
 9 |   max_new_tokens: 32
10 |   temperature: 0
11 |   do_sample: False
12 | process_results: !function utils.stvqa_process_results
13 | metric_list:
14 |   - metric: submission
15 |     aggregation: !function utils.stvqa_aggregate_submissions
16 | model_specific_prompt_kwargs:
17 |   default:
18 |     pre_prompt: ""
19 |     post_prompt: "\nAnswer the question using a single word or phrase."
20 |   


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/stvqa/utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json
 3 | import logging
 4 | 
 5 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
 6 | 
 7 | 
 8 | def stvqa_doc_to_text(doc, model_specific_prompt_kwargs):
 9 |     question = doc["question"]
10 |     pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
11 |     post_prompt = model_specific_prompt_kwargs["post_prompt"]
12 |     return f"{pre_prompt}{question}{post_prompt}"
13 | 
14 | 
15 | def stvqa_doc_to_visual(doc):
16 |     return [doc["image"].convert("RGB")]
17 | 
18 | 
19 | def stvqa_process_results(doc, results):
20 |     answer = results[0]
21 |     return {"submission": {"question_id": int(doc["question_id"]), "answer": answer}}
22 | 
23 | 
24 | def stvqa_aggregate_submissions(results, args):
25 |     file = generate_submission_file("stvqa_test_for_submission.json", args)
26 |     with open(file, "w") as f:
27 |         json.dump(results, f)
28 |     logging.getLogger("lmms-eval").info(f"Results saved to {file}")
29 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/textcaps/_default_template_textcaps_yaml:
--------------------------------------------------------------------------------
1 | model_specific_prompt_kwargs:
2 |   default:
3 |     prompt: Provide a one-sentence caption for the provided image.


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/textcaps/textcaps.yaml:
--------------------------------------------------------------------------------
1 | group : textcaps
2 | task:
3 |   - textcaps_val
4 |   - textcaps_test


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/textcaps/textcaps_test.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/TextCaps
 2 | dataset_kwargs:
 3 |   token: True
 4 | task : "textcaps_test"
 5 | group : "textcaps_caption"
 6 | test_split: test
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.textcaps_doc_to_visual
 9 | doc_to_text: !function utils.textcaps_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   max_new_tokens: 64
13 |   temperature: 0
14 |   top_p: 0
15 |   num_beams: 1
16 |   do_sample: false
17 | process_results: !function utils.textcaps_test_process_result
18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
19 | metric_list:
20 |   - metric: textcaps_passthrough 
21 |     aggregation : !function utils.textcaps_test_aggregation_result
22 |     higher_is_better : true
23 | metadata:
24 |   - version: 0.0
25 | include: _default_template_textcaps_yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/textcaps/textcaps_train.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/TextCaps
 2 | dataset_kwargs:
 3 |   token: True
 4 | task : "textcaps_train"
 5 | group : "textcaps_caption"
 6 | test_split: train
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.textcaps_doc_to_visual
 9 | doc_to_text: !function utils.textcaps_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   until:
13 |     - "ASSISTANT:"
14 |   max_new_tokens: 1024
15 |   temperature: 0
16 |   top_p: 0
17 |   num_beams: 1
18 |   do_sample: false
19 | process_results: !function utils.textcaps_process_result
20 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
21 | metric_list:
22 |   - metric: textcaps_Bleu_4 
23 |     aggregation : !function utils.textcaps_bleu4
24 |     higher_is_better : true
25 |   - metric: textcaps_Bleu_3
26 |     aggregation : !function utils.textcaps_bleu3
27 |     higher_is_better : true
28 |   - metric: textcaps_Bleu_2
29 |     aggregation : !function utils.textcaps_bleu2
30 |     higher_is_better : true
31 |   - metric: textcaps_Bleu_1
32 |     aggregation : !function utils.textcaps_bleu1
33 |     higher_is_better : true
34 |   - metric: textcaps_METEOR
35 |     aggregation : !function utils.textcaps_meteor
36 |     higher_is_better : true
37 |   - metric: textcaps_ROUGE_L
38 |     aggregation : !function utils.textcaps_rougel
39 |     higher_is_better : true
40 |   - metric: textcaps_CIDEr
41 |     aggregation : !function utils.textcaps_cider
42 |     higher_is_better : true
43 |   #- metric: textcaps_SPICE
44 |   #  aggregation : !function utils.textcaps_spice
45 |   #  higher_is_better : true
46 | metadata:
47 |   - version: 0.0
48 | include: _default_template_textcaps_yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/textcaps/textcaps_val.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/TextCaps
 2 | dataset_kwargs:
 3 |   token: True
 4 | task: "textcaps_val"
 5 | group : "textcaps_caption"
 6 | test_split: val
 7 | output_type: generate_until
 8 | doc_to_visual: !function utils.textcaps_doc_to_visual
 9 | doc_to_text: !function utils.textcaps_doc_to_text
10 | doc_to_target: "answer"
11 | generation_kwargs:
12 |   max_new_tokens: 64
13 |   temperature: 0
14 |   top_p: 0
15 |   num_beams: 1
16 |   do_sample: false
17 | process_results: !function utils.textcaps_process_result
18 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
19 | metric_list:
20 |   - metric: textcaps_Bleu_4 
21 |     aggregation : !function utils.textcaps_bleu4
22 |     higher_is_better : true
23 |   - metric: textcaps_Bleu_3
24 |     aggregation : !function utils.textcaps_bleu3
25 |     higher_is_better : true
26 |   - metric: textcaps_Bleu_2
27 |     aggregation : !function utils.textcaps_bleu2
28 |     higher_is_better : true
29 |   - metric: textcaps_Bleu_1
30 |     aggregation : !function utils.textcaps_bleu1
31 |     higher_is_better : true
32 |   - metric: textcaps_METEOR
33 |     aggregation : !function utils.textcaps_meteor
34 |     higher_is_better : true
35 |   - metric: textcaps_ROUGE_L
36 |     aggregation : !function utils.textcaps_rougel
37 |     higher_is_better : true
38 |   - metric: textcaps_CIDEr
39 |     aggregation : !function utils.textcaps_cider
40 |     higher_is_better : true
41 |   #- metric: textcaps_SPICE
42 |   #  aggregation : !function utils.textcaps_spice
43 |   #  higher_is_better : true
44 | metadata:
45 |   - version: 0.0
46 | include: _default_template_textcaps_yaml


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/textvqa/_default_template_textvqa_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/textvqa
 2 | output_type: generate_until
 3 | doc_to_visual: !function utils.textvqa_doc_to_visual
 4 | doc_to_text: !function utils.textvqa_doc_to_text
 5 | doc_to_target: "answer"
 6 | generation_kwargs:
 7 |   until:
 8 |     - "ASSISTANT:"
 9 | process_results: !function utils.textvqa_process_results
10 | model_specific_prompt_kwargs:
11 |   default:
12 |     pre_prompt: ""
13 |     post_prompt: "\nAnswer the question using a single word or phrase."
14 |     ocr: true
15 |   qwen_vl:
16 |     pre_prompt: ""
17 |     post_prompt: " Answer:"
18 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/textvqa/_textvqa.yaml:
--------------------------------------------------------------------------------
1 | group: textvqa
2 | task:
3 | - textvqa_val
4 | - textvqa_test
5 | - textvqa_val_noocr


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/textvqa/textvqa_test.yaml:
--------------------------------------------------------------------------------
1 | task: textvqa_test
2 | test_split: test
3 | metric_list:
4 |   - metric: submission
5 |     aggregation: !function utils.textvqa_aggreate_submissions
6 |     higher_is_better: true
7 | include: _default_template_textvqa_yaml
8 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/textvqa/textvqa_val.yaml:
--------------------------------------------------------------------------------
 1 | task: textvqa_val
 2 | test_split: validation
 3 | metric_list:
 4 |   - metric: exact_match
 5 |     aggregation: mean
 6 |     higher_is_better: true
 7 |     ignore_case: true
 8 |     ignore_punctuation: true
 9 |   - metric: submission
10 |     aggregation: !function utils.textvqa_aggreate_submissions
11 |     higher_is_better: true
12 | include: _default_template_textvqa_yaml
13 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/textvqa/textvqa_val_noocr.yaml:
--------------------------------------------------------------------------------
 1 | task: textvqa_val_noocr
 2 | test_split: validation
 3 | metric_list:
 4 |   - metric: exact_match
 5 |     aggregation: mean
 6 |     higher_is_better: true
 7 |     ignore_case: true
 8 |     ignore_punctuation: true
 9 |   - metric: submission
10 |     aggregation: !function utils.textvqa_aggreate_submissions
11 |     higher_is_better: true
12 | include: _default_template_textvqa_yaml
13 | model_specific_prompt_kwargs:
14 |   default:
15 |     pre_prompt: ""
16 |     post_prompt: "\nAnswer the question using a single word or phrase."
17 |     ocr: false
18 |   qwen_vl:
19 |     pre_prompt: ""
20 |     post_prompt: " Answer:"
21 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vizwiz_vqa/_default_template_vqa_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/VizWiz-VQA
 2 | output_type: generate_until
 3 | doc_to_visual: !function utils.vizwiz_vqa_doc_to_visual
 4 | doc_to_text: !function utils.vizwiz_vqa_doc_to_text
 5 | doc_to_target: "answer"
 6 | generation_kwargs:
 7 |   until:
 8 |     - "ASSISTANT:"
 9 | metadata:
10 |   - version: 0.0
11 | model_specific_prompt_kwargs:
12 |   default:
13 |     pre_prompt: ""
14 |     post_prompt: "\nWhen the provided information is insufficient, respond with 'Unanswerable'.\nAnswer the question using a single word or phrase."
15 | process_results: !function utils.vizwiz_vqa_process_results
16 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vizwiz_vqa/_generate_config.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import yaml
 3 | 
 4 | splits = ["val", "test"]
 5 | tasks = ["vqa"]
 6 | 
 7 | if __name__ == "__main__":
 8 |     dump_tasks = []
 9 |     for task in tasks:
10 |         for split in splits:
11 |             yaml_dict = {"group": f"vizwiz_{task}", "task": f"vizwiz_{task}_{split}", "include": f"_default_template_{task}_yaml", "test_split": split}
12 |             if split == "train":
13 |                 yaml_dict.pop("group")
14 |             else:
15 |                 dump_tasks.append(f"vizwiz_{task}_{split}")
16 | 
17 |             save_path = f"./vizwiz_{task}_{split}.yaml"
18 |             print(f"Saving to {save_path}")
19 |             with open(save_path, "w") as f:
20 |                 yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False)
21 | 
22 |     group_dict = {"group": "vizwiz_vqa", "task": dump_tasks}
23 | 
24 |     with open("./_vizwiz_vqa.yaml", "w") as f:
25 |         yaml.dump(group_dict, f, default_flow_style=False, indent=4)
26 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vizwiz_vqa/_vizwiz_vqa.yaml:
--------------------------------------------------------------------------------
1 | group: vizwiz_vqa
2 | task:
3 | - vizwiz_vqa_val
4 | - vizwiz_vqa_test


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vizwiz_vqa/utils.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import os
 3 | import json
 4 | import yaml
 5 | import pathlib
 6 | import logging
 7 | import datetime
 8 | import statistics
 9 | 
10 | from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
11 | from lmms_eval.tasks._task_utils.vqa_eval_metric import EvalAIAnswerProcessor
12 | 
13 | eval_logger = logging.getLogger("lmms-eval")
14 | 
15 | 
16 | def vizwiz_vqa_doc_to_visual(doc):
17 |     return [doc["image"].convert("RGB")]
18 | 
19 | 
20 | def vizwiz_vqa_process_results(doc, result):
21 |     eval_ai_processor = EvalAIAnswerProcessor()
22 |     assert len(result) == 1, f"The result should be a list of length 1, but got {len(result)}."
23 |     resAns = eval_ai_processor(result[0])
24 |     accuracy = 0
25 | 
26 |     if "answers" in doc and doc["answers"] is not None:
27 |         gtAcc = []
28 | 
29 |         for i in range(len(doc["answers"])):
30 |             doc["answers"][i] = eval_ai_processor(doc["answers"][i])
31 | 
32 |         for i in range(len(doc["answers"])):
33 |             otherGTAns = [doc["answers"][j] for j in range(len(doc["answers"])) if i != j]
34 |             matchingAns = [item for item in otherGTAns if item == resAns]
35 |             acc = min(1, float(len(matchingAns)) / 3)
36 |             gtAcc.append(acc)
37 |         if gtAcc:
38 |             accuracy = statistics.mean(gtAcc)
39 |         else:
40 |             accuracy = 0
41 | 
42 |     return {
43 |         "exact_match": accuracy,
44 |         "submission": {
45 |             "image": f"{doc['question_id']}.jpg",
46 |             "answer": resAns,
47 |         },
48 |     }
49 | 
50 | 
51 | def vizwiz_vqa_doc_to_text(doc, model_specific_prompt_kwargs=None):
52 |     if model_specific_prompt_kwargs is None:
53 |         model_specific_prompt_kwargs = {}
54 |     pre_prompt = ""
55 |     post_prompt = ""
56 |     if "pre_prompt" in model_specific_prompt_kwargs:
57 |         pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
58 |     if "post_prompt" in model_specific_prompt_kwargs:
59 |         post_prompt = model_specific_prompt_kwargs["post_prompt"]
60 |     text = f"{pre_prompt}{doc['question'].capitalize()}{post_prompt}"
61 |     return text
62 | 
63 | 
64 | def vizwiz_vqa_aggreate_submissions(results, args):
65 |     now_date_time = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S")
66 |     submission_file_name = f"vizwiz_vqa-test-submission-{now_date_time}.json"
67 |     path = generate_submission_file(submission_file_name, args)
68 |     with open(path, "w") as f:
69 |         json.dump(results, f)
70 |     print(f"Submission file saved to {path}")
71 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_test.yaml:
--------------------------------------------------------------------------------
 1 | group: vizwiz_vqa
 2 | task: vizwiz_vqa_test
 3 | test_split: test
 4 | include: _default_template_vqa_yaml
 5 | process_results: !function utils.vizwiz_vqa_process_results
 6 | metric_list:
 7 |   # - metric: exact_match
 8 |   #   aggregation: mean
 9 |   #   higher_is_better: true
10 |   #   ignore_case: true
11 |   #   ignore_punctuation: true
12 |   - metric: submission
13 |     aggregation: !function utils.vizwiz_vqa_aggreate_submissions
14 |     higher_is_better: true
15 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_val.yaml:
--------------------------------------------------------------------------------
 1 | group: vizwiz_vqa
 2 | task: vizwiz_vqa_val
 3 | test_split: val
 4 | include: _default_template_vqa_yaml
 5 | metric_list:
 6 |   - metric: exact_match
 7 |     aggregation: mean
 8 |     higher_is_better: true
 9 |     ignore_case: true
10 |     ignore_punctuation: true
11 |   # - metric: submission
12 |   #   aggregation: !function utils.vizwiz_vqa_aggreate_submissions
13 |   #   higher_is_better: true


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vqav2/_default_template_vqav2_yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: lmms-lab/VQAv2
 2 | dataset_kwargs:
 3 |   token: True
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.vqav2_doc_to_visual
 6 | doc_to_text: !function utils.vqav2_doc_to_text
 7 | doc_to_target: "answer"
 8 | generation_kwargs:
 9 |   max_new_tokens: 16
10 | metadata:
11 |   - version: 0.0
12 | model_specific_prompt_kwargs:
13 |   default:
14 |     pre_prompt: ""
15 |     post_prompt: "\nAnswer the question using a single word or phrase."


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vqav2/_vqav2.yaml:
--------------------------------------------------------------------------------
1 | group: vqav2
2 | task:
3 | - vqav2_val
4 | - vqav2_test


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vqav2/vqav2_test.yaml:
--------------------------------------------------------------------------------
1 | task: "vqav2_test"
2 | include: _default_template_vqav2_yaml
3 | test_split: test
4 | metric_list:
5 |   - metric: submission
6 |     aggregation: !function utils.vqav2_aggreate_submissions
7 |     higher_is_better: true
8 | process_results: !function utils.vqav2_process_results_test
9 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/vqav2/vqav2_val.yaml:
--------------------------------------------------------------------------------
 1 | task: "vqav2_val"
 2 | include: _default_template_vqav2_yaml
 3 | test_split: validation
 4 | metric_list:
 5 |   - metric: exact_match
 6 |     aggregation: mean
 7 |     higher_is_better: true
 8 |     ignore_case: true
 9 |     ignore_punctuation: true
10 | process_results: !function utils.vqav2_process_results_val
11 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/websrc/README.md:
--------------------------------------------------------------------------------
 1 | # WebSRC
 2 | 
 3 | ## Paper 
 4 | 
 5 | Title: WebSRC: A Dataset for Web-Based Structural Reading Comprehension
 6 | 
 7 | Abstract: https://arxiv.org/abs/2101.09465
 8 | 
 9 | Homepage: https://x-lance.github.io/WebSRC/#
10 | 
11 | WebSRC is a dataset for web-based structural reading comprehension.
12 | Its full train/dev/test split contains over 400k questions across 6.4k webpages. 
13 | This version of the dataset does not contain OCR or original HTML, it simply treats WebSRC as a image-and-text-based multimodal Q&A benchmark on webpage screenshots.
14 | 
15 | ## Citation
16 | 
17 | ```bibtex
18 | @inproceedings{chen2021websrc,
19 |   title={WebSRC: A Dataset for Web-Based Structural Reading Comprehension},
20 |   author={Chen, Xingyu and Zhao, Zihan and Chen, Lu and Ji, Jiabao and Zhang, Danyang and Luo, Ao and Xiong, Yuxuan and Yu, Kai},
21 |   booktitle={Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
22 |   pages={4173--4185},
23 |   year={2021}
24 | }
25 | ```
26 | 
27 | ## Groups & Tasks
28 | 
29 | ### Groups 
30 | 
31 | - `websrc`: Evaluates `websrc-val` and generates a submission file for `websrc-test`.
32 | 
33 | ### Tasks
34 | 
35 | - `websrc-val`: Given a question and a web page, predict the answer.
36 | - `websrc-test`: Given a question and a web page, predict the answer. Ground truth is not provided for this task.
37 | 
38 | ## Metrics
39 | 
40 | This task uses SQUAD-style evaluation metrics, of which F1 score over tokens is used. 
41 | The orignal paper also uses Exact Match (EM) score, but this is not implemented here as that metric is more conducive for Encoder-only extraction models.
42 | 
43 | ### F1 Score
44 | 
45 | F1 Score is the harmonic mean of precision and recall.
46 | We calculate precision and recall at the token level, then compute the F1 score as normal using these values.
47 | 
48 | ### Test Submission
49 | 
50 | When evaluaing on the test split, a prediction JSON will be compiled instead of metrics computed. 
51 | Instructions for submission are available on the [WebSRC homepage](https://x-lance.github.io/WebSRC/#) and in their [Original GitHub Repo](https://github.com/X-LANCE/WebSRC-Baseline#obtain-test-result).


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/websrc/websrc.yaml:
--------------------------------------------------------------------------------
1 | group: websrc
2 | task:
3 | - websrc_val
4 | - websrc_test
5 | 


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/websrc/websrc_test.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: rootsautomation/websrc-test
 2 | task: "websrc_test"
 3 | test_split: test
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.websrc_doc_to_visual
 6 | doc_to_text: !function utils.websrc_doc_to_text
 7 | doc_to_target: "answer"
 8 | # The return value of process_results will be used by metrics
 9 | process_results: !function utils.websrc_process_results
10 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
11 | generation_kwargs:
12 |   max_new_tokens: 16
13 |   image_aspect_ratio: pad
14 | metric_list:
15 |   - metric: submission
16 |     aggregation: !function utils.websrc_test_aggregate_results_for_submission
17 |     higher_is_better: true
18 | metadata:
19 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/lmms_eval/tasks/websrc/websrc_val.yaml:
--------------------------------------------------------------------------------
 1 | dataset_path: rootsautomation/websrc
 2 | task: "websrc_val"
 3 | test_split: dev
 4 | output_type: generate_until
 5 | doc_to_visual: !function utils.websrc_doc_to_visual
 6 | doc_to_text: !function utils.websrc_doc_to_text
 7 | doc_to_target: "answer"
 8 | # The return value of process_results will be used by metrics
 9 | process_results: !function utils.websrc_process_results
10 | # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
11 | generation_kwargs:
12 |   max_new_tokens: 16
13 |   image_aspect_ratio: pad
14 | metric_list:
15 |   - metric: websrc_squad_f1
16 |     aggregation: !function utils.websrc_aggregate_results
17 |     higher_is_better: true
18 | metadata:
19 |   - version: 0.0


--------------------------------------------------------------------------------
/lmms-eval/miscs/llava_result_check.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MengLcool/DeepStack-VL/3aab69bff6d7ce587b839f518dda40d3379e522d/lmms-eval/miscs/llava_result_check.md


--------------------------------------------------------------------------------
/lmms-eval/miscs/repr_scripts.sh:
--------------------------------------------------------------------------------
 1 | # install lmms_eval without building dependencies
 2 | cd lmms_eval;
 3 | pip install --no-deps -U -e .
 4 | 
 5 | # install LLaVA without building dependencies
 6 | cd LLaVA
 7 | pip install --no-deps -U -e .
 8 | 
 9 | # install all the requirements that require for reproduce llava results
10 | pip install -r llava_repr_requirements.txt
11 | 
12 | # Run and exactly reproduce llava_v1.5 results!
13 | # mme as an example
14 | accelerate launch --num_processes=1 -m lmms_eval --model llava   --model_args pretrained="liuhaotian/llava-v1.5-7b,use_flash_attention_2=False,device_map=auto"   --tasks mme  --batch_size 1 --log_samples --log_samples_suffix reproduce --output_path ./logs/


--------------------------------------------------------------------------------
/lmms-eval/miscs/script.sh:
--------------------------------------------------------------------------------
 1 | accelerate launch --num_processes=1 -m lmms_eval --model llava   --model_args pretrained="liuhaotian/llava-v1.5-7b"   --tasks mme_llava_prompt  --batch_size 1 --log_samples --log_samples_sufix debug --output_path ./logs/
 2 | 
 3 | 
 4 | gpu = 8 bs 1:
 5 | 
 6 | llava (pretrained=llava-hf/llava-1.5-7b-hf), gen_kwargs: (), limit: None, num_fewshot: None, batch_size: 1
 7 | |     Tasks      |Version|Filter|n-shot|  Metric   |Value|   |Stderr |
 8 | |----------------|-------|------|-----:|-----------|----:|---|------:|
 9 | |mme_llava_prompt|Yaml   |none  |     0|exact_match| 1873|±  |38.4331|
10 | 
11 | gpu = 8 bs 1 use_flash_attention_2=True:
12 | 
13 | 
14 | 
15 | 
16 | 
17 | gpu = 4 bs 1 use_flash_attention_2=True:
18 | 
19 | 
20 | 
21 | accelerate launch --num_processes=8 -m lmms_eval --model llava   --model_args pretrained="liuhaotian/llava-v1.5-13b"   --tasks scienceqa  --batch_size 1 --log_samples --log_samples_sufix debug --output_path ./logs/
22 | 


--------------------------------------------------------------------------------
/lmms-eval/miscs/test_llava.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | from PIL import Image
 3 | 
 4 | import torch
 5 | from transformers import AutoProcessor, LlavaForConditionalGeneration
 6 | 
 7 | model_id = "llava-hf/llava-1.5-7b-hf"
 8 | 
 9 | prompt_1 = "USER: <image>\nWhat does this image show?\nASSISTANT:"
10 | prompt_2 = "USER: <image> <image> \nWhat is the difference between these two images?\nASSISTANT:"
11 | image_file_1 = "image1.png"
12 | image_file_2 = "image2.png"
13 | model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_flash_attention_2=True).to(0)
14 | processor = AutoProcessor.from_pretrained(model_id)
15 | raw_image_1 = Image.open(image_file_1)
16 | raw_image_2 = Image.open(image_file_2)
17 | inputs = processor([prompt_1, prompt_2], [raw_image_1, raw_image_1, raw_image_2], padding=True, return_tensors="pt").to(0, torch.float16)
18 | import pdb
19 | 
20 | pdb.set_trace()
21 | output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
22 | print(processor.batch_decode(output, skip_special_tokens=True))
23 | 


--------------------------------------------------------------------------------
/lmms-eval/miscs/test_scienceqa.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset
2 | 
3 | dataset = load_dataset("Otter-AI/ScienceQA", trust_remote_code=True)["test"]
4 | for doc in dataset:
5 |     print(doc["id"])
6 | 


--------------------------------------------------------------------------------
/lmms-eval/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.black]
 2 | line-length = 240
 3 | 
 4 | [build-system]
 5 | requires = ["setuptools>=42", "wheel", "setuptools_scm[tomli]>=6.3"]
 6 | build-backend = "setuptools.build_meta"
 7 | 
 8 | [project]
 9 | name = "lmms_eval"
10 | version = "0.1.2"
11 | authors = [
12 |     { name = "LMMMs-Lab Evaluation Team", email = "lmms_eval@outlook.com" },
13 | ]
14 | description = "A framework for evaluating large multi-modality language models"
15 | readme = "README.md"
16 | classifiers = [
17 |     "Programming Language :: Python :: 3",
18 |     "License :: OSI Approved :: MIT License",
19 |     "Operating System :: OS Independent",
20 | ]
21 | requires-python = ">=3.8"
22 | license = { text = "MIT" }
23 | dependencies = [
24 |     "accelerate>=0.21.0",
25 |     "black==24.1.0",
26 |     "datasets==2.16.1",
27 |     "evaluate>=0.4.0",
28 |     "jsonlines",
29 |     "numexpr",
30 |     "peft>=0.2.0",
31 |     "pybind11>=2.6.2",
32 |     "pytablewriter",
33 |     "rouge-score>=0.0.4",
34 |     "sacrebleu>=1.5.0",
35 |     "scikit-learn>=0.24.1",
36 |     "sqlitedict",
37 |     "torch>=1.8",
38 |     "openai>=1.0.0",
39 |     "pycocoevalcap",
40 |     "tqdm-multiprocess",
41 |     "transformers",
42 |     "zstandard",
43 |     "pillow",
44 |     "pyyaml",
45 |     "sympy",
46 |     "mpmath",
47 |     "Jinja2",
48 |     "openpyxl",
49 |     "Levenshtein",
50 |     "hf_transfer",
51 |     "tenacity",
52 |     "wandb>=0.16.0",
53 |     "transformers-stream-generator",
54 |     "tiktoken",
55 |     "pre-commit",
56 |     "pydantic",
57 | ]
58 | 
59 | [tool.setuptools.packages.find]
60 | include = ["lmms_eval*"]
61 | 
62 | [tool.setuptools.package-data]
63 | lmms_eval = ["**/*.yaml", "tasks/**/*"]
64 | 
65 | [project.scripts]
66 | lmms-eval = "lmms_eval.__main__:cli_evaluate"
67 | lmms_eval = "lmms_eval.__main__:cli_evaluate"
68 | 
69 | [project.urls]
70 | Homepage = "https://lmms-lab.github.io/lmms-eval-blog/"
71 | Repository = "https://github.com/EvolvingLMMs-Lab/lmms-eval"
72 | 


--------------------------------------------------------------------------------
/lmms-eval/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 | 
3 | # This is to make sure that the package supports editable installs
4 | setuptools.setup()
5 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["setuptools>=61.0"]
 3 | build-backend = "setuptools.build_meta"
 4 | 
 5 | [project]
 6 | name = "llava"
 7 | version = "1.2.2.post1"
 8 | description = "Towards GPT-4 like large language and visual assistant."
 9 | readme = "README.md"
10 | requires-python = ">=3.8"
11 | classifiers = [
12 |     "Programming Language :: Python :: 3",
13 |     "License :: OSI Approved :: Apache Software License",
14 | ]
15 | dependencies = [
16 |     "torch==2.1.2", "torchvision==0.16.2",
17 |     "transformers==4.39.3", "tokenizers==0.15.1", "sentencepiece==0.1.99", "shortuuid",
18 |     "accelerate==0.27.2", "peft", "bitsandbytes",
19 |     "pydantic", "markdown2[all]", "numpy", "scikit-learn==1.2.2",
20 |     "gradio==4.16.0", "gradio_client==0.8.1",
21 |     "requests", "httpx==0.24.0", "uvicorn", "fastapi",
22 |     "einops==0.6.1", "einops-exts==0.0.4", "timm==0.6.13",
23 | ]
24 | 
25 | [project.optional-dependencies]
26 | train = ["deepspeed==0.12.6", "ninja", "wandb"]
27 | build = ["build", "twine"]
28 | 
29 | [project.urls]
30 | "Homepage" = "https://llava-vl.github.io"
31 | "Bug Tracker" = "https://github.com/haotian-liu/LLaVA/issues"
32 | 
33 | [tool.setuptools.packages.find]
34 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
35 | 
36 | [tool.wheel]
37 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
38 | 


--------------------------------------------------------------------------------
/scripts/eval_lmms.sh:
--------------------------------------------------------------------------------
 1 | # login to huggingface
 2 | 
 3 | CKPT=${1:-"None"}
 4 | conv_template=${2:-"vicuna_v1"}
 5 | vistoken_patch_size=${3:-"None"}
 6 | 
 7 | eval_tasks=${eval_tasks:-"textvqa,chartqa,docvqa"}
 8 | master_port=${master_port:-"12345"}
 9 | GPUS=`nvidia-smi -L | wc -l`
10 | 
11 | echo $CKPT, $conv_template
12 | 
13 | accelerate launch --num_processes=$GPUS --main_process_port=${master_port} -m lmms_eval --model llava   \
14 |     --model_args pretrained=$CKPT,conv_template=${conv_template} \
15 |     --tasks $eval_tasks  --batch_size 1 --log_samples --log_samples_suffix lmms_eval --output_path $CKPT/logs/
16 | 
17 | 


--------------------------------------------------------------------------------
/scripts/extract_mm_projector.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This is just a utility that I use to extract the projector for quantized models.
 3 | It is NOT necessary at all to train, or run inference/serve demos.
 4 | Use this script ONLY if you fully understand its implications.
 5 | """
 6 | 
 7 | 
 8 | import os
 9 | import argparse
10 | import torch
11 | import json
12 | from collections import defaultdict
13 | 
14 | 
15 | def parse_args():
16 |     parser = argparse.ArgumentParser(description='Extract MMProjector weights')
17 |     parser.add_argument('--model-path', type=str, help='model folder')
18 |     parser.add_argument('--output', type=str, help='output file')
19 |     args = parser.parse_args()
20 |     return args
21 | 
22 | 
23 | if __name__ == '__main__':
24 |     args = parse_args()
25 | 
26 |     keys_to_match = ['mm_projector']
27 |     ckpt_to_key = defaultdict(list)
28 |     try:
29 |         model_indices = json.load(open(os.path.join(args.model_path, 'pytorch_model.bin.index.json')))
30 |         for k, v in model_indices['weight_map'].items():
31 |             if any(key_match in k for key_match in keys_to_match):
32 |                 ckpt_to_key[v].append(k)
33 |     except FileNotFoundError:
34 |         # Smaller models or model checkpoints saved by DeepSpeed.
35 |         v = 'pytorch_model.bin'
36 |         for k in torch.load(os.path.join(args.model_path, v), map_location='cpu').keys():
37 |             if any(key_match in k for key_match in keys_to_match):
38 |                 ckpt_to_key[v].append(k)
39 | 
40 |     loaded_weights = {}
41 | 
42 |     for ckpt_name, weight_keys in ckpt_to_key.items():
43 |         ckpt = torch.load(os.path.join(args.model_path, ckpt_name), map_location='cpu')
44 |         for k in weight_keys:
45 |             loaded_weights[k] = ckpt[k]
46 | 
47 |     torch.save(loaded_weights, args.output)
48 | 


--------------------------------------------------------------------------------
/scripts/merge_lora_weights.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | from llava.model.builder import load_pretrained_model
 3 | from llava.mm_utils import get_model_name_from_path
 4 | 
 5 | 
 6 | def merge_lora(args):
 7 |     model_name = get_model_name_from_path(args.model_path)
 8 |     tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, device_map='cpu')
 9 | 
10 |     model.save_pretrained(args.save_model_path)
11 |     tokenizer.save_pretrained(args.save_model_path)
12 | 
13 | 
14 | if __name__ == "__main__":
15 |     parser = argparse.ArgumentParser()
16 |     parser.add_argument("--model-path", type=str, required=True)
17 |     parser.add_argument("--model-base", type=str, required=True)
18 |     parser.add_argument("--save-model-path", type=str, required=True)
19 | 
20 |     args = parser.parse_args()
21 | 
22 |     merge_lora(args)
23 | 


--------------------------------------------------------------------------------
/scripts/zero2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "fp16": {
 3 |         "enabled": "auto",
 4 |         "loss_scale": 0,
 5 |         "loss_scale_window": 1000,
 6 |         "initial_scale_power": 16,
 7 |         "hysteresis": 2,
 8 |         "min_loss_scale": 1
 9 |     },
10 |     "bf16": {
11 |         "enabled": "auto"
12 |     },
13 |     "train_micro_batch_size_per_gpu": "auto",
14 |     "train_batch_size": "auto",
15 |     "gradient_accumulation_steps": "auto",
16 |     "zero_optimization": {
17 |         "stage": 2,
18 |         "overlap_comm": true,
19 |         "contiguous_gradients": true,
20 |         "sub_group_size": 1e9,
21 |         "reduce_bucket_size": "auto"
22 |     }
23 | }


--------------------------------------------------------------------------------
/scripts/zero3.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "fp16": {
 3 |         "enabled": "auto",
 4 |         "loss_scale": 0,
 5 |         "loss_scale_window": 1000,
 6 |         "initial_scale_power": 16,
 7 |         "hysteresis": 2,
 8 |         "min_loss_scale": 1
 9 |     },
10 |     "bf16": {
11 |         "enabled": "auto"
12 |     },
13 |     "train_micro_batch_size_per_gpu": "auto",
14 |     "train_batch_size": "auto",
15 |     "gradient_accumulation_steps": "auto",
16 |     "zero_optimization": {
17 |         "stage": 3,
18 |         "overlap_comm": true,
19 |         "contiguous_gradients": true,
20 |         "sub_group_size": 1e9,
21 |         "reduce_bucket_size": "auto",
22 |         "stage3_prefetch_bucket_size": "auto",
23 |         "stage3_param_persistence_threshold": "auto",
24 |         "stage3_max_live_parameters": 1e9,
25 |         "stage3_max_reuse_distance": 1e9,
26 |         "stage3_gather_16bit_weights_on_model_save": true
27 |     }
28 | }


--------------------------------------------------------------------------------
/scripts/zero3_offload.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "fp16": {
 3 |     "enabled": "auto",
 4 |     "loss_scale": 0,
 5 |     "loss_scale_window": 1000,
 6 |     "initial_scale_power": 16,
 7 |     "hysteresis": 2,
 8 |     "min_loss_scale": 1
 9 |   },
10 |   "bf16": {
11 |     "enabled": "auto"
12 |   },
13 |   "optimizer": {
14 |     "type": "AdamW",
15 |     "params": {
16 |       "lr": "auto",
17 |       "betas": "auto",
18 |       "eps": "auto",
19 |       "weight_decay": "auto"
20 |     }
21 |   },
22 |   "scheduler": {
23 |     "type": "WarmupLR",
24 |     "params": {
25 |       "warmup_min_lr": "auto",
26 |       "warmup_max_lr": "auto",
27 |       "warmup_num_steps": "auto"
28 |     }
29 |   },
30 |   "zero_optimization": {
31 |     "stage": 3,
32 |     "offload_optimizer": {
33 |       "device": "cpu",
34 |       "pin_memory": true
35 |     },
36 |     "offload_param": {
37 |       "device": "cpu",
38 |       "pin_memory": true
39 |     },
40 |     "overlap_comm": true,
41 |     "contiguous_gradients": true,
42 |     "sub_group_size": 1e9,
43 |     "reduce_bucket_size": "auto",
44 |     "stage3_prefetch_bucket_size": "auto",
45 |     "stage3_param_persistence_threshold": "auto",
46 |     "stage3_max_live_parameters": 1e9,
47 |     "stage3_max_reuse_distance": 1e9,
48 |     "gather_16bit_weights_on_model_save": true
49 |   },
50 |   "gradient_accumulation_steps": "auto",
51 |   "gradient_clipping": "auto",
52 |   "train_batch_size": "auto",
53 |   "train_micro_batch_size_per_gpu": "auto",
54 |   "steps_per_print": 1e5,
55 |   "wall_clock_breakdown": false
56 | }


--------------------------------------------------------------------------------