├── .env.sample
├── .github
    └── workflows
    │   ├── deploy-github-pages.yml
    │   ├── python-publish.yml
    │   └── test.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .python-version
├── LICENSE
├── README.md
├── assets
    ├── streamlit_visualization.png
    └── teaser.png
├── eval_all.sh
├── eval_with_vllm.sh
├── examples
    ├── asagi.py
    ├── base_vllm.py
    ├── base_vlm.py
    ├── evovlm_jp_v1.py
    ├── gemma3.py
    ├── gpt4o.py
    ├── heron_nvila.py
    ├── internvl2.py
    ├── japanese_instructblip_alpha.py
    ├── japanese_stable_vlm.py
    ├── llama_3_2_vision.py
    ├── llama_3_evovlm_jp_v2.py
    ├── llava_1_5.py
    ├── llava_1_6_mistral_hf.py
    ├── llava_calm2_siglip.py
    ├── llm_jp_3_vila.py
    ├── model_table.py
    ├── pangea_hf.py
    ├── phi4_multimodal.py
    ├── pixtral.py
    ├── qwen2_5_vl.py
    ├── qwen2_vl.py
    ├── sample.py
    ├── sample_vllm.py
    ├── sarashina2_vision.py
    ├── test_model.py
    ├── utils.py
    ├── vila.py
    ├── vllm_registry.py
    └── xcomposer2d5.py
├── github_pages
    ├── .gitignore
    ├── .prettierrc
    ├── README.md
    ├── package-lock.json
    ├── package.json
    ├── public
    │   ├── dataset_url.json
    │   ├── default_metrics.json
    │   ├── index.html
    │   └── leaderboard.json
    └── src
    │   ├── Author.css
    │   ├── Author.js
    │   ├── BibTex.css
    │   ├── BibTex.js
    │   ├── Figure.css
    │   ├── Figure.js
    │   ├── Footer.css
    │   ├── Footer.js
    │   ├── Introduction.css
    │   ├── Introduction.js
    │   ├── Leaderboard.css
    │   ├── Leaderboard.js
    │   ├── LinkButton.css
    │   ├── LinkButton.js
    │   ├── Main.css
    │   ├── Main.js
    │   ├── Method.css
    │   ├── Method.js
    │   ├── PageLayout.css
    │   ├── PageLayout.js
    │   ├── PaperMetaData.css
    │   ├── PaperMetaData.js
    │   ├── Result.css
    │   ├── Result.js
    │   ├── assets
    │       └── teaser.png
    │   ├── index.css
    │   ├── index.js
    │   ├── logo.svg
    │   ├── reportWebVitals.js
    │   └── setupTests.js
├── pyproject.toml
├── scripts
    ├── browse_prediction.py
    ├── consistency_mecha_ja.py
    ├── make_leaderboard.py
    └── prepare_jic_vqa.py
├── src
    └── eval_mm
    │   ├── __init__.py
    │   ├── _version.py
    │   ├── metrics
    │       ├── __init__.py
    │       ├── cc_ocr_scorer.py
    │       ├── exact_match_scorer.py
    │       ├── heron_bench_scorer.py
    │       ├── jdocqa_scorer.py
    │       ├── jic_vqa_scorer.py
    │       ├── jmmmu_scorer.py
    │       ├── llm_as_a_judge_scorer.py
    │       ├── mecha_ja_scorer.py
    │       ├── mmmu_scorer.py
    │       ├── rougel_scorer.py
    │       ├── scorer.py
    │       ├── scorer_registry.py
    │       └── substring_match_scorer.py
    │   ├── tasks
    │       ├── __init__.py
    │       ├── cc_ocr.py
    │       ├── cvqa.py
    │       ├── ja_multi_image_vqa.py
    │       ├── ja_vg_vqa_500.py
    │       ├── ja_vlm_bench_in_the_wild.py
    │       ├── japanese_heron_bench.py
    │       ├── jdocqa.py
    │       ├── jic_vqa.py
    │       ├── jmmmu.py
    │       ├── llava_bench_in_the_wild.py
    │       ├── mecha_ja.py
    │       ├── mmmlu.py
    │       ├── mmmu.py
    │       ├── mnist.py
    │       ├── task.py
    │       └── task_registry.py
    │   └── utils
    │       ├── __init__.py
    │       └── azure_client.py
├── test.sh
├── test_model.sh
└── tips
    └── evaluation.md


/.env.sample:
--------------------------------------------------------------------------------
1 | # For Azure OpenAI API
2 | AZURE_OPENAI_ENDPOINT=
3 | AZURE_OPENAI_KEY=
4 | # For OpenAI API
5 | OPENAI_API_KEY=
6 | 


--------------------------------------------------------------------------------
/.github/workflows/deploy-github-pages.yml:
--------------------------------------------------------------------------------
 1 | name: Deploy GitHub Pages
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - master
 7 |     paths:
 8 |       - .github/workflows/deploy-github-pages.yml
 9 |       - github_pages/**
10 | 
11 | jobs:
12 |   build:
13 |     runs-on: ubuntu-latest
14 | 
15 |     defaults:
16 |       run:
17 |         working-directory: github_pages
18 | 
19 |     steps:
20 |       - name: Checkout
21 |         uses: actions/checkout@v4
22 |         with:
23 |           fetch-depth: 0
24 | 
25 |       - name: Instal Node.js
26 |         uses: actions/setup-node@v4
27 | 
28 |       - name: Install dependencies
29 |         run: npm install
30 | 
31 |       - name: Build
32 |         run: npm run build
33 |         env:
34 |           PUBLIC_URL: /llm-jp-eval-mm
35 | 
36 |       - name: Upload Pages artifact
37 |         uses: actions/upload-pages-artifact@v3
38 |         with:
39 |           path: github_pages/build
40 | 
41 |   deploy:
42 |     needs: build
43 | 
44 |     permissions:
45 |       pages: write
46 |       id-token: write
47 | 
48 |     environment:
49 |       name: github-pages
50 |       url: ${{ steps.deployment.outputs.page_url }}
51 | 
52 |     runs-on: ubuntu-latest
53 | 
54 |     steps:
55 |       - name: Deploy to GitHub Pages
56 |         uses: actions/deploy-pages@v4
57 |         id: deployment
58 | 


--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
 1 | name: Release workflow
 2 | 
 3 | on:
 4 |   push:
 5 |     tags:
 6 |       - "v[0123456789].*"
 7 | 
 8 | permissions:
 9 |   contents: read
10 |   id-token: write
11 | 
12 | jobs:
13 |   release:
14 |     runs-on: ubuntu-latest
15 |     steps:
16 |       - name: checkout
17 |         uses: actions/checkout@v4
18 |       - name: setup python
19 |         uses: actions/setup-python@v2
20 |         with:
21 |           python-version: "3.x"
22 |       - name: build
23 |         run: |
24 |           python -m pip install --upgrade build hatch
25 |           python -m hatch version "${GITHUB_REF_NAME}"
26 |           python -m build
27 |       - name: publish
28 |         uses: pypa/gh-action-pypi-publish@release/v1
29 |         with:
30 |           password: ${{ secrets.PYPI_API_TOKEN }}
31 | 


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: Test workflow
 2 | 
 3 | on:
 4 |   push:
 5 | 
 6 | jobs:
 7 |   uv-example:
 8 |     name: python
 9 |     runs-on: ubuntu-latest
10 | 
11 |     steps:
12 |       - uses: actions/checkout@v4
13 | 
14 |       - name: Install uv
15 |         uses: astral-sh/setup-uv@v5
16 | 
17 |       - name: Install the project
18 |         run: uv sync --dev
19 | 
20 |       - name: Run tests
21 |         # For example, using `pytest`
22 |         run: uv run pytest src/eval_mm/metrics/*.py
23 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | 
 49 | # Translations
 50 | *.mo
 51 | *.pot
 52 | 
 53 | # Django stuff:
 54 | *.log
 55 | .static_storage/
 56 | .media/
 57 | local_settings.py
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | 
 76 | # celery beat schedule file
 77 | celerybeat-schedule
 78 | 
 79 | # SageMath parsed files
 80 | *.sage.py
 81 | 
 82 | # Environments
 83 | .venv
 84 | env/
 85 | venv/
 86 | ENV/
 87 | env.bak/
 88 | venv.bak/
 89 | .env
 90 | 
 91 | # Spyder project settings
 92 | .spyderproject
 93 | .spyproject
 94 | 
 95 | # Rope project settings
 96 | .ropeproject
 97 | 
 98 | # mkdocs documentation
 99 | /site
100 | 
101 | # mypy
102 | .mypy_cache/
103 | 
104 | # weights and biases
105 | wandb/
106 | outputs/
107 | 
108 | # config
109 | configs/config.yaml
110 | 
111 | # debug
112 | dataset/
113 | logs/
114 | 
115 | # submodules
116 | submodule/
117 | 
118 | # temporary
119 | notebooks/tmp
120 | tmp/
121 | 
122 | # verbose output
123 | *verbose.jsonl
124 | __depr__/
125 | 
126 | # examples/llava for evaluating LLM-jp-3 VILA
127 | examples/llava/*
128 | 
129 | # experiments
130 | result/
131 | 
132 | # uv.lock
133 | uv.lock
134 | 
135 | # cursor config
136 | .cursor/
137 | .cursorrules
138 | .cursorignore
139 | 
140 | # vscode
141 | .vscode/
142 | 
143 | # cache
144 | .cache/
145 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 |   - repo: https://github.com/astral-sh/ruff-pre-commit
 3 |     # Ruff version.
 4 |     rev: v0.1.4
 5 |     hooks:
 6 |       # Run the Ruff linter.
 7 |       - id: ruff
 8 |         exclude: "(__init__\\.py|migrations/)"
 9 |         args: ["--extend-ignore=F401,E501"]
10 |       # Run the Ruff formatter.
11 |       - id: ruff-format
12 |         exclude: "(__init__\\.py|migrations/)"
13 | 
14 | 
15 |   - repo: https://github.com/pre-commit/pre-commit-hooks
16 |     rev: v4.4.0
17 |     hooks:
18 |       - id: trailing-whitespace        # 末尾の空白を除去
19 |       - id: end-of-file-fixer          # ファイル末の改行統一
20 |       - id: check-merge-conflict       # コンフリクト残りの検出
21 |       - id: check-yaml                 # YAML構文確認
22 |       - id: check-added-large-files    # 巨大ファイルの誤add防止
23 |       - id: no-commit-to-branch        # main/master直コミット防止
24 |         args: ["--branch", "main", "--branch", "master"]
25 | 
26 |   - repo: https://github.com/pre-commit/mirrors-mypy
27 |     rev: v1.9.0
28 |     hooks:
29 |       - id: mypy
30 |         additional_dependencies: [types-requests]
31 | 


--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.12.1
2 | 


--------------------------------------------------------------------------------
/assets/streamlit_visualization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/llm-jp/llm-jp-eval-mm/f0998c316138ae6541b67a3bea03e9cbb0cf4a34/assets/streamlit_visualization.png


--------------------------------------------------------------------------------
/assets/teaser.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/llm-jp/llm-jp-eval-mm/f0998c316138ae6541b67a3bea03e9cbb0cf4a34/assets/teaser.png


--------------------------------------------------------------------------------
/eval_all.sh:
--------------------------------------------------------------------------------
 1 | # Set CUDA devices
 2 | set -eux  # エラーが発生したらスクリプトを停止する
 3 | 
 4 | #export CUDA_VISIBLE_DEVICES=0
 5 | 
 6 | # Model name to group name mapping
 7 | declare -A MODEL_GROUP_MAP=(
 8 |     # ["stabilityai/japanese-instructblip-alpha"]="normal"
 9 |     # ["stabilityai/japanese-stable-vlm"]="normal"
10 |     # ["cyberagent/llava-calm2-siglip"]="calm"
11 |     # ["llava-hf/llava-1.5-7b-hf"]="normal"
12 |     # ["llava-hf/llava-v1.6-mistral-7b-hf"]="normal"
13 |     # ["neulab/Pangea-7B-hf"]="sarashina"
14 |     # ["meta-llama/Llama-3.2-11B-Vision-Instruct"]="normal"
15 |     # ["meta-llama/Llama-3.2-90B-Vision-Instruct"]="normal"
16 |     # ["OpenGVLab/InternVL2-8B"]="normal"
17 |     # ["OpenGVLab/InternVL2-26B"]="normal"
18 |     # ["Qwen/Qwen2-VL-7B-Instruct"]="normal"
19 |     # ["Qwen/Qwen2-VL-72B-Instruct"]="normal"
20 |     # ["Qwen/Qwen2.5-VL-7B-Instruct"]="normal"
21 |     # ["Qwen/Qwen2.5-VL-72B-Instruct"]="normal"
22 |     # ["gpt-4o-2024-11-20"]="normal"
23 |     # ["mistralai/Pixtral-12B-2409"]="pixtral"
24 |     # ["llm-jp/llm-jp-3-vila-14b"]="vilaja"
25 |     # ["Efficient-Large-Model/VILA1.5-13b"]="vilaja"
26 |     # ["SakanaAI/Llama-3-EvoVLM-JP-v2"]="evovlm"
27 |     # ["google/gemma-3-4b-it"]="normal"
28 |     # ["google/gemma-3-12b-it"]="normal"
29 |     # ["google/gemma-3-27b-it"]="normal"
30 |     # ["sbintuitions/sarashina2-vision-8b"]="sarashina"
31 |     # ["sbintuitions/sarashina2-vision-14b"]="sarashina"
32 |     # ["microsoft/Phi-4-multimodal-instruct"]="phi"
33 |     ["turing-motors/Heron-NVILA-Lite-15B"]="heron_nvila"
34 | )
35 | 
36 | # Task list
37 | declare -a task_list=(
38 |     # "japanese-heron-bench"
39 |     "ja-vlm-bench-in-the-wild"
40 |     # "ja-vg-vqa-500"
41 |     "jmmmu"
42 |     "ja-multi-image-vqa"
43 |     "jdocqa"
44 |     "mmmu"
45 |     "llava-bench-in-the-wild"
46 |     # "jic-vqa"
47 |     "mecha-ja"
48 |     # "cc-ocr"
49 |     # "cvqa"
50 | )
51 | 
52 | # Define metrics per task
53 | declare -A METRIC_MAP=(
54 |     ["japanese-heron-bench"]="heron-bench"
55 |     ["ja-vlm-bench-in-the-wild"]="llm-as-a-judge rougel"
56 |     ["ja-vg-vqa-500"]="llm-as-a-judge rougel"
57 |     ["jmmmu"]="jmmmu"
58 |     ["ja-multi-image-vqa"]="llm-as-a-judge rougel"
59 |     ["jdocqa"]="jdocqa llm-as-a-judge"
60 |     ["mmmu"]="mmmu"
61 |     ["llava-bench-in-the-wild"]="llm-as-a-judge rougel"
62 |     ["jic-vqa"]="jic-vqa"
63 |     ["mecha-ja"]="mecha-ja"
64 |     ["cc-ocr"]="cc-ocr"
65 |     ["cvqa"]="substring-match"
66 | )
67 | 
68 | # Result directories
69 | declare -a result_dir_list=(
70 |     "result"
71 | )
72 | 
73 | # Main evaluation loop
74 | for RESULT_DIR in "${result_dir_list[@]}"; do
75 |     for task in "${task_list[@]}"; do
76 |         METRIC=${METRIC_MAP[$task]}
77 |         for model_name in "${!MODEL_GROUP_MAP[@]}"; do
78 |             model_group=${MODEL_GROUP_MAP[$model_name]}
79 |             uv sync --group $model_group
80 |             uv run --group $model_group  python examples/sample.py \
81 |                 --model_id "$model_name" \
82 |                 --task_id "$task" \
83 |                 --metrics $METRIC \
84 |                 --judge_model "gpt-4o-2024-11-20" \
85 |                 --result_dir "$RESULT_DIR"
86 |         done
87 |     done
88 | done
89 | 
90 | echo "All evaluations are done."
91 | 


--------------------------------------------------------------------------------
/eval_with_vllm.sh:
--------------------------------------------------------------------------------
 1 | # Set CUDA devices
 2 | set -eux  # エラーが発生したらスクリプトを停止する
 3 | 
 4 | #export CUDA_VISIBLE_DEVICES=0
 5 | 
 6 | # Model name to group name mapping
 7 | declare -A MODEL_GROUP_MAP=(
 8 |     ["Qwen/Qwen2.5-VL-3B-Instruct"]="normal"
 9 |     ["Qwen/Qwen2.5-VL-7B-Instruct"]="normal"
10 |     ["Qwen/Qwen2.5-VL-32B-Instruct"]="normal"
11 |     # ["Qwen/Qwen2.5-VL-72B-Instruct"]="normal"
12 |     ["google/gemma-3-4b-it"]="normal"
13 |     ["google/gemma-3-12b-it"]="normal"
14 |     ["google/gemma-3-27b-it"]="normal"
15 | )
16 | 
17 | # Task list
18 | declare -a task_list=(
19 |     "japanese-heron-bench"
20 | )
21 | 
22 | # Define metrics per task
23 | declare -A METRIC_MAP=(
24 |     ["japanese-heron-bench"]="heron-bench"
25 |     ["ja-vlm-bench-in-the-wild"]="llm-as-a-judge,rougel"
26 |     ["ja-vg-vqa-500"]="llm-as-a-judge,rougel"
27 |     ["jmmmu"]="jmmmu"
28 |     ["ja-multi-image-vqa"]="llm-as-a-judge,rougel"
29 |     ["jdocqa"]="jdocqa,llm-as-a-judge"
30 |     ["mmmu"]="mmmu"
31 |     ["llava-bench-in-the-wild"]="llm-as-a-judge,rougel"
32 |     ["jic-vqa"]="jic-vqa"
33 |     ["mecha-ja"]="mecha-ja"
34 | )
35 | 
36 | # Result directories
37 | declare -a result_dir_list=(
38 |     "result"
39 | )
40 | 
41 | # Main evaluation loop
42 | for RESULT_DIR in "${result_dir_list[@]}"; do
43 |     for task in "${task_list[@]}"; do
44 |         METRIC=${METRIC_MAP[$task]}
45 |         for model_name in "${!MODEL_GROUP_MAP[@]}"; do
46 |             model_group=${MODEL_GROUP_MAP[$model_name]}
47 |             uv sync --group vllm_normal
48 |             uv run --group vllm_normal  python examples/sample_vllm.py \
49 |                 --model_id "$model_name" \
50 |                 --task_id "$task" \
51 |                 --metrics "$METRIC" \
52 |                 --judge_model "gpt-4o-2024-11-20" \
53 |                 --result_dir "$RESULT_DIR" \
54 |                 --inference_only
55 |         done
56 |     done
57 | done
58 | 
59 | echo "All evaluations are done."
60 | 


--------------------------------------------------------------------------------
/examples/asagi.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from PIL import Image
 3 | from transformers import AutoModel, AutoProcessor
 4 | from base_vlm import BaseVLM
 5 | from utils import GenerationConfig
 6 | 
 7 | 
 8 | class VLM(BaseVLM):
 9 |     def __init__(self, model_id: str = "MIL-UT/Asagi-14B") -> None:
10 |         self.model_id = model_id
11 |         self.model = AutoModel.from_pretrained(
12 |             self.model_id,
13 |             trust_remote_code=True,
14 |             torch_dtype=torch.bfloat16,
15 |             device_map="auto",
16 |         )
17 |         self.processor = AutoProcessor.from_pretrained(self.model_id)
18 | 
19 |     def generate(
20 |         self,
21 |         images: list[Image.Image] | None,
22 |         text: str,
23 |         gen_kwargs: GenerationConfig = GenerationConfig(),
24 |     ) -> str:
25 |         if images is None:
26 |             images = []
27 | 
28 |         prompt = f"""以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい。
29 |         ### 指示:
30 |         {"<image>"*len(images)}
31 |         {text}
32 |         ### 応答:
33 |         """
34 | 
35 |         inputs = self.processor(text=prompt, images=images, return_tensors="pt")
36 | 
37 |         inputs_text = self.processor.tokenizer(prompt, return_tensors="pt")
38 |         inputs["input_ids"] = inputs_text["input_ids"]
39 |         inputs["attention_mask"] = inputs_text["attention_mask"]
40 |         inputs = {
41 |             k: inputs[k].to(self.model.device) for k in inputs if k != "token_type_ids"
42 |         }
43 | 
44 |         generate_ids = self.model.generate(**inputs, **gen_kwargs.__dict__)
45 |         generated_text = self.processor.batch_decode(
46 |             generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
47 |         )[0]
48 |         # truncate the text to remove the prompt
49 |         generated_text = generated_text.split("### 応答:")[1].strip()
50 |         return generated_text
51 | 
52 | 
53 | if __name__ == "__main__":
54 |     vlm = VLM()
55 |     vlm.test_vlm()
56 | 


--------------------------------------------------------------------------------
/examples/base_vllm.py:
--------------------------------------------------------------------------------
 1 | from vllm import LLM, SamplingParams
 2 | from PIL import Image
 3 | from utils import GenerationConfig
 4 | from base_vlm import BaseVLM
 5 | from vllm_registry import VLLMModelRegistry
 6 | import torch
 7 | 
 8 | 
 9 | class VLLM(BaseVLM):
10 |     def __init__(self, model_id: str = "google/gemma-3-4b-it") -> None:
11 |         self.model_id = model_id
12 |         self.registry = VLLMModelRegistry(self.model_id)
13 |         self.processor = self.registry.processor
14 |         self.vllm_loader = self.registry.loader_map[self.model_id]
15 | 
16 |         engine_config = self.registry.get_engine_config(self.model_id)
17 |         self.engine_args_dict = {
18 |             "model": self.model_id,
19 |             "tensor_parallel_size": 2,  # number of GPUs of the machine, but 40 should be divisible by tensor_parallel_size
20 |             "download_dir": "./.cache/vllm",
21 |             **engine_config,
22 |         }
23 |         self.model = LLM(**self.engine_args_dict)
24 | 
25 |     def generate(
26 |         self,
27 |         images: list[Image.Image] | None,
28 |         text: str,
29 |         gen_kwargs: GenerationConfig = GenerationConfig(),
30 |     ) -> str:
31 |         if images is None:
32 |             images = []
33 |         req_data = self.vllm_loader(text, images)
34 |         sampling_params = SamplingParams(
35 |             temperature=gen_kwargs.temperature,
36 |             max_tokens=gen_kwargs.max_new_tokens,
37 |             stop_token_ids=req_data.stop_token_ids,
38 |         )
39 |         outputs = self.model.generate(
40 |             {
41 |                 "prompt": req_data.prompt,
42 |                 "multi_modal_data": {"image": req_data.image_data},
43 |             },
44 |             sampling_params=sampling_params,
45 |             lora_request=req_data.lora_requests,
46 |         )
47 |         return outputs[0].outputs[0].text
48 | 
49 |     def batch_generate(
50 |         self,
51 |         images_list: list[list[Image.Image]] | None,
52 |         text_list: list[str],
53 |         gen_kwargs: GenerationConfig = GenerationConfig(),
54 |     ) -> list[str]:
55 |         if images_list is None:
56 |             images_list = [[] for _ in range(len(text_list))]
57 | 
58 |         assert len(images_list) == len(text_list)
59 | 
60 |         from tqdm import tqdm
61 | 
62 |         req_data_list = []
63 | 
64 |         for text, images in tqdm(zip(text_list, images_list)):
65 |             req_data_list.append(self.vllm_loader(text, images))
66 | 
67 |         sampling_params = SamplingParams(
68 |             temperature=gen_kwargs.temperature,
69 |             max_tokens=gen_kwargs.max_new_tokens,
70 |         )
71 | 
72 |         print(f"Generated {len(req_data_list)} requests")
73 | 
74 |         outputs = self.model.generate(
75 |             [
76 |                 {
77 |                     "prompt": req_data.prompt,
78 |                     "multi_modal_data": {"image": req_data.image_data},
79 |                 }
80 |                 for req_data in req_data_list
81 |             ],
82 |             sampling_params=sampling_params,
83 |         )
84 |         return [output.outputs[0].text for output in outputs]
85 | 
86 | 
87 | if __name__ == "__main__":
88 |     print("=== Qwen/Qwen2.5-VL-3B-Instruct ===")
89 |     vllm = VLLM("Qwen/Qwen2.5-VL-3B-Instruct")
90 |     vllm.test_vlm()
91 |     vllm.test_vlm_batch_100()
92 | 


--------------------------------------------------------------------------------
/examples/base_vlm.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | from PIL import Image
 3 | from utils import GenerationConfig
 4 | from loguru import logger
 5 | 
 6 | 
 7 | class BaseVLM:
 8 |     def __init__(self):
 9 |         raise NotImplementedError
10 | 
11 |     def generate(
12 |         self,
13 |         images: list[Image.Image] | None,
14 |         text: str,
15 |         gen_kwargs: GenerationConfig = GenerationConfig(),
16 |     ) -> str:
17 |         """Generate a response given an image (or list of images) and a prompt."""
18 |         raise NotImplementedError
19 | 
20 |     def batch_generate(
21 |         self,
22 |         images_list: list[list[Image.Image]] | None,
23 |         text_list: list[str],
24 |         gen_kwargs: GenerationConfig = GenerationConfig(),
25 |     ) -> list[str]:
26 |         """Generate a response given a list of images and a list of prompts."""
27 |         raise NotImplementedError
28 | 
29 |     def test_vlm(self):
30 |         """Test the model with one or two images."""
31 |         image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
32 |         image = Image.open(requests.get(image_file, stream=True).raw)
33 |         image_file2 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg"
34 |         image2 = Image.open(requests.get(image_file2, stream=True).raw)
35 |         output = self.generate([image], "画像には何が映っていますか?")
36 |         logger.info(f"Output: {output}")
37 |         assert isinstance(
38 |             output, str
39 |         ), f"Expected output to be a string, but got {type(output)}"
40 | 
41 |         output = self.generate([image, image2], "これらの画像の違いはなんですか?")
42 |         logger.info(f"Output: {output}")
43 |         assert isinstance(
44 |             output, str
45 |         ), f"Expected output to be a string, but got {type(output)}"
46 | 
47 |         # --- No image case ---
48 |         # output = self.generate([], "画像には何が映っていますか?")
49 |         # logger.info(f"Output: {output}")
50 |         # assert isinstance(
51 |         #     output, str
52 |         # ), f"Expected output to be a string, but got {type(output)}"
53 | 
54 |     def test_vlm_100(self):
55 |         """Test the model with one or two images."""
56 |         image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
57 |         image = Image.open(requests.get(image_file, stream=True).raw)
58 | 
59 |         import time
60 | 
61 |         start_time = time.time()
62 |         for _ in range(100):
63 |             output = self.generate([image], "画像には何が映っていますか?")
64 |             logger.info(f"Output: {output}")
65 |             assert isinstance(
66 |                 output, str
67 |             ), f"Expected output to be a string, but got {type(output)}"
68 |         end_time = time.time()
69 |         logger.info(f"Time taken: {end_time - start_time} seconds for 100 times")
70 | 
71 |     def test_vlm_batch_100(self):
72 |         """Test the model with one or two images."""
73 | 
74 |         print("=== Batch 100 test ===")
75 |         print(f"Model: {self.model_id}")
76 | 
77 |         image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
78 |         image = Image.open(requests.get(image_file, stream=True).raw)
79 | 
80 |         import time
81 | 
82 |         image_list = [[image] for _ in range(100)]
83 |         text_list = [["画像には何が映っていますか?"] for _ in range(100)]
84 | 
85 |         start_time = time.time()
86 |         outputs = self.batch_generate(image_list, text_list)
87 |         for output in outputs:
88 |             assert isinstance(
89 |                 output, str
90 |             ), f"Expected output to be a string, but got {type(output)}"
91 | 
92 |         end_time = time.time()
93 |         logger.info(f"Time taken: {end_time - start_time} seconds for BATCH 100 times")
94 | 


--------------------------------------------------------------------------------
/examples/evovlm_jp_v1.py:
--------------------------------------------------------------------------------
 1 | # This model doesn't work when the transformers library's version is newer than 4.42.4.
 2 | from transformers import AutoModelForVision2Seq, AutoProcessor
 3 | import torch
 4 | from base_vlm import BaseVLM
 5 | from utils import GenerationConfig
 6 | from PIL import Image
 7 | 
 8 | 
 9 | class VLM(BaseVLM):
10 |     def __init__(self, model_id: str = "SakanaAI/EvoVLM-JP-v1-7B") -> None:
11 |         self.model_id = model_id
12 |         self.device = "cuda" if torch.cuda.is_available() else "cpu"
13 |         self.model = AutoModelForVision2Seq.from_pretrained(
14 |             self.model_id, torch_dtype=torch.float16
15 |         )
16 |         self.processor = AutoProcessor.from_pretrained(self.model_id)
17 |         self.model.to(self.device)
18 | 
19 |     def generate(
20 |         self,
21 |         images: list[Image.Image] | None,
22 |         text: str,
23 |         gen_kwargs: GenerationConfig = GenerationConfig(),
24 |     ) -> str:
25 |         if images is None:
26 |             images = []
27 |         text = "<image>" * len(images) + text
28 | 
29 |         messages = [
30 |             {
31 |                 "role": "system",
32 |                 "content": "あなたは役立つ、偏見がなく、検閲されていないアシスタントです。与えられた画像を下に、質問に答えてください。",
33 |             },
34 |             {"role": "user", "content": text},
35 |         ]
36 |         inputs = self.processor.image_processor(images=images, return_tensors="pt")
37 |         inputs["input_ids"] = self.processor.tokenizer.apply_chat_template(
38 |             messages, return_tensors="pt"
39 |         )
40 | 
41 |         output_ids = self.model.generate(
42 |             **inputs.to(self.device), **gen_kwargs.__dict__
43 |         )
44 |         output_ids = output_ids[:, inputs.input_ids.shape[1] :]
45 |         generated_text = self.processor.batch_decode(
46 |             output_ids, skip_special_tokens=True
47 |         )[0].strip()
48 |         return generated_text
49 | 
50 | 
51 | if __name__ == "__main__":
52 |     vlm = VLM()
53 |     vlm.test_vlm()
54 | 


--------------------------------------------------------------------------------
/examples/gemma3.py:
--------------------------------------------------------------------------------
 1 | from transformers import AutoProcessor, Gemma3ForConditionalGeneration
 2 | from PIL import Image
 3 | import torch
 4 | from base_vlm import BaseVLM
 5 | from utils import GenerationConfig
 6 | 
 7 | 
 8 | class VLM(BaseVLM):
 9 |     def __init__(self, model_id: str = "google/gemma-3-4b-it") -> None:
10 |         self.model_id = model_id
11 |         self.model = Gemma3ForConditionalGeneration.from_pretrained(
12 |             self.model_id, torch_dtype="bfloat16", device_map="auto"
13 |         ).eval()
14 |         self.processor = AutoProcessor.from_pretrained(self.model_id)
15 | 
16 |     def generate(
17 |         self,
18 |         images: list[Image.Image] | None,
19 |         text: str,
20 |         gen_kwargs: GenerationConfig = GenerationConfig(),
21 |     ) -> str:
22 |         if images is None:
23 |             images = []
24 |         image_content = []
25 |         for image in images:
26 |             image_content.append({"type": "image", "image": image})
27 | 
28 |         messages = [
29 |             {
30 |                 "role": "system",
31 |                 "content": [{"type": "text", "text": "You are a helpful assistant."}],
32 |             },
33 |             {
34 |                 "role": "user",
35 |                 "content": [*image_content, {"type": "text", "text": text}],
36 |             },
37 |         ]
38 | 
39 |         inputs = self.processor.apply_chat_template(
40 |             messages,
41 |             add_generation_prompt=True,
42 |             tokenize=True,
43 |             return_dict=True,
44 |             return_tensors="pt",
45 |         ).to(self.model.device, dtype=torch.bfloat16)
46 | 
47 |         input_len = inputs["input_ids"].shape[-1]
48 | 
49 |         with torch.inference_mode():
50 |             generation = self.model.generate(**inputs, **gen_kwargs.__dict__)
51 |             generation = generation[0][input_len:]
52 | 
53 |         decoded = self.processor.decode(generation, skip_special_tokens=True)
54 |         return decoded
55 | 
56 | 
57 | if __name__ == "__main__":
58 |     vlm = VLM()
59 |     vlm.test_vlm()
60 | 


--------------------------------------------------------------------------------
/examples/gpt4o.py:
--------------------------------------------------------------------------------
 1 | from openai import AzureOpenAI, APIError
 2 | import os
 3 | from io import BytesIO
 4 | import base64
 5 | from base_vlm import BaseVLM
 6 | from utils import GenerationConfig
 7 | import backoff
 8 | from PIL import Image
 9 | 
10 | 
11 | def encode_image_to_base64(image):
12 |     buffered = BytesIO()
13 |     image.save(buffered, format="JPEG")
14 |     img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
15 |     return img_str
16 | 
17 | 
18 | @backoff.on_exception(backoff.expo, (ValueError, APIError), max_tries=5)
19 | def make_api_call(vlm, message, gen_kwargs):
20 |     return vlm.client.chat.completions.create(
21 |         model=vlm.model_id,
22 |         messages=message,
23 |         max_tokens=gen_kwargs.max_new_tokens,
24 |         temperature=gen_kwargs.temperature,
25 |         top_p=gen_kwargs.top_p,
26 |     )
27 | 
28 | 
29 | class VLM(BaseVLM):
30 |     def __init__(self, model_id: str = "gpt-4o-2024-05-13") -> None:
31 |         self.model_id = model_id
32 |         self.client = AzureOpenAI(
33 |             api_key=os.getenv("AZURE_OPENAI_KEY"),
34 |             api_version="2023-05-15",
35 |             azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
36 |         )
37 | 
38 |     def generate(
39 |         self,
40 |         images: list[Image.Image] | None,
41 |         text: str,
42 |         gen_kwargs: GenerationConfig = GenerationConfig(),
43 |     ) -> str:
44 |         message = []
45 |         content: list[dict[str, str | dict[str, str]]] = []
46 |         if images is None:
47 |             images = []
48 |         image_base64_list = [encode_image_to_base64(img) for img in images]
49 |         message_base = {
50 |             "role": "user",
51 |             "content": [
52 |                 {
53 |                     "type": "text",
54 |                     "text": text,
55 |                 },
56 |             ],
57 |         }
58 | 
59 |         content.append(
60 |             {
61 |                 "type": "text",
62 |                 "text": text,
63 |             },
64 |         )
65 |         for image_base64 in image_base64_list:
66 |             content.append(
67 |                 {
68 |                     "type": "image_url",
69 |                     "image_url": {
70 |                         "url": f"data:image/jpeg;base64,{image_base64}",
71 |                         "detail": "auto",
72 |                     },
73 |                 }
74 |             )
75 |         message_base = {
76 |             "role": "user",
77 |             "content": content,
78 |         }
79 |         message = [message_base]
80 | 
81 |         response = make_api_call(self, message, gen_kwargs)
82 |         return response.choices[0].message.content
83 | 
84 | 
85 | if __name__ == "__main__":
86 |     vlm = VLM()
87 |     vlm.test_vlm()
88 | 


--------------------------------------------------------------------------------
/examples/heron_nvila.py:
--------------------------------------------------------------------------------
 1 | from base_vlm import BaseVLM
 2 | from utils import GenerationConfig
 3 | import torch
 4 | from transformers import GenerationConfig as HFGenerationConfig, AutoModel
 5 | 
 6 | 
 7 | def create_prompt(text, image):
 8 |     if image is None or (isinstance(image, list) and len(image) == 0):
 9 |         return [text] if text else []
10 |     if not isinstance(image, list):
11 |         image = [image]
12 |     if not text:
13 |         return image
14 |     if "<image>" not in text:
15 |         prompt = image.copy()
16 |         prompt.append(text)
17 |         return prompt
18 |     parts = text.split("<image>")
19 |     prompt, idx = [], 0
20 |     if parts[0] == "":
21 |         prompt.append(image[idx])
22 |         idx += 1
23 |         parts = parts[1:]
24 |     for i, part in enumerate(parts):
25 |         if part:
26 |             prompt.append(part)
27 |         if idx < len(image) and (i < len(parts) - 1 or text.endswith("<image>")):
28 |             prompt.append(image[idx])
29 |             idx += 1
30 |     return prompt
31 | 
32 | 
33 | class VLM(BaseVLM):
34 |     def __init__(self, model_id="turing-motors/Heron-NVILA-Lite-15B"):
35 |         self.model_id = model_id
36 |         self.model = AutoModel.from_pretrained(
37 |             model_id, trust_remote_code=True, device_map="auto"
38 |         )
39 | 
40 |     def generate(
41 |         self, image, text: str, gen_kwargs: GenerationConfig = GenerationConfig()
42 |     ):
43 |         gen_cfg = HFGenerationConfig(**gen_kwargs.__dict__)
44 |         prompt = create_prompt(text, image)
45 |         with torch.no_grad():
46 |             return self.model.generate_content(prompt, generation_config=gen_cfg)
47 | 
48 | 
49 | if __name__ == "__main__":
50 |     VLM("turing-motors/Heron-NVILA-Lite-15B").test_vlm()
51 | 


--------------------------------------------------------------------------------
/examples/internvl2.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | from torch import Tensor
  3 | import torchvision.transforms as T
  4 | from PIL import Image
  5 | from torchvision.transforms.functional import InterpolationMode
  6 | from transformers import AutoModel, AutoTokenizer
  7 | from base_vlm import BaseVLM
  8 | from utils import GenerationConfig
  9 | import copy
 10 | 
 11 | IMAGENET_MEAN = (0.485, 0.456, 0.406)
 12 | IMAGENET_STD = (0.229, 0.224, 0.225)
 13 | 
 14 | 
 15 | def build_transform(input_size):
 16 |     MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
 17 |     transform = T.Compose(
 18 |         [
 19 |             T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
 20 |             T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
 21 |             T.ToTensor(),
 22 |             T.Normalize(mean=MEAN, std=STD),
 23 |         ]
 24 |     )
 25 |     return transform
 26 | 
 27 | 
 28 | def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
 29 |     best_ratio_diff = float("inf")
 30 |     best_ratio = (1, 1)
 31 |     area = width * height
 32 |     for ratio in target_ratios:
 33 |         target_aspect_ratio = ratio[0] / ratio[1]
 34 |         ratio_diff = abs(aspect_ratio - target_aspect_ratio)
 35 |         if ratio_diff < best_ratio_diff:
 36 |             best_ratio_diff = ratio_diff
 37 |             best_ratio = ratio
 38 |         elif ratio_diff == best_ratio_diff:
 39 |             if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
 40 |                 best_ratio = ratio
 41 |     return best_ratio
 42 | 
 43 | 
 44 | def dynamic_preprocess(
 45 |     image, min_num=1, max_num=12, image_size=448, use_thumbnail=False
 46 | ):
 47 |     orig_width, orig_height = image.size
 48 |     aspect_ratio = orig_width / orig_height
 49 | 
 50 |     # calculate the existing image aspect ratio
 51 |     target_ratios = set(
 52 |         (i, j)
 53 |         for n in range(min_num, max_num + 1)
 54 |         for i in range(1, n + 1)
 55 |         for j in range(1, n + 1)
 56 |         if i * j <= max_num and i * j >= min_num
 57 |     )
 58 |     target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
 59 | 
 60 |     # find the closest aspect ratio to the target
 61 |     target_aspect_ratio = find_closest_aspect_ratio(
 62 |         aspect_ratio, target_ratios, orig_width, orig_height, image_size
 63 |     )
 64 | 
 65 |     # calculate the target width and height
 66 |     target_width = image_size * target_aspect_ratio[0]
 67 |     target_height = image_size * target_aspect_ratio[1]
 68 |     blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
 69 | 
 70 |     # resize the image
 71 |     resized_img = image.resize((target_width, target_height))
 72 |     processed_images = []
 73 |     for i in range(blocks):
 74 |         box = (
 75 |             (i % (target_width // image_size)) * image_size,
 76 |             (i // (target_width // image_size)) * image_size,
 77 |             ((i % (target_width // image_size)) + 1) * image_size,
 78 |             ((i // (target_width // image_size)) + 1) * image_size,
 79 |         )
 80 |         # split the image
 81 |         split_img = resized_img.crop(box)
 82 |         processed_images.append(split_img)
 83 |     assert len(processed_images) == blocks
 84 |     if use_thumbnail and len(processed_images) != 1:
 85 |         thumbnail_img = image.resize((image_size, image_size))
 86 |         processed_images.append(thumbnail_img)
 87 |     return processed_images
 88 | 
 89 | 
 90 | def load_image(image, input_size=448, max_num=12):
 91 |     transform = build_transform(input_size=input_size)
 92 |     images = dynamic_preprocess(
 93 |         image, image_size=input_size, use_thumbnail=True, max_num=max_num
 94 |     )
 95 |     pixel_values = [transform(image) for image in images]
 96 |     pixel_values = torch.stack(pixel_values)
 97 |     return pixel_values
 98 | 
 99 | 
100 | # 画像の数だけ画像を読み込んでcatする
101 | def load_images(images: Image.Image | list[Image.Image]):
102 |     if isinstance(images, list):
103 |         tuples: tuple[Tensor, ...] = ()
104 | 
105 |         for image in images:
106 |             tuples += (load_image(image).to(torch.bfloat16).cuda(),)
107 |         return torch.cat(tuples, dim=0)
108 |     else:
109 |         return load_image(images).to(torch.bfloat16).cuda()
110 | 
111 | 
112 | # 画像の数だけ <image> をpromptの先頭に追加する
113 | def add_image_tags(images: Image.Image | list[Image.Image], prompt: str) -> str:
114 |     if isinstance(images, list):
115 |         num_images = len(images)
116 |     else:
117 |         num_images = 1
118 | 
119 |     image_tags = "<image> " * num_images
120 |     new_prompt = image_tags + prompt
121 | 
122 |     return new_prompt
123 | 
124 | 
125 | class VLM(BaseVLM):
126 |     def __init__(self, model_id: str = "OpenGVLab/InternVL2-8B") -> None:
127 |         self.model_id = model_id
128 |         self.model = AutoModel.from_pretrained(
129 |             self.model_id,
130 |             torch_dtype=torch.bfloat16,
131 |             low_cpu_mem_usage=True,
132 |             use_flash_attn=True,
133 |             trust_remote_code=True,
134 |             device_map="auto",
135 |         )
136 |         self.tokenizer = AutoTokenizer.from_pretrained(
137 |             self.model_id, trust_remote_code=True, use_fast=False
138 |         )
139 | 
140 |     def generate(
141 |         self,
142 |         images: list[Image.Image] | None,
143 |         text: str,
144 |         gen_kwargs: GenerationConfig = GenerationConfig(),
145 |     ) -> str:
146 |         if images is None:
147 |             images = []
148 |         if "<image>" not in text:
149 |             image_tokens = " ".join(["<image>"] * len(images))
150 |             text = f"{image_tokens}\n{text}"
151 | 
152 |         pixel_values_list = []
153 |         for img in images:
154 |             pixel_values = (
155 |                 load_image(img, max_num=12).to(self.model.device).to(self.model.dtype)
156 |             )
157 |             pixel_values_list.append(pixel_values)
158 |         num_patches_list = [pixel_values.size(0) for pixel_values in pixel_values_list]
159 |         if len(images) == 0:
160 |             pixel_values = None
161 |         else:
162 |             pixel_values = torch.cat(pixel_values_list, dim=0)
163 | 
164 |         generation_config = copy.deepcopy(gen_kwargs.__dict__)
165 |         generation_config.pop("use_cache")
166 | 
167 |         response = self.model.chat(
168 |             self.tokenizer,
169 |             pixel_values,
170 |             text,
171 |             num_patches_list=num_patches_list,
172 |             generation_config=generation_config,
173 |         )
174 |         generated_text = response
175 |         return generated_text
176 | 
177 | 
178 | if __name__ == "__main__":
179 |     vlm = VLM()
180 |     vlm.test_vlm()
181 | 


--------------------------------------------------------------------------------
/examples/llama_3_2_vision.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from PIL import Image
 3 | from transformers import MllamaForConditionalGeneration, AutoProcessor
 4 | from base_vlm import BaseVLM
 5 | from utils import GenerationConfig
 6 | 
 7 | 
 8 | class VLM(BaseVLM):
 9 |     def __init__(
10 |         self, model_id: str = "meta-llama/Llama-3.2-11B-Vision-Instruct"
11 |     ) -> None:
12 |         self.model_id = model_id
13 |         self.model = MllamaForConditionalGeneration.from_pretrained(
14 |             self.model_id,
15 |             torch_dtype=torch.bfloat16,
16 |             device_map="auto",
17 |         )
18 |         self.processor = AutoProcessor.from_pretrained(self.model_id)
19 |         self.device = "cuda" if torch.cuda.is_available() else "cpu"
20 | 
21 |     def generate(
22 |         self,
23 |         images: list[Image.Image] | None,
24 |         text: str,
25 |         gen_kwargs: GenerationConfig = GenerationConfig(),
26 |     ) -> str:
27 |         if images is None:
28 |             images = []
29 |         num_images = len(images)
30 |         content = [{"type": "image"} for _ in range(num_images)]
31 |         content.extend([{"type": "text", "text": text}])
32 |         messages = [
33 |             {
34 |                 "role": "user",
35 |                 "content": content,
36 |             }
37 |         ]
38 |         input_text = self.processor.apply_chat_template(
39 |             messages, add_generation_prompt=True
40 |         )
41 | 
42 |         inputs = self.processor(
43 |             text=input_text,
44 |             images=images,
45 |             add_special_tokens=False,
46 |             return_tensors="pt",
47 |         ).to(self.device)
48 | 
49 |         output_ids = self.model.generate(**inputs, **gen_kwargs.__dict__)
50 |         generated_ids = [
51 |             output_ids[len(input_ids) :]
52 |             for input_ids, output_ids in zip(inputs.input_ids, output_ids)
53 |         ]
54 |         return self.processor.decode(
55 |             generated_ids[0],
56 |             skip_special_tokens=True,
57 |             clean_up_tokenization_spaces=True,
58 |         )
59 | 
60 | 
61 | if __name__ == "__main__":
62 |     vlm = VLM()
63 |     vlm.test_vlm()
64 | 


--------------------------------------------------------------------------------
/examples/llama_3_evovlm_jp_v2.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from mantis.models.conversation import Conversation, SeparatorStyle
 3 | from mantis.models.mllava import (
 4 |     chat_mllava,
 5 |     LlavaForConditionalGeneration,
 6 |     MLlavaProcessor,
 7 | )
 8 | from mantis.models.mllava.utils import conv_templates
 9 | from base_vlm import BaseVLM
10 | from utils import GenerationConfig
11 | from PIL import Image
12 | 
13 | # 1. Set the system prompt
14 | conv_llama_3_elyza = Conversation(
15 |     system="<|start_header_id|>system<|end_header_id|>\n\nあなたは誠実で優秀な日本人のアシスタントです。特に指示が無い場合は、常に日本語で回答してください。",
16 |     roles=("user", "assistant"),
17 |     messages=(),
18 |     offset=0,
19 |     sep_style=SeparatorStyle.LLAMA_3,
20 |     sep="<|eot_id|>",
21 | )
22 | conv_templates["llama_3"] = conv_llama_3_elyza
23 | 
24 | 
25 | class VLM(BaseVLM):
26 |     def __init__(self, model_id: str = "SakanaAI/Llama-3-EvoVLM-JP-v2") -> None:
27 |         self.model_id = model_id
28 |         self.device = "cuda" if torch.cuda.is_available() else "cpu"
29 |         self.model = LlavaForConditionalGeneration.from_pretrained(
30 |             self.model_id, torch_dtype=torch.float16, device_map=self.device
31 |         ).eval()
32 |         self.processor = MLlavaProcessor.from_pretrained(
33 |             "TIGER-Lab/Mantis-8B-siglip-llama3"
34 |         )
35 |         self.processor.tokenizer.pad_token = self.processor.tokenizer.eos_token
36 | 
37 |     def generate(
38 |         self,
39 |         images: list[Image.Image] | None,
40 |         text: str,
41 |         gen_kwargs: GenerationConfig = GenerationConfig(),
42 |     ) -> str:
43 |         if images is None:
44 |             images = []
45 |         if "<image>" not in text:
46 |             text = "<image> " * len(images) + "\n" + text
47 |         response, history = chat_mllava(
48 |             text, images, self.model, self.processor, **gen_kwargs.__dict__
49 |         )
50 |         return response
51 | 
52 | 
53 | if __name__ == "__main__":
54 |     vlm = VLM()
55 |     vlm.test_vlm()
56 | 


--------------------------------------------------------------------------------
/examples/llava_1_5.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from PIL import Image
 3 | from transformers import AutoProcessor, LlavaForConditionalGeneration
 4 | from base_vlm import BaseVLM
 5 | from utils import GenerationConfig
 6 | 
 7 | DEFAULT_IMAGE_TOKEN = "<image>"
 8 | 
 9 | 
10 | class VLM(BaseVLM):
11 |     def __init__(self, model_id: str = "llava-hf/llava-1.5-7b-hf") -> None:
12 |         self.model_id = model_id
13 |         self.device = "cuda" if torch.cuda.is_available() else "cpu"
14 |         self.model = LlavaForConditionalGeneration.from_pretrained(
15 |             self.model_id,
16 |             torch_dtype=torch.float16,
17 |             low_cpu_mem_usage=True,
18 |         )
19 |         self.processor = AutoProcessor.from_pretrained(self.model_id)
20 |         self.model.to(self.device)
21 | 
22 |     def generate(
23 |         self,
24 |         images: list[Image.Image] | None,
25 |         text: str,
26 |         gen_kwargs: GenerationConfig = GenerationConfig(),
27 |     ) -> str:
28 |         if images is None:
29 |             images = []
30 |         if DEFAULT_IMAGE_TOKEN in text:
31 |             text = text.replace(DEFAULT_IMAGE_TOKEN, "")
32 |         content = [{"type": "image"} for _ in range(len(images))]
33 |         content.extend([{"type": "text", "text": text}])
34 |         messages = [
35 |             {
36 |                 "role": "user",
37 |                 "content": content,
38 |             },
39 |         ]
40 | 
41 |         prompt = self.processor.apply_chat_template(
42 |             messages, add_generation_prompt=True
43 |         )
44 | 
45 |         # processorがimages=Noneと[]を区別する可能性があるため、分岐で処理
46 |         if len(images) == 0:
47 |             inputs = self.processor(text=prompt, return_tensors="pt").to(self.device)
48 |         else:
49 |             inputs = self.processor(images=images, text=prompt, return_tensors="pt").to(
50 |                 self.device
51 |             )
52 | 
53 |         output = self.model.generate(**inputs, **gen_kwargs.__dict__)[0]
54 |         generated_text = self.processor.decode(output, skip_special_tokens=True)
55 |         answer = generated_text.split("ASSISTANT:")[-1].strip()
56 |         return answer
57 | 
58 | 
59 | if __name__ == "__main__":
60 |     vlm = VLM()
61 |     vlm.test_vlm()
62 | 


--------------------------------------------------------------------------------
/examples/llava_1_6_mistral_hf.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from PIL import Image
 3 | from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor
 4 | from base_vlm import BaseVLM
 5 | from utils import GenerationConfig
 6 | 
 7 | 
 8 | class VLM(BaseVLM):
 9 |     def __init__(self, model_id: str = "llava-hf/llava-v1.6-mistral-7b-hf") -> None:
10 |         self.model_id = model_id
11 |         self.device = "cuda" if torch.cuda.is_available() else "cpu"
12 |         self.model = LlavaNextForConditionalGeneration.from_pretrained(
13 |             self.model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True
14 |         )
15 |         self.processor = LlavaNextProcessor.from_pretrained(self.model_id)
16 |         self.model.to(self.device)
17 | 
18 |     def generate(
19 |         self,
20 |         images: list[Image.Image] | None,
21 |         text: str,
22 |         gen_kwargs: GenerationConfig = GenerationConfig(),
23 |     ) -> str:
24 |         if images is None:
25 |             images = []
26 |         content = [{"type": "image"} for _ in range(len(images))]
27 |         content.extend([{"type": "text", "text": text}])
28 |         messages = [
29 |             {
30 |                 "role": "user",
31 |                 "content": content,
32 |             }
33 |         ]
34 |         input_text = self.processor.apply_chat_template(
35 |             messages, add_generation_prompt=True
36 |         )
37 |         inputs = self.processor(
38 |             text=input_text,
39 |             images=images,
40 |             add_special_tokens=False,
41 |             return_tensors="pt",
42 |         ).to(self.device)
43 | 
44 |         # autoregressively complete prompt
45 |         output = self.model.generate(**inputs, **gen_kwargs.__dict__)[0]
46 | 
47 |         generated_text = self.processor.decode(output, skip_special_tokens=True)
48 |         # split [INST] and return the last part
49 |         generated_text = generated_text.split("[/INST]")[-1].strip()
50 |         return generated_text
51 | 
52 | 
53 | if __name__ == "__main__":
54 |     vlm = VLM()
55 |     vlm.test_vlm()
56 | 


--------------------------------------------------------------------------------
/examples/llava_calm2_siglip.py:
--------------------------------------------------------------------------------
 1 | from transformers import AutoProcessor, LlavaForConditionalGeneration
 2 | import torch
 3 | from base_vlm import BaseVLM
 4 | from utils import GenerationConfig
 5 | from PIL import Image
 6 | 
 7 | 
 8 | class VLM(BaseVLM):
 9 |     def __init__(self, model_id: str = "cyberagent/llava-calm2-siglip") -> None:
10 |         self.model_id = model_id
11 |         self.model = LlavaForConditionalGeneration.from_pretrained(
12 |             self.model_id,
13 |             torch_dtype=torch.bfloat16,
14 |         ).to("cuda")
15 |         self.processor = AutoProcessor.from_pretrained(self.model_id)
16 | 
17 |     def generate(
18 |         self,
19 |         images: list[Image.Image] | None,
20 |         text: str,
21 |         gen_kwargs: GenerationConfig = GenerationConfig(),
22 |     ) -> str:
23 |         if images is None:
24 |             images = []
25 |         prefix = None
26 |         if "<image>" in text:
27 |             prompt = "USER: " + text + "\nASSISTANT: "
28 |         else:
29 |             num_images = len(images)
30 |             prefix = "<image> " * num_images
31 |             prompt = "USER: " + prefix + text + "\nASSISTANT: "
32 | 
33 |         inputs = (
34 |             self.processor(
35 |                 text=prompt,
36 |                 images=images,
37 |                 add_special_tokens=False,
38 |                 return_tensors="pt",
39 |             )
40 |             .to(self.model.device)
41 |             .to(self.model.dtype)
42 |         )
43 |         output_ids = self.model.generate(
44 |             **inputs,
45 |             **gen_kwargs.__dict__,
46 |         )
47 |         generate_ids = [
48 |             output_ids[len(input_ids) :]
49 |             for input_ids, output_ids in zip(inputs.input_ids, output_ids)
50 |         ]
51 | 
52 |         output = self.processor.tokenizer.decode(
53 |             generate_ids[0][:-1], clean_up_tokenization_spaces=False
54 |         )
55 | 
56 |         return output
57 | 
58 | 
59 | if __name__ == "__main__":
60 |     vlm = VLM()
61 |     vlm.test_vlm()
62 | 


--------------------------------------------------------------------------------
/examples/llm_jp_3_vila.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from llava.constants import IMAGE_TOKEN_INDEX
 3 | from llava.conversation import conv_templates
 4 | from llava.mm_utils import (
 5 |     get_model_name_from_path,
 6 |     process_images,
 7 |     tokenizer_image_token,
 8 | )
 9 | from llava.model.builder import load_pretrained_model
10 | from base_vlm import BaseVLM
11 | from utils import GenerationConfig
12 | from PIL import Image
13 | 
14 | 
15 | class VLM(BaseVLM):
16 |     def __init__(self, model_id: str = "llm-jp/llm-jp-3-vila-14b") -> None:
17 |         self.model_id = model_id
18 |         model_name = get_model_name_from_path(self.model_id)
19 |         device = "cuda" if torch.cuda.is_available() else "cpu"
20 |         self.tokenizer, self.model, self.image_processor, _ = load_pretrained_model(
21 |             self.model_id, model_name, device=device
22 |         )
23 | 
24 |     def generate(
25 |         self,
26 |         images: list[Image.Image] | None,
27 |         text: str,
28 |         gen_kwargs: GenerationConfig = GenerationConfig(),
29 |     ) -> str:
30 |         if images is None:
31 |             images = []
32 |         qs = text
33 |         if "<image>" not in text:
34 |             qs = "<image>\n" * len(images) + text
35 |         conv_mode = "llmjp_v3"
36 |         conv = conv_templates[conv_mode].copy()
37 |         conv.append_message(conv.roles[0], qs)
38 |         conv.append_message(conv.roles[1], None)
39 |         prompt = conv.get_prompt()
40 | 
41 |         images_tensor = [
42 |             process_images(images, self.image_processor, self.model.config).to(
43 |                 self.model.device, dtype=torch.float16
44 |             )
45 |         ]
46 |         input_ids = (
47 |             tokenizer_image_token(
48 |                 prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
49 |             )
50 |             .unsqueeze(0)
51 |             .to(self.model.device)
52 |         )
53 | 
54 |         with torch.inference_mode():
55 |             output_ids = self.model.generate(
56 |                 input_ids,
57 |                 images=images_tensor,
58 |                 **gen_kwargs.__dict__,
59 |             )
60 | 
61 |         outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
62 |         return outputs
63 | 
64 | 
65 | if __name__ == "__main__":
66 |     vlm = VLM()
67 |     vlm.test_vlm()
68 | 


--------------------------------------------------------------------------------
/examples/model_table.py:
--------------------------------------------------------------------------------
 1 | import importlib
 2 | 
 3 | MODEL_ID_TO_CLASS_PATH = {
 4 |     "llava-hf/llava-1.5-7b-hf": "llava_1_5.VLM",
 5 |     "llava-hf/llava-1.5-13b-hf": "llava_1_5.VLM",
 6 |     "llava-hf/llava-v1.6-mistral-7b-hf": "llava_1_6_mistral_hf.VLM",
 7 |     "SakanaAI/EvoVLM-JP-v1-7B": "evovlm_jp_v1.VLM",
 8 |     "gpt-4o-2024-05-13": "gpt4o.VLM",
 9 |     "gpt-4o-2024-11-20": "gpt4o.VLM",
10 |     "internlm/internlm-xcomposer2d5-7b": "xcomposer2d5.VLM",
11 |     "OpenGVLab/InternVL2-8B": "internvl2.VLM",
12 |     "OpenGVLab/InternVL2-26B": "internvl2.VLM",
13 |     "meta-llama/Llama-3.2-11B-Vision-Instruct": "llama_3_2_vision.VLM",
14 |     "meta-llama/Llama-3.2-90B-Vision-Instruct": "llama_3_2_vision.VLM",
15 |     "Kendamarron/Llama-3.2-11B-Vision-Instruct-Swallow-8B-Merge": "llama_3_2_vision.VLM",
16 |     "AXCXEPT/Llama-3-EZO-VLM-1": "llama_3_evovlm_jp_v2.VLM",
17 |     "SakanaAI/Llama-3-EvoVLM-JP-v2": "llama_3_evovlm_jp_v2.VLM",
18 |     "neulab/Pangea-7B-hf": "pangea_hf.VLM",
19 |     "mistralai/Pixtral-12B-2409": "pixtral.VLM",
20 |     "Qwen/Qwen2-VL-2B-Instruct": "qwen2_vl.VLM",
21 |     "Qwen/Qwen2-VL-7B-Instruct": "qwen2_vl.VLM",
22 |     "Qwen/Qwen2-VL-72B-Instruct": "qwen2_vl.VLM",
23 |     "Qwen/Qwen2.5-VL-3B-Instruct": "qwen2_5_vl.VLM",
24 |     "Qwen/Qwen2.5-VL-7B-Instruct": "qwen2_5_vl.VLM",
25 |     "Qwen/Qwen2.5-VL-32B-Instruct": "qwen2_5_vl.VLM",
26 |     "Qwen/Qwen2.5-VL-72B-Instruct": "qwen2_5_vl.VLM",
27 |     "llm-jp/llm-jp-3-vila-14b": "llm_jp_3_vila.VLM",
28 |     "stabilityai/japanese-instructblip-alpha": "japanese_instructblip_alpha.VLM",
29 |     "stabilityai/japanese-stable-vlm": "japanese_stable_vlm.VLM",
30 |     "cyberagent/llava-calm2-siglip": "llava_calm2_siglip.VLM",
31 |     "Efficient-Large-Model/VILA1.5-13b": "vila.VLM",
32 |     "google/gemma-3-1b-it": "gemma3.VLM",
33 |     "google/gemma-3-4b-it": "gemma3.VLM",
34 |     "google/gemma-3-12b-it": "gemma3.VLM",
35 |     "google/gemma-3-27b-it": "gemma3.VLM",
36 |     "sbintuitions/sarashina2-vision-8b": "sarashina2_vision.VLM",
37 |     "sbintuitions/sarashina2-vision-14b": "sarashina2_vision.VLM",
38 |     "microsoft/Phi-4-multimodal-instruct": "phi4_multimodal.VLM",
39 |     "MIL-UT/Asagi-14B": "asagi.VLM",
40 |     "turing-motors/Heron-NVILA-Lite-1B": "heron_nvila.VLM",
41 |     "turing-motors/Heron-NVILA-Lite-2B": "heron_nvila.VLM",
42 |     "turing-motors/Heron-NVILA-Lite-15B": "heron_nvila.VLM",
43 |     "turing-motors/Heron-NVILA-Lite-33B": "heron_nvila.VLM",
44 | }
45 | 
46 | 
47 | def get_class_from_path(class_path: str):
48 |     """指定されたパスからクラスを動的にインポートして返す"""
49 |     module_name, class_name = class_path.rsplit(".", 1)
50 |     module = importlib.import_module(module_name)
51 |     return getattr(module, class_name)
52 | 
53 | 
54 | def get_class_from_model_id(model_id: str):
55 |     return get_class_from_path(MODEL_ID_TO_CLASS_PATH[model_id])
56 | 
57 | 
58 | if __name__ == "__main__":
59 |     for model_id, class_path in MODEL_ID_TO_CLASS_PATH.items():
60 |         try:
61 |             vlm_class = get_class_from_path(class_path)
62 |             vlm = vlm_class(model_id)
63 |             vlm.test_vlm()
64 |             print(f"Tested {model_id}")
65 |         except Exception as e:
66 |             print(f"Error testing {model_id}: {e}")
67 | 


--------------------------------------------------------------------------------
/examples/pangea_hf.py:
--------------------------------------------------------------------------------
 1 | # Assuming that you have text_input and image_path
 2 | from transformers import LlavaNextForConditionalGeneration, AutoProcessor
 3 | import torch
 4 | from PIL import Image
 5 | from base_vlm import BaseVLM
 6 | from utils import GenerationConfig
 7 | 
 8 | 
 9 | class VLM(BaseVLM):
10 |     def __init__(self, model_id: str = "neulab/Pangea-7B-hf") -> None:
11 |         self.model_id = model_id
12 |         self.device = "cuda" if torch.cuda.is_available() else "cpu"
13 |         self.model = LlavaNextForConditionalGeneration.from_pretrained(
14 |             self.model_id, torch_dtype=torch.float16
15 |         ).to(0)
16 |         self.processor = AutoProcessor.from_pretrained(self.model_id)
17 |         self.model.resize_token_embeddings(len(self.processor.tokenizer))
18 | 
19 |     def generate(
20 |         self,
21 |         images: list[Image.Image] | None,
22 |         text: str,
23 |         gen_kwargs: GenerationConfig = GenerationConfig(),
24 |     ) -> str:
25 |         if images is None:
26 |             images = []
27 | 
28 |         prompt_template = (
29 |             "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user"
30 |             + "\n<image>" * len(images)
31 |             + "\n{text}<|im_end|>\n<|im_start|>assistant\n"
32 |         )
33 |         input_text = prompt_template.format(text=text)
34 |         if images is None:
35 |             # TODO: text only need to reload model https://huggingface.co/neulab/Pangea-7B <-?
36 |             model_inputs = self.processor(text=input_text, return_tensors="pt").to(
37 |                 self.device, torch.float16
38 |             )
39 |         else:
40 |             model_inputs = self.processor(
41 |                 images=images, text=input_text, return_tensors="pt"
42 |             ).to(self.device, torch.float16)
43 | 
44 |         output = self.model.generate(
45 |             **model_inputs,
46 |             **gen_kwargs.__dict__,
47 |         )
48 |         output = output[0]
49 |         result = self.processor.decode(
50 |             output, skip_special_tokens=True, clean_up_tokenization_spaces=False
51 |         )
52 |         # extract the answer
53 |         result = result.split("assistant\n")[-1].strip()
54 |         return result
55 | 
56 | 
57 | if __name__ == "__main__":
58 |     vlm = VLM()
59 |     vlm.test_vlm()
60 | 


--------------------------------------------------------------------------------
/examples/phi4_multimodal.py:
--------------------------------------------------------------------------------
 1 | from PIL import Image
 2 | from transformers import AutoModelForCausalLM, AutoProcessor
 3 | import transformers
 4 | import torch
 5 | from base_vlm import BaseVLM
 6 | from utils import GenerationConfig
 7 | 
 8 | 
 9 | class VLM(BaseVLM):
10 |     def __init__(self, model_id: str = "microsoft/Phi-4-multimodal-instruct") -> None:
11 |         self.model_id = model_id
12 |         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
13 | 
14 |         self.model = AutoModelForCausalLM.from_pretrained(
15 |             self.model_id,
16 |             trust_remote_code=True,
17 |             torch_dtype="auto",
18 |             _attn_implementation="flash_attention_2",
19 |         ).to(self.device)
20 | 
21 |         self.processor = AutoProcessor.from_pretrained(
22 |             self.model_id, trust_remote_code=True
23 |         )
24 | 
25 |     def generate(
26 |         self,
27 |         images: list[Image.Image] | None,
28 |         text: str,
29 |         gen_kwargs: GenerationConfig = GenerationConfig(),
30 |     ) -> str:
31 |         if images is None:
32 |             images = []
33 |         generation_config = transformers.GenerationConfig.from_pretrained(
34 |             self.model_id, "generation_config.json"
35 |         )
36 | 
37 |         ########################### vision (multi-frame) ################################
38 |         placeholder = ""
39 |         for i in range(len(images)):
40 |             placeholder += f"<|image_{i}|>"
41 | 
42 |         messages = [
43 |             {"role": "user", "content": placeholder + text},
44 |         ]
45 | 
46 |         prompt = self.processor.tokenizer.apply_chat_template(
47 |             messages, tokenize=False, add_generation_prompt=True
48 |         )
49 | 
50 |         if images is None:
51 |             images = []
52 |         inputs = self.processor(prompt, images, return_tensors="pt").to(self.device)
53 | 
54 |         generate_ids = self.model.generate(
55 |             **inputs,
56 |             **gen_kwargs.__dict__,
57 |             generation_config=generation_config,
58 |         )
59 | 
60 |         # 入力部分を取り除いた生成結果をデコード
61 |         generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]
62 |         response = self.processor.batch_decode(
63 |             generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
64 |         )[0]
65 | 
66 |         return response
67 | 
68 | 
69 | if __name__ == "__main__":
70 |     vlm = VLM()
71 |     vlm.test_vlm()
72 | 


--------------------------------------------------------------------------------
/examples/pixtral.py:
--------------------------------------------------------------------------------
 1 | from PIL import Image
 2 | from vllm import LLM
 3 | from vllm.sampling_params import SamplingParams
 4 | import base64
 5 | from io import BytesIO
 6 | from base_vlm import BaseVLM
 7 | from utils import GenerationConfig
 8 | 
 9 | 
10 | def image_to_base64(img):
11 |     buffer = BytesIO()
12 |     # Check if the image has an alpha channel (RGBA)
13 |     if img.mode == "RGBA":
14 |         # Convert the image to RGB mode
15 |         img = img.convert("RGB")
16 |     img.save(buffer, format="JPEG")
17 |     buffer.seek(0)
18 |     img_str = base64.b64encode(buffer.getvalue()).decode("ascii")
19 |     return img_str
20 | 
21 | 
22 | def image_to_content(image: Image.Image) -> dict:
23 |     base64_image = image_to_base64(image)
24 |     content = {
25 |         "type": "image_url",
26 |         "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
27 |     }
28 |     return content
29 | 
30 | 
31 | class VLM(BaseVLM):
32 |     def __init__(self, model_id: str = "mistralai/Pixtral-12B-2409") -> None:
33 |         self.model_id = model_id
34 |         max_img_per_msg = 5
35 |         self.model = LLM(
36 |             model=self.model_id,
37 |             tokenizer_mode="mistral",
38 |             tensor_parallel_size=1,
39 |             limit_mm_per_prompt={"image": max_img_per_msg},
40 |             max_model_len=32768,
41 |         )
42 | 
43 |     def generate(
44 |         self,
45 |         images: list[Image.Image] | None,
46 |         text: str,
47 |         gen_kwargs: GenerationConfig = GenerationConfig(),
48 |     ) -> str:
49 |         if images is None:
50 |             images = []
51 |         content = [image_to_content(image) for image in images]
52 |         content.extend([{"type": "text", "text": text}])
53 |         messages = [
54 |             {
55 |                 "role": "user",
56 |                 "content": content,
57 |             }
58 |         ]
59 | 
60 |         sampling_params = SamplingParams(
61 |             max_tokens=gen_kwargs.max_new_tokens,
62 |             temperature=gen_kwargs.temperature,
63 |             top_p=gen_kwargs.top_p,
64 |         )
65 |         outputs = self.model.chat(
66 |             messages,
67 |             sampling_params=sampling_params,
68 |         )
69 |         return outputs[0].outputs[0].text
70 | 
71 | 
72 | if __name__ == "__main__":
73 |     vlm = VLM()
74 |     vlm.test_vlm()
75 | 


--------------------------------------------------------------------------------
/examples/qwen2_5_vl.py:
--------------------------------------------------------------------------------
 1 | from transformers import (
 2 |     Qwen2_5_VLForConditionalGeneration,
 3 |     AutoProcessor,
 4 | )
 5 | from qwen_vl_utils import process_vision_info
 6 | from base_vlm import BaseVLM
 7 | from utils import GenerationConfig
 8 | from PIL import Image
 9 | 
10 | 
11 | class VLM(BaseVLM):
12 |     def __init__(self, model_id: str = "Qwen/Qwen2.5-VL-3B-Instruct") -> None:
13 |         self.model_id = model_id
14 |         self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
15 |             self.model_id,
16 |             torch_dtype="bfloat16",
17 |             device_map="auto",
18 |             attn_implementation="flash_attention_2",
19 |         )
20 | 
21 |         min_pixels = 256 * 28 * 28
22 |         max_pixels = 1280 * 28 * 28
23 |         self.processor = AutoProcessor.from_pretrained(
24 |             self.model_id, min_pixels=min_pixels, max_pixels=max_pixels
25 |         )
26 | 
27 |     def generate(
28 |         self,
29 |         images: list[Image.Image] | None,
30 |         text: str,
31 |         gen_kwargs: GenerationConfig = GenerationConfig(),
32 |     ) -> str:
33 |         if images is None:
34 |             images = []
35 |         if "<image>" in text:
36 |             text = text.replace("<image>", "")
37 |         message = []
38 |         image_content = []
39 | 
40 |         for img in images:
41 |             image_content.append(
42 |                 {
43 |                     "type": "image",
44 |                     "image": img,
45 |                 }
46 |             )
47 |         message.append(
48 |             {
49 |                 "role": "user",
50 |                 "content": image_content + [{"type": "text", "text": text}],
51 |             }
52 |         )
53 | 
54 |         texts = self.processor.apply_chat_template(
55 |             message, tokenize=False, add_generation_prompt=True
56 |         )
57 |         image_inputs, video_inputs = process_vision_info(message)
58 |         inputs = self.processor(
59 |             text=[texts],
60 |             images=image_inputs,
61 |             videos=video_inputs,
62 |             padding=True,
63 |             return_tensors="pt",
64 |         )
65 | 
66 |         inputs = inputs.to(self.model.device)
67 |         output_ids = self.model.generate(**inputs, **gen_kwargs.__dict__)
68 |         generated_ids = [
69 |             output_ids[len(input_ids) :]
70 |             for input_ids, output_ids in zip(inputs.input_ids, output_ids)
71 |         ]
72 |         generated_text = self.processor.batch_decode(
73 |             generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
74 |         )[0]
75 |         return generated_text
76 | 
77 | 
78 | if __name__ == "__main__":
79 |     vlm = VLM()
80 |     vlm.test_vlm()
81 | 


--------------------------------------------------------------------------------
/examples/qwen2_vl.py:
--------------------------------------------------------------------------------
 1 | from transformers import (
 2 |     Qwen2VLForConditionalGeneration,
 3 |     AutoProcessor,
 4 | )
 5 | from qwen_vl_utils import process_vision_info
 6 | from base_vlm import BaseVLM
 7 | from utils import GenerationConfig
 8 | from PIL import Image
 9 | 
10 | 
11 | class VLM(BaseVLM):
12 |     def __init__(self, model_id: str = "Qwen/Qwen2-VL-2B-Instruct") -> None:
13 |         self.model_id = model_id
14 |         self.model = Qwen2VLForConditionalGeneration.from_pretrained(
15 |             self.model_id,
16 |             torch_dtype="bfloat16",
17 |             device_map="auto",
18 |             attn_implementation="flash_attention_2",
19 |         )
20 | 
21 |         min_pixels = 256 * 28 * 28
22 |         max_pixels = 1280 * 28 * 28
23 |         self.processor = AutoProcessor.from_pretrained(
24 |             self.model_id, min_pixels=min_pixels, max_pixels=max_pixels
25 |         )
26 | 
27 |     def generate(
28 |         self,
29 |         images: list[Image.Image] | None,
30 |         text: str,
31 |         gen_kwargs: GenerationConfig = GenerationConfig(),
32 |     ) -> str:
33 |         if images is None:
34 |             images = []
35 |         if "<image>" in text:
36 |             text = text.replace("<image>", "")
37 |         message = []
38 |         image_content = []
39 | 
40 |         for img in images:
41 |             image_content.append(
42 |                 {
43 |                     "type": "image",
44 |                     "image": img,
45 |                 }
46 |             )
47 |         message.append(
48 |             {
49 |                 "role": "user",
50 |                 "content": image_content + [{"type": "text", "text": text}],
51 |             }
52 |         )
53 | 
54 |         texts = self.processor.apply_chat_template(
55 |             message, tokenize=False, add_generation_prompt=True
56 |         )
57 |         image_inputs, video_inputs = process_vision_info(message)
58 |         inputs = self.processor(
59 |             text=[texts],
60 |             images=image_inputs,
61 |             videos=video_inputs,
62 |             padding=True,
63 |             return_tensors="pt",
64 |         )
65 | 
66 |         inputs = inputs.to(self.model.device)
67 |         output_ids = self.model.generate(**inputs, **gen_kwargs.__dict__)
68 |         generated_ids = [
69 |             output_ids[len(input_ids) :]
70 |             for input_ids, output_ids in zip(inputs.input_ids, output_ids)
71 |         ]
72 |         generated_text = self.processor.batch_decode(
73 |             generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
74 |         )[0]
75 |         return generated_text
76 | 
77 | 
78 | if __name__ == "__main__":
79 |     vlm = VLM()
80 |     vlm.test_vlm()
81 | 


--------------------------------------------------------------------------------
/examples/sample_vllm.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import json
  3 | import argparse
  4 | from dataclasses import asdict
  5 | from loguru import logger
  6 | 
  7 | import eval_mm
  8 | import eval_mm.metrics
  9 | from utils import GenerationConfig
 10 | from base_vllm import VLLM
 11 | 
 12 | 
 13 | def parse_args():
 14 |     parser = argparse.ArgumentParser()
 15 |     parser.add_argument("--model_id", default="Qwen/Qwen2.5-VL-3B-Instruct")
 16 |     parser.add_argument(
 17 |         "--task_id",
 18 |         default="japanese-heron-bench",
 19 |         help=f"Task ID to evaluate. Available: {eval_mm.TaskRegistry().get_task_list()}",
 20 |     )
 21 |     parser.add_argument("--judge_model", default="gpt-4o-2024-11-20")
 22 |     parser.add_argument("--batch_size_for_evaluation", type=int, default=10)
 23 |     parser.add_argument("--overwrite", action="store_true")
 24 |     parser.add_argument("--result_dir", default="result")
 25 |     parser.add_argument("--inference_only", action="store_true")
 26 |     parser.add_argument("--max_new_tokens", type=int, default=256)
 27 |     parser.add_argument("--num_beams", type=int, default=1)
 28 |     parser.add_argument("--temperature", type=float, default=0.0)
 29 |     parser.add_argument("--top_p", type=float, default=1.0)
 30 |     parser.add_argument("--do_sample", action="store_true", default=False)
 31 |     parser.add_argument("--use_cache", action="store_true", default=True)
 32 |     parser.add_argument("--max_dataset_len", type=int)
 33 |     parser.add_argument(
 34 |         "--metrics",
 35 |         type=str,
 36 |         nargs="+",
 37 |         default=["heron-bench"],
 38 |         help=f"Metrics to evaluate. Available: {eval_mm.ScorerRegistry().get_metric_list()}",
 39 |     )
 40 |     parser.add_argument(
 41 |         "--rotate_choices", action="store_true", help="This option is used in MECHA-ja"
 42 |     )
 43 |     parser.add_argument(
 44 |         "--random_choice",
 45 |         action="store_true",
 46 |         help="If set, randomly choose the answer from the candidates when parse error occurs in JMMMU and MMMU tasks",
 47 |     )
 48 |     return parser.parse_args()
 49 | 
 50 | 
 51 | def load_or_generate_predictions(args, task, gen_kwargs, output_dir):
 52 |     prediction_path = os.path.join(output_dir, "prediction.jsonl")
 53 |     if os.path.exists(prediction_path) and not args.overwrite:
 54 |         logger.info(f"Loading predictions from {prediction_path}")
 55 |         with open(prediction_path) as f:
 56 |             preds = [json.loads(line) for line in f]
 57 |         assert len(preds) == len(
 58 |             task.dataset
 59 |         ), "Prediction length mismatch with dataset"
 60 |         return preds, []
 61 | 
 62 |     logger.info("Generating predictions...")
 63 |     model = VLLM(args.model_id)
 64 |     preds = []
 65 | 
 66 |     qids = [task.doc_to_id(doc) for doc in task.dataset]
 67 |     images = [task.doc_to_visual(doc) for doc in task.dataset]
 68 |     texts = [task.doc_to_text(doc).replace("<image>", "") for doc in task.dataset]
 69 | 
 70 |     preds = model.batch_generate(images, texts, gen_kwargs)
 71 |     preds = [{"question_id": qid, "text": pred} for qid, pred in zip(qids, preds)]
 72 | 
 73 |     save_jsonl(prediction_path, preds)
 74 |     logger.info(f"Predictions saved to {prediction_path}")
 75 |     return preds, []
 76 | 
 77 | 
 78 | def save_jsonl(path, data):
 79 |     with open(path, "w") as f:
 80 |         for item in data:
 81 |             f.write(json.dumps(item, ensure_ascii=False) + "\n")
 82 | 
 83 | 
 84 | def evaluate(args, task, preds, metrics):
 85 |     logger.info("Starting evaluation...")
 86 |     scores_by_metric = {}
 87 |     aggregated_metrics = {}
 88 | 
 89 |     for metric in metrics:
 90 |         scorer = eval_mm.ScorerRegistry.load_scorer(
 91 |             metric,
 92 |             eval_mm.ScorerConfig(
 93 |                 docs=task.dataset,
 94 |                 judge_model=args.judge_model,
 95 |                 batch_size=args.batch_size_for_evaluation,
 96 |                 client=eval_mm.OpenAIChatAPI(),
 97 |                 random_choice=args.random_choice,
 98 |             ),
 99 |         )
100 |         scores = scorer.score(
101 |             [task.doc_to_answer(doc) for doc in task.dataset],
102 |             [pred["text"] for pred in preds],
103 |         )
104 |         scores_by_metric[metric] = scores
105 |         aggregate = scorer.aggregate(scores)
106 |         aggregated_metrics[metric] = asdict(aggregate)
107 | 
108 |         logger.info(f"Scores for {metric}: {scores}")
109 |         logger.info(f"Aggregate for {metric}: {aggregate}")
110 | 
111 |     return scores_by_metric, aggregated_metrics
112 | 
113 | 
114 | def save_final_results(preds, task, metrics, scores_by_metric, output_path):
115 |     final_results = []
116 |     for i, pred in enumerate(preds):
117 |         doc = task.dataset[i]
118 |         result = {
119 |             "question_id": pred["question_id"],
120 |             "text": pred["text"],
121 |             "answer": task.doc_to_answer(doc),
122 |             "input_text": task.doc_to_text(doc),
123 |         }
124 |         for metric in metrics:
125 |             result[metric] = scores_by_metric[metric][i]
126 |         final_results.append(result)
127 | 
128 |     save_jsonl(output_path, final_results)
129 |     logger.info(f"Final prediction with scores saved to {output_path}")
130 | 
131 | 
132 | def main():
133 |     args = parse_args()
134 | 
135 |     gen_kwargs = GenerationConfig(
136 |         max_new_tokens=args.max_new_tokens,
137 |         temperature=args.temperature,
138 |         top_p=args.top_p,
139 |         num_beams=args.num_beams,
140 |         do_sample=args.do_sample,
141 |         use_cache=args.use_cache,
142 |     )
143 | 
144 |     task_config = eval_mm.TaskConfig(
145 |         max_dataset_len=args.max_dataset_len,
146 |         rotate_choices=args.rotate_choices,
147 |     )
148 |     task = eval_mm.TaskRegistry.load_task(args.task_id, task_config)
149 | 
150 |     output_dir = os.path.join(args.result_dir, args.task_id, args.model_id + "_vllm")
151 |     os.makedirs(output_dir, exist_ok=True)
152 | 
153 |     preds, _ = load_or_generate_predictions(args, task, gen_kwargs, output_dir)
154 | 
155 |     if args.inference_only:
156 |         logger.info("Inference only mode. Skipping evaluation.")
157 |         return
158 | 
159 |     scores_by_metric, aggregated_metrics = evaluate(args, task, preds, args.metrics)
160 | 
161 |     prediction_path = os.path.join(output_dir, "prediction.jsonl")
162 |     save_final_results(preds, task, args.metrics, scores_by_metric, prediction_path)
163 | 
164 |     evaluation_path = os.path.join(output_dir, "evaluation.jsonl")
165 |     with open(evaluation_path, "w") as f:
166 |         f.write(json.dumps(aggregated_metrics, ensure_ascii=False) + "\n")
167 |     logger.info(f"Evaluation result saved to {evaluation_path}")
168 | 
169 | 
170 | if __name__ == "__main__":
171 |     main()
172 | 


--------------------------------------------------------------------------------
/examples/sarashina2_vision.py:
--------------------------------------------------------------------------------
 1 | from PIL import Image
 2 | from transformers import AutoModelForCausalLM, AutoProcessor
 3 | from base_vlm import BaseVLM
 4 | from utils import GenerationConfig
 5 | 
 6 | 
 7 | class VLM(BaseVLM):
 8 |     def __init__(self, model_id: str = "sbintuitions/sarashina2-vision-8b") -> None:
 9 |         self.model_id = model_id
10 |         self.model = AutoModelForCausalLM.from_pretrained(
11 |             self.model_id,
12 |             device_map="cuda",
13 |             torch_dtype="auto",
14 |             trust_remote_code=True,
15 |         )
16 |         self.processor = AutoProcessor.from_pretrained(
17 |             self.model_id, trust_remote_code=True
18 |         )
19 | 
20 |     def generate(
21 |         self,
22 |         images: list[Image.Image] | None,
23 |         text: str,
24 |         gen_kwargs: GenerationConfig = GenerationConfig(),
25 |     ) -> str:
26 |         if images is None:
27 |             images = []
28 |         message = [{"role": "user", "content": text}]
29 | 
30 |         text = self.processor.apply_chat_template(message, add_generation_prompt=True)
31 |         # insert <|prefix|><|file|><|suffix|> after <s>
32 |         text = text.replace(
33 |             "<|prefix|><|file|><|suffix|>", "<|prefix|><|file|><|suffix|>" * len(images)
34 |         )
35 | 
36 |         # Use text-only processing if no images are provided
37 |         if images is None:
38 |             images = []
39 |         inputs = self.processor(
40 |             text=[text],
41 |             images=images,
42 |             padding=True,
43 |             return_tensors="pt",
44 |         ).to(self.model.device)
45 | 
46 |         stopping_criteria = self.processor.get_stopping_criteria(["\n###"])
47 | 
48 |         # Inference: Generation of the output
49 |         output_ids = self.model.generate(
50 |             **inputs,
51 |             **gen_kwargs.__dict__,
52 |             stopping_criteria=stopping_criteria,
53 |         )
54 |         generated_ids = [
55 |             output_ids[len(input_ids) :]
56 |             for input_ids, output_ids in zip(inputs.input_ids, output_ids)
57 |         ]
58 |         output_text = self.processor.batch_decode(
59 |             generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
60 |         )
61 |         return output_text[0]
62 | 
63 | 
64 | if __name__ == "__main__":
65 |     vlm = VLM()
66 |     vlm.test_vlm()
67 | 


--------------------------------------------------------------------------------
/examples/test_model.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | from model_table import get_class_from_model_id
 3 | 
 4 | parser = argparse.ArgumentParser()
 5 | parser.add_argument("--model_id", type=str, default="llava-hf/llava-1.5-7b-hf")
 6 | 
 7 | args = parser.parse_args()
 8 | 
 9 | model = get_class_from_model_id(args.model_id)(args.model_id)
10 | model.test_vlm()
11 | 


--------------------------------------------------------------------------------
/examples/utils.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | 
 3 | 
 4 | @dataclass
 5 | class GenerationConfig:
 6 |     max_new_tokens: int = 1024
 7 |     temperature: float = 0.0
 8 |     top_p: float = 1.0
 9 |     num_beams: int = 1
10 |     do_sample: bool = False
11 |     use_cache: bool = True
12 | 


--------------------------------------------------------------------------------
/examples/vila.py:
--------------------------------------------------------------------------------
 1 | # This file is modified from https://github.com/haotian-liu/LLaVA/
 2 | 
 3 | # rye add protobuf
 4 | # uv pip install flash-attn --no-build-isolation --python .venv
 5 | 
 6 | from base_vlm import BaseVLM
 7 | from utils import GenerationConfig
 8 | 
 9 | import torch
10 | 
11 | from llava_vila.conversation import SeparatorStyle, conv_templates
12 | from llava_vila.mm_utils import (
13 |     get_model_name_from_path,
14 |     process_images,
15 |     tokenizer_image_token,
16 | )
17 | from llava_vila.model.builder import load_pretrained_model
18 | from PIL import Image
19 | 
20 | 
21 | class VLM(BaseVLM):
22 |     def __init__(self, model_id: str = "Efficient-Large-Model/VILA1.5-13b"):
23 |         self.model_id = model_id
24 |         model_name = get_model_name_from_path(self.model_id)
25 |         self.model_name = model_name
26 |         self.tokenizer, self.model, self.image_processor, _ = load_pretrained_model(
27 |             self.model_id, model_name
28 |         )
29 |         # from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor
30 |         # self.model = AutoModelForCausalLM.from_pretrained("Efficient-Large-Model/VILA-13b")
31 |         # self.tokenizer = self.model.config.tokenizer
32 |         # self.image_processor = AutoProcessor.from_pretrained("Efficient-Large-Model/VILA-13b")
33 | 
34 |     def generate(
35 |         self,
36 |         images: list[Image.Image] | None,
37 |         text: str,
38 |         gen_kwargs: GenerationConfig = GenerationConfig(),
39 |     ) -> str:
40 |         qs = text
41 |         if images is None:
42 |             images = []
43 |         if "<image>" not in text:
44 |             qs = "<image>\n" * len(images) + text
45 | 
46 |         if "llama-2" in self.model_name.lower():
47 |             conv_mode = "llava_llama_2"
48 |         elif "v1" in self.model_name.lower():
49 |             conv_mode = "llava_v1"
50 |         elif "mpt" in self.model_name.lower():
51 |             conv_mode = "mpt"
52 |         else:
53 |             conv_mode = "llava_v0"
54 | 
55 |         conv = conv_templates[conv_mode].copy()
56 |         conv.append_message(conv.roles[0], qs)
57 |         conv.append_message(conv.roles[1], None)
58 |         prompt = conv.get_prompt()
59 |         if images is None:
60 |             images_tensor = None
61 |         else:
62 |             images_tensor = [
63 |                 process_images(images, self.image_processor, self.model.config).to(
64 |                     self.model.device, dtype=torch.float16
65 |                 )
66 |             ]
67 |         input_ids = (
68 |             tokenizer_image_token(prompt, self.tokenizer, -200, return_tensors="pt")
69 |             .unsqueeze(0)
70 |             .cuda()
71 |         )
72 | 
73 |         stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
74 |         # keywords = [stop_str] # if needed, add keywords
75 | 
76 |         with torch.inference_mode():
77 |             output_ids = self.model.generate(
78 |                 input_ids,
79 |                 images=images_tensor,
80 |                 **gen_kwargs.__dict__,
81 |             )
82 |         outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
83 |         outputs = outputs.strip()
84 |         if outputs.endswith(stop_str):
85 |             outputs = outputs[: -len(stop_str)]
86 |         outputs = outputs.strip()
87 |         return outputs
88 | 
89 | 
90 | if __name__ == "__main__":
91 |     vlm = VLM("Efficient-Large-Model/VILA1.5-13b")
92 |     vlm.test_vlm()
93 | 


--------------------------------------------------------------------------------
/examples/vllm_registry.py:
--------------------------------------------------------------------------------
  1 | from typing import Optional
  2 | from dataclasses import dataclass
  3 | from PIL import Image
  4 | from vllm.lora.request import LoRARequest
  5 | from transformers import AutoProcessor
  6 | 
  7 | 
  8 | @dataclass
  9 | class ModelRequestData:
 10 |     prompt: str
 11 |     image_data: Optional[list[Image.Image]]
 12 |     stop_token_ids: Optional[list[int]] = None
 13 |     chat_template: Optional[str] = None
 14 |     lora_requests: Optional[list[LoRARequest]] = None
 15 | 
 16 | 
 17 | class VLLMModelRegistry:
 18 |     def __init__(self, model_name: str):
 19 |         self.model_name = model_name
 20 |         self.processor = AutoProcessor.from_pretrained(
 21 |             model_name, trust_remote_code=True
 22 |         )
 23 |         self.loader_map = {
 24 |             "Qwen/Qwen2.5-VL-3B-Instruct": self.load_qwen2_5_vl,
 25 |             "Qwen/Qwen2.5-VL-7B-Instruct": self.load_qwen2_5_vl,
 26 |             "Qwen/Qwen2.5-VL-32B-Instruct": self.load_qwen2_5_vl,
 27 |             "Qwen/Qwen2.5-VL-72B-Instruct": self.load_qwen2_5_vl,
 28 |             "google/gemma-3-4b-it": self.load_gemma3,
 29 |             "google/gemma-3-12b-it": self.load_gemma3,
 30 |             "google/gemma-3-27b-it": self.load_gemma3,
 31 |         }
 32 | 
 33 |     def get_engine_config(self, model_id: str) -> dict:
 34 |         return {
 35 |             "max_model_len": 32768,
 36 |             "max_num_seqs": 5,
 37 |             "limit_mm_per_prompt": {"image": 5},
 38 |             "trust_remote_code": True,
 39 |         }
 40 | 
 41 |     def load_qwen2_5_vl(
 42 |         self, text: str, images: list[Image.Image] | None
 43 |     ) -> ModelRequestData:
 44 |         try:
 45 |             from qwen_vl_utils import process_vision_info
 46 |         except ModuleNotFoundError:
 47 |             print(
 48 |                 "WARNING: `qwen-vl-utils` not installed, input images will not "
 49 |                 "be automatically resized. You can enable this functionality by "
 50 |                 "`pip install qwen-vl-utils`."
 51 |             )
 52 |             process_vision_info = None
 53 | 
 54 |         if images is None:
 55 |             images = []
 56 | 
 57 |         placeholders = [{"type": "image", "image": image} for image in images]
 58 |         messages = [
 59 |             {
 60 |                 "role": "user",
 61 |                 "content": [
 62 |                     *placeholders,
 63 |                     {"type": "text", "text": text},
 64 |                 ],
 65 |             }
 66 |         ]
 67 | 
 68 |         prompt = self.processor.apply_chat_template(
 69 |             messages, tokenize=False, add_generation_prompt=True
 70 |         )
 71 | 
 72 |         if process_vision_info is None:
 73 |             image_data = images
 74 |         else:
 75 |             image_data, _ = process_vision_info(messages, return_video_kwargs=False)
 76 | 
 77 |         return ModelRequestData(
 78 |             prompt=prompt,
 79 |             image_data=image_data,
 80 |         )
 81 | 
 82 |     def load_gemma3(
 83 |         self, text: str, images: list[Image.Image] | None
 84 |     ) -> ModelRequestData:
 85 |         if images is None:
 86 |             images = []
 87 | 
 88 |         placeholders = [{"type": "image", "image": image} for image in images]
 89 |         messages = [
 90 |             {
 91 |                 "role": "user",
 92 |                 "content": [
 93 |                     *placeholders,
 94 |                     {"type": "text", "text": text},
 95 |                 ],
 96 |             }
 97 |         ]
 98 | 
 99 |         prompt = self.processor.apply_chat_template(
100 |             messages, tokenize=False, add_generation_prompt=True
101 |         )
102 | 
103 |         return ModelRequestData(
104 |             prompt=prompt,
105 |             image_data=images,
106 |         )
107 | 


--------------------------------------------------------------------------------
/examples/xcomposer2d5.py:
--------------------------------------------------------------------------------
 1 | # flash-attn is required to run this example.
 2 | #
 3 | import torch
 4 | from transformers import AutoModel, AutoTokenizer
 5 | import os
 6 | from utils import GenerationConfig
 7 | from base_vlm import BaseVLM
 8 | from PIL import Image
 9 | 
10 | torch.set_grad_enabled(False)
11 | 
12 | 
13 | class VLM(BaseVLM):
14 |     def __init__(self, model_id: str = "internlm/internlm-xcomposer2d5-7b") -> None:
15 |         self.model_id = model_id
16 |         self.model = (
17 |             AutoModel.from_pretrained(
18 |                 self.model_id,
19 |                 torch_dtype=torch.bfloat16,
20 |                 trust_remote_code=True,
21 |             )
22 |             .cuda()
23 |             .eval()
24 |             .half()
25 |         )
26 |         self.tokenizer = AutoTokenizer.from_pretrained(
27 |             self.model_id, trust_remote_code=True
28 |         )
29 |         self.model.tokenizer = self.tokenizer
30 | 
31 |     def generate(
32 |         self,
33 |         images: list[Image.Image] | None,
34 |         text: str,
35 |         gen_kwargs: GenerationConfig = GenerationConfig(),
36 |     ) -> str:
37 |         if images is None:
38 |             images = []
39 |         if "<image>" not in text:
40 |             image_tokens = "".join(
41 |                 [f"Image{i} <ImageHere>; " for i in range(1, len(images) + 1)]
42 |             )
43 |             text = f"{image_tokens}{text}"
44 |         # make tmp files
45 |         os.makedirs("tmp", exist_ok=True)
46 |         image_files = []
47 |         for i, img in enumerate(images):
48 |             file_path = f"tmp/image_{i}.jpg"
49 |             img.save(file_path)
50 |             image_files.append(file_path)
51 | 
52 |         with torch.autocast(device_type="cuda", dtype=torch.float16):
53 |             response, _ = self.model.chat(
54 |                 self.tokenizer,
55 |                 text,
56 |                 image_files,
57 |                 generation_config=gen_kwargs.__dict__,
58 |             )
59 | 
60 |         # remove tmp files
61 |         for file_path in image_files:
62 |             os.remove(file_path)
63 |         return response
64 | 
65 | 
66 | if __name__ == "__main__":
67 |     model = VLM()
68 |     model.test_vlm()
69 | 


--------------------------------------------------------------------------------
/github_pages/.gitignore:
--------------------------------------------------------------------------------
  1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
  2 | 
  3 | # dependencies
  4 | /node_modules
  5 | /.pnp
  6 | .pnp.js
  7 | 
  8 | # testing
  9 | /coverage
 10 | 
 11 | # production
 12 | /build
 13 | 
 14 | # misc
 15 | .DS_Store
 16 | .env.local
 17 | .env.development.local
 18 | .env.test.local
 19 | .env.production.local
 20 | 
 21 | npm-debug.log*
 22 | yarn-debug.log*
 23 | yarn-error.log*
 24 | 
 25 | # Byte-compiled / optimized / DLL files
 26 | __pycache__/
 27 | *.py[cod]
 28 | *$py.class
 29 | 
 30 | # C extensions
 31 | *.so
 32 | 
 33 | # Distribution / packaging
 34 | .Python
 35 | build/
 36 | develop-eggs/
 37 | dist/
 38 | downloads/
 39 | eggs/
 40 | .eggs/
 41 | lib/
 42 | lib64/
 43 | parts/
 44 | sdist/
 45 | var/
 46 | wheels/
 47 | share/python-wheels/
 48 | *.egg-info/
 49 | .installed.cfg
 50 | *.egg
 51 | MANIFEST
 52 | 
 53 | # PyInstaller
 54 | #  Usually these files are written by a python script from a template
 55 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 56 | *.manifest
 57 | *.spec
 58 | 
 59 | # Installer logs
 60 | pip-log.txt
 61 | pip-delete-this-directory.txt
 62 | 
 63 | # Unit test / coverage reports
 64 | htmlcov/
 65 | .tox/
 66 | .nox/
 67 | .coverage
 68 | .coverage.*
 69 | .cache
 70 | nosetests.xml
 71 | coverage.xml
 72 | *.cover
 73 | *.py,cover
 74 | .hypothesis/
 75 | .pytest_cache/
 76 | cover/
 77 | 
 78 | # Translations
 79 | *.mo
 80 | *.pot
 81 | 
 82 | # Django stuff:
 83 | *.log
 84 | local_settings.py
 85 | db.sqlite3
 86 | db.sqlite3-journal
 87 | 
 88 | # Flask stuff:
 89 | instance/
 90 | .webassets-cache
 91 | 
 92 | # Scrapy stuff:
 93 | .scrapy
 94 | 
 95 | # Sphinx documentation
 96 | docs/_build/
 97 | 
 98 | # PyBuilder
 99 | .pybuilder/
100 | target/
101 | 
102 | # Jupyter Notebook
103 | .ipynb_checkpoints
104 | 
105 | # IPython
106 | profile_default/
107 | ipython_config.py
108 | 
109 | # pyenv
110 | #   For a library or package, you might want to ignore these files since the code is
111 | #   intended to run in multiple environments; otherwise, check them in:
112 | # .python-version
113 | 
114 | # pipenv
115 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
116 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
117 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
118 | #   install all needed dependencies.
119 | #Pipfile.lock
120 | 
121 | # UV
122 | #   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
123 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
124 | #   commonly ignored for libraries.
125 | #uv.lock
126 | 
127 | # poetry
128 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
129 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
130 | #   commonly ignored for libraries.
131 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
132 | #poetry.lock
133 | 
134 | # pdm
135 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
136 | #pdm.lock
137 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
138 | #   in version control.
139 | #   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
140 | .pdm.toml
141 | .pdm-python
142 | .pdm-build/
143 | 
144 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
145 | __pypackages__/
146 | 
147 | # Celery stuff
148 | celerybeat-schedule
149 | celerybeat.pid
150 | 
151 | # SageMath parsed files
152 | *.sage.py
153 | 
154 | # Environments
155 | .env
156 | .venv
157 | env/
158 | venv/
159 | ENV/
160 | env.bak/
161 | venv.bak/
162 | 
163 | # Spyder project settings
164 | .spyderproject
165 | .spyproject
166 | 
167 | # Rope project settings
168 | .ropeproject
169 | 
170 | # mkdocs documentation
171 | /site
172 | 
173 | # mypy
174 | .mypy_cache/
175 | .dmypy.json
176 | dmypy.json
177 | 
178 | # Pyre type checker
179 | .pyre/
180 | 
181 | # pytype static type analyzer
182 | .pytype/
183 | 
184 | # Cython debug symbols
185 | cython_debug/
186 | 
187 | # PyCharm
188 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
189 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
190 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
191 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
192 | #.idea/
193 | 
194 | # PyPI configuration file
195 | .pypirc
196 | 


--------------------------------------------------------------------------------
/github_pages/.prettierrc:
--------------------------------------------------------------------------------
1 | {
2 |   "semi": true,
3 |   "singleQuote": true,
4 |   "jsxSingleQuote": true,
5 |   "trailingComma": "all",
6 |   "tabWidth": 2,
7 | }
8 | 


--------------------------------------------------------------------------------
/github_pages/README.md:
--------------------------------------------------------------------------------
 1 | # llm-jp-eval-mm.github.io
 2 | 
 3 | This repository is a source code for the llm-jp-eval-mm leaderboard website.
 4 | [llm-jp-eval-mm](https://github.com/llm-jp/llm-jp-eval-mm) is used to evaluate the VLMs on the Japanese benchmark.
 5 | 
 6 | ## How to develop
 7 | ```bash
 8 | cd github_pages
 9 | sudo apt install -y nodejs npm
10 | sudo npm install n -g
11 | npm install
12 | npm run start
13 | ```
14 | 
15 | You may need to remove "homepage" from `github_pages/package.json` to start in the local environment.
16 | ```diff
17 | {
18 |   "name": "github_pages",
19 |   "version": "0.1.0",
20 | --  "homepage": "https://llm-jp.github.io/llm-jp-eval-mm",
21 | }
22 | ```
23 | 
24 | ## How to deploy
25 | ```bash
26 | cd github_pages
27 | npm run deploy
28 | ```
29 | 
30 | ## Add benchmark results to the leaderboard
31 | Please add the benchmark results to the `github_pages/public/leaderboard.json` file.
32 | The format of the benchmark results is as follows.
33 | ```json
34 |   {
35 |     "model": "Japanese InstructBLIP Alpha",
36 |     "url": "https://huggingface.co/stabilityai/japanese-instructblip-alpha",
37 |     "scores": {
38 |       "Heron": {
39 |         "conv": 22.8,
40 |         "detail": 24.1,
41 |         "complex": 19.5,
42 |         "overall": 22.7
43 |       },
44 |       "JVB-ItW": { "llm": 1.31, "rouge": 13.8 },
45 |       "MulIm-VQA": { "llm": 2.5, "rouge": 25.0 },
46 |       "JDocQA": { "Acc": 0.123, "llm": 1.9 },
47 |       "JMMMU": { "Acc": 0.271 }
48 |     }
49 |   },
50 | ```
51 | 
52 | ## Format the code
53 | ```bash
54 | npx prettier --write "./**/*.{js,jsx,ts,tsx,css,html}"
55 | ```
56 | 
57 | 
58 | ## Reference
59 | This repository refers to the following repositories. Thank you for your great work.
60 | - https://github.com/MMMU-Japanese-Benchmark/JMMMU
61 | - https://github.com/MMMU-Benchmark/mmmu-benchmark.github.io
62 | 


--------------------------------------------------------------------------------
/github_pages/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "github_pages",
 3 |   "version": "0.1.0",
 4 |   "homepage": "https://llm-jp.github.io/llm-jp-eval-mm",
 5 |   "private": true,
 6 |   "dependencies": {
 7 |     "cra-template": "1.2.0",
 8 |     "format": "^0.2.2",
 9 |     "react": "^19.0.0",
10 |     "react-dom": "^19.0.0",
11 |     "react-icons": "^5.4.0",
12 |     "react-scripts": "5.0.1"
13 |   },
14 |   "scripts": {
15 |     "predeploy": "npm run build",
16 |     "deploy": "gh-pages -d build",
17 |     "start": "react-scripts start",
18 |     "build": "react-scripts build",
19 |     "test": "react-scripts test",
20 |     "eject": "react-scripts eject"
21 |   },
22 |   "eslintConfig": {
23 |     "extends": [
24 |       "react-app",
25 |       "react-app/jest"
26 |     ]
27 |   },
28 |   "browserslist": {
29 |     "production": [
30 |       ">0.2%",
31 |       "not dead",
32 |       "not op_mini all"
33 |     ],
34 |     "development": [
35 |       "last 1 chrome version",
36 |       "last 1 firefox version",
37 |       "last 1 safari version"
38 |     ]
39 |   },
40 |   "devDependencies": {
41 |     "gh-pages": "^6.3.0",
42 |     "prettier": "^3.4.2",
43 |     "web-vitals": "^4.2.4"
44 |   }
45 | }
46 | 


--------------------------------------------------------------------------------
/github_pages/public/dataset_url.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "Heron": {
 3 |         "url": "https://huggingface.co/datasets/turing-motors/Japanese-Heron-Bench"
 4 |     },
 5 |     "JVB-ItW": {
 6 |         "url": "https://huggingface.co/datasets/SakanaAI/JA-VLM-Bench-In-the-Wild"
 7 |     },
 8 |     "VG-VQA": {
 9 |         "url": "https://huggingface.co/datasets/SakanaAI/JA-VG-VQA-500"
10 |     },
11 |     "MulIm-VQA": {
12 |         "url": "https://huggingface.co/datasets/SakanaAI/JA-Multi-Image-VQA"
13 |     },
14 |     "JDocQA": {
15 |         "url": "https://github.com/mizuumi/JDocQA"
16 |     },
17 |     "JMMMU": {
18 |         "url": "https://huggingface.co/datasets/JMMMU/JMMMU"
19 |     },
20 |     "MMMU": {
21 |         "url": "https://huggingface.co/datasets/MMMU/MMMU"
22 |     },
23 |     "LLavaB": {
24 |         "url": "https://huggingface.co/datasets/lmms-lab/llava-bench-in-the-wild"
25 |     },
26 |     "JIC": {
27 |         "url": "https://huggingface.co/datasets/line-corporation/JIC-VQA"
28 |     },
29 |     "MECHA": {
30 |         "url": "https://huggingface.co/datasets/llm-jp/MECHA-ja"
31 |     },
32 |     "CC-OCR": {
33 |         "url": "https://huggingface.co/datasets/wulipc/CC-OCR"
34 |     },
35 |     "CVQA": {
36 |         "url": "https://huggingface.co/datasets/afaji/cvqa"
37 |     }
38 | }
39 | 


--------------------------------------------------------------------------------
/github_pages/public/default_metrics.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "default_metrics": {
 3 |         "Heron": "overall",
 4 |         "JVB-ItW": "rouge",
 5 |         "VGVQA": "rouge",
 6 |         "MulIm-VQA": "rouge",
 7 |         "JDocQA": "Acc",
 8 |         "JMMMU": "Acc",
 9 |         "MMMU": "Acc",
10 |         "LlavaB-ItW": "rouge"
11 |     }
12 | }
13 | 


--------------------------------------------------------------------------------
/github_pages/public/index.html:
--------------------------------------------------------------------------------
 1 | <!doctype html>
 2 | <html lang="en">
 3 |   <head>
 4 |     <meta charset="UTF-8" />
 5 |     <meta http-equiv="X-UA-Compatible" content="IE=edge" />
 6 |     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
 7 |     <link
 8 |       rel="icon"
 9 |       href="data:image/svg+xml,<svg xmlns=%22http://www.w3.org/2000/svg%22 viewBox=%220 0 100 100%22><text y=%22.9em%22 font-size=%2290%22>📊</text></svg>"
10 |     />
11 |     <link rel="preconnect" href="https://fonts.googleapis.com" />
12 |     <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
13 |     <style>
14 |       @import url('https://fonts.googleapis.com/css2?family=Noto+Sans:ital,wght@0,100..900;1,100..900&display=swap');
15 |       @import url('https://fonts.googleapis.com/css2?family=Noto+Sans+JP:ital,wght@0,100..900;1,100..900&display=swap');
16 |       @import url('https://fonts.googleapis.com/css?family=Google+Sans:ital,wght@0,100..900;1,100..900&display=swap');
17 |     </style>
18 |     <title>llm-jp-eval-mm</title>
19 |   </head>
20 |   <body>
21 |     <main id="root"></main>
22 |     <script src="./src/index.js"></script>
23 |   </body>
24 | </html>
25 | 


--------------------------------------------------------------------------------
/github_pages/src/Author.css:
--------------------------------------------------------------------------------
 1 | .author {
 2 |   position: relative;
 3 |   display: inline-block;
 4 | }
 5 | 
 6 | .author-name {
 7 |   color: inherit;
 8 |   font-size: 1.5rem;
 9 |   font-weight: 500;
10 |   line-height: 1.2;
11 | }
12 | 
13 | .author-website {
14 |   color: inherit;
15 |   text-decoration: none;
16 |   position: relative;
17 | }
18 | 
19 | .author-annotation {
20 |   position: relative;
21 |   bottom: 0.5rem;
22 |   font-size: 1rem;
23 |   font-weight: 500;
24 | }
25 | 
26 | .author-separator {
27 |   position: relative;
28 |   font-size: 1.5rem;
29 |   font-weight: 500;
30 | }
31 | 


--------------------------------------------------------------------------------
/github_pages/src/Author.js:
--------------------------------------------------------------------------------
 1 | import './Author.css';
 2 | 
 3 | const AFFILIATION_COLORS = [
 4 |   '',
 5 |   '#6fbf73',
 6 |   '#ed4b82',
 7 |   '#9400d3',
 8 |   '#4169E1',
 9 |   '#ffac33',
10 |   '#1e90ff',
11 |   '#ff69b4',
12 | ];
13 | export { AFFILIATION_COLORS };
14 | 
15 | const Author = ({
16 |   name,
17 |   affiliation,
18 |   annotation1,
19 |   annotation2,
20 |   url,
21 |   isLast,
22 | }) => {
23 |   return (
24 |     <div className='author'>
25 |       {url ? (
26 |         <a
27 |           className='author-website'
28 |           href={url}
29 |           target='_blank'
30 |           rel='noreferrer noopener'
31 |         >
32 |           <span className='author-name'>{name}</span>
33 |         </a>
34 |       ) : (
35 |         <span className='author-name'>{name}</span>
36 |       )}
37 |       <span className='author-annotation'>
38 |         {annotation1}
39 |         {affiliation.map((num, index) => (
40 |           <span key={num}>
41 |             <span style={{ color: AFFILIATION_COLORS[num] }}>{num}</span>
42 |             {index < affiliation.length - 1 && ', '}
43 |           </span>
44 |         ))}
45 |         {annotation2}
46 |       </span>
47 |       {(isLast === undefined || !isLast) && (
48 |         <span className='author-separator'>{','}&nbsp;</span>
49 |       )}
50 |     </div>
51 |   );
52 | };
53 | export default Author;
54 | 


--------------------------------------------------------------------------------
/github_pages/src/BibTex.css:
--------------------------------------------------------------------------------
 1 | .bibtex-title {
 2 |   font-size: 2rem;
 3 |   font-weight: 600;
 4 |   color: #363636;
 5 |   margin-block: 2.5rem 1.5rem;
 6 | }
 7 | 
 8 | .bibtex-entry {
 9 |   width: calc(100% - 1.5rem);
10 |   position: relative;
11 |   background-color: #eae5e3;
12 |   color: #0f2350;
13 |   font-size: 0.75rem;
14 |   padding: 0.75rem;
15 |   border-radius: 0.75rem;
16 |   margin: 0;
17 |   text-align: left;
18 |   display: inline-block;
19 |   white-space: pre-wrap;
20 |   word-wrap: break-word;
21 | }
22 | 
23 | .bibtex-copy-button {
24 |   position: absolute;
25 |   top: 0;
26 |   right: 0;
27 |   background-color: rgba(255, 255, 255, 0);
28 |   color: #595857;
29 | }
30 | @media (hover: hover) {
31 |   .bibtex-copy-button:hover {
32 |     color: #ba2636;
33 |   }
34 | }
35 | 


--------------------------------------------------------------------------------
/github_pages/src/BibTex.js:
--------------------------------------------------------------------------------
 1 | import { LuCopy } from 'react-icons/lu';
 2 | import './BibTex.css';
 3 | 
 4 | // BibTeX entry for the paper
 5 | const BIBTEX_ENTRY = `@inproceedings{maeda2025llm-jp-eval-mm,
 6 | author = {前田 航希 and 杉浦 一瑳 and 小田 悠介 and 栗田 修平 and 岡崎 直観},
 7 | month = mar,
 8 | series = {言語処理学会第31回年次大会 (NLP2025)},
 9 | title = {{llm-jp-eval-mm: 日本語視覚言語モデルの自動評価基盤}},
10 | year = {2025}
11 | }
12 | `;
13 | 
14 | const copyClipboard = () => {
15 |   navigator.clipboard.writeText(BIBTEX_ENTRY).catch((error) => {
16 |     console.error('Failed to copy BibTeX entry to clipboard', error);
17 |   });
18 | };
19 | 
20 | const BibTeX = () => {
21 |   return (
22 |     <div className='bibtex'>
23 |       <h1 className='bibtex-title'>BibTeX</h1>
24 |       <pre className='bibtex-entry'>
25 |         <code>{BIBTEX_ENTRY}</code>
26 |         <button className='bibtex-copy-button' onClick={copyClipboard}>
27 |           <LuCopy />
28 |         </button>
29 |       </pre>
30 |     </div>
31 |   );
32 | };
33 | 
34 | export default BibTeX;
35 | 


--------------------------------------------------------------------------------
/github_pages/src/Figure.css:
--------------------------------------------------------------------------------
 1 | .figure {
 2 |   padding: 1.5rem 0rem;
 3 |   margin: 0;
 4 |   display: flex;
 5 |   flex-direction: column;
 6 |   align-items: center;
 7 |   justify-content: center;
 8 | }
 9 | 
10 | .figure-image {
11 |   width: 100%;
12 |   max-width: 960px;
13 |   height: auto;
14 |   object-fit: contain;
15 | }
16 | 
17 | .figure-caption {
18 |   padding: 0.5rem;
19 | }
20 | 


--------------------------------------------------------------------------------
/github_pages/src/Figure.js:
--------------------------------------------------------------------------------
 1 | import React from 'react';
 2 | import './Figure.css';
 3 | 
 4 | const Figure = ({ image, altText, caption }) => {
 5 |   return (
 6 |     <figure className='figure'>
 7 |       <img src={image} alt={altText} className='figure-image' />
 8 |       <figcaption className='figure-caption'>{caption}</figcaption>
 9 |     </figure>
10 |   );
11 | };
12 | export default Figure;
13 | 


--------------------------------------------------------------------------------
/github_pages/src/Footer.css:
--------------------------------------------------------------------------------
1 | .footer {
2 |   width: 100%;
3 |   margin: 0.5rem 0rem 0rem 0rem;
4 |   padding: 0.5rem 0rem 0.5rem 0rem;
5 |   font-size: 0.75rem;
6 | }
7 | 


--------------------------------------------------------------------------------
/github_pages/src/Footer.js:
--------------------------------------------------------------------------------
 1 | import './Footer.css';
 2 | 
 3 | const Footer = () => {
 4 |   return (
 5 |     <footer className='footer'>
 6 |       <div className='footer-content'>
 7 |         <p>
 8 |           The source code for this website is licensed under the{' '}
 9 |           <a
10 |             href='https://creativecommons.org/licenses/by-sa/4.0/'
11 |             target='_blank'
12 |             rel='noopener noreferrer'
13 |           >
14 |             Creative Commons Attribution-ShareAlike 4.0 International License
15 |           </a>
16 |           .
17 |         </p>
18 |         <p>
19 |           This website is inspired by and references{' '}
20 |           <a
21 |             href='https://nerfies.github.io/'
22 |             target='_blank'
23 |             rel='noopener noreferrer'
24 |           >
25 |             Nerfies
26 |           </a>{' '}
27 |           and{' '}
28 |           <a
29 |             href='https://mmmu-benchmark.github.io/'
30 |             target='_blank'
31 |             rel='noopener noreferrer'
32 |           >
33 |             MMMU
34 |           </a>
35 |           and{' '}
36 |           <a
37 |             href='https://mmmu-japanese-benchmark.github.io/JMMMU/'
38 |             target='_blank'
39 |             rel='noopener noreferrer'
40 |           >
41 |             JMMMU
42 |           </a>
43 |           .
44 |         </p>
45 |         <p>
46 |           Please also check out the{' '}
47 |           <a
48 |             href='https://github.com/speed1313/llm-jp-eval-mm.github.io'
49 |             target='_blank'
50 |             rel='noopener noreferrer'
51 |           >
52 |             GitHub repo
53 |           </a>
54 |           .
55 |         </p>
56 |       </div>
57 |     </footer>
58 |   );
59 | };
60 | 
61 | export default Footer;
62 | 


--------------------------------------------------------------------------------
/github_pages/src/Introduction.css:
--------------------------------------------------------------------------------
 1 | .introduction-title {
 2 |   font-size: 2rem;
 3 |   font-weight: 600;
 4 |   color: #363636;
 5 |   margin-block: 2.5rem 1.5rem;
 6 | }
 7 | 
 8 | .introduction-content {
 9 |   text-align: justify;
10 | }
11 | 
12 | a {
13 |   color: inherit;
14 |   text-decoration: underline;
15 | }
16 | 


--------------------------------------------------------------------------------
/github_pages/src/Introduction.js:
--------------------------------------------------------------------------------
 1 | import Figure from './Figure';
 2 | import overviewFigure from './assets/teaser.png';
 3 | import './Introduction.css';
 4 | 
 5 | const Introduction = () => {
 6 |   return (
 7 |     <div className='introduction'>
 8 |       <h1 className='introduction-title'>Introduction</h1>
 9 |       <div className='introduction-content'>
10 |         <span>
11 |           We introduce <b>llm-jp-eval-mm</b>, a toolkit for evaluating multiple
12 |           multimodal tasks related to Japanese language performance in a unified
13 |           environment. The toolkit is a benchmarking platform that integrates
14 |           six existing Japanese multimodal tasks and consistently evaluates
15 |           model outputs across multiple metrics. This paper outlines the design
16 |           of llm-jp-eval-mm for its construction and ongoing development,
17 |           reports the results of evaluating 13 publicly available Japanese and
18 |           multilingual VLMs, and discusses the findings in the light of existing
19 |           research.
20 |         </span>
21 |       </div>
22 |       <Figure
23 |         image={overviewFigure} // eslint-disable-line
24 |         altText='Overview of llm-jp-eval-mm'
25 |         caption={
26 |           <>
27 |             Figure 1: <b>Overview of llm-jp-eval-mm.</b>
28 |           </>
29 |         }
30 |       />
31 |     </div>
32 |   );
33 | };
34 | 
35 | export default Introduction;
36 | 


--------------------------------------------------------------------------------
/github_pages/src/Leaderboard.css:
--------------------------------------------------------------------------------
 1 | .Leaderboard {
 2 |   display: flex;
 3 |   flex-direction: column;
 4 |   align-items: center;
 5 |   justify-content: center;
 6 |   width: 100%;
 7 | }
 8 | 
 9 | .leaderboard-title {
10 |   font-size: 2rem;
11 |   font-weight: 600;
12 |   color: #363636;
13 |   margin-block: 2.5rem 1.5rem;
14 | }
15 | 
16 | .table-container {
17 |   width: calc(100vw - 2rem);
18 |   overflow-x: auto;
19 |   /* 横スクロールを有効化 */
20 |   border: 1px solid #ccc;
21 |   margin-top: 10px;
22 | }
23 | 
24 | table {
25 |   border-collapse: collapse;
26 |   width: 100%;
27 | 
28 |   /* スクロールを促すための最小幅 */
29 | }
30 | 
31 | thead {
32 |   position: sticky;
33 |   top: 0;
34 |   background-color: #f1f1f1;
35 |   z-index: 1;
36 | }
37 | 
38 | table th,
39 | table td {
40 |   border: 1px solid #ddd;
41 |   padding: 8px;
42 |   text-align: center;
43 |   font-size: 14px;
44 |   white-space: nowrap;
45 |   /* 折り返しを無効化 */
46 |   transition: background-color 0.2s ease;
47 | }
48 | 
49 | table th {
50 |   background-color: #f4f4f4;
51 |   cursor: pointer;
52 |   font-size: 14px;
53 | }
54 | 
55 | table tr:hover {
56 |   background-color: #f9f9f9;
57 | }
58 | 
59 | /* スマホ向けのメディアクエリ */
60 | @media (max-width: 768px) {
61 |   .Leaderboard {
62 |     padding: 10px;
63 |   }
64 | 
65 |   .leaderboard-title {
66 |     font-size: 1.5rem;
67 |     margin-block: 1.5rem 1rem;
68 |   }
69 | 
70 |   table th,
71 |   table td {
72 |     font-size: 12px;
73 |     /* スマホ向けに文字サイズを縮小 */
74 |     padding: 6px;
75 |     /* セル内の余白を調整 */
76 |   }
77 | }
78 | 


--------------------------------------------------------------------------------
/github_pages/src/Leaderboard.js:
--------------------------------------------------------------------------------
  1 | import React, { useEffect, useState } from 'react';
  2 | import './Leaderboard.css';
  3 | 
  4 | function Leaderboard() {
  5 |   const [data, setData] = useState([]);
  6 |   const [datasets, setDatasets] = useState([]);
  7 |   const [datasetUrl, setDatasetUrl] = useState({});
  8 | 
  9 |   const [metrics, setMetrics] = useState({});
 10 |   const [defaultMetrics, setDefaultMetrics] = useState({});
 11 |   const [sortConfig, setSortConfig] = useState(null);
 12 | 
 13 |   useEffect(() => {
 14 |     // Fetch leaderboard data
 15 |     fetch('leaderboard.json')
 16 |       .then((response) => response.json())
 17 |       .then((data) => {
 18 |         setData(data);
 19 | 
 20 |         // Extract datasets and metrics dynamically
 21 |         const datasetNames = data
 22 |           .reduce((acc, row) => {
 23 |             return acc.concat(Object.keys(row.scores));
 24 |           }, [])
 25 |           .filter((value, index, self) => self.indexOf(value) === index);
 26 |         setDatasets(datasetNames);
 27 | 
 28 |         const metricNames = {};
 29 |         datasetNames.forEach((dataset) => {
 30 |           const metricNamesArray = data
 31 |             .reduce((acc, row) => {
 32 |               return acc.concat(Object.keys(row.scores[dataset] || {}));
 33 |             }, [])
 34 |             .filter((value, index, self) => self.indexOf(value) === index);
 35 |           metricNames[dataset] = metricNamesArray;
 36 |         });
 37 |         setMetrics(metricNames);
 38 |       })
 39 |       .catch((error) =>
 40 |         console.error('Error loading leaderboard data:', error),
 41 |       );
 42 | 
 43 |     // Fetch default metrics
 44 |     fetch('default_metrics.json')
 45 |       .then((response) => response.json())
 46 |       .then((defaultMetrics) => {
 47 |         setDefaultMetrics(defaultMetrics.default_metrics); // Use the `default_metrics` field
 48 |       })
 49 |       .catch((error) => console.error('Error loading default metrics:', error));
 50 | 
 51 |     // Fetch dataset url
 52 |     // {
 53 |     // {
 54 |     //     "Heron": {
 55 |     //         "url": "https://huggingface.co/datasets/turing-motors/Japanese-Heron-Bench"
 56 |     //     },
 57 |     //     "JVB-ItW": {
 58 |     //         "url": "https://huggingface.co/datasets/SakanaAI/JA-VLM-Bench-In-the-Wild"
 59 |     //     },
 60 |     //     "VGVQA": {
 61 |     //         "url": "https://huggingface.co/datasets/SakanaAI/JA-VG-VQA-500"
 62 |     //     },
 63 |     //     "MulIm-VQA": {
 64 |     //         "url": "https://huggingface.co/datasets/SakanaAI/JA-Multi-Image-VQA"
 65 |     //     },
 66 |     //     "JDocQA": {
 67 |     //         "url": "https://huggingface.co/datasets/shunk031/JDocQA"
 68 |     //     },
 69 |     //     "JMMMU": {
 70 |     //         "url": "https://huggingface.co/datasets/JMMMU/JMMMU"
 71 |     //     }
 72 |     // }
 73 |     fetch('dataset_url.json')
 74 |       .then((response) => response.json())
 75 |       .then((datasetUrl) => {
 76 |         setDatasetUrl(datasetUrl);
 77 |       });
 78 |   }, []);
 79 | 
 80 |   const handleSort = (dataset, metric) => {
 81 |     let sortedData = [...data];
 82 |     const direction =
 83 |       sortConfig?.key === `${dataset}-${metric}` &&
 84 |       sortConfig.direction === 'asc'
 85 |         ? 'desc'
 86 |         : 'asc';
 87 |     sortedData.sort((a, b) => {
 88 |       const aValue = a.scores[dataset]?.[metric] || 0;
 89 |       const bValue = b.scores[dataset]?.[metric] || 0;
 90 |       if (aValue < bValue) return direction === 'asc' ? -1 : 1;
 91 |       if (aValue > bValue) return direction === 'asc' ? 1 : -1;
 92 |       return 0;
 93 |     });
 94 |     setSortConfig({ key: `${dataset}-${metric}`, direction });
 95 |     setData(sortedData);
 96 |   };
 97 | 
 98 |   const getSortArrow = (dataset, metric) => {
 99 |     if (sortConfig?.key === `${dataset}-${metric}`) {
100 |       return sortConfig.direction === 'asc' ? '↑' : '↓';
101 |     }
102 |     return '↕';
103 |   };
104 | 
105 |   return (
106 |     <div className='Leaderboard'>
107 |       <h1 className='leaderboard-title'>Leaderboard</h1>
108 | 
109 |       <div className='table-container'>
110 |         <table>
111 |           <thead>
112 |             <tr>
113 |               <th>Model</th>
114 |               {datasets.map((dataset) => (
115 |                 <th key={dataset} colSpan={metrics[dataset]?.length || 0}>
116 |                   <a href={datasetUrl[dataset]?.url}>{dataset}</a>
117 |                 </th>
118 |               ))}
119 |             </tr>
120 |             <tr>
121 |               <th></th>
122 |               {datasets.map((dataset) =>
123 |                 metrics[dataset]?.map((metric) => (
124 |                   <th
125 |                     key={`${dataset}-${metric}`}
126 |                     onClick={() => handleSort(dataset, metric)}
127 |                   >
128 |                     {metric} {getSortArrow(dataset, metric)}
129 |                   </th>
130 |                 )),
131 |               )}
132 |             </tr>
133 |           </thead>
134 |           <tbody>
135 |             {data.map((item, index) => (
136 |               <tr key={index}>
137 |                 <td>
138 |                   <a href={item.url}>{item.model}</a>
139 |                 </td>
140 |                 {datasets.map((dataset) =>
141 |                   metrics[dataset]?.map((metric) => (
142 |                     <td
143 |                       key={`${dataset}-${metric}`}
144 |                       className={
145 |                         defaultMetrics[dataset] === metric
146 |                           ? 'highlight-column'
147 |                           : ''
148 |                       }
149 |                     >
150 |                       {item.scores[dataset]?.[metric]?.toFixed(1) || '-'}
151 |                     </td>
152 |                   )),
153 |                 )}
154 |               </tr>
155 |             ))}
156 |           </tbody>
157 |         </table>
158 |       </div>
159 |     </div>
160 |   );
161 | }
162 | export default Leaderboard;
163 | 


--------------------------------------------------------------------------------
/github_pages/src/LinkButton.css:
--------------------------------------------------------------------------------
 1 | .link-button {
 2 |   height: 2.5rem;
 3 |   /* width: 8rem; */
 4 |   border: none;
 5 |   border-radius: 1.25rem;
 6 |   padding: 0rem 1.25rem;
 7 |   margin: 0.25rem;
 8 |   display: inline-flex;
 9 |   align-items: center;
10 |   justify-content: center;
11 |   background-color: #2f2f2f;
12 |   color: #ffffff;
13 |   font-size: 1rem;
14 |   font-weight: 400;
15 |   cursor: pointer;
16 | }
17 | 
18 | @media (hover: hover) {
19 |   .link-button:hover {
20 |     background-color: #595857;
21 |   }
22 | }
23 | 


--------------------------------------------------------------------------------
/github_pages/src/LinkButton.js:
--------------------------------------------------------------------------------
 1 | import './LinkButton.css';
 2 | 
 3 | const LinkButton = ({ url, children }) => {
 4 |   const handleClick = () => window.open(url, '_blank', 'noopener noreferrer');
 5 | 
 6 |   return (
 7 |     <button className='link-button' onClick={handleClick}>
 8 |       {children}
 9 |     </button>
10 |   );
11 | };
12 | export default LinkButton;
13 | 


--------------------------------------------------------------------------------
/github_pages/src/Main.css:
--------------------------------------------------------------------------------
1 | #root {
2 |   margin: 0 auto;
3 |   padding: 0;
4 |   text-align: center;
5 | }
6 | 


--------------------------------------------------------------------------------
/github_pages/src/Main.js:
--------------------------------------------------------------------------------
 1 | import BibTex from './BibTex';
 2 | // import Example from "./Example";
 3 | import Introduction from './Introduction';
 4 | import Leaderboard from './Leaderboard';
 5 | import PaperMetaData from './PaperMetaData';
 6 | 
 7 | import PageLayout from './PageLayout';
 8 | import './Main.css';
 9 | 
10 | const Main = () => {
11 |   return (
12 |     <PageLayout>
13 |       <PaperMetaData />
14 |       <Introduction />
15 |       {/* <Method /> */}
16 |       {/* <Example /> */}
17 |       <Leaderboard />
18 |       {/* <Result /> */}
19 |       <BibTex />
20 |     </PageLayout>
21 |   );
22 | };
23 | 
24 | export default Main;
25 | 


--------------------------------------------------------------------------------
/github_pages/src/Method.css:
--------------------------------------------------------------------------------
 1 | .method-title {
 2 |   font-size: 2rem;
 3 |   font-weight: 600;
 4 |   color: #363636;
 5 |   margin-block: 2.5rem 1.5rem;
 6 | }
 7 | 
 8 | .method-content {
 9 |   text-align: justify;
10 | }
11 | 


--------------------------------------------------------------------------------
/github_pages/src/Method.js:
--------------------------------------------------------------------------------
 1 | import Figure from './Figure';
 2 | import './Method.css';
 3 | 
 4 | const Method = () => {
 5 |   return (
 6 |     <div className='method'>
 7 |       <h1 className='method-title'>Design of llm-jp-eval-mm</h1>
 8 |       <div className='method-content'>TODO:</div>
 9 |     </div>
10 |   );
11 | };
12 | 
13 | export default Method;
14 | 


--------------------------------------------------------------------------------
/github_pages/src/PageLayout.css:
--------------------------------------------------------------------------------
1 | .main-content {
2 |   margin: 0 auto;
3 |   max-width: 1120px;
4 |   padding: 1rem;
5 | }
6 | 


--------------------------------------------------------------------------------
/github_pages/src/PageLayout.js:
--------------------------------------------------------------------------------
 1 | import React from 'react';
 2 | import Footer from './Footer';
 3 | 
 4 | import './PageLayout.css';
 5 | 
 6 | const PageLayout = ({ children }) => {
 7 |   return (
 8 |     <>
 9 |       <div className='main-content'>{children}</div>
10 |       <Footer />
11 |     </>
12 |   );
13 | };
14 | 
15 | export default PageLayout;
16 | 


--------------------------------------------------------------------------------
/github_pages/src/PaperMetaData.css:
--------------------------------------------------------------------------------
 1 | .paper-title {
 2 |   font-family: 'Google Sans', sans-serif;
 3 |   font-size: 3rem;
 4 |   font-weight: 600;
 5 |   color: #363636;
 6 |   margin: 1rem 0rem 0rem 0rem;
 7 | }
 8 | 
 9 | .paper-subtitle {
10 |   font-family: 'Google Sans', sans-serif;
11 |   font-size: 2rem;
12 |   font-weight: 500;
13 |   margin: 0rem 0rem 1rem 0rem;
14 | }
15 | 
16 | .authors {
17 |   font-family: 'Google Sans', sans-serif;
18 |   font-weight: 300;
19 |   display: flex;
20 |   align-items: center;
21 |   justify-content: center;
22 |   gap: 0;
23 |   flex-wrap: wrap;
24 |   padding: 1rem;
25 | }
26 | 
27 | .authors a:any-link {
28 |   color: #209cee;
29 |   text-decoration: none;
30 | }
31 | 
32 | .authors-affiliations {
33 |   font-family: 'Google Sans', sans-serif;
34 |   font-weight: 300;
35 |   display: flex;
36 |   align-items: center;
37 |   justify-content: center;
38 |   gap: 0;
39 |   flex-wrap: wrap;
40 | 
41 |   span {
42 |     font-size: 1.25rem;
43 |   }
44 | }
45 | 
46 | .annotation-description {
47 |   font-family: 'Google Sans', sans-serif;
48 | 
49 |   p {
50 |     font-size: 1.25rem;
51 |     margin: 0.5rem 0rem;
52 |   }
53 | }
54 | 
55 | .link-buttons {
56 |   display: flex;
57 |   align-items: center;
58 |   justify-content: center;
59 |   gap: 0;
60 |   flex-wrap: wrap;
61 |   padding: 0.5rem;
62 | }
63 | 


--------------------------------------------------------------------------------
/github_pages/src/PaperMetaData.js:
--------------------------------------------------------------------------------
  1 | import { FaGithub, FaRegFilePdf} from 'react-icons/fa';
  2 | 
  3 | // import { AFFILIATION_COLORS, Author, AuthorProps, LinkButton, LinkButtonProps } from "./molecules";
  4 | import { AFFILIATION_COLORS } from './Author';
  5 | import Author from './Author';
  6 | import LinkButton from './LinkButton';
  7 | 
  8 | import './PaperMetaData.css';
  9 | 
 10 | // Paper title
 11 | const TITLE = <>llm-jp-eval-mm</>;
 12 | const SUBTITLE = (
 13 |   <>Automatic Evaluation Platform for Japanese Visual Language Models</>
 14 | );
 15 | 
 16 | // Authors of the paper
 17 | const AUTHORS = [
 18 |   {
 19 |     name: 'Koki Maeda',
 20 |     affiliation: [1, 4],
 21 |     annotation1: '†',
 22 |     url: 'https://github.com/llm-jp/llm-jp-eval-mm',
 23 |   },
 24 |   {
 25 |     name: 'Issa Sugiura',
 26 |     affiliation: [2, 4],
 27 |     annotation1: '†',
 28 |     url: 'https://github.com/llm-jp/llm-jp-eval-mm',
 29 |   },
 30 |   {
 31 |     name: 'Yusuke Oda',
 32 |     affiliation: [4],
 33 |     url: 'https://github.com/llm-jp/llm-jp-eval-mm',
 34 |   },
 35 |   {
 36 |     name: 'Shuhei Kurita',
 37 |     affiliation: [3, 4],
 38 |     url: 'https://github.com/llm-jp/llm-jp-eval-mm',
 39 |   },
 40 |   {
 41 |     name: 'Naoaki Okazaki',
 42 |     affiliation: [1, 4],
 43 |     url: 'https://github.com/llm-jp/llm-jp-eval-mm',
 44 |     isLast: true,
 45 |   },
 46 | ];
 47 | 
 48 | // Description of annotations
 49 | const AFFILIATIONS = [
 50 |   'dummy for index 0',
 51 |   'Institute of Science Tokyo',
 52 |   'Kyoto University',
 53 |   'NII',
 54 |   'NII LLMC',
 55 | ];
 56 | const ANNOTATION_DESCRIPTION = ['†: Equal Contribution'];
 57 | 
 58 | const LINK_BUTTONS = [
 59 |   {
 60 |     url: 'https://github.com/llm-jp/llm-jp-eval-mm',
 61 |     children: (
 62 |       <>
 63 |         <FaRegFilePdf />
 64 |         &nbsp;Paper (arXiv)
 65 |       </>
 66 |     ),
 67 |   },
 68 |   {
 69 |     url: 'https://github.com/llm-jp/llm-jp-eval-mm',
 70 |     children: (
 71 |       <>
 72 |         <FaGithub />
 73 |         &nbsp;Code
 74 |       </>
 75 |     ),
 76 |   },
 77 | ];
 78 | 
 79 | const PaperMetadata = () => {
 80 |   return (
 81 |     <div className='paper-metadata'>
 82 |       <h1 className='paper-title'>{TITLE}</h1>
 83 |       <h2 className='paper-subtitle'>{SUBTITLE}</h2>
 84 |       <div className='authors'>
 85 |         {AUTHORS.map((author, index) => (
 86 |           <Author key={`author${index}`} {...author} />
 87 |         ))}
 88 |       </div>
 89 |       <div className='authors-affiliations'>
 90 |         {AFFILIATIONS.map((affiliation, index) => {
 91 |           if (index === 0) return null;
 92 |           return (
 93 |             <span key={`affiliation${index}`}>
 94 |               <span
 95 |                 style={{ color: AFFILIATION_COLORS[index] }}
 96 |                 key={`affiliation${index}head`}
 97 |               >
 98 |                 {index}
 99 |               </span>
100 |               : {affiliation}
101 |               {index !== AFFILIATIONS.length - 1 && <>,&nbsp;</>}
102 |             </span>
103 |           );
104 |         })}
105 |       </div>
106 |       <div className='annotation-description'>
107 |         {ANNOTATION_DESCRIPTION.map((description, index) => (
108 |           <p key={`description${index}`}>{description}</p>
109 |         ))}
110 |       </div>
111 |       <div className='link-buttons'>
112 |         {LINK_BUTTONS.map((linkButton, index) => (
113 |           <LinkButton key={`linkButton${index}`} url={linkButton.url}>
114 |             {linkButton.children}
115 |           </LinkButton>
116 |         ))}
117 |       </div>
118 |     </div>
119 |   );
120 | };
121 | 
122 | export default PaperMetadata;
123 | 


--------------------------------------------------------------------------------
/github_pages/src/Result.css:
--------------------------------------------------------------------------------
 1 | .result-title {
 2 |   font-size: 2rem;
 3 |   font-weight: 600;
 4 |   color: #363636;
 5 |   margin-block: 2.5rem 1.5rem;
 6 | }
 7 | 
 8 | .result-content {
 9 |   text-align: justify;
10 | }
11 | 


--------------------------------------------------------------------------------
/github_pages/src/Result.js:
--------------------------------------------------------------------------------
 1 | import './Result.css';
 2 | 
 3 | const Result = () => {
 4 |   return (
 5 |     <div className='result'>
 6 |       <h1 className='result-title'>Findings</h1>
 7 |       <div className='result-content'>
 8 |         <span>In this section, we summarize our key observations.</span>
 9 |         {/* Culture-specific Split */}
10 |         <div>
11 |           <h2>Model Scaling</h2>
12 |           As the number of parameters increases, the performance of models
13 |           improves across ... TODO:
14 |         </div>
15 |         {/* Scores on Japanese Heritage */}
16 |         <div>
17 |           <h2>Variation in llm-as-a-judge scores.</h2>
18 |           TODO:
19 |           <h3>BAD behaviour of default metrics for each benchmark.</h3>
20 |           TODO:
21 |         </div>
22 |       </div>
23 |     </div>
24 |   );
25 | };
26 | 
27 | export default Result;
28 | 


--------------------------------------------------------------------------------
/github_pages/src/assets/teaser.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/llm-jp/llm-jp-eval-mm/f0998c316138ae6541b67a3bea03e9cbb0cf4a34/github_pages/src/assets/teaser.png


--------------------------------------------------------------------------------
/github_pages/src/index.css:
--------------------------------------------------------------------------------
 1 | :root {
 2 |   text-rendering: justify;
 3 |   font-family: 'Noto Sans', 'Noto Sans JP', sans-serif;
 4 |   font-weight: 400;
 5 | 
 6 |   color-scheme: normal;
 7 |   color: #4a4a4a;
 8 |   background-color: #ffffff;
 9 | 
10 |   font-synthesis: none;
11 |   text-rendering: optimizeLegibility;
12 |   -webkit-font-smoothing: antialiased;
13 |   -moz-osx-font-smoothing: grayscale;
14 |   -webkit-text-size-adjust: 100%;
15 | }
16 | 
17 | body {
18 |   margin: 0;
19 |   display: flex;
20 |   place-items: top center;
21 |   min-width: 300px;
22 |   min-height: 100vh;
23 | }
24 | 
25 | @media (hover: hover) {
26 |   a:any-link:hover {
27 |     color: #ba2636;
28 |   }
29 | }
30 | 
31 | h2 {
32 |   font-size: 1.25rem;
33 |   font-weight: 600;
34 |   margin: 0.75rem 0 0.25rem 0;
35 | }
36 | 
37 | button {
38 |   border-radius: 0.5rem;
39 |   border: 1px solid transparent;
40 |   padding: 0.5rem;
41 |   font-size: 1rem;
42 |   font-weight: 400;
43 |   font-family: inherit;
44 |   cursor: pointer;
45 | }
46 | 
47 | select {
48 |   border-radius: 0.5rem;
49 |   border: 1px solid transparent;
50 |   padding: 0.5rem;
51 |   font-size: 1rem;
52 |   font-weight: 400;
53 |   font-family: inherit;
54 |   cursor: pointer;
55 | }
56 | 


--------------------------------------------------------------------------------
/github_pages/src/index.js:
--------------------------------------------------------------------------------
1 | import { createRoot } from 'react-dom/client';
2 | import Main from './Main';
3 | import './index.css';
4 | 
5 | const container = document.querySelector('#root');
6 | const root = createRoot(container);
7 | 
8 | root.render(<Main />);
9 | 


--------------------------------------------------------------------------------
/github_pages/src/logo.svg:
--------------------------------------------------------------------------------
1 | <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 841.9 595.3"><g fill="#61DAFB"><path d="M666.3 296.5c0-32.5-40.7-63.3-103.1-82.4 14.4-63.6 8-114.2-20.2-130.4-6.5-3.8-14.1-5.6-22.4-5.6v22.3c4.6 0 8.3.9 11.4 2.6 13.6 7.8 19.5 37.5 14.9 75.7-1.1 9.4-2.9 19.3-5.1 29.4-19.6-4.8-41-8.5-63.5-10.9-13.5-18.5-27.5-35.3-41.6-50 32.6-30.3 63.2-46.9 84-46.9V78c-27.5 0-63.5 19.6-99.9 53.6-36.4-33.8-72.4-53.2-99.9-53.2v22.3c20.7 0 51.4 16.5 84 46.6-14 14.7-28 31.4-41.3 49.9-22.6 2.4-44 6.1-63.6 11-2.3-10-4-19.7-5.2-29-4.7-38.2 1.1-67.9 14.6-75.8 3-1.8 6.9-2.6 11.5-2.6V78.5c-8.4 0-16 1.8-22.6 5.6-28.1 16.2-34.4 66.7-19.9 130.1-62.2 19.2-102.7 49.9-102.7 82.3 0 32.5 40.7 63.3 103.1 82.4-14.4 63.6-8 114.2 20.2 130.4 6.5 3.8 14.1 5.6 22.5 5.6 27.5 0 63.5-19.6 99.9-53.6 36.4 33.8 72.4 53.2 99.9 53.2 8.4 0 16-1.8 22.6-5.6 28.1-16.2 34.4-66.7 19.9-130.1 62-19.1 102.5-49.9 102.5-82.3zm-130.2-66.7c-3.7 12.9-8.3 26.2-13.5 39.5-4.1-8-8.4-16-13.1-24-4.6-8-9.5-15.8-14.4-23.4 14.2 2.1 27.9 4.7 41 7.9zm-45.8 106.5c-7.8 13.5-15.8 26.3-24.1 38.2-14.9 1.3-30 2-45.2 2-15.1 0-30.2-.7-45-1.9-8.3-11.9-16.4-24.6-24.2-38-7.6-13.1-14.5-26.4-20.8-39.8 6.2-13.4 13.2-26.8 20.7-39.9 7.8-13.5 15.8-26.3 24.1-38.2 14.9-1.3 30-2 45.2-2 15.1 0 30.2.7 45 1.9 8.3 11.9 16.4 24.6 24.2 38 7.6 13.1 14.5 26.4 20.8 39.8-6.3 13.4-13.2 26.8-20.7 39.9zm32.3-13c5.4 13.4 10 26.8 13.8 39.8-13.1 3.2-26.9 5.9-41.2 8 4.9-7.7 9.8-15.6 14.4-23.7 4.6-8 8.9-16.1 13-24.1zM421.2 430c-9.3-9.6-18.6-20.3-27.8-32 9 .4 18.2.7 27.5.7 9.4 0 18.7-.2 27.8-.7-9 11.7-18.3 22.4-27.5 32zm-74.4-58.9c-14.2-2.1-27.9-4.7-41-7.9 3.7-12.9 8.3-26.2 13.5-39.5 4.1 8 8.4 16 13.1 24 4.7 8 9.5 15.8 14.4 23.4zM420.7 163c9.3 9.6 18.6 20.3 27.8 32-9-.4-18.2-.7-27.5-.7-9.4 0-18.7.2-27.8.7 9-11.7 18.3-22.4 27.5-32zm-74 58.9c-4.9 7.7-9.8 15.6-14.4 23.7-4.6 8-8.9 16-13 24-5.4-13.4-10-26.8-13.8-39.8 13.1-3.1 26.9-5.8 41.2-7.9zm-90.5 125.2c-35.4-15.1-58.3-34.9-58.3-50.6 0-15.7 22.9-35.6 58.3-50.6 8.6-3.7 18-7 27.7-10.1 5.7 19.6 13.2 40 22.5 60.9-9.2 20.8-16.6 41.1-22.2 60.6-9.9-3.1-19.3-6.5-28-10.2zM310 490c-13.6-7.8-19.5-37.5-14.9-75.7 1.1-9.4 2.9-19.3 5.1-29.4 19.6 4.8 41 8.5 63.5 10.9 13.5 18.5 27.5 35.3 41.6 50-32.6 30.3-63.2 46.9-84 46.9-4.5-.1-8.3-1-11.3-2.7zm237.2-76.2c4.7 38.2-1.1 67.9-14.6 75.8-3 1.8-6.9 2.6-11.5 2.6-20.7 0-51.4-16.5-84-46.6 14-14.7 28-31.4 41.3-49.9 22.6-2.4 44-6.1 63.6-11 2.3 10.1 4.1 19.8 5.2 29.1zm38.5-66.7c-8.6 3.7-18 7-27.7 10.1-5.7-19.6-13.2-40-22.5-60.9 9.2-20.8 16.6-41.1 22.2-60.6 9.9 3.1 19.3 6.5 28.1 10.2 35.4 15.1 58.3 34.9 58.3 50.6-.1 15.7-23 35.6-58.4 50.6zM320.8 78.4z"/><circle cx="420.9" cy="296.5" r="45.7"/><path d="M520.5 78.1z"/></g></svg>
2 | 


--------------------------------------------------------------------------------
/github_pages/src/reportWebVitals.js:
--------------------------------------------------------------------------------
 1 | const reportWebVitals = (onPerfEntry) => {
 2 |   if (onPerfEntry && onPerfEntry instanceof Function) {
 3 |     import('web-vitals').then(({ getCLS, getFID, getFCP, getLCP, getTTFB }) => {
 4 |       getCLS(onPerfEntry);
 5 |       getFID(onPerfEntry);
 6 |       getFCP(onPerfEntry);
 7 |       getLCP(onPerfEntry);
 8 |       getTTFB(onPerfEntry);
 9 |     });
10 |   }
11 | };
12 | 
13 | export default reportWebVitals;
14 | 


--------------------------------------------------------------------------------
/github_pages/src/setupTests.js:
--------------------------------------------------------------------------------
1 | // jest-dom adds custom jest matchers for asserting on DOM nodes.
2 | // allows you to do things like:
3 | // expect(element).toHaveTextContent(/react/i)
4 | // learn more: https://github.com/testing-library/jest-dom
5 | import '@testing-library/jest-dom';
6 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
  1 | [project]
  2 | name = "eval-mm"
  3 | description = "eval-mm is a tool for evaluating Multi-Modal Large Language Models."
  4 | authors = [
  5 |     { name = "Silviase", email = "koki.maeda@nlp.c.titech.ac.jp" },
  6 |     { name = "speed1313", email = "sugiura.issa.q29@kyoto-u.jp" },
  7 | ]
  8 | dependencies = [
  9 |     "datasets==2.18.0",
 10 |     "requests>=2.32.3",
 11 |     "python-dotenv>=1.0.1",
 12 |     "openai>=1.42.0",
 13 |     "rouge-score>=0.1.2",
 14 |     "emoji>=2.12.1",
 15 |     "fugashi>=1.3.2",
 16 |     "unidic-lite>=1.0.8",
 17 |     "sacrebleu[ja]>=2.4.3",
 18 |     "backoff>=2.2.1",
 19 |     "scipy>=1.15.1",
 20 |     "torch>=2.5.1",
 21 |     "webdataset>=0.2.111",
 22 |     "av>=14.1.0",
 23 |     "loguru>=0.7.3",
 24 |     "types-requests>=2.32.0.20250328",
 25 | ]
 26 | readme = "README.md"
 27 | license = "Apache-2.0"
 28 | requires-python = ">= 3.12.1"
 29 | 
 30 | dynamic = [
 31 |     "version"
 32 | ]
 33 | 
 34 | [project.urls]
 35 | Repository = "https://github.com/llm-jp/llm-jp-eval-mm"
 36 | 
 37 | [build-system]
 38 | requires = ["hatchling"]
 39 | build-backend = "hatchling.build"
 40 | 
 41 | [tool.hatch.version]
 42 | path = "src/eval_mm/_version.py"
 43 | 
 44 | [tool.hatch.build]
 45 | exclude = [
 46 |     "result",
 47 |     "scripts",
 48 | ]
 49 | 
 50 | [tool.uv.sources]
 51 | mantis-vl = { git = "https://github.com/TIGER-AI-Lab/Mantis" }
 52 | s2wrapper = { git = "https://github.com/bfshi/scaling_on_scales.git" }
 53 | 
 54 | [tool.hatch.metadata]
 55 | allow-direct-references = true
 56 | 
 57 | [dependency-groups]
 58 | dev = [
 59 |     "mypy>=1.15.0",
 60 |     "pre-commit>=4.2.0",
 61 |     "pytest>=8.3.4",
 62 |     "seaborn>=0.13.2",
 63 |     "streamlit>=1.43.2",
 64 | ]
 65 | 
 66 | evovlm = [
 67 |     "flash-attn>=2.7.3",
 68 |     "transformers==4.42.4",
 69 |     "mantis-vl",
 70 | ]
 71 | vilaja = [
 72 |     "flash-attn>=2.7.3",
 73 |     "accelerate==0.27.2",
 74 |     "deepspeed>=0.16.3",
 75 |     "einops>=0.8.0",
 76 |     "psutils>=3.3.9",
 77 |     "s2wrapper",
 78 |     "sentencepiece>=0.2.0",
 79 |     "torchvision>=0.20.1",
 80 |     "transformers==4.37.2",
 81 | ]
 82 | sarashina = [
 83 |     "flash-attn>=2.7.3",
 84 |     "accelerate>=0.27.2",
 85 |     "pillow>=10.4.0",
 86 |     "protobuf>=5.29.3",
 87 |     "sentencepiece>=0.2.0",
 88 |     "torch>=2.5.1",
 89 |     "torchvision>=0.20.1",
 90 |     "transformers==4.47.0",
 91 | ]
 92 | normal = [
 93 |     "flash-attn>=2.7.3",
 94 |     "accelerate>=1.2.1",
 95 |     "qwen-vl-utils>=0.0.8",
 96 |     "sentencepiece>=0.2.0",
 97 |     "timm>=1.0.13",
 98 |     "torchvision>=0.20.1",
 99 |     "transformers>=4.50.0",
100 | ]
101 | stablevlm = [
102 |     "flash-attn>=2.7.3",
103 |     "accelerate>=1.2.1",
104 |     "qwen-vl-utils>=0.0.8",
105 |     "sentencepiece>=0.2.0",
106 |     "timm>=1.0.13",
107 |     "torchvision>=0.20.1",
108 |     "transformers==4.45.0",
109 | ]
110 | phi = [
111 |     "accelerate==1.3.0",
112 |     "flash-attn==2.7.4.post1",
113 |     "torch==2.6.0",
114 |     "peft==0.13.2",
115 |     "soundfile==0.13.1",
116 |     "torchvision==0.21.0",
117 |     "transformers==4.48.2",
118 | ]
119 | pixtral = [
120 |     "flash-attn>=2.7.3",
121 |     "mistral-common>=1.5.4",
122 |     "vllm>=0.8.1",
123 | ]
124 | calm = [
125 |     "flash-attn>=2.7.3",
126 |     "transformers==4.45.0",
127 | ]
128 | heron_nvila = [
129 |     "setuptools>=80.1.0",
130 |     "psutil>=7.0.0",
131 |     "accelerate==0.27.2",
132 |     "deepspeed>=0.16.3",
133 |     "einops>=0.8.0",
134 |     "hydra-core>=1.3.2",
135 |     "loguru>=0.7.3",
136 |     "opencv-python-headless==4.10.0.84",
137 |     "psutils>=3.3.9",
138 |     "s2wrapper",
139 |     "sentencepiece>=0.2.0",
140 |     "torchvision>=0.20.1",
141 |     "transformers==4.46.0",
142 | ]
143 | vllm_normal = [
144 |     "flash-attn>=2.7.3",
145 |     "accelerate>=1.2.1",
146 |     "qwen-vl-utils>=0.0.8",
147 |     "sentencepiece>=0.2.0",
148 |     "timm>=1.0.13",
149 |     "torchvision>=0.20.1",
150 |     "transformers>=4.50.0",
151 |     "vllm>=0.8.1",
152 | ]
153 | 
154 | [tool.uv]
155 | conflicts = [
156 |     [
157 |       { group = "evovlm" },
158 |       { group = "vilaja" },
159 |       { group = "normal" },
160 |       { group = "pixtral" },
161 |       { group = "gemma" },
162 |       { group = "sarashina"},
163 |       { group = "calm"},
164 |       { group = "phi"},
165 |       { group = "llama4"},
166 |       { group = "stablevlm"},
167 |       { group = "heron_nvila"},
168 |       { group = "vllm_normal"},
169 |     ],
170 | ]
171 | no-build-isolation-package = ["flash-attn"]
172 | 
173 | [tool.hatch.build.targets.wheel]
174 | packages = ["src/eval_mm"]
175 | 


--------------------------------------------------------------------------------
/scripts/browse_prediction.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import random
  3 | import eval_mm
  4 | from argparse import ArgumentParser
  5 | import os
  6 | import json
  7 | 
  8 | 
  9 | def parse_args():
 10 |     parser = ArgumentParser()
 11 |     parser.add_argument("--task_id", type=str, default="japanese-heron-bench")
 12 |     parser.add_argument("--result_dir", type=str, default="result")
 13 |     parser.add_argument("--model_list", type=str, nargs="+", default=[])
 14 | 
 15 |     return parser.parse_args()
 16 | 
 17 | 
 18 | def scrollable_text(text):
 19 |     return (
 20 |         f'<div style="max-height: 300px; overflow-y: auto; height: auto;">{text}</div>'
 21 |     )
 22 | 
 23 | 
 24 | if __name__ == "__main__":
 25 |     args = parse_args()
 26 | 
 27 |     task = eval_mm.TaskRegistry().load_task(args.task_id)
 28 | 
 29 |     # Load model prediction
 30 |     predictions_per_model = {}
 31 |     for model_id in args.model_list:
 32 |         prediction_path = os.path.join(
 33 |             args.result_dir, args.task_id, model_id, "prediction.jsonl"
 34 |         )
 35 |         with open(prediction_path, "r") as f:
 36 |             predictions_per_model[model_id] = [json.loads(line) for line in f]
 37 | 
 38 |     # VQAデータ読み込み
 39 |     ds = task.dataset
 40 |     # session_stateの初期化
 41 |     st.set_page_config(layout="wide")
 42 |     if "page" not in st.session_state:
 43 |         st.session_state.page = 0  # 現在のページ番号
 44 | 
 45 |     SAMPLES_PER_PAGE = 30  # 1ページに表示する件数
 46 |     # Question ID, Image, Question, Answer, Prediction_model1, Prediction_model2,..
 47 |     column_width_list = [1, 3, 3, 3] + [4] * len(args.model_list)
 48 |     st.write(f"# {args.task_id}")
 49 | 
 50 |     def show_sample(idx):
 51 |         sample = ds[idx]
 52 |         cols = st.columns(column_width_list)
 53 |         cols[0].markdown(task.doc_to_id(sample))
 54 |         cols[1].image(task.doc_to_visual(sample)[0], width=300)
 55 |         cols[2].markdown(
 56 |             scrollable_text(task.doc_to_text(sample)), unsafe_allow_html=True
 57 |         )
 58 |         cols[3].markdown(
 59 |             scrollable_text(task.doc_to_answer(sample)), unsafe_allow_html=True
 60 |         )
 61 |         for model_id in args.model_list:
 62 |             cols[4 + args.model_list.index(model_id)].markdown(
 63 |                 scrollable_text(predictions_per_model[model_id][idx]["text"]),
 64 |                 unsafe_allow_html=True,
 65 |             )
 66 | 
 67 |     # ナビゲーションボタン
 68 |     nav_col1, nav_col2, nav_col3 = st.columns(3)
 69 |     if nav_col1.button(f"Prev {SAMPLES_PER_PAGE}"):
 70 |         st.session_state.page = max(st.session_state.page - 1, 0)
 71 |     if nav_col2.button("Random"):
 72 |         st.session_state.page = random.randint(0, len(ds) // SAMPLES_PER_PAGE)
 73 |     if nav_col3.button(f"Next {SAMPLES_PER_PAGE}"):
 74 |         st.session_state.page = min(
 75 |             st.session_state.page + 1, len(ds) // SAMPLES_PER_PAGE
 76 |         )
 77 | 
 78 |     # 現在のページのサンプルを表示
 79 |     start_idx = st.session_state.page * SAMPLES_PER_PAGE
 80 |     end_idx = min(start_idx + SAMPLES_PER_PAGE, len(ds))
 81 | 
 82 |     st.write(f"### Showing samples {start_idx + 1} to {end_idx} / {len(ds)}")
 83 | 
 84 |     # ヘッダー columnを表示
 85 |     header_cols = st.columns(column_width_list)
 86 |     header_cols[0].markdown("ID")
 87 |     header_cols[1].markdown("Image")
 88 |     header_cols[2].markdown("Question")
 89 |     header_cols[3].markdown("Answer")
 90 |     for model_id in args.model_list:
 91 |         header_cols[4 + args.model_list.index(model_id)].markdown(
 92 |             f"Prediction ({model_id})"
 93 |         )
 94 | 
 95 |     # サンプルを表示
 96 |     for idx in range(start_idx, end_idx):
 97 |         with st.container():
 98 |             show_sample(idx)
 99 |             st.markdown("---")
100 | 


--------------------------------------------------------------------------------
/scripts/consistency_mecha_ja.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import json
  3 | import pandas as pd
  4 | import matplotlib.pyplot as plt
  5 | import japanize_matplotlib  # noqa
  6 | import numpy as np
  7 | 
  8 | # ======================================
  9 | # モデルごとのjsonlファイルパスを設定
 10 | # ======================================
 11 | vis_dir = "logs/mecha-ja/visualize/"
 12 | root_dir = "logs/mecha-ja/prediction/"
 13 | files = os.listdir(root_dir)
 14 | model_names = [
 15 |     os.path.basename(f).replace(".jsonl", "") for f in files if f.endswith(".jsonl")
 16 | ]
 17 | 
 18 | model_files = {
 19 |     model_name: os.path.join(root_dir, f"{model_name}.jsonl")
 20 |     for model_name in model_names
 21 | }
 22 | 
 23 | model_dfs = {}
 24 | 
 25 | 
 26 | # ======================================
 27 | # JSONLを読み込み、DataFrame化する関数
 28 | # ======================================
 29 | def load_jsonl_to_df(file_path):
 30 |     data_list = []
 31 |     with open(file_path, "r", encoding="utf-8") as f:
 32 |         for line in f:
 33 |             data_list.append(json.loads(line.strip()))
 34 |     return pd.DataFrame(data_list)
 35 | 
 36 | 
 37 | # ======================================
 38 | # データ読み込み & 予測列の追加
 39 | # ======================================
 40 | def check_abcd(text):
 41 |     letters = ["A", "B", "C", "D"]
 42 |     found = [
 43 |         ch for ch in letters if ch in text
 44 |     ]  # テキスト中に含まれる A/B/C/D をリスト化
 45 |     # 含まれている文字がちょうど1つならその文字、そうでなければ F を返す
 46 |     return found[0] if len(found) == 1 else "F"
 47 | 
 48 | 
 49 | for model_name, file_path in model_files.items():
 50 |     df = load_jsonl_to_df(file_path)
 51 |     df["pred"] = df["text"].apply(check_abcd)
 52 |     model_dfs[model_name] = df
 53 | 
 54 | rotate_map = {
 55 |     "A": ["A", "D", "C", "B"],
 56 |     "B": ["B", "A", "D", "C"],
 57 |     "C": ["C", "B", "A", "D"],
 58 |     "D": ["D", "C", "B", "A"],
 59 | }
 60 | 
 61 | # ======================================
 62 | # (1) モデルごとの回答選択肢の分布を可視化（相対頻度）
 63 | # ======================================
 64 | # 3 x 3 のグリッドで可視化
 65 | fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(10, 10), sharex=True, sharey=True)
 66 | 
 67 | # axesを1次元にする
 68 | axes = axes.flatten()
 69 | 
 70 | for ax, (model_name, df) in zip(axes, model_dfs.items()):
 71 |     # 予測回答の分布（相対頻度）をカウント
 72 |     pred_counts = (
 73 |         df["pred"].value_counts().reindex(["A", "B", "C", "D", "F"], fill_value=0)
 74 |     )
 75 |     pred_counts = pred_counts / pred_counts.sum()  # 相対頻度に変換
 76 | 
 77 |     ax.bar(
 78 |         pred_counts.index,
 79 |         pred_counts.values,
 80 |         color=["#FF9999", "#FFE888", "#99FF99", "#99CCFF", "#CCCCCC"],
 81 |     )
 82 | 
 83 |     # 0.25 に赤い線を引く
 84 |     ax.axhline(y=0.25, color="r", linestyle="--", linewidth=1)
 85 | 
 86 |     ax.set_title(f"{model_name}")
 87 |     ax.set_xlabel("選択肢")
 88 |     ax.set_ylabel("選択頻度")
 89 | 
 90 | plt.tight_layout()
 91 | plt.savefig(os.path.join(vis_dir, "prediction_distribution.png"))
 92 | plt.close()
 93 | 
 94 | # ======================================
 95 | # (2) soft accuracy, strict accuracy, consistency の計算
 96 | # --------------------------------------
 97 | # ・soft accuracy: 単純に df["mecha-ja"] の平均値
 98 | # ・strict accuracy: 同じ問題 (q_base) で全て mecha-ja==1 なら正解とカウント
 99 | # ・consistency: rot0 の予測に応じて rotate_map 通りになっている割合
100 | # ======================================
101 | results = []
102 | 
103 | for model_name, df in model_dfs.items():
104 |     # soft accuracy
105 |     soft_accuracy = df["mecha-ja"].mean()  # 1と0の平均 = 正解率
106 | 
107 |     # "X_rotY" の "X" 部分を q_base として抽出
108 |     df["q_base"] = df["question_id"].apply(lambda x: x.split("_rot")[0])
109 |     # 回転番号を取得
110 |     df["rot"] = df["question_id"].apply(lambda x: int(x.split("_rot")[1]))
111 | 
112 |     grouped = df.groupby("q_base")
113 |     unique_questions = df["q_base"].unique()
114 | 
115 |     correct_count = 0  # strict用
116 |     consistent_count = 0
117 | 
118 |     for q_id, group in grouped:
119 |         # strict正答率 (全rotで mecha-ja == 1)
120 |         if all(group["mecha-ja"] == 1):
121 |             correct_count += 1
122 | 
123 |         # 一貫性 (rotate_map に従っているか)
124 |         group_sorted = group.sort_values("rot")
125 |         preds = group_sorted["pred"].tolist()  # rot0→rot1→rot2→rot3 の順
126 | 
127 |         pred_rot0 = preds[0]  # 最初が rot0
128 |         if pred_rot0 in rotate_map:
129 |             expected_sequence = rotate_map[pred_rot0]
130 |             # 一貫しているかどうか
131 |             if len(preds) == 4 and preds == expected_sequence:
132 |                 consistent_count += 1
133 | 
134 |     strict_accuracy = (
135 |         correct_count / len(unique_questions) if len(unique_questions) else 0
136 |     )
137 |     consistency = (
138 |         consistent_count / len(unique_questions) if len(unique_questions) else 0
139 |     )
140 | 
141 |     results.append(
142 |         {
143 |             "model": model_name,
144 |             "soft_accuracy": soft_accuracy,
145 |             "strict_accuracy": strict_accuracy,
146 |             "consistency": consistency,
147 |         }
148 |     )
149 | 
150 | results_df = pd.DataFrame(results)
151 | print(results_df)
152 | 
153 | # ======================================
154 | # (3) 3種の指標 (soft, strict, consistency) を棒グラフで比較
155 | # ======================================
156 | metrics = ["soft_accuracy", "strict_accuracy", "consistency"]
157 | x = np.arange(len(results_df))  # モデル数
158 | width = 0.25  # 棒の幅
159 | 
160 | fig, ax = plt.subplots(figsize=(8, 8))
161 | 
162 | ax.bar(x, results_df["soft_accuracy"], width=width, label="Soft Accuracy", alpha=0.7)
163 | ax.bar(
164 |     x - width,
165 |     results_df["strict_accuracy"],
166 |     width=width,
167 |     label="Strict Accuracy",
168 |     alpha=0.7,
169 | )
170 | ax.bar(
171 |     x + width + 0.05,
172 |     results_df["consistency"],
173 |     width=width,
174 |     label="Consistency",
175 |     alpha=0.7,
176 | )
177 | 
178 | ax.set_xticks(x)
179 | ax.set_xticklabels(results_df["model"], rotation=90)
180 | ax.set_ylim(0, 1)
181 | ax.set_ylabel("Rate")
182 | ax.set_title("Soft/Strict Accuracy & Consistency by Model")
183 | ax.legend()
184 | 
185 | plt.tight_layout()
186 | plt.savefig(os.path.join(vis_dir, "accuracy_consistency_comparison.png"))
187 | plt.close()
188 | 


--------------------------------------------------------------------------------
/scripts/prepare_jic_vqa.py:
--------------------------------------------------------------------------------
  1 | from datasets import load_dataset
  2 | import os
  3 | import requests
  4 | from PIL import Image as PILImage
  5 | from io import BytesIO
  6 | import backoff
  7 | import webdataset as wds
  8 | from tqdm import tqdm
  9 | 
 10 | 
 11 | # 画像をダウンロード
 12 | @backoff.on_exception(
 13 |     backoff.expo,
 14 |     requests.exceptions.RequestException,
 15 |     max_tries=5,
 16 | )
 17 | def download_image(image_url: str) -> PILImage.Image:
 18 |     user_agent_string = (
 19 |         "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0"
 20 |     )
 21 |     response = requests.get(
 22 |         image_url, headers={"User-Agent": user_agent_string}, timeout=10
 23 |     )
 24 |     response.raise_for_status()
 25 |     image = PILImage.open(BytesIO(response.content)).convert("RGB")
 26 |     return image
 27 | 
 28 | 
 29 | def download_image_wrap(image_url: str) -> PILImage.Image | None:
 30 |     try:
 31 |         return download_image(image_url)
 32 |     except Exception as e:
 33 |         print(f"Failed to process {image_url}: {e}")
 34 |         return None
 35 | 
 36 | 
 37 | def get_domain_from_question(question: str) -> str | None:
 38 |     for keyword, domain in domain_dict.items():
 39 |         if keyword in question:
 40 |             return domain
 41 |     return None
 42 | 
 43 | 
 44 | # 型アノテーション付き変数定義
 45 | input_texts: list[str] = []
 46 | answers: list[str] = []
 47 | images: list[PILImage.Image | None] = []
 48 | question_ids: list[str] = []
 49 | domains: list[str] = []
 50 | 
 51 | # ドメイン辞書
 52 | domain_dict: dict[str, str] = {
 53 |     "花": "jaflower30",
 54 |     "食べ物": "jafood101",
 55 |     "ランドマーク": "jalandmark10",
 56 |     "施設": "jafacility20",
 57 | }
 58 | 
 59 | # データセット読み込み＆画像保存処理
 60 | output_dir = "dataset/jic_vqa"
 61 | os.makedirs(output_dir, exist_ok=True)
 62 | 
 63 | ds = load_dataset("line-corporation/JIC-VQA", split="train")
 64 | 
 65 | if not os.path.exists(f"{output_dir}/images.tar"):
 66 |     with wds.TarWriter(f"{output_dir}/images.tar") as sink:
 67 |         for i, example in tqdm(enumerate(ds), total=len(ds)):
 68 |             image_url = example["url"]
 69 |             image = download_image_wrap(image_url)
 70 |             if image is not None:
 71 |                 image = image.resize((224, 224)).convert("RGB")
 72 |             if image is None:
 73 |                 continue
 74 |             sample = {
 75 |                 "__key__": str(example["id"]),
 76 |                 "jpg": image,
 77 |                 "txt": example["category"],
 78 |                 "url.txt": image_url,
 79 |                 "question.txt": example["question"],
 80 |             }
 81 |             sink.write(sample)
 82 | 
 83 | # WebDatasetの読み込みと加工処理
 84 | ds = load_dataset("webdataset", data_files=f"{output_dir}/images.tar", split="train")
 85 | print(ds)
 86 | print(ds[0])
 87 | 
 88 | ds = ds.remove_columns(["__url__"])
 89 | ds = ds.rename_columns(
 90 |     {
 91 |         "txt": "category",
 92 |         "url.txt": "url",
 93 |         "question.txt": "question",
 94 |     }
 95 | )
 96 | 
 97 | ds = ds.map(
 98 |     lambda x: {
 99 |         "input_text": x["question"].decode("utf-8"),
100 |         "url": x["url"].decode("utf-8").encode("utf-8"),
101 |         "answer": str(x["category"]),
102 |         "image": x["jpg"],
103 |         "question_id": int(x["__key__"]),
104 |         "domain": get_domain_from_question(str(x["question"].decode("utf-8"))),
105 |     }
106 | )
107 | 
108 | ds = ds.remove_columns(["question", "__key__", "jpg"])
109 | 
110 | print(ds)
111 | print(ds[0])
112 | ds.to_parquet("dataset/jic_vqa.parquet")
113 | 


--------------------------------------------------------------------------------
/src/eval_mm/__init__.py:
--------------------------------------------------------------------------------
 1 | from dotenv import load_dotenv as _load_dotenv
 2 | from .tasks.task_registry import TaskRegistry
 3 | from .tasks.task import TaskConfig
 4 | from .metrics.scorer_registry import ScorerRegistry
 5 | from .metrics.scorer import ScorerConfig
 6 | from .utils.azure_client import OpenAIChatAPI
 7 | 
 8 | # Load environment variables
 9 | _load_dotenv()
10 | 
11 | __all__ = [
12 |     "TaskConfig",
13 |     "TaskRegistry",
14 |     "ScorerRegistry",
15 |     "ScorerConfig",
16 |     "OpenAIChatAPI",
17 | ]
18 | 


--------------------------------------------------------------------------------
/src/eval_mm/_version.py:
--------------------------------------------------------------------------------
1 | """Version specifier.
2 | 
3 | DON'T TOUCH THIS FILE.
4 | This file is replaced during the release process.
5 | """
6 | 
7 | __version__ = "0.0.0"
8 | 


--------------------------------------------------------------------------------
/src/eval_mm/metrics/__init__.py:
--------------------------------------------------------------------------------
 1 | from .heron_bench_scorer import HeronBenchScorer
 2 | from .exact_match_scorer import ExactMatchScorer
 3 | from .llm_as_a_judge_scorer import LlmAsaJudgeScorer
 4 | from .rougel_scorer import RougeLScorer
 5 | from .substring_match_scorer import SubstringMatchScorer
 6 | from .scorer import Scorer
 7 | from .jmmmu_scorer import JMMMUScorer
 8 | from .mmmu_scorer import MMMUScorer
 9 | from .jdocqa_scorer import JDocQAScorer
10 | from .jic_vqa_scorer import JICVQAScorer
11 | from .mecha_ja_scorer import MECHAJaScorer
12 | from .cc_ocr_scorer import CCOCRScorer
13 | from .scorer_registry import ScorerRegistry
14 | 
15 | 
16 | __all__ = [
17 |     "HeronBenchScorer",
18 |     "ExactMatchScorer",
19 |     "LlmAsaJudgeScorer",
20 |     "RougeLScorer",
21 |     "SubstringMatchScorer",
22 |     "Scorer",
23 |     "JMMMUScorer",
24 |     "MMMUScorer",
25 |     "JDocQAScorer",
26 |     "JICVQAScorer",
27 |     "MECHAJaScorer",
28 |     "CCOCRScorer",
29 |     "ScorerRegistry",
30 | ]
31 | 


--------------------------------------------------------------------------------
/src/eval_mm/metrics/exact_match_scorer.py:
--------------------------------------------------------------------------------
 1 | from .scorer import Scorer, AggregateOutput
 2 | 
 3 | 
 4 | class ExactMatchScorer(Scorer):
 5 |     @staticmethod
 6 |     def score(refs: list[str], preds: list[str]) -> list[int]:
 7 |         scores = [int(ref == pred) for ref, pred in zip(refs, preds)]
 8 |         return scores
 9 | 
10 |     @staticmethod
11 |     def aggregate(scores: list[int]) -> AggregateOutput:
12 |         mean = sum(scores) / len(scores)
13 |         return AggregateOutput(mean, {"exact_match": mean})
14 | 
15 | 
16 | def test_exact_match_scorer():
17 |     from .scorer import ScorerConfig
18 | 
19 |     scorer = ExactMatchScorer(ScorerConfig())
20 |     refs = ["私は猫です。", "私は犬です。"]
21 |     preds = ["私は犬です。", "私は犬です。"]
22 |     scores = scorer.score(refs, preds)
23 |     assert scores == [0, 1]
24 |     scores = scorer.aggregate([1, 1, 1, 0])
25 |     assert scores.overall_score == 0.75
26 |     assert scores.details == {"exact_match": 0.75}
27 |     scores = scorer.aggregate([1, 1, 0, 0])
28 |     assert scores.overall_score == 0.5
29 |     assert scores.details == {"exact_match": 0.5}
30 | 


--------------------------------------------------------------------------------
/src/eval_mm/metrics/jdocqa_scorer.py:
--------------------------------------------------------------------------------
  1 | from eval_mm.metrics.scorer import Scorer, AggregateOutput
  2 | from sacrebleu import sentence_bleu
  3 | from unicodedata import normalize
  4 | 
  5 | ANSWER_TYPE_MAP = {
  6 |     "yesno": 0,  # Yes/No questions
  7 |     "factoid": 1,  # Factoid questions
  8 |     "numerical": 2,  # Numerical questions
  9 |     "open-ended": 3,  # Open-ended questions
 10 | }
 11 | 
 12 | NUM_TO_ANSWER_TYPE = {v: k for k, v in ANSWER_TYPE_MAP.items()}
 13 | 
 14 | 
 15 | def jdocqa_normalize(text):
 16 |     text = (
 17 |         text.replace("です", "")
 18 |         .replace("。", "")
 19 |         .replace("、", "")
 20 |         .replace(" ", "")
 21 |         .strip()
 22 |     )
 23 |     text = normalize("NFKC", text)
 24 |     return text
 25 | 
 26 | 
 27 | def bleu_ja(refs, pred):
 28 |     """Calculate BLEU score for Japanese text. Score is normalized to [0, 1]."""
 29 |     bleu_score = sentence_bleu(
 30 |         hypothesis=pred,
 31 |         references=refs,
 32 |         smooth_method="exp",
 33 |         smooth_value=0.0,
 34 |         tokenize="ja-mecab",
 35 |         use_effective_order=False,
 36 |         lowercase=False,
 37 |     )
 38 |     return bleu_score.score / 100
 39 | 
 40 | 
 41 | class JDocQAScorer(Scorer):
 42 |     def score(self, refs: list[str], preds: list[str]) -> list[int]:
 43 |         docs = self.config.docs
 44 |         assert docs is not None
 45 |         scores = []
 46 | 
 47 |         for doc, ref, pred in zip(docs, refs, preds):
 48 |             if doc["answer_type"] == ANSWER_TYPE_MAP["open-ended"]:
 49 |                 scores.append(bleu_ja([ref], pred))
 50 |             elif doc["answer_type"] in [
 51 |                 ANSWER_TYPE_MAP["yesno"],
 52 |                 ANSWER_TYPE_MAP["factoid"],
 53 |                 ANSWER_TYPE_MAP["numerical"],
 54 |             ]:
 55 |                 ref = jdocqa_normalize(ref)
 56 |                 pred = jdocqa_normalize(pred)
 57 |                 if ref in pred:
 58 |                     scores.append(1)
 59 |                 else:
 60 |                     scores.append(0)
 61 |             else:
 62 |                 raise NotImplementedError("Bad answer type.")
 63 | 
 64 |         return scores
 65 | 
 66 |     def aggregate(self, scores: list[int]) -> AggregateOutput:
 67 |         docs = self.config.docs
 68 |         assert docs is not None
 69 | 
 70 |         # スコア収集用の dict（値はリスト）
 71 |         raw_metrics: dict[str, list[float]] = {
 72 |             "yesno_exact": [],
 73 |             "factoid_exact": [],
 74 |             "numerical_exact": [],
 75 |             "open-ended_bleu": [],
 76 |         }
 77 | 
 78 |         for doc, score in zip(docs, scores):
 79 |             answer_type = doc["answer_type"]
 80 |             if answer_type == ANSWER_TYPE_MAP["open-ended"]:
 81 |                 raw_metrics["open-ended_bleu"].append(score)
 82 |             else:
 83 |                 key = f"{NUM_TO_ANSWER_TYPE[answer_type]}_exact"
 84 |                 raw_metrics[key].append(score)
 85 | 
 86 |         # 平均値をとって dict[str, float] にする
 87 |         metrics: dict[str, float] = {}
 88 |         for key, value in raw_metrics.items():
 89 |             metrics[key] = sum(value) / len(value) if value else 0.0
 90 | 
 91 |         metrics["overall"] = sum(scores) / len(scores) if scores else 0.0
 92 | 
 93 |         return AggregateOutput(metrics["overall"], metrics)
 94 | 
 95 | 
 96 | def test_jdocqa_scorer():
 97 |     refs = ["私は猫です。"]
 98 |     preds = ["私は猫です。"]
 99 |     from .scorer import ScorerConfig
100 | 
101 |     scorer = JDocQAScorer(ScorerConfig(docs=[{"answer_type": 1}]))
102 |     scores = scorer.score(refs, preds)
103 |     assert scores == [1.0]
104 |     output = scorer.aggregate(scores)
105 |     assert output.overall_score == 1.0
106 |     assert output.details == {
107 |         "factoid_exact": 1.0,
108 |         "yesno_exact": 0,
109 |         "numerical_exact": 0,
110 |         "open-ended_bleu": 0,
111 |         "overall": 1.0,
112 |     }
113 | 


--------------------------------------------------------------------------------
/src/eval_mm/metrics/jic_vqa_scorer.py:
--------------------------------------------------------------------------------
 1 | from .scorer import Scorer, AggregateOutput
 2 | 
 3 | 
 4 | class JICVQAScorer(Scorer):
 5 |     @staticmethod
 6 |     def score(refs: list[str], preds: list[str]) -> list[int]:
 7 |         scores = [int(ref in pred) for ref, pred in zip(refs, preds)]
 8 |         return scores
 9 | 
10 |     def aggregate(self, scores: list[int]) -> AggregateOutput:
11 |         docs = self.config.docs
12 |         assert docs is not None
13 |         domain_scores = {}
14 | 
15 |         # Accumulate scores for each domain and overall
16 |         for i, domain in enumerate(docs["domain"]):
17 |             if domain not in domain_scores:
18 |                 domain_scores[domain] = {"total_score": 0, "count": 0}
19 |             domain_scores[domain]["total_score"] += scores[i]
20 |             domain_scores[domain]["count"] += 1
21 | 
22 |         # Calculate average for each domain
23 |         result = {}
24 |         domain_averages = []
25 |         for domain, values in domain_scores.items():
26 |             average = values["total_score"] / values["count"]
27 |             result[domain] = average
28 |             domain_averages.append(average)
29 | 
30 |         # Calculate the average of all domain averages
31 |         if domain_averages:
32 |             result["average"] = sum(domain_averages) / len(domain_averages)
33 |         else:
34 |             result["average"] = 0.0
35 |         output = AggregateOutput(result["average"], result)
36 |         return output
37 | 
38 | 
39 | def test_jic_vqa_test():
40 |     refs = ["私は猫です。"]
41 |     preds = ["私は猫です。"]
42 |     from .scorer import ScorerConfig
43 | 
44 |     scorer = JICVQAScorer(ScorerConfig(docs={"domain": ["test"]}))
45 |     scores = scorer.score(refs, preds)
46 |     assert scores == [1]
47 |     output = scorer.aggregate(scores)
48 |     assert output.overall_score == 1.0
49 |     assert output.details == {"test": 1.0, "average": 1.0}
50 | 


--------------------------------------------------------------------------------
/src/eval_mm/metrics/llm_as_a_judge_scorer.py:
--------------------------------------------------------------------------------
  1 | from eval_mm.metrics.scorer import Scorer, AggregateOutput
  2 | from tqdm import tqdm
  3 | import re
  4 | 
  5 | 
  6 | INSTRUCTION = """
  7 | You are an expert evaluator. You are given a (Question, Answer, Prediction) triplet. Your task is to evaluate how well the Prediction aligns with the Answer in the context of the Question.
  8 | 
  9 | Please assign a score from 1 to 5 based on the following criteria:
 10 | 
 11 | 5: Excellent — The Prediction fully matches the Answer with complete correctness and relevance.
 12 | 4: Good — The Prediction is mostly correct with only minor inaccuracies or missing details.
 13 | 3: Fair — The Prediction is partially correct but contains noticeable errors or missing key points.
 14 | 2: Poor — The Prediction is mostly incorrect or irrelevant, but there are small fragments of correctness.
 15 | 1: Very Poor — The Prediction is completely incorrect or irrelevant.
 16 | Output only the score (an integer from 1 to 5). Do not add any explanation.
 17 | 
 18 | Triplet:
 19 | Question: {Question}
 20 | Answer: {Answer}
 21 | Prediction: {Prediction}
 22 | 
 23 | Your Score:
 24 | """
 25 | 
 26 | 
 27 | class LlmAsaJudgeScorer(Scorer):
 28 |     def score(
 29 |         self,
 30 |         refs,
 31 |         preds,
 32 |     ):
 33 |         client = self.config.client
 34 |         model_name = self.config.judge_model
 35 |         batch_size = self.config.batch_size
 36 |         docs = self.config.docs
 37 |         questions = docs["input_text"]
 38 | 
 39 |         def build_message(question: str, answer: str, pred: str):
 40 |             content = INSTRUCTION.format(
 41 |                 Question=question, Answer=answer, Prediction=pred
 42 |             )
 43 |             message = [
 44 |                 {"role": "system", "content": "You are an expert evaluator."},
 45 |                 {"role": "user", "content": content},
 46 |             ]
 47 |             return message
 48 | 
 49 |         messages = [
 50 |             build_message(question, answer, pred)
 51 |             for question, answer, pred in zip(questions, refs, preds)
 52 |         ]
 53 |         messages_list = [
 54 |             messages[i : i + batch_size] for i in range(0, len(messages), batch_size)
 55 |         ]
 56 |         completion = []
 57 |         for ms in tqdm(messages_list, desc="Evaluating LLM as a Judge"):
 58 |             completion.extend(
 59 |                 client.batch_generate_chat_response(
 60 |                     ms, max_tokens=1024, temperature=0.0, seed=0, model_name=model_name
 61 |                 )
 62 |             )
 63 | 
 64 |         scores = []
 65 |         for i, c in enumerate(completion):
 66 |             print(c)
 67 |             score = re.search(r"\d", c)
 68 |             if score:
 69 |                 scores.append(int(score.group()))
 70 |             else:
 71 |                 scores.append(0)
 72 |         # if preds is empty, return 0 (TODO: this process should be done before calling llm)
 73 |         # for i, pred in enumerate(preds):
 74 |         #     if pred == "":
 75 |         #         scores[i] = 0
 76 |         return scores
 77 | 
 78 |     @staticmethod
 79 |     def aggregate(scores: list) -> AggregateOutput:
 80 |         mean = sum(scores) / len(scores)
 81 |         return AggregateOutput(mean, {"llm_as_a_judge": mean})
 82 | 
 83 | 
 84 | def test_llm_as_a_judge_scorer():
 85 |     from eval_mm.utils.azure_client import MockChatAPI, OpenAIChatAPI
 86 | 
 87 |     questions = ["What is the capital of Japan?", "What is the capital of France?"]
 88 |     answers = ["Tokyo", "Paris"]
 89 |     preds = ["", ""]
 90 |     from .scorer import ScorerConfig
 91 | 
 92 |     scorer = LlmAsaJudgeScorer(
 93 |         ScorerConfig(docs={"input_text": questions}, client=MockChatAPI())
 94 |     )
 95 |     scores = scorer.score(answers, preds)
 96 |     assert scores == [0, 0]
 97 |     output = scorer.aggregate(scores)
 98 |     assert output.overall_score == 0.0
 99 |     assert output.details == {"llm_as_a_judge": 0.0}
100 | 
101 |     import os
102 | 
103 |     if os.getenv("AZURE_OPENAI_KEY"):
104 |         questions = ["What is the capital of Japan?", "What is the capital of France?"]
105 |         answers = ["Tokyo", "Paris"]
106 |         preds = ["Tokyo", "Paris"]
107 |         model_name = "gpt-4o-mini-2024-07-18"
108 |         scorer = LlmAsaJudgeScorer(
109 |             ScorerConfig(
110 |                 docs={"input_text": questions},
111 |                 client=OpenAIChatAPI(),
112 |                 judge_model=model_name,
113 |             )
114 |         )
115 |         scores = scorer.score(answers, preds)
116 |         assert scores == [5, 5]
117 | 
118 |         questions = [
119 |             "前年の合計所得が33万円以下の世帯の場合、軽減の割合は何割ですか。\n解答は数量のみで答えてください。",
120 |             "3つのラベルはどのような違いがありますか。\n解答は自由に記述してください。",
121 |             "国内の日本語教育の概要において、外国人に対する日本語教育の現状として、日本語教師数の数や学習者の数はどうなっていますか。\n解答は自由に記述してください。",
122 |         ]
123 |         answers = [
124 |             "7割です",
125 |             "下部に文字がプリントされているかどうか、プリントの内容が「CARBONNEUTRAL」または「CERTIFIEDMODEL」かの違いがあります。",
126 |             "国内の日本語教育の概要において、外国人に対する日本語教育の現状として、日本語教師数の数は34392人、学習者の数は139613人となっています。",
127 |         ]
128 | 
129 |         preds = [
130 |             "7割",
131 |             "",
132 |             "日本語教師数の数は34392人、学習者の数は139613人となっています。",
133 |         ]
134 |         model_name = "gpt-4o-2024-05-13"
135 |         scorer = LlmAsaJudgeScorer(
136 |             ScorerConfig(
137 |                 docs={"input_text": questions},
138 |                 client=OpenAIChatAPI(),
139 |                 judge_model=model_name,
140 |             )
141 |         )
142 |         scores = scorer.score(answers, preds)
143 |         assert scores == [5, 1, 5]
144 | 


--------------------------------------------------------------------------------
/src/eval_mm/metrics/mecha_ja_scorer.py:
--------------------------------------------------------------------------------
  1 | # mecha-ja-scorer.py
  2 | from .scorer import Scorer, AggregateOutput
  3 | import re
  4 | from collections import defaultdict
  5 | 
  6 | ANSWER_TYPE_MAP = {
  7 |     "Factoid": 0,
  8 |     "Non-Factoid": 1,
  9 | }
 10 | 
 11 | 
 12 | class MECHAJaScorer(Scorer):
 13 |     @staticmethod
 14 |     def _parse_rotation_id(qid: str) -> str:
 15 |         """
 16 |         question_id に '_rot{i}' が含まれているかを正規表現で調べる。
 17 |         含まれていれば i (数字) を文字列として返し、無ければ "no_rot" を返す。
 18 |         """
 19 |         pattern = r"(.*)_rot(\d+)$"
 20 |         match = re.match(pattern, qid)
 21 |         if match:
 22 |             return match.group(2)  # 例えば "2"
 23 |         else:
 24 |             return "no_rot"
 25 | 
 26 |     @staticmethod
 27 |     def score(refs: list[str], preds: list[str]) -> list[int]:
 28 |         """
 29 |         Checks whether each reference string is contained in the corresponding
 30 |         prediction string and returns a list of integer scores (1 for True, 0 for False).
 31 |         """
 32 |         scores = []
 33 |         for ref, pred in zip(refs, preds):
 34 |             score = 1 if ref in pred else 0
 35 |             scores.append(score)
 36 |         return scores
 37 | 
 38 |     def aggregate(self, scores: list[int]) -> AggregateOutput:
 39 |         """
 40 |         回転IDごとにスコアを集計して、以下のような構造を返す:
 41 |         {
 42 |           "rot_{rot_id}": {
 43 |             "overall": float,
 44 |             "Factoid": float,
 45 |             "Non-Factoid": float,
 46 |             "with_background": float,
 47 |             "without_background": float
 48 |           },
 49 |           "no_rot": { ... },
 50 |           ...
 51 |         }
 52 |         """
 53 |         data_all = defaultdict(list)
 54 |         data_by_rot: defaultdict[str, dict[str, list[int]]] = defaultdict(
 55 |             lambda: {
 56 |                 "overall": [],
 57 |                 "factoid": [],
 58 |                 "non_factoid": [],
 59 |                 "with_bg": [],
 60 |                 "without_bg": [],
 61 |             }
 62 |         )
 63 |         docs = self.config.docs
 64 |         assert docs is not None
 65 |         for doc, score in zip(docs, scores):
 66 |             rot_id = MECHAJaScorer._parse_rotation_id(doc["question_id"])
 67 |             is_factoid = doc["answer_type"] == ANSWER_TYPE_MAP["Factoid"]
 68 |             has_bg = bool(doc["background_text"])
 69 | 
 70 |             # Overall
 71 |             data_all["overall"].append(score)
 72 |             data_by_rot[rot_id]["overall"].append(score)
 73 | 
 74 |             # Factoid / Non-Factoid
 75 |             if is_factoid:
 76 |                 data_all["factoid"].append(score)
 77 |                 data_by_rot[rot_id]["factoid"].append(score)
 78 |             else:
 79 |                 data_all["non_factoid"].append(score)
 80 |                 data_by_rot[rot_id]["non_factoid"].append(score)
 81 | 
 82 |             # With / Without Background
 83 |             if has_bg:
 84 |                 data_all["with_bg"].append(score)
 85 |                 data_by_rot[rot_id]["with_bg"].append(score)
 86 |             else:
 87 |                 data_all["without_bg"].append(score)
 88 |                 data_by_rot[rot_id]["without_bg"].append(score)
 89 | 
 90 |         def avg(lst):
 91 |             return sum(lst) / len(lst) if lst else 0.0
 92 | 
 93 |         result = {
 94 |             "overall": avg(data_all["overall"]),
 95 |             "Factoid": avg(data_all["factoid"]),
 96 |             "Non-Factoid": avg(data_all["non_factoid"]),
 97 |             "with_background": avg(data_all["with_bg"]),
 98 |             "without_background": avg(data_all["without_bg"]),
 99 |         }
100 | 
101 |         for rot_id, cat_scores in data_by_rot.items():
102 |             result[f"rot_{rot_id}"] = {
103 |                 "overall": avg(cat_scores["overall"]),
104 |                 "Factoid": avg(cat_scores["factoid"]),
105 |                 "Non-Factoid": avg(cat_scores["non_factoid"]),
106 |                 "with_background": avg(cat_scores["with_bg"]),
107 |                 "without_background": avg(cat_scores["without_bg"]),
108 |             }
109 |         output = AggregateOutput(result["overall"], result)
110 | 
111 |         return output
112 | 
113 | 
114 | def test_mechaja_scorer():
115 |     refs = ["私は猫です。"]
116 |     preds = ["私は猫です。"]
117 |     from .scorer import ScorerConfig
118 | 
119 |     scorer = MECHAJaScorer(
120 |         ScorerConfig(
121 |             docs=[{"question_id": "q1", "answer_type": 0, "background_text": ""}]
122 |         )
123 |     )
124 |     scores = MECHAJaScorer.score(refs, preds)
125 | 
126 |     assert scores == [1]
127 |     output = scorer.aggregate(scores)
128 |     assert output.overall_score == 1.0
129 |     assert output.details == {
130 |         "overall": 1.0,
131 |         "Factoid": 1.0,
132 |         "Non-Factoid": 0.0,
133 |         "with_background": 0.0,
134 |         "without_background": 1.0,
135 |         "rot_no_rot": {
136 |             "overall": 1.0,
137 |             "Factoid": 1.0,
138 |             "Non-Factoid": 0.0,
139 |             "with_background": 0.0,
140 |             "without_background": 1.0,
141 |         },
142 |     }
143 | 


--------------------------------------------------------------------------------
/src/eval_mm/metrics/rougel_scorer.py:
--------------------------------------------------------------------------------
  1 | # Reference: https://github.com/SakanaAI/evolutionary-model-merge/blob/main/evomerge/eval/metrics.py
  2 | 
  3 | import re
  4 | from rouge_score import rouge_scorer, scoring
  5 | from fugashi import Tagger
  6 | import emoji
  7 | import unicodedata
  8 | from .scorer import Scorer, AggregateOutput
  9 | from concurrent.futures import ProcessPoolExecutor, Future
 10 | 
 11 | 
 12 | class MecabTokenizer:
 13 |     def __init__(self) -> None:
 14 |         self.tagger = Tagger("-Owakati")
 15 | 
 16 |     def normalize_answer(self, text: str) -> str:
 17 |         """Lower case text, remove punctuation and extra whitespace, etc."""
 18 | 
 19 |         def white_space_fix(text: str) -> str:
 20 |             return " ".join(text.split())
 21 | 
 22 |         def remove_emoji(text: str) -> str:
 23 |             text = "".join(["" if emoji.is_emoji(c) else c for c in text])
 24 |             emoji_pattern = re.compile(
 25 |                 "["
 26 |                 "\U0001f600-\U0001f64f"  # emoticons
 27 |                 "\U0001f300-\U0001f5ff"  # symbols & pictographs
 28 |                 "\U0001f680-\U0001f6ff"  # transport & map symbols
 29 |                 "\U0001f1e0-\U0001f1ff"  # flags (iOS)
 30 |                 "\U00002702-\U000027b0"
 31 |                 "]+",
 32 |                 flags=re.UNICODE,
 33 |             )
 34 |             return emoji_pattern.sub(r"", text)
 35 | 
 36 |         text = remove_emoji(text)
 37 |         # see neologdn docs for details, but handles things like full/half width variation
 38 |         # text = neologdn.normalize(text) FIXME: fix c++12 error when installing neologdn
 39 |         text = unicodedata.normalize("NFKC", text)
 40 |         text = white_space_fix(text)
 41 |         return text
 42 | 
 43 |     def tokenize(self, text):
 44 |         return self.tagger.parse(self.normalize_answer(text)).split()
 45 | 
 46 | 
 47 | def rouge_ja(refs: list[str], preds: list[str]) -> dict:
 48 |     """Compute ROUGE-L scores for Japanese text.
 49 |     Args:
 50 |         refs: list of reference strings
 51 |         preds: list of predicted strings
 52 |     Returns:
 53 |         dict: dictionary with keys: { 'rouge1', 'rouge2', 'rougeL' }
 54 |         Each value is a float representing the ROUGE score (f-measure) * 100.
 55 |     """
 56 |     assert isinstance(refs, list) and isinstance(
 57 |         preds, list
 58 |     ), "refs and preds must be lists."
 59 |     tokenizer = MecabTokenizer()
 60 |     rouge_types = ["rouge1", "rouge2", "rougeL"]
 61 |     # mecab-based rouge
 62 |     scorer = rouge_scorer.RougeScorer(
 63 |         rouge_types,
 64 |         tokenizer=tokenizer,
 65 |     )
 66 | 
 67 |     # Accumulate confidence intervals.
 68 |     aggregator = scoring.BootstrapAggregator()
 69 |     for ref, pred in zip(refs, preds):
 70 |         aggregator.add_scores(scorer.score(ref, pred))
 71 |     result = aggregator.aggregate()
 72 |     return {type: result[type].mid.fmeasure * 100 for type in rouge_types}
 73 | 
 74 | 
 75 | class RougeLScorer(Scorer):
 76 |     @staticmethod
 77 |     def score(refs: list[str], preds: list[str]) -> list[float]:
 78 |         futures: list[Future[dict[str, float]]] = []
 79 |         with ProcessPoolExecutor() as executor:
 80 |             for ref, pred in zip(refs, preds):
 81 |                 future = executor.submit(rouge_ja, [ref], [pred])
 82 |                 futures.append(future)
 83 |         scores = [f.result()["rougeL"] for f in futures]
 84 |         return scores
 85 | 
 86 |     @staticmethod
 87 |     def aggregate(scores: list[float]) -> AggregateOutput:
 88 |         mean = sum(scores) / len(scores)
 89 |         return AggregateOutput(mean, {"rougel": mean})
 90 | 
 91 | 
 92 | def test_rouge_ja():
 93 |     import pytest
 94 | 
 95 |     refs = ["私は猫です。"]
 96 |     preds = ["私は猫です。"]
 97 |     scores = rouge_ja(refs, preds)
 98 |     assert scores["rougeL"] == 100.0
 99 |     refs = ["たかしが公園で遊んでいた。"]
100 |     preds = ["たかしが公園にいたようだ。"]
101 |     scores = rouge_ja(refs, preds)
102 |     assert pytest.approx(scores["rougeL"], 0.01) == 66.66
103 | 
104 |     refs = ["私は猫です。", "私は犬です。"]
105 |     preds = ["私は犬です。", "私は猫です。"]
106 |     scores = rouge_ja(refs, preds)
107 |     assert pytest.approx(scores["rougeL"], 0.01) == 80.0
108 |     refs = ["池のほとりです。"]
109 |     preds = ["ここは湖の岸です。"]
110 |     scores = rouge_ja(refs, preds)
111 |     assert pytest.approx(scores["rougeL"], 0.01) == 50.0
112 | 
113 | 
114 | def test_rougel_scorer():
115 |     import pytest
116 | 
117 |     refs = ["私は猫です。"]
118 |     preds = ["私は猫です。"]
119 |     from .scorer import ScorerConfig
120 | 
121 |     scorer = RougeLScorer(ScorerConfig())
122 |     scores = scorer.score(refs, preds)
123 |     assert scores == [100.0]
124 |     refs = ["たかしが公園で遊んでいた。"]
125 |     preds = ["たかしが公園にいたようだ。"]
126 |     scores = scorer.score(refs, preds)
127 |     assert pytest.approx(scores[0], 0.01) == 66.66
128 |     output = scorer.aggregate(scores)
129 |     assert pytest.approx(output.overall_score, 0.01) == 66.66
130 | 


--------------------------------------------------------------------------------
/src/eval_mm/metrics/scorer.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC, abstractmethod
 2 | from dataclasses import dataclass
 3 | from eval_mm.utils.azure_client import OpenAIChatAPI
 4 | 
 5 | 
 6 | @dataclass
 7 | class AggregateOutput:
 8 |     overall_score: float
 9 |     details: dict[str, float]
10 | 
11 | 
12 | @dataclass
13 | class ScorerConfig:
14 |     docs: dict | None = None
15 |     judge_model: str | None = None
16 |     client: OpenAIChatAPI | None = None
17 |     batch_size: int = 10
18 |     random_choice: bool = False
19 | 
20 | 
21 | class Scorer(ABC):
22 |     def __init__(self, config: ScorerConfig):
23 |         self.config = config
24 | 
25 |     @abstractmethod
26 |     def score(self, refs: list[str], preds: list[str]) -> list:
27 |         raise NotImplementedError
28 | 
29 |     @abstractmethod
30 |     def aggregate(self, scores: list) -> AggregateOutput:
31 |         raise NotImplementedError
32 | 


--------------------------------------------------------------------------------
/src/eval_mm/metrics/scorer_registry.py:
--------------------------------------------------------------------------------
 1 | from .heron_bench_scorer import HeronBenchScorer
 2 | from .exact_match_scorer import ExactMatchScorer
 3 | from .llm_as_a_judge_scorer import LlmAsaJudgeScorer
 4 | from .rougel_scorer import RougeLScorer
 5 | from .substring_match_scorer import SubstringMatchScorer
 6 | from .scorer import Scorer
 7 | from .jmmmu_scorer import JMMMUScorer
 8 | from .mmmu_scorer import MMMUScorer
 9 | from .jdocqa_scorer import JDocQAScorer
10 | from .jic_vqa_scorer import JICVQAScorer
11 | from .mecha_ja_scorer import MECHAJaScorer
12 | from .cc_ocr_scorer import CCOCRScorer
13 | from .scorer import ScorerConfig
14 | from typing import Callable
15 | 
16 | 
17 | class ScorerRegistry:
18 |     """Registry to map metrics to their corresponding scorer classes."""
19 | 
20 |     _scorers: dict[str, Callable[[ScorerConfig], Scorer]] = {
21 |         "heron-bench": HeronBenchScorer,
22 |         "exact-match": ExactMatchScorer,
23 |         "llm-as-a-judge": LlmAsaJudgeScorer,
24 |         "rougel": RougeLScorer,
25 |         "substring-match": SubstringMatchScorer,
26 |         "jmmmu": JMMMUScorer,
27 |         "jdocqa": JDocQAScorer,
28 |         "mmmu": MMMUScorer,
29 |         "jic-vqa": JICVQAScorer,
30 |         "mecha-ja": MECHAJaScorer,
31 |         "cc-ocr": CCOCRScorer,
32 |     }
33 | 
34 |     @classmethod
35 |     def get_metric_list(cls) -> list[str]:
36 |         """Get a list of supported metrics."""
37 |         return list(cls._scorers.keys())
38 | 
39 |     @classmethod
40 |     def load_scorer(
41 |         cls, metric: str, scorer_config: ScorerConfig = ScorerConfig()
42 |     ) -> Scorer:
43 |         """Load a scorer instance from the scorer registry."""
44 |         try:
45 |             return cls._scorers[metric](scorer_config)  # type: ignore
46 |         except KeyError:
47 |             raise ValueError(f"Metric '{metric}' is not supported.")
48 | 


--------------------------------------------------------------------------------
/src/eval_mm/metrics/substring_match_scorer.py:
--------------------------------------------------------------------------------
 1 | from .scorer import Scorer, AggregateOutput
 2 | 
 3 | 
 4 | class SubstringMatchScorer(Scorer):
 5 |     @staticmethod
 6 |     def score(refs: list[str], preds: list[str]) -> list[int]:
 7 |         scores = [int(ref in pred) for ref, pred in zip(refs, preds)]
 8 |         return scores
 9 | 
10 |     @staticmethod
11 |     def aggregate(scores: list[int]) -> AggregateOutput:
12 |         mean = sum(scores) / len(scores)
13 |         return AggregateOutput(mean, {"substring_match": mean})
14 | 
15 | 
16 | def test_substring_match_scorer():
17 |     from .scorer import ScorerConfig
18 | 
19 |     scorer = SubstringMatchScorer(ScorerConfig())
20 |     refs = ["私は猫です。"]
21 |     preds = ["私は猫です。"]
22 |     scores = scorer.score(refs, preds)
23 |     assert scores == [1]
24 |     refs = ["たかしが公園で遊んでいた。"]
25 |     preds = ["たかしが公園にいたようだ。"]
26 |     scores = scorer.score(refs, preds)
27 |     assert scores == [0]
28 |     refs = ["私は猫です。", "私は犬です。"]
29 |     preds = ["私は犬です。", "私は猫です。"]
30 |     scores = scorer.score(refs, preds)
31 |     assert scores == [0, 0]
32 |     refs = ["池のほとりです。"]
33 |     preds = ["ここは湖の岸です。"]
34 |     scores = scorer.score(refs, preds)
35 |     assert scores == [0]
36 | 
37 |     output = scorer.aggregate([1, 1, 1, 0])
38 |     assert output.overall_score == 0.75
39 |     assert output.details == {"substring_match": 0.75}
40 | 


--------------------------------------------------------------------------------
/src/eval_mm/tasks/__init__.py:
--------------------------------------------------------------------------------
 1 | from .ja_vg_vqa_500 import JaVGVQA500
 2 | from .japanese_heron_bench import JapaneseHeronBench
 3 | from .ja_vlm_bench_in_the_wild import JaVLMBenchIntheWild
 4 | from .jmmmu import JMMMU
 5 | from .ja_multi_image_vqa import JAMultiImageVQA
 6 | from .jdocqa import JDocQA
 7 | from .mmmu import MMMU
 8 | from .llava_bench_in_the_wild import LlavaBenchIntheWild
 9 | from .jic_vqa import JICVQA
10 | from .mecha_ja import MECHAJa
11 | from .mmmlu import MMMLU
12 | from .cc_ocr import CCOCR
13 | from .cvqa import CVQA
14 | from .task_registry import TaskRegistry
15 | from .task import TaskConfig
16 | 
17 | __all__ = [
18 |     "JaVGVQA500",
19 |     "JapaneseHeronBench",
20 |     "JaVLMBenchIntheWild",
21 |     "JMMMU",
22 |     "JAMultiImageVQA",
23 |     "JDocQA",
24 |     "MMMU",
25 |     "LlavaBenchIntheWild",
26 |     "JICVQA",
27 |     "MECHAJa",
28 |     "MMMLU",
29 |     "CCOCR",
30 |     "CVQA",
31 |     "TaskRegistry",
32 |     "TaskConfig",
33 | ]
34 | 


--------------------------------------------------------------------------------
/src/eval_mm/tasks/cc_ocr.py:
--------------------------------------------------------------------------------
 1 | from datasets import Dataset, load_dataset
 2 | from .task import Task
 3 | from PIL import Image
 4 | from io import BytesIO
 5 | import base64
 6 | 
 7 | 
 8 | def base64_to_pil_image(base64_string: str) -> Image.Image:
 9 |     return Image.open(BytesIO(base64.b64decode(base64_string)))
10 | 
11 | 
12 | class CCOCR(Task):
13 |     """
14 |     The CCOCR class processes the CC-OCR dataset for Japanese samples and provides
15 |     methods to interact with the dataset. It filters the dataset to include only
16 |     entries labeled as "Japanese" and decodes base64-encoded images into PIL Image
17 |     objects for visual processing.
18 |     """
19 | 
20 |     default_metric = "ccocr"
21 | 
22 |     @staticmethod
23 |     def _prepare_dataset() -> Dataset:
24 |         ds = load_dataset("wulipc/CC-OCR", "multi_lan_ocr", split="test")
25 | 
26 |         ds = ds.filter(lambda example: example["l2-category"] == "Japanese")
27 | 
28 |         ds = ds.map(
29 |             lambda x, idx: {
30 |                 "index": str(idx),
31 |                 "question_id": str(idx),
32 |                 "question": x["question"],
33 |                 "answer": x["answer"],
34 |                 "input_text": x["question"],
35 |                 "image": x["image"],
36 |             },
37 |             with_indices=True,
38 |         )
39 | 
40 |         return ds
41 | 
42 |     @staticmethod
43 |     def doc_to_text(doc) -> str:
44 |         return doc["input_text"]
45 | 
46 |     @staticmethod
47 |     def doc_to_visual(doc) -> list[Image.Image]:
48 |         image = base64_to_pil_image(doc["image"])
49 |         return [image]
50 | 
51 |     @staticmethod
52 |     def doc_to_id(doc) -> str:
53 |         return str(doc["question_id"])
54 | 
55 |     @staticmethod
56 |     def doc_to_answer(doc) -> str:
57 |         return doc["answer"]
58 | 
59 | 
60 | def test_task():
61 |     from eval_mm.tasks.task import TaskConfig
62 | 
63 |     task = CCOCR(TaskConfig())
64 |     ds = task.dataset
65 |     assert isinstance(task.doc_to_text(ds[0]), str)
66 |     assert isinstance(task.doc_to_visual(ds[0]), list)
67 |     assert isinstance(task.doc_to_visual(ds[0])[0], Image.Image)
68 |     assert isinstance(task.doc_to_id(ds[0]), str)
69 |     assert isinstance(task.doc_to_answer(ds[0]), str)
70 |     print(ds[0])
71 | 
72 | 
73 | if __name__ == "__main__":
74 |     test_task()
75 | 


--------------------------------------------------------------------------------
/src/eval_mm/tasks/cvqa.py:
--------------------------------------------------------------------------------
 1 | from datasets import Dataset, load_dataset
 2 | from .task import Task
 3 | from PIL import Image
 4 | 
 5 | MULTI_CHOICE_PROMPT = (
 6 |     "与えられた選択肢の中から最も適切な回答のアルファベットを直接記入してください。"
 7 | )
 8 | 
 9 | OPTIONS_MAP = {
10 |     0: "A",
11 |     1: "B",
12 |     2: "C",
13 |     3: "D",
14 | }
15 | 
16 | 
17 | def parse_options(options):
18 |     option_letters = [chr(ord("A") + i) for i in range(len(options))]
19 |     choices_str = "\n".join(
20 |         [
21 |             f"{option_letter}. {option}"
22 |             for option_letter, option in zip(option_letters, options)
23 |         ]
24 |     )
25 |     return choices_str
26 | 
27 | 
28 | def construct_prompt(question, options):
29 |     parsed_options = parse_options(options)
30 |     return f"{question}\n{parsed_options}\n\n{MULTI_CHOICE_PROMPT}"
31 | 
32 | 
33 | class CVQA(Task):
34 |     default_metric = "substring-match"
35 | 
36 |     @staticmethod
37 |     def _prepare_dataset() -> Dataset:
38 |         ds = load_dataset("afaji/cvqa", split="test")
39 | 
40 |         ds = ds.filter(lambda x: x["Subset"] == "('Japanese', 'Japan')")
41 | 
42 |         ds = ds.map(
43 |             lambda x, idx: {
44 |                 "index": str(idx),
45 |                 "question_id": str(idx),
46 |                 "question": x["Question"],
47 |                 "question_en": x["Translated Question"],  # English
48 |                 "options": x["Options"],
49 |                 "translated_options": x["Translated Options"],  # English
50 |                 "answer": x["Label"],  # 0~3
51 |                 "answer_text": OPTIONS_MAP[x["Label"]],
52 |                 "input_text": construct_prompt(x["Question"], x["Options"]),
53 |                 "image": x["image"],
54 |             },
55 |             with_indices=True,
56 |         )
57 | 
58 |         return ds
59 | 
60 |     @staticmethod
61 |     def doc_to_text(doc) -> str:
62 |         return doc["input_text"]
63 | 
64 |     @staticmethod
65 |     def doc_to_visual(doc) -> list[Image.Image]:
66 |         return [doc["image"]]
67 | 
68 |     @staticmethod
69 |     def doc_to_id(doc) -> str:
70 |         return str(doc["question_id"])
71 | 
72 |     @staticmethod
73 |     def doc_to_answer(doc) -> str:
74 |         return doc["answer_text"]
75 | 
76 | 
77 | def test_task():
78 |     from eval_mm.tasks.task import TaskConfig
79 | 
80 |     task = CVQA(TaskConfig())
81 |     ds = task.dataset
82 |     assert isinstance(task.doc_to_text(ds[0]), str)
83 |     assert isinstance(task.doc_to_visual(ds[0]), list)
84 |     assert isinstance(task.doc_to_visual(ds[0])[0], Image.Image)
85 |     assert isinstance(task.doc_to_id(ds[0]), str)
86 |     assert isinstance(task.doc_to_answer(ds[0]), str)
87 |     print(ds[0])
88 | 
89 | 
90 | if __name__ == "__main__":
91 |     test_task()
92 | 


--------------------------------------------------------------------------------
/src/eval_mm/tasks/ja_multi_image_vqa.py:
--------------------------------------------------------------------------------
 1 | from datasets import Dataset, load_dataset
 2 | import re
 3 | 
 4 | 
 5 | from .task import Task
 6 | from PIL import Image
 7 | 
 8 | # import neologdn FIXME: fix c++12 error when installing neologdn
 9 | 
10 | 
11 | class JAMultiImageVQA(Task):
12 |     default_metric = "rougel"
13 | 
14 |     @staticmethod
15 |     def _prepare_dataset() -> Dataset:
16 |         ds = load_dataset("SakanaAI/JA-Multi-Image-VQA", split="test")
17 |         ds = ds.rename_column("question", "input_text")
18 |         ds = ds.map(lambda example, idx: {"question_id": idx}, with_indices=True)
19 |         return ds
20 | 
21 |     @staticmethod
22 |     def doc_to_text(doc) -> str:
23 |         # delete redundant image tags
24 |         text = re.sub(r"<image> ", "", doc["input_text"])
25 |         return text
26 | 
27 |     @staticmethod
28 |     def doc_to_visual(doc) -> list[Image.Image]:
29 |         return doc["images"]
30 | 
31 |     @staticmethod
32 |     def doc_to_id(doc) -> str:
33 |         return str(doc["question_id"])
34 | 
35 |     @staticmethod
36 |     def doc_to_answer(doc) -> str:
37 |         return doc["answer"]
38 | 
39 | 
40 | def test_task():
41 |     from eval_mm.tasks.task import TaskConfig
42 | 
43 |     task = JAMultiImageVQA(TaskConfig())
44 |     ds = task.dataset
45 |     print(ds[0])
46 |     assert isinstance(task.doc_to_text(ds[0]), str)
47 |     assert isinstance(task.doc_to_visual(ds[0]), list)
48 |     assert isinstance(task.doc_to_visual(ds[0])[0], Image.Image)
49 |     assert isinstance(task.doc_to_id(ds[0]), str)
50 |     assert isinstance(task.doc_to_answer(ds[0]), str)
51 | 


--------------------------------------------------------------------------------
/src/eval_mm/tasks/ja_vg_vqa_500.py:
--------------------------------------------------------------------------------
 1 | from datasets import Dataset, concatenate_datasets, load_dataset
 2 | 
 3 | from .task import Task
 4 | from PIL import Image
 5 | 
 6 | 
 7 | class JaVGVQA500(Task):
 8 |     default_metric = "rougel"
 9 | 
10 |     @staticmethod
11 |     def _prepare_dataset() -> Dataset:
12 |         ds = load_dataset("SakanaAI/JA-VG-VQA-500", split="test")
13 | 
14 |         def flatten_sample(sample):
15 |             dataset = {
16 |                 "image_id": [sample["image_id"] for _ in sample["qas"]],
17 |                 "image": [sample["image"] for _ in sample["qas"]],
18 |                 "qa_id": [qa["qa_id"] for qa in sample["qas"]],
19 |                 "question": [qa["question"] for qa in sample["qas"]],
20 |                 "answer": [qa["answer"] for qa in sample["qas"]],
21 |             }
22 |             return Dataset.from_dict(dataset)
23 | 
24 |         fragments = []
25 |         for i, sample in enumerate(ds):
26 |             data_fragment = flatten_sample(sample)
27 |             fragments.append(data_fragment)
28 | 
29 |         ds = concatenate_datasets(fragments)
30 |         ds = ds.rename_column("question", "input_text")
31 |         ds = ds.rename_column("qa_id", "question_id")
32 | 
33 |         return ds
34 | 
35 |     @staticmethod
36 |     def doc_to_text(doc) -> str:
37 |         return doc["input_text"]
38 | 
39 |     @staticmethod
40 |     def doc_to_visual(doc) -> list[Image.Image]:
41 |         return [doc["image"]]
42 | 
43 |     @staticmethod
44 |     def doc_to_id(doc) -> str:
45 |         return str(doc["question_id"])
46 | 
47 |     @staticmethod
48 |     def doc_to_answer(doc) -> str:
49 |         return doc["answer"]
50 | 
51 | 
52 | def test_task():
53 |     from eval_mm.tasks.task import TaskConfig
54 | 
55 |     task = JaVGVQA500(TaskConfig())
56 |     ds = task.dataset
57 |     print(ds[0])
58 |     assert isinstance(task.doc_to_text(ds[0]), str)
59 |     assert isinstance(task.doc_to_visual(ds[0]), list)
60 |     assert isinstance(task.doc_to_visual(ds[0])[0], Image.Image)
61 |     assert isinstance(task.doc_to_id(ds[0]), str)
62 |     assert isinstance(task.doc_to_answer(ds[0]), str)
63 | 


--------------------------------------------------------------------------------
/src/eval_mm/tasks/ja_vlm_bench_in_the_wild.py:
--------------------------------------------------------------------------------
 1 | from datasets import Dataset, load_dataset
 2 | 
 3 | from .task import Task
 4 | from PIL import Image
 5 | 
 6 | 
 7 | class JaVLMBenchIntheWild(Task):
 8 |     default_metric = "rougel"
 9 | 
10 |     @staticmethod
11 |     def _prepare_dataset() -> Dataset:
12 |         # データセットをロード
13 |         ds = load_dataset("SakanaAI/JA-VLM-Bench-In-the-Wild", split="test")
14 |         ds = ds.rename_column("question", "input_text")
15 |         ds = ds.map(lambda example, idx: {"question_id": idx}, with_indices=True)
16 |         return ds
17 | 
18 |     @staticmethod
19 |     def doc_to_text(doc) -> str:
20 |         return doc["input_text"]
21 | 
22 |     @staticmethod
23 |     def doc_to_visual(doc) -> list[Image.Image]:
24 |         return [doc["image"]]
25 | 
26 |     @staticmethod
27 |     def doc_to_id(doc) -> str:
28 |         return str(doc["question_id"])
29 | 
30 |     @staticmethod
31 |     def doc_to_answer(doc) -> str:
32 |         return doc["answer"]
33 | 
34 | 
35 | def test_task():
36 |     from eval_mm.tasks.task import TaskConfig
37 | 
38 |     task = JaVLMBenchIntheWild(TaskConfig())
39 |     ds = task.dataset
40 |     print(ds[0])
41 |     assert isinstance(task.doc_to_text(ds[0]), str)
42 |     assert isinstance(task.doc_to_visual(ds[0]), list)
43 |     assert isinstance(task.doc_to_visual(ds[0])[0], Image.Image)
44 |     assert isinstance(task.doc_to_id(ds[0]), str)
45 |     assert isinstance(task.doc_to_answer(ds[0]), str)
46 | 


--------------------------------------------------------------------------------
/src/eval_mm/tasks/japanese_heron_bench.py:
--------------------------------------------------------------------------------
 1 | from datasets import load_dataset, Dataset
 2 | 
 3 | from .task import Task
 4 | from PIL import Image
 5 | 
 6 | 
 7 | class JapaneseHeronBench(Task):
 8 |     default_metric = "heron-bench"
 9 | 
10 |     @staticmethod
11 |     def _prepare_dataset() -> Dataset:
12 |         ds = load_dataset("Silviase/Japanese-Heron-Bench", split="train")
13 |         ds = ds.rename_column("text", "input_text")
14 |         return ds
15 | 
16 |     @staticmethod
17 |     def doc_to_text(doc) -> str:
18 |         return doc["input_text"]
19 | 
20 |     @staticmethod
21 |     def doc_to_visual(doc) -> list[Image.Image]:
22 |         return [doc["image"]]
23 | 
24 |     @staticmethod
25 |     def doc_to_id(doc) -> str:
26 |         return str(doc["question_id"])
27 | 
28 |     @staticmethod
29 |     def doc_to_answer(doc) -> str:
30 |         return doc["answer"]["gpt-4-0125-preview"]
31 | 
32 | 
33 | def test_task():
34 |     from eval_mm.tasks.task import TaskConfig
35 | 
36 |     task = JapaneseHeronBench(TaskConfig())
37 |     ds = task.dataset
38 |     print(ds[0])
39 |     assert isinstance(task.doc_to_text(ds[0]), str)
40 |     assert isinstance(task.doc_to_visual(ds[0]), list)
41 |     assert isinstance(task.doc_to_visual(ds[0])[0], Image.Image)
42 |     assert isinstance(task.doc_to_id(ds[0]), str)
43 |     assert isinstance(task.doc_to_answer(ds[0]), str)
44 | 


--------------------------------------------------------------------------------
/src/eval_mm/tasks/jdocqa.py:
--------------------------------------------------------------------------------
 1 | from datasets import Dataset, load_dataset
 2 | 
 3 | from .task import Task
 4 | 
 5 | from PIL import Image
 6 | 
 7 | 
 8 | class JDocQA(Task):
 9 |     default_metric = "jdocqa"
10 | 
11 |     @staticmethod
12 |     def _prepare_dataset() -> Dataset:
13 |         ds = load_dataset(
14 |             "speed/JDocQA",
15 |             split="test",
16 |         )
17 |         ds = ds.rename_column("question", "input_text")
18 |         return ds
19 | 
20 |     @staticmethod
21 |     def doc_to_text(doc) -> str:
22 |         return doc["input_text"]
23 | 
24 |     @staticmethod
25 |     def doc_to_visual(doc) -> list[Image.Image]:
26 |         images = []
27 |         for column in ["image_0", "image_1", "image_2", "image_3"]:
28 |             if doc[column] is not None:
29 |                 images.append(doc[column])
30 |         return images
31 | 
32 |     @staticmethod
33 |     def doc_to_id(doc) -> str:
34 |         return str(doc["question_id"])
35 | 
36 |     @staticmethod
37 |     def doc_to_answer(doc) -> str:
38 |         return doc["answer"]
39 | 
40 | 
41 | def test_task():
42 |     from eval_mm.tasks.task import TaskConfig
43 | 
44 |     task = JDocQA(TaskConfig())
45 |     ds = task.dataset
46 |     print(ds[0])
47 |     assert isinstance(task.doc_to_text(ds[0]), str)
48 |     assert isinstance(task.doc_to_visual(ds[0]), list)
49 |     assert isinstance(task.doc_to_visual(ds[0])[0], Image.Image)
50 |     assert isinstance(task.doc_to_id(ds[0]), str)
51 |     assert isinstance(task.doc_to_answer(ds[0]), str)
52 | 


--------------------------------------------------------------------------------
/src/eval_mm/tasks/jic_vqa.py:
--------------------------------------------------------------------------------
 1 | from PIL import Image
 2 | from datasets import Dataset, load_dataset
 3 | 
 4 | from .task import Task
 5 | import os
 6 | 
 7 | 
 8 | class JICVQA(Task):
 9 |     default_metric = "jic-vqa"
10 | 
11 |     @staticmethod
12 |     def _prepare_dataset() -> Dataset:
13 |         if not os.path.exists("dataset/jic_vqa.parquet"):
14 |             raise FileNotFoundError(
15 |                 "Dataset not found. Please run `scripts/prepare_jic_vqa.py` to prepare the dataset."
16 |             )
17 | 
18 |         dataset = load_dataset(
19 |             "parquet", data_files="dataset/jic_vqa.parquet", split="train"
20 |         )
21 |         return dataset
22 | 
23 |     @staticmethod
24 |     def doc_to_text(doc) -> str:
25 |         return doc["input_text"]
26 | 
27 |     @staticmethod
28 |     def doc_to_visual(doc) -> list[Image.Image]:
29 |         return [doc["image"]]
30 | 
31 |     @staticmethod
32 |     def doc_to_id(doc) -> str:
33 |         return str(doc["question_id"])
34 | 
35 |     @staticmethod
36 |     def doc_to_answer(doc) -> str:
37 |         return doc["answer"]
38 | 
39 | 
40 | def test_task():
41 |     from eval_mm.tasks.task import TaskConfig
42 | 
43 |     task = JICVQA(TaskConfig())
44 |     ds = task.dataset
45 |     print(ds[0])
46 |     assert isinstance(task.doc_to_text(ds[0]), str)
47 |     assert isinstance(task.doc_to_visual(ds[0]), list)
48 |     assert isinstance(task.doc_to_visual(ds[0])[0], Image.Image)
49 |     assert isinstance(task.doc_to_id(ds[0]), str)
50 |     assert isinstance(task.doc_to_answer(ds[0]), str)
51 | 


--------------------------------------------------------------------------------
/src/eval_mm/tasks/jmmmu.py:
--------------------------------------------------------------------------------
  1 | # Reference: https://github.com/EvolvingLMMs-Lab/lmms-eval/blob/main/lmms_eval/tasks/jmmmu/utils.py
  2 | 
  3 | from datasets import (
  4 |     Dataset,
  5 |     load_dataset,
  6 |     concatenate_datasets,
  7 |     get_dataset_config_names,
  8 | )
  9 | 
 10 | from .task import Task
 11 | 
 12 | import ast
 13 | import re
 14 | from PIL import Image
 15 | 
 16 | 
 17 | MULTI_CHOICE_PROMPT = (
 18 |     "与えられた選択肢の中から最も適切な回答のアルファベットを直接記入してください。"
 19 | )
 20 | OPEN_ENDED_PROMPT = "質問に対する回答を単語や短いフレーズで記入してください。"
 21 | 
 22 | 
 23 | def replace_images_tokens(input_string):
 24 |     for i in range(1, 8):
 25 |         question_text = f"<image {i}>"
 26 |         query_text = "<image>"
 27 |         if question_text in input_string:
 28 |             input_string = input_string.replace(question_text, query_text)
 29 |     return input_string
 30 | 
 31 | 
 32 | def parse_options(options):
 33 |     option_letters = [chr(ord("A") + i) for i in range(len(options))]
 34 |     choices_str = "\n".join(
 35 |         [
 36 |             f"{option_letter}. {option}"
 37 |             for option_letter, option in zip(option_letters, options)
 38 |         ]
 39 |     )
 40 |     return choices_str
 41 | 
 42 | 
 43 | def construct_prompt(doc):
 44 |     question = doc["question"]
 45 |     question = question.replace("<image1>", "<image 1>")
 46 |     if doc["question_type"] == "multiple-choice":
 47 |         # Weirdly, data["options"] is a string in MMMU Huggingface dataset
 48 |         parsed_options = parse_options(ast.literal_eval(doc["options"]))
 49 |         # parsed_options already prepends a newline so no need to add space here
 50 |         question = f"{question}\n{parsed_options}\n\n{MULTI_CHOICE_PROMPT}"
 51 |     else:
 52 |         question = f"{question}\n\n{OPEN_ENDED_PROMPT}"
 53 |     return question
 54 | 
 55 | 
 56 | def jmmmu_doc_to_text(doc):
 57 |     question = construct_prompt(doc)
 58 |     question = replace_images_tokens(question)  # TODO: check if this is necessary
 59 |     return question
 60 | 
 61 | 
 62 | def jmmmu_doc_to_visual(doc):
 63 |     prompt = construct_prompt(doc)
 64 | 
 65 |     image_tokens = re.findall(r"<image \d+>", prompt)
 66 |     # Remove <> and  swap space as _
 67 |     image_tokens = sorted(
 68 |         list(
 69 |             set(
 70 |                 [
 71 |                     image_token.strip("<>").replace(" ", "_")
 72 |                     for image_token in image_tokens
 73 |                 ]
 74 |             )
 75 |         )
 76 |     )
 77 |     visual = [doc[image_token].convert("RGB") for image_token in image_tokens]
 78 |     return visual
 79 | 
 80 | 
 81 | class JMMMU(Task):
 82 |     default_metric = "jmmmu"
 83 | 
 84 |     @staticmethod
 85 |     def _prepare_dataset() -> Dataset:
 86 |         configs = get_dataset_config_names("JMMMU/JMMMU")
 87 |         datasets = [
 88 |             load_dataset("JMMMU/JMMMU", name=subject, split="test")
 89 |             for subject in configs
 90 |         ]
 91 |         dataset = concatenate_datasets(datasets)
 92 |         dataset = dataset.map(
 93 |             lambda x: {
 94 |                 "input_text": jmmmu_doc_to_text(x),
 95 |                 "question_id": x["id"],
 96 |                 "answer": x["answer"],
 97 |             }
 98 |         )
 99 |         return dataset
100 | 
101 |     @staticmethod
102 |     def doc_to_text(doc) -> str:
103 |         return doc["input_text"]
104 | 
105 |     @staticmethod
106 |     def doc_to_visual(doc) -> list[Image.Image]:
107 |         return jmmmu_doc_to_visual(doc)
108 | 
109 |     @staticmethod
110 |     def doc_to_id(doc) -> str:
111 |         return str(doc["question_id"])
112 | 
113 |     @staticmethod
114 |     def doc_to_answer(doc) -> str:
115 |         return doc["answer"]
116 | 
117 | 
118 | def test_task():
119 |     from eval_mm.tasks.task import TaskConfig
120 | 
121 |     task = JMMMU(TaskConfig())
122 |     ds = task.dataset
123 |     print(ds[0])
124 |     assert isinstance(task.doc_to_text(ds[0]), str)
125 |     assert isinstance(task.doc_to_visual(ds[0]), list)
126 |     assert isinstance(task.doc_to_visual(ds[0])[0], Image.Image)
127 |     assert isinstance(task.doc_to_id(ds[0]), str)
128 |     assert isinstance(task.doc_to_answer(ds[0]), str)
129 | 


--------------------------------------------------------------------------------
/src/eval_mm/tasks/llava_bench_in_the_wild.py:
--------------------------------------------------------------------------------
 1 | from datasets import Dataset, load_dataset
 2 | 
 3 | from .task import Task
 4 | from PIL import Image
 5 | 
 6 | 
 7 | class LlavaBenchIntheWild(Task):
 8 |     default_metric = "rougel"
 9 | 
10 |     @staticmethod
11 |     def _prepare_dataset() -> Dataset:
12 |         # データセットをロード
13 |         ds = load_dataset("lmms-lab/llava-bench-in-the-wild", split="train")
14 |         ds = ds.rename_column("question", "input_text")
15 |         ds = ds.rename_column("gpt_answer", "answer")
16 |         return ds
17 | 
18 |     @staticmethod
19 |     def doc_to_text(doc) -> str:
20 |         return doc["input_text"]
21 | 
22 |     @staticmethod
23 |     def doc_to_visual(doc) -> list[Image.Image]:
24 |         return [doc["image"]]
25 | 
26 |     @staticmethod
27 |     def doc_to_id(doc) -> str:
28 |         return str(doc["question_id"])
29 | 
30 |     @staticmethod
31 |     def doc_to_answer(doc) -> str:
32 |         return doc["answer"]
33 | 
34 | 
35 | def test_task():
36 |     from eval_mm.tasks.task import TaskConfig
37 | 
38 |     task = LlavaBenchIntheWild(TaskConfig())
39 |     ds = task.dataset
40 |     print(ds[0])
41 |     assert isinstance(task.doc_to_text(ds[0]), str)
42 |     assert isinstance(task.doc_to_visual(ds[0]), list)
43 |     assert isinstance(task.doc_to_visual(ds[0])[0], Image.Image)
44 |     assert isinstance(task.doc_to_id(ds[0]), str)
45 |     assert isinstance(task.doc_to_answer(ds[0]), str)
46 | 


--------------------------------------------------------------------------------
/src/eval_mm/tasks/mecha_ja.py:
--------------------------------------------------------------------------------
  1 | from datasets import Dataset, load_dataset
  2 | from .task import Task
  3 | from PIL import Image
  4 | 
  5 | MULTI_CHOICE_PROMPT = (
  6 |     "与えられた選択肢の中から最も適切な回答のアルファベットを直接記入してください。"
  7 | )
  8 | 
  9 | OPTIONS_MAP = {
 10 |     0: "A",
 11 |     1: "B",
 12 |     2: "C",
 13 |     3: "D",
 14 | }
 15 | 
 16 | 
 17 | def parse_options(options):
 18 |     option_letters = [chr(ord("A") + i) for i in range(len(options))]
 19 |     choices_str = "\n".join(
 20 |         [
 21 |             f"{option_letter}. {option}"
 22 |             for option_letter, option in zip(option_letters, options)
 23 |         ]
 24 |     )
 25 |     return choices_str
 26 | 
 27 | 
 28 | def construct_prompt(question, options):
 29 |     parsed_options = parse_options(options)
 30 |     return f"{question}\n{parsed_options}\n\n{MULTI_CHOICE_PROMPT}"
 31 | 
 32 | 
 33 | def rotate_single_example(doc):
 34 |     """
 35 |     1つの doc に対して4パターン (options を左回転0,1,2,3) を作り、
 36 |     その際に answer も回転にあわせて更新し、各パターンの辞書をリストで返す。
 37 |     """
 38 |     base_opts = doc["options"]
 39 |     n = len(base_opts)  # 4想定
 40 |     orig_answer_idx = doc["answer"]  # 0~3
 41 |     results = []
 42 |     for i in range(n):
 43 |         rotated_options = base_opts[i:] + base_opts[:i]
 44 |         new_answer_idx = (orig_answer_idx - i) % n
 45 |         new_doc = dict(doc)
 46 |         new_doc["options"] = rotated_options
 47 |         new_doc["answer"] = new_answer_idx
 48 |         new_doc["answer_text"] = OPTIONS_MAP[new_answer_idx]
 49 |         new_doc["question_id"] = f"{doc['question_id']}_rot{i}"
 50 |         new_doc["input_text"] = construct_prompt(
 51 |             new_doc["question"], new_doc["options"]
 52 |         )
 53 |         results.append(new_doc)
 54 |     return results
 55 | 
 56 | 
 57 | def rotate_options_fn(batch):
 58 |     """
 59 |     batched=True 用の関数。
 60 |     batch: dict of lists
 61 |     これを1つずつ取り出して rotate_single_example で4つに拡張し、
 62 |     最終的に「列ごとのリスト」を返す。
 63 |     """
 64 |     # 出力用の空リストを用意
 65 |     new_batch = {
 66 |         "question": [],
 67 |         "options": [],
 68 |         "answer": [],
 69 |         "answer_text": [],
 70 |         "image": [],
 71 |         "question_id": [],
 72 |         "answer_type": [],
 73 |         "background_text": [],
 74 |         "input_text": [],
 75 |     }
 76 | 
 77 |     num_examples = len(batch["question_id"])
 78 |     for i in range(num_examples):
 79 |         # i番目のサンプル doc をまとめる
 80 |         doc = {
 81 |             "question": batch["question"][i],
 82 |             "options": batch["options"][i],
 83 |             "answer": batch["answer"][i],
 84 |             "answer_type": batch["answer_type"][i],
 85 |             "image": batch["image"][i],
 86 |             "background_text": batch["background_text"][i],
 87 |             "question_id": batch["question_id"][i],
 88 |         }
 89 |         # rotateして複数サンプルに展開
 90 |         rotated_docs = rotate_single_example(doc)
 91 |         # new_batch にappend
 92 |         for rd in rotated_docs:
 93 |             new_batch["question"].append(rd["question"])
 94 |             new_batch["options"].append(rd["options"])
 95 |             new_batch["answer"].append(rd["answer"])
 96 |             new_batch["answer_text"].append(rd["answer_text"])
 97 |             new_batch["image"].append(rd["image"])
 98 |             new_batch["question_id"].append(rd["question_id"])
 99 |             new_batch["answer_type"].append(rd["answer_type"])
100 |             new_batch["background_text"].append(rd["background_text"])
101 |             new_batch["input_text"].append(rd["input_text"])
102 | 
103 |     return new_batch
104 | 
105 | 
106 | class MECHAJa(Task):
107 |     default_metric = "mecha-ja"
108 | 
109 |     def _prepare_dataset(self) -> Dataset:
110 |         ds = load_dataset("llm-jp/MECHA-ja", split="test")
111 | 
112 |         ds = ds.map(
113 |             lambda x, idx: {
114 |                 "question": x["question"],
115 |                 "options": x["options"],
116 |                 "answer": x["answer"],  # 0~3
117 |                 "answer_type": x["answer_type"],
118 |                 "image": x["image"],
119 |                 "background_text": x["background_text"],
120 |                 "question_id": str(idx),
121 |                 "answer_text": OPTIONS_MAP[x["answer"]],
122 |                 "input_text": construct_prompt(x["question"], x["options"]),
123 |             },
124 |             with_indices=True,
125 |         )
126 |         # rotate_choices が有効なら4パターン展開
127 |         if self.config.rotate_choices:
128 |             ds = ds.map(
129 |                 rotate_options_fn,
130 |                 num_proc=8,
131 |                 batched=True,
132 |                 remove_columns=ds.column_names,
133 |             )
134 | 
135 |         return ds
136 | 
137 |     @staticmethod
138 |     def doc_to_text(doc) -> str:
139 |         return doc["input_text"]
140 | 
141 |     @staticmethod
142 |     def doc_to_visual(doc) -> list[Image.Image]:
143 |         return [doc["image"]]
144 | 
145 |     @staticmethod
146 |     def doc_to_id(doc) -> str:
147 |         return str(doc["question_id"])
148 | 
149 |     @staticmethod
150 |     def doc_to_answer(doc) -> str:
151 |         return doc["answer_text"]
152 | 
153 | 
154 | def test_task():
155 |     from eval_mm.tasks.task import TaskConfig
156 | 
157 |     task = MECHAJa(TaskConfig())
158 |     ds = task.dataset
159 |     print(ds[0])
160 |     assert isinstance(task.doc_to_text(ds[0]), str)
161 |     assert isinstance(task.doc_to_visual(ds[0]), list)
162 |     assert isinstance(task.doc_to_visual(ds[0])[0], Image.Image)
163 |     assert isinstance(task.doc_to_id(ds[0]), str)
164 |     assert isinstance(task.doc_to_answer(ds[0]), str)
165 | 
166 |     task = MECHAJa(TaskConfig(rotate_choices=True))
167 |     ds = task.dataset
168 |     print(ds[0])
169 |     assert isinstance(task.doc_to_text(ds[0]), str)
170 |     assert isinstance(task.doc_to_visual(ds[0]), list)
171 |     assert isinstance(task.doc_to_visual(ds[0])[0], Image.Image)
172 |     assert isinstance(task.doc_to_id(ds[0]), str)
173 |     assert isinstance(task.doc_to_answer(ds[0]), str)
174 |     assert ds[0]["question_id"] == "0_rot0"
175 | 


--------------------------------------------------------------------------------
/src/eval_mm/tasks/mmmlu.py:
--------------------------------------------------------------------------------
 1 | from datasets import load_dataset, Dataset
 2 | 
 3 | from .task import Task
 4 | from PIL import Image
 5 | 
 6 | 
 7 | class MMMLU(Task):
 8 |     default_metric = "exact-match"
 9 | 
10 |     @staticmethod
11 |     def _prepare_dataset() -> Dataset:
12 |         ds = load_dataset("openai/MMMLU", "JA_JP", split="test")
13 | 
14 |         # ['Unnamed: 0', 'Question', 'A', 'B', 'C', 'D', 'Answer', 'Subject'],
15 |         def build_prompt(example):
16 |             return f"{example['Question']} A: {example['A']} B: {example['B']} C: {example['C']} D: {example['D']}. Output only the letter of the correct answer. Answer:"
17 | 
18 |         ds = ds.rename_column("Answer", "answer")
19 |         ds = ds.rename_column("Unnamed: 0", "question_id")
20 |         ds = ds.map(
21 |             lambda example: {"input_text": build_prompt(example)},
22 |             remove_columns=["Question", "A", "B", "C", "D", "Subject"],
23 |         )
24 | 
25 |         return ds
26 | 
27 |     @staticmethod
28 |     def doc_to_text(doc) -> str:
29 |         return doc["input_text"]
30 | 
31 |     @staticmethod
32 |     def doc_to_visual(doc) -> list[Image.Image]:
33 |         return []
34 | 
35 |     @staticmethod
36 |     def doc_to_id(doc) -> str:
37 |         return str(doc["question_id"])
38 | 
39 |     @staticmethod
40 |     def doc_to_answer(doc) -> str:
41 |         return doc["answer"]
42 | 
43 | 
44 | def test_task():
45 |     from .task import TaskConfig
46 | 
47 |     task = MMMLU(TaskConfig())
48 |     ds = task.dataset
49 |     assert isinstance(task.doc_to_text(ds[0]), str)
50 |     assert isinstance(task.doc_to_visual(ds[0]), list)
51 |     assert isinstance(task.doc_to_id(ds[0]), str)
52 |     assert isinstance(task.doc_to_answer(ds[0]), str)
53 | 


--------------------------------------------------------------------------------
/src/eval_mm/tasks/mmmu.py:
--------------------------------------------------------------------------------
  1 | # Reference: https://github.com/EvolvingLMMs-Lab/lmms-eval/blob/main/lmms_eval/tasks/mmmu/utils.py
  2 | 
  3 | from datasets import (
  4 |     Dataset,
  5 |     load_dataset,
  6 |     concatenate_datasets,
  7 |     get_dataset_config_names,
  8 | )
  9 | 
 10 | from .task import Task
 11 | from PIL import Image
 12 | 
 13 | import ast
 14 | import re
 15 | 
 16 | MULTI_CHOICE_PROMPT = "Answer with the option's letter from the given choices directly."
 17 | OPEN_ENDED_PROMPT = "Answer the question using a single word or phrase."
 18 | 
 19 | 
 20 | def replace_images_tokens(input_string):
 21 |     for i in range(1, 8):
 22 |         question_text = f"<image {i}>"
 23 |         query_text = "<image>"
 24 |         if question_text in input_string:
 25 |             input_string = input_string.replace(question_text, query_text)
 26 |     return input_string
 27 | 
 28 | 
 29 | def parse_options(options):
 30 |     option_letters = [chr(ord("A") + i) for i in range(len(options))]
 31 |     choices_str = "\n".join(
 32 |         [
 33 |             f"{option_letter}. {option}"
 34 |             for option_letter, option in zip(option_letters, options)
 35 |         ]
 36 |     )
 37 |     return choices_str
 38 | 
 39 | 
 40 | def construct_prompt(doc):
 41 |     question = doc["question"]
 42 |     question = question.replace("<image1>", "<image 1>")
 43 |     if doc["question_type"] == "multiple-choice":
 44 |         # Weirdly, data["options"] is a string in MMMU Huggingface dataset
 45 |         parsed_options = parse_options(ast.literal_eval(doc["options"]))
 46 |         # parsed_options already prepends a newline so no need to add space here
 47 |         question = f"{question}\n{parsed_options}\n\n{MULTI_CHOICE_PROMPT}"
 48 |     else:
 49 |         question = f"{question}\n\n{OPEN_ENDED_PROMPT}"
 50 |     return question
 51 | 
 52 | 
 53 | def mmmu_doc_to_text(doc):
 54 |     question = construct_prompt(doc)
 55 |     question = replace_images_tokens(question)  # TODO: check if this is necessary
 56 |     return question
 57 | 
 58 | 
 59 | def mmmu_doc_to_visual(doc):
 60 |     prompt = construct_prompt(doc)
 61 | 
 62 |     image_tokens = re.findall(r"<image \d+>", prompt)
 63 |     # Remove <> and  swap space as _
 64 |     image_tokens = sorted(
 65 |         list(
 66 |             set(
 67 |                 [
 68 |                     image_token.strip("<>").replace(" ", "_")
 69 |                     for image_token in image_tokens
 70 |                 ]
 71 |             )
 72 |         )
 73 |     )
 74 |     visual = [doc[image_token].convert("RGB") for image_token in image_tokens]
 75 |     return visual
 76 | 
 77 | 
 78 | class MMMU(Task):
 79 |     default_metric = "mmmu"
 80 | 
 81 |     @staticmethod
 82 |     def _prepare_dataset() -> Dataset:
 83 |         configs = get_dataset_config_names("MMMU/MMMU")
 84 |         datasets = [
 85 |             load_dataset("MMMU/MMMU", name=subject, split="validation")
 86 |             for subject in configs
 87 |         ]
 88 |         dataset = concatenate_datasets(datasets)
 89 |         dataset = dataset.map(
 90 |             lambda x: {
 91 |                 "input_text": mmmu_doc_to_text(x),
 92 |                 "question_id": x["id"],
 93 |                 "answer": x["answer"],
 94 |             }
 95 |         )
 96 |         return dataset
 97 | 
 98 |     @staticmethod
 99 |     def doc_to_text(doc) -> str:
100 |         return doc["input_text"]
101 | 
102 |     @staticmethod
103 |     def doc_to_visual(doc) -> list[Image.Image]:
104 |         return mmmu_doc_to_visual(doc)
105 | 
106 |     @staticmethod
107 |     def doc_to_id(doc) -> str:
108 |         return str(doc["question_id"])
109 | 
110 |     @staticmethod
111 |     def doc_to_answer(doc) -> str:
112 |         return doc["answer"]
113 | 
114 | 
115 | def test_task():
116 |     from eval_mm.tasks.task import TaskConfig
117 | 
118 |     task = MMMU(TaskConfig())
119 |     ds = task.dataset
120 |     print(ds[0])
121 |     assert isinstance(task.doc_to_text(ds[0]), str)
122 |     assert isinstance(task.doc_to_visual(ds[0]), list)
123 |     assert isinstance(task.doc_to_visual(ds[0])[0], Image.Image)
124 |     assert isinstance(task.doc_to_id(ds[0]), str)
125 |     assert isinstance(task.doc_to_answer(ds[0]), str)
126 | 


--------------------------------------------------------------------------------
/src/eval_mm/tasks/mnist.py:
--------------------------------------------------------------------------------
 1 | from eval_mm.tasks.task import Task
 2 | from datasets import load_dataset, Dataset
 3 | from PIL import Image
 4 | 
 5 | 
 6 | class MNIST(Task):
 7 |     def __init__(self, config):
 8 |         super().__init__(config)
 9 | 
10 |     @staticmethod
11 |     def _prepare_dataset() -> Dataset:
12 |         ds = load_dataset("ylecun/mnist", split="test")
13 |         ds = ds.map(lambda example, idx: {"question_id": idx}, with_indices=True)
14 |         return ds
15 | 
16 |     @staticmethod
17 |     def doc_to_text(doc) -> str:
18 |         return "画像に写っている数字は何ですか？ 数字のみを出力してください。"
19 | 
20 |     @staticmethod
21 |     def doc_to_visual(doc) -> list[Image.Image]:
22 |         return [doc["image"]]
23 | 
24 |     @staticmethod
25 |     def doc_to_id(doc) -> str:
26 |         return str(doc["question_id"])
27 | 
28 |     @staticmethod
29 |     def doc_to_answer(doc) -> str:
30 |         return str(doc["label"])
31 | 
32 | 
33 | def test_task():
34 |     from eval_mm.tasks.task import TaskConfig
35 | 
36 |     task = MNIST(TaskConfig())
37 |     ds = task.dataset
38 |     print(ds[0])
39 |     assert isinstance(task.doc_to_text(ds[0]), str)
40 |     assert isinstance(task.doc_to_visual(ds[0]), list)
41 |     assert isinstance(task.doc_to_visual(ds[0])[0], Image.Image)
42 |     assert isinstance(task.doc_to_id(ds[0]), str)
43 |     assert isinstance(task.doc_to_answer(ds[0]), str)
44 | 


--------------------------------------------------------------------------------
/src/eval_mm/tasks/task.py:
--------------------------------------------------------------------------------
 1 | import abc
 2 | 
 3 | from dataclasses import dataclass
 4 | from datasets import Dataset
 5 | from PIL import Image
 6 | 
 7 | 
 8 | @dataclass
 9 | class TaskConfig:
10 |     max_dataset_len: int | None = None
11 |     rotate_choices: bool = False
12 | 
13 | 
14 | class Task(abc.ABC):
15 |     def __init__(self, config: TaskConfig):
16 |         self.config = config
17 | 
18 |         if self.config.max_dataset_len is not None:
19 |             self.dataset = self._prepare_dataset().select(
20 |                 range(self.config.max_dataset_len)
21 |             )
22 |         else:
23 |             self.dataset = self._prepare_dataset()
24 | 
25 |     @abc.abstractmethod
26 |     def _prepare_dataset(self) -> Dataset:
27 |         """Prepares the dataset."""
28 |         pass
29 | 
30 |     @abc.abstractmethod
31 |     def doc_to_text(self, doc) -> str:
32 |         """Converts a document to text."""
33 |         pass
34 | 
35 |     @abc.abstractmethod
36 |     def doc_to_visual(self, doc) -> list[Image.Image]:
37 |         """Converts a document to visual."""
38 |         pass
39 | 
40 |     @abc.abstractmethod
41 |     def doc_to_id(self, doc) -> str:
42 |         """Converts a document to id."""
43 |         pass
44 | 
45 |     @abc.abstractmethod
46 |     def doc_to_answer(self, doc) -> str:
47 |         """Converts a document to answer."""
48 |         pass
49 | 


--------------------------------------------------------------------------------
/src/eval_mm/tasks/task_registry.py:
--------------------------------------------------------------------------------
 1 | from .ja_vg_vqa_500 import JaVGVQA500
 2 | from .japanese_heron_bench import JapaneseHeronBench
 3 | from .ja_vlm_bench_in_the_wild import JaVLMBenchIntheWild
 4 | from .jmmmu import JMMMU
 5 | from .ja_multi_image_vqa import JAMultiImageVQA
 6 | from .jdocqa import JDocQA
 7 | from .mmmu import MMMU
 8 | from .llava_bench_in_the_wild import LlavaBenchIntheWild
 9 | from .jic_vqa import JICVQA
10 | from .mecha_ja import MECHAJa
11 | from .mmmlu import MMMLU
12 | from .mnist import MNIST
13 | from .cc_ocr import CCOCR
14 | from .cvqa import CVQA
15 | from .task import Task, TaskConfig
16 | from typing import Callable
17 | 
18 | 
19 | class TaskRegistry:
20 |     """Registry to map metrics to their corresponding scorer classes."""
21 | 
22 |     _tasks: dict[str, Callable[[TaskConfig], Task]] = {
23 |         "japanese-heron-bench": JapaneseHeronBench,
24 |         "ja-vlm-bench-in-the-wild": JaVLMBenchIntheWild,
25 |         "ja-vg-vqa-500": JaVGVQA500,
26 |         "jmmmu": JMMMU,
27 |         "ja-multi-image-vqa": JAMultiImageVQA,
28 |         "jdocqa": JDocQA,
29 |         "mmmu": MMMU,
30 |         "llava-bench-in-the-wild": LlavaBenchIntheWild,
31 |         "jic-vqa": JICVQA,
32 |         "mecha-ja": MECHAJa,
33 |         "mmmlu": MMMLU,
34 |         "mnist": MNIST,
35 |         "cc-ocr": CCOCR,
36 |         "cvqa": CVQA,
37 |     }
38 | 
39 |     @classmethod
40 |     def get_task_list(cls):
41 |         return list(cls._tasks.keys())
42 | 
43 |     @classmethod
44 |     def load_task(cls, task_name: str, task_config: TaskConfig = TaskConfig()) -> Task:
45 |         try:
46 |             return cls._tasks[task_name](task_config)  # type: ignore
47 |         except KeyError:
48 |             raise ValueError(f"Task '{task_name}' is not supported.")
49 | 


--------------------------------------------------------------------------------
/src/eval_mm/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/llm-jp/llm-jp-eval-mm/f0998c316138ae6541b67a3bea03e9cbb0cf4a34/src/eval_mm/utils/__init__.py


--------------------------------------------------------------------------------
/src/eval_mm/utils/azure_client.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import asyncio
  3 | import logging
  4 | from openai import AsyncAzureOpenAI, AsyncOpenAI, APIError
  5 | from openai.types.chat import ChatCompletion
  6 | import backoff
  7 | from tqdm import tqdm
  8 | 
  9 | logger = logging.getLogger(__name__)
 10 | 
 11 | 
 12 | @backoff.on_exception(backoff.expo, APIError, max_tries=5)
 13 | async def call_openai(
 14 |     client: AsyncOpenAI,
 15 |     model_name: str,
 16 |     messages_list: list[dict[str, str]],
 17 |     **kwargs,
 18 | ) -> ChatCompletion:
 19 |     return await client.chat.completions.create(
 20 |         model=model_name,
 21 |         messages=messages_list,
 22 |         **kwargs,
 23 |     )
 24 | 
 25 | 
 26 | class OpenAIChatAPI:
 27 |     def __init__(self) -> None:
 28 |         if os.getenv("AZURE_OPENAI_KEY") and os.getenv("AZURE_OPENAI_ENDPOINT"):
 29 |             self.client = AsyncAzureOpenAI(
 30 |                 api_key=os.getenv("AZURE_OPENAI_KEY"),
 31 |                 api_version="2023-05-15",
 32 |                 azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
 33 |             )
 34 |         elif os.getenv("OPENAI_API_KEY"):
 35 |             self.client = AsyncOpenAI()
 36 |         else:
 37 |             raise ValueError(
 38 |                 "API Key not found. Please set the OPENAI_API_KEY or AZURE_OPENAI_KEY environment variables."
 39 |             )
 40 | 
 41 |     async def _async_batch_run_chatgpt(
 42 |         self,
 43 |         messages_list: list[list[dict[str, str]]],
 44 |         stop_sequences: str | list[str] | None = None,
 45 |         max_new_tokens: int | None = None,
 46 |         model_name: str | None = None,
 47 |         **kwargs,
 48 |     ) -> list[ChatCompletion | Exception]:
 49 |         if stop_sequences is not None:
 50 |             if "stop" in kwargs:
 51 |                 raise ValueError("Specify only one: `stop_sequences` or `stop`.")
 52 |             kwargs["stop"] = stop_sequences
 53 | 
 54 |         if max_new_tokens is not None:
 55 |             if "max_tokens" in kwargs:
 56 |                 raise ValueError("Specify only one: `max_new_tokens` or `max_tokens`.")
 57 |             kwargs["max_tokens"] = max_new_tokens
 58 | 
 59 |         tasks = [
 60 |             asyncio.create_task(call_openai(self.client, model_name, ms, **kwargs))
 61 |             for ms in messages_list
 62 |         ]
 63 |         results: list[ChatCompletion | Exception] = await asyncio.gather(
 64 |             *tasks, return_exceptions=True
 65 |         )
 66 | 
 67 |         output: list[ChatCompletion | Exception] = []
 68 |         for i, result in enumerate(results):
 69 |             if isinstance(result, Exception):
 70 |                 logger.error(f"Error in task {i}: {result}")
 71 |                 output.append(None)
 72 |             else:
 73 |                 output.append(result)
 74 |         return output
 75 | 
 76 |     def batch_generate_chat_response(
 77 |         self,
 78 |         chat_messages_list: list[list[dict[str, str]]],
 79 |         model_name: str | None = None,
 80 |         **kwargs,
 81 |     ) -> list[str]:
 82 |         api_responses = asyncio.run(
 83 |             self._async_batch_run_chatgpt(
 84 |                 chat_messages_list, model_name=model_name, **kwargs
 85 |             )
 86 |         )
 87 |         model_outputs = []
 88 |         for res in api_responses:
 89 |             if isinstance(res, ChatCompletion):
 90 |                 model_output = res.choices[0].message.content
 91 |                 model_outputs.append(model_output)
 92 | 
 93 |                 logger.info(f"Model output: {model_output}")
 94 |                 logger.info(f"Usage: {res.usage}")
 95 |             else:
 96 |                 logger.error(f"Unexpected error: {res}")
 97 |                 model_outputs.append("")
 98 |         return model_outputs
 99 | 
100 |     def __repr__(self) -> str:
101 |         return f"{self.__class__.__name__}(client={self.client})"
102 | 
103 | 
104 | class MockChatAPI:
105 |     def __init__(self) -> None:
106 |         pass
107 | 
108 |     def batch_generate_chat_response(
109 |         self,
110 |         chat_messages_list: list[list[dict[str, str]]],
111 |         model_name: str | None = None,
112 |         **kwargs,
113 |     ) -> list[str]:
114 |         return ["Mock response"] * len(chat_messages_list)
115 | 
116 | 
117 | if __name__ == "__main__":
118 |     # Test code
119 |     client = OpenAIChatAPI()
120 |     messages_list = [
121 |         [{"role": "system", "content": "こんにちは"}],
122 |         [{"role": "user", "content": "今日の天気は？"}],
123 |     ]
124 | 
125 |     responses = client.batch_generate_chat_response(
126 |         messages_list, model_name="gpt-4o-mini-2024-07-18"
127 |     )
128 |     print(responses)
129 | 
130 |     # Example dataset
131 |     dataset = [messages_list[0] for _ in range(10)]
132 | 
133 |     # Batch processing with progress bar
134 |     with tqdm(total=len(dataset)) as pbar:
135 |         batch_size = 5
136 |         for i in range(0, len(dataset), batch_size):
137 |             items = dataset[i : i + batch_size]
138 |             responses = client.batch_generate_chat_response(
139 |                 items, model_name="gpt-4o-mini-2024-07-18"
140 |             )
141 |             print(responses)
142 |             pbar.update(len(items))
143 | 


--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
1 | uv run pytest src/eval_mm/tasks/*.py
2 | uv run pytest src/eval_mm/metrics/*.py
3 | 


--------------------------------------------------------------------------------
/test_model.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -eux  # エラーが発生したらスクリプトを停止する
 3 | 
 4 | # Set CUDA devices
 5 | export CUDA_VISIBLE_DEVICES=0
 6 | 
 7 | # Model name to group name mapping
 8 | declare -A MODEL_GROUP_MAP=(
 9 |     ["stabilityai/japanese-instructblip-alpha"]="normal"
10 |     ["stabilityai/japanese-stable-vlm"]="calm"
11 |     ["cyberagent/llava-calm2-siglip"]="calm"
12 |     ["llava-hf/llava-1.5-7b-hf"]="normal"
13 |     ["llava-hf/llava-v1.6-mistral-7b-hf"]="normal"
14 |     ["neulab/Pangea-7B-hf"]="sarashina"
15 |     ["meta-llama/Llama-3.2-11B-Vision-Instruct"]="normal"
16 |     ["OpenGVLab/InternVL2-8B"]="normal"
17 |     ["Qwen/Qwen2-VL-7B-Instruct"]="normal"
18 |     ["Qwen/Qwen2.5-VL-7B-Instruct"]="normal"
19 |     ["gpt-4o-2024-05-13"]="normal"
20 |     ["mistralai/Pixtral-12B-2409"]="pixtral"
21 |     ["llm-jp/llm-jp-3-vila-14b"]="vilaja"
22 |     ["Efficient-Large-Model/VILA1.5-13b"]="vilaja"
23 |     ["SakanaAI/Llama-3-EvoVLM-JP-v2"]="evovlm"
24 |     ["google/gemma-3-4b-it"]="normal"
25 |     ["sbintuitions/sarashina2-vision-8b"]="sarashina"
26 |     ["microsoft/Phi-4-multimodal-instruct"]="phi"
27 |     ["MIL-UT/Asagi-14B"]="normal"
28 | )
29 | 
30 | for model_name in "${!MODEL_GROUP_MAP[@]}"; do
31 |     model_group=${MODEL_GROUP_MAP[$model_name]}
32 |     uv sync --group $model_group
33 |     uv run --group $model_group  python examples/test_model.py \
34 |         --model_id "$model_name"
35 | done
36 | 


--------------------------------------------------------------------------------
/tips/evaluation.md:
--------------------------------------------------------------------------------
 1 | # Tips for Evaluation
 2 | 
 3 | 
 4 | ## LLM-as-a-Judge
 5 | 
 6 | *You should use valid prompts for the LLM-as-a-Judge method.*
 7 | 
 8 | Case1:
 9 | 
10 | When using the following prompts, empty prediction sometimes scored as 5.
11 | ```
12 | # Instruction
13 | You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models.
14 | We will provide you with the user prompt and an AI-generated responses.
15 | You should first read the user prompt carefully for analyzing the task, and then evaluate the quality of the responses based on and rules provided in the Evaluation section below.
16 | 
17 | # Evaluation
18 | ## Metric Definition
19 | You will be assessing question answering quality, which measures the overall quality of the answer to the question in the user prompt. Pay special attention to length constraints, such as in X words or in Y sentences. The instruction for performing a question-answering task is provided in the user prompt. The response should not contain information that is not present in the context (if it is provided).
20 | 
21 | You will assign the writing response a score from 5, 4, 3, 2, 1, following the Rating Rubric and Evaluation Steps.
22 | Give step-by-step explanations for your scoring, and only choose scores from 5, 4, 3, 2, 1.
23 | 
24 | ## Criteria Definition
25 | Instruction following: The response demonstrates a clear understanding of the question answering task instructions, satisfying all of the instruction's requirements.
26 | Groundedness: The response contains information included only in the context if the context is present in the user prompt. The response does not reference any outside information.
27 | Completeness: The response completely answers the question with sufficient detail.
28 | Fluent: The response is well-organized and easy to read.
29 | 
30 | ## Rating Rubric
31 | 5: (Very good). The answer follows instructions, is grounded, complete, and fluent.
32 | 4: (Good). The answer follows instructions, is grounded, complete, but is not very fluent.
33 | 3: (Ok). The answer mostly follows instructions, is grounded, answers the question partially and is not very fluent.
34 | 2: (Bad). The answer does not follow the instructions very well, is incomplete or not fully grounded.
35 | 1: (Very bad). The answer does not follow the instructions, is wrong and not grounded.
36 | 
37 | ## Evaluation Steps
38 | STEP 1: Assess the response in aspects of instruction following, groundedness,completeness, and fluency according to the criteria.
39 | STEP 2: Provide overall score based on the rubric in the format of `Score: X` where X is the score you assign to the response.
40 | 
41 | # Question, Reference Answer and AI-generated Response
42 | ## Question
43 | {input_text}
44 | 
45 | ## Reference Answer
46 | {answer}
47 | 
48 | ## AI-generated Response
49 | {pred}
50 | ```
51 | 
52 | When using the above prompts, the following combinations of (question, answer, pred) were scored as [5, 1, 5]. Ideally, a score of [1, 1, 1] should be given.
53 | 
54 | ```
55 | questions = [“前年の合計所得が33万円以下の世帯の場合、軽減の割合は何割ですか。\n解答は数量のみで答えてください。“, “3つのラベルはどのような違いがありますか。\n解答は自由に記述してください。“, “国内の日本語教育の概要において、外国人に対する日本語教育の現状として、日本語教師数の数や学習者の数はどうなっていますか。\n解答は自由に記述してください。“]
56 | answers = [“7割です“, “下部に文字がプリントされているかどうか、プリントの内容が「CARBONNEUTRAL」または「CERTIFIEDMODEL」かの違いがあります。“, “国内の日本語教育の概要において、外国人に対する日本語教育の現状として、日本語教師数の数は34392人、学習者の数は139613人となっています。“]
57 | preds = [“”, “”, “”]
58 | 
59 | # scores: [5, 1, 5]
60 | ```
61 | 
62 | When the order of the answer and pred in the instruction prompt was reversed, they were correctly scored as [1,1,1]. This suggests that Judger may not be able to clearly distinguish between answer and pred when using the above prompt. The instruction prompts should be modified to clearly distinguish between “answer” and “pred”.
63 | 


--------------------------------------------------------------------------------