├── .env.sample ├── .github └── workflows │ ├── deploy-github-pages.yml │ ├── python-publish.yml │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .python-version ├── LICENSE ├── README.md ├── assets ├── streamlit_visualization.png └── teaser.png ├── eval_all.sh ├── eval_with_vllm.sh ├── examples ├── asagi.py ├── base_vllm.py ├── base_vlm.py ├── evovlm_jp_v1.py ├── gemma3.py ├── gpt4o.py ├── heron_nvila.py ├── internvl2.py ├── japanese_instructblip_alpha.py ├── japanese_stable_vlm.py ├── llama_3_2_vision.py ├── llama_3_evovlm_jp_v2.py ├── llava_1_5.py ├── llava_1_6_mistral_hf.py ├── llava_calm2_siglip.py ├── llm_jp_3_vila.py ├── model_table.py ├── pangea_hf.py ├── phi4_multimodal.py ├── pixtral.py ├── qwen2_5_vl.py ├── qwen2_vl.py ├── sample.py ├── sample_vllm.py ├── sarashina2_vision.py ├── test_model.py ├── utils.py ├── vila.py ├── vllm_registry.py └── xcomposer2d5.py ├── github_pages ├── .gitignore ├── .prettierrc ├── README.md ├── package-lock.json ├── package.json ├── public │ ├── dataset_url.json │ ├── default_metrics.json │ ├── index.html │ └── leaderboard.json └── src │ ├── Author.css │ ├── Author.js │ ├── BibTex.css │ ├── BibTex.js │ ├── Figure.css │ ├── Figure.js │ ├── Footer.css │ ├── Footer.js │ ├── Introduction.css │ ├── Introduction.js │ ├── Leaderboard.css │ ├── Leaderboard.js │ ├── LinkButton.css │ ├── LinkButton.js │ ├── Main.css │ ├── Main.js │ ├── Method.css │ ├── Method.js │ ├── PageLayout.css │ ├── PageLayout.js │ ├── PaperMetaData.css │ ├── PaperMetaData.js │ ├── Result.css │ ├── Result.js │ ├── assets │ └── teaser.png │ ├── index.css │ ├── index.js │ ├── logo.svg │ ├── reportWebVitals.js │ └── setupTests.js ├── pyproject.toml ├── scripts ├── browse_prediction.py ├── consistency_mecha_ja.py ├── make_leaderboard.py └── prepare_jic_vqa.py ├── src └── eval_mm │ ├── __init__.py │ ├── _version.py │ ├── metrics │ ├── __init__.py │ ├── cc_ocr_scorer.py │ ├── exact_match_scorer.py │ ├── heron_bench_scorer.py │ ├── jdocqa_scorer.py │ ├── jic_vqa_scorer.py │ ├── jmmmu_scorer.py │ ├── llm_as_a_judge_scorer.py │ ├── mecha_ja_scorer.py │ ├── mmmu_scorer.py │ ├── rougel_scorer.py │ ├── scorer.py │ ├── scorer_registry.py │ └── substring_match_scorer.py │ ├── tasks │ ├── __init__.py │ ├── cc_ocr.py │ ├── cvqa.py │ ├── ja_multi_image_vqa.py │ ├── ja_vg_vqa_500.py │ ├── ja_vlm_bench_in_the_wild.py │ ├── japanese_heron_bench.py │ ├── jdocqa.py │ ├── jic_vqa.py │ ├── jmmmu.py │ ├── llava_bench_in_the_wild.py │ ├── mecha_ja.py │ ├── mmmlu.py │ ├── mmmu.py │ ├── mnist.py │ ├── task.py │ └── task_registry.py │ └── utils │ ├── __init__.py │ └── azure_client.py ├── test.sh ├── test_model.sh └── tips └── evaluation.md /.env.sample: -------------------------------------------------------------------------------- 1 | # For Azure OpenAI API 2 | AZURE_OPENAI_ENDPOINT= 3 | AZURE_OPENAI_KEY= 4 | # For OpenAI API 5 | OPENAI_API_KEY= 6 | -------------------------------------------------------------------------------- /.github/workflows/deploy-github-pages.yml: -------------------------------------------------------------------------------- 1 | name: Deploy GitHub Pages 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | paths: 8 | - .github/workflows/deploy-github-pages.yml 9 | - github_pages/** 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | 15 | defaults: 16 | run: 17 | working-directory: github_pages 18 | 19 | steps: 20 | - name: Checkout 21 | uses: actions/checkout@v4 22 | with: 23 | fetch-depth: 0 24 | 25 | - name: Instal Node.js 26 | uses: actions/setup-node@v4 27 | 28 
| - name: Install dependencies 29 | run: npm install 30 | 31 | - name: Build 32 | run: npm run build 33 | env: 34 | PUBLIC_URL: /llm-jp-eval-mm 35 | 36 | - name: Upload Pages artifact 37 | uses: actions/upload-pages-artifact@v3 38 | with: 39 | path: github_pages/build 40 | 41 | deploy: 42 | needs: build 43 | 44 | permissions: 45 | pages: write 46 | id-token: write 47 | 48 | environment: 49 | name: github-pages 50 | url: ${{ steps.deployment.outputs.page_url }} 51 | 52 | runs-on: ubuntu-latest 53 | 54 | steps: 55 | - name: Deploy to GitHub Pages 56 | uses: actions/deploy-pages@v4 57 | id: deployment 58 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | name: Release workflow 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v[0123456789].*" 7 | 8 | permissions: 9 | contents: read 10 | id-token: write 11 | 12 | jobs: 13 | release: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: checkout 17 | uses: actions/checkout@v4 18 | - name: setup python 19 | uses: actions/setup-python@v2 20 | with: 21 | python-version: "3.x" 22 | - name: build 23 | run: | 24 | python -m pip install --upgrade build hatch 25 | python -m hatch version "${GITHUB_REF_NAME}" 26 | python -m build 27 | - name: publish 28 | uses: pypa/gh-action-pypi-publish@release/v1 29 | with: 30 | password: ${{ secrets.PYPI_API_TOKEN }} 31 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test workflow 2 | 3 | on: 4 | push: 5 | 6 | jobs: 7 | uv-example: 8 | name: python 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - name: Install uv 15 | uses: astral-sh/setup-uv@v5 16 | 17 | - name: Install the project 18 | run: uv sync --dev 19 | 20 | - name: Run tests 21 | # For example, using `pytest` 22 | run: uv run pytest src/eval_mm/metrics/*.py 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | .static_storage/ 56 | .media/ 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # Environments 83 | .venv 84 | env/ 85 | venv/ 86 | ENV/ 87 | env.bak/ 88 | venv.bak/ 89 | .env 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | .spyproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ 103 | 104 | # weights and biases 105 | wandb/ 106 | outputs/ 107 | 108 | # config 109 | configs/config.yaml 110 | 111 | # debug 112 | dataset/ 113 | logs/ 114 | 115 | # submodules 116 | submodule/ 117 | 118 | # temporary 119 | notebooks/tmp 120 | tmp/ 121 | 122 | # verbose output 123 | *verbose.jsonl 124 | __depr__/ 125 | 126 | # examples/llava for evaluating LLM-jp-3 VILA 127 | examples/llava/* 128 | 129 | # experiments 130 | result/ 131 | 132 | # uv.lock 133 | uv.lock 134 | 135 | # cursor config 136 | .cursor/ 137 | .cursorrules 138 | .cursorignore 139 | 140 | # vscode 141 | .vscode/ 142 | 143 | # cache 144 | .cache/ 145 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | # Ruff version. 4 | rev: v0.1.4 5 | hooks: 6 | # Run the Ruff linter. 7 | - id: ruff 8 | exclude: "(__init__\\.py|migrations/)" 9 | args: ["--extend-ignore=F401,E501"] 10 | # Run the Ruff formatter. 
11 | - id: ruff-format 12 | exclude: "(__init__\\.py|migrations/)" 13 | 14 | 15 | - repo: https://github.com/pre-commit/pre-commit-hooks 16 | rev: v4.4.0 17 | hooks: 18 | - id: trailing-whitespace # 末尾の空白を除去 19 | - id: end-of-file-fixer # ファイル末の改行統一 20 | - id: check-merge-conflict # コンフリクト残りの検出 21 | - id: check-yaml # YAML構文確認 22 | - id: check-added-large-files # 巨大ファイルの誤add防止 23 | - id: no-commit-to-branch # main/master直コミット防止 24 | args: ["--branch", "main", "--branch", "master"] 25 | 26 | - repo: https://github.com/pre-commit/mirrors-mypy 27 | rev: v1.9.0 28 | hooks: 29 | - id: mypy 30 | additional_dependencies: [types-requests] 31 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.12.1 2 | -------------------------------------------------------------------------------- /assets/streamlit_visualization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llm-jp/llm-jp-eval-mm/f0998c316138ae6541b67a3bea03e9cbb0cf4a34/assets/streamlit_visualization.png -------------------------------------------------------------------------------- /assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llm-jp/llm-jp-eval-mm/f0998c316138ae6541b67a3bea03e9cbb0cf4a34/assets/teaser.png -------------------------------------------------------------------------------- /eval_all.sh: -------------------------------------------------------------------------------- 1 | # Set CUDA devices 2 | set -eux # エラーが発生したらスクリプトを停止する 3 | 4 | #export CUDA_VISIBLE_DEVICES=0 5 | 6 | # Model name to group name mapping 7 | declare -A MODEL_GROUP_MAP=( 8 | # ["stabilityai/japanese-instructblip-alpha"]="normal" 9 | # ["stabilityai/japanese-stable-vlm"]="normal" 10 | # ["cyberagent/llava-calm2-siglip"]="calm" 11 | # ["llava-hf/llava-1.5-7b-hf"]="normal" 12 | # ["llava-hf/llava-v1.6-mistral-7b-hf"]="normal" 13 | # ["neulab/Pangea-7B-hf"]="sarashina" 14 | # ["meta-llama/Llama-3.2-11B-Vision-Instruct"]="normal" 15 | # ["meta-llama/Llama-3.2-90B-Vision-Instruct"]="normal" 16 | # ["OpenGVLab/InternVL2-8B"]="normal" 17 | # ["OpenGVLab/InternVL2-26B"]="normal" 18 | # ["Qwen/Qwen2-VL-7B-Instruct"]="normal" 19 | # ["Qwen/Qwen2-VL-72B-Instruct"]="normal" 20 | # ["Qwen/Qwen2.5-VL-7B-Instruct"]="normal" 21 | # ["Qwen/Qwen2.5-VL-72B-Instruct"]="normal" 22 | # ["gpt-4o-2024-11-20"]="normal" 23 | # ["mistralai/Pixtral-12B-2409"]="pixtral" 24 | # ["llm-jp/llm-jp-3-vila-14b"]="vilaja" 25 | # ["Efficient-Large-Model/VILA1.5-13b"]="vilaja" 26 | # ["SakanaAI/Llama-3-EvoVLM-JP-v2"]="evovlm" 27 | # ["google/gemma-3-4b-it"]="normal" 28 | # ["google/gemma-3-12b-it"]="normal" 29 | # ["google/gemma-3-27b-it"]="normal" 30 | # ["sbintuitions/sarashina2-vision-8b"]="sarashina" 31 | # ["sbintuitions/sarashina2-vision-14b"]="sarashina" 32 | # ["microsoft/Phi-4-multimodal-instruct"]="phi" 33 | ["turing-motors/Heron-NVILA-Lite-15B"]="heron_nvila" 34 | ) 35 | 36 | # Task list 37 | declare -a task_list=( 38 | # "japanese-heron-bench" 39 | "ja-vlm-bench-in-the-wild" 40 | # "ja-vg-vqa-500" 41 | "jmmmu" 42 | "ja-multi-image-vqa" 43 | "jdocqa" 44 | "mmmu" 45 | "llava-bench-in-the-wild" 46 | # "jic-vqa" 47 | "mecha-ja" 48 | # "cc-ocr" 49 | # "cvqa" 50 | ) 51 | 52 | # Define metrics per task 53 | declare -A METRIC_MAP=( 54 | ["japanese-heron-bench"]="heron-bench" 55 | 
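# NOTE: metric names here are space-separated; $METRIC is expanded unquoted below, so each name is passed as a separate value to --metrics (which accepts multiple metrics).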
["ja-vlm-bench-in-the-wild"]="llm-as-a-judge rougel" 56 | ["ja-vg-vqa-500"]="llm-as-a-judge rougel" 57 | ["jmmmu"]="jmmmu" 58 | ["ja-multi-image-vqa"]="llm-as-a-judge rougel" 59 | ["jdocqa"]="jdocqa llm-as-a-judge" 60 | ["mmmu"]="mmmu" 61 | ["llava-bench-in-the-wild"]="llm-as-a-judge rougel" 62 | ["jic-vqa"]="jic-vqa" 63 | ["mecha-ja"]="mecha-ja" 64 | ["cc-ocr"]="cc-ocr" 65 | ["cvqa"]="substring-match" 66 | ) 67 | 68 | # Result directories 69 | declare -a result_dir_list=( 70 | "result" 71 | ) 72 | 73 | # Main evaluation loop 74 | for RESULT_DIR in "${result_dir_list[@]}"; do 75 | for task in "${task_list[@]}"; do 76 | METRIC=${METRIC_MAP[$task]} 77 | for model_name in "${!MODEL_GROUP_MAP[@]}"; do 78 | model_group=${MODEL_GROUP_MAP[$model_name]} 79 | uv sync --group $model_group 80 | uv run --group $model_group python examples/sample.py \ 81 | --model_id "$model_name" \ 82 | --task_id "$task" \ 83 | --metrics $METRIC \ 84 | --judge_model "gpt-4o-2024-11-20" \ 85 | --result_dir "$RESULT_DIR" 86 | done 87 | done 88 | done 89 | 90 | echo "All evaluations are done." 91 | -------------------------------------------------------------------------------- /eval_with_vllm.sh: -------------------------------------------------------------------------------- 1 | # Set CUDA devices 2 | set -eux # エラーが発生したらスクリプトを停止する 3 | 4 | #export CUDA_VISIBLE_DEVICES=0 5 | 6 | # Model name to group name mapping 7 | declare -A MODEL_GROUP_MAP=( 8 | ["Qwen/Qwen2.5-VL-3B-Instruct"]="normal" 9 | ["Qwen/Qwen2.5-VL-7B-Instruct"]="normal" 10 | ["Qwen/Qwen2.5-VL-32B-Instruct"]="normal" 11 | # ["Qwen/Qwen2.5-VL-72B-Instruct"]="normal" 12 | ["google/gemma-3-4b-it"]="normal" 13 | ["google/gemma-3-12b-it"]="normal" 14 | ["google/gemma-3-27b-it"]="normal" 15 | ) 16 | 17 | # Task list 18 | declare -a task_list=( 19 | "japanese-heron-bench" 20 | ) 21 | 22 | # Define metrics per task 23 | declare -A METRIC_MAP=( 24 | ["japanese-heron-bench"]="heron-bench" 25 | ["ja-vlm-bench-in-the-wild"]="llm-as-a-judge,rougel" 26 | ["ja-vg-vqa-500"]="llm-as-a-judge,rougel" 27 | ["jmmmu"]="jmmmu" 28 | ["ja-multi-image-vqa"]="llm-as-a-judge,rougel" 29 | ["jdocqa"]="jdocqa,llm-as-a-judge" 30 | ["mmmu"]="mmmu" 31 | ["llava-bench-in-the-wild"]="llm-as-a-judge,rougel" 32 | ["jic-vqa"]="jic-vqa" 33 | ["mecha-ja"]="mecha-ja" 34 | ) 35 | 36 | # Result directories 37 | declare -a result_dir_list=( 38 | "result" 39 | ) 40 | 41 | # Main evaluation loop 42 | for RESULT_DIR in "${result_dir_list[@]}"; do 43 | for task in "${task_list[@]}"; do 44 | METRIC=${METRIC_MAP[$task]} 45 | for model_name in "${!MODEL_GROUP_MAP[@]}"; do 46 | model_group=${MODEL_GROUP_MAP[$model_name]} 47 | uv sync --group vllm_normal 48 | uv run --group vllm_normal python examples/sample_vllm.py \ 49 | --model_id "$model_name" \ 50 | --task_id "$task" \ 51 | --metrics "$METRIC" \ 52 | --judge_model "gpt-4o-2024-11-20" \ 53 | --result_dir "$RESULT_DIR" \ 54 | --inference_only 55 | done 56 | done 57 | done 58 | 59 | echo "All evaluations are done." 
60 | -------------------------------------------------------------------------------- /examples/asagi.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | from transformers import AutoModel, AutoProcessor 4 | from base_vlm import BaseVLM 5 | from utils import GenerationConfig 6 | 7 | 8 | class VLM(BaseVLM): 9 | def __init__(self, model_id: str = "MIL-UT/Asagi-14B") -> None: 10 | self.model_id = model_id 11 | self.model = AutoModel.from_pretrained( 12 | self.model_id, 13 | trust_remote_code=True, 14 | torch_dtype=torch.bfloat16, 15 | device_map="auto", 16 | ) 17 | self.processor = AutoProcessor.from_pretrained(self.model_id) 18 | 19 | def generate( 20 | self, 21 | images: list[Image.Image] | None, 22 | text: str, 23 | gen_kwargs: GenerationConfig = GenerationConfig(), 24 | ) -> str: 25 | if images is None: 26 | images = [] 27 | 28 | prompt = f"""以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい。 29 | ### 指示: 30 | {""*len(images)} 31 | {text} 32 | ### 応答: 33 | """ 34 | 35 | inputs = self.processor(text=prompt, images=images, return_tensors="pt") 36 | 37 | inputs_text = self.processor.tokenizer(prompt, return_tensors="pt") 38 | inputs["input_ids"] = inputs_text["input_ids"] 39 | inputs["attention_mask"] = inputs_text["attention_mask"] 40 | inputs = { 41 | k: inputs[k].to(self.model.device) for k in inputs if k != "token_type_ids" 42 | } 43 | 44 | generate_ids = self.model.generate(**inputs, **gen_kwargs.__dict__) 45 | generated_text = self.processor.batch_decode( 46 | generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False 47 | )[0] 48 | # truncate the text to remove the prompt 49 | generated_text = generated_text.split("### 応答:")[1].strip() 50 | return generated_text 51 | 52 | 53 | if __name__ == "__main__": 54 | vlm = VLM() 55 | vlm.test_vlm() 56 | -------------------------------------------------------------------------------- /examples/base_vllm.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | from PIL import Image 3 | from utils import GenerationConfig 4 | from base_vlm import BaseVLM 5 | from vllm_registry import VLLMModelRegistry 6 | import torch 7 | 8 | 9 | class VLLM(BaseVLM): 10 | def __init__(self, model_id: str = "google/gemma-3-4b-it") -> None: 11 | self.model_id = model_id 12 | self.registry = VLLMModelRegistry(self.model_id) 13 | self.processor = self.registry.processor 14 | self.vllm_loader = self.registry.loader_map[self.model_id] 15 | 16 | engine_config = self.registry.get_engine_config(self.model_id) 17 | self.engine_args_dict = { 18 | "model": self.model_id, 19 | "tensor_parallel_size": 2, # number of GPUs of the machine, but 40 should be divisible by tensor_parallel_size 20 | "download_dir": "./.cache/vllm", 21 | **engine_config, 22 | } 23 | self.model = LLM(**self.engine_args_dict) 24 | 25 | def generate( 26 | self, 27 | images: list[Image.Image] | None, 28 | text: str, 29 | gen_kwargs: GenerationConfig = GenerationConfig(), 30 | ) -> str: 31 | if images is None: 32 | images = [] 33 | req_data = self.vllm_loader(text, images) 34 | sampling_params = SamplingParams( 35 | temperature=gen_kwargs.temperature, 36 | max_tokens=gen_kwargs.max_new_tokens, 37 | stop_token_ids=req_data.stop_token_ids, 38 | ) 39 | outputs = self.model.generate( 40 | { 41 | "prompt": req_data.prompt, 42 | "multi_modal_data": {"image": req_data.image_data}, 43 | }, 44 | sampling_params=sampling_params, 45 | 
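# lora_request is taken from the registry's request data; presumably None for models that ship no LoRA adapters.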
lora_request=req_data.lora_requests, 46 | ) 47 | return outputs[0].outputs[0].text 48 | 49 | def batch_generate( 50 | self, 51 | images_list: list[list[Image.Image]] | None, 52 | text_list: list[str], 53 | gen_kwargs: GenerationConfig = GenerationConfig(), 54 | ) -> list[str]: 55 | if images_list is None: 56 | images_list = [[] for _ in range(len(text_list))] 57 | 58 | assert len(images_list) == len(text_list) 59 | 60 | from tqdm import tqdm 61 | 62 | req_data_list = [] 63 | 64 | for text, images in tqdm(zip(text_list, images_list)): 65 | req_data_list.append(self.vllm_loader(text, images)) 66 | 67 | sampling_params = SamplingParams( 68 | temperature=gen_kwargs.temperature, 69 | max_tokens=gen_kwargs.max_new_tokens, 70 | ) 71 | 72 | print(f"Generated {len(req_data_list)} requests") 73 | 74 | outputs = self.model.generate( 75 | [ 76 | { 77 | "prompt": req_data.prompt, 78 | "multi_modal_data": {"image": req_data.image_data}, 79 | } 80 | for req_data in req_data_list 81 | ], 82 | sampling_params=sampling_params, 83 | ) 84 | return [output.outputs[0].text for output in outputs] 85 | 86 | 87 | if __name__ == "__main__": 88 | print("=== Qwen/Qwen2.5-VL-3B-Instruct ===") 89 | vllm = VLLM("Qwen/Qwen2.5-VL-3B-Instruct") 90 | vllm.test_vlm() 91 | vllm.test_vlm_batch_100() 92 | -------------------------------------------------------------------------------- /examples/base_vlm.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from PIL import Image 3 | from utils import GenerationConfig 4 | from loguru import logger 5 | 6 | 7 | class BaseVLM: 8 | def __init__(self): 9 | raise NotImplementedError 10 | 11 | def generate( 12 | self, 13 | images: list[Image.Image] | None, 14 | text: str, 15 | gen_kwargs: GenerationConfig = GenerationConfig(), 16 | ) -> str: 17 | """Generate a response given an image (or list of images) and a prompt.""" 18 | raise NotImplementedError 19 | 20 | def batch_generate( 21 | self, 22 | images_list: list[list[Image.Image]] | None, 23 | text_list: list[str], 24 | gen_kwargs: GenerationConfig = GenerationConfig(), 25 | ) -> list[str]: 26 | """Generate a response given a list of images and a list of prompts.""" 27 | raise NotImplementedError 28 | 29 | def test_vlm(self): 30 | """Test the model with one or two images.""" 31 | image_file = "http://images.cocodataset.org/val2017/000000039769.jpg" 32 | image = Image.open(requests.get(image_file, stream=True).raw) 33 | image_file2 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg" 34 | image2 = Image.open(requests.get(image_file2, stream=True).raw) 35 | output = self.generate([image], "画像には何が映っていますか?") 36 | logger.info(f"Output: {output}") 37 | assert isinstance( 38 | output, str 39 | ), f"Expected output to be a string, but got {type(output)}" 40 | 41 | output = self.generate([image, image2], "これらの画像の違いはなんですか?") 42 | logger.info(f"Output: {output}") 43 | assert isinstance( 44 | output, str 45 | ), f"Expected output to be a string, but got {type(output)}" 46 | 47 | # --- No image case --- 48 | # output = self.generate([], "画像には何が映っていますか?") 49 | # logger.info(f"Output: {output}") 50 | # assert isinstance( 51 | # output, str 52 | # ), f"Expected output to be a string, but got {type(output)}" 53 | 54 | def test_vlm_100(self): 55 | """Test the model with one or two images.""" 56 | image_file = "http://images.cocodataset.org/val2017/000000039769.jpg" 57 | image = Image.open(requests.get(image_file, stream=True).raw) 58 | 59 | 
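# Rough latency baseline: the loop below times 100 sequential single-image generate() calls.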
import time 60 | 61 | start_time = time.time() 62 | for _ in range(100): 63 | output = self.generate([image], "画像には何が映っていますか?") 64 | logger.info(f"Output: {output}") 65 | assert isinstance( 66 | output, str 67 | ), f"Expected output to be a string, but got {type(output)}" 68 | end_time = time.time() 69 | logger.info(f"Time taken: {end_time - start_time} seconds for 100 times") 70 | 71 | def test_vlm_batch_100(self): 72 | """Test the model with one or two images.""" 73 | 74 | print("=== Batch 100 test ===") 75 | print(f"Model: {self.model_id}") 76 | 77 | image_file = "http://images.cocodataset.org/val2017/000000039769.jpg" 78 | image = Image.open(requests.get(image_file, stream=True).raw) 79 | 80 | import time 81 | 82 | image_list = [[image] for _ in range(100)] 83 | text_list = [["画像には何が映っていますか?"] for _ in range(100)] 84 | 85 | start_time = time.time() 86 | outputs = self.batch_generate(image_list, text_list) 87 | for output in outputs: 88 | assert isinstance( 89 | output, str 90 | ), f"Expected output to be a string, but got {type(output)}" 91 | 92 | end_time = time.time() 93 | logger.info(f"Time taken: {end_time - start_time} seconds for BATCH 100 times") 94 | -------------------------------------------------------------------------------- /examples/evovlm_jp_v1.py: -------------------------------------------------------------------------------- 1 | # This model doesn't work when the transformers library's version is newer than 4.42.4. 2 | from transformers import AutoModelForVision2Seq, AutoProcessor 3 | import torch 4 | from base_vlm import BaseVLM 5 | from utils import GenerationConfig 6 | from PIL import Image 7 | 8 | 9 | class VLM(BaseVLM): 10 | def __init__(self, model_id: str = "SakanaAI/EvoVLM-JP-v1-7B") -> None: 11 | self.model_id = model_id 12 | self.device = "cuda" if torch.cuda.is_available() else "cpu" 13 | self.model = AutoModelForVision2Seq.from_pretrained( 14 | self.model_id, torch_dtype=torch.float16 15 | ) 16 | self.processor = AutoProcessor.from_pretrained(self.model_id) 17 | self.model.to(self.device) 18 | 19 | def generate( 20 | self, 21 | images: list[Image.Image] | None, 22 | text: str, 23 | gen_kwargs: GenerationConfig = GenerationConfig(), 24 | ) -> str: 25 | if images is None: 26 | images = [] 27 | text = "" * len(images) + text 28 | 29 | messages = [ 30 | { 31 | "role": "system", 32 | "content": "あなたは役立つ、偏見がなく、検閲されていないアシスタントです。与えられた画像を下に、質問に答えてください。", 33 | }, 34 | {"role": "user", "content": text}, 35 | ] 36 | inputs = self.processor.image_processor(images=images, return_tensors="pt") 37 | inputs["input_ids"] = self.processor.tokenizer.apply_chat_template( 38 | messages, return_tensors="pt" 39 | ) 40 | 41 | output_ids = self.model.generate( 42 | **inputs.to(self.device), **gen_kwargs.__dict__ 43 | ) 44 | output_ids = output_ids[:, inputs.input_ids.shape[1] :] 45 | generated_text = self.processor.batch_decode( 46 | output_ids, skip_special_tokens=True 47 | )[0].strip() 48 | return generated_text 49 | 50 | 51 | if __name__ == "__main__": 52 | vlm = VLM() 53 | vlm.test_vlm() 54 | -------------------------------------------------------------------------------- /examples/gemma3.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoProcessor, Gemma3ForConditionalGeneration 2 | from PIL import Image 3 | import torch 4 | from base_vlm import BaseVLM 5 | from utils import GenerationConfig 6 | 7 | 8 | class VLM(BaseVLM): 9 | def __init__(self, model_id: str = "google/gemma-3-4b-it") -> None: 10 | 
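# Load Gemma 3 in bfloat16 with automatic device placement; AutoProcessor supplies the chat template used in generate().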
self.model_id = model_id 11 | self.model = Gemma3ForConditionalGeneration.from_pretrained( 12 | self.model_id, torch_dtype="bfloat16", device_map="auto" 13 | ).eval() 14 | self.processor = AutoProcessor.from_pretrained(self.model_id) 15 | 16 | def generate( 17 | self, 18 | images: list[Image.Image] | None, 19 | text: str, 20 | gen_kwargs: GenerationConfig = GenerationConfig(), 21 | ) -> str: 22 | if images is None: 23 | images = [] 24 | image_content = [] 25 | for image in images: 26 | image_content.append({"type": "image", "image": image}) 27 | 28 | messages = [ 29 | { 30 | "role": "system", 31 | "content": [{"type": "text", "text": "You are a helpful assistant."}], 32 | }, 33 | { 34 | "role": "user", 35 | "content": [*image_content, {"type": "text", "text": text}], 36 | }, 37 | ] 38 | 39 | inputs = self.processor.apply_chat_template( 40 | messages, 41 | add_generation_prompt=True, 42 | tokenize=True, 43 | return_dict=True, 44 | return_tensors="pt", 45 | ).to(self.model.device, dtype=torch.bfloat16) 46 | 47 | input_len = inputs["input_ids"].shape[-1] 48 | 49 | with torch.inference_mode(): 50 | generation = self.model.generate(**inputs, **gen_kwargs.__dict__) 51 | generation = generation[0][input_len:] 52 | 53 | decoded = self.processor.decode(generation, skip_special_tokens=True) 54 | return decoded 55 | 56 | 57 | if __name__ == "__main__": 58 | vlm = VLM() 59 | vlm.test_vlm() 60 | -------------------------------------------------------------------------------- /examples/gpt4o.py: -------------------------------------------------------------------------------- 1 | from openai import AzureOpenAI, APIError 2 | import os 3 | from io import BytesIO 4 | import base64 5 | from base_vlm import BaseVLM 6 | from utils import GenerationConfig 7 | import backoff 8 | from PIL import Image 9 | 10 | 11 | def encode_image_to_base64(image): 12 | buffered = BytesIO() 13 | image.save(buffered, format="JPEG") 14 | img_str = base64.b64encode(buffered.getvalue()).decode("utf-8") 15 | return img_str 16 | 17 | 18 | @backoff.on_exception(backoff.expo, (ValueError, APIError), max_tries=5) 19 | def make_api_call(vlm, message, gen_kwargs): 20 | return vlm.client.chat.completions.create( 21 | model=vlm.model_id, 22 | messages=message, 23 | max_tokens=gen_kwargs.max_new_tokens, 24 | temperature=gen_kwargs.temperature, 25 | top_p=gen_kwargs.top_p, 26 | ) 27 | 28 | 29 | class VLM(BaseVLM): 30 | def __init__(self, model_id: str = "gpt-4o-2024-05-13") -> None: 31 | self.model_id = model_id 32 | self.client = AzureOpenAI( 33 | api_key=os.getenv("AZURE_OPENAI_KEY"), 34 | api_version="2023-05-15", 35 | azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"), 36 | ) 37 | 38 | def generate( 39 | self, 40 | images: list[Image.Image] | None, 41 | text: str, 42 | gen_kwargs: GenerationConfig = GenerationConfig(), 43 | ) -> str: 44 | message = [] 45 | content: list[dict[str, str | dict[str, str]]] = [] 46 | if images is None: 47 | images = [] 48 | image_base64_list = [encode_image_to_base64(img) for img in images] 49 | message_base = { 50 | "role": "user", 51 | "content": [ 52 | { 53 | "type": "text", 54 | "text": text, 55 | }, 56 | ], 57 | } 58 | 59 | content.append( 60 | { 61 | "type": "text", 62 | "text": text, 63 | }, 64 | ) 65 | for image_base64 in image_base64_list: 66 | content.append( 67 | { 68 | "type": "image_url", 69 | "image_url": { 70 | "url": f"data:image/jpeg;base64,{image_base64}", 71 | "detail": "auto", 72 | }, 73 | } 74 | ) 75 | message_base = { 76 | "role": "user", 77 | "content": content, 78 | } 79 | message = 
[message_base] 80 | 81 | response = make_api_call(self, message, gen_kwargs) 82 | return response.choices[0].message.content 83 | 84 | 85 | if __name__ == "__main__": 86 | vlm = VLM() 87 | vlm.test_vlm() 88 | -------------------------------------------------------------------------------- /examples/heron_nvila.py: -------------------------------------------------------------------------------- 1 | from base_vlm import BaseVLM 2 | from utils import GenerationConfig 3 | import torch 4 | from transformers import GenerationConfig as HFGenerationConfig, AutoModel 5 | 6 | 7 | def create_prompt(text, image): 8 | if image is None or (isinstance(image, list) and len(image) == 0): 9 | return [text] if text else [] 10 | if not isinstance(image, list): 11 | image = [image] 12 | if not text: 13 | return image 14 | if "<image>" not in text: 15 | prompt = image.copy() 16 | prompt.append(text) 17 | return prompt 18 | parts = text.split("<image>") 19 | prompt, idx = [], 0 20 | if parts[0] == "": 21 | prompt.append(image[idx]) 22 | idx += 1 23 | parts = parts[1:] 24 | for i, part in enumerate(parts): 25 | if part: 26 | prompt.append(part) 27 | if idx < len(image) and (i < len(parts) - 1 or text.endswith("<image>")): 28 | prompt.append(image[idx]) 29 | idx += 1 30 | return prompt 31 | 32 | 33 | class VLM(BaseVLM): 34 | def __init__(self, model_id="turing-motors/Heron-NVILA-Lite-15B"): 35 | self.model_id = model_id 36 | self.model = AutoModel.from_pretrained( 37 | model_id, trust_remote_code=True, device_map="auto" 38 | ) 39 | 40 | def generate( 41 | self, image, text: str, gen_kwargs: GenerationConfig = GenerationConfig() 42 | ): 43 | gen_cfg = HFGenerationConfig(**gen_kwargs.__dict__) 44 | prompt = create_prompt(text, image) 45 | with torch.no_grad(): 46 | return self.model.generate_content(prompt, generation_config=gen_cfg) 47 | 48 | 49 | if __name__ == "__main__": 50 | VLM("turing-motors/Heron-NVILA-Lite-15B").test_vlm() 51 | -------------------------------------------------------------------------------- /examples/internvl2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | import torchvision.transforms as T 4 | from PIL import Image 5 | from torchvision.transforms.functional import InterpolationMode 6 | from transformers import AutoModel, AutoTokenizer 7 | from base_vlm import BaseVLM 8 | from utils import GenerationConfig 9 | import copy 10 | 11 | IMAGENET_MEAN = (0.485, 0.456, 0.406) 12 | IMAGENET_STD = (0.229, 0.224, 0.225) 13 | 14 | 15 | def build_transform(input_size): 16 | MEAN, STD = IMAGENET_MEAN, IMAGENET_STD 17 | transform = T.Compose( 18 | [ 19 | T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img), 20 | T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), 21 | T.ToTensor(), 22 | T.Normalize(mean=MEAN, std=STD), 23 | ] 24 | ) 25 | return transform 26 | 27 | 28 | def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size): 29 | best_ratio_diff = float("inf") 30 | best_ratio = (1, 1) 31 | area = width * height 32 | for ratio in target_ratios: 33 | target_aspect_ratio = ratio[0] / ratio[1] 34 | ratio_diff = abs(aspect_ratio - target_aspect_ratio) 35 | if ratio_diff < best_ratio_diff: 36 | best_ratio_diff = ratio_diff 37 | best_ratio = ratio 38 | elif ratio_diff == best_ratio_diff: 39 | if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: 40 | best_ratio = ratio 41 | return best_ratio 42 | 43 | 44 | def dynamic_preprocess( 45 | image, min_num=1, max_num=12,
image_size=448, use_thumbnail=False 46 | ): 47 | orig_width, orig_height = image.size 48 | aspect_ratio = orig_width / orig_height 49 | 50 | # calculate the existing image aspect ratio 51 | target_ratios = set( 52 | (i, j) 53 | for n in range(min_num, max_num + 1) 54 | for i in range(1, n + 1) 55 | for j in range(1, n + 1) 56 | if i * j <= max_num and i * j >= min_num 57 | ) 58 | target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) 59 | 60 | # find the closest aspect ratio to the target 61 | target_aspect_ratio = find_closest_aspect_ratio( 62 | aspect_ratio, target_ratios, orig_width, orig_height, image_size 63 | ) 64 | 65 | # calculate the target width and height 66 | target_width = image_size * target_aspect_ratio[0] 67 | target_height = image_size * target_aspect_ratio[1] 68 | blocks = target_aspect_ratio[0] * target_aspect_ratio[1] 69 | 70 | # resize the image 71 | resized_img = image.resize((target_width, target_height)) 72 | processed_images = [] 73 | for i in range(blocks): 74 | box = ( 75 | (i % (target_width // image_size)) * image_size, 76 | (i // (target_width // image_size)) * image_size, 77 | ((i % (target_width // image_size)) + 1) * image_size, 78 | ((i // (target_width // image_size)) + 1) * image_size, 79 | ) 80 | # split the image 81 | split_img = resized_img.crop(box) 82 | processed_images.append(split_img) 83 | assert len(processed_images) == blocks 84 | if use_thumbnail and len(processed_images) != 1: 85 | thumbnail_img = image.resize((image_size, image_size)) 86 | processed_images.append(thumbnail_img) 87 | return processed_images 88 | 89 | 90 | def load_image(image, input_size=448, max_num=12): 91 | transform = build_transform(input_size=input_size) 92 | images = dynamic_preprocess( 93 | image, image_size=input_size, use_thumbnail=True, max_num=max_num 94 | ) 95 | pixel_values = [transform(image) for image in images] 96 | pixel_values = torch.stack(pixel_values) 97 | return pixel_values 98 | 99 | 100 | # 画像の数だけ画像を読み込んでcatする 101 | def load_images(images: Image.Image | list[Image.Image]): 102 | if isinstance(images, list): 103 | tuples: tuple[Tensor, ...] 
= () 104 | 105 | for image in images: 106 | tuples += (load_image(image).to(torch.bfloat16).cuda(),) 107 | return torch.cat(tuples, dim=0) 108 | else: 109 | return load_image(images).to(torch.bfloat16).cuda() 110 | 111 | 112 | # Prepend one <image> tag per image to the beginning of the prompt 113 | def add_image_tags(images: Image.Image | list[Image.Image], prompt: str) -> str: 114 | if isinstance(images, list): 115 | num_images = len(images) 116 | else: 117 | num_images = 1 118 | 119 | image_tags = "<image> " * num_images 120 | new_prompt = image_tags + prompt 121 | 122 | return new_prompt 123 | 124 | 125 | class VLM(BaseVLM): 126 | def __init__(self, model_id: str = "OpenGVLab/InternVL2-8B") -> None: 127 | self.model_id = model_id 128 | self.model = AutoModel.from_pretrained( 129 | self.model_id, 130 | torch_dtype=torch.bfloat16, 131 | low_cpu_mem_usage=True, 132 | use_flash_attn=True, 133 | trust_remote_code=True, 134 | device_map="auto", 135 | ) 136 | self.tokenizer = AutoTokenizer.from_pretrained( 137 | self.model_id, trust_remote_code=True, use_fast=False 138 | ) 139 | 140 | def generate( 141 | self, 142 | images: list[Image.Image] | None, 143 | text: str, 144 | gen_kwargs: GenerationConfig = GenerationConfig(), 145 | ) -> str: 146 | if images is None: 147 | images = [] 148 | if "<image>" not in text: 149 | image_tokens = " ".join(["<image>"] * len(images)) 150 | text = f"{image_tokens}\n{text}" 151 | 152 | pixel_values_list = [] 153 | for img in images: 154 | pixel_values = ( 155 | load_image(img, max_num=12).to(self.model.device).to(self.model.dtype) 156 | ) 157 | pixel_values_list.append(pixel_values) 158 | num_patches_list = [pixel_values.size(0) for pixel_values in pixel_values_list] 159 | if len(images) == 0: 160 | pixel_values = None 161 | else: 162 | pixel_values = torch.cat(pixel_values_list, dim=0) 163 | 164 | generation_config = copy.deepcopy(gen_kwargs.__dict__) 165 | generation_config.pop("use_cache") 166 | 167 | response = self.model.chat( 168 | self.tokenizer, 169 | pixel_values, 170 | text, 171 | num_patches_list=num_patches_list, 172 | generation_config=generation_config, 173 | ) 174 | generated_text = response 175 | return generated_text 176 | 177 | 178 | if __name__ == "__main__": 179 | vlm = VLM() 180 | vlm.test_vlm() 181 | -------------------------------------------------------------------------------- /examples/llama_3_2_vision.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | from transformers import MllamaForConditionalGeneration, AutoProcessor 4 | from base_vlm import BaseVLM 5 | from utils import GenerationConfig 6 | 7 | 8 | class VLM(BaseVLM): 9 | def __init__( 10 | self, model_id: str = "meta-llama/Llama-3.2-11B-Vision-Instruct" 11 | ) -> None: 12 | self.model_id = model_id 13 | self.model = MllamaForConditionalGeneration.from_pretrained( 14 | self.model_id, 15 | torch_dtype=torch.bfloat16, 16 | device_map="auto", 17 | ) 18 | self.processor = AutoProcessor.from_pretrained(self.model_id) 19 | self.device = "cuda" if torch.cuda.is_available() else "cpu" 20 | 21 | def generate( 22 | self, 23 | images: list[Image.Image] | None, 24 | text: str, 25 | gen_kwargs: GenerationConfig = GenerationConfig(), 26 | ) -> str: 27 | if images is None: 28 | images = [] 29 | num_images = len(images) 30 | content = [{"type": "image"} for _ in range(num_images)] 31 | content.extend([{"type": "text", "text": text}]) 32 | messages = [ 33 | { 34 | "role": "user", 35 | "content": content, 36 | } 37 | ] 38 | input_text = self.processor.apply_chat_template( 39 |
messages, add_generation_prompt=True 40 | ) 41 | 42 | inputs = self.processor( 43 | text=input_text, 44 | images=images, 45 | add_special_tokens=False, 46 | return_tensors="pt", 47 | ).to(self.device) 48 | 49 | output_ids = self.model.generate(**inputs, **gen_kwargs.__dict__) 50 | generated_ids = [ 51 | output_ids[len(input_ids) :] 52 | for input_ids, output_ids in zip(inputs.input_ids, output_ids) 53 | ] 54 | return self.processor.decode( 55 | generated_ids[0], 56 | skip_special_tokens=True, 57 | clean_up_tokenization_spaces=True, 58 | ) 59 | 60 | 61 | if __name__ == "__main__": 62 | vlm = VLM() 63 | vlm.test_vlm() 64 | -------------------------------------------------------------------------------- /examples/llama_3_evovlm_jp_v2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from mantis.models.conversation import Conversation, SeparatorStyle 3 | from mantis.models.mllava import ( 4 | chat_mllava, 5 | LlavaForConditionalGeneration, 6 | MLlavaProcessor, 7 | ) 8 | from mantis.models.mllava.utils import conv_templates 9 | from base_vlm import BaseVLM 10 | from utils import GenerationConfig 11 | from PIL import Image 12 | 13 | # 1. Set the system prompt 14 | conv_llama_3_elyza = Conversation( 15 | system="<|start_header_id|>system<|end_header_id|>\n\nあなたは誠実で優秀な日本人のアシスタントです。特に指示が無い場合は、常に日本語で回答してください。", 16 | roles=("user", "assistant"), 17 | messages=(), 18 | offset=0, 19 | sep_style=SeparatorStyle.LLAMA_3, 20 | sep="<|eot_id|>", 21 | ) 22 | conv_templates["llama_3"] = conv_llama_3_elyza 23 | 24 | 25 | class VLM(BaseVLM): 26 | def __init__(self, model_id: str = "SakanaAI/Llama-3-EvoVLM-JP-v2") -> None: 27 | self.model_id = model_id 28 | self.device = "cuda" if torch.cuda.is_available() else "cpu" 29 | self.model = LlavaForConditionalGeneration.from_pretrained( 30 | self.model_id, torch_dtype=torch.float16, device_map=self.device 31 | ).eval() 32 | self.processor = MLlavaProcessor.from_pretrained( 33 | "TIGER-Lab/Mantis-8B-siglip-llama3" 34 | ) 35 | self.processor.tokenizer.pad_token = self.processor.tokenizer.eos_token 36 | 37 | def generate( 38 | self, 39 | images: list[Image.Image] | None, 40 | text: str, 41 | gen_kwargs: GenerationConfig = GenerationConfig(), 42 | ) -> str: 43 | if images is None: 44 | images = [] 45 | if "<image>" not in text: 46 | text = "<image> " * len(images) + "\n" + text 47 | response, history = chat_mllava( 48 | text, images, self.model, self.processor, **gen_kwargs.__dict__ 49 | ) 50 | return response 51 | 52 | 53 | if __name__ == "__main__": 54 | vlm = VLM() 55 | vlm.test_vlm() 56 | -------------------------------------------------------------------------------- /examples/llava_1_5.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | from transformers import AutoProcessor, LlavaForConditionalGeneration 4 | from base_vlm import BaseVLM 5 | from utils import GenerationConfig 6 | 7 | DEFAULT_IMAGE_TOKEN = "<image>" 8 | 9 | 10 | class VLM(BaseVLM): 11 | def __init__(self, model_id: str = "llava-hf/llava-1.5-7b-hf") -> None: 12 | self.model_id = model_id 13 | self.device = "cuda" if torch.cuda.is_available() else "cpu" 14 | self.model = LlavaForConditionalGeneration.from_pretrained( 15 | self.model_id, 16 | torch_dtype=torch.float16, 17 | low_cpu_mem_usage=True, 18 | ) 19 | self.processor = AutoProcessor.from_pretrained(self.model_id) 20 | self.model.to(self.device) 21 | 22 | def generate( 23 | self, 24 | images: list[Image.Image] | None, 25 | text: str,
26 | gen_kwargs: GenerationConfig = GenerationConfig(), 27 | ) -> str: 28 | if images is None: 29 | images = [] 30 | if DEFAULT_IMAGE_TOKEN in text: 31 | text = text.replace(DEFAULT_IMAGE_TOKEN, "") 32 | content = [{"type": "image"} for _ in range(len(images))] 33 | content.extend([{"type": "text", "text": text}]) 34 | messages = [ 35 | { 36 | "role": "user", 37 | "content": content, 38 | }, 39 | ] 40 | 41 | prompt = self.processor.apply_chat_template( 42 | messages, add_generation_prompt=True 43 | ) 44 | 45 | # processorがimages=Noneと[]を区別する可能性があるため、分岐で処理 46 | if len(images) == 0: 47 | inputs = self.processor(text=prompt, return_tensors="pt").to(self.device) 48 | else: 49 | inputs = self.processor(images=images, text=prompt, return_tensors="pt").to( 50 | self.device 51 | ) 52 | 53 | output = self.model.generate(**inputs, **gen_kwargs.__dict__)[0] 54 | generated_text = self.processor.decode(output, skip_special_tokens=True) 55 | answer = generated_text.split("ASSISTANT:")[-1].strip() 56 | return answer 57 | 58 | 59 | if __name__ == "__main__": 60 | vlm = VLM() 61 | vlm.test_vlm() 62 | -------------------------------------------------------------------------------- /examples/llava_1_6_mistral_hf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor 4 | from base_vlm import BaseVLM 5 | from utils import GenerationConfig 6 | 7 | 8 | class VLM(BaseVLM): 9 | def __init__(self, model_id: str = "llava-hf/llava-v1.6-mistral-7b-hf") -> None: 10 | self.model_id = model_id 11 | self.device = "cuda" if torch.cuda.is_available() else "cpu" 12 | self.model = LlavaNextForConditionalGeneration.from_pretrained( 13 | self.model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True 14 | ) 15 | self.processor = LlavaNextProcessor.from_pretrained(self.model_id) 16 | self.model.to(self.device) 17 | 18 | def generate( 19 | self, 20 | images: list[Image.Image] | None, 21 | text: str, 22 | gen_kwargs: GenerationConfig = GenerationConfig(), 23 | ) -> str: 24 | if images is None: 25 | images = [] 26 | content = [{"type": "image"} for _ in range(len(images))] 27 | content.extend([{"type": "text", "text": text}]) 28 | messages = [ 29 | { 30 | "role": "user", 31 | "content": content, 32 | } 33 | ] 34 | input_text = self.processor.apply_chat_template( 35 | messages, add_generation_prompt=True 36 | ) 37 | inputs = self.processor( 38 | text=input_text, 39 | images=images, 40 | add_special_tokens=False, 41 | return_tensors="pt", 42 | ).to(self.device) 43 | 44 | # autoregressively complete prompt 45 | output = self.model.generate(**inputs, **gen_kwargs.__dict__)[0] 46 | 47 | generated_text = self.processor.decode(output, skip_special_tokens=True) 48 | # split [INST] and return the last part 49 | generated_text = generated_text.split("[/INST]")[-1].strip() 50 | return generated_text 51 | 52 | 53 | if __name__ == "__main__": 54 | vlm = VLM() 55 | vlm.test_vlm() 56 | -------------------------------------------------------------------------------- /examples/llava_calm2_siglip.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoProcessor, LlavaForConditionalGeneration 2 | import torch 3 | from base_vlm import BaseVLM 4 | from utils import GenerationConfig 5 | from PIL import Image 6 | 7 | 8 | class VLM(BaseVLM): 9 | def __init__(self, model_id: str = "cyberagent/llava-calm2-siglip") -> None: 10 | self.model_id 
= model_id 11 | self.model = LlavaForConditionalGeneration.from_pretrained( 12 | self.model_id, 13 | torch_dtype=torch.bfloat16, 14 | ).to("cuda") 15 | self.processor = AutoProcessor.from_pretrained(self.model_id) 16 | 17 | def generate( 18 | self, 19 | images: list[Image.Image] | None, 20 | text: str, 21 | gen_kwargs: GenerationConfig = GenerationConfig(), 22 | ) -> str: 23 | if images is None: 24 | images = [] 25 | prefix = None 26 | if "<image>" in text: 27 | prompt = "USER: " + text + "\nASSISTANT: " 28 | else: 29 | num_images = len(images) 30 | prefix = "<image> " * num_images 31 | prompt = "USER: " + prefix + text + "\nASSISTANT: " 32 | 33 | inputs = ( 34 | self.processor( 35 | text=prompt, 36 | images=images, 37 | add_special_tokens=False, 38 | return_tensors="pt", 39 | ) 40 | .to(self.model.device) 41 | .to(self.model.dtype) 42 | ) 43 | output_ids = self.model.generate( 44 | **inputs, 45 | **gen_kwargs.__dict__, 46 | ) 47 | generate_ids = [ 48 | output_ids[len(input_ids) :] 49 | for input_ids, output_ids in zip(inputs.input_ids, output_ids) 50 | ] 51 | 52 | output = self.processor.tokenizer.decode( 53 | generate_ids[0][:-1], clean_up_tokenization_spaces=False 54 | ) 55 | 56 | return output 57 | 58 | 59 | if __name__ == "__main__": 60 | vlm = VLM() 61 | vlm.test_vlm() 62 | -------------------------------------------------------------------------------- /examples/llm_jp_3_vila.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from llava.constants import IMAGE_TOKEN_INDEX 3 | from llava.conversation import conv_templates 4 | from llava.mm_utils import ( 5 | get_model_name_from_path, 6 | process_images, 7 | tokenizer_image_token, 8 | ) 9 | from llava.model.builder import load_pretrained_model 10 | from base_vlm import BaseVLM 11 | from utils import GenerationConfig 12 | from PIL import Image 13 | 14 | 15 | class VLM(BaseVLM): 16 | def __init__(self, model_id: str = "llm-jp/llm-jp-3-vila-14b") -> None: 17 | self.model_id = model_id 18 | model_name = get_model_name_from_path(self.model_id) 19 | device = "cuda" if torch.cuda.is_available() else "cpu" 20 | self.tokenizer, self.model, self.image_processor, _ = load_pretrained_model( 21 | self.model_id, model_name, device=device 22 | ) 23 | 24 | def generate( 25 | self, 26 | images: list[Image.Image] | None, 27 | text: str, 28 | gen_kwargs: GenerationConfig = GenerationConfig(), 29 | ) -> str: 30 | if images is None: 31 | images = [] 32 | qs = text 33 | if "<image>" not in text: 34 | qs = "<image>\n" * len(images) + text 35 | conv_mode = "llmjp_v3" 36 | conv = conv_templates[conv_mode].copy() 37 | conv.append_message(conv.roles[0], qs) 38 | conv.append_message(conv.roles[1], None) 39 | prompt = conv.get_prompt() 40 | 41 | images_tensor = [ 42 | process_images(images, self.image_processor, self.model.config).to( 43 | self.model.device, dtype=torch.float16 44 | ) 45 | ] 46 | input_ids = ( 47 | tokenizer_image_token( 48 | prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt" 49 | ) 50 | .unsqueeze(0) 51 | .to(self.model.device) 52 | ) 53 | 54 | with torch.inference_mode(): 55 | output_ids = self.model.generate( 56 | input_ids, 57 | images=images_tensor, 58 | **gen_kwargs.__dict__, 59 | ) 60 | 61 | outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0] 62 | return outputs 63 | 64 | 65 | if __name__ == "__main__": 66 | vlm = VLM() 67 | vlm.test_vlm() 68 | -------------------------------------------------------------------------------- /examples/model_table.py:
-------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | MODEL_ID_TO_CLASS_PATH = { 4 | "llava-hf/llava-1.5-7b-hf": "llava_1_5.VLM", 5 | "llava-hf/llava-1.5-13b-hf": "llava_1_5.VLM", 6 | "llava-hf/llava-v1.6-mistral-7b-hf": "llava_1_6_mistral_hf.VLM", 7 | "SakanaAI/EvoVLM-JP-v1-7B": "evovlm_jp_v1.VLM", 8 | "gpt-4o-2024-05-13": "gpt4o.VLM", 9 | "gpt-4o-2024-11-20": "gpt4o.VLM", 10 | "internlm/internlm-xcomposer2d5-7b": "xcomposer2d5.VLM", 11 | "OpenGVLab/InternVL2-8B": "internvl2.VLM", 12 | "OpenGVLab/InternVL2-26B": "internvl2.VLM", 13 | "meta-llama/Llama-3.2-11B-Vision-Instruct": "llama_3_2_vision.VLM", 14 | "meta-llama/Llama-3.2-90B-Vision-Instruct": "llama_3_2_vision.VLM", 15 | "Kendamarron/Llama-3.2-11B-Vision-Instruct-Swallow-8B-Merge": "llama_3_2_vision.VLM", 16 | "AXCXEPT/Llama-3-EZO-VLM-1": "llama_3_evovlm_jp_v2.VLM", 17 | "SakanaAI/Llama-3-EvoVLM-JP-v2": "llama_3_evovlm_jp_v2.VLM", 18 | "neulab/Pangea-7B-hf": "pangea_hf.VLM", 19 | "mistralai/Pixtral-12B-2409": "pixtral.VLM", 20 | "Qwen/Qwen2-VL-2B-Instruct": "qwen2_vl.VLM", 21 | "Qwen/Qwen2-VL-7B-Instruct": "qwen2_vl.VLM", 22 | "Qwen/Qwen2-VL-72B-Instruct": "qwen2_vl.VLM", 23 | "Qwen/Qwen2.5-VL-3B-Instruct": "qwen2_5_vl.VLM", 24 | "Qwen/Qwen2.5-VL-7B-Instruct": "qwen2_5_vl.VLM", 25 | "Qwen/Qwen2.5-VL-32B-Instruct": "qwen2_5_vl.VLM", 26 | "Qwen/Qwen2.5-VL-72B-Instruct": "qwen2_5_vl.VLM", 27 | "llm-jp/llm-jp-3-vila-14b": "llm_jp_3_vila.VLM", 28 | "stabilityai/japanese-instructblip-alpha": "japanese_instructblip_alpha.VLM", 29 | "stabilityai/japanese-stable-vlm": "japanese_stable_vlm.VLM", 30 | "cyberagent/llava-calm2-siglip": "llava_calm2_siglip.VLM", 31 | "Efficient-Large-Model/VILA1.5-13b": "vila.VLM", 32 | "google/gemma-3-1b-it": "gemma3.VLM", 33 | "google/gemma-3-4b-it": "gemma3.VLM", 34 | "google/gemma-3-12b-it": "gemma3.VLM", 35 | "google/gemma-3-27b-it": "gemma3.VLM", 36 | "sbintuitions/sarashina2-vision-8b": "sarashina2_vision.VLM", 37 | "sbintuitions/sarashina2-vision-14b": "sarashina2_vision.VLM", 38 | "microsoft/Phi-4-multimodal-instruct": "phi4_multimodal.VLM", 39 | "MIL-UT/Asagi-14B": "asagi.VLM", 40 | "turing-motors/Heron-NVILA-Lite-1B": "heron_nvila.VLM", 41 | "turing-motors/Heron-NVILA-Lite-2B": "heron_nvila.VLM", 42 | "turing-motors/Heron-NVILA-Lite-15B": "heron_nvila.VLM", 43 | "turing-motors/Heron-NVILA-Lite-33B": "heron_nvila.VLM", 44 | } 45 | 46 | 47 | def get_class_from_path(class_path: str): 48 | """指定されたパスからクラスを動的にインポートして返す""" 49 | module_name, class_name = class_path.rsplit(".", 1) 50 | module = importlib.import_module(module_name) 51 | return getattr(module, class_name) 52 | 53 | 54 | def get_class_from_model_id(model_id: str): 55 | return get_class_from_path(MODEL_ID_TO_CLASS_PATH[model_id]) 56 | 57 | 58 | if __name__ == "__main__": 59 | for model_id, class_path in MODEL_ID_TO_CLASS_PATH.items(): 60 | try: 61 | vlm_class = get_class_from_path(class_path) 62 | vlm = vlm_class(model_id) 63 | vlm.test_vlm() 64 | print(f"Tested {model_id}") 65 | except Exception as e: 66 | print(f"Error testing {model_id}: {e}") 67 | -------------------------------------------------------------------------------- /examples/pangea_hf.py: -------------------------------------------------------------------------------- 1 | # Assuming that you have text_input and image_path 2 | from transformers import LlavaNextForConditionalGeneration, AutoProcessor 3 | import torch 4 | from PIL import Image 5 | from base_vlm import BaseVLM 6 | from utils import GenerationConfig 7 | 8 | 9 | 
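# Pangea-7B-hf wrapper: builds a Qwen-style chat prompt (<|im_start|> ... <|im_end|>) and prepends one image placeholder per input image ahead of the question.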
class VLM(BaseVLM): 10 | def __init__(self, model_id: str = "neulab/Pangea-7B-hf") -> None: 11 | self.model_id = model_id 12 | self.device = "cuda" if torch.cuda.is_available() else "cpu" 13 | self.model = LlavaNextForConditionalGeneration.from_pretrained( 14 | self.model_id, torch_dtype=torch.float16 15 | ).to(0) 16 | self.processor = AutoProcessor.from_pretrained(self.model_id) 17 | self.model.resize_token_embeddings(len(self.processor.tokenizer)) 18 | 19 | def generate( 20 | self, 21 | images: list[Image.Image] | None, 22 | text: str, 23 | gen_kwargs: GenerationConfig = GenerationConfig(), 24 | ) -> str: 25 | if images is None: 26 | images = [] 27 | 28 | prompt_template = ( 29 | "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user" 30 | + "\n" * len(images) 31 | + "\n{text}<|im_end|>\n<|im_start|>assistant\n" 32 | ) 33 | input_text = prompt_template.format(text=text) 34 | if images is None: 35 | # TODO: text only need to reload model https://huggingface.co/neulab/Pangea-7B <-? 36 | model_inputs = self.processor(text=input_text, return_tensors="pt").to( 37 | self.device, torch.float16 38 | ) 39 | else: 40 | model_inputs = self.processor( 41 | images=images, text=input_text, return_tensors="pt" 42 | ).to(self.device, torch.float16) 43 | 44 | output = self.model.generate( 45 | **model_inputs, 46 | **gen_kwargs.__dict__, 47 | ) 48 | output = output[0] 49 | result = self.processor.decode( 50 | output, skip_special_tokens=True, clean_up_tokenization_spaces=False 51 | ) 52 | # extract the answer 53 | result = result.split("assistant\n")[-1].strip() 54 | return result 55 | 56 | 57 | if __name__ == "__main__": 58 | vlm = VLM() 59 | vlm.test_vlm() 60 | -------------------------------------------------------------------------------- /examples/phi4_multimodal.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | from transformers import AutoModelForCausalLM, AutoProcessor 3 | import transformers 4 | import torch 5 | from base_vlm import BaseVLM 6 | from utils import GenerationConfig 7 | 8 | 9 | class VLM(BaseVLM): 10 | def __init__(self, model_id: str = "microsoft/Phi-4-multimodal-instruct") -> None: 11 | self.model_id = model_id 12 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | self.model = AutoModelForCausalLM.from_pretrained( 15 | self.model_id, 16 | trust_remote_code=True, 17 | torch_dtype="auto", 18 | _attn_implementation="flash_attention_2", 19 | ).to(self.device) 20 | 21 | self.processor = AutoProcessor.from_pretrained( 22 | self.model_id, trust_remote_code=True 23 | ) 24 | 25 | def generate( 26 | self, 27 | images: list[Image.Image] | None, 28 | text: str, 29 | gen_kwargs: GenerationConfig = GenerationConfig(), 30 | ) -> str: 31 | if images is None: 32 | images = [] 33 | generation_config = transformers.GenerationConfig.from_pretrained( 34 | self.model_id, "generation_config.json" 35 | ) 36 | 37 | ########################### vision (multi-frame) ################################ 38 | placeholder = "" 39 | for i in range(len(images)): 40 | placeholder += f"<|image_{i}|>" 41 | 42 | messages = [ 43 | {"role": "user", "content": placeholder + text}, 44 | ] 45 | 46 | prompt = self.processor.tokenizer.apply_chat_template( 47 | messages, tokenize=False, add_generation_prompt=True 48 | ) 49 | 50 | if images is None: 51 | images = [] 52 | inputs = self.processor(prompt, images, return_tensors="pt").to(self.device) 53 | 54 | generate_ids = self.model.generate( 55 | **inputs, 56 
| **gen_kwargs.__dict__, 57 | generation_config=generation_config, 58 | ) 59 | 60 | # 入力部分を取り除いた生成結果をデコード 61 | generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :] 62 | response = self.processor.batch_decode( 63 | generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False 64 | )[0] 65 | 66 | return response 67 | 68 | 69 | if __name__ == "__main__": 70 | vlm = VLM() 71 | vlm.test_vlm() 72 | -------------------------------------------------------------------------------- /examples/pixtral.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | from vllm import LLM 3 | from vllm.sampling_params import SamplingParams 4 | import base64 5 | from io import BytesIO 6 | from base_vlm import BaseVLM 7 | from utils import GenerationConfig 8 | 9 | 10 | def image_to_base64(img): 11 | buffer = BytesIO() 12 | # Check if the image has an alpha channel (RGBA) 13 | if img.mode == "RGBA": 14 | # Convert the image to RGB mode 15 | img = img.convert("RGB") 16 | img.save(buffer, format="JPEG") 17 | buffer.seek(0) 18 | img_str = base64.b64encode(buffer.getvalue()).decode("ascii") 19 | return img_str 20 | 21 | 22 | def image_to_content(image: Image.Image) -> dict: 23 | base64_image = image_to_base64(image) 24 | content = { 25 | "type": "image_url", 26 | "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}, 27 | } 28 | return content 29 | 30 | 31 | class VLM(BaseVLM): 32 | def __init__(self, model_id: str = "mistralai/Pixtral-12B-2409") -> None: 33 | self.model_id = model_id 34 | max_img_per_msg = 5 35 | self.model = LLM( 36 | model=self.model_id, 37 | tokenizer_mode="mistral", 38 | tensor_parallel_size=1, 39 | limit_mm_per_prompt={"image": max_img_per_msg}, 40 | max_model_len=32768, 41 | ) 42 | 43 | def generate( 44 | self, 45 | images: list[Image.Image] | None, 46 | text: str, 47 | gen_kwargs: GenerationConfig = GenerationConfig(), 48 | ) -> str: 49 | if images is None: 50 | images = [] 51 | content = [image_to_content(image) for image in images] 52 | content.extend([{"type": "text", "text": text}]) 53 | messages = [ 54 | { 55 | "role": "user", 56 | "content": content, 57 | } 58 | ] 59 | 60 | sampling_params = SamplingParams( 61 | max_tokens=gen_kwargs.max_new_tokens, 62 | temperature=gen_kwargs.temperature, 63 | top_p=gen_kwargs.top_p, 64 | ) 65 | outputs = self.model.chat( 66 | messages, 67 | sampling_params=sampling_params, 68 | ) 69 | return outputs[0].outputs[0].text 70 | 71 | 72 | if __name__ == "__main__": 73 | vlm = VLM() 74 | vlm.test_vlm() 75 | -------------------------------------------------------------------------------- /examples/qwen2_5_vl.py: -------------------------------------------------------------------------------- 1 | from transformers import ( 2 | Qwen2_5_VLForConditionalGeneration, 3 | AutoProcessor, 4 | ) 5 | from qwen_vl_utils import process_vision_info 6 | from base_vlm import BaseVLM 7 | from utils import GenerationConfig 8 | from PIL import Image 9 | 10 | 11 | class VLM(BaseVLM): 12 | def __init__(self, model_id: str = "Qwen/Qwen2.5-VL-3B-Instruct") -> None: 13 | self.model_id = model_id 14 | self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained( 15 | self.model_id, 16 | torch_dtype="bfloat16", 17 | device_map="auto", 18 | attn_implementation="flash_attention_2", 19 | ) 20 | 21 | min_pixels = 256 * 28 * 28 22 | max_pixels = 1280 * 28 * 28 23 | self.processor = AutoProcessor.from_pretrained( 24 | self.model_id, min_pixels=min_pixels, max_pixels=max_pixels 25 | ) 26 | 27 | def 
generate( 28 | self, 29 | images: list[Image.Image] | None, 30 | text: str, 31 | gen_kwargs: GenerationConfig = GenerationConfig(), 32 | ) -> str: 33 | if images is None: 34 | images = [] 35 | if "" in text: 36 | text = text.replace("", "") 37 | message = [] 38 | image_content = [] 39 | 40 | for img in images: 41 | image_content.append( 42 | { 43 | "type": "image", 44 | "image": img, 45 | } 46 | ) 47 | message.append( 48 | { 49 | "role": "user", 50 | "content": image_content + [{"type": "text", "text": text}], 51 | } 52 | ) 53 | 54 | texts = self.processor.apply_chat_template( 55 | message, tokenize=False, add_generation_prompt=True 56 | ) 57 | image_inputs, video_inputs = process_vision_info(message) 58 | inputs = self.processor( 59 | text=[texts], 60 | images=image_inputs, 61 | videos=video_inputs, 62 | padding=True, 63 | return_tensors="pt", 64 | ) 65 | 66 | inputs = inputs.to(self.model.device) 67 | output_ids = self.model.generate(**inputs, **gen_kwargs.__dict__) 68 | generated_ids = [ 69 | output_ids[len(input_ids) :] 70 | for input_ids, output_ids in zip(inputs.input_ids, output_ids) 71 | ] 72 | generated_text = self.processor.batch_decode( 73 | generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True 74 | )[0] 75 | return generated_text 76 | 77 | 78 | if __name__ == "__main__": 79 | vlm = VLM() 80 | vlm.test_vlm() 81 | -------------------------------------------------------------------------------- /examples/qwen2_vl.py: -------------------------------------------------------------------------------- 1 | from transformers import ( 2 | Qwen2VLForConditionalGeneration, 3 | AutoProcessor, 4 | ) 5 | from qwen_vl_utils import process_vision_info 6 | from base_vlm import BaseVLM 7 | from utils import GenerationConfig 8 | from PIL import Image 9 | 10 | 11 | class VLM(BaseVLM): 12 | def __init__(self, model_id: str = "Qwen/Qwen2-VL-2B-Instruct") -> None: 13 | self.model_id = model_id 14 | self.model = Qwen2VLForConditionalGeneration.from_pretrained( 15 | self.model_id, 16 | torch_dtype="bfloat16", 17 | device_map="auto", 18 | attn_implementation="flash_attention_2", 19 | ) 20 | 21 | min_pixels = 256 * 28 * 28 22 | max_pixels = 1280 * 28 * 28 23 | self.processor = AutoProcessor.from_pretrained( 24 | self.model_id, min_pixels=min_pixels, max_pixels=max_pixels 25 | ) 26 | 27 | def generate( 28 | self, 29 | images: list[Image.Image] | None, 30 | text: str, 31 | gen_kwargs: GenerationConfig = GenerationConfig(), 32 | ) -> str: 33 | if images is None: 34 | images = [] 35 | if "" in text: 36 | text = text.replace("", "") 37 | message = [] 38 | image_content = [] 39 | 40 | for img in images: 41 | image_content.append( 42 | { 43 | "type": "image", 44 | "image": img, 45 | } 46 | ) 47 | message.append( 48 | { 49 | "role": "user", 50 | "content": image_content + [{"type": "text", "text": text}], 51 | } 52 | ) 53 | 54 | texts = self.processor.apply_chat_template( 55 | message, tokenize=False, add_generation_prompt=True 56 | ) 57 | image_inputs, video_inputs = process_vision_info(message) 58 | inputs = self.processor( 59 | text=[texts], 60 | images=image_inputs, 61 | videos=video_inputs, 62 | padding=True, 63 | return_tensors="pt", 64 | ) 65 | 66 | inputs = inputs.to(self.model.device) 67 | output_ids = self.model.generate(**inputs, **gen_kwargs.__dict__) 68 | generated_ids = [ 69 | output_ids[len(input_ids) :] 70 | for input_ids, output_ids in zip(inputs.input_ids, output_ids) 71 | ] 72 | generated_text = self.processor.batch_decode( 73 | generated_ids, skip_special_tokens=True, 
clean_up_tokenization_spaces=True 74 | )[0] 75 | return generated_text 76 | 77 | 78 | if __name__ == "__main__": 79 | vlm = VLM() 80 | vlm.test_vlm() 81 | -------------------------------------------------------------------------------- /examples/sample_vllm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | from dataclasses import asdict 5 | from loguru import logger 6 | 7 | import eval_mm 8 | import eval_mm.metrics 9 | from utils import GenerationConfig 10 | from base_vllm import VLLM 11 | 12 | 13 | def parse_args(): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("--model_id", default="Qwen/Qwen2.5-VL-3B-Instruct") 16 | parser.add_argument( 17 | "--task_id", 18 | default="japanese-heron-bench", 19 | help=f"Task ID to evaluate. Available: {eval_mm.TaskRegistry().get_task_list()}", 20 | ) 21 | parser.add_argument("--judge_model", default="gpt-4o-2024-11-20") 22 | parser.add_argument("--batch_size_for_evaluation", type=int, default=10) 23 | parser.add_argument("--overwrite", action="store_true") 24 | parser.add_argument("--result_dir", default="result") 25 | parser.add_argument("--inference_only", action="store_true") 26 | parser.add_argument("--max_new_tokens", type=int, default=256) 27 | parser.add_argument("--num_beams", type=int, default=1) 28 | parser.add_argument("--temperature", type=float, default=0.0) 29 | parser.add_argument("--top_p", type=float, default=1.0) 30 | parser.add_argument("--do_sample", action="store_true", default=False) 31 | parser.add_argument("--use_cache", action="store_true", default=True) 32 | parser.add_argument("--max_dataset_len", type=int) 33 | parser.add_argument( 34 | "--metrics", 35 | type=str, 36 | nargs="+", 37 | default=["heron-bench"], 38 | help=f"Metrics to evaluate. 
Available: {eval_mm.ScorerRegistry().get_metric_list()}", 39 | ) 40 | parser.add_argument( 41 | "--rotate_choices", action="store_true", help="This option is used in MECHA-ja" 42 | ) 43 | parser.add_argument( 44 | "--random_choice", 45 | action="store_true", 46 | help="If set, randomly choose the answer from the candidates when parse error occurs in JMMMU and MMMU tasks", 47 | ) 48 | return parser.parse_args() 49 | 50 | 51 | def load_or_generate_predictions(args, task, gen_kwargs, output_dir): 52 | prediction_path = os.path.join(output_dir, "prediction.jsonl") 53 | if os.path.exists(prediction_path) and not args.overwrite: 54 | logger.info(f"Loading predictions from {prediction_path}") 55 | with open(prediction_path) as f: 56 | preds = [json.loads(line) for line in f] 57 | assert len(preds) == len( 58 | task.dataset 59 | ), "Prediction length mismatch with dataset" 60 | return preds, [] 61 | 62 | logger.info("Generating predictions...") 63 | model = VLLM(args.model_id) 64 | preds = [] 65 | 66 | qids = [task.doc_to_id(doc) for doc in task.dataset] 67 | images = [task.doc_to_visual(doc) for doc in task.dataset] 68 | texts = [task.doc_to_text(doc).replace("", "") for doc in task.dataset] 69 | 70 | preds = model.batch_generate(images, texts, gen_kwargs) 71 | preds = [{"question_id": qid, "text": pred} for qid, pred in zip(qids, preds)] 72 | 73 | save_jsonl(prediction_path, preds) 74 | logger.info(f"Predictions saved to {prediction_path}") 75 | return preds, [] 76 | 77 | 78 | def save_jsonl(path, data): 79 | with open(path, "w") as f: 80 | for item in data: 81 | f.write(json.dumps(item, ensure_ascii=False) + "\n") 82 | 83 | 84 | def evaluate(args, task, preds, metrics): 85 | logger.info("Starting evaluation...") 86 | scores_by_metric = {} 87 | aggregated_metrics = {} 88 | 89 | for metric in metrics: 90 | scorer = eval_mm.ScorerRegistry.load_scorer( 91 | metric, 92 | eval_mm.ScorerConfig( 93 | docs=task.dataset, 94 | judge_model=args.judge_model, 95 | batch_size=args.batch_size_for_evaluation, 96 | client=eval_mm.OpenAIChatAPI(), 97 | random_choice=args.random_choice, 98 | ), 99 | ) 100 | scores = scorer.score( 101 | [task.doc_to_answer(doc) for doc in task.dataset], 102 | [pred["text"] for pred in preds], 103 | ) 104 | scores_by_metric[metric] = scores 105 | aggregate = scorer.aggregate(scores) 106 | aggregated_metrics[metric] = asdict(aggregate) 107 | 108 | logger.info(f"Scores for {metric}: {scores}") 109 | logger.info(f"Aggregate for {metric}: {aggregate}") 110 | 111 | return scores_by_metric, aggregated_metrics 112 | 113 | 114 | def save_final_results(preds, task, metrics, scores_by_metric, output_path): 115 | final_results = [] 116 | for i, pred in enumerate(preds): 117 | doc = task.dataset[i] 118 | result = { 119 | "question_id": pred["question_id"], 120 | "text": pred["text"], 121 | "answer": task.doc_to_answer(doc), 122 | "input_text": task.doc_to_text(doc), 123 | } 124 | for metric in metrics: 125 | result[metric] = scores_by_metric[metric][i] 126 | final_results.append(result) 127 | 128 | save_jsonl(output_path, final_results) 129 | logger.info(f"Final prediction with scores saved to {output_path}") 130 | 131 | 132 | def main(): 133 | args = parse_args() 134 | 135 | gen_kwargs = GenerationConfig( 136 | max_new_tokens=args.max_new_tokens, 137 | temperature=args.temperature, 138 | top_p=args.top_p, 139 | num_beams=args.num_beams, 140 | do_sample=args.do_sample, 141 | use_cache=args.use_cache, 142 | ) 143 | 144 | task_config = eval_mm.TaskConfig( 145 | 
max_dataset_len=args.max_dataset_len, 146 | rotate_choices=args.rotate_choices, 147 | ) 148 | task = eval_mm.TaskRegistry.load_task(args.task_id, task_config) 149 | 150 | output_dir = os.path.join(args.result_dir, args.task_id, args.model_id + "_vllm") 151 | os.makedirs(output_dir, exist_ok=True) 152 | 153 | preds, _ = load_or_generate_predictions(args, task, gen_kwargs, output_dir) 154 | 155 | if args.inference_only: 156 | logger.info("Inference only mode. Skipping evaluation.") 157 | return 158 | 159 | scores_by_metric, aggregated_metrics = evaluate(args, task, preds, args.metrics) 160 | 161 | prediction_path = os.path.join(output_dir, "prediction.jsonl") 162 | save_final_results(preds, task, args.metrics, scores_by_metric, prediction_path) 163 | 164 | evaluation_path = os.path.join(output_dir, "evaluation.jsonl") 165 | with open(evaluation_path, "w") as f: 166 | f.write(json.dumps(aggregated_metrics, ensure_ascii=False) + "\n") 167 | logger.info(f"Evaluation result saved to {evaluation_path}") 168 | 169 | 170 | if __name__ == "__main__": 171 | main() 172 | -------------------------------------------------------------------------------- /examples/sarashina2_vision.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | from transformers import AutoModelForCausalLM, AutoProcessor 3 | from base_vlm import BaseVLM 4 | from utils import GenerationConfig 5 | 6 | 7 | class VLM(BaseVLM): 8 | def __init__(self, model_id: str = "sbintuitions/sarashina2-vision-8b") -> None: 9 | self.model_id = model_id 10 | self.model = AutoModelForCausalLM.from_pretrained( 11 | self.model_id, 12 | device_map="cuda", 13 | torch_dtype="auto", 14 | trust_remote_code=True, 15 | ) 16 | self.processor = AutoProcessor.from_pretrained( 17 | self.model_id, trust_remote_code=True 18 | ) 19 | 20 | def generate( 21 | self, 22 | images: list[Image.Image] | None, 23 | text: str, 24 | gen_kwargs: GenerationConfig = GenerationConfig(), 25 | ) -> str: 26 | if images is None: 27 | images = [] 28 | message = [{"role": "user", "content": text}] 29 | 30 | text = self.processor.apply_chat_template(message, add_generation_prompt=True) 31 | # insert <|prefix|><|file|><|suffix|> after 32 | text = text.replace( 33 | "<|prefix|><|file|><|suffix|>", "<|prefix|><|file|><|suffix|>" * len(images) 34 | ) 35 | 36 | # Use text-only processing if no images are provided 37 | if images is None: 38 | images = [] 39 | inputs = self.processor( 40 | text=[text], 41 | images=images, 42 | padding=True, 43 | return_tensors="pt", 44 | ).to(self.model.device) 45 | 46 | stopping_criteria = self.processor.get_stopping_criteria(["\n###"]) 47 | 48 | # Inference: Generation of the output 49 | output_ids = self.model.generate( 50 | **inputs, 51 | **gen_kwargs.__dict__, 52 | stopping_criteria=stopping_criteria, 53 | ) 54 | generated_ids = [ 55 | output_ids[len(input_ids) :] 56 | for input_ids, output_ids in zip(inputs.input_ids, output_ids) 57 | ] 58 | output_text = self.processor.batch_decode( 59 | generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True 60 | ) 61 | return output_text[0] 62 | 63 | 64 | if __name__ == "__main__": 65 | vlm = VLM() 66 | vlm.test_vlm() 67 | -------------------------------------------------------------------------------- /examples/test_model.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from model_table import get_class_from_model_id 3 | 4 | parser = argparse.ArgumentParser() 5 | 
parser.add_argument("--model_id", type=str, default="llava-hf/llava-1.5-7b-hf") 6 | 7 | args = parser.parse_args() 8 | 9 | model = get_class_from_model_id(args.model_id)(args.model_id) 10 | model.test_vlm() 11 | -------------------------------------------------------------------------------- /examples/utils.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class GenerationConfig: 6 | max_new_tokens: int = 1024 7 | temperature: float = 0.0 8 | top_p: float = 1.0 9 | num_beams: int = 1 10 | do_sample: bool = False 11 | use_cache: bool = True 12 | -------------------------------------------------------------------------------- /examples/vila.py: -------------------------------------------------------------------------------- 1 | # This file is modified from https://github.com/haotian-liu/LLaVA/ 2 | 3 | # rye add protobuf 4 | # uv pip install flash-attn --no-build-isolation --python .venv 5 | 6 | from base_vlm import BaseVLM 7 | from utils import GenerationConfig 8 | 9 | import torch 10 | 11 | from llava_vila.conversation import SeparatorStyle, conv_templates 12 | from llava_vila.mm_utils import ( 13 | get_model_name_from_path, 14 | process_images, 15 | tokenizer_image_token, 16 | ) 17 | from llava_vila.model.builder import load_pretrained_model 18 | from PIL import Image 19 | 20 | 21 | class VLM(BaseVLM): 22 | def __init__(self, model_id: str = "Efficient-Large-Model/VILA1.5-13b"): 23 | self.model_id = model_id 24 | model_name = get_model_name_from_path(self.model_id) 25 | self.model_name = model_name 26 | self.tokenizer, self.model, self.image_processor, _ = load_pretrained_model( 27 | self.model_id, model_name 28 | ) 29 | # from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor 30 | # self.model = AutoModelForCausalLM.from_pretrained("Efficient-Large-Model/VILA-13b") 31 | # self.tokenizer = self.model.config.tokenizer 32 | # self.image_processor = AutoProcessor.from_pretrained("Efficient-Large-Model/VILA-13b") 33 | 34 | def generate( 35 | self, 36 | images: list[Image.Image] | None, 37 | text: str, 38 | gen_kwargs: GenerationConfig = GenerationConfig(), 39 | ) -> str: 40 | qs = text 41 | if images is None: 42 | images = [] 43 | if "" not in text: 44 | qs = "\n" * len(images) + text 45 | 46 | if "llama-2" in self.model_name.lower(): 47 | conv_mode = "llava_llama_2" 48 | elif "v1" in self.model_name.lower(): 49 | conv_mode = "llava_v1" 50 | elif "mpt" in self.model_name.lower(): 51 | conv_mode = "mpt" 52 | else: 53 | conv_mode = "llava_v0" 54 | 55 | conv = conv_templates[conv_mode].copy() 56 | conv.append_message(conv.roles[0], qs) 57 | conv.append_message(conv.roles[1], None) 58 | prompt = conv.get_prompt() 59 | if images is None: 60 | images_tensor = None 61 | else: 62 | images_tensor = [ 63 | process_images(images, self.image_processor, self.model.config).to( 64 | self.model.device, dtype=torch.float16 65 | ) 66 | ] 67 | input_ids = ( 68 | tokenizer_image_token(prompt, self.tokenizer, -200, return_tensors="pt") 69 | .unsqueeze(0) 70 | .cuda() 71 | ) 72 | 73 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 74 | # keywords = [stop_str] # if needed, add keywords 75 | 76 | with torch.inference_mode(): 77 | output_ids = self.model.generate( 78 | input_ids, 79 | images=images_tensor, 80 | **gen_kwargs.__dict__, 81 | ) 82 | outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0] 83 | outputs = outputs.strip() 84 | if 
outputs.endswith(stop_str): 85 | outputs = outputs[: -len(stop_str)] 86 | outputs = outputs.strip() 87 | return outputs 88 | 89 | 90 | if __name__ == "__main__": 91 | vlm = VLM("Efficient-Large-Model/VILA1.5-13b") 92 | vlm.test_vlm() 93 | -------------------------------------------------------------------------------- /examples/vllm_registry.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from dataclasses import dataclass 3 | from PIL import Image 4 | from vllm.lora.request import LoRARequest 5 | from transformers import AutoProcessor 6 | 7 | 8 | @dataclass 9 | class ModelRequestData: 10 | prompt: str 11 | image_data: Optional[list[Image.Image]] 12 | stop_token_ids: Optional[list[int]] = None 13 | chat_template: Optional[str] = None 14 | lora_requests: Optional[list[LoRARequest]] = None 15 | 16 | 17 | class VLLMModelRegistry: 18 | def __init__(self, model_name: str): 19 | self.model_name = model_name 20 | self.processor = AutoProcessor.from_pretrained( 21 | model_name, trust_remote_code=True 22 | ) 23 | self.loader_map = { 24 | "Qwen/Qwen2.5-VL-3B-Instruct": self.load_qwen2_5_vl, 25 | "Qwen/Qwen2.5-VL-7B-Instruct": self.load_qwen2_5_vl, 26 | "Qwen/Qwen2.5-VL-32B-Instruct": self.load_qwen2_5_vl, 27 | "Qwen/Qwen2.5-VL-72B-Instruct": self.load_qwen2_5_vl, 28 | "google/gemma-3-4b-it": self.load_gemma3, 29 | "google/gemma-3-12b-it": self.load_gemma3, 30 | "google/gemma-3-27b-it": self.load_gemma3, 31 | } 32 | 33 | def get_engine_config(self, model_id: str) -> dict: 34 | return { 35 | "max_model_len": 32768, 36 | "max_num_seqs": 5, 37 | "limit_mm_per_prompt": {"image": 5}, 38 | "trust_remote_code": True, 39 | } 40 | 41 | def load_qwen2_5_vl( 42 | self, text: str, images: list[Image.Image] | None 43 | ) -> ModelRequestData: 44 | try: 45 | from qwen_vl_utils import process_vision_info 46 | except ModuleNotFoundError: 47 | print( 48 | "WARNING: `qwen-vl-utils` not installed, input images will not " 49 | "be automatically resized. You can enable this functionality by " 50 | "`pip install qwen-vl-utils`." 
51 | ) 52 | process_vision_info = None 53 | 54 | if images is None: 55 | images = [] 56 | 57 | placeholders = [{"type": "image", "image": image} for image in images] 58 | messages = [ 59 | { 60 | "role": "user", 61 | "content": [ 62 | *placeholders, 63 | {"type": "text", "text": text}, 64 | ], 65 | } 66 | ] 67 | 68 | prompt = self.processor.apply_chat_template( 69 | messages, tokenize=False, add_generation_prompt=True 70 | ) 71 | 72 | if process_vision_info is None: 73 | image_data = images 74 | else: 75 | image_data, _ = process_vision_info(messages, return_video_kwargs=False) 76 | 77 | return ModelRequestData( 78 | prompt=prompt, 79 | image_data=image_data, 80 | ) 81 | 82 | def load_gemma3( 83 | self, text: str, images: list[Image.Image] | None 84 | ) -> ModelRequestData: 85 | if images is None: 86 | images = [] 87 | 88 | placeholders = [{"type": "image", "image": image} for image in images] 89 | messages = [ 90 | { 91 | "role": "user", 92 | "content": [ 93 | *placeholders, 94 | {"type": "text", "text": text}, 95 | ], 96 | } 97 | ] 98 | 99 | prompt = self.processor.apply_chat_template( 100 | messages, tokenize=False, add_generation_prompt=True 101 | ) 102 | 103 | return ModelRequestData( 104 | prompt=prompt, 105 | image_data=images, 106 | ) 107 | -------------------------------------------------------------------------------- /examples/xcomposer2d5.py: -------------------------------------------------------------------------------- 1 | # flash-attn is required to run this example. 2 | # 3 | import torch 4 | from transformers import AutoModel, AutoTokenizer 5 | import os 6 | from utils import GenerationConfig 7 | from base_vlm import BaseVLM 8 | from PIL import Image 9 | 10 | torch.set_grad_enabled(False) 11 | 12 | 13 | class VLM(BaseVLM): 14 | def __init__(self, model_id: str = "internlm/internlm-xcomposer2d5-7b") -> None: 15 | self.model_id = model_id 16 | self.model = ( 17 | AutoModel.from_pretrained( 18 | self.model_id, 19 | torch_dtype=torch.bfloat16, 20 | trust_remote_code=True, 21 | ) 22 | .cuda() 23 | .eval() 24 | .half() 25 | ) 26 | self.tokenizer = AutoTokenizer.from_pretrained( 27 | self.model_id, trust_remote_code=True 28 | ) 29 | self.model.tokenizer = self.tokenizer 30 | 31 | def generate( 32 | self, 33 | images: list[Image.Image] | None, 34 | text: str, 35 | gen_kwargs: GenerationConfig = GenerationConfig(), 36 | ) -> str: 37 | if images is None: 38 | images = [] 39 | if "" not in text: 40 | image_tokens = "".join( 41 | [f"Image{i} ; " for i in range(1, len(images) + 1)] 42 | ) 43 | text = f"{image_tokens}{text}" 44 | # make tmp files 45 | os.makedirs("tmp", exist_ok=True) 46 | image_files = [] 47 | for i, img in enumerate(images): 48 | file_path = f"tmp/image_{i}.jpg" 49 | img.save(file_path) 50 | image_files.append(file_path) 51 | 52 | with torch.autocast(device_type="cuda", dtype=torch.float16): 53 | response, _ = self.model.chat( 54 | self.tokenizer, 55 | text, 56 | image_files, 57 | generation_config=gen_kwargs.__dict__, 58 | ) 59 | 60 | # remove tmp files 61 | for file_path in image_files: 62 | os.remove(file_path) 63 | return response 64 | 65 | 66 | if __name__ == "__main__": 67 | model = VLM() 68 | model.test_vlm() 69 | -------------------------------------------------------------------------------- /github_pages/.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 
2 | 3 | # dependencies 4 | /node_modules 5 | /.pnp 6 | .pnp.js 7 | 8 | # testing 9 | /coverage 10 | 11 | # production 12 | /build 13 | 14 | # misc 15 | .DS_Store 16 | .env.local 17 | .env.development.local 18 | .env.test.local 19 | .env.production.local 20 | 21 | npm-debug.log* 22 | yarn-debug.log* 23 | yarn-error.log* 24 | 25 | # Byte-compiled / optimized / DLL files 26 | __pycache__/ 27 | *.py[cod] 28 | *$py.class 29 | 30 | # C extensions 31 | *.so 32 | 33 | # Distribution / packaging 34 | .Python 35 | build/ 36 | develop-eggs/ 37 | dist/ 38 | downloads/ 39 | eggs/ 40 | .eggs/ 41 | lib/ 42 | lib64/ 43 | parts/ 44 | sdist/ 45 | var/ 46 | wheels/ 47 | share/python-wheels/ 48 | *.egg-info/ 49 | .installed.cfg 50 | *.egg 51 | MANIFEST 52 | 53 | # PyInstaller 54 | # Usually these files are written by a python script from a template 55 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 56 | *.manifest 57 | *.spec 58 | 59 | # Installer logs 60 | pip-log.txt 61 | pip-delete-this-directory.txt 62 | 63 | # Unit test / coverage reports 64 | htmlcov/ 65 | .tox/ 66 | .nox/ 67 | .coverage 68 | .coverage.* 69 | .cache 70 | nosetests.xml 71 | coverage.xml 72 | *.cover 73 | *.py,cover 74 | .hypothesis/ 75 | .pytest_cache/ 76 | cover/ 77 | 78 | # Translations 79 | *.mo 80 | *.pot 81 | 82 | # Django stuff: 83 | *.log 84 | local_settings.py 85 | db.sqlite3 86 | db.sqlite3-journal 87 | 88 | # Flask stuff: 89 | instance/ 90 | .webassets-cache 91 | 92 | # Scrapy stuff: 93 | .scrapy 94 | 95 | # Sphinx documentation 96 | docs/_build/ 97 | 98 | # PyBuilder 99 | .pybuilder/ 100 | target/ 101 | 102 | # Jupyter Notebook 103 | .ipynb_checkpoints 104 | 105 | # IPython 106 | profile_default/ 107 | ipython_config.py 108 | 109 | # pyenv 110 | # For a library or package, you might want to ignore these files since the code is 111 | # intended to run in multiple environments; otherwise, check them in: 112 | # .python-version 113 | 114 | # pipenv 115 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 116 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 117 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 118 | # install all needed dependencies. 119 | #Pipfile.lock 120 | 121 | # UV 122 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 123 | # This is especially recommended for binary packages to ensure reproducibility, and is more 124 | # commonly ignored for libraries. 125 | #uv.lock 126 | 127 | # poetry 128 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 129 | # This is especially recommended for binary packages to ensure reproducibility, and is more 130 | # commonly ignored for libraries. 131 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 132 | #poetry.lock 133 | 134 | # pdm 135 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 136 | #pdm.lock 137 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 138 | # in version control. 139 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 140 | .pdm.toml 141 | .pdm-python 142 | .pdm-build/ 143 | 144 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 145 | __pypackages__/ 146 | 147 | # Celery stuff 148 | celerybeat-schedule 149 | celerybeat.pid 150 | 151 | # SageMath parsed files 152 | *.sage.py 153 | 154 | # Environments 155 | .env 156 | .venv 157 | env/ 158 | venv/ 159 | ENV/ 160 | env.bak/ 161 | venv.bak/ 162 | 163 | # Spyder project settings 164 | .spyderproject 165 | .spyproject 166 | 167 | # Rope project settings 168 | .ropeproject 169 | 170 | # mkdocs documentation 171 | /site 172 | 173 | # mypy 174 | .mypy_cache/ 175 | .dmypy.json 176 | dmypy.json 177 | 178 | # Pyre type checker 179 | .pyre/ 180 | 181 | # pytype static type analyzer 182 | .pytype/ 183 | 184 | # Cython debug symbols 185 | cython_debug/ 186 | 187 | # PyCharm 188 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 189 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 190 | # and can be added to the global gitignore or merged into this file. For a more nuclear 191 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 192 | #.idea/ 193 | 194 | # PyPI configuration file 195 | .pypirc 196 | -------------------------------------------------------------------------------- /github_pages/.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "semi": true, 3 | "singleQuote": true, 4 | "jsxSingleQuote": true, 5 | "trailingComma": "all", 6 | "tabWidth": 2, 7 | } 8 | -------------------------------------------------------------------------------- /github_pages/README.md: -------------------------------------------------------------------------------- 1 | # llm-jp-eval-mm.github.io 2 | 3 | This repository is a source code for the llm-jp-eval-mm leaderboard website. 4 | [llm-jp-eval-mm](https://github.com/llm-jp/llm-jp-eval-mm) is used to evaluate the VLMs on the Japanese benchmark. 5 | 6 | ## How to develop 7 | ```bash 8 | cd github_pages 9 | sudo apt install -y nodejs npm 10 | sudo npm install n -g 11 | npm install 12 | npm run start 13 | ``` 14 | 15 | You may need to remove "homepage" from `github_pages/package.json` to start in the local environment. 16 | ```diff 17 | { 18 | "name": "github_pages", 19 | "version": "0.1.0", 20 | -- "homepage": "https://llm-jp.github.io/llm-jp-eval-mm", 21 | } 22 | ``` 23 | 24 | ## How to deploy 25 | ```bash 26 | cd github_pages 27 | npm run deploy 28 | ``` 29 | 30 | ## Add benchmark results to the leaderboard 31 | Please add the benchmark results to the `github_pages/public/leaderboard.json` file. 32 | The format of the benchmark results is as follows. 33 | ```json 34 | { 35 | "model": "Japanese InstructBLIP Alpha", 36 | "url": "https://huggingface.co/stabilityai/japanese-instructblip-alpha", 37 | "scores": { 38 | "Heron": { 39 | "conv": 22.8, 40 | "detail": 24.1, 41 | "complex": 19.5, 42 | "overall": 22.7 43 | }, 44 | "JVB-ItW": { "llm": 1.31, "rouge": 13.8 }, 45 | "MulIm-VQA": { "llm": 2.5, "rouge": 25.0 }, 46 | "JDocQA": { "Acc": 0.123, "llm": 1.9 }, 47 | "JMMMU": { "Acc": 0.271 } 48 | } 49 | }, 50 | ``` 51 | 52 | ## Format the code 53 | ```bash 54 | npx prettier --write "./**/*.{js,jsx,ts,tsx,css,html}" 55 | ``` 56 | 57 | 58 | ## Reference 59 | This repository refers to the following repositories. Thank you for your great work. 
60 | - https://github.com/MMMU-Japanese-Benchmark/JMMMU 61 | - https://github.com/MMMU-Benchmark/mmmu-benchmark.github.io 62 | -------------------------------------------------------------------------------- /github_pages/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "github_pages", 3 | "version": "0.1.0", 4 | "homepage": "https://llm-jp.github.io/llm-jp-eval-mm", 5 | "private": true, 6 | "dependencies": { 7 | "cra-template": "1.2.0", 8 | "format": "^0.2.2", 9 | "react": "^19.0.0", 10 | "react-dom": "^19.0.0", 11 | "react-icons": "^5.4.0", 12 | "react-scripts": "5.0.1" 13 | }, 14 | "scripts": { 15 | "predeploy": "npm run build", 16 | "deploy": "gh-pages -d build", 17 | "start": "react-scripts start", 18 | "build": "react-scripts build", 19 | "test": "react-scripts test", 20 | "eject": "react-scripts eject" 21 | }, 22 | "eslintConfig": { 23 | "extends": [ 24 | "react-app", 25 | "react-app/jest" 26 | ] 27 | }, 28 | "browserslist": { 29 | "production": [ 30 | ">0.2%", 31 | "not dead", 32 | "not op_mini all" 33 | ], 34 | "development": [ 35 | "last 1 chrome version", 36 | "last 1 firefox version", 37 | "last 1 safari version" 38 | ] 39 | }, 40 | "devDependencies": { 41 | "gh-pages": "^6.3.0", 42 | "prettier": "^3.4.2", 43 | "web-vitals": "^4.2.4" 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /github_pages/public/dataset_url.json: -------------------------------------------------------------------------------- 1 | { 2 | "Heron": { 3 | "url": "https://huggingface.co/datasets/turing-motors/Japanese-Heron-Bench" 4 | }, 5 | "JVB-ItW": { 6 | "url": "https://huggingface.co/datasets/SakanaAI/JA-VLM-Bench-In-the-Wild" 7 | }, 8 | "VG-VQA": { 9 | "url": "https://huggingface.co/datasets/SakanaAI/JA-VG-VQA-500" 10 | }, 11 | "MulIm-VQA": { 12 | "url": "https://huggingface.co/datasets/SakanaAI/JA-Multi-Image-VQA" 13 | }, 14 | "JDocQA": { 15 | "url": "https://github.com/mizuumi/JDocQA" 16 | }, 17 | "JMMMU": { 18 | "url": "https://huggingface.co/datasets/JMMMU/JMMMU" 19 | }, 20 | "MMMU": { 21 | "url": "https://huggingface.co/datasets/MMMU/MMMU" 22 | }, 23 | "LLavaB": { 24 | "url": "https://huggingface.co/datasets/lmms-lab/llava-bench-in-the-wild" 25 | }, 26 | "JIC": { 27 | "url": "https://huggingface.co/datasets/line-corporation/JIC-VQA" 28 | }, 29 | "MECHA": { 30 | "url": "https://huggingface.co/datasets/llm-jp/MECHA-ja" 31 | }, 32 | "CC-OCR": { 33 | "url": "https://huggingface.co/datasets/wulipc/CC-OCR" 34 | }, 35 | "CVQA": { 36 | "url": "https://huggingface.co/datasets/afaji/cvqa" 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /github_pages/public/default_metrics.json: -------------------------------------------------------------------------------- 1 | { 2 | "default_metrics": { 3 | "Heron": "overall", 4 | "JVB-ItW": "rouge", 5 | "VGVQA": "rouge", 6 | "MulIm-VQA": "rouge", 7 | "JDocQA": "Acc", 8 | "JMMMU": "Acc", 9 | "MMMU": "Acc", 10 | "LlavaB-ItW": "rouge" 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /github_pages/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 11 | 12 | 13 | 18 | llm-jp-eval-mm 19 | 20 | 21 |
22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /github_pages/src/Author.css: -------------------------------------------------------------------------------- 1 | .author { 2 | position: relative; 3 | display: inline-block; 4 | } 5 | 6 | .author-name { 7 | color: inherit; 8 | font-size: 1.5rem; 9 | font-weight: 500; 10 | line-height: 1.2; 11 | } 12 | 13 | .author-website { 14 | color: inherit; 15 | text-decoration: none; 16 | position: relative; 17 | } 18 | 19 | .author-annotation { 20 | position: relative; 21 | bottom: 0.5rem; 22 | font-size: 1rem; 23 | font-weight: 500; 24 | } 25 | 26 | .author-separator { 27 | position: relative; 28 | font-size: 1.5rem; 29 | font-weight: 500; 30 | } 31 | -------------------------------------------------------------------------------- /github_pages/src/Author.js: -------------------------------------------------------------------------------- 1 | import './Author.css'; 2 | 3 | const AFFILIATION_COLORS = [ 4 | '', 5 | '#6fbf73', 6 | '#ed4b82', 7 | '#9400d3', 8 | '#4169E1', 9 | '#ffac33', 10 | '#1e90ff', 11 | '#ff69b4', 12 | ]; 13 | export { AFFILIATION_COLORS }; 14 | 15 | const Author = ({ 16 | name, 17 | affiliation, 18 | annotation1, 19 | annotation2, 20 | url, 21 | isLast, 22 | }) => { 23 | return ( 24 |
25 | {url ? ( 26 | 32 | {name} 33 | 34 | ) : ( 35 | {name} 36 | )} 37 | 38 | {annotation1} 39 | {affiliation.map((num, index) => ( 40 | 41 | {num} 42 | {index < affiliation.length - 1 && ', '} 43 | 44 | ))} 45 | {annotation2} 46 | 47 | {(isLast === undefined || !isLast) && ( 48 | {','}  49 | )} 50 |
51 | ); 52 | }; 53 | export default Author; 54 | -------------------------------------------------------------------------------- /github_pages/src/BibTex.css: -------------------------------------------------------------------------------- 1 | .bibtex-title { 2 | font-size: 2rem; 3 | font-weight: 600; 4 | color: #363636; 5 | margin-block: 2.5rem 1.5rem; 6 | } 7 | 8 | .bibtex-entry { 9 | width: calc(100% - 1.5rem); 10 | position: relative; 11 | background-color: #eae5e3; 12 | color: #0f2350; 13 | font-size: 0.75rem; 14 | padding: 0.75rem; 15 | border-radius: 0.75rem; 16 | margin: 0; 17 | text-align: left; 18 | display: inline-block; 19 | white-space: pre-wrap; 20 | word-wrap: break-word; 21 | } 22 | 23 | .bibtex-copy-button { 24 | position: absolute; 25 | top: 0; 26 | right: 0; 27 | background-color: rgba(255, 255, 255, 0); 28 | color: #595857; 29 | } 30 | @media (hover: hover) { 31 | .bibtex-copy-button:hover { 32 | color: #ba2636; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /github_pages/src/BibTex.js: -------------------------------------------------------------------------------- 1 | import { LuCopy } from 'react-icons/lu'; 2 | import './BibTex.css'; 3 | 4 | // BibTeX entry for the paper 5 | const BIBTEX_ENTRY = `@inproceedings{maeda2025llm-jp-eval-mm, 6 | author = {前田 航希 and 杉浦 一瑳 and 小田 悠介 and 栗田 修平 and 岡崎 直観}, 7 | month = mar, 8 | series = {言語処理学会第31回年次大会 (NLP2025)}, 9 | title = {{llm-jp-eval-mm: 日本語視覚言語モデルの自動評価基盤}}, 10 | year = {2025} 11 | } 12 | `; 13 | 14 | const copyClipboard = () => { 15 | navigator.clipboard.writeText(BIBTEX_ENTRY).catch((error) => { 16 | console.error('Failed to copy BibTeX entry to clipboard', error); 17 | }); 18 | }; 19 | 20 | const BibTeX = () => { 21 | return ( 22 |
23 |

BibTeX

24 |
25 |         {BIBTEX_ENTRY}
26 |         
29 |       
30 |
31 | ); 32 | }; 33 | 34 | export default BibTeX; 35 | -------------------------------------------------------------------------------- /github_pages/src/Figure.css: -------------------------------------------------------------------------------- 1 | .figure { 2 | padding: 1.5rem 0rem; 3 | margin: 0; 4 | display: flex; 5 | flex-direction: column; 6 | align-items: center; 7 | justify-content: center; 8 | } 9 | 10 | .figure-image { 11 | width: 100%; 12 | max-width: 960px; 13 | height: auto; 14 | object-fit: contain; 15 | } 16 | 17 | .figure-caption { 18 | padding: 0.5rem; 19 | } 20 | -------------------------------------------------------------------------------- /github_pages/src/Figure.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import './Figure.css'; 3 | 4 | const Figure = ({ image, altText, caption }) => { 5 | return ( 6 |
7 | {altText} 8 |
{caption}
9 |
10 | ); 11 | }; 12 | export default Figure; 13 | -------------------------------------------------------------------------------- /github_pages/src/Footer.css: -------------------------------------------------------------------------------- 1 | .footer { 2 | width: 100%; 3 | margin: 0.5rem 0rem 0rem 0rem; 4 | padding: 0.5rem 0rem 0.5rem 0rem; 5 | font-size: 0.75rem; 6 | } 7 | -------------------------------------------------------------------------------- /github_pages/src/Footer.js: -------------------------------------------------------------------------------- 1 | import './Footer.css'; 2 | 3 | const Footer = () => { 4 | return ( 5 | 58 | ); 59 | }; 60 | 61 | export default Footer; 62 | -------------------------------------------------------------------------------- /github_pages/src/Introduction.css: -------------------------------------------------------------------------------- 1 | .introduction-title { 2 | font-size: 2rem; 3 | font-weight: 600; 4 | color: #363636; 5 | margin-block: 2.5rem 1.5rem; 6 | } 7 | 8 | .introduction-content { 9 | text-align: justify; 10 | } 11 | 12 | a { 13 | color: inherit; 14 | text-decoration: underline; 15 | } 16 | -------------------------------------------------------------------------------- /github_pages/src/Introduction.js: -------------------------------------------------------------------------------- 1 | import Figure from './Figure'; 2 | import overviewFigure from './assets/teaser.png'; 3 | import './Introduction.css'; 4 | 5 | const Introduction = () => { 6 | return ( 7 |
8 |

Introduction

9 |
10 | 11 | We introduce llm-jp-eval-mm, a toolkit for evaluating multiple 12 | multimodal tasks related to Japanese language performance in a unified 13 | environment. The toolkit is a benchmarking platform that integrates 14 | six existing Japanese multimodal tasks and consistently evaluates 15 | model outputs across multiple metrics. This paper outlines the design 16 | of llm-jp-eval-mm for its construction and ongoing development, 17 | reports the results of evaluating 13 publicly available Japanese and 18 | multilingual VLMs, and discusses the findings in the light of existing 19 | research. 20 | 21 |
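The evaluation flow summarized in this paragraph is the same loop driven by examples/sample_vllm.py earlier in this dump: load a task from TaskRegistry, generate predictions with a VLM, then score them with one or more scorers from ScorerRegistry. The sketch below is a hedged illustration of that loop, restricted to the eval_mm names that appear verbatim in sample_vllm.py; the echo-style "predictions" stand in for a real model wrapper, and credential requirements are noted in the comments.

```python
# Rough sketch of the evaluation loop implemented in examples/sample_vllm.py.
# Only eval_mm calls shown in that script are used; the "echo" predictions below
# are a placeholder for a real VLM wrapper such as examples/base_vllm.py.
import eval_mm

task = eval_mm.TaskRegistry.load_task(
    "japanese-heron-bench",
    eval_mm.TaskConfig(max_dataset_len=10, rotate_choices=False),
)

# Placeholder predictions: a real run would call model.generate(images, text, gen_kwargs)
# per document, as the scripts under examples/ do.
preds = [
    {"question_id": task.doc_to_id(doc), "text": task.doc_to_text(doc)}
    for doc in task.dataset
]

# "heron-bench" uses an LLM judge, so OPENAI_API_KEY or the Azure variables
# from .env.sample must be configured before this will actually run.
scorer = eval_mm.ScorerRegistry.load_scorer(
    "heron-bench",
    eval_mm.ScorerConfig(
        docs=task.dataset,
        judge_model="gpt-4o-2024-11-20",
        batch_size=10,
        client=eval_mm.OpenAIChatAPI(),
        random_choice=False,
    ),
)
scores = scorer.score(
    [task.doc_to_answer(doc) for doc in task.dataset],
    [pred["text"] for pred in preds],
)
print(scorer.aggregate(scores))
```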
22 |
27 | Figure 1: Overview of llm-jp-eval-mm. 28 | 29 | } 30 | /> 31 |
32 | ); 33 | }; 34 | 35 | export default Introduction; 36 | -------------------------------------------------------------------------------- /github_pages/src/Leaderboard.css: -------------------------------------------------------------------------------- 1 | .Leaderboard { 2 | display: flex; 3 | flex-direction: column; 4 | align-items: center; 5 | justify-content: center; 6 | width: 100%; 7 | } 8 | 9 | .leaderboard-title { 10 | font-size: 2rem; 11 | font-weight: 600; 12 | color: #363636; 13 | margin-block: 2.5rem 1.5rem; 14 | } 15 | 16 | .table-container { 17 | width: calc(100vw - 2rem); 18 | overflow-x: auto; 19 | /* 横スクロールを有効化 */ 20 | border: 1px solid #ccc; 21 | margin-top: 10px; 22 | } 23 | 24 | table { 25 | border-collapse: collapse; 26 | width: 100%; 27 | 28 | /* スクロールを促すための最小幅 */ 29 | } 30 | 31 | thead { 32 | position: sticky; 33 | top: 0; 34 | background-color: #f1f1f1; 35 | z-index: 1; 36 | } 37 | 38 | table th, 39 | table td { 40 | border: 1px solid #ddd; 41 | padding: 8px; 42 | text-align: center; 43 | font-size: 14px; 44 | white-space: nowrap; 45 | /* 折り返しを無効化 */ 46 | transition: background-color 0.2s ease; 47 | } 48 | 49 | table th { 50 | background-color: #f4f4f4; 51 | cursor: pointer; 52 | font-size: 14px; 53 | } 54 | 55 | table tr:hover { 56 | background-color: #f9f9f9; 57 | } 58 | 59 | /* スマホ向けのメディアクエリ */ 60 | @media (max-width: 768px) { 61 | .Leaderboard { 62 | padding: 10px; 63 | } 64 | 65 | .leaderboard-title { 66 | font-size: 1.5rem; 67 | margin-block: 1.5rem 1rem; 68 | } 69 | 70 | table th, 71 | table td { 72 | font-size: 12px; 73 | /* スマホ向けに文字サイズを縮小 */ 74 | padding: 6px; 75 | /* セル内の余白を調整 */ 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /github_pages/src/Leaderboard.js: -------------------------------------------------------------------------------- 1 | import React, { useEffect, useState } from 'react'; 2 | import './Leaderboard.css'; 3 | 4 | function Leaderboard() { 5 | const [data, setData] = useState([]); 6 | const [datasets, setDatasets] = useState([]); 7 | const [datasetUrl, setDatasetUrl] = useState({}); 8 | 9 | const [metrics, setMetrics] = useState({}); 10 | const [defaultMetrics, setDefaultMetrics] = useState({}); 11 | const [sortConfig, setSortConfig] = useState(null); 12 | 13 | useEffect(() => { 14 | // Fetch leaderboard data 15 | fetch('leaderboard.json') 16 | .then((response) => response.json()) 17 | .then((data) => { 18 | setData(data); 19 | 20 | // Extract datasets and metrics dynamically 21 | const datasetNames = data 22 | .reduce((acc, row) => { 23 | return acc.concat(Object.keys(row.scores)); 24 | }, []) 25 | .filter((value, index, self) => self.indexOf(value) === index); 26 | setDatasets(datasetNames); 27 | 28 | const metricNames = {}; 29 | datasetNames.forEach((dataset) => { 30 | const metricNamesArray = data 31 | .reduce((acc, row) => { 32 | return acc.concat(Object.keys(row.scores[dataset] || {})); 33 | }, []) 34 | .filter((value, index, self) => self.indexOf(value) === index); 35 | metricNames[dataset] = metricNamesArray; 36 | }); 37 | setMetrics(metricNames); 38 | }) 39 | .catch((error) => 40 | console.error('Error loading leaderboard data:', error), 41 | ); 42 | 43 | // Fetch default metrics 44 | fetch('default_metrics.json') 45 | .then((response) => response.json()) 46 | .then((defaultMetrics) => { 47 | setDefaultMetrics(defaultMetrics.default_metrics); // Use the `default_metrics` field 48 | }) 49 | .catch((error) => console.error('Error loading default metrics:', error)); 50 | 51 | 
// Fetch dataset url 52 | // { 53 | // { 54 | // "Heron": { 55 | // "url": "https://huggingface.co/datasets/turing-motors/Japanese-Heron-Bench" 56 | // }, 57 | // "JVB-ItW": { 58 | // "url": "https://huggingface.co/datasets/SakanaAI/JA-VLM-Bench-In-the-Wild" 59 | // }, 60 | // "VGVQA": { 61 | // "url": "https://huggingface.co/datasets/SakanaAI/JA-VG-VQA-500" 62 | // }, 63 | // "MulIm-VQA": { 64 | // "url": "https://huggingface.co/datasets/SakanaAI/JA-Multi-Image-VQA" 65 | // }, 66 | // "JDocQA": { 67 | // "url": "https://huggingface.co/datasets/shunk031/JDocQA" 68 | // }, 69 | // "JMMMU": { 70 | // "url": "https://huggingface.co/datasets/JMMMU/JMMMU" 71 | // } 72 | // } 73 | fetch('dataset_url.json') 74 | .then((response) => response.json()) 75 | .then((datasetUrl) => { 76 | setDatasetUrl(datasetUrl); 77 | }); 78 | }, []); 79 | 80 | const handleSort = (dataset, metric) => { 81 | let sortedData = [...data]; 82 | const direction = 83 | sortConfig?.key === `${dataset}-${metric}` && 84 | sortConfig.direction === 'asc' 85 | ? 'desc' 86 | : 'asc'; 87 | sortedData.sort((a, b) => { 88 | const aValue = a.scores[dataset]?.[metric] || 0; 89 | const bValue = b.scores[dataset]?.[metric] || 0; 90 | if (aValue < bValue) return direction === 'asc' ? -1 : 1; 91 | if (aValue > bValue) return direction === 'asc' ? 1 : -1; 92 | return 0; 93 | }); 94 | setSortConfig({ key: `${dataset}-${metric}`, direction }); 95 | setData(sortedData); 96 | }; 97 | 98 | const getSortArrow = (dataset, metric) => { 99 | if (sortConfig?.key === `${dataset}-${metric}`) { 100 | return sortConfig.direction === 'asc' ? '↑' : '↓'; 101 | } 102 | return '↕'; 103 | }; 104 | 105 | return ( 106 |
107 |

Leaderboard

108 | 109 |
110 | 111 | 112 | 113 | 114 | {datasets.map((dataset) => ( 115 | 118 | ))} 119 | 120 | 121 | 122 | {datasets.map((dataset) => 123 | metrics[dataset]?.map((metric) => ( 124 | 130 | )), 131 | )} 132 | 133 | 134 | 135 | {data.map((item, index) => ( 136 | 137 | 140 | {datasets.map((dataset) => 141 | metrics[dataset]?.map((metric) => ( 142 | 152 | )), 153 | )} 154 | 155 | ))} 156 | 157 |
Model 116 | {dataset} 117 |
handleSort(dataset, metric)} 127 | > 128 | {metric} {getSortArrow(dataset, metric)} 129 |
138 | {item.model} 139 | 150 | {item.scores[dataset]?.[metric]?.toFixed(1) || '-'} 151 |
158 |
159 |
160 | ); 161 | } 162 | export default Leaderboard; 163 | -------------------------------------------------------------------------------- /github_pages/src/LinkButton.css: -------------------------------------------------------------------------------- 1 | .link-button { 2 | height: 2.5rem; 3 | /* width: 8rem; */ 4 | border: none; 5 | border-radius: 1.25rem; 6 | padding: 0rem 1.25rem; 7 | margin: 0.25rem; 8 | display: inline-flex; 9 | align-items: center; 10 | justify-content: center; 11 | background-color: #2f2f2f; 12 | color: #ffffff; 13 | font-size: 1rem; 14 | font-weight: 400; 15 | cursor: pointer; 16 | } 17 | 18 | @media (hover: hover) { 19 | .link-button:hover { 20 | background-color: #595857; 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /github_pages/src/LinkButton.js: -------------------------------------------------------------------------------- 1 | import './LinkButton.css'; 2 | 3 | const LinkButton = ({ url, children }) => { 4 | const handleClick = () => window.open(url, '_blank', 'noopener noreferrer'); 5 | 6 | return ( 7 | 10 | ); 11 | }; 12 | export default LinkButton; 13 | -------------------------------------------------------------------------------- /github_pages/src/Main.css: -------------------------------------------------------------------------------- 1 | #root { 2 | margin: 0 auto; 3 | padding: 0; 4 | text-align: center; 5 | } 6 | -------------------------------------------------------------------------------- /github_pages/src/Main.js: -------------------------------------------------------------------------------- 1 | import BibTex from './BibTex'; 2 | // import Example from "./Example"; 3 | import Introduction from './Introduction'; 4 | import Leaderboard from './Leaderboard'; 5 | import PaperMetaData from './PaperMetaData'; 6 | 7 | import PageLayout from './PageLayout'; 8 | import './Main.css'; 9 | 10 | const Main = () => { 11 | return ( 12 | 13 | 14 | 15 | {/* */} 16 | {/* */} 17 | 18 | {/* */} 19 | 20 | 21 | ); 22 | }; 23 | 24 | export default Main; 25 | -------------------------------------------------------------------------------- /github_pages/src/Method.css: -------------------------------------------------------------------------------- 1 | .method-title { 2 | font-size: 2rem; 3 | font-weight: 600; 4 | color: #363636; 5 | margin-block: 2.5rem 1.5rem; 6 | } 7 | 8 | .method-content { 9 | text-align: justify; 10 | } 11 | -------------------------------------------------------------------------------- /github_pages/src/Method.js: -------------------------------------------------------------------------------- 1 | import Figure from './Figure'; 2 | import './Method.css'; 3 | 4 | const Method = () => { 5 | return ( 6 |
7 |

Design of llm-jp-eval-mm

8 |
TODO:
9 |
10 | ); 11 | }; 12 | 13 | export default Method; 14 | -------------------------------------------------------------------------------- /github_pages/src/PageLayout.css: -------------------------------------------------------------------------------- 1 | .main-content { 2 | margin: 0 auto; 3 | max-width: 1120px; 4 | padding: 1rem; 5 | } 6 | -------------------------------------------------------------------------------- /github_pages/src/PageLayout.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import Footer from './Footer'; 3 | 4 | import './PageLayout.css'; 5 | 6 | const PageLayout = ({ children }) => { 7 | return ( 8 | <> 9 |
{children}
10 |