├── eval ├── gen │ ├── gedit │ │ └── viescore │ │ │ ├── mllm_tools │ │ │ ├── __init__.py │ │ │ ├── utils.py │ │ │ └── qwen25vl_eval.py │ │ │ ├── parse_prompt.py │ │ │ └── __init__.py │ ├── geneval │ │ ├── evaluation │ │ │ ├── download_models.sh │ │ │ ├── object_names.txt │ │ │ └── summary_scores.py │ │ └── prompts │ │ │ ├── object_names.txt │ │ │ └── create_prompts.py │ ├── imgedit │ │ ├── step1_get_avgscore.py │ │ ├── step2_typescore.py │ │ └── basic_bench.py │ └── wise │ │ └── cal_score.py ├── __init__.py └── vlm │ ├── __init__.py │ ├── eval │ ├── mathvista │ │ ├── prompts │ │ │ └── ext_ans.py │ │ ├── extract_answer.py │ │ ├── extract_answer_mp.py │ │ ├── utilities.py │ │ └── evaluate_mathvista.py │ ├── mme │ │ ├── eval.py │ │ ├── Your_Results │ │ │ ├── OCR.txt │ │ │ ├── numerical_calculation.txt │ │ │ ├── code_reasoning.txt │ │ │ ├── existence.txt │ │ │ ├── color.txt │ │ │ ├── count.txt │ │ │ ├── position.txt │ │ │ └── text_translation.txt │ │ └── calculation.py │ ├── mmvet │ │ └── evaluate_mmvet.py │ ├── pope │ │ └── eval_pope.py │ └── mmmu │ │ ├── main_eval_only.py │ │ └── data_utils.py │ ├── utils.py │ └── evaluate.sh ├── assets ├── arch.png ├── teaser.webp └── emerging_curves.png ├── test_images ├── meme.jpg ├── women.jpg └── octupusy.jpg ├── data ├── __init__.py ├── interleave_datasets │ ├── __init__.py │ └── edit_dataset.py ├── configs │ └── example.yaml ├── dataset_info.py ├── distributed_iterable_dataset.py ├── parquet_utils.py ├── t2i_dataset.py ├── video_utils.py └── data_utils.py ├── train ├── __init__.py └── train_utils.py ├── .gitignore ├── modeling ├── __init__.py ├── bagel │ ├── __init__.py │ └── modeling_utils.py ├── qwen2 │ ├── __init__.py │ └── tokenization_qwen2_fast.py └── siglip │ ├── __init__.py │ └── processing_siglip.py ├── scripts ├── eval │ ├── run_eval_vlm.sh │ ├── eval_vlm.sh │ ├── run_rise.sh │ ├── run_geneval.sh │ ├── run_kris.sh │ ├── run_wise.sh │ ├── run_imgedit.sh │ └── run_gedit.sh └── train.sh └── requirements.txt /eval/gen/gedit/viescore/mllm_tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Liuziyu77/Bagel_L/main/assets/arch.png -------------------------------------------------------------------------------- /assets/teaser.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Liuziyu77/Bagel_L/main/assets/teaser.webp -------------------------------------------------------------------------------- /test_images/meme.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Liuziyu77/Bagel_L/main/test_images/meme.jpg -------------------------------------------------------------------------------- /test_images/women.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Liuziyu77/Bagel_L/main/test_images/women.jpg -------------------------------------------------------------------------------- /test_images/octupusy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Liuziyu77/Bagel_L/main/test_images/octupusy.jpg -------------------------------------------------------------------------------- /assets/emerging_curves.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Liuziyu77/Bagel_L/main/assets/emerging_curves.png -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 -------------------------------------------------------------------------------- /eval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 -------------------------------------------------------------------------------- /train/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 -------------------------------------------------------------------------------- /eval/vlm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | wandb 2 | __pycache__ 3 | .vscode 4 | notebooks 5 | results 6 | *.ipynb_checkpoints 7 | eval_results 8 | tests 9 | .DS_Store 10 | gradio.sh -------------------------------------------------------------------------------- /modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from . import bagel, qwen2, siglip, autoencoder -------------------------------------------------------------------------------- /data/interleave_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from .edit_dataset import UnifiedEditIterableDataset 5 | 6 | -------------------------------------------------------------------------------- /modeling/bagel/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | 5 | from .bagel import BagelConfig, Bagel 6 | from .qwen2_navit import Qwen2Config, Qwen2Model, Qwen2ForCausalLM 7 | from .siglip_navit import SiglipVisionConfig, SiglipVisionModel 8 | 9 | 10 | __all__ = [ 11 | 'BagelConfig', 12 | 'Bagel', 13 | 'Qwen2Config', 14 | 'Qwen2Model', 15 | 'Qwen2ForCausalLM', 16 | 'SiglipVisionConfig', 17 | 'SiglipVisionModel', 18 | ] 19 | -------------------------------------------------------------------------------- /scripts/eval/run_eval_vlm.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | set -x 5 | 6 | # Set proxy and API key 7 | export OPENAI_API_KEY=$openai_api_key 8 | 9 | export GPUS=1 10 | 11 | DATASETS=("mme" "mmbench-dev-en" "mmvet" "mmmu-val" "mathvista-testmini" "mmvp") 12 | # DATASETS=("mmmu-val_cot") 13 | 14 | DATASETS_STR="${DATASETS[*]}" 15 | export DATASETS_STR 16 | 17 | bash scripts/eval/eval_vlm.sh \ 18 | $output_path \ 19 | --model-path $model_path -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | decord==0.6.0 2 | einops==0.8.1 3 | huggingface_hub==0.29.1 4 | matplotlib==3.7.0 5 | numpy==1.24.4 6 | opencv_python==4.7.0.72 7 | pyarrow==11.0.0 8 | PyYAML==6.0.2 9 | Requests==2.32.3 10 | safetensors==0.4.5 11 | scipy==1.10.1 12 | sentencepiece==0.1.99 13 | torch==2.5.1 14 | torchvision==0.20.1 15 | transformers==4.49.0 16 | #flash_attn==2.5.8 17 | accelerate>=0.34.0 18 | wandb 19 | gradio 20 | setuptools 21 | wheel 22 | ninja 23 | bitsandbytes 24 | xlsxwriter 25 | triton ; sys_platform != 'win32' 26 | triton-windows ; sys_platform == 'win32' -------------------------------------------------------------------------------- /scripts/eval/eval_vlm.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # Check if enough arguments are provided 5 | if [ $# -lt 2 ]; then 6 | echo "Error: PREFIX_DIR and MODEL_PATH are required as the first and second arguments respectively." 7 | exit 1 8 | fi 9 | 10 | LOG_PATH=$1 11 | if [ ! -d "$LOG_PATH" ]; then 12 | mkdir -p "$LOG_PATH" 13 | fi 14 | shift 1 15 | ARGS=("$@") 16 | export MASTER_PORT=10042 17 | 18 | FULL_MODEL_PATH="$PREFIX_DIR/$MODEL_PATH" 19 | 20 | IFS=' ' read -r -a DATASETS <<< "$DATASETS_STR" 21 | 22 | for DATASET in "${DATASETS[@]}"; do 23 | bash eval/vlm/evaluate.sh \ 24 | "$DATASET" \ 25 | --out-dir "$LOG_PATH/$DATASET" \ 26 | "${ARGS[@]}" 27 | done -------------------------------------------------------------------------------- /scripts/train.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # replace the variables with your own 5 | torchrun \ 6 | --nnodes=$num_nodes \ 7 | --node_rank=$node_rank \ 8 | --nproc_per_node=8 \ 9 | --master_addr=$master_addr \ 10 | --master_port=$master_port \ 11 | train/pretrain_unified_navit.py \ 12 | --dataset_config_file ./data/configs/example.yaml \ 13 | --layer_module Qwen2MoTDecoderLayer \ 14 | --vae_path $vae_path \ 15 | --vit_path $vit_path \ 16 | --llm_path $llm_path \ 17 | --use_flex True \ 18 | --resume_from $resume_from \ 19 | --results_dir $output_path \ 20 | --checkpoint_dir $ckpt_path \ 21 | --max_latent_size 64 \ 22 | --num_workers 1 # use small num_workers since the num_used_data (10) are not enough to split -------------------------------------------------------------------------------- /scripts/eval/run_rise.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | set -x 5 | 6 | export OPENAI_API_KEY=$openai_api_key 7 | 8 | GPUS=8 9 | 10 | 11 | # generate images 12 | torchrun \ 13 | --nnodes=1 \ 14 | --node_rank=0 \ 15 | --nproc_per_node=$GPUS \ 16 | --master_addr=127.0.0.1 \ 17 | --master_port=12345 \ 18 | ./eval/gen/gen_images_mp_rise.py \ 19 | --output_dir $output_path/bagel \ 20 | --metadata_file ./eval/gen/rise/data/datav2_total_w_subtask.json \ 21 | --max_latent_size 64 \ 22 | --model-path $model_path \ 23 | --think 24 | 25 | 26 | # calculate score 27 | python ./eval/gen/rise/gpt_eval.py \ 28 | --data ./eval/gen/rise/data/datav2_total_w_subtask.json \ 29 | --input ./eval/gen/rise/data \ 30 | --output $output_path/bagel -------------------------------------------------------------------------------- /eval/gen/geneval/evaluation/download_models.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Dhruba Ghosh 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/djghosh13/geneval/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 11 | 12 | #!/bin/bash 13 | 14 | # Download Mask2Former object detection config and weights 15 | 16 | if [ ! -z "$1" ] 17 | then 18 | mkdir -p "$1" 19 | wget https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco_20220504_001756-743b7d99.pth -O "$1/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.pth" 20 | fi 21 | -------------------------------------------------------------------------------- /eval/gen/geneval/evaluation/object_names.txt: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorcycle 5 | airplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | couch 59 | potted plant 60 | bed 61 | dining table 62 | toilet 63 | tv 64 | laptop 65 | computer mouse 66 | tv remote 67 | computer keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /eval/gen/geneval/prompts/object_names.txt: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorcycle 5 | airplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | 
umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | couch 59 | potted plant 60 | bed 61 | dining table 62 | toilet 63 | tv 64 | laptop 65 | computer mouse 66 | tv remote 67 | computer keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /data/configs/example.yaml: -------------------------------------------------------------------------------- 1 | t2i_pretrain: 2 | dataset_names: 3 | - t2i 4 | image_transform_args: 5 | image_stride: 16 6 | max_image_size: 1024 7 | min_image_size: 512 8 | is_mandatory: true 9 | num_used_data: # The sum should be larger that NUM_GPUS x NUM_WORKERS 10 | - 10 11 | weight: 1 12 | 13 | unified_edit: 14 | dataset_names: 15 | - seedxedit_multi 16 | image_transform_args: 17 | image_stride: 16 18 | max_image_size: 1024 19 | min_image_size: 512 20 | vit_image_transform_args: 21 | image_stride: 14 22 | max_image_size: 518 23 | min_image_size: 224 24 | is_mandatory: false 25 | num_used_data: 26 | - 10 27 | weight: 1 28 | 29 | vlm_sft: 30 | dataset_names: 31 | - llava_ov 32 | image_transform_args: 33 | image_stride: 14 34 | max_image_size: 980 35 | min_image_size: 378 36 | max_pixels: 2_007_040 37 | frame_sampler_args: 38 | max_num_frames: 12 39 | min_num_frames: 8 40 | is_mandatory: true 41 | shuffle_lines: True 42 | shuffle_seed: 0 43 | num_used_data: 44 | - 1000 45 | weight: 1 46 | -------------------------------------------------------------------------------- /eval/gen/gedit/viescore/parse_prompt.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def create_python_file_with_texts(folder_path, output_file): 4 | with open(output_file, 'w', encoding='utf-8') as out_file: 5 | out_file.write("# This file is generated automatically through parse_prompt.py\n\n") 6 | for root, dirs, files in os.walk(folder_path): 7 | for file in files: 8 | if file.endswith(".txt"): 9 | file_path = os.path.join(root, file) 10 | var_name = "_" + file_path.replace(folder_path, "").replace(os.sep, "_").replace(".txt", "").strip("_") 11 | with open(file_path, 'r', encoding='utf-8') as f: 12 | content = f.read().replace('"""', '\"\"\"') 13 | out_file.write(f'{var_name} = """{content}"""\n\n') 14 | 15 | # Example usage 16 | current_file_path = os.path.abspath(__file__) 17 | current_folder_path = os.path.dirname(current_file_path) 18 | folder_path = os.path.join(current_folder_path, "prompts_raw") 19 | output_file = os.path.join(current_folder_path, "vie_prompts.py") 20 | create_python_file_with_texts(folder_path, output_file) 21 | -------------------------------------------------------------------------------- /scripts/eval/run_geneval.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | set -x 5 | 6 | GPUS=8 7 | 8 | 9 | # generate images 10 | torchrun \ 11 | --nnodes=1 \ 12 | --node_rank=0 \ 13 | --nproc_per_node=$GPUS \ 14 | --master_addr=127.0.0.1 \ 15 | --master_port=12345 \ 16 | ./eval/gen/gen_images_mp.py \ 17 | --output_dir $output_path/images \ 18 | --metadata_file ./eval/gen/geneval/prompts/evaluation_metadata_long.jsonl \ 19 | --batch_size 1 \ 20 | --num_images 4 \ 21 | --resolution 1024 \ 22 | --max_latent_size 64 \ 23 | --model-path $model_path \ 24 | # --metadata_file ./eval/gen/geneval/prompts/evaluation_metadata.jsonl \ 25 | 26 | 27 | # calculate score 28 | torchrun \ 29 | --nnodes=1 \ 30 | --node_rank=0 \ 31 | --nproc_per_node=$GPUS \ 32 | --master_addr=127.0.0.1 \ 33 | --master_port=12345 \ 34 | ./eval/gen/geneval/evaluation/evaluate_images_mp.py \ 35 | $output_path/images \ 36 | --outfile $output_path/results.jsonl \ 37 | --model-path ./eval/gen/geneval/model 38 | 39 | 40 | # summarize score 41 | python ./eval/gen/geneval/evaluation/summary_scores.py $output_path/results.jsonl -------------------------------------------------------------------------------- /train/train_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | import os 6 | 7 | 8 | def create_logger(logging_dir, rank, filename="log"): 9 | """ 10 | Create a logger that writes to a log file and stdout. 11 | """ 12 | if rank == 0 and logging_dir is not None: # real logger 13 | logging.basicConfig( 14 | level=logging.INFO, 15 | format='[\033[34m%(asctime)s\033[0m] %(message)s', 16 | datefmt='%Y-%m-%d %H:%M:%S', 17 | handlers=[ 18 | logging.StreamHandler(), 19 | logging.FileHandler(f"{logging_dir}/{filename}.txt") 20 | ] 21 | ) 22 | logger = logging.getLogger(__name__) 23 | else: # dummy logger (does nothing) 24 | logger = logging.getLogger(__name__) 25 | logger.addHandler(logging.NullHandler()) 26 | return logger 27 | 28 | 29 | def get_latest_ckpt(checkpoint_dir): 30 | step_dirs = [d for d in os.listdir(checkpoint_dir) if os.path.isdir(os.path.join(checkpoint_dir, d))] 31 | if len(step_dirs) == 0: 32 | return None 33 | step_dirs = sorted(step_dirs, key=lambda x: int(x)) 34 | latest_step_dir = os.path.join(checkpoint_dir, step_dirs[-1]) 35 | return latest_step_dir 36 | -------------------------------------------------------------------------------- /scripts/eval/run_kris.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | set -x 5 | 6 | export OPENAI_API_KEY=$openai_api_key 7 | 8 | GPUS=8 9 | 10 | 11 | # generate images 12 | torchrun \ 13 | --nnodes=1 \ 14 | --node_rank=0 \ 15 | --nproc_per_node=$GPUS \ 16 | --master_addr=127.0.0.1 \ 17 | --master_port=12345 \ 18 | ./eval/gen/gen_images_mp_kris.py \ 19 | --output_dir $output_path/bagel \ 20 | --metadata_file ./eval/gen/kris/final_data.json \ 21 | --max_latent_size 64 \ 22 | --model-path $model_path \ 23 | --think 24 | 25 | 26 | # calculate score 27 | python ./eval/gen/kris/metrics_common.py \ 28 | --results_dir $output_path \ 29 | --max_workers 8 30 | 31 | python ./eval/gen/kris/metrics_knowledge.py \ 32 | --results_dir $output_path \ 33 | --max_workers 8 34 | 35 | python ./eval/gen/kris/metrics_multi_element.py \ 36 | --results_dir $output_path \ 37 | --max_workers 8 38 | 39 | python ./eval/gen/kris/metrics_temporal_prediction.py \ 40 | --results_dir $output_path \ 41 | --max_workers 8 42 | 43 | python ./eval/gen/kris/metrics_view_change.py \ 44 | --results_dir $output_path \ 45 | --max_workers 8 46 | 47 | 48 | # summarize score 49 | python ./eval/gen/kris/summarize.py \ 50 | --results_dir $output_path/bagel \ -------------------------------------------------------------------------------- /scripts/eval/run_wise.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | set -x 5 | 6 | export OPENAI_API_KEY=$openai_api_key 7 | 8 | GPUS=8 9 | 10 | 11 | # generate images 12 | torchrun \ 13 | --nnodes=1 \ 14 | --node_rank=0 \ 15 | --nproc_per_node=$GPUS \ 16 | --master_addr=127.0.0.1 \ 17 | --master_port=12345 \ 18 | ./eval/gen/gen_images_mp_wise.py \ 19 | --output_dir $output_path/images \ 20 | --metadata-file ./eval/gen/wise/final_data.json \ 21 | --resolution 1024 \ 22 | --max-latent_size 64 \ 23 | --model-path $model_path \ 24 | --think 25 | 26 | 27 | # calculate score 28 | python3 eval/gen/wise/gpt_eval_mp.py \ 29 | --json_path eval/gen/wise/data/cultural_common_sense.json \ 30 | --image_dir $output_path/images \ 31 | --output_dir $output_path 32 | 33 | python3 eval/gen/wise/gpt_eval_mp.py \ 34 | --json_path eval/gen/wise/data/spatio-temporal_reasoning.json \ 35 | --image_dir $output_path/images \ 36 | --output_dir $output_path 37 | 38 | python3 eval/gen/wise/gpt_eval_mp.py \ 39 | --json_path eval/gen/wise/data/natural_science.json \ 40 | --image_dir $output_path/images \ 41 | --output_dir $output_path 42 | 43 | python3 eval/gen/wise/cal_score.py \ 44 | --output_dir $output_path -------------------------------------------------------------------------------- /data/dataset_info.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from .interleave_datasets import UnifiedEditIterableDataset 5 | from .t2i_dataset import T2IIterableDataset 6 | from .vlm_dataset import SftJSONLIterableDataset 7 | 8 | 9 | DATASET_REGISTRY = { 10 | 't2i_pretrain': T2IIterableDataset, 11 | 'vlm_sft': SftJSONLIterableDataset, 12 | 'unified_edit': UnifiedEditIterableDataset, 13 | } 14 | 15 | 16 | DATASET_INFO = { 17 | 't2i_pretrain': { 18 | 't2i': { 19 | 'data_dir': 'your_data_path/bagel_example/t2i', # path of the parquet files 20 | 'num_files': 10, # number of data units to be sharded across all ranks and workers 21 | 'num_total_samples': 1000, # number of total samples in the dataset 22 | }, 23 | }, 24 | 'unified_edit':{ 25 | 'seedxedit_multi': { 26 | 'data_dir': 'your_data_path/bagel_example/editing/seedxedit_multi', 27 | 'num_files': 10, 28 | 'num_total_samples': 1000, 29 | "parquet_info_path": 'your_data_path/bagel_example/editing/parquet_info/seedxedit_multi_nas.json', # information of the parquet files 30 | }, 31 | }, 32 | 'vlm_sft': { 33 | 'llava_ov': { 34 | 'data_dir': 'your_data_path/bagel_example/vlm/images', 35 | 'jsonl_path': 'your_data_path/bagel_example/vlm/llava_ov_si.jsonl', 36 | 'num_total_samples': 1000 37 | }, 38 | }, 39 | } -------------------------------------------------------------------------------- /scripts/eval/run_imgedit.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | set -x 5 | 6 | export OPENAI_API_KEY=$openai_api_key 7 | 8 | GPUS=8 9 | 10 | 11 | # generate images 12 | torchrun \ 13 | --nnodes=1 \ 14 | --node_rank=0 \ 15 | --nproc_per_node=$GPUS \ 16 | --master_addr=127.0.0.1 \ 17 | --master_port=12345 \ 18 | ./eval/gen/gen_images_mp_imgedit.py \ 19 | --output_dir $output_path/bagel \ 20 | --metadata_file ./eval/gen/imgedit/Benchmark/singleturn/singleturn.json \ 21 | --max_latent_size 64 \ 22 | --model-path $model_path 23 | 24 | 25 | # calculate score 26 | python ./eval/gen/imgedit/basic_bench.py \ 27 | --result_img_folder $output_path/bagel \ 28 | --edit_json ./eval/gen/imgedit/Benchmark/singleturn/singleturn.json \ 29 | --origin_img_root ./eval/gen/imgedit/Benchmark/singleturn \ 30 | --num_processes 4 \ 31 | --prompts_json ./eval/gen/imgedit/Benchmark/singleturn/judge_prompt.json 32 | 33 | 34 | # summarize score 35 | python ./eval/gen/imgedit/step1_get_avgscore.py \ 36 | --result_json $output_path/bagel/result.json \ 37 | --average_score_json $output_path/bagel/average_score.json 38 | 39 | python ./eval/gen/imgedit/step2_typescore.py \ 40 | --average_score_json $output_path/bagel/average_score.json \ 41 | --edit_json ./eval/gen/imgedit/Benchmark/singleturn/singleturn.json \ 42 | --typescore_json $output_path/bagel/typescore.json -------------------------------------------------------------------------------- /eval/gen/imgedit/step1_get_avgscore.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import json 5 | import argparse 6 | 7 | def extract_scores_and_average(entry: str) -> float: 8 |     lines = entry.splitlines() 9 |     scores = [] 10 |     for line in lines: 11 |         parts = line.strip().split(': ') 12 |         if len(parts) == 2 and parts[1].isdigit(): 13 |             scores.append(int(parts[1])) 14 |     if scores: 15 |         return round(sum(scores) / len(scores), 2) 16 |     return None 17 | 18 | def compute_averages(result_json_dict): 19 |     result = {} 20 |     for key, value in result_json_dict.items(): 21 |         avg = extract_scores_and_average(value) 22 |         if avg is not None: 23 |             result[key] = avg 24 |     return result 25 | 26 | def main(): 27 |     parser = argparse.ArgumentParser(description="Calculate the average score for each key and save it as a new JSON file") 28 |     parser.add_argument('--result_json', type=str, required=True, help='Path to the result JSON file') 29 |     parser.add_argument('--average_score_json', type=str, required=True, help='Path to the average score JSON file') 30 | 31 |     args = parser.parse_args() 32 | 33 |     with open(args.result_json, 'r', encoding='utf-8') as f: 34 |         data = json.load(f) 35 | 36 |     averaged_data = compute_averages(data) 37 | 38 |     with open(args.average_score_json, 'w', encoding='utf-8') as f: 39 |         json.dump(averaged_data, f, indent=2) 40 | 41 | 42 | if __name__ == '__main__': 43 |     main() -------------------------------------------------------------------------------- /scripts/eval/run_gedit.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # run this script at the root of the project folder 5 | pip install httpx==0.23.0 6 | pip install openai==1.87.0 7 | pip install datasets 8 | pip install megfile 9 | 10 | 11 | N_GPU=8 # Number of GPUs used for the evaluation 12 | MODEL_PATH="/Path/to/BAGEL-7B-MoT" 13 | OUTPUT_DIR="/Path/to/save/results" 14 | GEN_DIR="$OUTPUT_DIR/gen_image" 15 | LOG_DIR="$OUTPUT_DIR/logs" 16 | 17 | AZURE_ENDPOINT="https://azure_endpoint_url_you_use" # set the Azure OpenAI endpoint URL 18 | AZURE_OPENAI_KEY="" # set the Azure OpenAI key 19 | N_GPT_PARALLEL=10 20 | 21 | 22 | mkdir -p "$OUTPUT_DIR" 23 | mkdir -p "$GEN_DIR" 24 | mkdir -p "$LOG_DIR" 25 | 26 | 27 | # # ---------------------------- 28 | # # Download GEdit Dataset 29 | # # ---------------------------- 30 | python -c "from datasets import load_dataset; dataset = load_dataset('stepfun-ai/GEdit-Bench')" 31 | echo "Dataset Downloaded" 32 | 33 | 34 | # # --------------------- 35 | # # Generate Images 36 | # # --------------------- 37 | for ((i=0; i<$N_GPU; i++)); do 38 |     nohup python3 eval/gen/gedit/gen_images_gedit.py --model_path "$MODEL_PATH" --output_dir "$GEN_DIR" --shard_id $i --total_shards "$N_GPU" --device $i 2>&1 | tee "$LOG_DIR"/request_$(($N_GPU + i)).log & 39 | done 40 | 41 | wait 42 | echo "Image Generation Done" 43 | 44 | 45 | # # --------------------- 46 | # # GPT Evaluation 47 | # # --------------------- 48 | cd eval/gen/gedit 49 | python test_gedit_score.py --save_path "$OUTPUT_DIR" --azure_endpoint "$AZURE_ENDPOINT" --gpt_keys "$AZURE_OPENAI_KEY" --max_workers "$N_GPT_PARALLEL" 50 | echo "Evaluation Done" 51 | 52 | 53 | # # -------------------- 54 | # # Print Results 55 | # # -------------------- 56 | python calculate_statistics.py --save_path "$OUTPUT_DIR" --language en 57 | 58 | -------------------------------------------------------------------------------- /modeling/qwen2/__init__.py:
-------------------------------------------------------------------------------- 1 | # Copyright 2024 The Qwen Team and The HuggingFace Inc. team. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from typing import TYPE_CHECKING 5 | 6 | from transformers.utils import ( 7 | OptionalDependencyNotAvailable, 8 | _LazyModule, 9 | is_tokenizers_available, 10 | is_torch_available, 11 | ) 12 | 13 | 14 | _import_structure = { 15 | "configuration_qwen2": ["Qwen2Config"], 16 | "tokenization_qwen2": ["Qwen2Tokenizer"], 17 | } 18 | 19 | try: 20 | if not is_tokenizers_available(): 21 | raise OptionalDependencyNotAvailable() 22 | except OptionalDependencyNotAvailable: 23 | pass 24 | else: 25 | _import_structure["tokenization_qwen2_fast"] = ["Qwen2TokenizerFast"] 26 | 27 | try: 28 | if not is_torch_available(): 29 | raise OptionalDependencyNotAvailable() 30 | except OptionalDependencyNotAvailable: 31 | pass 32 | else: 33 | _import_structure["modeling_qwen2"] = [ 34 | "Qwen2ForCausalLM", 35 | "Qwen2Model", 36 | "Qwen2PreTrainedModel", 37 | ] 38 | 39 | 40 | if TYPE_CHECKING: 41 | from .configuration_qwen2 import Qwen2Config 42 | from .tokenization_qwen2 import Qwen2Tokenizer 43 | 44 | try: 45 | if not is_tokenizers_available(): 46 | raise OptionalDependencyNotAvailable() 47 | except OptionalDependencyNotAvailable: 48 | pass 49 | else: 50 | from .tokenization_qwen2_fast import Qwen2TokenizerFast 51 | 52 | try: 53 | if not is_torch_available(): 54 | raise OptionalDependencyNotAvailable() 55 | except OptionalDependencyNotAvailable: 56 | pass 57 | else: 58 | from .modeling_qwen2 import ( 59 | Qwen2ForCausalLM, 60 | Qwen2Model, 61 | Qwen2PreTrainedModel, 62 | ) 63 | 64 | 65 | else: 66 | import sys 67 | 68 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 69 | -------------------------------------------------------------------------------- /eval/gen/imgedit/step2_typescore.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import json 5 | import argparse 6 | from collections import defaultdict 7 | 8 | def compute_edit_type_averages(score_dict, meta_dict): 9 | edit_type_scores = defaultdict(list) 10 | all_scores = [] 11 | 12 | for key, score in score_dict.items(): 13 | meta = meta_dict.get(key, {}) 14 | edit_type = meta.get("edit_type") 15 | if edit_type is not None: 16 | edit_type_scores[edit_type].append(score) 17 | all_scores.append(score) 18 | 19 | averaged_by_type = { 20 | etype: round(sum(scores) / len(scores), 2) 21 | for etype, scores in edit_type_scores.items() if scores 22 | } 23 | if all_scores: 24 | averaged_by_type['overall'] = round(sum(all_scores) / len(all_scores), 2) 25 | 26 | return averaged_by_type 27 | 28 | def main(): 29 | parser = argparse.ArgumentParser(description="Calculate edit type averages") 30 | parser.add_argument('--average_score_json', type=str, required=True, help='path to the JSON file containing the scores') 31 | parser.add_argument('--edit_json', type=str, required=True, help='Path to the JSON file containing the basic edit information') 32 | parser.add_argument('--typescore_json', type=str, required=True, help='Path to the JSON file containing the edit type scores') 33 | 34 | args = parser.parse_args() 35 | 36 | with open(args.average_score_json, 'r', encoding='utf-8') as f: 37 | score_data = json.load(f) 38 | 39 | with open(args.edit_json, 'r', encoding='utf-8') as f: 40 | meta_data = json.load(f) 41 | 42 | averaged_result = compute_edit_type_averages(score_data, meta_data) 43 | for k, v in averaged_result.items(): 44 | print(f"{k}: {v}") 45 | 46 | with open(args.typescore_json, 'w', encoding='utf-8') as f: 47 | json.dump(averaged_result, f, indent=2) 48 | 49 | 50 | if __name__ == '__main__': 51 | main() -------------------------------------------------------------------------------- /eval/gen/geneval/evaluation/summary_scores.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Dhruba Ghosh 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/djghosh13/geneval/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 
11 | 12 | import argparse 13 | import os 14 | 15 | import numpy as np 16 | import pandas as pd 17 | 18 | 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("filename", type=str) 21 | args = parser.parse_args() 22 | 23 | # Load classnames 24 | 25 | with open(os.path.join(os.path.dirname(__file__), "object_names.txt")) as cls_file: 26 | classnames = [line.strip() for line in cls_file] 27 | cls_to_idx = {"_".join(cls.split()):idx for idx, cls in enumerate(classnames)} 28 | 29 | # Load results 30 | 31 | df = pd.read_json(args.filename, orient="records", lines=True) 32 | 33 | # Measure overall success 34 | 35 | print("Summary") 36 | print("=======") 37 | print(f"Total images: {len(df)}") 38 | print(f"Total prompts: {len(df.groupby('metadata'))}") 39 | print(f"% correct images: {df['correct'].mean():.2%}") 40 | print(f"% correct prompts: {df.groupby('metadata')['correct'].any().mean():.2%}") 41 | print() 42 | 43 | # By group 44 | 45 | task_scores = [] 46 | 47 | print("Task breakdown") 48 | print("==============") 49 | for tag, task_df in df.groupby('tag', sort=False): 50 | task_scores.append(task_df['correct'].mean()) 51 | print(f"{tag:<16} = {task_df['correct'].mean():.2%} ({task_df['correct'].sum()} / {len(task_df)})") 52 | print() 53 | 54 | print(f"Overall score (avg. over tasks): {np.mean(task_scores):.5f}") 55 | 56 | 57 | print("\n\n==============") 58 | output_info = "SO TO CT CL POS ATTR ALL\n" 59 | for score in task_scores: 60 | output_info += f"{score:.2f} " 61 | output_info += f"{np.mean(task_scores):.2f}" + "\n" 62 | print(output_info) 63 | with open(os.path.join(os.path.dirname(args.filename), "geneval_results.txt"), "w") as f: 64 | f.write(output_info) -------------------------------------------------------------------------------- /data/distributed_iterable_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import random 5 | import torch 6 | 7 | 8 | class DistributedIterableDataset(torch.utils.data.IterableDataset): 9 | def __init__(self, dataset_name, local_rank=0, world_size=1, num_workers=8): 10 | self.dataset_name = dataset_name 11 | self.local_rank = local_rank 12 | self.world_size = world_size 13 | self.num_workers = num_workers 14 | self.rng = random.Random() 15 | self.data_paths = None 16 | 17 | def get_data_paths(self, *args, **kwargs): 18 | raise NotImplementedError 19 | 20 | def set_epoch(self, seed=42): 21 | if self.data_paths is None: 22 | return 23 | 24 | if isinstance(self.data_paths[0], tuple): 25 | data_paths = sorted(self.data_paths, key=lambda x: (x[0], x[1])) 26 | elif isinstance(self.data_paths[0], str): 27 | data_paths = sorted(self.data_paths) 28 | else: 29 | raise ValueError(f"Unknown data_paths type: {type(self.data_paths[0])}") 30 | 31 | self.rng.seed(seed) 32 | self.rng.shuffle(data_paths) 33 | 34 | num_files_per_rank = len(data_paths) // self.world_size 35 | local_start = self.local_rank * num_files_per_rank 36 | local_end = (self.local_rank + 1) * num_files_per_rank 37 | self.num_files_per_rank = num_files_per_rank 38 | self.data_paths_per_rank = data_paths[local_start:local_end] 39 | 40 | def get_data_paths_per_worker(self): 41 | if self.data_paths is None: 42 | return None 43 | 44 | info = torch.utils.data.get_worker_info() 45 | if info is None: 46 | # Single worker: Use all files assigned to the rank 47 | return self.data_paths_per_rank, 0 48 | 49 | worker_id = info.id 50 | num_files_per_worker = self.num_files_per_rank // info.num_workers 51 | start = num_files_per_worker * worker_id 52 | end = num_files_per_worker * (worker_id + 1) 53 | data_paths_per_worker = self.data_paths_per_rank[start:end] 54 | 55 | return data_paths_per_worker[::-1], worker_id 56 | 57 | def __iter__(self): 58 | raise NotImplementedError 59 | -------------------------------------------------------------------------------- /eval/vlm/eval/mathvista/prompts/ext_ans.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 11 | 12 | # pids = 852, 104, 824, 506, 540 13 | 14 | demo_prompt = """ 15 | Please read the following example. Then extract the answer from the model response and type it at the end of the prompt. 16 | 17 | Hint: Please answer the question requiring an integer answer and provide the final value, e.g., 1, 2, 3, at the end. 18 | Question: Which number is missing? 19 | 20 | Model response: The number missing in the sequence is 14. 21 | 22 | Extracted answer: 14 23 | 24 | Hint: Please answer the question requiring a floating-point number with one decimal place and provide the final value, e.g., 1.2, 1.3, 1.4, at the end. 25 | Question: What is the fraction of females facing the camera? 26 | 27 | Model response: The fraction of females facing the camera is 0.6, which means that six out of ten females in the group are facing the camera. 
28 | 29 | Extracted answer: 0.6 30 | 31 | Hint: Please answer the question requiring a floating-point number with two decimal places and provide the final value, e.g., 1.23, 1.34, 1.45, at the end. 32 | Question: How much money does Luca need to buy a sour apple candy and a butterscotch candy? (Unit: $) 33 | 34 | Model response: Luca needs $1.45 to buy a sour apple candy and a butterscotch candy. 35 | 36 | Extracted answer: 1.45 37 | 38 | Hint: Please answer the question requiring a Python list as an answer and provide the final list, e.g., [1, 2, 3], [1.2, 1.3, 1.4], at the end. 39 | Question: Between which two years does the line graph saw its maximum peak? 40 | 41 | Model response: The line graph saw its maximum peak between 2007 and 2008. 42 | 43 | Extracted answer: [2007, 2008] 44 | 45 | Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end. 46 | Question: What fraction of the shape is blue?\nChoices:\n(A) 3/11\n(B) 8/11\n(C) 6/11\n(D) 3/5 47 | 48 | Model response: The correct answer is (B) 8/11. 49 | 50 | Extracted answer: B 51 | """ 52 | -------------------------------------------------------------------------------- /eval/gen/gedit/viescore/mllm_tools/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import base64 3 | from io import BytesIO 4 | from PIL import Image 5 | import requests 6 | 7 | def pil_image_to_base64(pil_image, format="PNG"): 8 | buffered = BytesIO() 9 | pil_image.save(buffered, format=format) # Save image to the buffer in the specified format 10 | img_str = base64.b64encode(buffered.getvalue()).decode('utf-8') # Encode the buffer's content to base64 11 | return img_str 12 | 13 | def load_image(image_file): 14 | if image_file.startswith("http"): 15 | response = requests.get(image_file) 16 | image = Image.open(BytesIO(response.content)).convert("RGB") 17 | else: 18 | import os 19 | image = Image.open(image_file).convert("RGB") 20 | return image 21 | 22 | 23 | def load_images(image_files): 24 | out = [] 25 | for image_file in image_files: 26 | image = load_image(image_file) 27 | out.append(image) 28 | return out 29 | 30 | def merge_images(image_links: List = []): 31 | """Merge multiple images into one image 32 | 33 | Args: 34 | image_links (List, optional): List of image links. Defaults to []. 
35 | 36 |     Returns: 37 |         PIL.Image.Image: the merged image, or None if image_links is empty 38 |     """ 39 |     if len(image_links) == 0: 40 |         return None 41 |     images = load_images(image_links) 42 |     if len(images) == 1: 43 |         return images[0] 44 |     widths, heights = zip(*(i.size for i in images)) 45 |     average_height = sum(heights) // len(heights) 46 |     for i, im in enumerate(images): 47 |         # scale in proportion 48 |         images[i] = im.resize((int(im.size[0] * average_height / im.size[1]), average_height)) 49 |     widths, heights = zip(*(i.size for i in images)) 50 |     total_width = sum(widths) 51 |     max_height = max(heights) 52 |     new_im = Image.new("RGB", (total_width + 10 * (len(images) - 1), max_height)) 53 |     x_offset = 0 54 |     for i, im in enumerate(images): 55 |         if i > 0: 56 |             # paste a separator column starting at x_offset: 1 pixel black, 8 pixels white, then 1 pixel black 57 |             new_im.paste(Image.new("RGB", (1, max_height), (0, 0, 0)), (x_offset, 0)) 58 |             x_offset += 1 59 |             new_im.paste(Image.new("RGB", (8, max_height), (255, 255, 255)), (x_offset, 0)) 60 |             x_offset += 8 61 |             new_im.paste(Image.new("RGB", (1, max_height), (0, 0, 0)), (x_offset, 0)) 62 |             x_offset += 1 63 |         new_im.paste(im, (x_offset, 0)) 64 |         x_offset += im.size[0] 65 |     return new_im -------------------------------------------------------------------------------- /modeling/siglip/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Inc. team. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from typing import TYPE_CHECKING 5 | 6 | from transformers.utils import ( 7 |     OptionalDependencyNotAvailable, 8 |     _LazyModule, 9 |     is_sentencepiece_available, 10 |     is_torch_available, 11 |     is_vision_available, 12 | ) 13 | 14 | 15 | _import_structure = { 16 |     "configuration_siglip": [ 17 |         "SiglipConfig", 18 |         "SiglipTextConfig", 19 |         "SiglipVisionConfig", 20 |     ], 21 |     "processing_siglip": ["SiglipProcessor"], 22 | } 23 | 24 | try: 25 |     if not is_sentencepiece_available(): 26 |         raise OptionalDependencyNotAvailable() 27 | except OptionalDependencyNotAvailable: 28 |     pass 29 | else: 30 |     _import_structure["tokenization_siglip"] = ["SiglipTokenizer"] 31 | 32 | 33 | try: 34 |     if not is_vision_available(): 35 |         raise OptionalDependencyNotAvailable() 36 | except OptionalDependencyNotAvailable: 37 |     pass 38 | else: 39 |     _import_structure["image_processing_siglip"] = ["SiglipImageProcessor"] 40 | 41 | try: 42 |     if not is_torch_available(): 43 |         raise OptionalDependencyNotAvailable() 44 | except OptionalDependencyNotAvailable: 45 |     pass 46 | else: 47 |     _import_structure["modeling_siglip"] = [ 48 |         "SiglipModel", 49 |         "SiglipPreTrainedModel", 50 |         "SiglipTextModel", 51 |         "SiglipVisionModel", 52 |         "SiglipForImageClassification", 53 |     ] 54 | 55 | 56 | if TYPE_CHECKING: 57 |     from .configuration_siglip import ( 58 |         SiglipConfig, 59 |         SiglipTextConfig, 60 |         SiglipVisionConfig, 61 |     ) 62 |     from .processing_siglip import SiglipProcessor 63 | 64 |     try: 65 |         if not is_sentencepiece_available(): 66 |             raise OptionalDependencyNotAvailable() 67 |     except OptionalDependencyNotAvailable: 68 |         pass 69 |     else: 70 |         from .tokenization_siglip import SiglipTokenizer 71 | 72 |     try: 73 |         if not is_vision_available(): 74 |             raise OptionalDependencyNotAvailable() 75 |     except OptionalDependencyNotAvailable: 76 |         pass 77 |     else: 78 |         from .image_processing_siglip import SiglipImageProcessor 79 | 80 |     try: 81 |         if not is_torch_available(): 82 |             raise OptionalDependencyNotAvailable() 83 |     except OptionalDependencyNotAvailable: 84 |         pass 85 |
    else: 86 |         from .modeling_siglip import ( 87 |             SiglipForImageClassification, 88 |             SiglipModel, 89 |             SiglipPreTrainedModel, 90 |             SiglipTextModel, 91 |             SiglipVisionModel, 92 |         ) 93 | 94 | 95 | else: 96 |     import sys 97 | 98 |     sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 99 | -------------------------------------------------------------------------------- /data/interleave_datasets/edit_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import io 5 | import random 6 | from PIL import Image, ImageFile, PngImagePlugin 7 | 8 | from .interleave_t2i_dataset import InterleavedBaseIterableDataset, ParquetStandardIterableDataset 9 | from ..data_utils import pil_img2rgb 10 | 11 | 12 | Image.MAX_IMAGE_PIXELS = 200000000 13 | ImageFile.LOAD_TRUNCATED_IMAGES = True 14 | MaximumDecompressedSize = 1024 15 | MegaByte = 2 ** 20 16 | PngImagePlugin.MAX_TEXT_CHUNK = MaximumDecompressedSize * MegaByte 17 | 18 | 19 | class UnifiedEditIterableDataset(InterleavedBaseIterableDataset, ParquetStandardIterableDataset): 20 | 21 |     def parse_row(self, row): 22 |         image_num = len(row["image_list"]) 23 |         # randomly choose start and end, return [0, 1] when only two images 24 |         start_idx = random.choice(range(image_num - 1)) 25 |         max_end = min(start_idx + 3, image_num) 26 |         end_idx = random.choice(range(start_idx + 1, max_end)) 27 | 28 |         data = self._init_data() 29 |         data = self._add_image( 30 |             data, 31 |             pil_img2rgb(Image.open(io.BytesIO(row["image_list"][start_idx]))), 32 |             need_loss=False, 33 |             need_vae=True, 34 |             need_vit=True, 35 |         ) 36 | 37 |         if end_idx - start_idx > 1 and random.random() < 0.5: # concatenate multiple instructions 38 |             if end_idx == image_num - 1: 39 |                 end_idx -= 1 40 | 41 |             instruction = "" 42 |             for idx in range(start_idx + 1, end_idx + 1): 43 |                 instruction += random.choice(row["instruction_list"][idx-1]) + ". " 44 |             data = self._add_text(data, instruction.rstrip(), need_loss=False) 45 |             data = self._add_image( 46 |                 data, 47 |                 pil_img2rgb(Image.open(io.BytesIO(row["image_list"][end_idx]))), 48 |                 need_loss=True, 49 |                 need_vae=False, 50 |                 need_vit=False, 51 |             ) 52 |         else: 53 |             for idx in range(start_idx + 1, end_idx + 1): 54 |                 instruction = random.choice(row["instruction_list"][idx-1]) 55 |                 data = self._add_text(data, instruction, need_loss=False) 56 |                 if idx != end_idx: 57 |                     data = self._add_image( 58 |                         data, 59 |                         pil_img2rgb(Image.open(io.BytesIO(row["image_list"][idx]))), 60 |                         need_loss=True, 61 |                         need_vae=True, 62 |                         need_vit=True, 63 |                     ) 64 |                 else: 65 |                     data = self._add_image( 66 |                         data, 67 |                         pil_img2rgb(Image.open(io.BytesIO(row["image_list"][idx]))), 68 |                         need_loss=True, 69 |                         need_vae=False, 70 |                         need_vit=False, 71 |                     ) 72 |         return data 73 | -------------------------------------------------------------------------------- /data/parquet_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates.
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | 5 | import os 6 | import subprocess 7 | import logging 8 | 9 | import pyarrow.fs as pf 10 | import torch.distributed as dist 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def get_parquet_data_paths(data_dir_list, num_sampled_data_paths, rank=0, world_size=1): 16 |     num_data_dirs = len(data_dir_list) 17 |     if world_size > 1: 18 |         chunk_size = (num_data_dirs + world_size - 1) // world_size 19 |         start_idx = rank * chunk_size 20 |         end_idx = min(start_idx + chunk_size, num_data_dirs) 21 |         local_data_dir_list = data_dir_list[start_idx:end_idx] 22 |         local_num_sampled_data_paths = num_sampled_data_paths[start_idx:end_idx] 23 |     else: 24 |         local_data_dir_list = data_dir_list 25 |         local_num_sampled_data_paths = num_sampled_data_paths 26 | 27 |     local_data_paths = [] 28 |     for data_dir, num_data_path in zip(local_data_dir_list, local_num_sampled_data_paths): 29 |         if data_dir.startswith("hdfs://"): 30 |             files = hdfs_ls_cmd(data_dir) 31 |             data_paths_per_dir = [ 32 |                 file for file in files if file.endswith(".parquet") 33 |             ] 34 |         else: 35 |             files = os.listdir(data_dir) 36 |             data_paths_per_dir = [ 37 |                 os.path.join(data_dir, name) 38 |                 for name in files 39 |                 if name.endswith(".parquet") 40 |             ] 41 |         repeat = num_data_path // len(data_paths_per_dir) 42 |         data_paths_per_dir = data_paths_per_dir * (repeat + 1) 43 |         local_data_paths.extend(data_paths_per_dir[:num_data_path]) 44 | 45 |     if world_size > 1: 46 |         gather_list = [None] * world_size 47 |         dist.all_gather_object(gather_list, local_data_paths) 48 | 49 |         combined_chunks = [] 50 |         for chunk_list in gather_list: 51 |             if chunk_list is not None: 52 |                 combined_chunks.extend(chunk_list) 53 |     else: 54 |         combined_chunks = local_data_paths 55 | 56 |     return combined_chunks 57 | 58 | 59 | # NOTE: customize this function for your cluster 60 | def get_hdfs_host(): 61 |     return "hdfs://xxx" 62 | 63 | 64 | # NOTE: customize this function for your cluster 65 | def get_hdfs_block_size(): 66 |     return 134217728 67 | 68 | 69 | # NOTE: customize this function for your cluster 70 | def get_hdfs_extra_conf(): 71 |     return None 72 | 73 | 74 | def init_arrow_pf_fs(parquet_file_path): 75 |     if parquet_file_path.startswith("hdfs://"): 76 |         fs = pf.HadoopFileSystem( 77 |             host=get_hdfs_host(), 78 |             port=0, 79 |             buffer_size=get_hdfs_block_size(), 80 |             extra_conf=get_hdfs_extra_conf(), 81 |         ) 82 |     else: 83 |         fs = pf.LocalFileSystem() 84 |     return fs 85 | 86 | 87 | def hdfs_ls_cmd(dir): 88 |     result = subprocess.run(["hdfs", "dfs", "-ls", dir], capture_output=True, text=True).stdout 89 |     return ['hdfs://' + i.split('hdfs://')[-1].strip() for i in result.split('\n') if 'hdfs://' in i] 90 | -------------------------------------------------------------------------------- /eval/vlm/eval/mme/eval.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license.
11 | 12 | import argparse 13 | import os 14 | import re 15 | 16 | from eval.vlm.utils import load_model_and_tokenizer, build_transform, process_conversation 17 | from PIL import Image 18 | from tqdm import tqdm 19 | 20 | 21 | def post_processing(response): 22 | response = response.replace('\n', '').replace('不是', 'No').replace('是', 'Yes').replace('否', 'No') 23 | response = response.lower().replace('true', 'yes').replace('false', 'no') 24 | pattern = re.compile(r'[\u4e00-\u9fa5]') 25 | response = re.sub(pattern, '', response) 26 | return response 27 | 28 | 29 | if __name__ == '__main__': 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument('--root', type=str, default='eval/vlm/eval/mme/Your_Results') 32 | parser.add_argument('--out-dir', type=str, default='results') 33 | parser.add_argument('--model-path', type=str, default='hf/BAGEL-7B-MoT/') 34 | args = parser.parse_args() 35 | 36 | model, tokenizer, new_token_ids = load_model_and_tokenizer(args) 37 | image_transform = build_transform() 38 | 39 | total_params = sum(p.numel() for p in model.parameters()) / 1e9 40 | print(f'[test] total_params: {total_params}B') 41 | 42 | os.makedirs(args.out_dir, exist_ok=True) 43 | prompt = 'Answer the question using a single word or phrase.' 44 | 45 | for filename in os.listdir(args.root): 46 | fin = open(os.path.join(args.root, filename), 'r', encoding='utf-8') 47 | fout = open(os.path.join(args.out_dir, filename), 'w', encoding='utf-8') 48 | lines = fin.readlines() 49 | filename = filename.replace('.txt', '') 50 | for line in tqdm(lines): 51 | img, question, gt = line.strip().split('\t') 52 | question = question + ' ' + prompt 53 | img_path = os.path.join('eval/vlm/data/mme/MME_Benchmark_release_version', filename, img) 54 | if not os.path.exists(img_path): 55 | img_path = os.path.join('eval/vlm/data/mme/MME_Benchmark_release_version', filename, "images", img) 56 | if not os.path.exists(img_path): 57 | continue 58 | images = [Image.open(img_path).convert('RGB')] 59 | images, conversation = process_conversation(images, question) 60 | 61 | response = model.chat( 62 | tokenizer, 63 | new_token_ids, 64 | image_transform, 65 | images=images, 66 | prompt=conversation, 67 | max_length=20, 68 | ) 69 | response = post_processing(response) 70 | print(img, question, gt, response, sep='\t', file=fout) 71 | fin.close() 72 | fout.close() 73 | 74 | os.system(f"python -m eval.vlm.eval.mme.calculation --out-dir {args.out_dir}") 75 | -------------------------------------------------------------------------------- /eval/vlm/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 
11 | 12 | import os 13 | import yaml 14 | 15 | from data.data_utils import add_special_tokens, pil_img2rgb 16 | from modeling.bagel import ( 17 | BagelConfig, 18 | Bagel, 19 | Qwen2Config, 20 | Qwen2ForCausalLM, 21 | SiglipVisionConfig, 22 | SiglipVisionModel, 23 | ) 24 | from modeling.qwen2 import Qwen2Tokenizer 25 | from safetensors.torch import load_file 26 | 27 | from data.transforms import ImageTransform 28 | 29 | 30 | def load_model_and_tokenizer(args): 31 | llm_config = Qwen2Config.from_json_file(os.path.join(args.model_path, "llm_config.json")) 32 | llm_config.qk_norm = True 33 | llm_config.tie_word_embeddings = False 34 | llm_config.layer_module ="Qwen2MoTDecoderLayer" 35 | 36 | vit_config = SiglipVisionConfig.from_json_file(os.path.join(args.model_path, "vit_config.json")) 37 | vit_config.rope = False 38 | vit_config.num_hidden_layers = vit_config.num_hidden_layers - 1 39 | 40 | config = BagelConfig( 41 | visual_gen=False, 42 | visual_und=True, 43 | llm_config=llm_config, 44 | vit_config=vit_config, 45 | vit_max_num_patch_per_side=70, 46 | connector_act='gelu_pytorch_tanh', 47 | ) 48 | language_model = Qwen2ForCausalLM(llm_config) 49 | vit_model = SiglipVisionModel(vit_config) 50 | model = Bagel(language_model, vit_model, config) 51 | model.vit_model.vision_model.embeddings.convert_conv2d_to_linear(vit_config) 52 | 53 | tokenizer = Qwen2Tokenizer.from_pretrained(args.model_path) 54 | tokenizer, new_token_ids, _ = add_special_tokens(tokenizer) 55 | 56 | model_state_dict_path = os.path.join(args.model_path, "ema.safetensors") 57 | model_state_dict = load_file(model_state_dict_path, device="cpu") 58 | msg = model.load_state_dict(model_state_dict, strict=False) 59 | print(msg) 60 | del model_state_dict 61 | model = model.cuda().eval() 62 | 63 | return model, tokenizer, new_token_ids 64 | 65 | 66 | def build_transform(): 67 | with open("./data/configs/example.yaml", "r") as f: 68 | data_config = yaml.safe_load(f) 69 | 70 | max_image_size = data_config['vlm_sft']['image_transform_args']['max_image_size'] 71 | min_image_size = data_config['vlm_sft']['image_transform_args']['min_image_size'] 72 | image_stride = data_config['vlm_sft']['image_transform_args']['image_stride'] 73 | max_pixels = data_config['vlm_sft']['image_transform_args']['max_pixels'] 74 | 75 | image_transform = ImageTransform( 76 | max_image_size=max_image_size, 77 | min_image_size=min_image_size, 78 | image_stride=image_stride, 79 | max_pixels=max_pixels, 80 | ) 81 | 82 | return image_transform 83 | 84 | 85 | def process_conversation(images, conversation): 86 | images = [pil_img2rgb(image) for image in images] 87 | return images, conversation 88 | -------------------------------------------------------------------------------- /eval/vlm/eval/mme/Your_Results/OCR.txt: -------------------------------------------------------------------------------- 1 | 0001.jpg Is the word in the logo "angie's"? Please answer yes or no. Yes 2 | 0001.jpg Is the word in the logo "angle's"? Please answer yes or no. No 3 | 0002.jpg Is the word in the logo "c'est cheese"? Please answer yes or no. Yes 4 | 0002.jpg Is the word in the logo "crest cheese"? Please answer yes or no. No 5 | 0003.jpg Is the word in the logo "beavertails pastry"? Please answer yes or no. Yes 6 | 0003.jpg Is the word in the logo "beavertalls pastry"? Please answer yes or no. No 7 | 0004.jpg Is the word in the logo "old market sundries"? Please answer yes or no. Yes 8 | 0004.jpg Is the word in the logo "old market hundreds"? Please answer yes or no. 
No 9 | 0005.jpg Is the word in the logo "kress"? Please answer yes or no. Yes 10 | 0005.jpg Is the word in the logo "dress"? Please answer yes or no. No 11 | 0006.jpg Is the word in the logo "the beatles story liver pool"? Please answer yes or no. Yes 12 | 0006.jpg Is the word in the logo "the beats story liver pool"? Please answer yes or no. No 13 | 0007.jpg Is the phone number in the picture "0131 555 6363"? Please answer yes or no. Yes 14 | 0007.jpg Is the phone number in the picture "0137 556 6363"? Please answer yes or no. No 15 | 0008.jpg Is the word in the logo "phil's market"? Please answer yes or no. Yes 16 | 0008.jpg Is the word in the logo "phll's market"? Please answer yes or no. No 17 | 0009.jpg Is the word in the logo "fenders diner"? Please answer yes or no. Yes 18 | 0009.jpg Is the word in the logo "finders diner"? Please answer yes or no. No 19 | 0010.jpg Is the word in the logo "high time coffee shop"? Please answer yes or no. Yes 20 | 0010.jpg Is the word in the logo "high tite cofeee shop"? Please answer yes or no. No 21 | 0011.jpg Is the word in the logo "ihop restaurant"? Please answer yes or no. Yes 22 | 0011.jpg Is the word in the logo "lhop restaurant"? Please answer yes or no. No 23 | 0012.jpg Is the word in the logo "casa grecque restaurants"? Please answer yes or no. Yes 24 | 0012.jpg Is the word in the logo "case grecque restaurants"? Please answer yes or no. No 25 | 0013.jpg Is the word in the picture "seabreeze motel"? Please answer yes or no. Yes 26 | 0013.jpg Is the word in the picture "seebreeze model"? Please answer yes or no. No 27 | 0014.jpg Is the word in the logo "penarth pier built 1894"? Please answer yes or no. Yes 28 | 0014.jpg Is the word in the logo "penarth pies buid 1894"? Please answer yes or no. No 29 | 0015.jpg Is the text in the picture "hollywood"? Please answer yes or no. Yes 30 | 0015.jpg Is the text in the picture "holly word"? Please answer yes or no. No 31 | 0016.jpg Is the word in the logo "shop rite"? Please answer yes or no. Yes 32 | 0016.jpg Is the word in the logo "stop rite"? Please answer yes or no. No 33 | 0017.jpg Is the word in the logo "hardco industrial construction"? Please answer yes or no. Yes 34 | 0017.jpg Is the word in the logo "hardto industal construction"? Please answer yes or no. No 35 | 0018.jpg Is the word in the logo "oldsmobile service"? Please answer yes or no. Yes 36 | 0018.jpg Is the word in the logo "old mobile service"? Please answer yes or no. No 37 | 0019.jpg Is the word in the logo "exchange hotel"? Please answer yes or no. Yes 38 | 0019.jpg Is the word in the logo "excharge hotel"? Please answer yes or no. No 39 | 0020.jpg Is the word in the logo "cold drinks"? Please answer yes or no. Yes 40 | 0020.jpg Is the word in the logo "cold rinks"? Please answer yes or no. No 41 | -------------------------------------------------------------------------------- /eval/vlm/eval/mme/Your_Results/numerical_calculation.txt: -------------------------------------------------------------------------------- 1 | 0001.png Is the answer to the arithmetic question in the image 225? Please answer yes or no. Yes 2 | 0001.png Is the answer to the arithmetic question in the image 1515? Please answer yes or no. No 3 | 0002.png Is the answer to the arithmetic question in the image 340? Please answer yes or no. Yes 4 | 0002.png Is the answer to the arithmetic question in the image 17? Please answer yes or no. No 5 | 0003.png Is the answer to the arithmetic question in the image 65? Please answer yes or no. 
Yes 6 | 0003.png Is the answer to the arithmetic question in the image 56? Please answer yes or no. No 7 | 0004.png Is the answer to the arithmetic question in the image 33? Please answer yes or no. Yes 8 | 0004.png Is the answer to the arithmetic question in the image 32? Please answer yes or no. No 9 | 0005.png Is the area of the square in the picture equal to 40? Please answer yes or no. Yes 10 | 0005.png Is the area of the square in the picture equal to 8? Please answer yes or no. No 11 | 0006.png Is the area of the square in the picture equal to 9? Please answer yes or no. Yes 12 | 0006.png Is the area of the square in the picture equal to 3? Please answer yes or no. No 13 | 0007.png Is the answer to the arithmetic question in the image 49? Please answer yes or no. Yes 14 | 0007.png Is the answer to the arithmetic question in the image 39? Please answer yes or no. No 15 | 0008.png Should the value of "a" in the picture equal 7? Please answer yes or no. Yes 16 | 0008.png Should the value of "a" in the picture equal 14? Please answer yes or no. No 17 | 0009.png Should the value of "a" in the picture equal 2? Please answer yes or no. Yes 18 | 0009.png Should the value of "a" in the picture equal 3? Please answer yes or no. No 19 | 0010.png Is the answer to the arithmetic question in the image 13? Please answer yes or no. Yes 20 | 0010.png Is the answer to the arithmetic question in the image 12? Please answer yes or no. No 21 | 0011.png Is the area of the parallelogram in the picture equal to 24? Please answer yes or no. Yes 22 | 0011.png Is the area of the parallelogram in the picture equal to 6? Please answer yes or no. No 23 | 0012.png Should the value of "a" in the picture equal 9? Please answer yes or no. Yes 24 | 0012.png Should the value of "a" in the picture equal 1? Please answer yes or no. No 25 | 0013.png Is the area of the right triangle in the picture equal to 24? Please answer yes or no. Yes 26 | 0013.png Is the area of the right triangle in the picture equal to 8? Please answer yes or no. No 27 | 0014.png Is the answer to the arithmetic question in the image 200? Please answer yes or no. Yes 28 | 0014.png Is the answer to the arithmetic question in the image 400? Please answer yes or no. No 29 | 0015.png Is the answer to the arithmetic question in the image 11? Please answer yes or no. Yes 30 | 0015.png Is the answer to the arithmetic question in the image 111? Please answer yes or no. No 31 | 0016.png Is the answer to the arithmetic question in the image 9? Please answer yes or no. Yes 32 | 0016.png Is the answer to the arithmetic question in the image 16? Please answer yes or no. No 33 | 0017.png Is the answer to the arithmetic question in the image 14? Please answer yes or no. Yes 34 | 0017.png Is the answer to the arithmetic question in the image 83? Please answer yes or no. No 35 | 0018.png Should the value of "a" in the picture equal 3? Please answer yes or no. Yes 36 | 0018.png Should the value of "a" in the picture equal 2? Please answer yes or no. No 37 | 0019.png Is the answer to the arithmetic question in the image 18? Please answer yes or no. Yes 38 | 0019.png Is the answer to the arithmetic question in the image 36? Please answer yes or no. No 39 | 0020.png Is the answer to the arithmetic question in the image 9? Please answer yes or no. Yes 40 | 0020.png Is the answer to the arithmetic question in the image 45? Please answer yes or no. 
No 41 | -------------------------------------------------------------------------------- /eval/vlm/eval/mme/Your_Results/code_reasoning.txt: -------------------------------------------------------------------------------- 1 | 0001.png The image shows a python code. Is the output of the code 'Hello'? Please answer yes or no. Yes 2 | 0001.png The image shows a python code. Is the output of the code 'World'? Please answer yes or no. No 3 | 0002.png The image shows a python code. Is the output of the code 'a cat'? Please answer yes or no. Yes 4 | 0002.png The image shows a python code. Is the output of the code 'a dog'? Please answer yes or no. No 5 | 0003.png The image shows a python code. Is the output of the code '12'? Please answer yes or no. Yes 6 | 0003.png The image shows a python code. Is the output of the code '5'? Please answer yes or no. No 7 | 0004.png The image shows a python code. Is the output of the code '3'? Please answer yes or no. Yes 8 | 0004.png The image shows a python code. Is the output of the code '2'? Please answer yes or no. No 9 | 0005.png The image shows a python code. Is the output of the code '12'? Please answer yes or no. Yes 10 | 0005.png The image shows a python code. Is the output of the code '5'? Please answer yes or no. No 11 | 0006.png The image shows a python code. Is the output of the code '0'? Please answer yes or no. Yes 12 | 0006.png The image shows a python code. Is the output of the code '1'? Please answer yes or no. No 13 | 0007.png Is a c++ code shown in the picture? Please answer yes or no. Yes 14 | 0007.png Is a python code shown in the picture? Please answer yes or no. No 15 | 0008.png The image shows a python code. Is the output of the code '1234'? Please answer yes or no. Yes 16 | 0008.png The image shows a python code. Is the output of the code '12345'? Please answer yes or no. No 17 | 0009.png The image shows a python code. Is the output of the code '36'? Please answer yes or no. Yes 18 | 0009.png The image shows a python code. Is the output of the code '6'? Please answer yes or no. No 19 | 0010.png The image shows a python code. Is the output of the code '1'? Please answer yes or no. Yes 20 | 0010.png The image shows a python code. Is the output of the code '5'? Please answer yes or no. No 21 | 0011.png The image shows a python code. Is the output of the code '0'? Please answer yes or no. Yes 22 | 0011.png The image shows a python code. Is the output of the code '1'? Please answer yes or no. No 23 | 0012.png The image shows a python code. Is the output of the code 'working hard'? Please answer yes or no. Yes 24 | 0012.png The image shows a python code. Is the output of the code 'playing hard'? Please answer yes or no. No 25 | 0013.png The image shows a python code. Is the output of the code 'a cat'? Please answer yes or no. Yes 26 | 0013.png The image shows a python code. Is the output of the code 'a dog'? Please answer yes or no. No 27 | 0014.png The image shows a python code. Is the output of the code '7'? Please answer yes or no. Yes 28 | 0014.png The image shows a python code. Is the output of the code '1'? Please answer yes or no. No 29 | 0015.png The image shows a python code. Is the output of the code '11'? Please answer yes or no. Yes 30 | 0015.png The image shows a python code. Is the output of the code '9'? Please answer yes or no. No 31 | 0016.png The image shows a python code. Is the output of the code 'x is smaller than 10'? Please answer yes or no. Yes 32 | 0016.png The image shows a python code. 
Is the output of the code 'x is larger than 10'? Please answer yes or no. No 33 | 0017.png The image shows a python code. Will the number 3 appear in the output of the code? Please answer yes or no. Yes 34 | 0017.png The image shows a python code. Will the number 6 appear in the output of the code? Please answer yes or no. No 35 | 0018.png The image shows a python code. Is the output of the code '11'? Please answer yes or no. Yes 36 | 0018.png The image shows a python code. Is the output of the code '12'? Please answer yes or no. No 37 | 0019.png The image shows a python code. Is the output of the code 'the list has more than 2 numbers'? Please answer yes or no. Yes 38 | 0019.png The image shows a python code. Is the output of the code 'the list has less than 2 numbers'? Please answer yes or no. No 39 | 0020.png Is a python code shown in the picture? Please answer yes or no. Yes 40 | 0020.png Is a c++ code shown in the picture? Please answer yes or no. No 41 | -------------------------------------------------------------------------------- /eval/vlm/eval/mmvet/evaluate_mmvet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 11 | 12 | import argparse 13 | import json 14 | import os 15 | import random 16 | 17 | import torch 18 | from eval.vlm.utils import load_model_and_tokenizer, build_transform, process_conversation 19 | from PIL import Image 20 | from tqdm import tqdm 21 | 22 | ds_collections = { 23 | 'mmvet': { 24 | 'root': 'eval/vlm/data/mm-vet/images', 25 | 'question': 'eval/vlm/data/mm-vet/llava-mm-vet.jsonl', 26 | 'metric': None, 27 | 'max_new_tokens': 1000, 28 | 'min_new_tokens': 1, 29 | } 30 | } 31 | 32 | 33 | class VQADataset(torch.utils.data.Dataset): 34 | 35 | def __init__(self, root, data, prompt): 36 | self.root = root 37 | self.data = open(data).readlines() 38 | self.prompt = prompt 39 | 40 | def __len__(self): 41 | return len(self.data) 42 | 43 | def __getitem__(self, idx): 44 | data = json.loads(self.data[idx].strip()) 45 | image, question, question_id, annotation = data['image'], data[ 46 | 'text'], data['question_id'], data.get('answer', None) 47 | 48 | image = os.path.join(self.root, image) 49 | image = Image.open(image).convert('RGB') 50 | images = [image] 51 | 52 | question = question + ' ' + self.prompt 53 | 54 | images, conversation = process_conversation(images, question) 55 | 56 | return question_id, question, images, conversation, annotation 57 | 58 | 59 | def evaluate_chat_model(): 60 | random.seed(args.seed) 61 | prompt = '' 62 | 63 | for ds_name in args.datasets: 64 | dataset = VQADataset( 65 | root=ds_collections[ds_name]['root'], 66 | data=ds_collections[ds_name]['question'], 67 | prompt=prompt, 68 | ) 69 | 70 | outputs = {} 71 | for _, (question_id, question, images, conversation, annotations) in tqdm(enumerate(dataset)): 72 | pred = model.chat( 73 | tokenizer, 74 | new_token_ids, 75 | image_transform, 76 | images=images, 77 | prompt=conversation, 78 | max_length=ds_collections[ds_name]['max_new_tokens'], # TODO: how to use ds_collections[ds_name]['min_new_tokens'] 79 | ) 80 | 81 | 
outputs[f'v1_{question_id}'] = pred 82 | 83 | print(f'Evaluating {ds_name} ...') 84 | results_file = os.path.join(args.out_dir, 'results.json') 85 | json.dump(outputs, open(results_file, 'w')) 86 | print('Results saved to {}'.format(results_file)) 87 | 88 | 89 | if __name__ == '__main__': 90 | parser = argparse.ArgumentParser() 91 | parser.add_argument('--datasets', type=str, default='mmvet') 92 | parser.add_argument('--batch-size', type=int, default=1) 93 | parser.add_argument('--num-workers', type=int, default=1) 94 | parser.add_argument('--out-dir', type=str, default='results') 95 | parser.add_argument('--seed', type=int, default=0) 96 | parser.add_argument('--model-path', type=str, default='hf/BAGEL-7B-MoT/') 97 | args = parser.parse_args() 98 | 99 | if not os.path.exists(args.out_dir): 100 | os.makedirs(args.out_dir, exist_ok=True) 101 | 102 | args.datasets = args.datasets.split(',') 103 | print('datasets:', args.datasets) 104 | assert args.batch_size == 1, 'Only batch size 1 is supported' 105 | 106 | model, tokenizer, new_token_ids = load_model_and_tokenizer(args) 107 | image_transform = build_transform() 108 | 109 | total_params = sum(p.numel() for p in model.parameters()) / 1e9 110 | print(f'[test] total_params: {total_params}B') 111 | 112 | evaluate_chat_model() 113 | -------------------------------------------------------------------------------- /eval/gen/gedit/viescore/mllm_tools/qwen25vl_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import time 4 | from PIL import Image 5 | from typing import List 6 | from transformers import AutoModel, AutoTokenizer 7 | from transformers.utils import is_flash_attn_2_available 8 | from transformers import Qwen2_5_VLForConditionalGeneration 9 | from qwen_vl_utils import process_vision_info 10 | from transformers import AutoProcessor 11 | import requests 12 | from io import BytesIO 13 | import random 14 | import numpy as np 15 | import base64 16 | import magic 17 | import megfile 18 | 19 | def process_image(image): 20 | img_byte_arr = BytesIO() 21 | image.save(img_byte_arr, format='PNG') 22 | img_byte_arr = img_byte_arr.getvalue() 23 | return img_byte_arr 24 | 25 | def convert_image_to_base64(file_content): 26 | mime_type = magic.from_buffer(file_content, mime=True) 27 | base64_encoded_data = base64.b64encode(file_content).decode('utf-8') 28 | return f"data:{mime_type};base64,{base64_encoded_data}" 29 | 30 | 31 | def set_seed(seed: int): 32 | """ 33 | Args: 34 | Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch`. 35 | seed (`int`): The seed to set. 
36 | """ 37 | random.seed(seed) 38 | np.random.seed(seed) 39 | torch.manual_seed(seed) 40 | torch.cuda.manual_seed_all(seed) 41 | 42 | class Qwen25VL(): 43 | def __init__(self) -> None: 44 | attn_implementation = "flash_attention_2" if is_flash_attn_2_available() else None 45 | self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained( 46 | "/mnt/jfs-test/pretrained_models/Qwen2.5-VL-72B-Instruct-AWQ", 47 | torch_dtype=torch.float16, 48 | device_map="auto" 49 | ).eval() 50 | self.processor = AutoProcessor.from_pretrained("/mnt/jfs-test/pretrained_models/Qwen2.5-VL-72B-Instruct-AWQ") 51 | 52 | print(f"Using {attn_implementation} for attention implementation") 53 | 54 | def prepare_prompt(self, image_links: List = [], text_prompt: str = ""): 55 | if not isinstance(image_links, list): 56 | image_links = [image_links] 57 | 58 | image_links_base64 = [] 59 | 60 | for img_link in image_links: 61 | if type(img_link) == str: 62 | image_links_base64.append(convert_image_to_base64(process_image(megfile.smart_open(img_link, 'rb')))) 63 | else: 64 | image_links_base64.append(convert_image_to_base64(process_image(img_link))) 65 | 66 | messages = [ 67 | { 68 | "role": "user", 69 | "content": [ 70 | {"type": "image", "image": img_link} for img_link in image_links_base64 71 | ] + [{"type": "text", "text": text_prompt}] 72 | } 73 | ] 74 | return messages 75 | 76 | def get_parsed_output(self, messages): 77 | set_seed(42) 78 | # Prepare the inputs 79 | text = self.processor.apply_chat_template( 80 | messages, tokenize=False, add_generation_prompt=True 81 | ) 82 | image_inputs, video_inputs = process_vision_info(messages) 83 | 84 | # Process inputs 85 | inputs = self.processor( 86 | text=[text], 87 | images=image_inputs, 88 | videos=video_inputs, 89 | padding=True, 90 | return_tensors="pt" 91 | ) 92 | inputs = inputs.to("cuda") 93 | 94 | # Generate output 95 | generation_config = { 96 | "max_new_tokens": 512, 97 | "num_beams": 1, 98 | "do_sample": False, 99 | "temperature": 0.1, 100 | "top_p": None, 101 | } 102 | generated_ids = self.model.generate(**inputs, **generation_config) 103 | generated_ids_trimmed = [ 104 | out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) 105 | ] 106 | output_text = self.processor.batch_decode( 107 | generated_ids_trimmed, 108 | skip_special_tokens=True, 109 | clean_up_tokenization_spaces=False 110 | ) 111 | 112 | return output_text[0] if output_text else "" 113 | 114 | if __name__ == "__main__": 115 | model = Qwen25VL() 116 | prompt = model.prepare_prompt( 117 | ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"], 118 | 'Describe the image in detail.' 119 | ) 120 | res = model.get_parsed_output(prompt) 121 | print("result : \n", res) -------------------------------------------------------------------------------- /eval/vlm/eval/pope/eval_pope.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 
11 | 12 | import argparse 13 | import json 14 | import os 15 | import numpy as np 16 | 17 | 18 | def eval_pope(answers, label_file): 19 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 20 | 21 | for answer in answers: 22 | text = answer['text'] 23 | 24 | # Only keep the first sentence 25 | if text.find('.') != -1: 26 | text = text.split('.')[0] 27 | 28 | text = text.replace(',', '') 29 | words = text.split(' ') 30 | if 'No' in words or 'not' in words or 'no' in words: 31 | answer['text'] = 'no' 32 | else: 33 | answer['text'] = 'yes' 34 | 35 | for i in range(len(label_list)): 36 | if label_list[i] == 'no': 37 | label_list[i] = 0 38 | else: 39 | label_list[i] = 1 40 | 41 | pred_list = [] 42 | for answer in answers: 43 | if answer['text'] == 'no': 44 | pred_list.append(0) 45 | else: 46 | pred_list.append(1) 47 | 48 | pos = 1 49 | neg = 0 50 | yes_ratio = pred_list.count(1) / len(pred_list) 51 | 52 | TP, TN, FP, FN = 0, 0, 0, 0 53 | for pred, label in zip(pred_list, label_list): 54 | if pred == pos and label == pos: 55 | TP += 1 56 | elif pred == pos and label == neg: 57 | FP += 1 58 | elif pred == neg and label == neg: 59 | TN += 1 60 | elif pred == neg and label == pos: 61 | FN += 1 62 | 63 | ret_message = "" 64 | print('TP\tFP\tTN\tFN\t') 65 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 66 | ret_message += 'TP\tFP\tTN\tFN\t\n' 67 | ret_message += '{}\t{}\t{}\t{}\n'.format(TP, FP, TN, FN) 68 | 69 | precision = float(TP) / float(TP + FP) 70 | recall = float(TP) / float(TP + FN) 71 | f1 = 2 * precision * recall / (precision + recall) 72 | acc = (TP + TN) / (TP + TN + FP + FN) 73 | print('Accuracy: {}'.format(acc)) 74 | print('Precision: {}'.format(precision)) 75 | print('Recall: {}'.format(recall)) 76 | print('F1 score: {}'.format(f1)) 77 | print('Yes ratio: {}'.format(yes_ratio)) 78 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio)) 79 | 80 | ret_message += 'Accuracy: {}\n'.format(acc) 81 | ret_message += 'Precision: {}\n'.format(precision) 82 | ret_message += 'Recall: {}\n'.format(recall) 83 | ret_message += 'F1 score: {}\n'.format(f1) 84 | ret_message += 'Yes ratio: {}\n'.format(yes_ratio) 85 | ret_message += '%.3f, %.3f, %.3f, %.3f, %.3f\n' % (f1, acc, precision, recall, yes_ratio) 86 | return f1, ret_message 87 | 88 | 89 | if __name__ == '__main__': 90 | parser = argparse.ArgumentParser() 91 | parser.add_argument('--annotation-dir', type=str) 92 | parser.add_argument('--question-file', type=str) 93 | parser.add_argument('--result-file', type=str) 94 | parser.add_argument('--out-dir', type=str) 95 | args = parser.parse_args() 96 | 97 | questions = [json.loads(line) for line in open(args.question_file)] 98 | questions = {question['question_id']: question for question in questions} 99 | answers = json.loads(open(args.result_file).read()) 100 | avg_f1 = [] 101 | ret_message = "" 102 | for file in os.listdir(args.annotation_dir): 103 | assert file.startswith('coco_pope_') 104 | assert file.endswith('.json') 105 | category = file[10:-5] 106 | cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category] 107 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 108 | ret_message += 'Category: {}, # samples: {}\n'.format(category, len(cur_answers)) 109 | f1, ret = eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) 110 | ret_message += ret 111 | print('====================================') 112 | ret_message += '====================================\n' 113 | avg_f1.append(f1) 114 | 
print(f"Avg F1 score: {np.array(avg_f1).mean()}") 115 | ret_message += f"Avg F1 score: {np.array(avg_f1).mean()}\n" 116 | 117 | writer = open(os.path.join(args.out_dir, "results.txt"), 'w') 118 | print(f"write results to file {os.path.join(args.out_dir, 'results.txt')}") 119 | writer.write(ret_message) 120 | writer.close() 121 | -------------------------------------------------------------------------------- /eval/vlm/eval/mme/Your_Results/existence.txt: -------------------------------------------------------------------------------- 1 | 000000006040.jpg Is there a train in this image? Please answer yes or no. Yes 2 | 000000006040.jpg Is there a bed in this image? Please answer yes or no. No 3 | 000000006471.jpg Is there a baseball bat in this image? Please answer yes or no. Yes 4 | 000000006471.jpg Is there a giraffe in this image? Please answer yes or no. No 5 | 000000007108.jpg Is there a elephant in this image? Please answer yes or no. Yes 6 | 000000007108.jpg Is there a hair drier in this image? Please answer yes or no. No 7 | 000000007816.jpg Is there a motorcycle in this image? Please answer yes or no. Yes 8 | 000000007816.jpg Is there a airplane in this image? Please answer yes or no. No 9 | 000000007977.jpg Is there a skateboard in this image? Please answer yes or no. Yes 10 | 000000007977.jpg Is there a spoon in this image? Please answer yes or no. No 11 | 000000008844.jpg Is there a person in this image? Please answer yes or no. Yes 12 | 000000008844.jpg Is there a sink in this image? Please answer yes or no. No 13 | 000000009590.jpg Is there a bottle in this image? Please answer yes or no. Yes 14 | 000000009590.jpg Is there a scissors in this image? Please answer yes or no. No 15 | 000000010363.jpg Is there a bottle in this image? Please answer yes or no. Yes 16 | 000000010363.jpg Is there a apple in this image? Please answer yes or no. No 17 | 000000011197.jpg Is there a car in this image? Please answer yes or no. Yes 18 | 000000011197.jpg Is there a fork in this image? Please answer yes or no. No 19 | 000000015254.jpg Is there a spoon in this image? Please answer yes or no. Yes 20 | 000000015254.jpg Is there a donut in this image? Please answer yes or no. No 21 | 000000015517.jpg Is there a bus in this image? Please answer yes or no. Yes 22 | 000000015517.jpg Is there a cow in this image? Please answer yes or no. No 23 | 000000015746.jpg Is there a fire hydrant in this image? Please answer yes or no. Yes 24 | 000000015746.jpg Is there a person in this image? Please answer yes or no. No 25 | 000000037751.jpg Is there a backpack in this image? Please answer yes or no. Yes 26 | 000000037751.jpg Is there a microwave in this image? Please answer yes or no. No 27 | 000000050145.jpg Is there a bicycle in this image? Please answer yes or no. Yes 28 | 000000050145.jpg Is there a apple in this image? Please answer yes or no. No 29 | 000000061418.jpg Is there a chair in this image? Please answer yes or no. Yes 30 | 000000061418.jpg Is there a airplane in this image? Please answer yes or no. No 31 | 000000417779.jpg Is there a car in this image? Please answer yes or no. Yes 32 | 000000417779.jpg Is there a kite in this image? Please answer yes or no. No 33 | 000000424521.jpg Is there a skateboard in this image? Please answer yes or no. Yes 34 | 000000424521.jpg Is there a banana in this image? Please answer yes or no. No 35 | 000000438304.jpg Is there a sports ball in this image? Please answer yes or no. Yes 36 | 000000438304.jpg Is there a horse in this image? Please answer yes or no. 
No 37 | 000000494427.jpg Is there a laptop in this image? Please answer yes or no. Yes 38 | 000000494427.jpg Is there a potted plant in this image? Please answer yes or no. No 39 | 000000495448.jpg Is there a cake in this image? Please answer yes or no. Yes 40 | 000000495448.jpg Is there a tie in this image? Please answer yes or no. No 41 | 000000498463.jpg Is there a refrigerator in this image? Please answer yes or no. Yes 42 | 000000498463.jpg Is there a donut in this image? Please answer yes or no. No 43 | 000000519039.jpg Is there a truck in this image? Please answer yes or no. Yes 44 | 000000519039.jpg Is there a book in this image? Please answer yes or no. No 45 | 000000523241.jpg Is there a car in this image? Please answer yes or no. Yes 46 | 000000523241.jpg Is there a cell phone in this image? Please answer yes or no. No 47 | 000000530162.jpg Is there a umbrella in this image? Please answer yes or no. Yes 48 | 000000530162.jpg Is there a horse in this image? Please answer yes or no. No 49 | 000000537812.jpg Is there a chair in this image? Please answer yes or no. Yes 50 | 000000537812.jpg Is there a baseball bat in this image? Please answer yes or no. No 51 | 000000541952.jpg Is there a clock in this image? Please answer yes or no. Yes 52 | 000000541952.jpg Is there a bottle in this image? Please answer yes or no. No 53 | 000000546626.jpg Is there a bottle in this image? Please answer yes or no. Yes 54 | 000000546626.jpg Is there a mouse in this image? Please answer yes or no. No 55 | 000000556000.jpg Is there a chair in this image? Please answer yes or no. Yes 56 | 000000556000.jpg Is there a dog in this image? Please answer yes or no. No 57 | 000000557258.jpg Is there a toilet in this image? Please answer yes or no. Yes 58 | 000000557258.jpg Is there a pizza in this image? Please answer yes or no. No 59 | 000000572956.jpg Is there a motorcycle in this image? Please answer yes or no. Yes 60 | 000000572956.jpg Is there a bus in this image? Please answer yes or no. 
No 61 | -------------------------------------------------------------------------------- /eval/gen/gedit/viescore/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, 'viescore') 3 | 4 | from utils import ( 5 | mllm_output_to_dict 6 | ) 7 | import math 8 | import vie_prompts 9 | 10 | class VIEScore: 11 | def __init__(self, backbone="gpt4o", task="t2i", key_path=None, azure_endpoint='') -> None: 12 | self.task = task 13 | self.backbone_name = backbone 14 | 15 | if self.task not in ["t2i", "tie", "t2v"]: 16 | raise ValueError("task must be either 't2i' or 'tie'") 17 | 18 | if self.backbone_name == "gpt4o": 19 | from mllm_tools.openai import GPT4o 20 | self.model = GPT4o(key_path, model_name="gpt-4.1-2025-04-14", azure_endpoint=azure_endpoint) 21 | elif self.backbone_name == "qwen25vl": 22 | from mllm_tools.qwen25vl_eval import Qwen25VL 23 | self.model = Qwen25VL() 24 | else: 25 | raise NotImplementedError("backbone not supported") 26 | self.context = vie_prompts._context_no_delimit 27 | if self.task == "t2i": 28 | self.SC_prompt = "\n".join([self.context, vie_prompts._prompts_0shot_one_image_gen_rule, vie_prompts._prompts_0shot_t2i_rule_SC]) 29 | self.PQ_prompt = "\n".join([self.context, vie_prompts._prompts_0shot_rule_PQ]) 30 | elif self.task == "tie": 31 | self.SC_prompt = "\n".join([self.context, vie_prompts._prompts_0shot_two_image_edit_rule, vie_prompts._prompts_0shot_tie_rule_SC]) 32 | self.PQ_prompt = "\n".join([self.context, vie_prompts._prompts_0shot_rule_PQ]) 33 | elif self.task == "t2v": 34 | self.SC_prompt = "\n".join([self.context, vie_prompts._prompts_0shot_one_video_gen_rule, vie_prompts._prompts_0shot_t2v_rule_SC]) 35 | self.PQ_prompt = "\n".join([self.context, vie_prompts._prompts_0shot_t2v_rule_PQ]) 36 | 37 | def evaluate(self, image_prompts, text_prompt, extract_overall_score_only=False, extract_all_score=True, echo_output=False): 38 | if not isinstance(image_prompts, list): 39 | image_prompts = [image_prompts] 40 | if self.backbone_name in ['gpt4o', 'gpt4v']: 41 | self.model.use_encode = False if isinstance(image_prompts[0], str) else True 42 | #print("Using encode:", self.model.use_encode) 43 | if self.task == "t2i": 44 | _SC_prompt = self.SC_prompt.replace("", text_prompt) 45 | elif self.task == "tie": 46 | _SC_prompt = self.SC_prompt.replace("", text_prompt) 47 | elif self.task == "t2v": 48 | _SC_prompt = self.SC_prompt.replace("", text_prompt) 49 | SC_prompt_final = self.model.prepare_prompt(image_prompts, _SC_prompt) 50 | if self.task == "tie": 51 | PQ_prompt_final = self.model.prepare_prompt(image_prompts[-1], self.PQ_prompt) 52 | else: 53 | PQ_prompt_final = self.model.prepare_prompt(image_prompts, self.PQ_prompt) 54 | 55 | results_dict = {} 56 | 57 | SC_dict = False 58 | PQ_dict = False 59 | tries = 0 60 | max_tries = 1 61 | while SC_dict is False or PQ_dict is False: 62 | tries += 1 63 | guess_if_cannot_parse = True if tries > max_tries else False 64 | result_SC = self.model.get_parsed_output(SC_prompt_final) 65 | result_PQ = self.model.get_parsed_output(PQ_prompt_final) 66 | SC_dict = mllm_output_to_dict(result_SC, give_up_parsing=guess_if_cannot_parse) 67 | PQ_dict = mllm_output_to_dict(result_PQ, give_up_parsing=guess_if_cannot_parse) 68 | 69 | if SC_dict == "rate_limit_exceeded" or PQ_dict == "rate_limit_exceeded": 70 | print("rate_limit_exceeded") 71 | raise ValueError("rate_limit_exceeded") 72 | results_dict['SC'] = SC_dict 73 | results_dict['PQ'] = PQ_dict 74 | if echo_output: 
75 | print("results_dict", results_dict) 76 | if extract_all_score: 77 | SC_score = min(results_dict['SC']['score']) 78 | PQ_score = min(results_dict['PQ']['score']) 79 | O_score = math.sqrt(SC_score * PQ_score) 80 | return [SC_score, PQ_score, O_score] 81 | if extract_overall_score_only: 82 | SC_scores = results_dict['SC']['score'] 83 | PQ_scores = results_dict['PQ']['score'] 84 | O_score = math.sqrt(min(SC_scores) * min(PQ_scores)) 85 | return O_score 86 | return results_dict 87 | 88 | if __name__ == "__main__": 89 | model = VIEScore(backbone="gemini", task="t2i") 90 | from datasets import load_dataset 91 | dataset = load_dataset("TIGER-Lab/GenAI-Arena-Bench", "image_generation") 92 | dataset = dataset["test"] 93 | print("Now running the VIEScore model") 94 | for idx in range(5): 95 | left_image = dataset['left_image'][idx] 96 | right_image = dataset['right_image'][idx] 97 | prompt = dataset['prompt'][idx] 98 | print(model.evaluate(left_image, prompt, extract_all_score=True)) 99 | print(model.evaluate(right_image, prompt, extract_all_score=True)) 100 | 101 | -------------------------------------------------------------------------------- /modeling/qwen2/tokenization_qwen2_fast.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The Qwen Team and The HuggingFace Inc. team. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | """Tokenization classes for Qwen2.""" 5 | 6 | from typing import Optional, Tuple 7 | 8 | from transformers.tokenization_utils import AddedToken 9 | from transformers.tokenization_utils_fast import PreTrainedTokenizerFast 10 | from transformers.utils import logging 11 | from .tokenization_qwen2 import Qwen2Tokenizer 12 | 13 | 14 | logger = logging.get_logger(__name__) 15 | 16 | VOCAB_FILES_NAMES = { 17 | "vocab_file": "vocab.json", 18 | "merges_file": "merges.txt", 19 | "tokenizer_file": "tokenizer.json", 20 | } 21 | 22 | 23 | MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768} 24 | 25 | 26 | class Qwen2TokenizerFast(PreTrainedTokenizerFast): 27 | """ 28 | Construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level 29 | Byte-Pair-Encoding. 30 | 31 | Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will 32 | be encoded differently whether it is at the beginning of the sentence (without space) or not: 33 | 34 | ```python 35 | >>> from transformers import Qwen2TokenizerFast 36 | 37 | >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer") 38 | >>> tokenizer("Hello world")["input_ids"] 39 | [9707, 1879] 40 | 41 | >>> tokenizer(" Hello world")["input_ids"] 42 | [21927, 1879] 43 | ``` 44 | This is expected. 45 | 46 | This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should 47 | refer to this superclass for more information regarding those methods. 48 | 49 | Args: 50 | vocab_file (`str`, *optional*): 51 | Path to the vocabulary file. 52 | merges_file (`str`, *optional*): 53 | Path to the merges file. 54 | tokenizer_file (`str`, *optional*): 55 | Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that 56 | contains everything needed to load the tokenizer. 57 | unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 58 | The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this 59 | token instead. Not applicable to this tokenizer. 
60 | bos_token (`str`, *optional*): 61 | The beginning of sequence token. Not applicable for this tokenizer. 62 | eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 63 | The end of sequence token. 64 | pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 65 | The token used for padding, for example when batching sequences of different lengths. 66 | """ 67 | 68 | vocab_files_names = VOCAB_FILES_NAMES 69 | model_input_names = ["input_ids", "attention_mask"] 70 | slow_tokenizer_class = Qwen2Tokenizer 71 | 72 | def __init__( 73 | self, 74 | vocab_file=None, 75 | merges_file=None, 76 | tokenizer_file=None, 77 | unk_token="<|endoftext|>", 78 | bos_token=None, 79 | eos_token="<|endoftext|>", 80 | pad_token="<|endoftext|>", 81 | **kwargs, 82 | ): 83 | # We need to at least pass vocab_file and merges_file to base class 84 | # in case a slow tokenizer needs to be initialized; other can be 85 | # configured through files. 86 | # following GPT2TokenizerFast, also adding unk_token, bos_token, and eos_token 87 | 88 | bos_token = ( 89 | AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False) 90 | if isinstance(bos_token, str) 91 | else bos_token 92 | ) 93 | eos_token = ( 94 | AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False) 95 | if isinstance(eos_token, str) 96 | else eos_token 97 | ) 98 | unk_token = ( 99 | AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False) 100 | if isinstance(unk_token, str) 101 | else unk_token 102 | ) 103 | pad_token = ( 104 | AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False) 105 | if isinstance(pad_token, str) 106 | else pad_token 107 | ) 108 | 109 | super().__init__( 110 | vocab_file=vocab_file, 111 | merges_file=merges_file, 112 | tokenizer_file=tokenizer_file, 113 | unk_token=unk_token, 114 | bos_token=bos_token, 115 | eos_token=eos_token, 116 | pad_token=pad_token, 117 | **kwargs, 118 | ) 119 | 120 | # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast.save_vocabulary 121 | def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: 122 | files = self._tokenizer.model.save(save_directory, name=filename_prefix) 123 | return tuple(files) 124 | -------------------------------------------------------------------------------- /eval/vlm/eval/mme/Your_Results/color.txt: -------------------------------------------------------------------------------- 1 | 000000006723.jpg Is there a red brick building in the image? Please answer yes or no. Yes 2 | 000000006723.jpg Is there a yellow brick building in the image? Please answer yes or no. No 3 | 000000008277.jpg Is there a white plate in the image? Please answer yes or no. Yes 4 | 000000008277.jpg Is there a yellow plate in the image? Please answer yes or no. No 5 | 000000012120.jpg Is there a blue court in the image? Please answer yes or no. Yes 6 | 000000012120.jpg Is there a purple court in the image? Please answer yes or no. No 7 | 000000014831.jpg Is there a brown and white animal in the image? Please answer yes or no. Yes 8 | 000000014831.jpg Is there a green and red animal in the image? Please answer yes or no. No 9 | 000000028993.jpg Are there yellow poles in the image? Please answer yes or no. Yes 10 | 000000028993.jpg Are there blue poles in the image? Please answer yes or no. No 11 | 000000029393.jpg Is there a brown dog in the image? Please answer yes or no. Yes 12 | 000000029393.jpg Is there a black dog in the image? 
Please answer yes or no. No 13 | 000000035770.jpg Is there a black and white toilet in the image? Please answer yes or no. Yes 14 | 000000035770.jpg Is there a red and white toilet in the image? Please answer yes or no. No 15 | 000000038118.jpg Is there a red coat in the image? Please answer yes or no. Yes 16 | 000000038118.jpg Is there a yellow coat in the image? Please answer yes or no. No 17 | 000000047112.jpg Is there a white plate in the image? Please answer yes or no. Yes 18 | 000000047112.jpg Is there a yellow plate in the image? Please answer yes or no. No 19 | 000000047121.jpg Is there a black cat in the image? Please answer yes or no. Yes 20 | 000000047121.jpg Is there a brown cat in the image? Please answer yes or no. No 21 | 000000053529.jpg Is there a green hat in the image? Please answer yes or no. Yes 22 | 000000053529.jpg Is there a red hat in the image? Please answer yes or no. No 23 | 000000053994.jpg Is there a gray wall in the image? Please answer yes or no. Yes 24 | 000000053994.jpg Is there a red wall in the image? Please answer yes or no. No 25 | 000000055072.jpg Is there a brown giraffe in the image? Please answer yes or no. Yes 26 | 000000055072.jpg Is there a black giraffe in the image? Please answer yes or no. No 27 | 000000057597.jpg Are there any red shoes in the image? Please answer yes or no. Yes 28 | 000000057597.jpg Are there any yellow shoes in the image? Please answer yes or no. No 29 | 000000061658.jpg Are there a white dish in the image? Please answer yes or no. Yes 30 | 000000061658.jpg Are there a green dish in the image? Please answer yes or no. No 31 | 000000338560.jpg Is there a blue and yellow fire hydrant in the image? Please answer yes or no. Yes 32 | 000000338560.jpg Is there a blue and orange fire hydrant in the image? Please answer yes or no. No 33 | 000000370208.jpg Is there a red bicycle with white handlebars in the image? Please answer yes or no. Yes 34 | 000000370208.jpg Is there a red bicycle with black handlebars in the image? Please answer yes or no. No 35 | 000000377723.jpg Is there a blue bus in the image? Please answer yes or no. Yes 36 | 000000377723.jpg Is there a orange bus in the image? Please answer yes or no. No 37 | 000000405205.jpg Is there a white bus in the image? Please answer yes or no. Yes 38 | 000000405205.jpg Is there a red bus in the image? Please answer yes or no. No 39 | 000000410612.jpg Is there a red boat in the image? Please answer yes or no. Yes 40 | 000000410612.jpg Is there a gray boat in the image? Please answer yes or no. No 41 | 000000427034.jpg Is there a brown and black dog in the image? Please answer yes or no. Yes 42 | 000000427034.jpg Is there a brown and white dog in the image? Please answer yes or no. No 43 | 000000442456.jpg Is there a man wearing a red shirt in the image? Please answer yes or no. Yes 44 | 000000442456.jpg Is there a man wearing a white shirt in the image? Please answer yes or no. No 45 | 000000492362.jpg Is there a skateboard with red wheels in the image? Please answer yes or no. Yes 46 | 000000492362.jpg Is there a skateboard with black wheels in the image? Please answer yes or no. No 47 | 000000492992.jpg Is there a white bird in the image? Please answer yes or no. Yes 48 | 000000492992.jpg Is there a yellow bird in the image? Please answer yes or no. No 49 | 000000512929.jpg Are there any green beans in the image? Please answer yes or no. Yes 50 | 000000512929.jpg Are there any orange beans in the image? Please answer yes or no. 
No 51 | 000000530457.jpg Are there any red flowers in the image? Please answer yes or no. Yes 52 | 000000530457.jpg Are there any green flowers in the image? Please answer yes or no. No 53 | 000000532761.jpg Is there a living room painted yellow in the image? Please answer yes or no. Yes 54 | 000000532761.jpg Is there a living room painted black in the image? Please answer yes or no. No 55 | 000000534041.jpg Is there a purple bottle in the image? Please answer yes or no. Yes 56 | 000000534041.jpg Is there a white bottle in the image? Please answer yes or no. No 57 | 000000563758.jpg Is there a red scarf in the image? Please answer yes or no. Yes 58 | 000000563758.jpg Is there a brown scarf in the image? Please answer yes or no. No 59 | 000000564280.jpg Is there a red couch in the image? Please answer yes or no. Yes 60 | 000000564280.jpg Is there a black couch in the image? Please answer yes or no. No 61 | -------------------------------------------------------------------------------- /eval/vlm/eval/mmmu/main_eval_only.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 11 | 12 | """Parse and Evalate""" 13 | import os 14 | import json 15 | from argparse import ArgumentParser 16 | 17 | from .data_utils import CAT_SHORT2LONG, DOMAIN_CAT2SUB_CAT, save_json 18 | from .eval_utils import calculate_ins_level_acc, evaluate, parse_open_response 19 | 20 | if __name__ == '__main__': 21 | 22 | parser = ArgumentParser() 23 | parser.add_argument('--output_path', type=str, default='./example_outputs/qwen_vl/total_val_output.json', 24 | help='The path to model output file.') 25 | parser.add_argument('--answer_path', type=str, default='./answer_dict_val.json', help='Answer file path.') 26 | parser.add_argument('--out-dir', type=str, default='results') 27 | args = parser.parse_args() 28 | 29 | output_dict = json.load(open(args.output_path)) 30 | answer_dict = json.load(open(args.answer_path)) 31 | 32 | # group by category 33 | output_dict_w_cat = {} 34 | for data_id, parsed_pred in output_dict.items(): 35 | category = '_'.join(data_id.split('_')[1:-1]) 36 | if category not in output_dict_w_cat: 37 | output_dict_w_cat.update({category: {}}) 38 | output_dict_w_cat[category].update({data_id: parsed_pred}) 39 | 40 | # group by category 41 | answer_dict_w_cat = {} 42 | for data_id, parsed_pred in answer_dict.items(): 43 | category = '_'.join(data_id.split('_')[1:-1]) 44 | if category not in answer_dict_w_cat: 45 | answer_dict_w_cat.update({category: {}}) 46 | answer_dict_w_cat[category].update({data_id: parsed_pred}) 47 | 48 | evaluation_result = {} 49 | 50 | for category in CAT_SHORT2LONG.values(): 51 | print('Evaluating: {}'.format(category)) 52 | # get cat_outputs and cat_answers 53 | try: 54 | cat_outputs = output_dict_w_cat[category] 55 | cat_answers = answer_dict_w_cat[category] 56 | except KeyError: 57 | print('Skipping {} for not found'.format(category)) 58 | continue 59 | 60 | exampels_to_eval = [] 61 | for data_id, parsed_pred in cat_outputs.items(): 62 | question_type = cat_answers[data_id]['question_type'] 63 | if 
question_type != 'multiple-choice': 64 | parsed_pred = parse_open_response(parsed_pred) # mainly for type consistency (make it number, etc.) 65 | else: 66 | parsed_pred = parsed_pred 67 | 68 | exampels_to_eval.append({ 69 | 'id': data_id, 70 | 'question_type': question_type, 71 | 'answer': cat_answers[data_id]['ground_truth'], 72 | 'parsed_pred': parsed_pred 73 | }) 74 | 75 | judge_dict, metric_dict = evaluate(exampels_to_eval) 76 | metric_dict.update({'num_example': len(exampels_to_eval)}) 77 | 78 | evaluation_result[category] = metric_dict 79 | 80 | printable_results = {} 81 | # pdb.set_trace() 82 | # add domain Subject 83 | for domain, in_domain_cats in DOMAIN_CAT2SUB_CAT.items(): 84 | in_domain_cat_results = {} 85 | for cat_name in in_domain_cats: # use the order in DOMAIN_CAT2SUB_CAT 86 | if cat_name in evaluation_result.keys(): 87 | in_domain_cat_results[cat_name] = evaluation_result[cat_name] 88 | else: 89 | pass 90 | in_domain_ins_acc = calculate_ins_level_acc(in_domain_cat_results) 91 | in_domain_data_num = sum([cat_results['num_example'] for cat_results in in_domain_cat_results.values()]) 92 | printable_results['Overall-' + domain] = {'num': int(in_domain_data_num), 93 | 'acc': round(in_domain_ins_acc, 3) 94 | } 95 | # add sub category 96 | for cat_name, cat_results in in_domain_cat_results.items(): 97 | printable_results[cat_name] = {'num': int(cat_results['num_example']), 98 | 'acc': round(cat_results['acc'], 3) 99 | } 100 | 101 | # table.append(["-----------------------------", "-----", "----"]) 102 | all_ins_acc = calculate_ins_level_acc(evaluation_result) 103 | printable_results['Overall'] = { 104 | 'num': sum([cat_results['num_example'] for cat_results in evaluation_result.values()]), 105 | 'acc': round(all_ins_acc, 3)} 106 | 107 | print(printable_results) 108 | writer = open(os.path.join(args.out_dir, "results.txt"), 'w') 109 | print(f"write results to file {os.path.join(args.out_dir, 'results.txt')}") 110 | for key, value in printable_results.items(): 111 | line = f'{key}: num={value["num"]}, acc={value["acc"]}\n' 112 | writer.write(line) 113 | writer.close() 114 | -------------------------------------------------------------------------------- /eval/vlm/eval/mme/Your_Results/count.txt: -------------------------------------------------------------------------------- 1 | 000000006040.jpg Is there a train in the picture? Please answer yes or no. Yes 2 | 000000006040.jpg Are there a total of two trains in the picture? Please answer yes or no. No 3 | 000000044279.jpg Is there a total of two people in the image? Please answer yes or no. Yes 4 | 000000044279.jpg Is there only one people in the image? Please answer yes or no. No 5 | 000000067213.jpg Is there only one dog in the image? Please answer yes or no. Yes 6 | 000000067213.jpg Is there two dogs in the image? Please answer yes or no. No 7 | 000000071226.jpg Is there a total of two dogs in the image? Please answer yes or no. Yes 8 | 000000071226.jpg Is there only one dogs in the image? Please answer yes or no. No 9 | 000000097994.jpg Are there three laptops in the picture? Please answer yes or no. Yes 10 | 000000097994.jpg Are there four laptops in the picture? Please answer yes or no. No 11 | 000000195918.jpg Is there a total of two display devices in the image? Please answer yes or no. Yes 12 | 000000195918.jpg Is there only one display device in the image? Please answer yes or no. No 13 | 000000236721.jpg Are there two bananas in the image? Please answer yes or no. 
Yes 14 | 000000236721.jpg Are there three bananas in the image? Please answer yes or no. No 15 | 000000261712.jpg Are there two giraffes in this image? Please answer yes or no. Yes 16 | 000000261712.jpg Are there three giraffes in this picture? Please answer yes or no. No 17 | 000000274066.jpg Are there four people appear in this image? Please answer yes or no. Yes 18 | 000000274066.jpg Are there only three people appear in this image? Please answer yes or no. No 19 | 000000276434.jpg Is there a total of three cakes in this image? Please answer yes or no. Yes 20 | 000000276434.jpg Are there only two cakes in this image? Please answer yes or no. No 21 | 000000289059.jpg Is there a total of two person appear in the image? Please answer yes or no. Yes 22 | 000000289059.jpg Is there only one person appear in the image? Please answer yes or no. No 23 | 000000290081.jpg Is there only one bowl in this image? Please answer yes or no. Yes 24 | 000000290081.jpg Are there two bowls in this image? Please answer yes or no. No 25 | 000000301867.jpg Are there three people appear in this image? Please answer yes or no. Yes 26 | 000000301867.jpg Are there only two people appear in this image? Please answer yes or no. No 27 | 000000335954.jpg Are there two bowls in this image? Please answer yes or no. Yes 28 | 000000335954.jpg Are there three bowls in this image? Please answer yes or no. No 29 | 000000357816.jpg Are there four people in this image? Please answer yes or no. Yes 30 | 000000357816.jpg Are there five people in this image? Please answer yes or no. No 31 | 000000372819.jpg Are there four dogs appear in this image? Please answer yes or no. Yes 32 | 000000372819.jpg Are there only three dogs appear in this image? Please answer yes or no. No 33 | 000000410612.jpg Is there only one ship in the picture? Please answer yes or no. Yes 34 | 000000410612.jpg Is there a total of two ships in the picture? Please answer yes or no. No 35 | 000000423944.jpg Is there no person in this picture? Please answer yes or no. Yes 36 | 000000423944.jpg Are there two people appear in this image? Please answer yes or no. No 37 | 000000427034.jpg Is there a dog in the picture? Please answer yes or no. Yes 38 | 000000427034.jpg Are there a total of two dogs in the picture? Please answer yes or no. No 39 | 000000430286.jpg Are there three remotes in this image? Please answer yes or no. Yes 40 | 000000430286.jpg Are there only two remotes in this image? Please answer yes or no. No 41 | 000000432468.jpg Are there three zippers in the picture? Please answer yes or no. Yes 42 | 000000432468.jpg Is there a zipper in the picture? Please answer yes or no. No 43 | 000000434479.jpg Are there two pieces of pizza in this image? Please answer yes or no. Yes 44 | 000000434479.jpg Is there only one piece of pizza in this image? Please answer yes or no. No 45 | 000000438304.jpg Are there two tennis rackets in the picture? Please answer yes or no. Yes 46 | 000000438304.jpg Are there only one tennis racket in the picture? Please answer yes or no. No 47 | 000000450303.jpg Are there six people appear in this image? Please answer yes or no. Yes 48 | 000000450303.jpg Are there seven people appear in this image? Please answer yes or no. No 49 | 000000470121.jpg Is there only one bottle in the image? Please answer yes or no. Yes 50 | 000000470121.jpg Is there two bottles in the image? Please answer yes or no. No 51 | 000000476215.jpg Are there two horses in this image? Please answer yes or no. 
Yes 52 | 000000476215.jpg Is there only one horse in this image? Please answer yes or no. No 53 | 000000482100.jpg Are there two toilets in the picture? Please answer yes or no. Yes 54 | 000000482100.jpg Is there only one toilet in the picture? Please answer yes or no. No 55 | 000000491867.jpg Is there only one necktie in the image? Please answer yes or no. Yes 56 | 000000491867.jpg Is there three neckties in the image? Please answer yes or no. No 57 | 000000556000.jpg Are there four people in the image? Please answer yes or no. Yes 58 | 000000556000.jpg Are there only three people in the image? Please answer yes or no. No 59 | 000000565045.jpg Are there two bath towels in the picture? Please answer yes or no. Yes 60 | 000000565045.jpg Is there only one bath towel in the picture? Please answer yes or no. No 61 | -------------------------------------------------------------------------------- /eval/vlm/eval/mme/Your_Results/position.txt: -------------------------------------------------------------------------------- 1 | 000000006471.jpg Is the cricket bat above the batter's body? Please answer yes or no. Yes 2 | 000000006471.jpg Is the cricket bat under the batter's body Please answer yes or no. No 3 | 000000007281.jpg Is the sea behind people in the image? Please answer yes or no. Yes 4 | 000000007281.jpg Is the sea in front of people in the image? Please answer yes or no. No 5 | 000000014038.jpg Is the refrigerator on the left side of the picture? Please answer yes or no. Yes 6 | 000000014038.jpg Is the refrigerator on the right side of the picture Please answer yes or no. No 7 | 000000031248.jpg Is there a sofa in the middle of potted plants in the image? Please answer yes or no. Yes 8 | 000000031248.jpg Is there a sofa in the right side of potted plants in the image? Please answer yes or no. No 9 | 000000048504.jpg Is the gray elephant in front of the brown elephant? Please answer yes or no. Yes 10 | 000000048504.jpg Is the brown elephant in front of the gray elephant? Please answer yes or no. No 11 | 000000052007.jpg Are the pedestrians on the right of the bus? Please answer yes or no. Yes 12 | 000000052007.jpg Are the pedestrians on the left of the bus? Please answer yes or no. No 13 | 000000056127.jpg Is the light above the fire hydrant in the image? Please answer yes or no. Yes 14 | 000000056127.jpg Is the light under the fire hydrant in the image? Please answer yes or no. No 15 | 000000062025.jpg Is the trash can under the cup in the image? Please answer yes or no. Yes 16 | 000000062025.jpg Is the trash can above the cup in the image? Please answer yes or no. No 17 | 000000062808.jpg Is the phone above the pizza in the image? Please answer yes or no. Yes 18 | 000000062808.jpg Is the phone under the pizza in the image? Please answer yes or no. No 19 | 000000067213.jpg Is the dog above the pool in the image? Please answer yes or no. Yes 20 | 000000067213.jpg Is the dog under the pool in the image? Please answer yes or no. No 21 | 000000097994.jpg Is the light above the computer in the image? Please answer yes or no. Yes 22 | 000000097994.jpg Is the light under the computer in the image? Please answer yes or no. No 23 | 000000204871.jpg Is the car on the right side of the fire hydrant in the picture? Please answer yes or no. Yes 24 | 000000204871.jpg Is the car on the left side of the fire hydrant in the picture? Please answer yes or no. No 25 | 000000206487.jpg Is the motorcycle on the right side of the bus? Please answer yes or no. 
Yes 26 | 000000206487.jpg Is the motorcycle on the left side of the bus Please answer yes or no. No 27 | 000000211825.jpg Is the cake on the left side of the camera? Please answer yes or no. Yes 28 | 000000211825.jpg Is the cake on the right side of the camera? Please answer yes or no. No 29 | 000000212800.jpg Is the blue umbrella under the black umbrella? Please answer yes or no. Yes 30 | 000000212800.jpg Is the blue umbrella above the black umbrella? Please answer yes or no. No 31 | 000000395701.jpg Is the TV on the left of the bookshelf? Please answer yes or no. Yes 32 | 000000395701.jpg Is the TV on the right of the bookshelf? Please answer yes or no. No 33 | 000000395801.jpg Is the clock above people? Please answer yes or no. Yes 34 | 000000395801.jpg Is the clock under people? Please answer yes or no. No 35 | 000000405970.jpg Is the grey sofa on the right of the TV? Please answer yes or no. Yes 36 | 000000405970.jpg Is the grey sofa on the left of the TV? Please answer yes or no. No 37 | 000000426241.jpg Is the white mouse on the right of the black keyboard? Please answer yes or no. Yes 38 | 000000426241.jpg Is the white mouse on the left of the black keyboard? Please answer yes or no. No 39 | 000000450303.jpg Is the monitor on top of a person? Please answer yes or no. Yes 40 | 000000450303.jpg Is the monitor under the person? Please answer yes or no. No 41 | 000000458410.jpg Is the TV on the left of the lamp? Please answer yes or no. Yes 42 | 000000458410.jpg Is the TV on the right of the lamp? Please answer yes or no. No 43 | 000000472046.jpg Is the pineapple on the left of the pot in the image? Please answer yes or no. Yes 44 | 000000472046.jpg Is the pineapple on the right of the pot in the image? Please answer yes or no. No 45 | 000000477955.jpg Is the person under the kite? Please answer yes or no. Yes 46 | 000000477955.jpg Is the person above the kite? Please answer yes or no. No 47 | 000000482585.jpg Is the person on the right of the train? Please answer yes or no. Yes 48 | 000000482585.jpg Is the person on the left of the train? Please answer yes or no. No 49 | 000000494869.jpg Is the baby on the right of the dog in the image? Please answer yes or no. Yes 50 | 000000494869.jpg Is the baby on the left of the dog in the image? Please answer yes or no. No 51 | 000000509699.jpg Is the mirror above the TV? Please answer yes or no. Yes 52 | 000000509699.jpg Is the mirror under the TV? Please answer yes or no. No 53 | 000000519569.jpg Is the vase on the left of the bottle? Please answer yes or no. Yes 54 | 000000519569.jpg Is the vase on the right of the bottle? Please answer yes or no. No 55 | 000000530162.jpg Is the big red and black umbrella on the top of people? Please answer yes or no. Yes 56 | 000000530162.jpg Is the big red and black umbrella under people? Please answer yes or no. No 57 | 000000551660.jpg Is the spoon in the bowl? Please answer yes or no. Yes 58 | 000000551660.jpg Is the spoon out of the bowl? Please answer yes or no. No 59 | 000000578922.jpg Is the vase on the left of the toothbrush? Please answer yes or no. Yes 60 | 000000578922.jpg Is the vase on the right of the toothbrush? Please answer yes or no. No 61 | -------------------------------------------------------------------------------- /eval/vlm/eval/mme/Your_Results/text_translation.txt: -------------------------------------------------------------------------------- 1 | 0001.png Is it appropriate to translate the Chinese in the image into English 'classic taste' in the picture? Please answer yes or no. 
Yes 2 | 0001.png Is it appropriate to translate the Chinese in the image into English 'classic strawberry flavor' in the picture? Please answer yes or no. No 3 | 0002.png Is it appropriate to translate the Chinese in the image into English 'a delicious dinner' in the picture? Please answer yes or no. Yes 4 | 0002.png Is it appropriate to translate the Chinese in the image into English 'hamburger and chips' in the picture? Please answer yes or no. No 5 | 0003.png Is it appropriate to translate the Chinese in the image into English 'sunny weather' in the picture? Please answer yes or no. Yes 6 | 0003.png Is it appropriate to translate the Chinese in the image into English 'cold weather' in the picture? Please answer yes or no. No 7 | 0004.png Is it appropriate to translate the Chinese in the image into English 'run very fast' in the picture? Please answer yes or no. Yes 8 | 0004.png Is it appropriate to translate the Chinese in the image into English 'run very slow' in the picture? Please answer yes or no. No 9 | 0005.png Is it appropriate to translate the Chinese in the image into English 'feeling happy' in the picture? Please answer yes or no. Yes 10 | 0005.png Is it appropriate to translate the Chinese in the image into English 'feeling bored' in the picture? Please answer yes or no. No 11 | 0006.png Is it appropriate to translate the Chinese in the image into English 'work hard together' in the picture? Please answer yes or no. Yes 12 | 0006.png Is it appropriate to translate the Chinese in the image into English 'be filled with intrigue' in the picture? Please answer yes or no. No 13 | 0007.png Is it appropriate to translate the Chinese in the image into English 'walking very slowly' in the picture? Please answer yes or no. Yes 14 | 0007.png Is it appropriate to translate the Chinese in the image into English 'runing very slowly' in the picture? Please answer yes or no. No 15 | 0008.png Is it appropriate to translate the Chinese in the image into English 'very proud' in the picture? Please answer yes or no. Yes 16 | 0008.png Is it appropriate to translate the Chinese in the image into English 'very thankful' in the picture? Please answer yes or no. No 17 | 0009.png Is it appropriate to translate the Chinese in the image into English 'creative people' in the picture? Please answer yes or no. Yes 18 | 0009.png Is it appropriate to translate the Chinese in the image into English 'leading people' in the picture? Please answer yes or no. No 19 | 0010.png Is it appropriate to translate the Chinese in the image into English 'a beautiful garden' in the picture? Please answer yes or no. Yes 20 | 0010.png Is it appropriate to translate the Chinese in the image into English 'a beautiful campus' in the picture? Please answer yes or no. No 21 | 0011.png Is it appropriate to translate the Chinese in the image into English 'a difficult work' in the picture? Please answer yes or no. Yes 22 | 0011.png Is it appropriate to translate the Chinese in the image into English 'a easy work' in the picture? Please answer yes or no. No 23 | 0012.png Is it appropriate to translate the Chinese in the image into English 'a small amount' in the picture? Please answer yes or no. Yes 24 | 0012.png Is it appropriate to translate the Chinese in the image into English 'difficult and dangerous' in the picture? Please answer yes or no. No 25 | 0013.png Is it appropriate to translate the Chinese in the image into English 'feeling frustrated' in the picture? Please answer yes or no. 
Yes 26 | 0013.png Is it appropriate to translate the Chinese in the image into English 'feeling relaxed' in the picture? Please answer yes or no. No 27 | 0014.png Is it appropriate to translate the Chinese in the image into English 'waiting for a long time' in the picture? Please answer yes or no. Yes 28 | 0014.png Is it appropriate to translate the Chinese in the image into English 'sleeping for a long time' in the picture? Please answer yes or no. No 29 | 0015.png Is it appropriate to translate the Chinese in the image into English 'very powerful' in the picture? Please answer yes or no. Yes 30 | 0015.png Is it appropriate to translate the Chinese in the image into English 'to be fragile throughout the world' in the picture? Please answer yes or no. No 31 | 0016.png Is it appropriate to translate the Chinese in the image into English 'all talk and no action' in the picture? Please answer yes or no. Yes 32 | 0016.png Is it appropriate to translate the Chinese in the image into English 'hands-on practice' in the picture? Please answer yes or no. No 33 | 0017.png Is it appropriate to translate the Chinese in the image into English 'delicious fruit' in the picture? Please answer yes or no. Yes 34 | 0017.png Is it appropriate to translate the Chinese in the image into English 'banana' in the picture? Please answer yes or no. No 35 | 0018.png Is it appropriate to translate the Chinese in the image into English 'very unforgettable' in the picture? Please answer yes or no. Yes 36 | 0018.png Is it appropriate to translate the Chinese in the image into English 'very happy' in the picture? Please answer yes or no. No 37 | 0019.png Is it appropriate to translate the Chinese in the image into English 'get along well' in the picture? Please answer yes or no. Yes 38 | 0019.png Is it appropriate to translate the Chinese in the image into English 'for own self-interest' in the picture? Please answer yes or no. No 39 | 0020.png Is it appropriate to translate the Chinese in the image into English 'rank first' in the picture? Please answer yes or no. Yes 40 | 0020.png Is it appropriate to translate the Chinese in the image into English 'to add the finishing touches' in the picture? Please answer yes or no. No 41 | -------------------------------------------------------------------------------- /eval/vlm/eval/mathvista/extract_answer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 
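# Usage sketch (assumed invocation, inferred from the argparse options defined below; run from this directory):
#   export OPENAI_API_KEY=...
#   python extract_answer.py --output_dir ./results --output_file mathvista_answer.json --quick_extract
# The script re-reads {output_dir}/{output_file}, fills in an 'extraction' field for every problem
# that does not yet have one (rule-based extraction first, then a GPT call via get_chat_response),
# and saves the JSON back every --save_every problems.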
11 | 12 | import argparse 13 | 14 | from tqdm import tqdm 15 | from utilities import * 16 | 17 | openai.api_key = os.getenv('OPENAI_API_KEY') 18 | print(openai.api_key) 19 | 20 | # load demo prompt 21 | from prompts.ext_ans import demo_prompt 22 | 23 | 24 | def verify_extraction(extraction): 25 | extraction = extraction.strip() 26 | if extraction == '' or extraction is None: 27 | return False 28 | return True 29 | 30 | 31 | def create_test_prompt(demo_prompt, query, response): 32 | demo_prompt = demo_prompt.strip() 33 | test_prompt = f'{query}\n\n{response}' 34 | full_prompt = f'{demo_prompt}\n\n{test_prompt}\n\nExtracted answer: ' 35 | return full_prompt 36 | 37 | 38 | def _extract_answer(text): 39 | match = re.search(r'(Final answer:|Answer:)\s*(.*)', text, re.IGNORECASE) 40 | if match: 41 | return match.group(2).strip() 42 | return text 43 | 44 | 45 | def extract_answer(response, problem, quick_extract=False): 46 | question_type = problem['question_type'] 47 | answer_type = problem['answer_type'] 48 | choices = problem['choices'] 49 | query = problem['query'] 50 | 51 | if response == '': 52 | return '' 53 | 54 | if question_type == 'multi_choice' and response in choices: 55 | return response 56 | 57 | if answer_type == 'integer': 58 | try: 59 | extraction = int(response) 60 | return str(extraction) 61 | except: 62 | pass 63 | 64 | if answer_type == 'float': 65 | try: 66 | extraction = str(float(response)) 67 | return extraction 68 | except: 69 | pass 70 | 71 | # quick extraction 72 | if quick_extract: 73 | print('Quickly extracting answer...') 74 | # The answer is "text". -> "text" 75 | try: 76 | result = _extract_answer(response) 77 | return result 78 | # result = re.search(r'The answer is "(.*)"\.', response) 79 | # if result: 80 | # extraction = result.group(1) 81 | # return extraction 82 | except: 83 | pass 84 | 85 | # general extraction 86 | try: 87 | full_prompt = create_test_prompt(demo_prompt, query, response) 88 | extraction = get_chat_response(full_prompt, openai.api_key, patience=5) 89 | return extraction 90 | except Exception as e: 91 | print(e) 92 | print(f'Error in extracting answer for {pid}') 93 | 94 | return '' 95 | 96 | 97 | if __name__ == '__main__': 98 | parser = argparse.ArgumentParser() 99 | # input 100 | parser.add_argument('--output_dir', type=str, default='./results') 101 | parser.add_argument('--output_file', type=str, default='mathvista_answer.json') 102 | parser.add_argument('--response_label', type=str, default='response', help='response label for the input file') 103 | # model 104 | parser.add_argument('--llm_engine', type=str, default='gpt-4-0613', help='llm engine', 105 | choices=['gpt-3.5-turbo', 'gpt-3.5', 'gpt-4', 'gpt-4-0314', 'gpt-4-0613']) 106 | parser.add_argument('--number', type=int, default=-1, help='number of problems to run') 107 | parser.add_argument('--quick_extract', action='store_true', help='use rules to extract answer for some problems') 108 | parser.add_argument('--rerun', action='store_true', help='rerun the answer extraction') 109 | # output 110 | parser.add_argument('--save_every', type=int, default=10, help='save every n problems') 111 | parser.add_argument('--output_label', type=str, default='', help='label for the output file') 112 | args = parser.parse_args() 113 | 114 | # args 115 | label = args.response_label 116 | result_file = os.path.join(args.output_dir, args.output_file) 117 | 118 | if args.output_label != '': 119 | output_file = result_file.replace('.json', f'_{args.output_label}.json') 120 | else: 121 | output_file = 
result_file 122 | 123 | # read results 124 | print(f'Reading {result_file}...') 125 | results = read_json(result_file) 126 | 127 | # full pids 128 | full_pids = list(results.keys()) 129 | if args.number > 0: 130 | full_pids = full_pids[:min(args.number, len(full_pids))] 131 | print('Number of testing problems:', len(full_pids)) 132 | 133 | # test pids 134 | if args.rerun: 135 | test_pids = full_pids 136 | else: 137 | test_pids = [] 138 | for pid in full_pids: 139 | # print(pid) 140 | if 'extraction' not in results[pid] or not verify_extraction(results[pid]['extraction']): 141 | test_pids.append(pid) 142 | 143 | test_num = len(test_pids) 144 | print('Number of problems to run:', test_num) 145 | # print(test_pids) 146 | 147 | # tqdm, enumerate results 148 | for i, pid in enumerate(tqdm(test_pids)): 149 | problem = results[pid] 150 | 151 | assert label in problem 152 | response = problem[label] 153 | 154 | extraction = extract_answer(response, problem, args.quick_extract) 155 | results[pid]['extraction'] = extraction 156 | 157 | if i % args.save_every == 0 or i == test_num - 1: 158 | print(f'Saving results to {output_file}...') 159 | save_json(results, output_file) 160 | print(f'Results saved.') 161 | -------------------------------------------------------------------------------- /modeling/bagel/modeling_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Facebook, Inc. and its affiliates. 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: CC BY-NC 4.0 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under CC BY-NC 4.0, with the full license text 8 | # available at https://github.com/facebookresearch/DiT/blob/main/LICENSE.txt. 9 | # 10 | # This modified file is released under the same license. 
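# Shape sketch (illustrative values, not part of the original file): the helpers below build frozen
# 2D sin-cos position embeddings and DiT-style timestep embeddings, e.g.
#   get_2d_sincos_pos_embed(embed_dim=1024, grid_size=16)                      # -> np.ndarray of shape (256, 1024)
#   TimestepEmbedder(hidden_size=1024)(torch.tensor([0.25, 0.5]))              # -> torch.Tensor of shape (2, 1024)
#   PositionEmbedding(max_num_patch_per_side=16, hidden_size=1024)(position_ids)  # -> (len(position_ids), 1024), frozen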
11 | 12 | import math 13 | 14 | import numpy as np 15 | import torch 16 | from torch import nn 17 | from transformers.activations import ACT2FN 18 | 19 | # -------------------------------------------------------- 20 | # 2D sine-cosine position embedding 21 | # References: 22 | # DiT: https://github.com/facebookresearch/DiT/blob/main/models.py 23 | # -------------------------------------------------------- 24 | def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0): 25 | grid_h = np.arange(grid_size, dtype=np.float32) 26 | grid_w = np.arange(grid_size, dtype=np.float32) 27 | grid = np.meshgrid(grid_w, grid_h) # here w goes first 28 | grid = np.stack(grid, axis=0) 29 | 30 | grid = grid.reshape([2, 1, grid_size, grid_size]) 31 | pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) 32 | if cls_token and extra_tokens > 0: 33 | pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0) 34 | return pos_embed 35 | 36 | 37 | def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): 38 | assert embed_dim % 2 == 0 39 | 40 | # use half of dimensions to encode grid_h 41 | emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) 42 | emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) 43 | 44 | emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) 45 | return emb 46 | 47 | 48 | def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): 49 | """ 50 | embed_dim: output dimension for each position 51 | pos: a list of positions to be encoded: size (M,) 52 | out: (M, D) 53 | """ 54 | assert embed_dim % 2 == 0 55 | omega = np.arange(embed_dim // 2, dtype=np.float64) 56 | omega /= embed_dim / 2. 57 | omega = 1. / 10000**omega # (D/2,) 58 | 59 | pos = pos.reshape(-1) # (M,) 60 | out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product 61 | 62 | emb_sin = np.sin(out) # (M, D/2) 63 | emb_cos = np.cos(out) # (M, D/2) 64 | 65 | emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) 66 | return emb 67 | 68 | 69 | # -------------------------------------------------------- 70 | # TimestepEmbedder 71 | # Reference: 72 | # DiT: https://github.com/facebookresearch/DiT/blob/main/models.py 73 | # -------------------------------------------------------- 74 | class TimestepEmbedder(nn.Module): 75 | """ 76 | Embeds scalar timesteps into vector representations. 77 | """ 78 | def __init__(self, hidden_size, frequency_embedding_size=256): 79 | super().__init__() 80 | self.mlp = nn.Sequential( 81 | nn.Linear(frequency_embedding_size, hidden_size, bias=True), 82 | nn.SiLU(), 83 | nn.Linear(hidden_size, hidden_size, bias=True), 84 | ) 85 | self.frequency_embedding_size = frequency_embedding_size 86 | 87 | @staticmethod 88 | def timestep_embedding(t, dim, max_period=10000): 89 | """ 90 | Create sinusoidal timestep embeddings. 91 | :param t: a 1-D Tensor of N indices, one per batch element. 92 | These may be fractional. 93 | :param dim: the dimension of the output. 94 | :param max_period: controls the minimum frequency of the embeddings. 95 | :return: an (N, D) Tensor of positional embeddings. 
96 | """ 97 | half = dim // 2 98 | freqs = torch.exp( 99 | -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half 100 | ).to(device=t.device) 101 | args = t[:, None].float() * freqs[None] 102 | embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) 103 | if dim % 2: 104 | embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) 105 | return embedding 106 | 107 | def forward(self, t): 108 | t_freq = self.timestep_embedding(t, self.frequency_embedding_size) 109 | t_emb = self.mlp(t_freq) 110 | return t_emb 111 | 112 | 113 | class MLPconnector(nn.Module): 114 | def __init__(self, in_dim: int, out_dim: int, hidden_act: str): 115 | super().__init__() 116 | self.activation_fn = ACT2FN[hidden_act] 117 | self.fc1 = nn.Linear(in_dim, out_dim) 118 | self.fc2 = nn.Linear(out_dim, out_dim) 119 | 120 | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: 121 | hidden_states = self.fc1(hidden_states) 122 | hidden_states = self.activation_fn(hidden_states) 123 | hidden_states = self.fc2(hidden_states) 124 | return hidden_states 125 | 126 | 127 | class PositionEmbedding(nn.Module): 128 | def __init__(self, max_num_patch_per_side, hidden_size): 129 | super().__init__() 130 | self.max_num_patch_per_side = max_num_patch_per_side 131 | self.hidden_size = hidden_size 132 | self.pos_embed = nn.Parameter( 133 | torch.zeros(max_num_patch_per_side ** 2, hidden_size), 134 | requires_grad=False 135 | ) 136 | self._init_weights() 137 | 138 | def _init_weights(self): 139 | # Initialize (and freeze) pos_embed by sin-cos embedding: 140 | pos_embed = get_2d_sincos_pos_embed(self.hidden_size, self.max_num_patch_per_side) 141 | self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float()) 142 | 143 | def forward(self, position_ids): 144 | return self.pos_embed[position_ids] -------------------------------------------------------------------------------- /eval/gen/wise/cal_score.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import json 5 | import os 6 | import argparse 7 | from collections import defaultdict 8 | 9 | 10 | def calculate_wiscore(consistency, realism, aesthetic_quality): 11 | return 0.7 * consistency + 0.2 * realism + 0.1 * aesthetic_quality 12 | 13 | 14 | def cal_culture(file_path): 15 | all_scores = [] 16 | total_objects = 0 17 | has_9_9 = False 18 | 19 | with open(file_path, 'r') as file: 20 | for line in file: 21 | total_objects += 1 22 | data = json.loads(line) 23 | if 9.9 in [data['consistency'], data['realism'], data['aesthetic_quality']]: 24 | has_9_9 = True 25 | wiscore = calculate_wiscore(data['consistency'], data['realism'], data['aesthetic_quality']) 26 | all_scores.append(wiscore) 27 | 28 | if has_9_9 or total_objects < 400: 29 | print(f"Skipping file {file_path}: Contains 9.9 or has less than 400 objects.") 30 | return None 31 | 32 | total_score = sum(all_scores) 33 | avg_score = total_score / (len(all_scores)*2) if len(all_scores) > 0 else 0 34 | 35 | score = { 36 | 'total': total_score, 37 | 'average': avg_score 38 | } 39 | 40 | print(f" Cultural - Total: {score['total']:.2f}, Average: {score['average']:.2f}") 41 | 42 | return avg_score 43 | 44 | 45 | def cal_space_time(file_path): 46 | categories = defaultdict(list) 47 | total_objects = 0 48 | has_9_9 = False 49 | 50 | with open(file_path, 'r') as file: 51 | for line in file: 52 | total_objects += 1 53 | data = json.loads(line) 54 | if 9.9 in [data['consistency'], data['realism'], data['aesthetic_quality']]: 55 | has_9_9 = True 56 | subcategory = data['Subcategory'] 57 | wiscore = calculate_wiscore(data['consistency'], data['realism'], data['aesthetic_quality']) 58 | if subcategory in ['Longitudinal time', 'Horizontal time']: 59 | categories['Time'].append(wiscore) 60 | else: 61 | categories['Space'].append(wiscore) 62 | 63 | if has_9_9 or total_objects < 300: 64 | print(f"Skipping file {file_path}: Contains 9.9 or has less than 400 objects.") 65 | return None 66 | 67 | total_scores = {category: sum(scores) for category, scores in categories.items()} 68 | avg_scores = {category: sum(scores) / (len(scores) * 2 )if len(scores) > 0 else 0 for category, scores in categories.items()} 69 | 70 | scores = { 71 | 'total': total_scores, 72 | 'average': avg_scores 73 | } 74 | 75 | print(f" Time - Total: {scores['total'].get('Time', 0):.2f}, Average: {scores['average'].get('Time', 0):.2f}") 76 | print(f" Space - Total: {scores['total'].get('Space', 0):.2f}, Average: {scores['average'].get('Space', 0):.2f}") 77 | 78 | return avg_scores 79 | 80 | 81 | def cal_science(file_path): 82 | categories = defaultdict(list) 83 | total_objects = 0 84 | has_9_9 = False 85 | 86 | with open(file_path, 'r') as file: 87 | for line in file: 88 | total_objects += 1 89 | data = json.loads(line) 90 | if 9.9 in [data['consistency'], data['realism'], data['aesthetic_quality']]: 91 | has_9_9 = True 92 | 93 | prompt_id = data.get('prompt_id', 0) 94 | if 701 <= prompt_id <= 800: 95 | category = 'Biology' 96 | elif 801 <= prompt_id <= 900: 97 | category = 'Physics' 98 | elif 901 <= prompt_id <= 1000: 99 | category = 'Chemistry' 100 | else: 101 | category = "?" 
102 | 103 | wiscore = calculate_wiscore(data['consistency'], data['realism'], data['aesthetic_quality']) 104 | categories[category].append(wiscore) 105 | 106 | if has_9_9 or total_objects < 300: 107 | print(f"Skipping file {file_path}: Contains 9.9 or has less than 300 objects.") 108 | return None 109 | 110 | total_scores = {category: sum(scores) for category, scores in categories.items()} 111 | avg_scores = {category: sum(scores) / (len(scores)*2) if len(scores) > 0 else 0 for category, scores in categories.items()} 112 | 113 | scores = { 114 | 'total': total_scores, 115 | 'average': avg_scores 116 | } 117 | 118 | for category in ['Biology', 'Physics', 'Chemistry']: 119 | print(f" {category} - Total: {scores['total'].get(category, 0):.2f}, Average: {scores['average'].get(category, 0):.2f}") 120 | 121 | return avg_scores 122 | 123 | 124 | if __name__ == "__main__": 125 | parser = argparse.ArgumentParser(description='Image Quality Assessment Tool') 126 | parser.add_argument('--output_dir', required=True, 127 | help='Path to the output directory') 128 | args = parser.parse_args() 129 | 130 | avg_score = dict() 131 | 132 | score = cal_culture( 133 | os.path.join(args.output_dir, "cultural_common_sense_scores.jsonl") 134 | ) 135 | avg_score['Cultural'] = score 136 | 137 | scores = cal_space_time( 138 | os.path.join(args.output_dir, "spatio-temporal_reasoning_scores.jsonl") 139 | ) 140 | avg_score.update(scores) 141 | 142 | scores = cal_science( 143 | os.path.join(args.output_dir, "natural_science_scores.jsonl") 144 | ) 145 | avg_score.update(scores) 146 | 147 | avg_all = sum(avg_score.values()) / len(avg_score) 148 | 149 | avg_score['Overall'] = avg_all 150 | keys = "" 151 | values = "" 152 | for k, v in avg_score.items(): 153 | keys += f"{k} " 154 | values += f"{v:.2f} " 155 | print(keys) 156 | print(values) 157 | 158 | writer = open(os.path.join(args.output_dir, "results.txt"), 'w') 159 | print(f"write results to file {os.path.join(args.output_dir, 'results.txt')}") 160 | writer.write(keys + "\n") 161 | writer.write(values + "\n") 162 | writer.close() -------------------------------------------------------------------------------- /eval/vlm/eval/mathvista/extract_answer_mp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 
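# Usage sketch (assumed invocation; same extraction logic as extract_answer.py, but multi-threaded):
#   python extract_answer_mp.py --output_dir ./results --output_file mathvista_answer.json --max_workers 40
# Problems are dispatched to a ThreadPoolExecutor and the JSON is checkpointed every --save_every
# completed problems.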
11 | 12 | 13 | import argparse 14 | import os 15 | import re 16 | import openai 17 | from concurrent.futures import ThreadPoolExecutor, as_completed 18 | from tqdm import tqdm 19 | from utilities import * 20 | from prompts.ext_ans import demo_prompt 21 | 22 | openai.api_key = os.getenv('OPENAI_API_KEY') 23 | print(openai.api_key) 24 | 25 | def verify_extraction(extraction): 26 | extraction = extraction.strip() 27 | if extraction == '' or extraction is None: 28 | return False 29 | return True 30 | 31 | def create_test_prompt(demo_prompt, query, response): 32 | demo_prompt = demo_prompt.strip() 33 | test_prompt = f'{query}\n\n{response}' 34 | full_prompt = f'{demo_prompt}\n\n{test_prompt}\n\nExtracted answer: ' 35 | return full_prompt 36 | 37 | def _extract_answer(text): 38 | match = re.search(r'(Final answer:|Answer:)\s*(.*)', text, re.IGNORECASE) 39 | if match: 40 | return match.group(2).strip() 41 | return text 42 | 43 | def extract_answer(response, problem, quick_extract=False): 44 | question_type = problem['question_type'] 45 | answer_type = problem['answer_type'] 46 | choices = problem['choices'] 47 | query = problem['query'] 48 | 49 | if response == '': 50 | return '' 51 | 52 | if question_type == 'multi_choice' and response in choices: 53 | return response 54 | 55 | if answer_type == 'integer': 56 | try: 57 | extraction = int(response) 58 | return str(extraction) 59 | except: 60 | pass 61 | 62 | if answer_type == 'float': 63 | try: 64 | extraction = str(float(response)) 65 | return extraction 66 | except: 67 | pass 68 | 69 | # quick extraction 70 | if quick_extract: 71 | print('Quickly extracting answer...') 72 | try: 73 | result = _extract_answer(response) 74 | return result 75 | except: 76 | pass 77 | 78 | try: 79 | full_prompt = create_test_prompt(demo_prompt, query, response) 80 | extraction = get_chat_response(full_prompt, openai.api_key, patience=5, model=args.llm_engine) 81 | return extraction 82 | except Exception as e: 83 | print(e) 84 | 85 | return '' 86 | 87 | def process_problem(pid, results, label, args): 88 | problem = results[pid] 89 | response = problem[label] 90 | extraction = extract_answer(response, problem, args.quick_extract) 91 | return pid, extraction 92 | 93 | if __name__ == '__main__': 94 | parser = argparse.ArgumentParser() 95 | # input 96 | parser.add_argument('--output_dir', type=str, default='./results') 97 | parser.add_argument('--output_file', type=str, default='mathvista_answer.json') 98 | parser.add_argument('--response_label', type=str, default='response', help='response label for the input file') 99 | # model 100 | parser.add_argument('--llm_engine', type=str, default='gpt-4o-2024-11-20', help='llm engine', 101 | choices=['gpt-3.5-turbo', 'gpt-3.5', 'gpt-4', 'gpt-4-0314', 'gpt-4-0613', 102 | 'gpt-4o-2024-08-06', 'gpt-4o-2024-11-20']) 103 | parser.add_argument('--number', type=int, default=-1, help='number of problems to run') 104 | parser.add_argument('--quick_extract', action='store_true', help='use rules to extract answer for some problems') 105 | parser.add_argument('--rerun', action='store_true', help='rerun the answer extraction') 106 | # output 107 | parser.add_argument('--save_every', type=int, default=100, help='save every n problems') 108 | parser.add_argument('--output_label', type=str, default='', help='label for the output file') 109 | parser.add_argument('--max_workers', type=int, default=40, help='max workers for ThreadPoolExecutor') 110 | args = parser.parse_args() 111 | 112 | label = args.response_label 113 | result_file = 
os.path.join(args.output_dir, args.output_file) 114 | 115 | if args.output_label != '': 116 | output_file = result_file.replace('.json', f'_{args.output_label}.json') 117 | else: 118 | output_file = result_file 119 | 120 | print(f'Reading {result_file}...') 121 | results = read_json(result_file) 122 | 123 | full_pids = list(results.keys()) 124 | if args.number > 0: 125 | full_pids = full_pids[:min(args.number, len(full_pids))] 126 | print('Number of total problems:', len(full_pids)) 127 | 128 | if args.rerun: 129 | test_pids = full_pids 130 | else: 131 | test_pids = [] 132 | for pid in full_pids: 133 | if 'extraction' not in results[pid] or not verify_extraction(results[pid]['extraction']): 134 | test_pids.append(pid) 135 | 136 | test_num = len(test_pids) 137 | print('Number of problems to run:', test_num) 138 | 139 | with ThreadPoolExecutor(max_workers=args.max_workers) as executor: 140 | future_to_pid = {} 141 | for pid in test_pids: 142 | future = executor.submit(process_problem, pid, results, label, args) 143 | future_to_pid[future] = pid 144 | 145 | completed_count = 0 146 | for future in tqdm(as_completed(future_to_pid), total=test_num): 147 | pid = future_to_pid[future] 148 | try: 149 | pid_result, extraction = future.result() 150 | results[pid_result]['extraction'] = extraction 151 | except Exception as e: 152 | print(f'Error processing pid={pid}: {e}') 153 | 154 | completed_count += 1 155 | if (completed_count % args.save_every == 0) or (completed_count == test_num): 156 | print(f'Saving results to {output_file}... [{completed_count}/{test_num}]') 157 | save_json(results, output_file) 158 | print('Results saved.') 159 | 160 | print('All done!') 161 | -------------------------------------------------------------------------------- /eval/vlm/evaluate.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 
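# Usage sketch (assumed invocation from the repo root; benchmark names are matched by the case blocks below):
#   GPUS=8 MASTER_PORT=29500 bash eval/vlm/evaluate.sh mmmu-val
# The first positional argument selects the benchmark (mme, mmvet, mmbench-*, mmmu-*, mathvista-*,
# pope, ...); any remaining arguments are forwarded unchanged to the benchmark module. Multi-GPU
# benchmarks launch via torchrun and expect the ARNOLD_* cluster variables to be set; the --auto
# flag sets GPUS=1.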
11 | 12 | set -x 13 | 14 | export PYTHONPATH="$(pwd):${PYTHONPATH}" 15 | export TF_CPP_MIN_LOG_LEVEL=3 16 | export LAUNCHER=pytorch 17 | 18 | DATASET=${1} 19 | echo "CHECKPOINT: ${CHECKPOINT}" 20 | 21 | # Save original arguments 22 | ARGS=("$@") 23 | 24 | # Parse options 25 | while [[ $# -gt 0 ]]; do 26 | case "$1" in 27 | --auto) 28 | GPUS=1 29 | shift 30 | ;; 31 | *) 32 | shift 33 | ;; 34 | esac 35 | done 36 | echo "GPUS: ${GPUS}" 37 | 38 | if [ ${DATASET} == "mme" ]; then 39 | python -m eval.vlm.eval.mme.eval "${ARGS[@]:1}" 40 | fi 41 | 42 | if [ ${DATASET} == "mmvet" ]; then 43 | python -m eval.vlm.eval.mmvet.evaluate_mmvet --datasets mmvet "${ARGS[@]:1}" 44 | fi 45 | 46 | if [ ${DATASET} == "mmbench-dev-en" ]; then 47 | torchrun \ 48 | --nnodes=$ARNOLD_WORKER_NUM \ 49 | --node_rank=$ARNOLD_ID \ 50 | --master_addr=$ARNOLD_WORKER_0_HOST \ 51 | --nproc_per_node=${GPUS} \ 52 | --master_port=${MASTER_PORT} \ 53 | -m eval.vlm.eval.mmbench.evaluate_mmbench --datasets mmbench_dev_20230712 "${ARGS[@]:1}" 54 | fi 55 | 56 | if [ ${DATASET} == "mmbench-dev-cn" ]; then 57 | torchrun \ 58 | --nnodes=$ARNOLD_WORKER_NUM \ 59 | --node_rank=$ARNOLD_ID \ 60 | --master_addr=$ARNOLD_WORKER_0_HOST \ 61 | --nproc_per_node=${GPUS} \ 62 | --master_port=${MASTER_PORT} \ 63 | -m eval.vlm.eval.mmbench.evaluate_mmbench --datasets mmbench_dev_cn_20231003 "${ARGS[@]:1}" 64 | fi 65 | 66 | if [ ${DATASET} == "mmbench-test-en" ]; then 67 | torchrun \ 68 | --nnodes=$ARNOLD_WORKER_NUM \ 69 | --node_rank=$ARNOLD_ID \ 70 | --master_addr=$ARNOLD_WORKER_0_HOST \ 71 | --nproc_per_node=${GPUS} \ 72 | --master_port=${MASTER_PORT} \ 73 | -m eval.vlm.eval.mmbench.evaluate_mmbench --datasets mmbench_test_en_20231003 "${ARGS[@]:1}" 74 | fi 75 | 76 | if [ ${DATASET} == "mmbench-test-cn" ]; then 77 | torchrun \ 78 | --nnodes=$ARNOLD_WORKER_NUM \ 79 | --node_rank=$ARNOLD_ID \ 80 | --master_addr=$ARNOLD_WORKER_0_HOST \ 81 | --nproc_per_node=${GPUS} \ 82 | --master_port=${MASTER_PORT} \ 83 | -m eval.vlm.eval.mmbench.evaluate_mmbench --datasets mmbench_test_cn_20231003 "${ARGS[@]:1}" 84 | fi 85 | 86 | if [ ${DATASET} == "mmmu-dev" ]; then 87 | torchrun \ 88 | --nnodes=$ARNOLD_WORKER_NUM \ 89 | --node_rank=$ARNOLD_ID \ 90 | --master_addr=$ARNOLD_WORKER_0_HOST \ 91 | --nproc_per_node=${GPUS} \ 92 | --master_port=${MASTER_PORT} \ 93 | -m eval.vlm.eval.mmmu.evaluate_mmmu --datasets MMMU_dev "${ARGS[@]:1}" 94 | fi 95 | 96 | if [ ${DATASET} == "mmmu-val" ]; then 97 | torchrun \ 98 | --nnodes=$ARNOLD_WORKER_NUM \ 99 | --node_rank=$ARNOLD_ID \ 100 | --master_addr=$ARNOLD_WORKER_0_HOST \ 101 | --nproc_per_node=${GPUS} \ 102 | --master_port=${MASTER_PORT} \ 103 | -m eval.vlm.eval.mmmu.evaluate_mmmu --datasets MMMU_validation "${ARGS[@]:1}" 104 | fi 105 | 106 | if [ ${DATASET} == "mmmu-val_cot" ]; then 107 | torchrun \ 108 | --nnodes=$ARNOLD_WORKER_NUM \ 109 | --node_rank=$ARNOLD_ID \ 110 | --master_addr=$ARNOLD_WORKER_0_HOST \ 111 | --nproc_per_node=${GPUS} \ 112 | --master_port=${MASTER_PORT} \ 113 | -m eval.vlm.eval.mmmu.evaluate_mmmu_cot --datasets MMMU_validation_cot "${ARGS[@]:1}" 114 | fi 115 | 116 | if [ ${DATASET} == "mmmu-test" ]; then 117 | torchrun \ 118 | --nnodes=$ARNOLD_WORKER_NUM \ 119 | --node_rank=$ARNOLD_ID \ 120 | --master_addr=$ARNOLD_WORKER_0_HOST \ 121 | --nproc_per_node=${GPUS} \ 122 | --master_port=${MASTER_PORT} \ 123 | -m eval.vlm.eval.mmmu.evaluate_mmmu --datasets MMMU_test "${ARGS[@]:1}" 124 | fi 125 | 126 | if [ ${DATASET} == "mathvista-testmini" ]; then 127 | torchrun \ 128 | --nnodes=$ARNOLD_WORKER_NUM \ 129 | 
--node_rank=$ARNOLD_ID \ 130 | --master_addr=$ARNOLD_WORKER_0_HOST \ 131 | --nproc_per_node=${GPUS} \ 132 | --master_port=${MASTER_PORT} \ 133 | -m eval.vlm.eval.mathvista.evaluate_mathvista --datasets MathVista_testmini "${ARGS[@]:1}" 134 | fi 135 | 136 | if [ ${DATASET} == "mathvista-test" ]; then 137 | torchrun \ 138 | --nnodes=$ARNOLD_WORKER_NUM \ 139 | --node_rank=$ARNOLD_ID \ 140 | --master_addr=$ARNOLD_WORKER_0_HOST \ 141 | --nproc_per_node=${GPUS} \ 142 | --master_port=${MASTER_PORT} \ 143 | -m eval.vlm.eval.mathvista.evaluate_mathvista --datasets MathVista_test "${ARGS[@]:1}" 144 | fi 145 | 146 | if [ ${DATASET} == "pope" ]; then 147 | torchrun \ 148 | --nnodes=$ARNOLD_WORKER_NUM \ 149 | --node_rank=$ARNOLD_ID \ 150 | --master_addr=$ARNOLD_WORKER_0_HOST \ 151 | --nproc_per_node=${GPUS} \ 152 | --master_port=${MASTER_PORT} \ 153 | -m eval.vlm.eval.pope.evaluate_pope --datasets pope "${ARGS[@]:1}" 154 | fi 155 | 156 | if [ ${DATASET} == "pope_cot" ]; then 157 | torchrun \ 158 | --nnodes=$ARNOLD_WORKER_NUM \ 159 | --node_rank=$ARNOLD_ID \ 160 | --master_addr=$ARNOLD_WORKER_0_HOST \ 161 | --nproc_per_node=${GPUS} \ 162 | --master_port=${MASTER_PORT} \ 163 | -m eval.vlm.eval.pope.evaluate_pope --datasets pope_cot --cot "${ARGS[@]:1}" 164 | fi 165 | 166 | if [ ${DATASET} == "vqa-gqa-testdev" ]; then 167 | torchrun \ 168 | --nnodes=$ARNOLD_WORKER_NUM \ 169 | --node_rank=$ARNOLD_ID \ 170 | --master_addr=$ARNOLD_WORKER_0_HOST \ 171 | --nproc_per_node=${GPUS} \ 172 | --master_port=${MASTER_PORT} \ 173 | -m eval.vlm.eval.vqa.evaluate_vqa --datasets gqa_testdev_llava "${ARGS[@]:1}" 174 | fi 175 | 176 | if [ ${DATASET} == "mmvp" ]; then 177 | torchrun \ 178 | --nnodes=$ARNOLD_WORKER_NUM \ 179 | --node_rank=$ARNOLD_ID \ 180 | --master_addr=$ARNOLD_WORKER_0_HOST \ 181 | --nproc_per_node=${GPUS} \ 182 | --master_port=${MASTER_PORT} \ 183 | -m eval.vlm.eval.mmvp.evaluate_mmvp --datasets MMVP "${ARGS[@]:1}" 184 | fi 185 | -------------------------------------------------------------------------------- /data/t2i_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import io 5 | import json 6 | import pyarrow.parquet as pq 7 | import random 8 | from PIL import Image 9 | 10 | from .data_utils import pil_img2rgb 11 | from .distributed_iterable_dataset import DistributedIterableDataset 12 | from .parquet_utils import get_parquet_data_paths, init_arrow_pf_fs 13 | 14 | Image.MAX_IMAGE_PIXELS = 20_000_000 15 | 16 | 17 | class T2IIterableDataset(DistributedIterableDataset): 18 | def __init__( 19 | self, dataset_name, transform, tokenizer, data_dir_list, num_used_data, 20 | local_rank=0, world_size=1, num_workers=8, data_status=None, 21 | ): 22 | """ 23 | data_dir_list: list of data directories contains parquet files 24 | num_used_data: list of number of sampled data paths for each data directory 25 | """ 26 | super().__init__(dataset_name, local_rank, world_size, num_workers) 27 | self.transform = transform 28 | self.tokenizer = tokenizer 29 | self.data_status = data_status 30 | self.data_paths = self.get_data_paths(data_dir_list, num_used_data) 31 | self.set_epoch() 32 | 33 | def get_data_paths(self, data_dir_list, num_used_data): 34 | return get_parquet_data_paths(data_dir_list, num_used_data) 35 | 36 | def __iter__(self): 37 | data_paths_per_worker, worker_id = self.get_data_paths_per_worker() 38 | if self.data_status is not None: 39 | parquet_start_id = self.data_status[worker_id][0] 40 | row_group_start_id = self.data_status[worker_id][1] 41 | row_start_id = self.data_status[worker_id][2] + 1 42 | else: 43 | parquet_start_id = 0 44 | row_group_start_id = 0 45 | row_start_id = 0 46 | transform_stride = self.transform.stride 47 | 48 | print( 49 | f"rank-{self.local_rank} worker-{worker_id} dataset-{self.dataset_name}: " 50 | f"resuming data at parquet#{parquet_start_id}, rg#{row_group_start_id}, row#{row_start_id}" 51 | ) 52 | 53 | while True: 54 | data_paths_per_worker_ = data_paths_per_worker[parquet_start_id:] 55 | for parquet_idx, parquet_file_path in enumerate(data_paths_per_worker_, start=parquet_start_id): 56 | fs = init_arrow_pf_fs(parquet_file_path) 57 | with fs.open_input_file(parquet_file_path) as f: 58 | fr = pq.ParquetFile(f) 59 | row_group_ids = list(range(fr.num_row_groups)) 60 | row_group_ids_ = row_group_ids[row_group_start_id:] 61 | 62 | for row_group_id in row_group_ids_: 63 | df = fr.read_row_group(row_group_id).to_pandas() 64 | df = df.iloc[row_start_id:] 65 | 66 | for row_idx, row in df.iterrows(): 67 | num_tokens = 0 68 | try: 69 | image_byte = row['image'] 70 | image = pil_img2rgb(Image.open(io.BytesIO(image_byte))) 71 | except Exception as e: 72 | print(f'Error: {e} in rg#{row_group_id}, {parquet_file_path}') 73 | continue 74 | image_tensor = self.transform(image) 75 | height, width = image_tensor.shape[1:] 76 | num_tokens += width * height // transform_stride ** 2 77 | 78 | try: 79 | caption_dict = row['captions'] 80 | caption_dict = json.loads(caption_dict) 81 | except Exception as e: 82 | print(f'Error: {e} in rg#{row_group_id}, {parquet_file_path}') 83 | continue 84 | 85 | caps_token = [self.tokenizer.encode(v) for _, v in caption_dict.items()] 86 | if len(caps_token) == 0: 87 | print(f'no caption in rg#{row_group_id}, {parquet_file_path}') 88 | caption_token = self.tokenizer.encode(' ') 89 | else: 90 | caption_token = random.choice(caps_token) 91 | 92 | sequence_plan, text_ids_list = [], [] 93 | text_ids = caption_token 94 | num_tokens += len(caption_token) 95 | text_ids_list.append(text_ids) 96 | sequence_plan.append({ 97 | 'type': 'text', 98 | 'enable_cfg': 1, 99 | 'loss': 
0, 100 | 'special_token_loss': 0, 101 | 'special_token_label': None, 102 | }) 103 | 104 | sequence_plan.append({ 105 | 'type': 'vae_image', 106 | 'enable_cfg': 0, 107 | 'loss': 1, 108 | 'special_token_loss': 0, 109 | 'special_token_label': None, 110 | }) 111 | 112 | sample = dict( 113 | image_tensor_list=[image_tensor], 114 | text_ids_list=text_ids_list, 115 | num_tokens=num_tokens, 116 | sequence_plan=sequence_plan, 117 | data_indexes={ 118 | "data_indexes": [parquet_idx, row_group_id, row_idx], 119 | "worker_id": worker_id, 120 | "dataset_name": self.dataset_name, 121 | } 122 | ) 123 | yield sample 124 | 125 | row_start_id = 0 126 | row_group_start_id = 0 127 | parquet_start_id = 0 128 | print(f"{self.dataset_name} repeat in rank-{self.local_rank} worker-{worker_id}") 129 | -------------------------------------------------------------------------------- /eval/vlm/eval/mme/calculation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 11 | 12 | import argparse 13 | import os 14 | 15 | from sklearn.metrics import (accuracy_score, confusion_matrix, precision_score, 16 | recall_score) 17 | 18 | parser = argparse.ArgumentParser() 19 | # parser.add_argument('--results_dir', default='./LaVIN', type=str) 20 | parser.add_argument('--out-dir', default='./', type=str) 21 | 22 | eval_type_dict = { 23 | 'Perception': ['existence', 'count', 'position', 'color', 'posters', 'celebrity', 'scene', 'landmark', 'artwork', 'OCR'], 24 | 'Cognition': ['commonsense_reasoning', 'numerical_calculation', 'text_translation', 'code_reasoning'] 25 | } 26 | 27 | 28 | class calculate_metrics: 29 | def divide_chunks(self, l, n=2): 30 | # looping till length l 31 | for i in range(0, len(l), n): 32 | yield l[i:i + n] 33 | 34 | return 35 | 36 | def parse_pred_ans(self, pred_ans): 37 | pred_label = None 38 | if pred_ans in ['yes', 'no']: 39 | pred_label = pred_ans 40 | else: 41 | prefix_pred_ans = pred_ans[:4] 42 | 43 | if 'yes' in prefix_pred_ans: 44 | pred_label = 'yes' 45 | elif 'no' in prefix_pred_ans: 46 | pred_label = 'no' 47 | else: 48 | pred_label = 'other' 49 | 50 | return pred_label 51 | 52 | def compute_metric(self, gts, preds): 53 | assert len(gts) == len(preds) 54 | 55 | label_map = { 56 | 'yes': 1, 57 | 'no': 0, 58 | 'other': -1, 59 | } 60 | 61 | gts = [label_map[x] for x in gts] 62 | preds = [label_map[x] for x in preds] 63 | 64 | acc = accuracy_score(gts, preds) 65 | 66 | clean_gts = [] 67 | clean_preds = [] 68 | other_num = 0 69 | for gt, pred in zip(gts, preds): 70 | if pred == -1: 71 | other_num += 1 72 | continue 73 | clean_gts.append(gt) 74 | clean_preds.append(pred) 75 | 76 | conf_mat = confusion_matrix(clean_gts, clean_preds, labels=[1,0]) 77 | precision = precision_score(clean_gts, clean_preds, average='binary') 78 | recall = recall_score(clean_gts, clean_preds, average='binary') 79 | tp, fn = conf_mat[0] 80 | fp, tn = conf_mat[1] 81 | 82 | metric_dict = dict() 83 | metric_dict = { 84 | 'TP': tp, 85 | 'FN': fn, 86 | 'TN': tn, 87 | 'FP': fp, 88 | 'precision': precision, 89 | 'recall': recall, 90 | 'other_num': other_num, 91 | 
'acc': acc, 92 | } 93 | 94 | return metric_dict 95 | 96 | def process_result(self, results_dir): 97 | ret_message = "" 98 | model_score_dict = dict() 99 | for eval_type, task_name_list in eval_type_dict.items(): 100 | print('===========', eval_type, '===========') 101 | ret_message += f"=========== {eval_type} ===========\n" 102 | 103 | scores = 0 104 | task_score_dict = dict() 105 | 106 | for task_name in task_name_list: 107 | 108 | task_txt = os.path.join(results_dir, task_name + '.txt') 109 | lines = open(task_txt, 'r').readlines() 110 | chunk_lines = list(self.divide_chunks(lines)) # one image corresponds to two questions 111 | 112 | img_num = len(chunk_lines) 113 | task_other_ans_num = 0 114 | task_score = 0 115 | acc_plus_correct_num = 0 116 | gts = [] 117 | preds = [] 118 | 119 | for img_items in chunk_lines: 120 | assert len(img_items) == 2 121 | img_correct_num = 0 122 | 123 | for img_item in img_items: 124 | try: 125 | img_name, question, gt_ans, pred_ans = img_item.split('\t') 126 | except: 127 | print(img_item) 128 | continue 129 | gt_ans = gt_ans.lower() 130 | pred_ans = pred_ans.lower() 131 | 132 | assert gt_ans in ['yes', 'no'] # gt can only be yes or no. 133 | 134 | pred_ans = self.parse_pred_ans(pred_ans) 135 | assert pred_ans in ['yes', 'no', 'other'] 136 | 137 | gts.append(gt_ans) 138 | preds.append(pred_ans) 139 | 140 | if gt_ans == pred_ans: 141 | img_correct_num += 1 142 | 143 | if pred_ans not in ['yes', 'no']: 144 | task_other_ans_num += 1 145 | 146 | if img_correct_num == 2: 147 | acc_plus_correct_num += 1 148 | 149 | # cal TP precision acc, etc. 150 | metric_dict = self.compute_metric(gts, preds) 151 | acc_plus = acc_plus_correct_num / img_num 152 | metric_dict['acc_plus'] = acc_plus 153 | 154 | for k, v in metric_dict.items(): 155 | if k in ['acc', 'acc_plus']: 156 | task_score += v*100 157 | 158 | task_score_dict[task_name] = task_score 159 | 160 | scores += task_score 161 | 162 | print('total score:', scores, '\n') 163 | ret_message += f"total score: {scores} \n\n" 164 | for task_name, score in task_score_dict.items(): 165 | print('\t', task_name, ' score:', score) 166 | ret_message += f"\t {task_name} score: {score}\n" 167 | print('\n') 168 | ret_message += "\n\n" 169 | 170 | return ret_message 171 | 172 | 173 | if __name__ == '__main__': 174 | cal = calculate_metrics() 175 | 176 | args = parser.parse_args() 177 | # results_dir = args.results_dir 178 | results_dir = args.out_dir 179 | ret_message = cal.process_result(results_dir) 180 | 181 | writer = open(os.path.join(args.out_dir, "results.txt"), 'w') 182 | print(f"write results to file {os.path.join(args.out_dir, 'results.txt')}") 183 | writer.write(ret_message) 184 | writer.close() -------------------------------------------------------------------------------- /data/video_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 
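# Usage sketch (illustrative paths, not part of the original file): the helpers below sample frames
# either from a video file via decord or from a directory of pre-extracted frames, e.g.
#   sampler = FrameSampler(max_num_frames=16, min_num_frames=8, sample='rand')
#   frames = sampler('/path/to/clip.mp4')     # list of 8-16 PIL.Image frames (count drawn at random)
#   frames = sampler('/path/to/frames_dir/')  # a trailing '/' switches to folder-based reading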
11 | 12 | 13 | import io 14 | import os 15 | import random 16 | import re 17 | 18 | import numpy as np 19 | import decord 20 | from PIL import Image 21 | 22 | 23 | def get_frame_indices(num_frames, vlen, sample='rand', fix_start=None, input_fps=1, max_num_frames=-1): 24 | if sample in ['rand', 'middle']: # uniform sampling 25 | acc_samples = min(num_frames, vlen) 26 | # split the video into `acc_samples` intervals, and sample from each interval. 27 | intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int) 28 | ranges = [] 29 | for idx, interv in enumerate(intervals[:-1]): 30 | ranges.append((interv, intervals[idx + 1] - 1)) 31 | if sample == 'rand': 32 | try: 33 | frame_indices = [random.choice(range(x[0], x[1])) for x in ranges] 34 | except: 35 | frame_indices = np.random.permutation(vlen)[:acc_samples] 36 | frame_indices.sort() 37 | frame_indices = list(frame_indices) 38 | elif fix_start is not None: 39 | frame_indices = [x[0] + fix_start for x in ranges] 40 | elif sample == 'middle': 41 | frame_indices = [(x[0] + x[1]) // 2 for x in ranges] 42 | else: 43 | raise NotImplementedError 44 | 45 | if len(frame_indices) < num_frames: # padded with last frame 46 | padded_frame_indices = [frame_indices[-1]] * num_frames 47 | padded_frame_indices[:len(frame_indices)] = frame_indices 48 | frame_indices = padded_frame_indices 49 | elif 'fps' in sample: # fps0.5, sequentially sample frames at 0.5 fps 50 | output_fps = float(sample[3:]) 51 | duration = float(vlen) / input_fps 52 | delta = 1 / output_fps # gap between frames, this is also the clip length each frame represents 53 | frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta) 54 | frame_indices = np.around(frame_seconds * input_fps).astype(int) 55 | frame_indices = [e for e in frame_indices if e < vlen] 56 | if max_num_frames > 0 and len(frame_indices) > max_num_frames: 57 | frame_indices = frame_indices[:max_num_frames] 58 | else: 59 | raise ValueError 60 | return frame_indices 61 | 62 | 63 | def read_frames_decord(video_path, num_frames, sample='rand', fix_start=None, clip=None, min_num_frames=4): 64 | video_reader = decord.VideoReader(video_path, num_threads=1) 65 | vlen = len(video_reader) 66 | fps = video_reader.get_avg_fps() 67 | duration = vlen / float(fps) 68 | if clip: 69 | start, end = clip 70 | duration = end - start 71 | vlen = int(duration * fps) 72 | start_index = int(start * fps) 73 | 74 | t_num_frames = np.random.randint(min_num_frames, num_frames + 1) 75 | 76 | frame_indices = get_frame_indices( 77 | t_num_frames, vlen, sample=sample, fix_start=fix_start, 78 | input_fps=fps 79 | ) 80 | if clip: 81 | frame_indices = [f + start_index for f in frame_indices] 82 | frames = video_reader.get_batch(frame_indices).asnumpy() # (T, H, W, C), np.uint8 83 | frames = [Image.fromarray(frames[i]) for i in range(frames.shape[0])] 84 | return frames 85 | 86 | 87 | def extract_frame_number(filename): 88 | # Extract the numeric part from the filename using regular expressions 89 | match = re.search(r'_(\d+).jpg$', filename) 90 | return int(match.group(1)) if match else -1 91 | 92 | 93 | def sort_frames(frame_paths): 94 | # Extract filenames from each path and sort by their numeric part 95 | return sorted(frame_paths, key=lambda x: extract_frame_number(os.path.basename(x))) 96 | 97 | 98 | def read_frames_folder(video_path, num_frames, sample='rand', fix_start=None, min_num_frames=4): 99 | image_list = sort_frames(list(os.listdir(video_path))) 100 | frames = [] 101 | for image in image_list: 102 | fp = 
os.path.join(video_path, image) 103 | frame = Image.open(fp).convert('RGB') 104 | frames.append(frame) 105 | vlen = len(frames) 106 | 107 | t_num_frames = np.random.randint(min_num_frames, num_frames + 1) 108 | 109 | if vlen > t_num_frames: 110 | frame_indices = get_frame_indices( 111 | t_num_frames, vlen, sample=sample, fix_start=fix_start 112 | ) 113 | frames = [frames[i] for i in frame_indices] 114 | return frames 115 | 116 | 117 | class FrameSampler: 118 | def __init__(self, max_num_frames=-1, min_num_frames=8, sample='rand'): 119 | self.max_num_frames = max_num_frames 120 | self.min_num_frames = min_num_frames 121 | self.sample = sample 122 | 123 | def __call__(self, file_name): 124 | fn = read_frames_folder if file_name.endswith('/') else read_frames_decord 125 | frames = fn(file_name, num_frames=self.max_num_frames, min_num_frames=self.min_num_frames, sample=self.sample) 126 | return frames 127 | 128 | 129 | def decode_video_byte(video_bytes): 130 | video_stream = io.BytesIO(video_bytes) 131 | vr = decord.VideoReader(video_stream) 132 | return vr 133 | 134 | 135 | def sample_mp4_frames(mp4_p, n_frames=None, fps=None, return_frame_indices=False, random_sample=False): 136 | if isinstance(mp4_p, str): 137 | vr = decord.VideoReader(mp4_p, num_threads=1) 138 | elif isinstance(mp4_p, decord.video_reader.VideoReader): 139 | vr = mp4_p 140 | video_fps = vr.get_avg_fps() # get the video's average frame rate 141 | video_duration = len(vr) / video_fps 142 | if n_frames is not None: 143 | if random_sample: 144 | frame_indices = sorted(random.sample(range(len(vr)), n_frames)) 145 | else: 146 | frame_indices = np.linspace(0, len(vr)-1, n_frames, dtype=int).tolist() 147 | else: 148 | frame_indices = [int(i) for i in np.arange(0, len(vr)-1, video_fps/fps)] 149 | frames = vr.get_batch(frame_indices).asnumpy() # convert to a numpy array 150 | frames = [Image.fromarray(frame).convert("RGB") for frame in frames] 151 | if not return_frame_indices: 152 | return frames, video_duration 153 | else: 154 | return frames, video_duration, frame_indices 155 | 156 | 157 | def sample_mp4_frames_by_indices(mp4_p, frame_indices: list): 158 | if isinstance(mp4_p, str): 159 | vr = decord.VideoReader(mp4_p, num_threads=1) 160 | elif isinstance(mp4_p, decord.video_reader.VideoReader): 161 | vr = mp4_p 162 | # sample the frames in frame_indices 163 | frames = vr.get_batch(frame_indices).asnumpy() # convert to a numpy array 164 | frames = [Image.fromarray(frame).convert("RGB") for frame in frames] 165 | return frames -------------------------------------------------------------------------------- /eval/gen/imgedit/basic_bench.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates.
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import base64 5 | import os 6 | import json 7 | import argparse 8 | import openai 9 | from tqdm import tqdm 10 | from concurrent.futures import ThreadPoolExecutor, as_completed 11 | import threading 12 | 13 | openai.api_key = os.getenv('OPENAI_API_KEY') 14 | 15 | lock = threading.Lock() # For thread-safe file writing 16 | 17 | def load_prompts(prompts_json_path): 18 | with open(prompts_json_path, 'r') as f: 19 | return json.load(f) 20 | 21 | def image_to_base64(image_path): 22 | try: 23 | with open(image_path, "rb") as image_file: 24 | return base64.b64encode(image_file.read()).decode('utf-8') 25 | except FileNotFoundError: 26 | print(f"File {image_path} not found.") 27 | return None 28 | 29 | def call_gpt(original_image_path, result_image_path, edit_prompt, edit_type, prompts): 30 | try: 31 | original_image_base64 = image_to_base64(original_image_path) 32 | result_image_base64 = image_to_base64(result_image_path) 33 | 34 | if not original_image_base64 or not result_image_base64: 35 | return {"error": "Image conversion failed"} 36 | 37 | prompt = prompts[edit_type] 38 | full_prompt = prompt.replace('', edit_prompt) 39 | 40 | response = openai_client.chat.completions.create( 41 | model=model, 42 | stream=False, 43 | messages=[{ 44 | "role": "user", 45 | "content": [ 46 | {"type": "text", "text": full_prompt}, 47 | {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{original_image_base64}"}}, 48 | {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{result_image_base64}"}} 49 | ] 50 | }] 51 | ) 52 | return response 53 | except Exception as e: 54 | print(f"Error in calling GPT API: {e}") 55 | raise 56 | 57 | def save_result_jsonl(result, key, output_jsonl_path): 58 | with lock: 59 | with open(output_jsonl_path, 'a', encoding='utf-8') as f: 60 | data = { 61 | "key": key, 62 | "result": result 63 | } 64 | f.write(json.dumps(data, ensure_ascii=False) + '\n') 65 | 66 | def load_processed_keys(jsonl_path): 67 | processed_keys = set() 68 | if os.path.exists(jsonl_path): 69 | with open(jsonl_path, 'r', encoding='utf-8') as f: 70 | for line in f: 71 | try: 72 | data = json.loads(line) 73 | processed_keys.add(data["key"]) 74 | except Exception as e: 75 | print(f"Error loading line: {e}") 76 | return processed_keys 77 | 78 | def collect_jsonl_to_dict(jsonl_path): 79 | result_dict = {} 80 | if os.path.exists(jsonl_path): 81 | with open(jsonl_path, 'r', encoding='utf-8') as f: 82 | for line in f: 83 | try: 84 | data = json.loads(line) 85 | result_dict[data["key"]] = data["result"] 86 | except Exception as e: 87 | print(f"Error parsing line: {e}") 88 | return result_dict 89 | 90 | def process_single_item(key, item, result_img_folder, origin_img_root, prompts, output_jsonl_path): 91 | result_img_name = f"{key}.png" 92 | result_img_path = os.path.join(result_img_folder, result_img_name) 93 | origin_img_path = os.path.join(origin_img_root, item['id']) 94 | edit_prompt = item['prompt'] 95 | edit_type = item['edit_type'] 96 | 97 | response = call_gpt(origin_img_path, result_img_path, edit_prompt, edit_type, prompts) 98 | # Ensure 'choices' attribute exists in response 99 | result = response.choices[0].message.content if hasattr(response, "choices") else str(response) 100 | save_result_jsonl(result, key, output_jsonl_path) 101 | return key, result 102 | 103 | def process_json(edit_json, result_img_folder, origin_img_root, num_threads, prompts): 104 | output_jsonl_path = os.path.join(result_img_folder, 'result.jsonl') 105 | 
output_json_path = os.path.join(result_img_folder, 'result.json') 106 | with open(edit_json, 'r') as f: 107 | edit_infos = json.load(f) 108 | # Load already processed keys 109 | processed_keys = load_processed_keys(output_jsonl_path) 110 | print(f"{len(processed_keys)} items already processed, {len(edit_infos) - len(processed_keys)} remaining...") 111 | # Filter out tasks that have already been processed 112 | left_edit_infos = {k: v for k, v in edit_infos.items() if k not in processed_keys} 113 | total = len(left_edit_infos) 114 | if total == 0: 115 | print("Nothing to process. All items are completed.") 116 | else: 117 | with ThreadPoolExecutor(max_workers=num_threads) as executor: 118 | future_to_key = { 119 | executor.submit(process_single_item, key, item, result_img_folder, origin_img_root, prompts, output_jsonl_path): key 120 | for key, item in left_edit_infos.items() 121 | } 122 | for future in tqdm(as_completed(future_to_key), total=total, desc="Processing edits"): 123 | key = future_to_key[future] 124 | try: 125 | future.result() # Already saved in jsonl 126 | except Exception as e: 127 | print(f"Error processing key {key}: {e}") 128 | # Failed keys will not be saved to jsonl 129 | # After all finished, collect jsonl to dict and save to json 130 | final_results = collect_jsonl_to_dict(output_jsonl_path) 131 | with open(output_json_path, 'w', encoding='utf-8') as f: 132 | json.dump(final_results, f, indent=4, ensure_ascii=False) 133 | print(f"All processing completed. Final result saved in {output_json_path}") 134 | 135 | def main(): 136 | parser = argparse.ArgumentParser(description="Evaluate image edits using GPT") 137 | parser.add_argument('--result_img_folder', type=str, required=True, help="Folder with subfolders of edited images") 138 | parser.add_argument('--edit_json', type=str, required=True, help="Path to JSON file mapping keys to metadata") 139 | parser.add_argument('--origin_img_root', type=str, required=True, help="Root path where original images are stored") 140 | parser.add_argument('--num_processes', type=int, default=32, help="Number of parallel threads") 141 | parser.add_argument('--prompts_json', type=str, required=True, help="JSON file containing prompts") 142 | args = parser.parse_args() 143 | 144 | prompts = load_prompts(args.prompts_json) 145 | process_json(args.edit_json, args.result_img_folder, args.origin_img_root, args.num_processes, prompts) 146 | 147 | if __name__ == "__main__": 148 | base_url = "your_api_url" 149 | api_version = "2024-03-01-preview" 150 | api_key = openai.api_key 151 | model = "gpt-4o-2024-11-20" 152 | openai_client = openai.AzureOpenAI( 153 | azure_endpoint=base_url, 154 | api_version=api_version, 155 | api_key=api_key, 156 | ) 157 | main() 158 | -------------------------------------------------------------------------------- /data/data_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | 5 | import math 6 | import random 7 | from PIL import Image 8 | 9 | import torch 10 | from torch.nn.attention.flex_attention import or_masks, and_masks 11 | 12 | 13 | def create_sparse_mask(document_lens, split_lens, attn_modes, device): 14 | def causal_mask(b, h, q_idx, kv_idx): 15 | return q_idx >= kv_idx 16 | 17 | def full_and_noise_mask(b, h, q_idx, kv_idx): 18 | return (full_and_noise_seq_id[q_idx] == full_and_noise_seq_id[kv_idx]) & (full_and_noise_seq_id[q_idx] >= 0) 19 | 20 | def remove_noise_mask(b, h, q_idx, kv_idx): 21 | return (~((noise_seq_id[kv_idx] >= 0) & (noise_seq_id[q_idx] != noise_seq_id[kv_idx]))) 22 | 23 | def sample_mask(b, h, q_idx, kv_idx): 24 | return document_id[q_idx] == document_id[kv_idx] 25 | 26 | full_and_noise_tmp = [] 27 | noise_tmp = [] 28 | 29 | for i, (length, model) in enumerate(zip(split_lens, attn_modes)): 30 | value = i if model in ['full', 'noise'] else -1 31 | full_and_noise_tmp.extend([value] * length) 32 | value_noise = i if model == 'noise' else -1 33 | noise_tmp.extend([value_noise] * length) 34 | 35 | full_and_noise_seq_id = torch.Tensor(full_and_noise_tmp).to(device) 36 | noise_seq_id = torch.Tensor(noise_tmp).to(device) 37 | 38 | document_id = torch.cat([torch.full((l,), i) for i, l in enumerate(document_lens, start=1)]).to(device) 39 | 40 | return and_masks(or_masks(causal_mask, full_and_noise_mask), remove_noise_mask, sample_mask) 41 | 42 | 43 | def patchify(image, patch_size): 44 | p = patch_size 45 | c, h, w = image.shape 46 | assert h % p == 0 and w % p == 0 47 | image = image.reshape(c, h // p, p, w // p, p) 48 | image = torch.einsum("chpwq->hwpqc", image) 49 | image = image.reshape(-1, p**2 * c) 50 | return image 51 | 52 | 53 | def get_flattened_position_ids_extrapolate(img_h, img_w, patch_size, max_num_patches_per_side): 54 | num_patches_h, num_patches_w = img_h // patch_size, img_w // patch_size 55 | coords_h = torch.arange(0, num_patches_h) 56 | coords_w = torch.arange(0, num_patches_w) 57 | pos_ids = (coords_h[:, None] * max_num_patches_per_side + coords_w).flatten() 58 | return pos_ids 59 | 60 | 61 | def get_flattened_position_ids_interpolate(img_h, img_w, patch_size, max_num_patches_per_side): 62 | num_patches_h, num_patches_w = img_h // patch_size, img_w // patch_size 63 | boundaries = torch.arange(1 / max_num_patches_per_side, 1.0, 1 / max_num_patches_per_side) 64 | fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / num_patches_h) 65 | fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / num_patches_w) 66 | bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True) 67 | bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True) 68 | pos_ids = (bucket_coords_h[:, None] * max_num_patches_per_side + bucket_coords_w).flatten() 69 | return pos_ids 70 | 71 | 72 | def prepare_attention_mask_per_sample(split_lens, attn_modes, device="cpu"): 73 | """ 74 | split_lens: A list of ints. Each int indicates the length of a split within 75 | the sample, where a sample contains multiple splits with different attn modes. 76 | attn_modes: whether to use full attn in each split.
77 | """ 78 | sample_len = sum(split_lens) 79 | attention_mask = torch.zeros((sample_len, sample_len), dtype=torch.bool, device=device) 80 | 81 | csum = 0 82 | for s, attn_mode in zip(split_lens, attn_modes): 83 | assert attn_mode in ['causal', 'full', 'noise'] 84 | if attn_mode == "causal": 85 | attention_mask[csum:csum + s, csum:csum + s] = torch.ones((s, s), device=device).tril() 86 | attention_mask[csum:csum + s, :csum] = 1 87 | else: 88 | attention_mask[csum:csum + s, csum:csum + s] = torch.ones((s, s)) 89 | attention_mask[csum:csum + s, :csum] = 1 90 | csum += s 91 | 92 | csum = 0 93 | for s, attn_mode in zip(split_lens, attn_modes): 94 | if attn_mode == "noise": 95 | attention_mask[:, csum : csum + s] = torch.zeros((sample_len, s)) 96 | attention_mask[csum : csum + s, csum : csum + s] = torch.ones((s, s)) 97 | csum += s 98 | 99 | attention_mask = torch.zeros_like(attention_mask, dtype=torch.float).masked_fill_( 100 | ~attention_mask, float("-inf") 101 | ) 102 | 103 | return attention_mask 104 | 105 | 106 | def split_integer_exp_decay(S, ng_sample_decay=1.0): 107 | if ng_sample_decay == 1.0: 108 | N = random.randint(1, S) 109 | else: 110 | base = (1 - ng_sample_decay) / (1 - math.pow(ng_sample_decay, S)) 111 | p = [base * math.pow(ng_sample_decay, i) for i in range(S)] 112 | N = random.choices(list(range(1, S + 1)), p, k=1)[0] 113 | cumsum = [0] + sorted(random.sample(range(1, S), N - 1)) + [S] 114 | result = [cumsum[i+1] - cumsum[i] for i in range(len(cumsum) - 1)] 115 | return result, cumsum 116 | 117 | 118 | def pil_img2rgb(image): 119 | if image.mode == "RGBA" or image.info.get("transparency", None) is not None: 120 | image = image.convert("RGBA") 121 | white = Image.new(mode="RGB", size=image.size, color=(255, 255, 255)) 122 | white.paste(image, mask=image.split()[3]) 123 | image = white 124 | else: 125 | image = image.convert("RGB") 126 | 127 | return image 128 | 129 | 130 | def add_special_tokens(tokenizer): 131 | all_special_tokens = [] 132 | for k, v in tokenizer.special_tokens_map.items(): 133 | if isinstance(v, str): 134 | all_special_tokens.append(v) 135 | elif isinstance(v, list): 136 | all_special_tokens += v 137 | 138 | new_tokens = [] 139 | 140 | if '<|im_start|>' not in all_special_tokens: 141 | new_tokens.append('<|im_start|>') 142 | 143 | if '<|im_end|>' not in all_special_tokens: 144 | new_tokens.append('<|im_end|>') 145 | 146 | if '<|vision_start|>' not in all_special_tokens: 147 | new_tokens.append('<|vision_start|>') 148 | 149 | if '<|vision_end|>' not in all_special_tokens: 150 | new_tokens.append('<|vision_end|>') 151 | 152 | num_new_tokens = tokenizer.add_tokens(new_tokens) 153 | bos_token_id = tokenizer.convert_tokens_to_ids('<|im_start|>') 154 | eos_token_id = tokenizer.convert_tokens_to_ids('<|im_end|>') 155 | start_of_image = tokenizer.convert_tokens_to_ids('<|vision_start|>') 156 | end_of_image = tokenizer.convert_tokens_to_ids('<|vision_end|>') 157 | 158 | new_token_ids = dict( 159 | bos_token_id=bos_token_id, 160 | eos_token_id=eos_token_id, 161 | start_of_image=start_of_image, 162 | end_of_image=end_of_image, 163 | ) 164 | 165 | return tokenizer, new_token_ids, num_new_tokens 166 | 167 | 168 | def len2weight(x, loss_reduction='square'): 169 | if x == 0: 170 | return x 171 | if loss_reduction == 'token': 172 | return 1 173 | if loss_reduction == 'sample': 174 | return 1 / x 175 | if loss_reduction == 'square': 176 | return 1 / (x ** 0.5) 177 | raise NotImplementedError(loss_reduction) 178 | 
-------------------------------------------------------------------------------- /modeling/siglip/processing_siglip.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Inc. team. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | """ 5 | Image/Text processor class for SigLIP. 6 | """ 7 | 8 | from typing import List, Optional, Union 9 | 10 | from transformers.feature_extraction_utils import BatchFeature 11 | from transformers.image_utils import ImageInput 12 | from transformers.processing_utils import ProcessorMixin 13 | from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy 14 | from transformers.utils import TensorType 15 | 16 | 17 | class SiglipProcessor(ProcessorMixin): 18 | r""" 19 | Constructs a Siglip processor which wraps a Siglip image processor and a Siglip tokenizer into a single processor. 20 | 21 | [`SiglipProcessor`] offers all the functionalities of [`SiglipImageProcessor`] and [`SiglipTokenizer`]. See the 22 | [`~SiglipProcessor.__call__`] and [`~SiglipProcessor.decode`] for more information. 23 | 24 | Args: 25 | image_processor ([`SiglipImageProcessor`]): 26 | The image processor is a required input. 27 | tokenizer ([`SiglipTokenizer`]): 28 | The tokenizer is a required input. 29 | """ 30 | 31 | attributes = ["image_processor", "tokenizer"] 32 | image_processor_class = "SiglipImageProcessor" 33 | tokenizer_class = "SiglipTokenizer" 34 | 35 | def __init__(self, image_processor, tokenizer): 36 | super().__init__(image_processor, tokenizer) 37 | 38 | def __call__( 39 | self, 40 | text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, 41 | images: ImageInput = None, 42 | padding: Union[bool, str, PaddingStrategy] = False, 43 | truncation: Union[bool, str, TruncationStrategy] = None, 44 | max_length: int = None, 45 | return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH, 46 | ) -> BatchFeature: 47 | """ 48 | Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` 49 | and `kwargs` arguments to SiglipTokenizer's [`~SiglipTokenizer.__call__`] if `text` is not `None` to encode 50 | the text. To prepare the image(s), this method forwards the `images` argument to 51 | SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring 52 | of the above two methods for more information. 53 | 54 | Args: 55 | text (`str`, `List[str]`, `List[List[str]]`): 56 | The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings 57 | (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set 58 | `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). 59 | images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): 60 | The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch 61 | tensor. Both channels-first and channels-last formats are supported. 62 | padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): 63 | Select a strategy to pad the returned sequences (according to the model's padding side and padding 64 | index) among: 65 | - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single 66 | sequence if provided). 
67 | - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum 68 | acceptable input length for the model if that argument is not provided. 69 | - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different 70 | lengths). 71 | max_length (`int`, *optional*): 72 | Maximum length of the returned list and optionally padding length (see above). 73 | truncation (`bool`, *optional*): 74 | Activates truncation to cut input sequences longer than `max_length` to `max_length`. 75 | return_tensors (`str` or [`~utils.TensorType`], *optional*): 76 | If set, will return tensors of a particular framework. Acceptable values are: 77 | 78 | - `'tf'`: Return TensorFlow `tf.constant` objects. 79 | - `'pt'`: Return PyTorch `torch.Tensor` objects. 80 | - `'np'`: Return NumPy `np.ndarray` objects. 81 | - `'jax'`: Return JAX `jnp.ndarray` objects. 82 | 83 | Returns: 84 | [`BatchFeature`]: A [`BatchFeature`] with the following fields: 85 | 86 | - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. 87 | - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when 88 | `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not 89 | `None`). 90 | - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. 91 | """ 92 | 93 | if text is None and images is None: 94 | raise ValueError("You have to specify either text or images. Both cannot be none.") 95 | 96 | if text is not None: 97 | encoding = self.tokenizer( 98 | text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length 99 | ) 100 | 101 | if images is not None: 102 | image_features = self.image_processor(images, return_tensors=return_tensors) 103 | 104 | if text is not None and images is not None: 105 | encoding["pixel_values"] = image_features.pixel_values 106 | return encoding 107 | elif text is not None: 108 | return encoding 109 | else: 110 | return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) 111 | 112 | def decode(self, *args, **kwargs): 113 | """ 114 | This method forwards all its arguments to SiglipTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to 115 | the docstring of this method for more information. 116 | """ 117 | return self.tokenizer.decode(*args, **kwargs) 118 | 119 | def batch_decode(self, *args, **kwargs): 120 | """ 121 | This method forwards all its arguments to SiglipTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please 122 | refer to the docstring of this method for more information. 123 | """ 124 | return self.tokenizer.batch_decode(*args, **kwargs) 125 | 126 | @property 127 | # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->Siglip, T5->Siglip 128 | def model_input_names(self): 129 | tokenizer_input_names = self.tokenizer.model_input_names 130 | image_processor_input_names = self.image_processor.model_input_names 131 | return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) 132 | -------------------------------------------------------------------------------- /eval/vlm/eval/mmmu/data_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 
3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 11 | 12 | """Utils for data load, save, and process (e.g., prompt construction)""" 13 | 14 | import json 15 | import os 16 | import re 17 | 18 | import yaml 19 | 20 | DOMAIN_CAT2SUB_CAT = { 21 | 'Art and Design': ['Art', 'Art_Theory', 'Design', 'Music'], 22 | 'Business': ['Accounting', 'Economics', 'Finance', 'Manage', 'Marketing'], 23 | 'Science': ['Biology', 'Chemistry', 'Geography', 'Math', 'Physics', ], 24 | 'Health and Medicine': ['Basic_Medical_Science', 'Clinical_Medicine', 'Diagnostics_and_Laboratory_Medicine', 25 | 'Pharmacy', 'Public_Health'], 26 | 'Humanities and Social Science': ['History', 'Literature', 'Sociology', 'Psychology'], 27 | 'Tech and Engineering': ['Agriculture', 'Architecture_and_Engineering', 'Computer_Science', 'Electronics', 28 | 'Energy_and_Power', 'Materials', 'Mechanical_Engineering'], 29 | } 30 | 31 | CAT_SHORT2LONG = { 32 | 'acc': 'Accounting', 33 | 'agri': 'Agriculture', 34 | 'arch': 'Architecture_and_Engineering', 35 | 'art': 'Art', 36 | 'art_theory': 'Art_Theory', 37 | 'bas_med': 'Basic_Medical_Science', 38 | 'bio': 'Biology', 39 | 'chem': 'Chemistry', 40 | 'cli_med': 'Clinical_Medicine', 41 | 'cs': 'Computer_Science', 42 | 'design': 'Design', 43 | 'diag_med': 'Diagnostics_and_Laboratory_Medicine', 44 | 'econ': 'Economics', 45 | 'elec': 'Electronics', 46 | 'ep': 'Energy_and_Power', 47 | 'fin': 'Finance', 48 | 'geo': 'Geography', 49 | 'his': 'History', 50 | 'liter': 'Literature', 51 | 'manage': 'Manage', 52 | 'mark': 'Marketing', 53 | 'mate': 'Materials', 54 | 'math': 'Math', 55 | 'mech': 'Mechanical_Engineering', 56 | 'music': 'Music', 57 | 'phar': 'Pharmacy', 58 | 'phys': 'Physics', 59 | 'psy': 'Psychology', 60 | 'pub_health': 'Public_Health', 61 | 'socio': 'Sociology' 62 | } 63 | 64 | 65 | # DATA SAVING 66 | def save_json(filename, ds): 67 | with open(filename, 'w') as f: 68 | json.dump(ds, f, indent=4) 69 | 70 | 71 | def get_multi_choice_info(options): 72 | """ 73 | Given the list of options for multiple choice question 74 | Return the index2ans and all_choices 75 | """ 76 | 77 | start_chr = 'A' 78 | all_choices = [] 79 | index2ans = {} 80 | for i, option in enumerate(options): 81 | index2ans[chr(ord(start_chr) + i)] = option 82 | all_choices.append(chr(ord(start_chr) + i)) 83 | 84 | return index2ans, all_choices 85 | 86 | 87 | def load_yaml(file_path): 88 | with open(file_path, 'r') as stream: 89 | try: 90 | yaml_dict = yaml.safe_load(stream) 91 | except yaml.YAMLError as exc: 92 | print(exc) 93 | 94 | return yaml_dict 95 | 96 | 97 | def parse_img_path(text): 98 | matches = re.findall("", text) 99 | return matches 100 | 101 | 102 | def process_single_sample(data): 103 | question = data['question'] 104 | o_imgs_paths = [] 105 | for option in data['options']: 106 | current_o_imgs_paths = parse_img_path(option) 107 | for img_path in current_o_imgs_paths: 108 | o_imgs_paths.append(img_path) 109 | images = [data['image_1'], data['image_2'], data['image_3'], data['image_4'], 110 | data['image_5'], data['image_6'], data['image_7']] 111 | return {'id': data['id'], 'question': question, 'options': data['options'], 'answer': data['answer'], 112 | 'image': images, 'question_type': data['question_type']} 113 | 114 | 
115 | # DATA SAVING 116 | def save_json(filename, ds): 117 | with open(filename, 'w') as f: 118 | json.dump(ds, f, indent=4) 119 | 120 | 121 | def save_jsonl(filename, data): 122 | """ 123 | Save a dictionary of data to a JSON Lines file with the filename as key and caption as value. 124 | 125 | Args: 126 | filename (str): The path to the file where the data should be saved. 127 | data (dict): The dictionary containing the data to save where key is the image path and value is the caption. 128 | """ 129 | with open(filename, 'w', encoding='utf-8') as f: 130 | for img_path, caption in data.items(): 131 | # Extract the base filename without the extension 132 | base_filename = os.path.basename(img_path) 133 | # Create a JSON object with the filename as the key and caption as the value 134 | json_record = json.dumps({base_filename: caption}, ensure_ascii=False) 135 | # Write the JSON object to the file, one per line 136 | f.write(json_record + '\n') 137 | 138 | 139 | def save_args(args, path_dir): 140 | argsDict = args.__dict__ 141 | with open(path_dir + 'setting.txt', 'w') as f: 142 | f.writelines('------------------ start ------------------' + '\n') 143 | for eachArg, value in argsDict.items(): 144 | f.writelines(eachArg + ' : ' + str(value) + '\n') 145 | f.writelines('------------------- end -------------------') 146 | 147 | 148 | # DATA PROCESSING 149 | def construct_prompt(sample, config): 150 | question = sample['question'] 151 | options = eval(sample['options']) 152 | example = '' 153 | if sample['question_type'] == 'multiple-choice': 154 | start_chr = 'A' 155 | prediction_range = [] 156 | index2ans = {} 157 | for option in options: 158 | prediction_range.append(start_chr) 159 | example += f'({start_chr}) {option}\n' 160 | index2ans[start_chr] = option 161 | start_chr = chr(ord(start_chr) + 1) 162 | empty_prompt_sample_structure = config['multi_choice_example_format'] 163 | empty_prompt = empty_prompt_sample_structure.format(question, example) 164 | res_dict = {} 165 | res_dict['index2ans'] = index2ans 166 | res_dict['correct_choice'] = sample['answer'] 167 | res_dict['all_choices'] = prediction_range 168 | res_dict['empty_prompt'] = empty_prompt 169 | if config['task_instructions']: 170 | res_dict['final_input_prompt'] = config['task_instructions'].strip() + '\n\n' + empty_prompt 171 | else: 172 | res_dict['final_input_prompt'] = empty_prompt 173 | 174 | res_dict['gt_content'] = options[ord(sample['answer'].upper()) - ord('A')] 175 | else: 176 | empty_prompt_sample_structure = config['short_ans_example_format'] 177 | empty_prompt = empty_prompt_sample_structure.format(question) 178 | res_dict = {} 179 | res_dict['empty_prompt'] = empty_prompt 180 | if config['task_instructions']: 181 | res_dict['final_input_prompt'] = config['task_instructions'].strip() + '\n\n' + empty_prompt 182 | else: 183 | res_dict['final_input_prompt'] = empty_prompt 184 | res_dict['gt_content'] = sample['answer'] 185 | 186 | res_dict.update(sample) 187 | return res_dict 188 | -------------------------------------------------------------------------------- /eval/gen/geneval/prompts/create_prompts.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Dhruba Ghosh 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 
6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/djghosh13/geneval/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 11 | 12 | """ 13 | Generate prompts for evaluation 14 | """ 15 | 16 | import argparse 17 | import json 18 | import os 19 | import yaml 20 | 21 | import numpy as np 22 | 23 | # Load classnames 24 | 25 | with open("object_names.txt") as cls_file: 26 | classnames = [line.strip() for line in cls_file] 27 | 28 | # Proper a vs an 29 | 30 | def with_article(name: str): 31 | if name[0] in "aeiou": 32 | return f"an {name}" 33 | return f"a {name}" 34 | 35 | # Proper plural 36 | 37 | def make_plural(name: str): 38 | if name[-1] in "s": 39 | return f"{name}es" 40 | return f"{name}s" 41 | 42 | # Generates single object samples 43 | 44 | def generate_single_object_sample(rng: np.random.Generator, size: int = None): 45 | TAG = "single_object" 46 | if size > len(classnames): 47 | size = len(classnames) 48 | print(f"Not enough distinct classes, generating only {size} samples") 49 | return_scalar = size is None 50 | size = size or 1 51 | idxs = rng.choice(len(classnames), size=size, replace=False) 52 | samples = [dict( 53 | tag=TAG, 54 | include=[ 55 | {"class": classnames[idx], "count": 1} 56 | ], 57 | prompt=f"a photo of {with_article(classnames[idx])}" 58 | ) for idx in idxs] 59 | if return_scalar: 60 | return samples[0] 61 | return samples 62 | 63 | # Generate two object samples 64 | 65 | def generate_two_object_sample(rng: np.random.Generator): 66 | TAG = "two_object" 67 | idx_a, idx_b = rng.choice(len(classnames), size=2, replace=False) 68 | return dict( 69 | tag=TAG, 70 | include=[ 71 | {"class": classnames[idx_a], "count": 1}, 72 | {"class": classnames[idx_b], "count": 1} 73 | ], 74 | prompt=f"a photo of {with_article(classnames[idx_a])} and {with_article(classnames[idx_b])}" 75 | ) 76 | 77 | # Generate counting samples 78 | 79 | numbers = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"] 80 | 81 | def generate_counting_sample(rng: np.random.Generator, max_count=4): 82 | TAG = "counting" 83 | idx = rng.choice(len(classnames)) 84 | num = int(rng.integers(2, max_count, endpoint=True)) 85 | return dict( 86 | tag=TAG, 87 | include=[ 88 | {"class": classnames[idx], "count": num} 89 | ], 90 | exclude=[ 91 | {"class": classnames[idx], "count": num + 1} 92 | ], 93 | prompt=f"a photo of {numbers[num]} {make_plural(classnames[idx])}" 94 | ) 95 | 96 | # Generate color samples 97 | 98 | colors = ["red", "orange", "yellow", "green", "blue", "purple", "pink", "brown", "black", "white"] 99 | 100 | def generate_color_sample(rng: np.random.Generator): 101 | TAG = "colors" 102 | idx = rng.choice(len(classnames) - 1) + 1 103 | idx = (idx + classnames.index("person")) % len(classnames) # No "[COLOR] person" prompts 104 | color = colors[rng.choice(len(colors))] 105 | return dict( 106 | tag=TAG, 107 | include=[ 108 | {"class": classnames[idx], "count": 1, "color": color} 109 | ], 110 | prompt=f"a photo of {with_article(color)} {classnames[idx]}" 111 | ) 112 | 113 | # Generate position samples 114 | 115 | positions = ["left of", "right of", "above", "below"] 116 | 117 | def generate_position_sample(rng: np.random.Generator): 118 | TAG = "position" 119 | idx_a, idx_b = rng.choice(len(classnames), size=2, replace=False) 120 | position = positions[rng.choice(len(positions))] 121 | return dict( 122 | tag=TAG, 123 | include=[ 124 | {"class": classnames[idx_b], "count": 1}, 
125 | {"class": classnames[idx_a], "count": 1, "position": (position, 0)} 126 | ], 127 | prompt=f"a photo of {with_article(classnames[idx_a])} {position} {with_article(classnames[idx_b])}" 128 | ) 129 | 130 | # Generate color attribution samples 131 | 132 | def generate_color_attribution_sample(rng: np.random.Generator): 133 | TAG = "color_attr" 134 | idxs = rng.choice(len(classnames) - 1, size=2, replace=False) + 1 135 | idx_a, idx_b = (idxs + classnames.index("person")) % len(classnames) # No "[COLOR] person" prompts 136 | cidx_a, cidx_b = rng.choice(len(colors), size=2, replace=False) 137 | return dict( 138 | tag=TAG, 139 | include=[ 140 | {"class": classnames[idx_a], "count": 1, "color": colors[cidx_a]}, 141 | {"class": classnames[idx_b], "count": 1, "color": colors[cidx_b]} 142 | ], 143 | prompt=f"a photo of {with_article(colors[cidx_a])} {classnames[idx_a]} and {with_article(colors[cidx_b])} {classnames[idx_b]}" 144 | ) 145 | 146 | 147 | # Generate evaluation suite 148 | 149 | def generate_suite(rng: np.random.Generator, n: int = 100, output_path: str = ""): 150 | samples = [] 151 | # Generate single object samples for all COCO classnames 152 | samples.extend(generate_single_object_sample(rng, size=len(classnames))) 153 | # Generate two object samples (~100) 154 | for _ in range(n): 155 | samples.append(generate_two_object_sample(rng)) 156 | # Generate counting samples 157 | for _ in range(n): 158 | samples.append(generate_counting_sample(rng, max_count=4)) 159 | # Generate color samples 160 | for _ in range(n): 161 | samples.append(generate_color_sample(rng)) 162 | # Generate position samples 163 | for _ in range(n): 164 | samples.append(generate_position_sample(rng)) 165 | # Generate color attribution samples 166 | for _ in range(n): 167 | samples.append(generate_color_attribution_sample(rng)) 168 | # De-duplicate 169 | unique_samples, used_samples = [], set() 170 | for sample in samples: 171 | sample_text = yaml.safe_dump(sample) 172 | if sample_text not in used_samples: 173 | unique_samples.append(sample) 174 | used_samples.add(sample_text) 175 | 176 | # Write to files 177 | os.makedirs(output_path, exist_ok=True) 178 | with open(os.path.join(output_path, "generation_prompts.txt"), "w") as fp: 179 | for sample in unique_samples: 180 | print(sample['prompt'], file=fp) 181 | with open(os.path.join(output_path, "evaluation_metadata.jsonl"), "w") as fp: 182 | for sample in unique_samples: 183 | print(json.dumps(sample), file=fp) 184 | 185 | 186 | if __name__ == "__main__": 187 | parser = argparse.ArgumentParser() 188 | parser.add_argument("--seed", type=int, default=43, help="generation seed (default: 43)") 189 | parser.add_argument("--num-prompts", "-n", type=int, default=100, help="number of prompts per task (default: 100)") 190 | parser.add_argument("--output-path", "-o", type=str, default="prompts", help="output folder for prompts and metadata (default: 'prompts/')") 191 | args = parser.parse_args() 192 | rng = np.random.default_rng(args.seed) 193 | generate_suite(rng, args.num_prompts, args.output_path) 194 | 195 | -------------------------------------------------------------------------------- /eval/vlm/eval/mathvista/utilities.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 
6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 11 | 12 | import json 13 | import os 14 | import pickle 15 | import re 16 | import time 17 | 18 | import cv2 19 | import openai 20 | from word2number import w2n 21 | 22 | openai_client = None 23 | 24 | 25 | def create_dir(output_dir): 26 | if not os.path.exists(output_dir): 27 | os.makedirs(output_dir) 28 | 29 | 30 | def read_csv(file): 31 | data = [] 32 | with open(file, 'r') as f: 33 | for line in f: 34 | data.append(line.strip()) 35 | return data 36 | 37 | 38 | def read_pandas_csv(csv_path): 39 | # read a pandas csv sheet 40 | import pandas as pd 41 | df = pd.read_csv(csv_path) 42 | return df 43 | 44 | 45 | def read_json(path): 46 | with open(path, 'r', encoding='utf-8') as f: 47 | return json.load(f) 48 | 49 | 50 | def read_jsonl(file): 51 | with open(file, 'r') as f: 52 | data = [json.loads(line) for line in f] 53 | return data 54 | 55 | 56 | def read_pickle(path): 57 | with open(path, 'rb') as f: 58 | return pickle.load(f) 59 | 60 | 61 | def save_json(data, path): 62 | with open(path, 'w') as f: 63 | json.dump(data, f, indent=4) 64 | 65 | 66 | def save_array_img(path, image): 67 | cv2.imwrite(path, image) 68 | 69 | 70 | def contains_digit(text): 71 | # check if text contains a digit 72 | if any(char.isdigit() for char in text): 73 | return True 74 | return False 75 | 76 | 77 | def contains_number_word(text): 78 | # check if text contains a number word 79 | ignore_words = ['a', 'an', 'point'] 80 | words = re.findall(r'\b\w+\b', text) # This regex pattern matches any word in the text 81 | for word in words: 82 | if word in ignore_words: 83 | continue 84 | try: 85 | w2n.word_to_num(word) 86 | return True # If the word can be converted to a number, return True 87 | except ValueError: 88 | continue # If the word can't be converted to a number, continue with the next word 89 | 90 | # check if text contains a digit 91 | if any(char.isdigit() for char in text): 92 | return True 93 | 94 | return False # If none of the words could be converted to a number, return False 95 | 96 | 97 | def contains_quantity_word(text, special_keep_words=[]): 98 | # check if text contains a quantity word 99 | quantity_words = ['most', 'least', 'fewest', 100 | 'more', 'less', 'fewer', 101 | 'largest', 'smallest', 'greatest', 102 | 'larger', 'smaller', 'greater', 103 | 'highest', 'lowest', 'higher', 'lower', 104 | 'increase', 'decrease', 105 | 'minimum', 'maximum', 'max', 'min', 106 | 'mean', 'average', 'median', 107 | 'total', 'sum', 'add', 'subtract', 108 | 'difference', 'quotient', 'gap', 109 | 'half', 'double', 'twice', 'triple', 110 | 'square', 'cube', 'root', 111 | 'approximate', 'approximation', 112 | 'triangle', 'rectangle', 'circle', 'square', 'cube', 'sphere', 'cylinder', 'cone', 'pyramid', 113 | 'multiply', 'divide', 114 | 'percentage', 'percent', 'ratio', 'proportion', 'fraction', 'rate', 115 | ] 116 | 117 | quantity_words += special_keep_words # dataset specific words 118 | 119 | words = re.findall(r'\b\w+\b', text) # This regex pattern matches any word in the text 120 | if any(word in quantity_words for word in words): 121 | return True 122 | 123 | return False # If none of the words could be converted to a number, return False 124 | 125 | 126 | def is_bool_word(text): 127 | if text in ['Yes', 'No', 'True', 'False', 128 | 'yes', 'no', 'true', 'false', 129 | 'YES', 'NO', 'TRUE',
'FALSE']: 130 | return True 131 | return False 132 | 133 | 134 | def is_digit_string(text): 135 | # remove ".0000" 136 | text = text.strip() 137 | text = re.sub(r'\.0+$', '', text) 138 | try: 139 | int(text) 140 | return True 141 | except ValueError: 142 | return False 143 | 144 | 145 | def is_float_string(text): 146 | # text is a float string if it contains a "." and can be converted to a float 147 | if '.' in text: 148 | try: 149 | float(text) 150 | return True 151 | except ValueError: 152 | return False 153 | return False 154 | 155 | 156 | def copy_image(image_path, output_image_path): 157 | from shutil import copyfile 158 | copyfile(image_path, output_image_path) 159 | 160 | 161 | def copy_dir(src_dir, dst_dir): 162 | from shutil import copytree 163 | 164 | # copy the source directory to the target directory 165 | copytree(src_dir, dst_dir) 166 | 167 | 168 | import PIL.Image as Image 169 | 170 | 171 | def get_image_size(img_path): 172 | img = Image.open(img_path) 173 | width, height = img.size 174 | return width, height 175 | 176 | 177 | def get_chat_response( 178 | promot="", api_key="", 179 | base_url="your_api_url", 180 | api_version="2024-03-01-preview", model="gpt-4-0613", 181 | temperature=0, max_tokens=256, n=1, patience=10000000, sleep_time=0 182 | ): 183 | openai_client = openai.AzureOpenAI( 184 | azure_endpoint=base_url, 185 | api_version=api_version, 186 | api_key=api_key, 187 | ) 188 | 189 | messages = [ 190 | {'role': 'user', 'content': promot}, 191 | ] 192 | while patience > 0: 193 | patience -= 1 194 | try: 195 | response = openai_client.chat.completions.create( 196 | model=model, 197 | messages=messages, 198 | # api_key=api_key, 199 | temperature=temperature, 200 | max_tokens=max_tokens, 201 | n=n, 202 | ) 203 | response = response.to_dict() 204 | if n == 1: 205 | prediction = response['choices'][0]['message']['content'].strip() 206 | if prediction != '' and prediction is not None: 207 | return prediction 208 | else: 209 | prediction = [choice['message']['content'].strip() for choice in response['choices']] 210 | if prediction[0] != '' and prediction[0] is not None: 211 | return prediction 212 | 213 | except Exception as e: 214 | if 'Rate limit' not in str(e): 215 | print(e) 216 | 217 | if 'Please reduce the length of the messages' in str(e): 218 | print('!!Reduce promot size') 219 | # reduce input prompt and keep the tail 220 | new_size = int(len(promot) * 0.9) 221 | new_start = len(promot) - new_size 222 | promot = promot[new_start:] 223 | messages = [ 224 | {'role': 'user', 'content': promot}, 225 | ] 226 | 227 | if sleep_time > 0: 228 | time.sleep(sleep_time) 229 | return '' 230 | -------------------------------------------------------------------------------- /eval/vlm/eval/mathvista/evaluate_mathvista.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 
11 | 12 | import argparse 13 | import itertools 14 | import json 15 | import os 16 | import random 17 | 18 | import torch 19 | from datasets import load_dataset 20 | from eval.vlm.utils import load_model_and_tokenizer, build_transform, process_conversation 21 | from tqdm import tqdm 22 | 23 | ds_collections = { 24 | 'MathVista_testmini': { 25 | 'root': 'AI4Math/MathVista', 26 | 'max_new_tokens': 4096, 27 | 'min_new_tokens': 1, 28 | 'split': 'testmini' 29 | }, 30 | 'MathVista_test': { 31 | 'root': 'AI4Math/MathVista', 32 | 'max_new_tokens': 4096, 33 | 'min_new_tokens': 1, 34 | 'split': 'test' 35 | }, 36 | } 37 | 38 | 39 | COT_INSTRUCTION = ( 40 | 'Your task is to answer the question below. ' 41 | "Give step by step reasoning before you answer, and when you're ready to answer, " 42 | "please use the format \"Final answer: ..\"" 43 | '\n\n' 44 | 'Question:' 45 | '\n\n' 46 | '{question}' 47 | ) 48 | 49 | 50 | def collate_fn(batches): 51 | images = [_['images'] for _ in batches] 52 | data_items = [_['data_item'] for _ in batches] 53 | return images, data_items 54 | 55 | 56 | class MathVistaDataset(torch.utils.data.Dataset): 57 | 58 | def __init__(self, root, split): 59 | dataset = load_dataset(root, cache_dir=os.path.join(os.getcwd(), 'eval/vlm/data/MathVista/')) 60 | self.data = dataset[split] 61 | 62 | def __len__(self): 63 | return len(self.data) 64 | 65 | def __getitem__(self, idx): 66 | data_item = self.data[idx] 67 | image = data_item['decoded_image'] 68 | del data_item['decoded_image'] 69 | 70 | images = [image.convert('RGB') if image.mode != 'RGB' else image] 71 | 72 | return { 73 | 'images': images, 74 | 'data_item': data_item, 75 | } 76 | 77 | 78 | class InferenceSampler(torch.utils.data.sampler.Sampler): 79 | 80 | def __init__(self, size): 81 | self._size = int(size) 82 | assert size > 0 83 | self._rank = torch.distributed.get_rank() 84 | self._world_size = torch.distributed.get_world_size() 85 | self._local_indices = self._get_local_indices(size, self._world_size, self._rank) 86 | 87 | @staticmethod 88 | def _get_local_indices(total_size, world_size, rank): 89 | shard_size = total_size // world_size 90 | left = total_size % world_size 91 | shard_sizes = [shard_size + int(r < left) for r in range(world_size)] 92 | 93 | begin = sum(shard_sizes[:rank]) 94 | end = min(sum(shard_sizes[:rank + 1]), total_size) 95 | return range(begin, end) 96 | 97 | def __iter__(self): 98 | yield from self._local_indices 99 | 100 | def __len__(self): 101 | return len(self._local_indices) 102 | 103 | 104 | def evaluate_chat_model(): 105 | random.seed(args.seed) 106 | 107 | for ds_name in args.datasets: 108 | dataset = MathVistaDataset( 109 | root=ds_collections[ds_name]['root'], 110 | split=ds_collections[ds_name]['split'], 111 | ) 112 | dataloader = torch.utils.data.DataLoader( 113 | dataset=dataset, 114 | sampler=InferenceSampler(len(dataset)), 115 | batch_size=args.batch_size, 116 | num_workers=args.num_workers, 117 | pin_memory=True, 118 | drop_last=False, 119 | collate_fn=collate_fn, 120 | ) 121 | 122 | outputs = [] 123 | for _, (images, data_items) in tqdm(enumerate(dataloader)): 124 | if args.cot: 125 | question = COT_INSTRUCTION.format(question=data_items[0]['query']) 126 | else: 127 | question = data_items[0]['query'] 128 | 129 | images = images[0] 130 | images, conversation = process_conversation(images, question) 131 | 132 | pred = model.chat( 133 | tokenizer, 134 | new_token_ids, 135 | image_transform, 136 | images=images, 137 | prompt=conversation, 138 | 
max_length=ds_collections[ds_name]['max_new_tokens'] if not args.cot else 4096, # TODO: how to use ds_collections[ds_name]['min_new_tokens'] 139 | ) 140 | 141 | data_item = data_items[0] 142 | data_item['response'] = pred 143 | outputs.append(data_item) 144 | 145 | torch.distributed.barrier() 146 | 147 | world_size = torch.distributed.get_world_size() 148 | merged_outputs = [None for _ in range(world_size)] 149 | torch.distributed.all_gather_object(merged_outputs, json.dumps(outputs)) 150 | 151 | merged_outputs = [json.loads(_) for _ in merged_outputs] 152 | merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)] 153 | 154 | if torch.distributed.get_rank() == 0: 155 | temp = {} 156 | for data_item in merged_outputs: 157 | pid = data_item['pid'] 158 | temp[pid] = data_item 159 | 160 | print(f'Evaluating {ds_name} ...') 161 | results_file = 'results.json' 162 | output_path = os.path.join(args.out_dir, 'results.json') 163 | json.dump(temp, open(output_path, 'w'), indent=4) 164 | print('Results saved to {}'.format(output_path)) 165 | 166 | if args.cot: 167 | cmd = f'python eval/vlm/eval/mathvista/extract_answer_mp.py --output_file {results_file} --output_dir {args.out_dir}' 168 | else: 169 | cmd = f'python eval/vlm/eval/mathvista/extract_answer_mp.py --output_file {results_file} --output_dir {args.out_dir}' 170 | print(cmd) 171 | os.system(cmd) 172 | 173 | cmd = f'python eval/vlm/eval/mathvista/calculate_score.py --output_file {results_file} --output_dir {args.out_dir} --score_file score.json' 174 | print(cmd) 175 | os.system(cmd) 176 | 177 | 178 | if __name__ == '__main__': 179 | parser = argparse.ArgumentParser() 180 | parser.add_argument('--datasets', type=str, default='MathVista_testmini') 181 | parser.add_argument('--batch-size', type=int, default=1) 182 | parser.add_argument('--num-workers', type=int, default=1) 183 | parser.add_argument('--out-dir', type=str, default='results') 184 | parser.add_argument('--seed', type=int, default=0) 185 | parser.add_argument('--cot', action='store_true') 186 | parser.add_argument('--model-path', type=str, default='hf/BAGEL-7B-MoT/') 187 | args = parser.parse_args() 188 | 189 | if not os.path.exists(args.out_dir): 190 | os.makedirs(args.out_dir, exist_ok=True) 191 | 192 | args.datasets = args.datasets.split(',') 193 | print('datasets:', args.datasets) 194 | assert args.batch_size == 1, 'Only batch size 1 is supported' 195 | 196 | torch.distributed.init_process_group( 197 | backend='nccl', 198 | world_size=int(os.getenv('WORLD_SIZE', '1')), 199 | rank=int(os.getenv('RANK', '0')), 200 | ) 201 | 202 | torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) 203 | 204 | model, tokenizer, new_token_ids = load_model_and_tokenizer(args) 205 | image_transform = build_transform() 206 | 207 | total_params = sum(p.numel() for p in model.parameters()) / 1e9 208 | print(f'[test] total_params: {total_params}B') 209 | 210 | evaluate_chat_model() 211 | --------------------------------------------------------------------------------
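A small editor-added sketch (not part of the repository) of how the InferenceSampler in evaluate_mathvista.py shards a dataset across ranks; the dataset size and world size below are assumed, and torch.distributed is bypassed by calling the static helper directly:

# Illustrative only: shard 10 items across 3 ranks; any remainder goes to the lowest ranks.
for rank in range(3):
    shard = list(InferenceSampler._get_local_indices(total_size=10, world_size=3, rank=rank))
    print(rank, shard)
# 0 [0, 1, 2, 3]
# 1 [4, 5, 6]
# 2 [7, 8, 9]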