├── eval ├── gen │ ├── gedit │ │ └── viescore │ │ │ ├── mllm_tools │ │ │ ├── __init__.py │ │ │ ├── utils.py │ │ │ └── qwen25vl_eval.py │ │ │ ├── parse_prompt.py │ │ │ └── __init__.py │ ├── geneval │ │ ├── evaluation │ │ │ ├── download_models.sh │ │ │ ├── object_names.txt │ │ │ └── summary_scores.py │ │ └── prompts │ │ │ ├── object_names.txt │ │ │ └── create_prompts.py │ ├── imgedit │ │ ├── step1_get_avgscore.py │ │ ├── step2_typescore.py │ │ └── basic_bench.py │ └── wise │ │ └── cal_score.py ├── __init__.py └── vlm │ ├── __init__.py │ ├── eval │ ├── mathvista │ │ ├── prompts │ │ │ └── ext_ans.py │ │ ├── extract_answer.py │ │ ├── extract_answer_mp.py │ │ ├── utilities.py │ │ └── evaluate_mathvista.py │ ├── mme │ │ ├── eval.py │ │ ├── Your_Results │ │ │ ├── OCR.txt │ │ │ ├── numerical_calculation.txt │ │ │ ├── code_reasoning.txt │ │ │ ├── existence.txt │ │ │ ├── color.txt │ │ │ ├── count.txt │ │ │ ├── position.txt │ │ │ └── text_translation.txt │ │ └── calculation.py │ ├── mmvet │ │ └── evaluate_mmvet.py │ ├── pope │ │ └── eval_pope.py │ └── mmmu │ │ ├── main_eval_only.py │ │ └── data_utils.py │ ├── utils.py │ └── evaluate.sh ├── assets ├── arch.png ├── teaser.webp └── emerging_curves.png ├── test_images ├── meme.jpg ├── women.jpg └── octupusy.jpg ├── data ├── __init__.py ├── interleave_datasets │ ├── __init__.py │ └── edit_dataset.py ├── configs │ └── example.yaml ├── dataset_info.py ├── distributed_iterable_dataset.py ├── parquet_utils.py ├── t2i_dataset.py ├── video_utils.py └── data_utils.py ├── train ├── __init__.py └── train_utils.py ├── .gitignore ├── modeling ├── __init__.py ├── bagel │ ├── __init__.py │ └── modeling_utils.py ├── qwen2 │ ├── __init__.py │ └── tokenization_qwen2_fast.py └── siglip │ ├── __init__.py │ └── processing_siglip.py ├── scripts ├── eval │ ├── run_eval_vlm.sh │ ├── eval_vlm.sh │ ├── run_rise.sh │ ├── run_geneval.sh │ ├── run_kris.sh │ ├── run_wise.sh │ ├── run_imgedit.sh │ └── run_gedit.sh └── train.sh └── requirements.txt /eval/gen/gedit/viescore/mllm_tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Liuziyu77/Bagel_L/main/assets/arch.png -------------------------------------------------------------------------------- /assets/teaser.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Liuziyu77/Bagel_L/main/assets/teaser.webp -------------------------------------------------------------------------------- /test_images/meme.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Liuziyu77/Bagel_L/main/test_images/meme.jpg -------------------------------------------------------------------------------- /test_images/women.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Liuziyu77/Bagel_L/main/test_images/women.jpg -------------------------------------------------------------------------------- /test_images/octupusy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Liuziyu77/Bagel_L/main/test_images/octupusy.jpg -------------------------------------------------------------------------------- /assets/emerging_curves.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Liuziyu77/Bagel_L/main/assets/emerging_curves.png -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 -------------------------------------------------------------------------------- /eval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 -------------------------------------------------------------------------------- /train/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 -------------------------------------------------------------------------------- /eval/vlm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | wandb 2 | __pycache__ 3 | .vscode 4 | notebooks 5 | results 6 | *.ipynb_checkpoints 7 | eval_results 8 | tests 9 | .DS_Store 10 | gradio.sh -------------------------------------------------------------------------------- /modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from . import bagel, qwen2, siglip, autoencoder -------------------------------------------------------------------------------- /data/interleave_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from .edit_dataset import UnifiedEditIterableDataset 5 | 6 | -------------------------------------------------------------------------------- /modeling/bagel/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | 5 | from .bagel import BagelConfig, Bagel 6 | from .qwen2_navit import Qwen2Config, Qwen2Model, Qwen2ForCausalLM 7 | from .siglip_navit import SiglipVisionConfig, SiglipVisionModel 8 | 9 | 10 | __all__ = [ 11 | 'BagelConfig', 12 | 'Bagel', 13 | 'Qwen2Config', 14 | 'Qwen2Model', 15 | 'Qwen2ForCausalLM', 16 | 'SiglipVisionConfig', 17 | 'SiglipVisionModel', 18 | ] 19 | -------------------------------------------------------------------------------- /scripts/eval/run_eval_vlm.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | set -x 5 | 6 | # Set proxy and API key 7 | export OPENAI_API_KEY=$openai_api_key 8 | 9 | export GPUS=1 10 | 11 | DATASETS=("mme" "mmbench-dev-en" "mmvet" "mmmu-val" "mathvista-testmini" "mmvp") 12 | # DATASETS=("mmmu-val_cot") 13 | 14 | DATASETS_STR="${DATASETS[*]}" 15 | export DATASETS_STR 16 | 17 | bash scripts/eval/eval_vlm.sh \ 18 | $output_path \ 19 | --model-path $model_path -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | decord==0.6.0 2 | einops==0.8.1 3 | huggingface_hub==0.29.1 4 | matplotlib==3.7.0 5 | numpy==1.24.4 6 | opencv_python==4.7.0.72 7 | pyarrow==11.0.0 8 | PyYAML==6.0.2 9 | Requests==2.32.3 10 | safetensors==0.4.5 11 | scipy==1.10.1 12 | sentencepiece==0.1.99 13 | torch==2.5.1 14 | torchvision==0.20.1 15 | transformers==4.49.0 16 | #flash_attn==2.5.8 17 | accelerate>=0.34.0 18 | wandb 19 | gradio 20 | setuptools 21 | wheel 22 | ninja 23 | bitsandbytes 24 | xlsxwriter 25 | triton ; sys_platform != 'win32' 26 | triton-windows ; sys_platform == 'win32' -------------------------------------------------------------------------------- /scripts/eval/eval_vlm.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # Check if enough arguments are provided 5 | if [ $# -lt 2 ]; then 6 | echo "Error: PREFIX_DIR and MODEL_PATH are required as the first and second arguments respectively." 7 | exit 1 8 | fi 9 | 10 | LOG_PATH=$1 11 | if [ ! -d "$LOG_PATH" ]; then 12 | mkdir -p "$LOG_PATH" 13 | fi 14 | shift 1 15 | ARGS=("$@") 16 | export MASTER_PORT=10042 17 | 18 | FULL_MODEL_PATH="$PREFIX_DIR/$MODEL_PATH" 19 | 20 | IFS=' ' read -r -a DATASETS <<< "$DATASETS_STR" 21 | 22 | for DATASET in "${DATASETS[@]}"; do 23 | bash eval/vlm/evaluate.sh \ 24 | "$DATASET" \ 25 | --out-dir "$LOG_PATH/$DATASET" \ 26 | "${ARGS[@]}" 27 | done -------------------------------------------------------------------------------- /scripts/train.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # replace the variables with your own 5 | torchrun \ 6 | --nnodes=$num_nodes \ 7 | --node_rank=$node_rank \ 8 | --nproc_per_node=8 \ 9 | --master_addr=$master_addr \ 10 | --master_port=$master_port \ 11 | train/pretrain_unified_navit.py \ 12 | --dataset_config_file ./data/configs/example.yaml \ 13 | --layer_module Qwen2MoTDecoderLayer \ 14 | --vae_path $vae_path \ 15 | --vit_path $vit_path \ 16 | --llm_path $llm_path \ 17 | --use_flex True \ 18 | --resume_from $resume_from \ 19 | --results_dir $output_path \ 20 | --checkpoint_dir $ckpt_path \ 21 | --max_latent_size 64 \ 22 | --num_workers 1 # use small num_workers since the num_used_data (10) are not enough to split -------------------------------------------------------------------------------- /scripts/eval/run_rise.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | set -x 5 | 6 | export OPENAI_API_KEY=$openai_api_key 7 | 8 | GPUS=8 9 | 10 | 11 | # generate images 12 | torchrun \ 13 | --nnodes=1 \ 14 | --node_rank=0 \ 15 | --nproc_per_node=$GPUS \ 16 | --master_addr=127.0.0.1 \ 17 | --master_port=12345 \ 18 | ./eval/gen/gen_images_mp_rise.py \ 19 | --output_dir $output_path/bagel \ 20 | --metadata_file ./eval/gen/rise/data/datav2_total_w_subtask.json \ 21 | --max_latent_size 64 \ 22 | --model-path $model_path \ 23 | --think 24 | 25 | 26 | # calculate score 27 | python ./eval/gen/rise/gpt_eval.py \ 28 | --data ./eval/gen/rise/data/datav2_total_w_subtask.json \ 29 | --input ./eval/gen/rise/data \ 30 | --output $output_path/bagel -------------------------------------------------------------------------------- /eval/gen/geneval/evaluation/download_models.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Dhruba Ghosh 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/djghosh13/geneval/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 11 | 12 | #!/bin/bash 13 | 14 | # Download Mask2Former object detection config and weights 15 | 16 | if [ ! -z "$1" ] 17 | then 18 | mkdir -p "$1" 19 | wget https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco_20220504_001756-743b7d99.pth -O "$1/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.pth" 20 | fi 21 | -------------------------------------------------------------------------------- /eval/gen/geneval/evaluation/object_names.txt: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorcycle 5 | airplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | couch 59 | potted plant 60 | bed 61 | dining table 62 | toilet 63 | tv 64 | laptop 65 | computer mouse 66 | tv remote 67 | computer keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /eval/gen/geneval/prompts/object_names.txt: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorcycle 5 | airplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | 
umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | couch 59 | potted plant 60 | bed 61 | dining table 62 | toilet 63 | tv 64 | laptop 65 | computer mouse 66 | tv remote 67 | computer keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /data/configs/example.yaml: -------------------------------------------------------------------------------- 1 | t2i_pretrain: 2 | dataset_names: 3 | - t2i 4 | image_transform_args: 5 | image_stride: 16 6 | max_image_size: 1024 7 | min_image_size: 512 8 | is_mandatory: true 9 | num_used_data: # The sum should be larger that NUM_GPUS x NUM_WORKERS 10 | - 10 11 | weight: 1 12 | 13 | unified_edit: 14 | dataset_names: 15 | - seedxedit_multi 16 | image_transform_args: 17 | image_stride: 16 18 | max_image_size: 1024 19 | min_image_size: 512 20 | vit_image_transform_args: 21 | image_stride: 14 22 | max_image_size: 518 23 | min_image_size: 224 24 | is_mandatory: false 25 | num_used_data: 26 | - 10 27 | weight: 1 28 | 29 | vlm_sft: 30 | dataset_names: 31 | - llava_ov 32 | image_transform_args: 33 | image_stride: 14 34 | max_image_size: 980 35 | min_image_size: 378 36 | max_pixels: 2_007_040 37 | frame_sampler_args: 38 | max_num_frames: 12 39 | min_num_frames: 8 40 | is_mandatory: true 41 | shuffle_lines: True 42 | shuffle_seed: 0 43 | num_used_data: 44 | - 1000 45 | weight: 1 46 | -------------------------------------------------------------------------------- /eval/gen/gedit/viescore/parse_prompt.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def create_python_file_with_texts(folder_path, output_file): 4 | with open(output_file, 'w', encoding='utf-8') as out_file: 5 | out_file.write("# This file is generated automatically through parse_prompt.py\n\n") 6 | for root, dirs, files in os.walk(folder_path): 7 | for file in files: 8 | if file.endswith(".txt"): 9 | file_path = os.path.join(root, file) 10 | var_name = "_" + file_path.replace(folder_path, "").replace(os.sep, "_").replace(".txt", "").strip("_") 11 | with open(file_path, 'r', encoding='utf-8') as f: 12 | content = f.read().replace('"""', '\"\"\"') 13 | out_file.write(f'{var_name} = """{content}"""\n\n') 14 | 15 | # Example usage 16 | current_file_path = os.path.abspath(__file__) 17 | current_folder_path = os.path.dirname(current_file_path) 18 | folder_path = os.path.join(current_folder_path, "prompts_raw") 19 | output_file = os.path.join(current_folder_path, "vie_prompts.py") 20 | create_python_file_with_texts(folder_path, output_file) 21 | -------------------------------------------------------------------------------- /scripts/eval/run_geneval.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | set -x 5 | 6 | GPUS=8 7 | 8 | 9 | # generate images 10 | torchrun \ 11 | --nnodes=1 \ 12 | --node_rank=0 \ 13 | --nproc_per_node=$GPUS \ 14 | --master_addr=127.0.0.1 \ 15 | --master_port=12345 \ 16 | ./eval/gen/gen_images_mp.py \ 17 | --output_dir $output_path/images \ 18 | --metadata_file ./eval/gen/geneval/prompts/evaluation_metadata_long.jsonl \ 19 | --batch_size 1 \ 20 | --num_images 4 \ 21 | --resolution 1024 \ 22 | --max_latent_size 64 \ 23 | --model-path $model_path \ 24 | # --metadata_file ./eval/gen/geneval/prompts/evaluation_metadata.jsonl \ 25 | 26 | 27 | # calculate score 28 | torchrun \ 29 | --nnodes=1 \ 30 | --node_rank=0 \ 31 | --nproc_per_node=$GPUS \ 32 | --master_addr=127.0.0.1 \ 33 | --master_port=12345 \ 34 | ./eval/gen/geneval/evaluation/evaluate_images_mp.py \ 35 | $output_path/images \ 36 | --outfile $output_path/results.jsonl \ 37 | --model-path ./eval/gen/geneval/model 38 | 39 | 40 | # summarize score 41 | python ./eval/gen/geneval/evaluation/summary_scores.py $output_path/results.jsonl -------------------------------------------------------------------------------- /train/train_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | import os 6 | 7 | 8 | def create_logger(logging_dir, rank, filename="log"): 9 | """ 10 | Create a logger that writes to a log file and stdout. 11 | """ 12 | if rank == 0 and logging_dir is not None: # real logger 13 | logging.basicConfig( 14 | level=logging.INFO, 15 | format='[\033[34m%(asctime)s\033[0m] %(message)s', 16 | datefmt='%Y-%m-%d %H:%M:%S', 17 | handlers=[ 18 | logging.StreamHandler(), 19 | logging.FileHandler(f"{logging_dir}/{filename}.txt") 20 | ] 21 | ) 22 | logger = logging.getLogger(__name__) 23 | else: # dummy logger (does nothing) 24 | logger = logging.getLogger(__name__) 25 | logger.addHandler(logging.NullHandler()) 26 | return logger 27 | 28 | 29 | def get_latest_ckpt(checkpoint_dir): 30 | step_dirs = [d for d in os.listdir(checkpoint_dir) if os.path.isdir(os.path.join(checkpoint_dir, d))] 31 | if len(step_dirs) == 0: 32 | return None 33 | step_dirs = sorted(step_dirs, key=lambda x: int(x)) 34 | latest_step_dir = os.path.join(checkpoint_dir, step_dirs[-1]) 35 | return latest_step_dir 36 | -------------------------------------------------------------------------------- /scripts/eval/run_kris.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | set -x 5 | 6 | export OPENAI_API_KEY=$openai_api_key 7 | 8 | GPUS=8 9 | 10 | 11 | # generate images 12 | torchrun \ 13 | --nnodes=1 \ 14 | --node_rank=0 \ 15 | --nproc_per_node=$GPUS \ 16 | --master_addr=127.0.0.1 \ 17 | --master_port=12345 \ 18 | ./eval/gen/gen_images_mp_kris.py \ 19 | --output_dir $output_path/bagel \ 20 | --metadata_file ./eval/gen/kris/final_data.json \ 21 | --max_latent_size 64 \ 22 | --model-path $model_path \ 23 | --think 24 | 25 | 26 | # calculate score 27 | python ./eval/gen/kris/metrics_common.py \ 28 | --results_dir $output_path \ 29 | --max_workers 8 30 | 31 | python ./eval/gen/kris/metrics_knowledge.py \ 32 | --results_dir $output_path \ 33 | --max_workers 8 34 | 35 | python ./eval/gen/kris/metrics_multi_element.py \ 36 | --results_dir $output_path \ 37 | --max_workers 8 38 | 39 | python ./eval/gen/kris/metrics_temporal_prediction.py \ 40 | --results_dir $output_path \ 41 | --max_workers 8 42 | 43 | python ./eval/gen/kris/metrics_view_change.py \ 44 | --results_dir $output_path \ 45 | --max_workers 8 46 | 47 | 48 | # summarize score 49 | python ./eval/gen/kris/summarize.py \ 50 | --results_dir $output_path/bagel \ -------------------------------------------------------------------------------- /scripts/eval/run_wise.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | set -x 5 | 6 | export OPENAI_API_KEY=$openai_api_key 7 | 8 | GPUS=8 9 | 10 | 11 | # generate images 12 | torchrun \ 13 | --nnodes=1 \ 14 | --node_rank=0 \ 15 | --nproc_per_node=$GPUS \ 16 | --master_addr=127.0.0.1 \ 17 | --master_port=12345 \ 18 | ./eval/gen/gen_images_mp_wise.py \ 19 | --output_dir $output_path/images \ 20 | --metadata-file ./eval/gen/wise/final_data.json \ 21 | --resolution 1024 \ 22 | --max-latent_size 64 \ 23 | --model-path $model_path \ 24 | --think 25 | 26 | 27 | # calculate score 28 | python3 eval/gen/wise/gpt_eval_mp.py \ 29 | --json_path eval/gen/wise/data/cultural_common_sense.json \ 30 | --image_dir $output_path/images \ 31 | --output_dir $output_path 32 | 33 | python3 eval/gen/wise/gpt_eval_mp.py \ 34 | --json_path eval/gen/wise/data/spatio-temporal_reasoning.json \ 35 | --image_dir $output_path/images \ 36 | --output_dir $output_path 37 | 38 | python3 eval/gen/wise/gpt_eval_mp.py \ 39 | --json_path eval/gen/wise/data/natural_science.json \ 40 | --image_dir $output_path/images \ 41 | --output_dir $output_path 42 | 43 | python3 eval/gen/wise/cal_score.py \ 44 | --output_dir $output_path -------------------------------------------------------------------------------- /data/dataset_info.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from .interleave_datasets import UnifiedEditIterableDataset 5 | from .t2i_dataset import T2IIterableDataset 6 | from .vlm_dataset import SftJSONLIterableDataset 7 | 8 | 9 | DATASET_REGISTRY = { 10 | 't2i_pretrain': T2IIterableDataset, 11 | 'vlm_sft': SftJSONLIterableDataset, 12 | 'unified_edit': UnifiedEditIterableDataset, 13 | } 14 | 15 | 16 | DATASET_INFO = { 17 | 't2i_pretrain': { 18 | 't2i': { 19 | 'data_dir': 'your_data_path/bagel_example/t2i', # path of the parquet files 20 | 'num_files': 10, # number of data units to be sharded across all ranks and workers 21 | 'num_total_samples': 1000, # number of total samples in the dataset 22 | }, 23 | }, 24 | 'unified_edit':{ 25 | 'seedxedit_multi': { 26 | 'data_dir': 'your_data_path/bagel_example/editing/seedxedit_multi', 27 | 'num_files': 10, 28 | 'num_total_samples': 1000, 29 | "parquet_info_path": 'your_data_path/bagel_example/editing/parquet_info/seedxedit_multi_nas.json', # information of the parquet files 30 | }, 31 | }, 32 | 'vlm_sft': { 33 | 'llava_ov': { 34 | 'data_dir': 'your_data_path/bagel_example/vlm/images', 35 | 'jsonl_path': 'your_data_path/bagel_example/vlm/llava_ov_si.jsonl', 36 | 'num_total_samples': 1000 37 | }, 38 | }, 39 | } -------------------------------------------------------------------------------- /scripts/eval/run_imgedit.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | set -x 5 | 6 | export OPENAI_API_KEY=$openai_api_key 7 | 8 | GPUS=8 9 | 10 | 11 | # generate images 12 | torchrun \ 13 | --nnodes=1 \ 14 | --node_rank=0 \ 15 | --nproc_per_node=$GPUS \ 16 | --master_addr=127.0.0.1 \ 17 | --master_port=12345 \ 18 | ./eval/gen/gen_images_mp_imgedit.py \ 19 | --output_dir $output_path/bagel \ 20 | --metadata_file ./eval/gen/imgedit/Benchmark/singleturn/singleturn.json \ 21 | --max_latent_size 64 \ 22 | --model-path $model_path 23 | 24 | 25 | # calculate score 26 | python ./eval/gen/imgedit/basic_bench.py \ 27 | --result_img_folder $output_path/bagel \ 28 | --edit_json ./eval/gen/imgedit/Benchmark/singleturn/singleturn.json \ 29 | --origin_img_root ./eval/gen/imgedit/Benchmark/singleturn \ 30 | --num_processes 4 \ 31 | --prompts_json ./eval/gen/imgedit/Benchmark/singleturn/judge_prompt.json 32 | 33 | 34 | # summarize score 35 | python ./eval/gen/imgedit/step1_get_avgscore.py \ 36 | --result_json $output_path/bagel/result.json \ 37 | --average_score_json $output_path/bagel/average_score.json 38 | 39 | python ./eval/gen/imgedit/step2_typescore.py \ 40 | --average_score_json $output_path/bagel/average_score.json \ 41 | --edit_json ./eval/gen/imgedit/Benchmark/singleturn/singleturn.json \ 42 | --typescore_json $output_path/bagel/typescore.json -------------------------------------------------------------------------------- /eval/gen/imgedit/step1_get_avgscore.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import json 5 | import argparse 6 | 7 | def extract_scores_and_average(entry: str) -> float: 8 |     lines = entry.splitlines() 9 |     scores = [] 10 |     for line in lines: 11 |         parts = line.strip().split(': ') 12 |         if len(parts) == 2 and parts[1].isdigit(): 13 |             scores.append(int(parts[1])) 14 |     if scores: 15 |         return round(sum(scores) / len(scores), 2) 16 |     return None 17 | 18 | def compute_averages(result_json_dict): 19 |     result = {} 20 |     for key, value in result_json_dict.items(): 21 |         avg = extract_scores_and_average(value) 22 |         if avg is not None: 23 |             result[key] = avg 24 |     return result 25 | 26 | def main(): 27 |     parser = argparse.ArgumentParser(description="Calculate the average score for each key and save it as a new JSON file") 28 |     parser.add_argument('--result_json', type=str, required=True, help='Path to the result JSON file') 29 |     parser.add_argument('--average_score_json', type=str, required=True, help='Path to the average score JSON file') 30 | 31 |     args = parser.parse_args() 32 | 33 |     with open(args.result_json, 'r', encoding='utf-8') as f: 34 |         data = json.load(f) 35 | 36 |     averaged_data = compute_averages(data) 37 | 38 |     with open(args.average_score_json, 'w', encoding='utf-8') as f: 39 |         json.dump(averaged_data, f, indent=2) 40 | 41 | 42 | if __name__ == '__main__': 43 |     main() -------------------------------------------------------------------------------- /scripts/eval/run_gedit.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # run this script at the root of the project folder 5 | pip install httpx==0.23.0 6 | pip install openai==1.87.0 7 | pip install datasets 8 | pip install megfile 9 | 10 | 11 | N_GPU=8 # Number of GPUs used for the evaluation 12 | MODEL_PATH="/Path/to/BAGEL-7B-MoT" 13 | OUTPUT_DIR="/Path/to/save/results" 14 | GEN_DIR="$OUTPUT_DIR/gen_image" 15 | LOG_DIR="$OUTPUT_DIR/logs" 16 | 17 | AZURE_ENDPOINT="https://azure_endpoint_url_you_use" # set the Azure OpenAI endpoint URL 18 | AZURE_OPENAI_KEY="" # set the Azure OpenAI key 19 | N_GPT_PARALLEL=10 20 | 21 | 22 | mkdir -p "$OUTPUT_DIR" 23 | mkdir -p "$GEN_DIR" 24 | mkdir -p "$LOG_DIR" 25 | 26 | 27 | # # ---------------------------- 28 | # # Download GEdit Dataset 29 | # # ---------------------------- 30 | python -c "from datasets import load_dataset; dataset = load_dataset('stepfun-ai/GEdit-Bench')" 31 | echo "Dataset Downloaded" 32 | 33 | 34 | # # --------------------- 35 | # # Generate Images 36 | # # --------------------- 37 | for ((i=0; i<$N_GPU; i++)); do 38 |     nohup python3 eval/gen/gedit/gen_images_gedit.py --model_path "$MODEL_PATH" --output_dir "$GEN_DIR" --shard_id $i --total_shards "$N_GPU" --device $i 2>&1 | tee "$LOG_DIR"/request_$(($N_GPU + i)).log & 39 | done 40 | 41 | wait 42 | echo "Image Generation Done" 43 | 44 | 45 | # # --------------------- 46 | # # GPT Evaluation 47 | # # --------------------- 48 | cd eval/gen/gedit 49 | python test_gedit_score.py --save_path "$OUTPUT_DIR" --azure_endpoint "$AZURE_ENDPOINT" --gpt_keys "$AZURE_OPENAI_KEY" --max_workers "$N_GPT_PARALLEL" 50 | echo "Evaluation Done" 51 | 52 | 53 | # # -------------------- 54 | # # Print Results 55 | # # -------------------- 56 | python calculate_statistics.py --save_path "$OUTPUT_DIR" --language en 57 | 58 | -------------------------------------------------------------------------------- /modeling/qwen2/__init__.py:
-------------------------------------------------------------------------------- 1 | # Copyright 2024 The Qwen Team and The HuggingFace Inc. team. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from typing import TYPE_CHECKING 5 | 6 | from transformers.utils import ( 7 | OptionalDependencyNotAvailable, 8 | _LazyModule, 9 | is_tokenizers_available, 10 | is_torch_available, 11 | ) 12 | 13 | 14 | _import_structure = { 15 | "configuration_qwen2": ["Qwen2Config"], 16 | "tokenization_qwen2": ["Qwen2Tokenizer"], 17 | } 18 | 19 | try: 20 | if not is_tokenizers_available(): 21 | raise OptionalDependencyNotAvailable() 22 | except OptionalDependencyNotAvailable: 23 | pass 24 | else: 25 | _import_structure["tokenization_qwen2_fast"] = ["Qwen2TokenizerFast"] 26 | 27 | try: 28 | if not is_torch_available(): 29 | raise OptionalDependencyNotAvailable() 30 | except OptionalDependencyNotAvailable: 31 | pass 32 | else: 33 | _import_structure["modeling_qwen2"] = [ 34 | "Qwen2ForCausalLM", 35 | "Qwen2Model", 36 | "Qwen2PreTrainedModel", 37 | ] 38 | 39 | 40 | if TYPE_CHECKING: 41 | from .configuration_qwen2 import Qwen2Config 42 | from .tokenization_qwen2 import Qwen2Tokenizer 43 | 44 | try: 45 | if not is_tokenizers_available(): 46 | raise OptionalDependencyNotAvailable() 47 | except OptionalDependencyNotAvailable: 48 | pass 49 | else: 50 | from .tokenization_qwen2_fast import Qwen2TokenizerFast 51 | 52 | try: 53 | if not is_torch_available(): 54 | raise OptionalDependencyNotAvailable() 55 | except OptionalDependencyNotAvailable: 56 | pass 57 | else: 58 | from .modeling_qwen2 import ( 59 | Qwen2ForCausalLM, 60 | Qwen2Model, 61 | Qwen2PreTrainedModel, 62 | ) 63 | 64 | 65 | else: 66 | import sys 67 | 68 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 69 | -------------------------------------------------------------------------------- /eval/gen/imgedit/step2_typescore.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import json 5 | import argparse 6 | from collections import defaultdict 7 | 8 | def compute_edit_type_averages(score_dict, meta_dict): 9 | edit_type_scores = defaultdict(list) 10 | all_scores = [] 11 | 12 | for key, score in score_dict.items(): 13 | meta = meta_dict.get(key, {}) 14 | edit_type = meta.get("edit_type") 15 | if edit_type is not None: 16 | edit_type_scores[edit_type].append(score) 17 | all_scores.append(score) 18 | 19 | averaged_by_type = { 20 | etype: round(sum(scores) / len(scores), 2) 21 | for etype, scores in edit_type_scores.items() if scores 22 | } 23 | if all_scores: 24 | averaged_by_type['overall'] = round(sum(all_scores) / len(all_scores), 2) 25 | 26 | return averaged_by_type 27 | 28 | def main(): 29 | parser = argparse.ArgumentParser(description="Calculate edit type averages") 30 | parser.add_argument('--average_score_json', type=str, required=True, help='path to the JSON file containing the scores') 31 | parser.add_argument('--edit_json', type=str, required=True, help='Path to the JSON file containing the basic edit information') 32 | parser.add_argument('--typescore_json', type=str, required=True, help='Path to the JSON file containing the edit type scores') 33 | 34 | args = parser.parse_args() 35 | 36 | with open(args.average_score_json, 'r', encoding='utf-8') as f: 37 | score_data = json.load(f) 38 | 39 | with open(args.edit_json, 'r', encoding='utf-8') as f: 40 | meta_data = json.load(f) 41 | 42 | averaged_result = compute_edit_type_averages(score_data, meta_data) 43 | for k, v in averaged_result.items(): 44 | print(f"{k}: {v}") 45 | 46 | with open(args.typescore_json, 'w', encoding='utf-8') as f: 47 | json.dump(averaged_result, f, indent=2) 48 | 49 | 50 | if __name__ == '__main__': 51 | main() -------------------------------------------------------------------------------- /eval/gen/geneval/evaluation/summary_scores.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Dhruba Ghosh 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/djghosh13/geneval/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 
11 | 12 | import argparse 13 | import os 14 | 15 | import numpy as np 16 | import pandas as pd 17 | 18 | 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("filename", type=str) 21 | args = parser.parse_args() 22 | 23 | # Load classnames 24 | 25 | with open(os.path.join(os.path.dirname(__file__), "object_names.txt")) as cls_file: 26 | classnames = [line.strip() for line in cls_file] 27 | cls_to_idx = {"_".join(cls.split()):idx for idx, cls in enumerate(classnames)} 28 | 29 | # Load results 30 | 31 | df = pd.read_json(args.filename, orient="records", lines=True) 32 | 33 | # Measure overall success 34 | 35 | print("Summary") 36 | print("=======") 37 | print(f"Total images: {len(df)}") 38 | print(f"Total prompts: {len(df.groupby('metadata'))}") 39 | print(f"% correct images: {df['correct'].mean():.2%}") 40 | print(f"% correct prompts: {df.groupby('metadata')['correct'].any().mean():.2%}") 41 | print() 42 | 43 | # By group 44 | 45 | task_scores = [] 46 | 47 | print("Task breakdown") 48 | print("==============") 49 | for tag, task_df in df.groupby('tag', sort=False): 50 | task_scores.append(task_df['correct'].mean()) 51 | print(f"{tag:<16} = {task_df['correct'].mean():.2%} ({task_df['correct'].sum()} / {len(task_df)})") 52 | print() 53 | 54 | print(f"Overall score (avg. over tasks): {np.mean(task_scores):.5f}") 55 | 56 | 57 | print("\n\n==============") 58 | output_info = "SO TO CT CL POS ATTR ALL\n" 59 | for score in task_scores: 60 | output_info += f"{score:.2f} " 61 | output_info += f"{np.mean(task_scores):.2f}" + "\n" 62 | print(output_info) 63 | with open(os.path.join(os.path.dirname(args.filename), "geneval_results.txt"), "w") as f: 64 | f.write(output_info) -------------------------------------------------------------------------------- /data/distributed_iterable_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import random 5 | import torch 6 | 7 | 8 | class DistributedIterableDataset(torch.utils.data.IterableDataset): 9 | def __init__(self, dataset_name, local_rank=0, world_size=1, num_workers=8): 10 | self.dataset_name = dataset_name 11 | self.local_rank = local_rank 12 | self.world_size = world_size 13 | self.num_workers = num_workers 14 | self.rng = random.Random() 15 | self.data_paths = None 16 | 17 | def get_data_paths(self, *args, **kwargs): 18 | raise NotImplementedError 19 | 20 | def set_epoch(self, seed=42): 21 | if self.data_paths is None: 22 | return 23 | 24 | if isinstance(self.data_paths[0], tuple): 25 | data_paths = sorted(self.data_paths, key=lambda x: (x[0], x[1])) 26 | elif isinstance(self.data_paths[0], str): 27 | data_paths = sorted(self.data_paths) 28 | else: 29 | raise ValueError(f"Unknown data_paths type: {type(self.data_paths[0])}") 30 | 31 | self.rng.seed(seed) 32 | self.rng.shuffle(data_paths) 33 | 34 | num_files_per_rank = len(data_paths) // self.world_size 35 | local_start = self.local_rank * num_files_per_rank 36 | local_end = (self.local_rank + 1) * num_files_per_rank 37 | self.num_files_per_rank = num_files_per_rank 38 | self.data_paths_per_rank = data_paths[local_start:local_end] 39 | 40 | def get_data_paths_per_worker(self): 41 | if self.data_paths is None: 42 | return None 43 | 44 | info = torch.utils.data.get_worker_info() 45 | if info is None: 46 | # Single worker: Use all files assigned to the rank 47 | return self.data_paths_per_rank, 0 48 | 49 | worker_id = info.id 50 | num_files_per_worker = self.num_files_per_rank // info.num_workers 51 | start = num_files_per_worker * worker_id 52 | end = num_files_per_worker * (worker_id + 1) 53 | data_paths_per_worker = self.data_paths_per_rank[start:end] 54 | 55 | return data_paths_per_worker[::-1], worker_id 56 | 57 | def __iter__(self): 58 | raise NotImplementedError 59 | -------------------------------------------------------------------------------- /eval/vlm/eval/mathvista/prompts/ext_ans.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 11 | 12 | # pids = 852, 104, 824, 506, 540 13 | 14 | demo_prompt = """ 15 | Please read the following example. Then extract the answer from the model response and type it at the end of the prompt. 16 | 17 | Hint: Please answer the question requiring an integer answer and provide the final value, e.g., 1, 2, 3, at the end. 18 | Question: Which number is missing? 19 | 20 | Model response: The number missing in the sequence is 14. 21 | 22 | Extracted answer: 14 23 | 24 | Hint: Please answer the question requiring a floating-point number with one decimal place and provide the final value, e.g., 1.2, 1.3, 1.4, at the end. 25 | Question: What is the fraction of females facing the camera? 26 | 27 | Model response: The fraction of females facing the camera is 0.6, which means that six out of ten females in the group are facing the camera. 
28 | 29 | Extracted answer: 0.6 30 | 31 | Hint: Please answer the question requiring a floating-point number with two decimal places and provide the final value, e.g., 1.23, 1.34, 1.45, at the end. 32 | Question: How much money does Luca need to buy a sour apple candy and a butterscotch candy? (Unit: $) 33 | 34 | Model response: Luca needs $1.45 to buy a sour apple candy and a butterscotch candy. 35 | 36 | Extracted answer: 1.45 37 | 38 | Hint: Please answer the question requiring a Python list as an answer and provide the final list, e.g., [1, 2, 3], [1.2, 1.3, 1.4], at the end. 39 | Question: Between which two years does the line graph saw its maximum peak? 40 | 41 | Model response: The line graph saw its maximum peak between 2007 and 2008. 42 | 43 | Extracted answer: [2007, 2008] 44 | 45 | Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end. 46 | Question: What fraction of the shape is blue?\nChoices:\n(A) 3/11\n(B) 8/11\n(C) 6/11\n(D) 3/5 47 | 48 | Model response: The correct answer is (B) 8/11. 49 | 50 | Extracted answer: B 51 | """ 52 | -------------------------------------------------------------------------------- /eval/gen/gedit/viescore/mllm_tools/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import base64 3 | from io import BytesIO 4 | from PIL import Image 5 | import requests 6 | 7 | def pil_image_to_base64(pil_image, format="PNG"): 8 | buffered = BytesIO() 9 | pil_image.save(buffered, format=format) # Save image to the buffer in the specified format 10 | img_str = base64.b64encode(buffered.getvalue()).decode('utf-8') # Encode the buffer's content to base64 11 | return img_str 12 | 13 | def load_image(image_file): 14 | if image_file.startswith("http"): 15 | response = requests.get(image_file) 16 | image = Image.open(BytesIO(response.content)).convert("RGB") 17 | else: 18 | import os 19 | image = Image.open(image_file).convert("RGB") 20 | return image 21 | 22 | 23 | def load_images(image_files): 24 | out = [] 25 | for image_file in image_files: 26 | image = load_image(image_file) 27 | out.append(image) 28 | return out 29 | 30 | def merge_images(image_links: List = []): 31 | """Merge multiple images into one image 32 | 33 | Args: 34 | image_links (List, optional): List of image links. Defaults to []. 
35 | 36 |     Returns: 37 |         PIL.Image.Image: the merged image, or None if image_links is empty 38 |     """ 39 |     if len(image_links) == 0: 40 |         return None 41 |     images = load_images(image_links) 42 |     if len(images) == 1: 43 |         return images[0] 44 |     widths, heights = zip(*(i.size for i in images)) 45 |     average_height = sum(heights) // len(heights) 46 |     for i, im in enumerate(images): 47 |         # scale in proportion 48 |         images[i] = im.resize((int(im.size[0] * average_height / im.size[1]), average_height)) 49 |     widths, heights = zip(*(i.size for i in images)) 50 |     total_width = sum(widths) 51 |     max_height = max(heights) 52 |     new_im = Image.new("RGB", (total_width + 10 * (len(images) - 1), max_height)) 53 |     x_offset = 0 54 |     for i, im in enumerate(images): 55 |         if i > 0: 56 |             # paste a separator column starting at x_offset: 1 pixel black, 8 pixels white, then 1 pixel black 57 |             new_im.paste(Image.new("RGB", (1, max_height), (0, 0, 0)), (x_offset, 0)) 58 |             x_offset += 1 59 |             new_im.paste(Image.new("RGB", (8, max_height), (255, 255, 255)), (x_offset, 0)) 60 |             x_offset += 8 61 |             new_im.paste(Image.new("RGB", (1, max_height), (0, 0, 0)), (x_offset, 0)) 62 |             x_offset += 1 63 |         new_im.paste(im, (x_offset, 0)) 64 |         x_offset += im.size[0] 65 |     return new_im -------------------------------------------------------------------------------- /modeling/siglip/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Inc. team. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from typing import TYPE_CHECKING 5 | 6 | from transformers.utils import ( 7 |     OptionalDependencyNotAvailable, 8 |     _LazyModule, 9 |     is_sentencepiece_available, 10 |     is_torch_available, 11 |     is_vision_available, 12 | ) 13 | 14 | 15 | _import_structure = { 16 |     "configuration_siglip": [ 17 |         "SiglipConfig", 18 |         "SiglipTextConfig", 19 |         "SiglipVisionConfig", 20 |     ], 21 |     "processing_siglip": ["SiglipProcessor"], 22 | } 23 | 24 | try: 25 |     if not is_sentencepiece_available(): 26 |         raise OptionalDependencyNotAvailable() 27 | except OptionalDependencyNotAvailable: 28 |     pass 29 | else: 30 |     _import_structure["tokenization_siglip"] = ["SiglipTokenizer"] 31 | 32 | 33 | try: 34 |     if not is_vision_available(): 35 |         raise OptionalDependencyNotAvailable() 36 | except OptionalDependencyNotAvailable: 37 |     pass 38 | else: 39 |     _import_structure["image_processing_siglip"] = ["SiglipImageProcessor"] 40 | 41 | try: 42 |     if not is_torch_available(): 43 |         raise OptionalDependencyNotAvailable() 44 | except OptionalDependencyNotAvailable: 45 |     pass 46 | else: 47 |     _import_structure["modeling_siglip"] = [ 48 |         "SiglipModel", 49 |         "SiglipPreTrainedModel", 50 |         "SiglipTextModel", 51 |         "SiglipVisionModel", 52 |         "SiglipForImageClassification", 53 |     ] 54 | 55 | 56 | if TYPE_CHECKING: 57 |     from .configuration_siglip import ( 58 |         SiglipConfig, 59 |         SiglipTextConfig, 60 |         SiglipVisionConfig, 61 |     ) 62 |     from .processing_siglip import SiglipProcessor 63 | 64 |     try: 65 |         if not is_sentencepiece_available(): 66 |             raise OptionalDependencyNotAvailable() 67 |     except OptionalDependencyNotAvailable: 68 |         pass 69 |     else: 70 |         from .tokenization_siglip import SiglipTokenizer 71 | 72 |     try: 73 |         if not is_vision_available(): 74 |             raise OptionalDependencyNotAvailable() 75 |     except OptionalDependencyNotAvailable: 76 |         pass 77 |     else: 78 |         from .image_processing_siglip import SiglipImageProcessor 79 | 80 |     try: 81 |         if not is_torch_available(): 82 |             raise OptionalDependencyNotAvailable() 83 |     except OptionalDependencyNotAvailable: 84 |         pass 85 |
    else: 86 |         from .modeling_siglip import ( 87 |             SiglipForImageClassification, 88 |             SiglipModel, 89 |             SiglipPreTrainedModel, 90 |             SiglipTextModel, 91 |             SiglipVisionModel, 92 |         ) 93 | 94 | 95 | else: 96 |     import sys 97 | 98 |     sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 99 | -------------------------------------------------------------------------------- /data/interleave_datasets/edit_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import io 5 | import random 6 | from PIL import Image, ImageFile, PngImagePlugin 7 | 8 | from .interleave_t2i_dataset import InterleavedBaseIterableDataset, ParquetStandardIterableDataset 9 | from ..data_utils import pil_img2rgb 10 | 11 | 12 | Image.MAX_IMAGE_PIXELS = 200000000 13 | ImageFile.LOAD_TRUNCATED_IMAGES = True 14 | MaximumDecompressedSize = 1024 15 | MegaByte = 2 ** 20 16 | PngImagePlugin.MAX_TEXT_CHUNK = MaximumDecompressedSize * MegaByte 17 | 18 | 19 | class UnifiedEditIterableDataset(InterleavedBaseIterableDataset, ParquetStandardIterableDataset): 20 | 21 |     def parse_row(self, row): 22 |         image_num = len(row["image_list"]) 23 |         # randomly choose start and end, return [0, 1] when only two images 24 |         start_idx = random.choice(range(image_num - 1)) 25 |         max_end = min(start_idx + 3, image_num) 26 |         end_idx = random.choice(range(start_idx + 1, max_end)) 27 | 28 |         data = self._init_data() 29 |         data = self._add_image( 30 |             data, 31 |             pil_img2rgb(Image.open(io.BytesIO(row["image_list"][start_idx]))), 32 |             need_loss=False, 33 |             need_vae=True, 34 |             need_vit=True, 35 |         ) 36 | 37 |         if end_idx - start_idx > 1 and random.random() < 0.5: # concatenate multiple instructions 38 |             if end_idx == image_num - 1: 39 |                 end_idx -= 1 40 | 41 |             instruction = "" 42 |             for idx in range(start_idx + 1, end_idx + 1): 43 |                 instruction += random.choice(row["instruction_list"][idx-1]) + ". " 44 |             data = self._add_text(data, instruction.rstrip(), need_loss=False) 45 |             data = self._add_image( 46 |                 data, 47 |                 pil_img2rgb(Image.open(io.BytesIO(row["image_list"][end_idx]))), 48 |                 need_loss=True, 49 |                 need_vae=False, 50 |                 need_vit=False, 51 |             ) 52 |         else: 53 |             for idx in range(start_idx + 1, end_idx + 1): 54 |                 instruction = random.choice(row["instruction_list"][idx-1]) 55 |                 data = self._add_text(data, instruction, need_loss=False) 56 |                 if idx != end_idx: 57 |                     data = self._add_image( 58 |                         data, 59 |                         pil_img2rgb(Image.open(io.BytesIO(row["image_list"][idx]))), 60 |                         need_loss=True, 61 |                         need_vae=True, 62 |                         need_vit=True, 63 |                     ) 64 |                 else: 65 |                     data = self._add_image( 66 |                         data, 67 |                         pil_img2rgb(Image.open(io.BytesIO(row["image_list"][idx]))), 68 |                         need_loss=True, 69 |                         need_vae=False, 70 |                         need_vit=False, 71 |                     ) 72 |         return data 73 | -------------------------------------------------------------------------------- /data/parquet_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates.
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | 5 | import os 6 | import subprocess 7 | import logging 8 | 9 | import pyarrow.fs as pf 10 | import torch.distributed as dist 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def get_parquet_data_paths(data_dir_list, num_sampled_data_paths, rank=0, world_size=1): 16 |     num_data_dirs = len(data_dir_list) 17 |     if world_size > 1: 18 |         chunk_size = (num_data_dirs + world_size - 1) // world_size 19 |         start_idx = rank * chunk_size 20 |         end_idx = min(start_idx + chunk_size, num_data_dirs) 21 |         local_data_dir_list = data_dir_list[start_idx:end_idx] 22 |         local_num_sampled_data_paths = num_sampled_data_paths[start_idx:end_idx] 23 |     else: 24 |         local_data_dir_list = data_dir_list 25 |         local_num_sampled_data_paths = num_sampled_data_paths 26 | 27 |     local_data_paths = [] 28 |     for data_dir, num_data_path in zip(local_data_dir_list, local_num_sampled_data_paths): 29 |         if data_dir.startswith("hdfs://"): 30 |             files = hdfs_ls_cmd(data_dir) 31 |             data_paths_per_dir = [ 32 |                 file for file in files if file.endswith(".parquet") 33 |             ] 34 |         else: 35 |             files = os.listdir(data_dir) 36 |             data_paths_per_dir = [ 37 |                 os.path.join(data_dir, name) 38 |                 for name in files 39 |                 if name.endswith(".parquet") 40 |             ] 41 |         repeat = num_data_path // len(data_paths_per_dir) 42 |         data_paths_per_dir = data_paths_per_dir * (repeat + 1) 43 |         local_data_paths.extend(data_paths_per_dir[:num_data_path]) 44 | 45 |     if world_size > 1: 46 |         gather_list = [None] * world_size 47 |         dist.all_gather_object(gather_list, local_data_paths) 48 | 49 |         combined_chunks = [] 50 |         for chunk_list in gather_list: 51 |             if chunk_list is not None: 52 |                 combined_chunks.extend(chunk_list) 53 |     else: 54 |         combined_chunks = local_data_paths 55 | 56 |     return combined_chunks 57 | 58 | 59 | # NOTE: customize this function for your cluster 60 | def get_hdfs_host(): 61 |     return "hdfs://xxx" 62 | 63 | 64 | # NOTE: customize this function for your cluster 65 | def get_hdfs_block_size(): 66 |     return 134217728 67 | 68 | 69 | # NOTE: customize this function for your cluster 70 | def get_hdfs_extra_conf(): 71 |     return None 72 | 73 | 74 | def init_arrow_pf_fs(parquet_file_path): 75 |     if parquet_file_path.startswith("hdfs://"): 76 |         fs = pf.HadoopFileSystem( 77 |             host=get_hdfs_host(), 78 |             port=0, 79 |             buffer_size=get_hdfs_block_size(), 80 |             extra_conf=get_hdfs_extra_conf(), 81 |         ) 82 |     else: 83 |         fs = pf.LocalFileSystem() 84 |     return fs 85 | 86 | 87 | def hdfs_ls_cmd(dir): 88 |     result = subprocess.run(["hdfs", "dfs", "-ls", dir], capture_output=True, text=True).stdout 89 |     return ['hdfs://' + i.split('hdfs://')[-1].strip() for i in result.split('\n') if 'hdfs://' in i] 90 | -------------------------------------------------------------------------------- /eval/vlm/eval/mme/eval.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license.
11 | 12 | import argparse 13 | import os 14 | import re 15 | 16 | from eval.vlm.utils import load_model_and_tokenizer, build_transform, process_conversation 17 | from PIL import Image 18 | from tqdm import tqdm 19 | 20 | 21 | def post_processing(response): 22 | response = response.replace('\n', '').replace('不是', 'No').replace('是', 'Yes').replace('否', 'No') 23 | response = response.lower().replace('true', 'yes').replace('false', 'no') 24 | pattern = re.compile(r'[\u4e00-\u9fa5]') 25 | response = re.sub(pattern, '', response) 26 | return response 27 | 28 | 29 | if __name__ == '__main__': 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument('--root', type=str, default='eval/vlm/eval/mme/Your_Results') 32 | parser.add_argument('--out-dir', type=str, default='results') 33 | parser.add_argument('--model-path', type=str, default='hf/BAGEL-7B-MoT/') 34 | args = parser.parse_args() 35 | 36 | model, tokenizer, new_token_ids = load_model_and_tokenizer(args) 37 | image_transform = build_transform() 38 | 39 | total_params = sum(p.numel() for p in model.parameters()) / 1e9 40 | print(f'[test] total_params: {total_params}B') 41 | 42 | os.makedirs(args.out_dir, exist_ok=True) 43 | prompt = 'Answer the question using a single word or phrase.' 44 | 45 | for filename in os.listdir(args.root): 46 | fin = open(os.path.join(args.root, filename), 'r', encoding='utf-8') 47 | fout = open(os.path.join(args.out_dir, filename), 'w', encoding='utf-8') 48 | lines = fin.readlines() 49 | filename = filename.replace('.txt', '') 50 | for line in tqdm(lines): 51 | img, question, gt = line.strip().split('\t') 52 | question = question + ' ' + prompt 53 | img_path = os.path.join('eval/vlm/data/mme/MME_Benchmark_release_version', filename, img) 54 | if not os.path.exists(img_path): 55 | img_path = os.path.join('eval/vlm/data/mme/MME_Benchmark_release_version', filename, "images", img) 56 | if not os.path.exists(img_path): 57 | continue 58 | images = [Image.open(img_path).convert('RGB')] 59 | images, conversation = process_conversation(images, question) 60 | 61 | response = model.chat( 62 | tokenizer, 63 | new_token_ids, 64 | image_transform, 65 | images=images, 66 | prompt=conversation, 67 | max_length=20, 68 | ) 69 | response = post_processing(response) 70 | print(img, question, gt, response, sep='\t', file=fout) 71 | fin.close() 72 | fout.close() 73 | 74 | os.system(f"python -m eval.vlm.eval.mme.calculation --out-dir {args.out_dir}") 75 | -------------------------------------------------------------------------------- /eval/vlm/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 
11 | 12 | import os 13 | import yaml 14 | 15 | from data.data_utils import add_special_tokens, pil_img2rgb 16 | from modeling.bagel import ( 17 | BagelConfig, 18 | Bagel, 19 | Qwen2Config, 20 | Qwen2ForCausalLM, 21 | SiglipVisionConfig, 22 | SiglipVisionModel, 23 | ) 24 | from modeling.qwen2 import Qwen2Tokenizer 25 | from safetensors.torch import load_file 26 | 27 | from data.transforms import ImageTransform 28 | 29 | 30 | def load_model_and_tokenizer(args): 31 | llm_config = Qwen2Config.from_json_file(os.path.join(args.model_path, "llm_config.json")) 32 | llm_config.qk_norm = True 33 | llm_config.tie_word_embeddings = False 34 | llm_config.layer_module ="Qwen2MoTDecoderLayer" 35 | 36 | vit_config = SiglipVisionConfig.from_json_file(os.path.join(args.model_path, "vit_config.json")) 37 | vit_config.rope = False 38 | vit_config.num_hidden_layers = vit_config.num_hidden_layers - 1 39 | 40 | config = BagelConfig( 41 | visual_gen=False, 42 | visual_und=True, 43 | llm_config=llm_config, 44 | vit_config=vit_config, 45 | vit_max_num_patch_per_side=70, 46 | connector_act='gelu_pytorch_tanh', 47 | ) 48 | language_model = Qwen2ForCausalLM(llm_config) 49 | vit_model = SiglipVisionModel(vit_config) 50 | model = Bagel(language_model, vit_model, config) 51 | model.vit_model.vision_model.embeddings.convert_conv2d_to_linear(vit_config) 52 | 53 | tokenizer = Qwen2Tokenizer.from_pretrained(args.model_path) 54 | tokenizer, new_token_ids, _ = add_special_tokens(tokenizer) 55 | 56 | model_state_dict_path = os.path.join(args.model_path, "ema.safetensors") 57 | model_state_dict = load_file(model_state_dict_path, device="cpu") 58 | msg = model.load_state_dict(model_state_dict, strict=False) 59 | print(msg) 60 | del model_state_dict 61 | model = model.cuda().eval() 62 | 63 | return model, tokenizer, new_token_ids 64 | 65 | 66 | def build_transform(): 67 | with open("./data/configs/example.yaml", "r") as f: 68 | data_config = yaml.safe_load(f) 69 | 70 | max_image_size = data_config['vlm_sft']['image_transform_args']['max_image_size'] 71 | min_image_size = data_config['vlm_sft']['image_transform_args']['min_image_size'] 72 | image_stride = data_config['vlm_sft']['image_transform_args']['image_stride'] 73 | max_pixels = data_config['vlm_sft']['image_transform_args']['max_pixels'] 74 | 75 | image_transform = ImageTransform( 76 | max_image_size=max_image_size, 77 | min_image_size=min_image_size, 78 | image_stride=image_stride, 79 | max_pixels=max_pixels, 80 | ) 81 | 82 | return image_transform 83 | 84 | 85 | def process_conversation(images, conversation): 86 | images = [pil_img2rgb(image) for image in images] 87 | return images, conversation 88 | -------------------------------------------------------------------------------- /eval/vlm/eval/mme/Your_Results/OCR.txt: -------------------------------------------------------------------------------- 1 | 0001.jpg Is the word in the logo "angie's"? Please answer yes or no. Yes 2 | 0001.jpg Is the word in the logo "angle's"? Please answer yes or no. No 3 | 0002.jpg Is the word in the logo "c'est cheese"? Please answer yes or no. Yes 4 | 0002.jpg Is the word in the logo "crest cheese"? Please answer yes or no. No 5 | 0003.jpg Is the word in the logo "beavertails pastry"? Please answer yes or no. Yes 6 | 0003.jpg Is the word in the logo "beavertalls pastry"? Please answer yes or no. No 7 | 0004.jpg Is the word in the logo "old market sundries"? Please answer yes or no. Yes 8 | 0004.jpg Is the word in the logo "old market hundreds"? Please answer yes or no. 
No 9 | 0005.jpg Is the word in the logo "kress"? Please answer yes or no. Yes 10 | 0005.jpg Is the word in the logo "dress"? Please answer yes or no. No 11 | 0006.jpg Is the word in the logo "the beatles story liver pool"? Please answer yes or no. Yes 12 | 0006.jpg Is the word in the logo "the beats story liver pool"? Please answer yes or no. No 13 | 0007.jpg Is the phone number in the picture "0131 555 6363"? Please answer yes or no. Yes 14 | 0007.jpg Is the phone number in the picture "0137 556 6363"? Please answer yes or no. No 15 | 0008.jpg Is the word in the logo "phil's market"? Please answer yes or no. Yes 16 | 0008.jpg Is the word in the logo "phll's market"? Please answer yes or no. No 17 | 0009.jpg Is the word in the logo "fenders diner"? Please answer yes or no. Yes 18 | 0009.jpg Is the word in the logo "finders diner"? Please answer yes or no. No 19 | 0010.jpg Is the word in the logo "high time coffee shop"? Please answer yes or no. Yes 20 | 0010.jpg Is the word in the logo "high tite cofeee shop"? Please answer yes or no. No 21 | 0011.jpg Is the word in the logo "ihop restaurant"? Please answer yes or no. Yes 22 | 0011.jpg Is the word in the logo "lhop restaurant"? Please answer yes or no. No 23 | 0012.jpg Is the word in the logo "casa grecque restaurants"? Please answer yes or no. Yes 24 | 0012.jpg Is the word in the logo "case grecque restaurants"? Please answer yes or no. No 25 | 0013.jpg Is the word in the picture "seabreeze motel"? Please answer yes or no. Yes 26 | 0013.jpg Is the word in the picture "seebreeze model"? Please answer yes or no. No 27 | 0014.jpg Is the word in the logo "penarth pier built 1894"? Please answer yes or no. Yes 28 | 0014.jpg Is the word in the logo "penarth pies buid 1894"? Please answer yes or no. No 29 | 0015.jpg Is the text in the picture "hollywood"? Please answer yes or no. Yes 30 | 0015.jpg Is the text in the picture "holly word"? Please answer yes or no. No 31 | 0016.jpg Is the word in the logo "shop rite"? Please answer yes or no. Yes 32 | 0016.jpg Is the word in the logo "stop rite"? Please answer yes or no. No 33 | 0017.jpg Is the word in the logo "hardco industrial construction"? Please answer yes or no. Yes 34 | 0017.jpg Is the word in the logo "hardto industal construction"? Please answer yes or no. No 35 | 0018.jpg Is the word in the logo "oldsmobile service"? Please answer yes or no. Yes 36 | 0018.jpg Is the word in the logo "old mobile service"? Please answer yes or no. No 37 | 0019.jpg Is the word in the logo "exchange hotel"? Please answer yes or no. Yes 38 | 0019.jpg Is the word in the logo "excharge hotel"? Please answer yes or no. No 39 | 0020.jpg Is the word in the logo "cold drinks"? Please answer yes or no. Yes 40 | 0020.jpg Is the word in the logo "cold rinks"? Please answer yes or no. No 41 | -------------------------------------------------------------------------------- /eval/vlm/eval/mme/Your_Results/numerical_calculation.txt: -------------------------------------------------------------------------------- 1 | 0001.png Is the answer to the arithmetic question in the image 225? Please answer yes or no. Yes 2 | 0001.png Is the answer to the arithmetic question in the image 1515? Please answer yes or no. No 3 | 0002.png Is the answer to the arithmetic question in the image 340? Please answer yes or no. Yes 4 | 0002.png Is the answer to the arithmetic question in the image 17? Please answer yes or no. No 5 | 0003.png Is the answer to the arithmetic question in the image 65? Please answer yes or no. 
Yes 6 | 0003.png Is the answer to the arithmetic question in the image 56? Please answer yes or no. No 7 | 0004.png Is the answer to the arithmetic question in the image 33? Please answer yes or no. Yes 8 | 0004.png Is the answer to the arithmetic question in the image 32? Please answer yes or no. No 9 | 0005.png Is the area of the square in the picture equal to 40? Please answer yes or no. Yes 10 | 0005.png Is the area of the square in the picture equal to 8? Please answer yes or no. No 11 | 0006.png Is the area of the square in the picture equal to 9? Please answer yes or no. Yes 12 | 0006.png Is the area of the square in the picture equal to 3? Please answer yes or no. No 13 | 0007.png Is the answer to the arithmetic question in the image 49? Please answer yes or no. Yes 14 | 0007.png Is the answer to the arithmetic question in the image 39? Please answer yes or no. No 15 | 0008.png Should the value of "a" in the picture equal 7? Please answer yes or no. Yes 16 | 0008.png Should the value of "a" in the picture equal 14? Please answer yes or no. No 17 | 0009.png Should the value of "a" in the picture equal 2? Please answer yes or no. Yes 18 | 0009.png Should the value of "a" in the picture equal 3? Please answer yes or no. No 19 | 0010.png Is the answer to the arithmetic question in the image 13? Please answer yes or no. Yes 20 | 0010.png Is the answer to the arithmetic question in the image 12? Please answer yes or no. No 21 | 0011.png Is the area of the parallelogram in the picture equal to 24? Please answer yes or no. Yes 22 | 0011.png Is the area of the parallelogram in the picture equal to 6? Please answer yes or no. No 23 | 0012.png Should the value of "a" in the picture equal 9? Please answer yes or no. Yes 24 | 0012.png Should the value of "a" in the picture equal 1? Please answer yes or no. No 25 | 0013.png Is the area of the right triangle in the picture equal to 24? Please answer yes or no. Yes 26 | 0013.png Is the area of the right triangle in the picture equal to 8? Please answer yes or no. No 27 | 0014.png Is the answer to the arithmetic question in the image 200? Please answer yes or no. Yes 28 | 0014.png Is the answer to the arithmetic question in the image 400? Please answer yes or no. No 29 | 0015.png Is the answer to the arithmetic question in the image 11? Please answer yes or no. Yes 30 | 0015.png Is the answer to the arithmetic question in the image 111? Please answer yes or no. No 31 | 0016.png Is the answer to the arithmetic question in the image 9? Please answer yes or no. Yes 32 | 0016.png Is the answer to the arithmetic question in the image 16? Please answer yes or no. No 33 | 0017.png Is the answer to the arithmetic question in the image 14? Please answer yes or no. Yes 34 | 0017.png Is the answer to the arithmetic question in the image 83? Please answer yes or no. No 35 | 0018.png Should the value of "a" in the picture equal 3? Please answer yes or no. Yes 36 | 0018.png Should the value of "a" in the picture equal 2? Please answer yes or no. No 37 | 0019.png Is the answer to the arithmetic question in the image 18? Please answer yes or no. Yes 38 | 0019.png Is the answer to the arithmetic question in the image 36? Please answer yes or no. No 39 | 0020.png Is the answer to the arithmetic question in the image 9? Please answer yes or no. Yes 40 | 0020.png Is the answer to the arithmetic question in the image 45? Please answer yes or no. 
No 41 | -------------------------------------------------------------------------------- /eval/vlm/eval/mme/Your_Results/code_reasoning.txt: -------------------------------------------------------------------------------- 1 | 0001.png The image shows a python code. Is the output of the code 'Hello'? Please answer yes or no. Yes 2 | 0001.png The image shows a python code. Is the output of the code 'World'? Please answer yes or no. No 3 | 0002.png The image shows a python code. Is the output of the code 'a cat'? Please answer yes or no. Yes 4 | 0002.png The image shows a python code. Is the output of the code 'a dog'? Please answer yes or no. No 5 | 0003.png The image shows a python code. Is the output of the code '12'? Please answer yes or no. Yes 6 | 0003.png The image shows a python code. Is the output of the code '5'? Please answer yes or no. No 7 | 0004.png The image shows a python code. Is the output of the code '3'? Please answer yes or no. Yes 8 | 0004.png The image shows a python code. Is the output of the code '2'? Please answer yes or no. No 9 | 0005.png The image shows a python code. Is the output of the code '12'? Please answer yes or no. Yes 10 | 0005.png The image shows a python code. Is the output of the code '5'? Please answer yes or no. No 11 | 0006.png The image shows a python code. Is the output of the code '0'? Please answer yes or no. Yes 12 | 0006.png The image shows a python code. Is the output of the code '1'? Please answer yes or no. No 13 | 0007.png Is a c++ code shown in the picture? Please answer yes or no. Yes 14 | 0007.png Is a python code shown in the picture? Please answer yes or no. No 15 | 0008.png The image shows a python code. Is the output of the code '1234'? Please answer yes or no. Yes 16 | 0008.png The image shows a python code. Is the output of the code '12345'? Please answer yes or no. No 17 | 0009.png The image shows a python code. Is the output of the code '36'? Please answer yes or no. Yes 18 | 0009.png The image shows a python code. Is the output of the code '6'? Please answer yes or no. No 19 | 0010.png The image shows a python code. Is the output of the code '1'? Please answer yes or no. Yes 20 | 0010.png The image shows a python code. Is the output of the code '5'? Please answer yes or no. No 21 | 0011.png The image shows a python code. Is the output of the code '0'? Please answer yes or no. Yes 22 | 0011.png The image shows a python code. Is the output of the code '1'? Please answer yes or no. No 23 | 0012.png The image shows a python code. Is the output of the code 'working hard'? Please answer yes or no. Yes 24 | 0012.png The image shows a python code. Is the output of the code 'playing hard'? Please answer yes or no. No 25 | 0013.png The image shows a python code. Is the output of the code 'a cat'? Please answer yes or no. Yes 26 | 0013.png The image shows a python code. Is the output of the code 'a dog'? Please answer yes or no. No 27 | 0014.png The image shows a python code. Is the output of the code '7'? Please answer yes or no. Yes 28 | 0014.png The image shows a python code. Is the output of the code '1'? Please answer yes or no. No 29 | 0015.png The image shows a python code. Is the output of the code '11'? Please answer yes or no. Yes 30 | 0015.png The image shows a python code. Is the output of the code '9'? Please answer yes or no. No 31 | 0016.png The image shows a python code. Is the output of the code 'x is smaller than 10'? Please answer yes or no. Yes 32 | 0016.png The image shows a python code. 
Is the output of the code 'x is larger than 10'? Please answer yes or no. No 33 | 0017.png The image shows a python code. Will the number 3 appear in the output of the code? Please answer yes or no. Yes 34 | 0017.png The image shows a python code. Will the number 6 appear in the output of the code? Please answer yes or no. No 35 | 0018.png The image shows a python code. Is the output of the code '11'? Please answer yes or no. Yes 36 | 0018.png The image shows a python code. Is the output of the code '12'? Please answer yes or no. No 37 | 0019.png The image shows a python code. Is the output of the code 'the list has more than 2 numbers'? Please answer yes or no. Yes 38 | 0019.png The image shows a python code. Is the output of the code 'the list has less than 2 numbers'? Please answer yes or no. No 39 | 0020.png Is a python code shown in the picture? Please answer yes or no. Yes 40 | 0020.png Is a c++ code shown in the picture? Please answer yes or no. No 41 | -------------------------------------------------------------------------------- /eval/vlm/eval/mmvet/evaluate_mmvet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 11 | 12 | import argparse 13 | import json 14 | import os 15 | import random 16 | 17 | import torch 18 | from eval.vlm.utils import load_model_and_tokenizer, build_transform, process_conversation 19 | from PIL import Image 20 | from tqdm import tqdm 21 | 22 | ds_collections = { 23 | 'mmvet': { 24 | 'root': 'eval/vlm/data/mm-vet/images', 25 | 'question': 'eval/vlm/data/mm-vet/llava-mm-vet.jsonl', 26 | 'metric': None, 27 | 'max_new_tokens': 1000, 28 | 'min_new_tokens': 1, 29 | } 30 | } 31 | 32 | 33 | class VQADataset(torch.utils.data.Dataset): 34 | 35 | def __init__(self, root, data, prompt): 36 | self.root = root 37 | self.data = open(data).readlines() 38 | self.prompt = prompt 39 | 40 | def __len__(self): 41 | return len(self.data) 42 | 43 | def __getitem__(self, idx): 44 | data = json.loads(self.data[idx].strip()) 45 | image, question, question_id, annotation = data['image'], data[ 46 | 'text'], data['question_id'], data.get('answer', None) 47 | 48 | image = os.path.join(self.root, image) 49 | image = Image.open(image).convert('RGB') 50 | images = [image] 51 | 52 | question = question + ' ' + self.prompt 53 | 54 | images, conversation = process_conversation(images, question) 55 | 56 | return question_id, question, images, conversation, annotation 57 | 58 | 59 | def evaluate_chat_model(): 60 | random.seed(args.seed) 61 | prompt = '' 62 | 63 | for ds_name in args.datasets: 64 | dataset = VQADataset( 65 | root=ds_collections[ds_name]['root'], 66 | data=ds_collections[ds_name]['question'], 67 | prompt=prompt, 68 | ) 69 | 70 | outputs = {} 71 | for _, (question_id, question, images, conversation, annotations) in tqdm(enumerate(dataset)): 72 | pred = model.chat( 73 | tokenizer, 74 | new_token_ids, 75 | image_transform, 76 | images=images, 77 | prompt=conversation, 78 | max_length=ds_collections[ds_name]['max_new_tokens'], # TODO: how to use ds_collections[ds_name]['min_new_tokens'] 79 | ) 80 | 81 | 
outputs[f'v1_{question_id}'] = pred 82 | 83 | print(f'Evaluating {ds_name} ...') 84 | results_file = os.path.join(args.out_dir, 'results.json') 85 | json.dump(outputs, open(results_file, 'w')) 86 | print('Results saved to {}'.format(results_file)) 87 | 88 | 89 | if __name__ == '__main__': 90 | parser = argparse.ArgumentParser() 91 | parser.add_argument('--datasets', type=str, default='mmvet') 92 | parser.add_argument('--batch-size', type=int, default=1) 93 | parser.add_argument('--num-workers', type=int, default=1) 94 | parser.add_argument('--out-dir', type=str, default='results') 95 | parser.add_argument('--seed', type=int, default=0) 96 | parser.add_argument('--model-path', type=str, default='hf/BAGEL-7B-MoT/') 97 | args = parser.parse_args() 98 | 99 | if not os.path.exists(args.out_dir): 100 | os.makedirs(args.out_dir, exist_ok=True) 101 | 102 | args.datasets = args.datasets.split(',') 103 | print('datasets:', args.datasets) 104 | assert args.batch_size == 1, 'Only batch size 1 is supported' 105 | 106 | model, tokenizer, new_token_ids = load_model_and_tokenizer(args) 107 | image_transform = build_transform() 108 | 109 | total_params = sum(p.numel() for p in model.parameters()) / 1e9 110 | print(f'[test] total_params: {total_params}B') 111 | 112 | evaluate_chat_model() 113 | -------------------------------------------------------------------------------- /eval/gen/gedit/viescore/mllm_tools/qwen25vl_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import time 4 | from PIL import Image 5 | from typing import List 6 | from transformers import AutoModel, AutoTokenizer 7 | from transformers.utils import is_flash_attn_2_available 8 | from transformers import Qwen2_5_VLForConditionalGeneration 9 | from qwen_vl_utils import process_vision_info 10 | from transformers import AutoProcessor 11 | import requests 12 | from io import BytesIO 13 | import random 14 | import numpy as np 15 | import base64 16 | import magic 17 | import megfile 18 | 19 | def process_image(image): 20 | img_byte_arr = BytesIO() 21 | image.save(img_byte_arr, format='PNG') 22 | img_byte_arr = img_byte_arr.getvalue() 23 | return img_byte_arr 24 | 25 | def convert_image_to_base64(file_content): 26 | mime_type = magic.from_buffer(file_content, mime=True) 27 | base64_encoded_data = base64.b64encode(file_content).decode('utf-8') 28 | return f"data:{mime_type};base64,{base64_encoded_data}" 29 | 30 | 31 | def set_seed(seed: int): 32 | """ 33 | Args: 34 | Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch`. 35 | seed (`int`): The seed to set. 
36 | """ 37 | random.seed(seed) 38 | np.random.seed(seed) 39 | torch.manual_seed(seed) 40 | torch.cuda.manual_seed_all(seed) 41 | 42 | class Qwen25VL(): 43 | def __init__(self) -> None: 44 | attn_implementation = "flash_attention_2" if is_flash_attn_2_available() else None 45 | self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained( 46 | "/mnt/jfs-test/pretrained_models/Qwen2.5-VL-72B-Instruct-AWQ", 47 | torch_dtype=torch.float16, 48 | device_map="auto" 49 | ).eval() 50 | self.processor = AutoProcessor.from_pretrained("/mnt/jfs-test/pretrained_models/Qwen2.5-VL-72B-Instruct-AWQ") 51 | 52 | print(f"Using {attn_implementation} for attention implementation") 53 | 54 | def prepare_prompt(self, image_links: List = [], text_prompt: str = ""): 55 | if not isinstance(image_links, list): 56 | image_links = [image_links] 57 | 58 | image_links_base64 = [] 59 | 60 | for img_link in image_links: 61 | if type(img_link) == str: 62 | image_links_base64.append(convert_image_to_base64(process_image(megfile.smart_open(img_link, 'rb')))) 63 | else: 64 | image_links_base64.append(convert_image_to_base64(process_image(img_link))) 65 | 66 | messages = [ 67 | { 68 | "role": "user", 69 | "content": [ 70 | {"type": "image", "image": img_link} for img_link in image_links_base64 71 | ] + [{"type": "text", "text": text_prompt}] 72 | } 73 | ] 74 | return messages 75 | 76 | def get_parsed_output(self, messages): 77 | set_seed(42) 78 | # Prepare the inputs 79 | text = self.processor.apply_chat_template( 80 | messages, tokenize=False, add_generation_prompt=True 81 | ) 82 | image_inputs, video_inputs = process_vision_info(messages) 83 | 84 | # Process inputs 85 | inputs = self.processor( 86 | text=[text], 87 | images=image_inputs, 88 | videos=video_inputs, 89 | padding=True, 90 | return_tensors="pt" 91 | ) 92 | inputs = inputs.to("cuda") 93 | 94 | # Generate output 95 | generation_config = { 96 | "max_new_tokens": 512, 97 | "num_beams": 1, 98 | "do_sample": False, 99 | "temperature": 0.1, 100 | "top_p": None, 101 | } 102 | generated_ids = self.model.generate(**inputs, **generation_config) 103 | generated_ids_trimmed = [ 104 | out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) 105 | ] 106 | output_text = self.processor.batch_decode( 107 | generated_ids_trimmed, 108 | skip_special_tokens=True, 109 | clean_up_tokenization_spaces=False 110 | ) 111 | 112 | return output_text[0] if output_text else "" 113 | 114 | if __name__ == "__main__": 115 | model = Qwen25VL() 116 | prompt = model.prepare_prompt( 117 | ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"], 118 | 'Describe the image in detail.' 119 | ) 120 | res = model.get_parsed_output(prompt) 121 | print("result : \n", res) -------------------------------------------------------------------------------- /eval/vlm/eval/pope/eval_pope.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 
11 | 12 | import argparse 13 | import json 14 | import os 15 | import numpy as np 16 | 17 | 18 | def eval_pope(answers, label_file): 19 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 20 | 21 | for answer in answers: 22 | text = answer['text'] 23 | 24 | # Only keep the first sentence 25 | if text.find('.') != -1: 26 | text = text.split('.')[0] 27 | 28 | text = text.replace(',', '') 29 | words = text.split(' ') 30 | if 'No' in words or 'not' in words or 'no' in words: 31 | answer['text'] = 'no' 32 | else: 33 | answer['text'] = 'yes' 34 | 35 | for i in range(len(label_list)): 36 | if label_list[i] == 'no': 37 | label_list[i] = 0 38 | else: 39 | label_list[i] = 1 40 | 41 | pred_list = [] 42 | for answer in answers: 43 | if answer['text'] == 'no': 44 | pred_list.append(0) 45 | else: 46 | pred_list.append(1) 47 | 48 | pos = 1 49 | neg = 0 50 | yes_ratio = pred_list.count(1) / len(pred_list) 51 | 52 | TP, TN, FP, FN = 0, 0, 0, 0 53 | for pred, label in zip(pred_list, label_list): 54 | if pred == pos and label == pos: 55 | TP += 1 56 | elif pred == pos and label == neg: 57 | FP += 1 58 | elif pred == neg and label == neg: 59 | TN += 1 60 | elif pred == neg and label == pos: 61 | FN += 1 62 | 63 | ret_message = "" 64 | print('TP\tFP\tTN\tFN\t') 65 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 66 | ret_message += 'TP\tFP\tTN\tFN\t\n' 67 | ret_message += '{}\t{}\t{}\t{}\n'.format(TP, FP, TN, FN) 68 | 69 | precision = float(TP) / float(TP + FP) 70 | recall = float(TP) / float(TP + FN) 71 | f1 = 2 * precision * recall / (precision + recall) 72 | acc = (TP + TN) / (TP + TN + FP + FN) 73 | print('Accuracy: {}'.format(acc)) 74 | print('Precision: {}'.format(precision)) 75 | print('Recall: {}'.format(recall)) 76 | print('F1 score: {}'.format(f1)) 77 | print('Yes ratio: {}'.format(yes_ratio)) 78 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio)) 79 | 80 | ret_message += 'Accuracy: {}\n'.format(acc) 81 | ret_message += 'Precision: {}\n'.format(precision) 82 | ret_message += 'Recall: {}\n'.format(recall) 83 | ret_message += 'F1 score: {}\n'.format(f1) 84 | ret_message += 'Yes ratio: {}\n'.format(yes_ratio) 85 | ret_message += '%.3f, %.3f, %.3f, %.3f, %.3f\n' % (f1, acc, precision, recall, yes_ratio) 86 | return f1, ret_message 87 | 88 | 89 | if __name__ == '__main__': 90 | parser = argparse.ArgumentParser() 91 | parser.add_argument('--annotation-dir', type=str) 92 | parser.add_argument('--question-file', type=str) 93 | parser.add_argument('--result-file', type=str) 94 | parser.add_argument('--out-dir', type=str) 95 | args = parser.parse_args() 96 | 97 | questions = [json.loads(line) for line in open(args.question_file)] 98 | questions = {question['question_id']: question for question in questions} 99 | answers = json.loads(open(args.result_file).read()) 100 | avg_f1 = [] 101 | ret_message = "" 102 | for file in os.listdir(args.annotation_dir): 103 | assert file.startswith('coco_pope_') 104 | assert file.endswith('.json') 105 | category = file[10:-5] 106 | cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category] 107 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 108 | ret_message += 'Category: {}, # samples: {}\n'.format(category, len(cur_answers)) 109 | f1, ret = eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) 110 | ret_message += ret 111 | print('====================================') 112 | ret_message += '====================================\n' 113 | avg_f1.append(f1) 114 | 
print(f"Avg F1 score: {np.array(avg_f1).mean()}") 115 | ret_message += f"Avg F1 score: {np.array(avg_f1).mean()}\n" 116 | 117 | writer = open(os.path.join(args.out_dir, "results.txt"), 'w') 118 | print(f"write results to file {os.path.join(args.out_dir, 'results.txt')}") 119 | writer.write(ret_message) 120 | writer.close() 121 | -------------------------------------------------------------------------------- /eval/vlm/eval/mme/Your_Results/existence.txt: -------------------------------------------------------------------------------- 1 | 000000006040.jpg Is there a train in this image? Please answer yes or no. Yes 2 | 000000006040.jpg Is there a bed in this image? Please answer yes or no. No 3 | 000000006471.jpg Is there a baseball bat in this image? Please answer yes or no. Yes 4 | 000000006471.jpg Is there a giraffe in this image? Please answer yes or no. No 5 | 000000007108.jpg Is there a elephant in this image? Please answer yes or no. Yes 6 | 000000007108.jpg Is there a hair drier in this image? Please answer yes or no. No 7 | 000000007816.jpg Is there a motorcycle in this image? Please answer yes or no. Yes 8 | 000000007816.jpg Is there a airplane in this image? Please answer yes or no. No 9 | 000000007977.jpg Is there a skateboard in this image? Please answer yes or no. Yes 10 | 000000007977.jpg Is there a spoon in this image? Please answer yes or no. No 11 | 000000008844.jpg Is there a person in this image? Please answer yes or no. Yes 12 | 000000008844.jpg Is there a sink in this image? Please answer yes or no. No 13 | 000000009590.jpg Is there a bottle in this image? Please answer yes or no. Yes 14 | 000000009590.jpg Is there a scissors in this image? Please answer yes or no. No 15 | 000000010363.jpg Is there a bottle in this image? Please answer yes or no. Yes 16 | 000000010363.jpg Is there a apple in this image? Please answer yes or no. No 17 | 000000011197.jpg Is there a car in this image? Please answer yes or no. Yes 18 | 000000011197.jpg Is there a fork in this image? Please answer yes or no. No 19 | 000000015254.jpg Is there a spoon in this image? Please answer yes or no. Yes 20 | 000000015254.jpg Is there a donut in this image? Please answer yes or no. No 21 | 000000015517.jpg Is there a bus in this image? Please answer yes or no. Yes 22 | 000000015517.jpg Is there a cow in this image? Please answer yes or no. No 23 | 000000015746.jpg Is there a fire hydrant in this image? Please answer yes or no. Yes 24 | 000000015746.jpg Is there a person in this image? Please answer yes or no. No 25 | 000000037751.jpg Is there a backpack in this image? Please answer yes or no. Yes 26 | 000000037751.jpg Is there a microwave in this image? Please answer yes or no. No 27 | 000000050145.jpg Is there a bicycle in this image? Please answer yes or no. Yes 28 | 000000050145.jpg Is there a apple in this image? Please answer yes or no. No 29 | 000000061418.jpg Is there a chair in this image? Please answer yes or no. Yes 30 | 000000061418.jpg Is there a airplane in this image? Please answer yes or no. No 31 | 000000417779.jpg Is there a car in this image? Please answer yes or no. Yes 32 | 000000417779.jpg Is there a kite in this image? Please answer yes or no. No 33 | 000000424521.jpg Is there a skateboard in this image? Please answer yes or no. Yes 34 | 000000424521.jpg Is there a banana in this image? Please answer yes or no. No 35 | 000000438304.jpg Is there a sports ball in this image? Please answer yes or no. Yes 36 | 000000438304.jpg Is there a horse in this image? Please answer yes or no. 
No 37 | 000000494427.jpg Is there a laptop in this image? Please answer yes or no. Yes 38 | 000000494427.jpg Is there a potted plant in this image? Please answer yes or no. No 39 | 000000495448.jpg Is there a cake in this image? Please answer yes or no. Yes 40 | 000000495448.jpg Is there a tie in this image? Please answer yes or no. No 41 | 000000498463.jpg Is there a refrigerator in this image? Please answer yes or no. Yes 42 | 000000498463.jpg Is there a donut in this image? Please answer yes or no. No 43 | 000000519039.jpg Is there a truck in this image? Please answer yes or no. Yes 44 | 000000519039.jpg Is there a book in this image? Please answer yes or no. No 45 | 000000523241.jpg Is there a car in this image? Please answer yes or no. Yes 46 | 000000523241.jpg Is there a cell phone in this image? Please answer yes or no. No 47 | 000000530162.jpg Is there a umbrella in this image? Please answer yes or no. Yes 48 | 000000530162.jpg Is there a horse in this image? Please answer yes or no. No 49 | 000000537812.jpg Is there a chair in this image? Please answer yes or no. Yes 50 | 000000537812.jpg Is there a baseball bat in this image? Please answer yes or no. No 51 | 000000541952.jpg Is there a clock in this image? Please answer yes or no. Yes 52 | 000000541952.jpg Is there a bottle in this image? Please answer yes or no. No 53 | 000000546626.jpg Is there a bottle in this image? Please answer yes or no. Yes 54 | 000000546626.jpg Is there a mouse in this image? Please answer yes or no. No 55 | 000000556000.jpg Is there a chair in this image? Please answer yes or no. Yes 56 | 000000556000.jpg Is there a dog in this image? Please answer yes or no. No 57 | 000000557258.jpg Is there a toilet in this image? Please answer yes or no. Yes 58 | 000000557258.jpg Is there a pizza in this image? Please answer yes or no. No 59 | 000000572956.jpg Is there a motorcycle in this image? Please answer yes or no. Yes 60 | 000000572956.jpg Is there a bus in this image? Please answer yes or no. 
No 61 | -------------------------------------------------------------------------------- /eval/gen/gedit/viescore/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, 'viescore') 3 | 4 | from utils import ( 5 | mllm_output_to_dict 6 | ) 7 | import math 8 | import vie_prompts 9 | 10 | class VIEScore: 11 | def __init__(self, backbone="gpt4o", task="t2i", key_path=None, azure_endpoint='') -> None: 12 | self.task = task 13 | self.backbone_name = backbone 14 | 15 | if self.task not in ["t2i", "tie", "t2v"]: 16 | raise ValueError("task must be either 't2i' or 'tie'") 17 | 18 | if self.backbone_name == "gpt4o": 19 | from mllm_tools.openai import GPT4o 20 | self.model = GPT4o(key_path, model_name="gpt-4.1-2025-04-14", azure_endpoint=azure_endpoint) 21 | elif self.backbone_name == "qwen25vl": 22 | from mllm_tools.qwen25vl_eval import Qwen25VL 23 | self.model = Qwen25VL() 24 | else: 25 | raise NotImplementedError("backbone not supported") 26 | self.context = vie_prompts._context_no_delimit 27 | if self.task == "t2i": 28 | self.SC_prompt = "\n".join([self.context, vie_prompts._prompts_0shot_one_image_gen_rule, vie_prompts._prompts_0shot_t2i_rule_SC]) 29 | self.PQ_prompt = "\n".join([self.context, vie_prompts._prompts_0shot_rule_PQ]) 30 | elif self.task == "tie": 31 | self.SC_prompt = "\n".join([self.context, vie_prompts._prompts_0shot_two_image_edit_rule, vie_prompts._prompts_0shot_tie_rule_SC]) 32 | self.PQ_prompt = "\n".join([self.context, vie_prompts._prompts_0shot_rule_PQ]) 33 | elif self.task == "t2v": 34 | self.SC_prompt = "\n".join([self.context, vie_prompts._prompts_0shot_one_video_gen_rule, vie_prompts._prompts_0shot_t2v_rule_SC]) 35 | self.PQ_prompt = "\n".join([self.context, vie_prompts._prompts_0shot_t2v_rule_PQ]) 36 | 37 | def evaluate(self, image_prompts, text_prompt, extract_overall_score_only=False, extract_all_score=True, echo_output=False): 38 | if not isinstance(image_prompts, list): 39 | image_prompts = [image_prompts] 40 | if self.backbone_name in ['gpt4o', 'gpt4v']: 41 | self.model.use_encode = False if isinstance(image_prompts[0], str) else True 42 | #print("Using encode:", self.model.use_encode) 43 | if self.task == "t2i": 44 | _SC_prompt = self.SC_prompt.replace("", text_prompt) 45 | elif self.task == "tie": 46 | _SC_prompt = self.SC_prompt.replace("", text_prompt) 47 | elif self.task == "t2v": 48 | _SC_prompt = self.SC_prompt.replace("", text_prompt) 49 | SC_prompt_final = self.model.prepare_prompt(image_prompts, _SC_prompt) 50 | if self.task == "tie": 51 | PQ_prompt_final = self.model.prepare_prompt(image_prompts[-1], self.PQ_prompt) 52 | else: 53 | PQ_prompt_final = self.model.prepare_prompt(image_prompts, self.PQ_prompt) 54 | 55 | results_dict = {} 56 | 57 | SC_dict = False 58 | PQ_dict = False 59 | tries = 0 60 | max_tries = 1 61 | while SC_dict is False or PQ_dict is False: 62 | tries += 1 63 | guess_if_cannot_parse = True if tries > max_tries else False 64 | result_SC = self.model.get_parsed_output(SC_prompt_final) 65 | result_PQ = self.model.get_parsed_output(PQ_prompt_final) 66 | SC_dict = mllm_output_to_dict(result_SC, give_up_parsing=guess_if_cannot_parse) 67 | PQ_dict = mllm_output_to_dict(result_PQ, give_up_parsing=guess_if_cannot_parse) 68 | 69 | if SC_dict == "rate_limit_exceeded" or PQ_dict == "rate_limit_exceeded": 70 | print("rate_limit_exceeded") 71 | raise ValueError("rate_limit_exceeded") 72 | results_dict['SC'] = SC_dict 73 | results_dict['PQ'] = PQ_dict 74 | if echo_output: 
75 | print("results_dict", results_dict) 76 | if extract_all_score: 77 | SC_score = min(results_dict['SC']['score']) 78 | PQ_score = min(results_dict['PQ']['score']) 79 | O_score = math.sqrt(SC_score * PQ_score) 80 | return [SC_score, PQ_score, O_score] 81 | if extract_overall_score_only: 82 | SC_scores = results_dict['SC']['score'] 83 | PQ_scores = results_dict['PQ']['score'] 84 | O_score = math.sqrt(min(SC_scores) * min(PQ_scores)) 85 | return O_score 86 | return results_dict 87 | 88 | if __name__ == "__main__": 89 | model = VIEScore(backbone="gemini", task="t2i") 90 | from datasets import load_dataset 91 | dataset = load_dataset("TIGER-Lab/GenAI-Arena-Bench", "image_generation") 92 | dataset = dataset["test"] 93 | print("Now running the VIEScore model") 94 | for idx in range(5): 95 | left_image = dataset['left_image'][idx] 96 | right_image = dataset['right_image'][idx] 97 | prompt = dataset['prompt'][idx] 98 | print(model.evaluate(left_image, prompt, extract_all_score=True)) 99 | print(model.evaluate(right_image, prompt, extract_all_score=True)) 100 | 101 | -------------------------------------------------------------------------------- /modeling/qwen2/tokenization_qwen2_fast.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The Qwen Team and The HuggingFace Inc. team. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | """Tokenization classes for Qwen2.""" 5 | 6 | from typing import Optional, Tuple 7 | 8 | from transformers.tokenization_utils import AddedToken 9 | from transformers.tokenization_utils_fast import PreTrainedTokenizerFast 10 | from transformers.utils import logging 11 | from .tokenization_qwen2 import Qwen2Tokenizer 12 | 13 | 14 | logger = logging.get_logger(__name__) 15 | 16 | VOCAB_FILES_NAMES = { 17 | "vocab_file": "vocab.json", 18 | "merges_file": "merges.txt", 19 | "tokenizer_file": "tokenizer.json", 20 | } 21 | 22 | 23 | MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768} 24 | 25 | 26 | class Qwen2TokenizerFast(PreTrainedTokenizerFast): 27 | """ 28 | Construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level 29 | Byte-Pair-Encoding. 30 | 31 | Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will 32 | be encoded differently whether it is at the beginning of the sentence (without space) or not: 33 | 34 | ```python 35 | >>> from transformers import Qwen2TokenizerFast 36 | 37 | >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer") 38 | >>> tokenizer("Hello world")["input_ids"] 39 | [9707, 1879] 40 | 41 | >>> tokenizer(" Hello world")["input_ids"] 42 | [21927, 1879] 43 | ``` 44 | This is expected. 45 | 46 | This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should 47 | refer to this superclass for more information regarding those methods. 48 | 49 | Args: 50 | vocab_file (`str`, *optional*): 51 | Path to the vocabulary file. 52 | merges_file (`str`, *optional*): 53 | Path to the merges file. 54 | tokenizer_file (`str`, *optional*): 55 | Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that 56 | contains everything needed to load the tokenizer. 57 | unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 58 | The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this 59 | token instead. Not applicable to this tokenizer. 
60 | bos_token (`str`, *optional*): 61 | The beginning of sequence token. Not applicable for this tokenizer. 62 | eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 63 | The end of sequence token. 64 | pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 65 | The token used for padding, for example when batching sequences of different lengths. 66 | """ 67 | 68 | vocab_files_names = VOCAB_FILES_NAMES 69 | model_input_names = ["input_ids", "attention_mask"] 70 | slow_tokenizer_class = Qwen2Tokenizer 71 | 72 | def __init__( 73 | self, 74 | vocab_file=None, 75 | merges_file=None, 76 | tokenizer_file=None, 77 | unk_token="<|endoftext|>", 78 | bos_token=None, 79 | eos_token="<|endoftext|>", 80 | pad_token="<|endoftext|>", 81 | **kwargs, 82 | ): 83 | # We need to at least pass vocab_file and merges_file to base class 84 | # in case a slow tokenizer needs to be initialized; other can be 85 | # configured through files. 86 | # following GPT2TokenizerFast, also adding unk_token, bos_token, and eos_token 87 | 88 | bos_token = ( 89 | AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False) 90 | if isinstance(bos_token, str) 91 | else bos_token 92 | ) 93 | eos_token = ( 94 | AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False) 95 | if isinstance(eos_token, str) 96 | else eos_token 97 | ) 98 | unk_token = ( 99 | AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False) 100 | if isinstance(unk_token, str) 101 | else unk_token 102 | ) 103 | pad_token = ( 104 | AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False) 105 | if isinstance(pad_token, str) 106 | else pad_token 107 | ) 108 | 109 | super().__init__( 110 | vocab_file=vocab_file, 111 | merges_file=merges_file, 112 | tokenizer_file=tokenizer_file, 113 | unk_token=unk_token, 114 | bos_token=bos_token, 115 | eos_token=eos_token, 116 | pad_token=pad_token, 117 | **kwargs, 118 | ) 119 | 120 | # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast.save_vocabulary 121 | def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: 122 | files = self._tokenizer.model.save(save_directory, name=filename_prefix) 123 | return tuple(files) 124 | -------------------------------------------------------------------------------- /eval/vlm/eval/mme/Your_Results/color.txt: -------------------------------------------------------------------------------- 1 | 000000006723.jpg Is there a red brick building in the image? Please answer yes or no. Yes 2 | 000000006723.jpg Is there a yellow brick building in the image? Please answer yes or no. No 3 | 000000008277.jpg Is there a white plate in the image? Please answer yes or no. Yes 4 | 000000008277.jpg Is there a yellow plate in the image? Please answer yes or no. No 5 | 000000012120.jpg Is there a blue court in the image? Please answer yes or no. Yes 6 | 000000012120.jpg Is there a purple court in the image? Please answer yes or no. No 7 | 000000014831.jpg Is there a brown and white animal in the image? Please answer yes or no. Yes 8 | 000000014831.jpg Is there a green and red animal in the image? Please answer yes or no. No 9 | 000000028993.jpg Are there yellow poles in the image? Please answer yes or no. Yes 10 | 000000028993.jpg Are there blue poles in the image? Please answer yes or no. No 11 | 000000029393.jpg Is there a brown dog in the image? Please answer yes or no. Yes 12 | 000000029393.jpg Is there a black dog in the image? 
Please answer yes or no. No 13 | 000000035770.jpg Is there a black and white toilet in the image? Please answer yes or no. Yes 14 | 000000035770.jpg Is there a red and white toilet in the image? Please answer yes or no. No 15 | 000000038118.jpg Is there a red coat in the image? Please answer yes or no. Yes 16 | 000000038118.jpg Is there a yellow coat in the image? Please answer yes or no. No 17 | 000000047112.jpg Is there a white plate in the image? Please answer yes or no. Yes 18 | 000000047112.jpg Is there a yellow plate in the image? Please answer yes or no. No 19 | 000000047121.jpg Is there a black cat in the image? Please answer yes or no. Yes 20 | 000000047121.jpg Is there a brown cat in the image? Please answer yes or no. No 21 | 000000053529.jpg Is there a green hat in the image? Please answer yes or no. Yes 22 | 000000053529.jpg Is there a red hat in the image? Please answer yes or no. No 23 | 000000053994.jpg Is there a gray wall in the image? Please answer yes or no. Yes 24 | 000000053994.jpg Is there a red wall in the image? Please answer yes or no. No 25 | 000000055072.jpg Is there a brown giraffe in the image? Please answer yes or no. Yes 26 | 000000055072.jpg Is there a black giraffe in the image? Please answer yes or no. No 27 | 000000057597.jpg Are there any red shoes in the image? Please answer yes or no. Yes 28 | 000000057597.jpg Are there any yellow shoes in the image? Please answer yes or no. No 29 | 000000061658.jpg Are there a white dish in the image? Please answer yes or no. Yes 30 | 000000061658.jpg Are there a green dish in the image? Please answer yes or no. No 31 | 000000338560.jpg Is there a blue and yellow fire hydrant in the image? Please answer yes or no. Yes 32 | 000000338560.jpg Is there a blue and orange fire hydrant in the image? Please answer yes or no. No 33 | 000000370208.jpg Is there a red bicycle with white handlebars in the image? Please answer yes or no. Yes 34 | 000000370208.jpg Is there a red bicycle with black handlebars in the image? Please answer yes or no. No 35 | 000000377723.jpg Is there a blue bus in the image? Please answer yes or no. Yes 36 | 000000377723.jpg Is there a orange bus in the image? Please answer yes or no. No 37 | 000000405205.jpg Is there a white bus in the image? Please answer yes or no. Yes 38 | 000000405205.jpg Is there a red bus in the image? Please answer yes or no. No 39 | 000000410612.jpg Is there a red boat in the image? Please answer yes or no. Yes 40 | 000000410612.jpg Is there a gray boat in the image? Please answer yes or no. No 41 | 000000427034.jpg Is there a brown and black dog in the image? Please answer yes or no. Yes 42 | 000000427034.jpg Is there a brown and white dog in the image? Please answer yes or no. No 43 | 000000442456.jpg Is there a man wearing a red shirt in the image? Please answer yes or no. Yes 44 | 000000442456.jpg Is there a man wearing a white shirt in the image? Please answer yes or no. No 45 | 000000492362.jpg Is there a skateboard with red wheels in the image? Please answer yes or no. Yes 46 | 000000492362.jpg Is there a skateboard with black wheels in the image? Please answer yes or no. No 47 | 000000492992.jpg Is there a white bird in the image? Please answer yes or no. Yes 48 | 000000492992.jpg Is there a yellow bird in the image? Please answer yes or no. No 49 | 000000512929.jpg Are there any green beans in the image? Please answer yes or no. Yes 50 | 000000512929.jpg Are there any orange beans in the image? Please answer yes or no. 
No 51 | 000000530457.jpg Are there any red flowers in the image? Please answer yes or no. Yes 52 | 000000530457.jpg Are there any green flowers in the image? Please answer yes or no. No 53 | 000000532761.jpg Is there a living room painted yellow in the image? Please answer yes or no. Yes 54 | 000000532761.jpg Is there a living room painted black in the image? Please answer yes or no. No 55 | 000000534041.jpg Is there a purple bottle in the image? Please answer yes or no. Yes 56 | 000000534041.jpg Is there a white bottle in the image? Please answer yes or no. No 57 | 000000563758.jpg Is there a red scarf in the image? Please answer yes or no. Yes 58 | 000000563758.jpg Is there a brown scarf in the image? Please answer yes or no. No 59 | 000000564280.jpg Is there a red couch in the image? Please answer yes or no. Yes 60 | 000000564280.jpg Is there a black couch in the image? Please answer yes or no. No 61 | -------------------------------------------------------------------------------- /eval/vlm/eval/mmmu/main_eval_only.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 11 | 12 | """Parse and Evalate""" 13 | import os 14 | import json 15 | from argparse import ArgumentParser 16 | 17 | from .data_utils import CAT_SHORT2LONG, DOMAIN_CAT2SUB_CAT, save_json 18 | from .eval_utils import calculate_ins_level_acc, evaluate, parse_open_response 19 | 20 | if __name__ == '__main__': 21 | 22 | parser = ArgumentParser() 23 | parser.add_argument('--output_path', type=str, default='./example_outputs/qwen_vl/total_val_output.json', 24 | help='The path to model output file.') 25 | parser.add_argument('--answer_path', type=str, default='./answer_dict_val.json', help='Answer file path.') 26 | parser.add_argument('--out-dir', type=str, default='results') 27 | args = parser.parse_args() 28 | 29 | output_dict = json.load(open(args.output_path)) 30 | answer_dict = json.load(open(args.answer_path)) 31 | 32 | # group by category 33 | output_dict_w_cat = {} 34 | for data_id, parsed_pred in output_dict.items(): 35 | category = '_'.join(data_id.split('_')[1:-1]) 36 | if category not in output_dict_w_cat: 37 | output_dict_w_cat.update({category: {}}) 38 | output_dict_w_cat[category].update({data_id: parsed_pred}) 39 | 40 | # group by category 41 | answer_dict_w_cat = {} 42 | for data_id, parsed_pred in answer_dict.items(): 43 | category = '_'.join(data_id.split('_')[1:-1]) 44 | if category not in answer_dict_w_cat: 45 | answer_dict_w_cat.update({category: {}}) 46 | answer_dict_w_cat[category].update({data_id: parsed_pred}) 47 | 48 | evaluation_result = {} 49 | 50 | for category in CAT_SHORT2LONG.values(): 51 | print('Evaluating: {}'.format(category)) 52 | # get cat_outputs and cat_answers 53 | try: 54 | cat_outputs = output_dict_w_cat[category] 55 | cat_answers = answer_dict_w_cat[category] 56 | except KeyError: 57 | print('Skipping {} for not found'.format(category)) 58 | continue 59 | 60 | exampels_to_eval = [] 61 | for data_id, parsed_pred in cat_outputs.items(): 62 | question_type = cat_answers[data_id]['question_type'] 63 | if 
question_type != 'multiple-choice': 64 | parsed_pred = parse_open_response(parsed_pred) # mainly for type consistency (make it number, etc.) 65 | else: 66 | parsed_pred = parsed_pred 67 | 68 | exampels_to_eval.append({ 69 | 'id': data_id, 70 | 'question_type': question_type, 71 | 'answer': cat_answers[data_id]['ground_truth'], 72 | 'parsed_pred': parsed_pred 73 | }) 74 | 75 | judge_dict, metric_dict = evaluate(exampels_to_eval) 76 | metric_dict.update({'num_example': len(exampels_to_eval)}) 77 | 78 | evaluation_result[category] = metric_dict 79 | 80 | printable_results = {} 81 | # pdb.set_trace() 82 | # add domain Subject 83 | for domain, in_domain_cats in DOMAIN_CAT2SUB_CAT.items(): 84 | in_domain_cat_results = {} 85 | for cat_name in in_domain_cats: # use the order in DOMAIN_CAT2SUB_CAT 86 | if cat_name in evaluation_result.keys(): 87 | in_domain_cat_results[cat_name] = evaluation_result[cat_name] 88 | else: 89 | pass 90 | in_domain_ins_acc = calculate_ins_level_acc(in_domain_cat_results) 91 | in_domain_data_num = sum([cat_results['num_example'] for cat_results in in_domain_cat_results.values()]) 92 | printable_results['Overall-' + domain] = {'num': int(in_domain_data_num), 93 | 'acc': round(in_domain_ins_acc, 3) 94 | } 95 | # add sub category 96 | for cat_name, cat_results in in_domain_cat_results.items(): 97 | printable_results[cat_name] = {'num': int(cat_results['num_example']), 98 | 'acc': round(cat_results['acc'], 3) 99 | } 100 | 101 | # table.append(["-----------------------------", "-----", "----"]) 102 | all_ins_acc = calculate_ins_level_acc(evaluation_result) 103 | printable_results['Overall'] = { 104 | 'num': sum([cat_results['num_example'] for cat_results in evaluation_result.values()]), 105 | 'acc': round(all_ins_acc, 3)} 106 | 107 | print(printable_results) 108 | writer = open(os.path.join(args.out_dir, "results.txt"), 'w') 109 | print(f"write results to file {os.path.join(args.out_dir, 'results.txt')}") 110 | for key, value in printable_results.items(): 111 | line = f'{key}: num={value["num"]}, acc={value["acc"]}\n' 112 | writer.write(line) 113 | writer.close() 114 | -------------------------------------------------------------------------------- /eval/vlm/eval/mme/Your_Results/count.txt: -------------------------------------------------------------------------------- 1 | 000000006040.jpg Is there a train in the picture? Please answer yes or no. Yes 2 | 000000006040.jpg Are there a total of two trains in the picture? Please answer yes or no. No 3 | 000000044279.jpg Is there a total of two people in the image? Please answer yes or no. Yes 4 | 000000044279.jpg Is there only one people in the image? Please answer yes or no. No 5 | 000000067213.jpg Is there only one dog in the image? Please answer yes or no. Yes 6 | 000000067213.jpg Is there two dogs in the image? Please answer yes or no. No 7 | 000000071226.jpg Is there a total of two dogs in the image? Please answer yes or no. Yes 8 | 000000071226.jpg Is there only one dogs in the image? Please answer yes or no. No 9 | 000000097994.jpg Are there three laptops in the picture? Please answer yes or no. Yes 10 | 000000097994.jpg Are there four laptops in the picture? Please answer yes or no. No 11 | 000000195918.jpg Is there a total of two display devices in the image? Please answer yes or no. Yes 12 | 000000195918.jpg Is there only one display device in the image? Please answer yes or no. No 13 | 000000236721.jpg Are there two bananas in the image? Please answer yes or no. 
Yes 14 | 000000236721.jpg Are there three bananas in the image? Please answer yes or no. No 15 | 000000261712.jpg Are there two giraffes in this image? Please answer yes or no. Yes 16 | 000000261712.jpg Are there three giraffes in this picture? Please answer yes or no. No 17 | 000000274066.jpg Are there four people appear in this image? Please answer yes or no. Yes 18 | 000000274066.jpg Are there only three people appear in this image? Please answer yes or no. No 19 | 000000276434.jpg Is there a total of three cakes in this image? Please answer yes or no. Yes 20 | 000000276434.jpg Are there only two cakes in this image? Please answer yes or no. No 21 | 000000289059.jpg Is there a total of two person appear in the image? Please answer yes or no. Yes 22 | 000000289059.jpg Is there only one person appear in the image? Please answer yes or no. No 23 | 000000290081.jpg Is there only one bowl in this image? Please answer yes or no. Yes 24 | 000000290081.jpg Are there two bowls in this image? Please answer yes or no. No 25 | 000000301867.jpg Are there three people appear in this image? Please answer yes or no. Yes 26 | 000000301867.jpg Are there only two people appear in this image? Please answer yes or no. No 27 | 000000335954.jpg Are there two bowls in this image? Please answer yes or no. Yes 28 | 000000335954.jpg Are there three bowls in this image? Please answer yes or no. No 29 | 000000357816.jpg Are there four people in this image? Please answer yes or no. Yes 30 | 000000357816.jpg Are there five people in this image? Please answer yes or no. No 31 | 000000372819.jpg Are there four dogs appear in this image? Please answer yes or no. Yes 32 | 000000372819.jpg Are there only three dogs appear in this image? Please answer yes or no. No 33 | 000000410612.jpg Is there only one ship in the picture? Please answer yes or no. Yes 34 | 000000410612.jpg Is there a total of two ships in the picture? Please answer yes or no. No 35 | 000000423944.jpg Is there no person in this picture? Please answer yes or no. Yes 36 | 000000423944.jpg Are there two people appear in this image? Please answer yes or no. No 37 | 000000427034.jpg Is there a dog in the picture? Please answer yes or no. Yes 38 | 000000427034.jpg Are there a total of two dogs in the picture? Please answer yes or no. No 39 | 000000430286.jpg Are there three remotes in this image? Please answer yes or no. Yes 40 | 000000430286.jpg Are there only two remotes in this image? Please answer yes or no. No 41 | 000000432468.jpg Are there three zippers in the picture? Please answer yes or no. Yes 42 | 000000432468.jpg Is there a zipper in the picture? Please answer yes or no. No 43 | 000000434479.jpg Are there two pieces of pizza in this image? Please answer yes or no. Yes 44 | 000000434479.jpg Is there only one piece of pizza in this image? Please answer yes or no. No 45 | 000000438304.jpg Are there two tennis rackets in the picture? Please answer yes or no. Yes 46 | 000000438304.jpg Are there only one tennis racket in the picture? Please answer yes or no. No 47 | 000000450303.jpg Are there six people appear in this image? Please answer yes or no. Yes 48 | 000000450303.jpg Are there seven people appear in this image? Please answer yes or no. No 49 | 000000470121.jpg Is there only one bottle in the image? Please answer yes or no. Yes 50 | 000000470121.jpg Is there two bottles in the image? Please answer yes or no. No 51 | 000000476215.jpg Are there two horses in this image? Please answer yes or no. 
Yes 52 | 000000476215.jpg Is there only one horse in this image? Please answer yes or no. No 53 | 000000482100.jpg Are there two toilets in the picture? Please answer yes or no. Yes 54 | 000000482100.jpg Is there only one toilet in the picture? Please answer yes or no. No 55 | 000000491867.jpg Is there only one necktie in the image? Please answer yes or no. Yes 56 | 000000491867.jpg Is there three neckties in the image? Please answer yes or no. No 57 | 000000556000.jpg Are there four people in the image? Please answer yes or no. Yes 58 | 000000556000.jpg Are there only three people in the image? Please answer yes or no. No 59 | 000000565045.jpg Are there two bath towels in the picture? Please answer yes or no. Yes 60 | 000000565045.jpg Is there only one bath towel in the picture? Please answer yes or no. No 61 | -------------------------------------------------------------------------------- /eval/vlm/eval/mme/Your_Results/position.txt: -------------------------------------------------------------------------------- 1 | 000000006471.jpg Is the cricket bat above the batter's body? Please answer yes or no. Yes 2 | 000000006471.jpg Is the cricket bat under the batter's body Please answer yes or no. No 3 | 000000007281.jpg Is the sea behind people in the image? Please answer yes or no. Yes 4 | 000000007281.jpg Is the sea in front of people in the image? Please answer yes or no. No 5 | 000000014038.jpg Is the refrigerator on the left side of the picture? Please answer yes or no. Yes 6 | 000000014038.jpg Is the refrigerator on the right side of the picture Please answer yes or no. No 7 | 000000031248.jpg Is there a sofa in the middle of potted plants in the image? Please answer yes or no. Yes 8 | 000000031248.jpg Is there a sofa in the right side of potted plants in the image? Please answer yes or no. No 9 | 000000048504.jpg Is the gray elephant in front of the brown elephant? Please answer yes or no. Yes 10 | 000000048504.jpg Is the brown elephant in front of the gray elephant? Please answer yes or no. No 11 | 000000052007.jpg Are the pedestrians on the right of the bus? Please answer yes or no. Yes 12 | 000000052007.jpg Are the pedestrians on the left of the bus? Please answer yes or no. No 13 | 000000056127.jpg Is the light above the fire hydrant in the image? Please answer yes or no. Yes 14 | 000000056127.jpg Is the light under the fire hydrant in the image? Please answer yes or no. No 15 | 000000062025.jpg Is the trash can under the cup in the image? Please answer yes or no. Yes 16 | 000000062025.jpg Is the trash can above the cup in the image? Please answer yes or no. No 17 | 000000062808.jpg Is the phone above the pizza in the image? Please answer yes or no. Yes 18 | 000000062808.jpg Is the phone under the pizza in the image? Please answer yes or no. No 19 | 000000067213.jpg Is the dog above the pool in the image? Please answer yes or no. Yes 20 | 000000067213.jpg Is the dog under the pool in the image? Please answer yes or no. No 21 | 000000097994.jpg Is the light above the computer in the image? Please answer yes or no. Yes 22 | 000000097994.jpg Is the light under the computer in the image? Please answer yes or no. No 23 | 000000204871.jpg Is the car on the right side of the fire hydrant in the picture? Please answer yes or no. Yes 24 | 000000204871.jpg Is the car on the left side of the fire hydrant in the picture? Please answer yes or no. No 25 | 000000206487.jpg Is the motorcycle on the right side of the bus? Please answer yes or no. 
Yes 26 | 000000206487.jpg Is the motorcycle on the left side of the bus Please answer yes or no. No 27 | 000000211825.jpg Is the cake on the left side of the camera? Please answer yes or no. Yes 28 | 000000211825.jpg Is the cake on the right side of the camera? Please answer yes or no. No 29 | 000000212800.jpg Is the blue umbrella under the black umbrella? Please answer yes or no. Yes 30 | 000000212800.jpg Is the blue umbrella above the black umbrella? Please answer yes or no. No 31 | 000000395701.jpg Is the TV on the left of the bookshelf? Please answer yes or no. Yes 32 | 000000395701.jpg Is the TV on the right of the bookshelf? Please answer yes or no. No 33 | 000000395801.jpg Is the clock above people? Please answer yes or no. Yes 34 | 000000395801.jpg Is the clock under people? Please answer yes or no. No 35 | 000000405970.jpg Is the grey sofa on the right of the TV? Please answer yes or no. Yes 36 | 000000405970.jpg Is the grey sofa on the left of the TV? Please answer yes or no. No 37 | 000000426241.jpg Is the white mouse on the right of the black keyboard? Please answer yes or no. Yes 38 | 000000426241.jpg Is the white mouse on the left of the black keyboard? Please answer yes or no. No 39 | 000000450303.jpg Is the monitor on top of a person? Please answer yes or no. Yes 40 | 000000450303.jpg Is the monitor under the person? Please answer yes or no. No 41 | 000000458410.jpg Is the TV on the left of the lamp? Please answer yes or no. Yes 42 | 000000458410.jpg Is the TV on the right of the lamp? Please answer yes or no. No 43 | 000000472046.jpg Is the pineapple on the left of the pot in the image? Please answer yes or no. Yes 44 | 000000472046.jpg Is the pineapple on the right of the pot in the image? Please answer yes or no. No 45 | 000000477955.jpg Is the person under the kite? Please answer yes or no. Yes 46 | 000000477955.jpg Is the person above the kite? Please answer yes or no. No 47 | 000000482585.jpg Is the person on the right of the train? Please answer yes or no. Yes 48 | 000000482585.jpg Is the person on the left of the train? Please answer yes or no. No 49 | 000000494869.jpg Is the baby on the right of the dog in the image? Please answer yes or no. Yes 50 | 000000494869.jpg Is the baby on the left of the dog in the image? Please answer yes or no. No 51 | 000000509699.jpg Is the mirror above the TV? Please answer yes or no. Yes 52 | 000000509699.jpg Is the mirror under the TV? Please answer yes or no. No 53 | 000000519569.jpg Is the vase on the left of the bottle? Please answer yes or no. Yes 54 | 000000519569.jpg Is the vase on the right of the bottle? Please answer yes or no. No 55 | 000000530162.jpg Is the big red and black umbrella on the top of people? Please answer yes or no. Yes 56 | 000000530162.jpg Is the big red and black umbrella under people? Please answer yes or no. No 57 | 000000551660.jpg Is the spoon in the bowl? Please answer yes or no. Yes 58 | 000000551660.jpg Is the spoon out of the bowl? Please answer yes or no. No 59 | 000000578922.jpg Is the vase on the left of the toothbrush? Please answer yes or no. Yes 60 | 000000578922.jpg Is the vase on the right of the toothbrush? Please answer yes or no. No 61 | -------------------------------------------------------------------------------- /eval/vlm/eval/mme/Your_Results/text_translation.txt: -------------------------------------------------------------------------------- 1 | 0001.png Is it appropriate to translate the Chinese in the image into English 'classic taste' in the picture? Please answer yes or no. 
Yes 2 | 0001.png Is it appropriate to translate the Chinese in the image into English 'classic strawberry flavor' in the picture? Please answer yes or no. No 3 | 0002.png Is it appropriate to translate the Chinese in the image into English 'a delicious dinner' in the picture? Please answer yes or no. Yes 4 | 0002.png Is it appropriate to translate the Chinese in the image into English 'hamburger and chips' in the picture? Please answer yes or no. No 5 | 0003.png Is it appropriate to translate the Chinese in the image into English 'sunny weather' in the picture? Please answer yes or no. Yes 6 | 0003.png Is it appropriate to translate the Chinese in the image into English 'cold weather' in the picture? Please answer yes or no. No 7 | 0004.png Is it appropriate to translate the Chinese in the image into English 'run very fast' in the picture? Please answer yes or no. Yes 8 | 0004.png Is it appropriate to translate the Chinese in the image into English 'run very slow' in the picture? Please answer yes or no. No 9 | 0005.png Is it appropriate to translate the Chinese in the image into English 'feeling happy' in the picture? Please answer yes or no. Yes 10 | 0005.png Is it appropriate to translate the Chinese in the image into English 'feeling bored' in the picture? Please answer yes or no. No 11 | 0006.png Is it appropriate to translate the Chinese in the image into English 'work hard together' in the picture? Please answer yes or no. Yes 12 | 0006.png Is it appropriate to translate the Chinese in the image into English 'be filled with intrigue' in the picture? Please answer yes or no. No 13 | 0007.png Is it appropriate to translate the Chinese in the image into English 'walking very slowly' in the picture? Please answer yes or no. Yes 14 | 0007.png Is it appropriate to translate the Chinese in the image into English 'runing very slowly' in the picture? Please answer yes or no. No 15 | 0008.png Is it appropriate to translate the Chinese in the image into English 'very proud' in the picture? Please answer yes or no. Yes 16 | 0008.png Is it appropriate to translate the Chinese in the image into English 'very thankful' in the picture? Please answer yes or no. No 17 | 0009.png Is it appropriate to translate the Chinese in the image into English 'creative people' in the picture? Please answer yes or no. Yes 18 | 0009.png Is it appropriate to translate the Chinese in the image into English 'leading people' in the picture? Please answer yes or no. No 19 | 0010.png Is it appropriate to translate the Chinese in the image into English 'a beautiful garden' in the picture? Please answer yes or no. Yes 20 | 0010.png Is it appropriate to translate the Chinese in the image into English 'a beautiful campus' in the picture? Please answer yes or no. No 21 | 0011.png Is it appropriate to translate the Chinese in the image into English 'a difficult work' in the picture? Please answer yes or no. Yes 22 | 0011.png Is it appropriate to translate the Chinese in the image into English 'a easy work' in the picture? Please answer yes or no. No 23 | 0012.png Is it appropriate to translate the Chinese in the image into English 'a small amount' in the picture? Please answer yes or no. Yes 24 | 0012.png Is it appropriate to translate the Chinese in the image into English 'difficult and dangerous' in the picture? Please answer yes or no. No 25 | 0013.png Is it appropriate to translate the Chinese in the image into English 'feeling frustrated' in the picture? Please answer yes or no. 
Yes 26 | 0013.png Is it appropriate to translate the Chinese in the image into English 'feeling relaxed' in the picture? Please answer yes or no. No 27 | 0014.png Is it appropriate to translate the Chinese in the image into English 'waiting for a long time' in the picture? Please answer yes or no. Yes 28 | 0014.png Is it appropriate to translate the Chinese in the image into English 'sleeping for a long time' in the picture? Please answer yes or no. No 29 | 0015.png Is it appropriate to translate the Chinese in the image into English 'very powerful' in the picture? Please answer yes or no. Yes 30 | 0015.png Is it appropriate to translate the Chinese in the image into English 'to be fragile throughout the world' in the picture? Please answer yes or no. No 31 | 0016.png Is it appropriate to translate the Chinese in the image into English 'all talk and no action' in the picture? Please answer yes or no. Yes 32 | 0016.png Is it appropriate to translate the Chinese in the image into English 'hands-on practice' in the picture? Please answer yes or no. No 33 | 0017.png Is it appropriate to translate the Chinese in the image into English 'delicious fruit' in the picture? Please answer yes or no. Yes 34 | 0017.png Is it appropriate to translate the Chinese in the image into English 'banana' in the picture? Please answer yes or no. No 35 | 0018.png Is it appropriate to translate the Chinese in the image into English 'very unforgettable' in the picture? Please answer yes or no. Yes 36 | 0018.png Is it appropriate to translate the Chinese in the image into English 'very happy' in the picture? Please answer yes or no. No 37 | 0019.png Is it appropriate to translate the Chinese in the image into English 'get along well' in the picture? Please answer yes or no. Yes 38 | 0019.png Is it appropriate to translate the Chinese in the image into English 'for own self-interest' in the picture? Please answer yes or no. No 39 | 0020.png Is it appropriate to translate the Chinese in the image into English 'rank first' in the picture? Please answer yes or no. Yes 40 | 0020.png Is it appropriate to translate the Chinese in the image into English 'to add the finishing touches' in the picture? Please answer yes or no. No 41 | -------------------------------------------------------------------------------- /eval/vlm/eval/mathvista/extract_answer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 
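# Usage sketch (assumed invocation, inferred from the argparse options defined below; run from this directory):
#   export OPENAI_API_KEY=...
#   python extract_answer.py --output_dir ./results --output_file mathvista_answer.json --quick_extract
# The script re-reads {output_dir}/{output_file}, fills in an 'extraction' field for every problem
# that does not yet have one (rule-based extraction first, then a GPT call via get_chat_response),
# and saves the JSON back every --save_every problems.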
11 | 12 | import argparse 13 | 14 | from tqdm import tqdm 15 | from utilities import * 16 | 17 | openai.api_key = os.getenv('OPENAI_API_KEY') 18 | print(openai.api_key) 19 | 20 | # load demo prompt 21 | from prompts.ext_ans import demo_prompt 22 | 23 | 24 | def verify_extraction(extraction): 25 | extraction = extraction.strip() 26 | if extraction == '' or extraction is None: 27 | return False 28 | return True 29 | 30 | 31 | def create_test_prompt(demo_prompt, query, response): 32 | demo_prompt = demo_prompt.strip() 33 | test_prompt = f'{query}\n\n{response}' 34 | full_prompt = f'{demo_prompt}\n\n{test_prompt}\n\nExtracted answer: ' 35 | return full_prompt 36 | 37 | 38 | def _extract_answer(text): 39 | match = re.search(r'(Final answer:|Answer:)\s*(.*)', text, re.IGNORECASE) 40 | if match: 41 | return match.group(2).strip() 42 | return text 43 | 44 | 45 | def extract_answer(response, problem, quick_extract=False): 46 | question_type = problem['question_type'] 47 | answer_type = problem['answer_type'] 48 | choices = problem['choices'] 49 | query = problem['query'] 50 | 51 | if response == '': 52 | return '' 53 | 54 | if question_type == 'multi_choice' and response in choices: 55 | return response 56 | 57 | if answer_type == 'integer': 58 | try: 59 | extraction = int(response) 60 | return str(extraction) 61 | except: 62 | pass 63 | 64 | if answer_type == 'float': 65 | try: 66 | extraction = str(float(response)) 67 | return extraction 68 | except: 69 | pass 70 | 71 | # quick extraction 72 | if quick_extract: 73 | print('Quickly extracting answer...') 74 | # The answer is "text". -> "text" 75 | try: 76 | result = _extract_answer(response) 77 | return result 78 | # result = re.search(r'The answer is "(.*)"\.', response) 79 | # if result: 80 | # extraction = result.group(1) 81 | # return extraction 82 | except: 83 | pass 84 | 85 | # general extraction 86 | try: 87 | full_prompt = create_test_prompt(demo_prompt, query, response) 88 | extraction = get_chat_response(full_prompt, openai.api_key, patience=5) 89 | return extraction 90 | except Exception as e: 91 | print(e) 92 | print(f'Error in extracting answer for {pid}') 93 | 94 | return '' 95 | 96 | 97 | if __name__ == '__main__': 98 | parser = argparse.ArgumentParser() 99 | # input 100 | parser.add_argument('--output_dir', type=str, default='./results') 101 | parser.add_argument('--output_file', type=str, default='mathvista_answer.json') 102 | parser.add_argument('--response_label', type=str, default='response', help='response label for the input file') 103 | # model 104 | parser.add_argument('--llm_engine', type=str, default='gpt-4-0613', help='llm engine', 105 | choices=['gpt-3.5-turbo', 'gpt-3.5', 'gpt-4', 'gpt-4-0314', 'gpt-4-0613']) 106 | parser.add_argument('--number', type=int, default=-1, help='number of problems to run') 107 | parser.add_argument('--quick_extract', action='store_true', help='use rules to extract answer for some problems') 108 | parser.add_argument('--rerun', action='store_true', help='rerun the answer extraction') 109 | # output 110 | parser.add_argument('--save_every', type=int, default=10, help='save every n problems') 111 | parser.add_argument('--output_label', type=str, default='', help='label for the output file') 112 | args = parser.parse_args() 113 | 114 | # args 115 | label = args.response_label 116 | result_file = os.path.join(args.output_dir, args.output_file) 117 | 118 | if args.output_label != '': 119 | output_file = result_file.replace('.json', f'_{args.output_label}.json') 120 | else: 121 | output_file = 
result_file 122 | 123 | # read results 124 | print(f'Reading {result_file}...') 125 | results = read_json(result_file) 126 | 127 | # full pids 128 | full_pids = list(results.keys()) 129 | if args.number > 0: 130 | full_pids = full_pids[:min(args.number, len(full_pids))] 131 | print('Number of testing problems:', len(full_pids)) 132 | 133 | # test pids 134 | if args.rerun: 135 | test_pids = full_pids 136 | else: 137 | test_pids = [] 138 | for pid in full_pids: 139 | # print(pid) 140 | if 'extraction' not in results[pid] or not verify_extraction(results[pid]['extraction']): 141 | test_pids.append(pid) 142 | 143 | test_num = len(test_pids) 144 | print('Number of problems to run:', test_num) 145 | # print(test_pids) 146 | 147 | # tqdm, enumerate results 148 | for i, pid in enumerate(tqdm(test_pids)): 149 | problem = results[pid] 150 | 151 | assert label in problem 152 | response = problem[label] 153 | 154 | extraction = extract_answer(response, problem, args.quick_extract) 155 | results[pid]['extraction'] = extraction 156 | 157 | if i % args.save_every == 0 or i == test_num - 1: 158 | print(f'Saving results to {output_file}...') 159 | save_json(results, output_file) 160 | print(f'Results saved.') 161 | -------------------------------------------------------------------------------- /modeling/bagel/modeling_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Facebook, Inc. and its affiliates. 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: CC BY-NC 4.0 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under CC BY-NC 4.0, with the full license text 8 | # available at https://github.com/facebookresearch/DiT/blob/main/LICENSE.txt. 9 | # 10 | # This modified file is released under the same license. 
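# Shape sketch (illustrative values, not part of the original file): the helpers below build frozen
# 2D sin-cos position embeddings and DiT-style timestep embeddings, e.g.
#   get_2d_sincos_pos_embed(embed_dim=1024, grid_size=16)                      # -> np.ndarray of shape (256, 1024)
#   TimestepEmbedder(hidden_size=1024)(torch.tensor([0.25, 0.5]))              # -> torch.Tensor of shape (2, 1024)
#   PositionEmbedding(max_num_patch_per_side=16, hidden_size=1024)(position_ids)  # -> (len(position_ids), 1024), frozen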
11 | 12 | import math 13 | 14 | import numpy as np 15 | import torch 16 | from torch import nn 17 | from transformers.activations import ACT2FN 18 | 19 | # -------------------------------------------------------- 20 | # 2D sine-cosine position embedding 21 | # References: 22 | # DiT: https://github.com/facebookresearch/DiT/blob/main/models.py 23 | # -------------------------------------------------------- 24 | def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0): 25 | grid_h = np.arange(grid_size, dtype=np.float32) 26 | grid_w = np.arange(grid_size, dtype=np.float32) 27 | grid = np.meshgrid(grid_w, grid_h) # here w goes first 28 | grid = np.stack(grid, axis=0) 29 | 30 | grid = grid.reshape([2, 1, grid_size, grid_size]) 31 | pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) 32 | if cls_token and extra_tokens > 0: 33 | pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0) 34 | return pos_embed 35 | 36 | 37 | def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): 38 | assert embed_dim % 2 == 0 39 | 40 | # use half of dimensions to encode grid_h 41 | emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) 42 | emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) 43 | 44 | emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) 45 | return emb 46 | 47 | 48 | def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): 49 | """ 50 | embed_dim: output dimension for each position 51 | pos: a list of positions to be encoded: size (M,) 52 | out: (M, D) 53 | """ 54 | assert embed_dim % 2 == 0 55 | omega = np.arange(embed_dim // 2, dtype=np.float64) 56 | omega /= embed_dim / 2. 57 | omega = 1. / 10000**omega # (D/2,) 58 | 59 | pos = pos.reshape(-1) # (M,) 60 | out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product 61 | 62 | emb_sin = np.sin(out) # (M, D/2) 63 | emb_cos = np.cos(out) # (M, D/2) 64 | 65 | emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) 66 | return emb 67 | 68 | 69 | # -------------------------------------------------------- 70 | # TimestepEmbedder 71 | # Reference: 72 | # DiT: https://github.com/facebookresearch/DiT/blob/main/models.py 73 | # -------------------------------------------------------- 74 | class TimestepEmbedder(nn.Module): 75 | """ 76 | Embeds scalar timesteps into vector representations. 77 | """ 78 | def __init__(self, hidden_size, frequency_embedding_size=256): 79 | super().__init__() 80 | self.mlp = nn.Sequential( 81 | nn.Linear(frequency_embedding_size, hidden_size, bias=True), 82 | nn.SiLU(), 83 | nn.Linear(hidden_size, hidden_size, bias=True), 84 | ) 85 | self.frequency_embedding_size = frequency_embedding_size 86 | 87 | @staticmethod 88 | def timestep_embedding(t, dim, max_period=10000): 89 | """ 90 | Create sinusoidal timestep embeddings. 91 | :param t: a 1-D Tensor of N indices, one per batch element. 92 | These may be fractional. 93 | :param dim: the dimension of the output. 94 | :param max_period: controls the minimum frequency of the embeddings. 95 | :return: an (N, D) Tensor of positional embeddings. 
96 | """ 97 | half = dim // 2 98 | freqs = torch.exp( 99 | -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half 100 | ).to(device=t.device) 101 | args = t[:, None].float() * freqs[None] 102 | embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) 103 | if dim % 2: 104 | embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) 105 | return embedding 106 | 107 | def forward(self, t): 108 | t_freq = self.timestep_embedding(t, self.frequency_embedding_size) 109 | t_emb = self.mlp(t_freq) 110 | return t_emb 111 | 112 | 113 | class MLPconnector(nn.Module): 114 | def __init__(self, in_dim: int, out_dim: int, hidden_act: str): 115 | super().__init__() 116 | self.activation_fn = ACT2FN[hidden_act] 117 | self.fc1 = nn.Linear(in_dim, out_dim) 118 | self.fc2 = nn.Linear(out_dim, out_dim) 119 | 120 | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: 121 | hidden_states = self.fc1(hidden_states) 122 | hidden_states = self.activation_fn(hidden_states) 123 | hidden_states = self.fc2(hidden_states) 124 | return hidden_states 125 | 126 | 127 | class PositionEmbedding(nn.Module): 128 | def __init__(self, max_num_patch_per_side, hidden_size): 129 | super().__init__() 130 | self.max_num_patch_per_side = max_num_patch_per_side 131 | self.hidden_size = hidden_size 132 | self.pos_embed = nn.Parameter( 133 | torch.zeros(max_num_patch_per_side ** 2, hidden_size), 134 | requires_grad=False 135 | ) 136 | self._init_weights() 137 | 138 | def _init_weights(self): 139 | # Initialize (and freeze) pos_embed by sin-cos embedding: 140 | pos_embed = get_2d_sincos_pos_embed(self.hidden_size, self.max_num_patch_per_side) 141 | self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float()) 142 | 143 | def forward(self, position_ids): 144 | return self.pos_embed[position_ids] -------------------------------------------------------------------------------- /eval/gen/wise/cal_score.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import json 5 | import os 6 | import argparse 7 | from collections import defaultdict 8 | 9 | 10 | def calculate_wiscore(consistency, realism, aesthetic_quality): 11 | return 0.7 * consistency + 0.2 * realism + 0.1 * aesthetic_quality 12 | 13 | 14 | def cal_culture(file_path): 15 | all_scores = [] 16 | total_objects = 0 17 | has_9_9 = False 18 | 19 | with open(file_path, 'r') as file: 20 | for line in file: 21 | total_objects += 1 22 | data = json.loads(line) 23 | if 9.9 in [data['consistency'], data['realism'], data['aesthetic_quality']]: 24 | has_9_9 = True 25 | wiscore = calculate_wiscore(data['consistency'], data['realism'], data['aesthetic_quality']) 26 | all_scores.append(wiscore) 27 | 28 | if has_9_9 or total_objects < 400: 29 | print(f"Skipping file {file_path}: Contains 9.9 or has less than 400 objects.") 30 | return None 31 | 32 | total_score = sum(all_scores) 33 | avg_score = total_score / (len(all_scores)*2) if len(all_scores) > 0 else 0 34 | 35 | score = { 36 | 'total': total_score, 37 | 'average': avg_score 38 | } 39 | 40 | print(f" Cultural - Total: {score['total']:.2f}, Average: {score['average']:.2f}") 41 | 42 | return avg_score 43 | 44 | 45 | def cal_space_time(file_path): 46 | categories = defaultdict(list) 47 | total_objects = 0 48 | has_9_9 = False 49 | 50 | with open(file_path, 'r') as file: 51 | for line in file: 52 | total_objects += 1 53 | data = json.loads(line) 54 | if 9.9 in [data['consistency'], data['realism'], data['aesthetic_quality']]: 55 | has_9_9 = True 56 | subcategory = data['Subcategory'] 57 | wiscore = calculate_wiscore(data['consistency'], data['realism'], data['aesthetic_quality']) 58 | if subcategory in ['Longitudinal time', 'Horizontal time']: 59 | categories['Time'].append(wiscore) 60 | else: 61 | categories['Space'].append(wiscore) 62 | 63 | if has_9_9 or total_objects < 300: 64 | print(f"Skipping file {file_path}: Contains 9.9 or has less than 400 objects.") 65 | return None 66 | 67 | total_scores = {category: sum(scores) for category, scores in categories.items()} 68 | avg_scores = {category: sum(scores) / (len(scores) * 2 )if len(scores) > 0 else 0 for category, scores in categories.items()} 69 | 70 | scores = { 71 | 'total': total_scores, 72 | 'average': avg_scores 73 | } 74 | 75 | print(f" Time - Total: {scores['total'].get('Time', 0):.2f}, Average: {scores['average'].get('Time', 0):.2f}") 76 | print(f" Space - Total: {scores['total'].get('Space', 0):.2f}, Average: {scores['average'].get('Space', 0):.2f}") 77 | 78 | return avg_scores 79 | 80 | 81 | def cal_science(file_path): 82 | categories = defaultdict(list) 83 | total_objects = 0 84 | has_9_9 = False 85 | 86 | with open(file_path, 'r') as file: 87 | for line in file: 88 | total_objects += 1 89 | data = json.loads(line) 90 | if 9.9 in [data['consistency'], data['realism'], data['aesthetic_quality']]: 91 | has_9_9 = True 92 | 93 | prompt_id = data.get('prompt_id', 0) 94 | if 701 <= prompt_id <= 800: 95 | category = 'Biology' 96 | elif 801 <= prompt_id <= 900: 97 | category = 'Physics' 98 | elif 901 <= prompt_id <= 1000: 99 | category = 'Chemistry' 100 | else: 101 | category = "?" 
102 | 103 | wiscore = calculate_wiscore(data['consistency'], data['realism'], data['aesthetic_quality']) 104 | categories[category].append(wiscore) 105 | 106 | if has_9_9 or total_objects < 300: 107 | print(f"Skipping file {file_path}: Contains 9.9 or has less than 300 objects.") 108 | return None 109 | 110 | total_scores = {category: sum(scores) for category, scores in categories.items()} 111 | avg_scores = {category: sum(scores) / (len(scores)*2) if len(scores) > 0 else 0 for category, scores in categories.items()} 112 | 113 | scores = { 114 | 'total': total_scores, 115 | 'average': avg_scores 116 | } 117 | 118 | for category in ['Biology', 'Physics', 'Chemistry']: 119 | print(f" {category} - Total: {scores['total'].get(category, 0):.2f}, Average: {scores['average'].get(category, 0):.2f}") 120 | 121 | return avg_scores 122 | 123 | 124 | if __name__ == "__main__": 125 | parser = argparse.ArgumentParser(description='Image Quality Assessment Tool') 126 | parser.add_argument('--output_dir', required=True, 127 | help='Path to the output directory') 128 | args = parser.parse_args() 129 | 130 | avg_score = dict() 131 | 132 | score = cal_culture( 133 | os.path.join(args.output_dir, "cultural_common_sense_scores.jsonl") 134 | ) 135 | avg_score['Cultural'] = score 136 | 137 | scores = cal_space_time( 138 | os.path.join(args.output_dir, "spatio-temporal_reasoning_scores.jsonl") 139 | ) 140 | avg_score.update(scores) 141 | 142 | scores = cal_science( 143 | os.path.join(args.output_dir, "natural_science_scores.jsonl") 144 | ) 145 | avg_score.update(scores) 146 | 147 | avg_all = sum(avg_score.values()) / len(avg_score) 148 | 149 | avg_score['Overall'] = avg_all 150 | keys = "" 151 | values = "" 152 | for k, v in avg_score.items(): 153 | keys += f"{k} " 154 | values += f"{v:.2f} " 155 | print(keys) 156 | print(values) 157 | 158 | writer = open(os.path.join(args.output_dir, "results.txt"), 'w') 159 | print(f"write results to file {os.path.join(args.output_dir, 'results.txt')}") 160 | writer.write(keys + "\n") 161 | writer.write(values + "\n") 162 | writer.close() -------------------------------------------------------------------------------- /eval/vlm/eval/mathvista/extract_answer_mp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 
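# Usage sketch (assumed invocation; same extraction logic as extract_answer.py, but multi-threaded):
#   python extract_answer_mp.py --output_dir ./results --output_file mathvista_answer.json --max_workers 40
# Problems are dispatched to a ThreadPoolExecutor and the JSON is checkpointed every --save_every
# completed problems.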
11 | 12 | 13 | import argparse 14 | import os 15 | import re 16 | import openai 17 | from concurrent.futures import ThreadPoolExecutor, as_completed 18 | from tqdm import tqdm 19 | from utilities import * 20 | from prompts.ext_ans import demo_prompt 21 | 22 | openai.api_key = os.getenv('OPENAI_API_KEY') 23 | print(openai.api_key) 24 | 25 | def verify_extraction(extraction): 26 | extraction = extraction.strip() 27 | if extraction == '' or extraction is None: 28 | return False 29 | return True 30 | 31 | def create_test_prompt(demo_prompt, query, response): 32 | demo_prompt = demo_prompt.strip() 33 | test_prompt = f'{query}\n\n{response}' 34 | full_prompt = f'{demo_prompt}\n\n{test_prompt}\n\nExtracted answer: ' 35 | return full_prompt 36 | 37 | def _extract_answer(text): 38 | match = re.search(r'(Final answer:|Answer:)\s*(.*)', text, re.IGNORECASE) 39 | if match: 40 | return match.group(2).strip() 41 | return text 42 | 43 | def extract_answer(response, problem, quick_extract=False): 44 | question_type = problem['question_type'] 45 | answer_type = problem['answer_type'] 46 | choices = problem['choices'] 47 | query = problem['query'] 48 | 49 | if response == '': 50 | return '' 51 | 52 | if question_type == 'multi_choice' and response in choices: 53 | return response 54 | 55 | if answer_type == 'integer': 56 | try: 57 | extraction = int(response) 58 | return str(extraction) 59 | except: 60 | pass 61 | 62 | if answer_type == 'float': 63 | try: 64 | extraction = str(float(response)) 65 | return extraction 66 | except: 67 | pass 68 | 69 | # quick extraction 70 | if quick_extract: 71 | print('Quickly extracting answer...') 72 | try: 73 | result = _extract_answer(response) 74 | return result 75 | except: 76 | pass 77 | 78 | try: 79 | full_prompt = create_test_prompt(demo_prompt, query, response) 80 | extraction = get_chat_response(full_prompt, openai.api_key, patience=5, model=args.llm_engine) 81 | return extraction 82 | except Exception as e: 83 | print(e) 84 | 85 | return '' 86 | 87 | def process_problem(pid, results, label, args): 88 | problem = results[pid] 89 | response = problem[label] 90 | extraction = extract_answer(response, problem, args.quick_extract) 91 | return pid, extraction 92 | 93 | if __name__ == '__main__': 94 | parser = argparse.ArgumentParser() 95 | # input 96 | parser.add_argument('--output_dir', type=str, default='./results') 97 | parser.add_argument('--output_file', type=str, default='mathvista_answer.json') 98 | parser.add_argument('--response_label', type=str, default='response', help='response label for the input file') 99 | # model 100 | parser.add_argument('--llm_engine', type=str, default='gpt-4o-2024-11-20', help='llm engine', 101 | choices=['gpt-3.5-turbo', 'gpt-3.5', 'gpt-4', 'gpt-4-0314', 'gpt-4-0613', 102 | 'gpt-4o-2024-08-06', 'gpt-4o-2024-11-20']) 103 | parser.add_argument('--number', type=int, default=-1, help='number of problems to run') 104 | parser.add_argument('--quick_extract', action='store_true', help='use rules to extract answer for some problems') 105 | parser.add_argument('--rerun', action='store_true', help='rerun the answer extraction') 106 | # output 107 | parser.add_argument('--save_every', type=int, default=100, help='save every n problems') 108 | parser.add_argument('--output_label', type=str, default='', help='label for the output file') 109 | parser.add_argument('--max_workers', type=int, default=40, help='max workers for ThreadPoolExecutor') 110 | args = parser.parse_args() 111 | 112 | label = args.response_label 113 | result_file = 
os.path.join(args.output_dir, args.output_file) 114 | 115 | if args.output_label != '': 116 | output_file = result_file.replace('.json', f'_{args.output_label}.json') 117 | else: 118 | output_file = result_file 119 | 120 | print(f'Reading {result_file}...') 121 | results = read_json(result_file) 122 | 123 | full_pids = list(results.keys()) 124 | if args.number > 0: 125 | full_pids = full_pids[:min(args.number, len(full_pids))] 126 | print('Number of total problems:', len(full_pids)) 127 | 128 | if args.rerun: 129 | test_pids = full_pids 130 | else: 131 | test_pids = [] 132 | for pid in full_pids: 133 | if 'extraction' not in results[pid] or not verify_extraction(results[pid]['extraction']): 134 | test_pids.append(pid) 135 | 136 | test_num = len(test_pids) 137 | print('Number of problems to run:', test_num) 138 | 139 | with ThreadPoolExecutor(max_workers=args.max_workers) as executor: 140 | future_to_pid = {} 141 | for pid in test_pids: 142 | future = executor.submit(process_problem, pid, results, label, args) 143 | future_to_pid[future] = pid 144 | 145 | completed_count = 0 146 | for future in tqdm(as_completed(future_to_pid), total=test_num): 147 | pid = future_to_pid[future] 148 | try: 149 | pid_result, extraction = future.result() 150 | results[pid_result]['extraction'] = extraction 151 | except Exception as e: 152 | print(f'Error processing pid={pid}: {e}') 153 | 154 | completed_count += 1 155 | if (completed_count % args.save_every == 0) or (completed_count == test_num): 156 | print(f'Saving results to {output_file}... [{completed_count}/{test_num}]') 157 | save_json(results, output_file) 158 | print('Results saved.') 159 | 160 | print('All done!') 161 | -------------------------------------------------------------------------------- /eval/vlm/evaluate.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 
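# Usage sketch (assumed invocation from the repo root; benchmark names are matched by the case blocks below):
#   GPUS=8 MASTER_PORT=29500 bash eval/vlm/evaluate.sh mmmu-val
# The first positional argument selects the benchmark (mme, mmvet, mmbench-*, mmmu-*, mathvista-*,
# pope, ...); any remaining arguments are forwarded unchanged to the benchmark module. Multi-GPU
# benchmarks launch via torchrun and expect the ARNOLD_* cluster variables to be set; the --auto
# flag sets GPUS=1.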
11 | 12 | set -x 13 | 14 | export PYTHONPATH="$(pwd):${PYTHONPATH}" 15 | export TF_CPP_MIN_LOG_LEVEL=3 16 | export LAUNCHER=pytorch 17 | 18 | DATASET=${1} 19 | echo "CHECKPOINT: ${CHECKPOINT}" 20 | 21 | # Save original arguments 22 | ARGS=("$@") 23 | 24 | # Parse options 25 | while [[ $# -gt 0 ]]; do 26 | case "$1" in 27 | --auto) 28 | GPUS=1 29 | shift 30 | ;; 31 | *) 32 | shift 33 | ;; 34 | esac 35 | done 36 | echo "GPUS: ${GPUS}" 37 | 38 | if [ ${DATASET} == "mme" ]; then 39 | python -m eval.vlm.eval.mme.eval "${ARGS[@]:1}" 40 | fi 41 | 42 | if [ ${DATASET} == "mmvet" ]; then 43 | python -m eval.vlm.eval.mmvet.evaluate_mmvet --datasets mmvet "${ARGS[@]:1}" 44 | fi 45 | 46 | if [ ${DATASET} == "mmbench-dev-en" ]; then 47 | torchrun \ 48 | --nnodes=$ARNOLD_WORKER_NUM \ 49 | --node_rank=$ARNOLD_ID \ 50 | --master_addr=$ARNOLD_WORKER_0_HOST \ 51 | --nproc_per_node=${GPUS} \ 52 | --master_port=${MASTER_PORT} \ 53 | -m eval.vlm.eval.mmbench.evaluate_mmbench --datasets mmbench_dev_20230712 "${ARGS[@]:1}" 54 | fi 55 | 56 | if [ ${DATASET} == "mmbench-dev-cn" ]; then 57 | torchrun \ 58 | --nnodes=$ARNOLD_WORKER_NUM \ 59 | --node_rank=$ARNOLD_ID \ 60 | --master_addr=$ARNOLD_WORKER_0_HOST \ 61 | --nproc_per_node=${GPUS} \ 62 | --master_port=${MASTER_PORT} \ 63 | -m eval.vlm.eval.mmbench.evaluate_mmbench --datasets mmbench_dev_cn_20231003 "${ARGS[@]:1}" 64 | fi 65 | 66 | if [ ${DATASET} == "mmbench-test-en" ]; then 67 | torchrun \ 68 | --nnodes=$ARNOLD_WORKER_NUM \ 69 | --node_rank=$ARNOLD_ID \ 70 | --master_addr=$ARNOLD_WORKER_0_HOST \ 71 | --nproc_per_node=${GPUS} \ 72 | --master_port=${MASTER_PORT} \ 73 | -m eval.vlm.eval.mmbench.evaluate_mmbench --datasets mmbench_test_en_20231003 "${ARGS[@]:1}" 74 | fi 75 | 76 | if [ ${DATASET} == "mmbench-test-cn" ]; then 77 | torchrun \ 78 | --nnodes=$ARNOLD_WORKER_NUM \ 79 | --node_rank=$ARNOLD_ID \ 80 | --master_addr=$ARNOLD_WORKER_0_HOST \ 81 | --nproc_per_node=${GPUS} \ 82 | --master_port=${MASTER_PORT} \ 83 | -m eval.vlm.eval.mmbench.evaluate_mmbench --datasets mmbench_test_cn_20231003 "${ARGS[@]:1}" 84 | fi 85 | 86 | if [ ${DATASET} == "mmmu-dev" ]; then 87 | torchrun \ 88 | --nnodes=$ARNOLD_WORKER_NUM \ 89 | --node_rank=$ARNOLD_ID \ 90 | --master_addr=$ARNOLD_WORKER_0_HOST \ 91 | --nproc_per_node=${GPUS} \ 92 | --master_port=${MASTER_PORT} \ 93 | -m eval.vlm.eval.mmmu.evaluate_mmmu --datasets MMMU_dev "${ARGS[@]:1}" 94 | fi 95 | 96 | if [ ${DATASET} == "mmmu-val" ]; then 97 | torchrun \ 98 | --nnodes=$ARNOLD_WORKER_NUM \ 99 | --node_rank=$ARNOLD_ID \ 100 | --master_addr=$ARNOLD_WORKER_0_HOST \ 101 | --nproc_per_node=${GPUS} \ 102 | --master_port=${MASTER_PORT} \ 103 | -m eval.vlm.eval.mmmu.evaluate_mmmu --datasets MMMU_validation "${ARGS[@]:1}" 104 | fi 105 | 106 | if [ ${DATASET} == "mmmu-val_cot" ]; then 107 | torchrun \ 108 | --nnodes=$ARNOLD_WORKER_NUM \ 109 | --node_rank=$ARNOLD_ID \ 110 | --master_addr=$ARNOLD_WORKER_0_HOST \ 111 | --nproc_per_node=${GPUS} \ 112 | --master_port=${MASTER_PORT} \ 113 | -m eval.vlm.eval.mmmu.evaluate_mmmu_cot --datasets MMMU_validation_cot "${ARGS[@]:1}" 114 | fi 115 | 116 | if [ ${DATASET} == "mmmu-test" ]; then 117 | torchrun \ 118 | --nnodes=$ARNOLD_WORKER_NUM \ 119 | --node_rank=$ARNOLD_ID \ 120 | --master_addr=$ARNOLD_WORKER_0_HOST \ 121 | --nproc_per_node=${GPUS} \ 122 | --master_port=${MASTER_PORT} \ 123 | -m eval.vlm.eval.mmmu.evaluate_mmmu --datasets MMMU_test "${ARGS[@]:1}" 124 | fi 125 | 126 | if [ ${DATASET} == "mathvista-testmini" ]; then 127 | torchrun \ 128 | --nnodes=$ARNOLD_WORKER_NUM \ 129 | 
--node_rank=$ARNOLD_ID \ 130 | --master_addr=$ARNOLD_WORKER_0_HOST \ 131 | --nproc_per_node=${GPUS} \ 132 | --master_port=${MASTER_PORT} \ 133 | -m eval.vlm.eval.mathvista.evaluate_mathvista --datasets MathVista_testmini "${ARGS[@]:1}" 134 | fi 135 | 136 | if [ ${DATASET} == "mathvista-test" ]; then 137 | torchrun \ 138 | --nnodes=$ARNOLD_WORKER_NUM \ 139 | --node_rank=$ARNOLD_ID \ 140 | --master_addr=$ARNOLD_WORKER_0_HOST \ 141 | --nproc_per_node=${GPUS} \ 142 | --master_port=${MASTER_PORT} \ 143 | -m eval.vlm.eval.mathvista.evaluate_mathvista --datasets MathVista_test "${ARGS[@]:1}" 144 | fi 145 | 146 | if [ ${DATASET} == "pope" ]; then 147 | torchrun \ 148 | --nnodes=$ARNOLD_WORKER_NUM \ 149 | --node_rank=$ARNOLD_ID \ 150 | --master_addr=$ARNOLD_WORKER_0_HOST \ 151 | --nproc_per_node=${GPUS} \ 152 | --master_port=${MASTER_PORT} \ 153 | -m eval.vlm.eval.pope.evaluate_pope --datasets pope "${ARGS[@]:1}" 154 | fi 155 | 156 | if [ ${DATASET} == "pope_cot" ]; then 157 | torchrun \ 158 | --nnodes=$ARNOLD_WORKER_NUM \ 159 | --node_rank=$ARNOLD_ID \ 160 | --master_addr=$ARNOLD_WORKER_0_HOST \ 161 | --nproc_per_node=${GPUS} \ 162 | --master_port=${MASTER_PORT} \ 163 | -m eval.vlm.eval.pope.evaluate_pope --datasets pope_cot --cot "${ARGS[@]:1}" 164 | fi 165 | 166 | if [ ${DATASET} == "vqa-gqa-testdev" ]; then 167 | torchrun \ 168 | --nnodes=$ARNOLD_WORKER_NUM \ 169 | --node_rank=$ARNOLD_ID \ 170 | --master_addr=$ARNOLD_WORKER_0_HOST \ 171 | --nproc_per_node=${GPUS} \ 172 | --master_port=${MASTER_PORT} \ 173 | -m eval.vlm.eval.vqa.evaluate_vqa --datasets gqa_testdev_llava "${ARGS[@]:1}" 174 | fi 175 | 176 | if [ ${DATASET} == "mmvp" ]; then 177 | torchrun \ 178 | --nnodes=$ARNOLD_WORKER_NUM \ 179 | --node_rank=$ARNOLD_ID \ 180 | --master_addr=$ARNOLD_WORKER_0_HOST \ 181 | --nproc_per_node=${GPUS} \ 182 | --master_port=${MASTER_PORT} \ 183 | -m eval.vlm.eval.mmvp.evaluate_mmvp --datasets MMVP "${ARGS[@]:1}" 184 | fi 185 | -------------------------------------------------------------------------------- /data/t2i_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import io 5 | import json 6 | import pyarrow.parquet as pq 7 | import random 8 | from PIL import Image 9 | 10 | from .data_utils import pil_img2rgb 11 | from .distributed_iterable_dataset import DistributedIterableDataset 12 | from .parquet_utils import get_parquet_data_paths, init_arrow_pf_fs 13 | 14 | Image.MAX_IMAGE_PIXELS = 20_000_000 15 | 16 | 17 | class T2IIterableDataset(DistributedIterableDataset): 18 | def __init__( 19 | self, dataset_name, transform, tokenizer, data_dir_list, num_used_data, 20 | local_rank=0, world_size=1, num_workers=8, data_status=None, 21 | ): 22 | """ 23 | data_dir_list: list of data directories contains parquet files 24 | num_used_data: list of number of sampled data paths for each data directory 25 | """ 26 | super().__init__(dataset_name, local_rank, world_size, num_workers) 27 | self.transform = transform 28 | self.tokenizer = tokenizer 29 | self.data_status = data_status 30 | self.data_paths = self.get_data_paths(data_dir_list, num_used_data) 31 | self.set_epoch() 32 | 33 | def get_data_paths(self, data_dir_list, num_used_data): 34 | return get_parquet_data_paths(data_dir_list, num_used_data) 35 | 36 | def __iter__(self): 37 | data_paths_per_worker, worker_id = self.get_data_paths_per_worker() 38 | if self.data_status is not None: 39 | parquet_start_id = self.data_status[worker_id][0] 40 | row_group_start_id = self.data_status[worker_id][1] 41 | row_start_id = self.data_status[worker_id][2] + 1 42 | else: 43 | parquet_start_id = 0 44 | row_group_start_id = 0 45 | row_start_id = 0 46 | transform_stride = self.transform.stride 47 | 48 | print( 49 | f"rank-{self.local_rank} worker-{worker_id} dataset-{self.dataset_name}: " 50 | f"resuming data at parquet#{parquet_start_id}, rg#{row_group_start_id}, row#{row_start_id}" 51 | ) 52 | 53 | while True: 54 | data_paths_per_worker_ = data_paths_per_worker[parquet_start_id:] 55 | for parquet_idx, parquet_file_path in enumerate(data_paths_per_worker_, start=parquet_start_id): 56 | fs = init_arrow_pf_fs(parquet_file_path) 57 | with fs.open_input_file(parquet_file_path) as f: 58 | fr = pq.ParquetFile(f) 59 | row_group_ids = list(range(fr.num_row_groups)) 60 | row_group_ids_ = row_group_ids[row_group_start_id:] 61 | 62 | for row_group_id in row_group_ids_: 63 | df = fr.read_row_group(row_group_id).to_pandas() 64 | df = df.iloc[row_start_id:] 65 | 66 | for row_idx, row in df.iterrows(): 67 | num_tokens = 0 68 | try: 69 | image_byte = row['image'] 70 | image = pil_img2rgb(Image.open(io.BytesIO(image_byte))) 71 | except Exception as e: 72 | print(f'Error: {e} in rg#{row_group_id}, {parquet_file_path}') 73 | continue 74 | image_tensor = self.transform(image) 75 | height, width = image_tensor.shape[1:] 76 | num_tokens += width * height // transform_stride ** 2 77 | 78 | try: 79 | caption_dict = row['captions'] 80 | caption_dict = json.loads(caption_dict) 81 | except Exception as e: 82 | print(f'Error: {e} in rg#{row_group_id}, {parquet_file_path}') 83 | continue 84 | 85 | caps_token = [self.tokenizer.encode(v) for _, v in caption_dict.items()] 86 | if len(caps_token) == 0: 87 | print(f'no caption in rg#{row_group_id}, {parquet_file_path}') 88 | caption_token = self.tokenizer.encode(' ') 89 | else: 90 | caption_token = random.choice(caps_token) 91 | 92 | sequence_plan, text_ids_list = [], [] 93 | text_ids = caption_token 94 | num_tokens += len(caption_token) 95 | text_ids_list.append(text_ids) 96 | sequence_plan.append({ 97 | 'type': 'text', 98 | 'enable_cfg': 1, 99 | 'loss': 
0, 100 | 'special_token_loss': 0, 101 | 'special_token_label': None, 102 | }) 103 | 104 | sequence_plan.append({ 105 | 'type': 'vae_image', 106 | 'enable_cfg': 0, 107 | 'loss': 1, 108 | 'special_token_loss': 0, 109 | 'special_token_label': None, 110 | }) 111 | 112 | sample = dict( 113 | image_tensor_list=[image_tensor], 114 | text_ids_list=text_ids_list, 115 | num_tokens=num_tokens, 116 | sequence_plan=sequence_plan, 117 | data_indexes={ 118 | "data_indexes": [parquet_idx, row_group_id, row_idx], 119 | "worker_id": worker_id, 120 | "dataset_name": self.dataset_name, 121 | } 122 | ) 123 | yield sample 124 | 125 | row_start_id = 0 126 | row_group_start_id = 0 127 | parquet_start_id = 0 128 | print(f"{self.dataset_name} repeat in rank-{self.local_rank} worker-{worker_id}") 129 | -------------------------------------------------------------------------------- /eval/vlm/eval/mme/calculation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 11 | 12 | import argparse 13 | import os 14 | 15 | from sklearn.metrics import (accuracy_score, confusion_matrix, precision_score, 16 | recall_score) 17 | 18 | parser = argparse.ArgumentParser() 19 | # parser.add_argument('--results_dir', default='./LaVIN', type=str) 20 | parser.add_argument('--out-dir', default='./', type=str) 21 | 22 | eval_type_dict = { 23 | 'Perception': ['existence', 'count', 'position', 'color', 'posters', 'celebrity', 'scene', 'landmark', 'artwork', 'OCR'], 24 | 'Cognition': ['commonsense_reasoning', 'numerical_calculation', 'text_translation', 'code_reasoning'] 25 | } 26 | 27 | 28 | class calculate_metrics: 29 | def divide_chunks(self, l, n=2): 30 | # looping till length l 31 | for i in range(0, len(l), n): 32 | yield l[i:i + n] 33 | 34 | return 35 | 36 | def parse_pred_ans(self, pred_ans): 37 | pred_label = None 38 | if pred_ans in ['yes', 'no']: 39 | pred_label = pred_ans 40 | else: 41 | prefix_pred_ans = pred_ans[:4] 42 | 43 | if 'yes' in prefix_pred_ans: 44 | pred_label = 'yes' 45 | elif 'no' in prefix_pred_ans: 46 | pred_label = 'no' 47 | else: 48 | pred_label = 'other' 49 | 50 | return pred_label 51 | 52 | def compute_metric(self, gts, preds): 53 | assert len(gts) == len(preds) 54 | 55 | label_map = { 56 | 'yes': 1, 57 | 'no': 0, 58 | 'other': -1, 59 | } 60 | 61 | gts = [label_map[x] for x in gts] 62 | preds = [label_map[x] for x in preds] 63 | 64 | acc = accuracy_score(gts, preds) 65 | 66 | clean_gts = [] 67 | clean_preds = [] 68 | other_num = 0 69 | for gt, pred in zip(gts, preds): 70 | if pred == -1: 71 | other_num += 1 72 | continue 73 | clean_gts.append(gt) 74 | clean_preds.append(pred) 75 | 76 | conf_mat = confusion_matrix(clean_gts, clean_preds, labels=[1,0]) 77 | precision = precision_score(clean_gts, clean_preds, average='binary') 78 | recall = recall_score(clean_gts, clean_preds, average='binary') 79 | tp, fn = conf_mat[0] 80 | fp, tn = conf_mat[1] 81 | 82 | metric_dict = dict() 83 | metric_dict = { 84 | 'TP': tp, 85 | 'FN': fn, 86 | 'TN': tn, 87 | 'FP': fp, 88 | 'precision': precision, 89 | 'recall': recall, 90 | 'other_num': other_num, 91 | 
'acc': acc, 92 | } 93 | 94 | return metric_dict 95 | 96 | def process_result(self, results_dir): 97 | ret_message = "" 98 | model_score_dict = dict() 99 | for eval_type, task_name_list in eval_type_dict.items(): 100 | print('===========', eval_type, '===========') 101 | ret_message += f"=========== {eval_type} ===========\n" 102 | 103 | scores = 0 104 | task_score_dict = dict() 105 | 106 | for task_name in task_name_list: 107 | 108 | task_txt = os.path.join(results_dir, task_name + '.txt') 109 | lines = open(task_txt, 'r').readlines() 110 | chunk_lines = list(self.divide_chunks(lines)) # one image corresponds to two questions 111 | 112 | img_num = len(chunk_lines) 113 | task_other_ans_num = 0 114 | task_score = 0 115 | acc_plus_correct_num = 0 116 | gts = [] 117 | preds = [] 118 | 119 | for img_items in chunk_lines: 120 | assert len(img_items) == 2 121 | img_correct_num = 0 122 | 123 | for img_item in img_items: 124 | try: 125 | img_name, question, gt_ans, pred_ans = img_item.split('\t') 126 | except: 127 | print(img_item) 128 | continue 129 | gt_ans = gt_ans.lower() 130 | pred_ans = pred_ans.lower() 131 | 132 | assert gt_ans in ['yes', 'no'] # gt can only be yes or no. 133 | 134 | pred_ans = self.parse_pred_ans(pred_ans) 135 | assert pred_ans in ['yes', 'no', 'other'] 136 | 137 | gts.append(gt_ans) 138 | preds.append(pred_ans) 139 | 140 | if gt_ans == pred_ans: 141 | img_correct_num += 1 142 | 143 | if pred_ans not in ['yes', 'no']: 144 | task_other_ans_num += 1 145 | 146 | if img_correct_num == 2: 147 | acc_plus_correct_num += 1 148 | 149 | # cal TP precision acc, etc. 150 | metric_dict = self.compute_metric(gts, preds) 151 | acc_plus = acc_plus_correct_num / img_num 152 | metric_dict['acc_plus'] = acc_plus 153 | 154 | for k, v in metric_dict.items(): 155 | if k in ['acc', 'acc_plus']: 156 | task_score += v*100 157 | 158 | task_score_dict[task_name] = task_score 159 | 160 | scores += task_score 161 | 162 | print('total score:', scores, '\n') 163 | ret_message += f"total score: {scores} \n\n" 164 | for task_name, score in task_score_dict.items(): 165 | print('\t', task_name, ' score:', score) 166 | ret_message += f"\t {task_name} score: {score}\n" 167 | print('\n') 168 | ret_message += "\n\n" 169 | 170 | return ret_message 171 | 172 | 173 | if __name__ == '__main__': 174 | cal = calculate_metrics() 175 | 176 | args = parser.parse_args() 177 | # results_dir = args.results_dir 178 | results_dir = args.out_dir 179 | ret_message = cal.process_result(results_dir) 180 | 181 | writer = open(os.path.join(args.out_dir, "results.txt"), 'w') 182 | print(f"write results to file {os.path.join(args.out_dir, 'results.txt')}") 183 | writer.write(ret_message) 184 | writer.close() -------------------------------------------------------------------------------- /data/video_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 
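# Usage sketch (illustrative paths, not part of the original file): the helpers below sample frames
# either from a video file via decord or from a directory of pre-extracted frames, e.g.
#   sampler = FrameSampler(max_num_frames=16, min_num_frames=8, sample='rand')
#   frames = sampler('/path/to/clip.mp4')     # list of 8-16 PIL.Image frames (count drawn at random)
#   frames = sampler('/path/to/frames_dir/')  # a trailing '/' switches to folder-based reading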
11 | 12 | 13 | import io 14 | import os 15 | import random 16 | import re 17 | 18 | import numpy as np 19 | import decord 20 | from PIL import Image 21 | 22 | 23 | def get_frame_indices(num_frames, vlen, sample='rand', fix_start=None, input_fps=1, max_num_frames=-1): 24 | if sample in ['rand', 'middle']: # uniform sampling 25 | acc_samples = min(num_frames, vlen) 26 | # split the video into `acc_samples` intervals, and sample from each interval. 27 | intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int) 28 | ranges = [] 29 | for idx, interv in enumerate(intervals[:-1]): 30 | ranges.append((interv, intervals[idx + 1] - 1)) 31 | if sample == 'rand': 32 | try: 33 | frame_indices = [random.choice(range(x[0], x[1])) for x in ranges] 34 | except: 35 | frame_indices = np.random.permutation(vlen)[:acc_samples] 36 | frame_indices.sort() 37 | frame_indices = list(frame_indices) 38 | elif fix_start is not None: 39 | frame_indices = [x[0] + fix_start for x in ranges] 40 | elif sample == 'middle': 41 | frame_indices = [(x[0] + x[1]) // 2 for x in ranges] 42 | else: 43 | raise NotImplementedError 44 | 45 | if len(frame_indices) < num_frames: # padded with last frame 46 | padded_frame_indices = [frame_indices[-1]] * num_frames 47 | padded_frame_indices[:len(frame_indices)] = frame_indices 48 | frame_indices = padded_frame_indices 49 | elif 'fps' in sample: # fps0.5, sequentially sample frames at 0.5 fps 50 | output_fps = float(sample[3:]) 51 | duration = float(vlen) / input_fps 52 | delta = 1 / output_fps # gap between frames, this is also the clip length each frame represents 53 | frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta) 54 | frame_indices = np.around(frame_seconds * input_fps).astype(int) 55 | frame_indices = [e for e in frame_indices if e < vlen] 56 | if max_num_frames > 0 and len(frame_indices) > max_num_frames: 57 | frame_indices = frame_indices[:max_num_frames] 58 | else: 59 | raise ValueError 60 | return frame_indices 61 | 62 | 63 | def read_frames_decord(video_path, num_frames, sample='rand', fix_start=None, clip=None, min_num_frames=4): 64 | video_reader = decord.VideoReader(video_path, num_threads=1) 65 | vlen = len(video_reader) 66 | fps = video_reader.get_avg_fps() 67 | duration = vlen / float(fps) 68 | if clip: 69 | start, end = clip 70 | duration = end - start 71 | vlen = int(duration * fps) 72 | start_index = int(start * fps) 73 | 74 | t_num_frames = np.random.randint(min_num_frames, num_frames + 1) 75 | 76 | frame_indices = get_frame_indices( 77 | t_num_frames, vlen, sample=sample, fix_start=fix_start, 78 | input_fps=fps 79 | ) 80 | if clip: 81 | frame_indices = [f + start_index for f in frame_indices] 82 | frames = video_reader.get_batch(frame_indices).asnumpy() # (T, H, W, C), np.uint8 83 | frames = [Image.fromarray(frames[i]) for i in range(frames.shape[0])] 84 | return frames 85 | 86 | 87 | def extract_frame_number(filename): 88 | # Extract the numeric part from the filename using regular expressions 89 | match = re.search(r'_(\d+).jpg$', filename) 90 | return int(match.group(1)) if match else -1 91 | 92 | 93 | def sort_frames(frame_paths): 94 | # Extract filenames from each path and sort by their numeric part 95 | return sorted(frame_paths, key=lambda x: extract_frame_number(os.path.basename(x))) 96 | 97 | 98 | def read_frames_folder(video_path, num_frames, sample='rand', fix_start=None, min_num_frames=4): 99 | image_list = sort_frames(list(os.listdir(video_path))) 100 | frames = [] 101 | for image in image_list: 102 | fp = 
os.path.join(video_path, image) 103 | frame = Image.open(fp).convert('RGB') 104 | frames.append(frame) 105 | vlen = len(frames) 106 | 107 | t_num_frames = np.random.randint(min_num_frames, num_frames + 1) 108 | 109 | if vlen > t_num_frames: 110 | frame_indices = get_frame_indices( 111 | t_num_frames, vlen, sample=sample, fix_start=fix_start 112 | ) 113 | frames = [frames[i] for i in frame_indices] 114 | return frames 115 | 116 | 117 | class FrameSampler: 118 | def __init__(self, max_num_frames=-1, min_num_frames=8, sample='rand'): 119 | self.max_num_frames = max_num_frames 120 | self.min_num_frames = min_num_frames 121 | self.sample = sample 122 | 123 | def __call__(self, file_name): 124 | fn = read_frames_folder if file_name.endswith('/') else read_frames_decord 125 | frames = fn(file_name, num_frames=self.max_num_frames, min_num_frames=self.min_num_frames, sample=self.sample) 126 | return frames 127 | 128 | 129 | def decode_video_byte(video_bytes): 130 | video_stream = io.BytesIO(video_bytes) 131 | vr = decord.VideoReader(video_stream) 132 | return vr 133 | 134 | 135 | def sample_mp4_frames(mp4_p, n_frames=None, fps=None, return_frame_indices=False, random_sample=False): 136 | if isinstance(mp4_p, str): 137 | vr = decord.VideoReader(mp4_p, num_threads=1) 138 | elif isinstance(mp4_p, decord.video_reader.VideoReader): 139 | vr = mp4_p 140 | video_fps = vr.get_avg_fps() # get the video's average frame rate 141 | video_duration = len(vr) / video_fps 142 | if n_frames is not None: 143 | if random_sample: 144 | frame_indices = sorted(random.sample(range(len(vr)), n_frames)) 145 | else: 146 | frame_indices = np.linspace(0, len(vr)-1, n_frames, dtype=int).tolist() 147 | else: 148 | frame_indices = [int(i) for i in np.arange(0, len(vr)-1, video_fps/fps)] 149 | frames = vr.get_batch(frame_indices).asnumpy() # convert to a numpy array 150 | frames = [Image.fromarray(frame).convert("RGB") for frame in frames] 151 | if not return_frame_indices: 152 | return frames, video_duration 153 | else: 154 | return frames, video_duration, frame_indices 155 | 156 | 157 | def sample_mp4_frames_by_indices(mp4_p, frame_indices: list): 158 | if isinstance(mp4_p, str): 159 | vr = decord.VideoReader(mp4_p, num_threads=1) 160 | elif isinstance(mp4_p, decord.video_reader.VideoReader): 161 | vr = mp4_p 162 | # sample the frames in frame_indices 163 | frames = vr.get_batch(frame_indices).asnumpy() # convert to a numpy array 164 | frames = [Image.fromarray(frame).convert("RGB") for frame in frames] 165 | return frames -------------------------------------------------------------------------------- /eval/gen/imgedit/basic_bench.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates.
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import base64 5 | import os 6 | import json 7 | import argparse 8 | import openai 9 | from tqdm import tqdm 10 | from concurrent.futures import ThreadPoolExecutor, as_completed 11 | import threading 12 | 13 | openai.api_key = os.getenv('OPENAI_API_KEY') 14 | 15 | lock = threading.Lock() # For thread-safe file writing 16 | 17 | def load_prompts(prompts_json_path): 18 | with open(prompts_json_path, 'r') as f: 19 | return json.load(f) 20 | 21 | def image_to_base64(image_path): 22 | try: 23 | with open(image_path, "rb") as image_file: 24 | return base64.b64encode(image_file.read()).decode('utf-8') 25 | except FileNotFoundError: 26 | print(f"File {image_path} not found.") 27 | return None 28 | 29 | def call_gpt(original_image_path, result_image_path, edit_prompt, edit_type, prompts): 30 | try: 31 | original_image_base64 = image_to_base64(original_image_path) 32 | result_image_base64 = image_to_base64(result_image_path) 33 | 34 | if not original_image_base64 or not result_image_base64: 35 | return {"error": "Image conversion failed"} 36 | 37 | prompt = prompts[edit_type] 38 | full_prompt = prompt.replace('', edit_prompt) 39 | 40 | response = openai_client.chat.completions.create( 41 | model=model, 42 | stream=False, 43 | messages=[{ 44 | "role": "user", 45 | "content": [ 46 | {"type": "text", "text": full_prompt}, 47 | {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{original_image_base64}"}}, 48 | {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{result_image_base64}"}} 49 | ] 50 | }] 51 | ) 52 | return response 53 | except Exception as e: 54 | print(f"Error in calling GPT API: {e}") 55 | raise 56 | 57 | def save_result_jsonl(result, key, output_jsonl_path): 58 | with lock: 59 | with open(output_jsonl_path, 'a', encoding='utf-8') as f: 60 | data = { 61 | "key": key, 62 | "result": result 63 | } 64 | f.write(json.dumps(data, ensure_ascii=False) + '\n') 65 | 66 | def load_processed_keys(jsonl_path): 67 | processed_keys = set() 68 | if os.path.exists(jsonl_path): 69 | with open(jsonl_path, 'r', encoding='utf-8') as f: 70 | for line in f: 71 | try: 72 | data = json.loads(line) 73 | processed_keys.add(data["key"]) 74 | except Exception as e: 75 | print(f"Error loading line: {e}") 76 | return processed_keys 77 | 78 | def collect_jsonl_to_dict(jsonl_path): 79 | result_dict = {} 80 | if os.path.exists(jsonl_path): 81 | with open(jsonl_path, 'r', encoding='utf-8') as f: 82 | for line in f: 83 | try: 84 | data = json.loads(line) 85 | result_dict[data["key"]] = data["result"] 86 | except Exception as e: 87 | print(f"Error parsing line: {e}") 88 | return result_dict 89 | 90 | def process_single_item(key, item, result_img_folder, origin_img_root, prompts, output_jsonl_path): 91 | result_img_name = f"{key}.png" 92 | result_img_path = os.path.join(result_img_folder, result_img_name) 93 | origin_img_path = os.path.join(origin_img_root, item['id']) 94 | edit_prompt = item['prompt'] 95 | edit_type = item['edit_type'] 96 | 97 | response = call_gpt(origin_img_path, result_img_path, edit_prompt, edit_type, prompts) 98 | # Ensure 'choices' attribute exists in response 99 | result = response.choices[0].message.content if hasattr(response, "choices") else str(response) 100 | save_result_jsonl(result, key, output_jsonl_path) 101 | return key, result 102 | 103 | def process_json(edit_json, result_img_folder, origin_img_root, num_threads, prompts): 104 | output_jsonl_path = os.path.join(result_img_folder, 'result.jsonl') 105 | 
output_json_path = os.path.join(result_img_folder, 'result.json') 106 | with open(edit_json, 'r') as f: 107 | edit_infos = json.load(f) 108 | # Load already processed keys 109 | processed_keys = load_processed_keys(output_jsonl_path) 110 | print(f"{len(processed_keys)} items already processed, {len(edit_infos) - len(processed_keys)} remaining...") 111 | # Filter out tasks that have already been processed 112 | left_edit_infos = {k: v for k, v in edit_infos.items() if k not in processed_keys} 113 | total = len(left_edit_infos) 114 | if total == 0: 115 | print("Nothing to process. All items are completed.") 116 | else: 117 | with ThreadPoolExecutor(max_workers=num_threads) as executor: 118 | future_to_key = { 119 | executor.submit(process_single_item, key, item, result_img_folder, origin_img_root, prompts, output_jsonl_path): key 120 | for key, item in left_edit_infos.items() 121 | } 122 | for future in tqdm(as_completed(future_to_key), total=total, desc="Processing edits"): 123 | key = future_to_key[future] 124 | try: 125 | future.result() # Already saved in jsonl 126 | except Exception as e: 127 | print(f"Error processing key {key}: {e}") 128 | # Failed keys will not be saved to jsonl 129 | # After all finished, collect jsonl to dict and save to json 130 | final_results = collect_jsonl_to_dict(output_jsonl_path) 131 | with open(output_json_path, 'w', encoding='utf-8') as f: 132 | json.dump(final_results, f, indent=4, ensure_ascii=False) 133 | print(f"All processing completed. Final result saved in {output_json_path}") 134 | 135 | def main(): 136 | parser = argparse.ArgumentParser(description="Evaluate image edits using GPT") 137 | parser.add_argument('--result_img_folder', type=str, required=True, help="Folder with subfolders of edited images") 138 | parser.add_argument('--edit_json', type=str, required=True, help="Path to JSON file mapping keys to metadata") 139 | parser.add_argument('--origin_img_root', type=str, required=True, help="Root path where original images are stored") 140 | parser.add_argument('--num_processes', type=int, default=32, help="Number of parallel threads") 141 | parser.add_argument('--prompts_json', type=str, required=True, help="JSON file containing prompts") 142 | args = parser.parse_args() 143 | 144 | prompts = load_prompts(args.prompts_json) 145 | process_json(args.edit_json, args.result_img_folder, args.origin_img_root, args.num_processes, prompts) 146 | 147 | if __name__ == "__main__": 148 | base_url = "your_api_url" 149 | api_version = "2024-03-01-preview" 150 | api_key = openai.api_key 151 | model = "gpt-4o-2024-11-20" 152 | openai_client = openai.AzureOpenAI( 153 | azure_endpoint=base_url, 154 | api_version=api_version, 155 | api_key=api_key, 156 | ) 157 | main() 158 | -------------------------------------------------------------------------------- /data/data_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Bytedance Ltd. and/or its affiliates. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | 5 | import math 6 | import random 7 | from PIL import Image 8 | 9 | import torch 10 | from torch.nn.attention.flex_attention import or_masks, and_masks 11 | 12 | 13 | def create_sparse_mask(document_lens, split_lens, attn_modes, device): 14 | def causal_mask(b, h, q_idx, kv_idx): 15 | return q_idx >= kv_idx 16 | 17 | def full_and_noise_mask(b, h, q_idx, kv_idx): 18 | return (full_and_noise_seq_id[q_idx] == full_and_noise_seq_id[kv_idx]) & (full_and_noise_seq_id[q_idx] >= 0) 19 | 20 | def remove_noise_mask(b, h, q_idx, kv_idx): 21 | return (~((noise_seq_id[kv_idx] >= 0) & (noise_seq_id[q_idx] != noise_seq_id[kv_idx]))) 22 | 23 | def sample_mask(b, h, q_idx, kv_idx): 24 | return document_id[q_idx] == document_id[kv_idx] 25 | 26 | full_and_noise_tmp = [] 27 | noise_tmp = [] 28 | 29 | for i, (length, model) in enumerate(zip(split_lens, attn_modes)): 30 | value = i if model in ['full', 'noise'] else -1 31 | full_and_noise_tmp.extend([value] * length) 32 | value_noise = i if model == 'noise' else -1 33 | noise_tmp.extend([value_noise] * length) 34 | 35 | full_and_noise_seq_id = torch.Tensor(full_and_noise_tmp).to(device) 36 | noise_seq_id = torch.Tensor(noise_tmp).to(device) 37 | 38 | document_id = torch.cat([torch.full((l,), i) for i, l in enumerate(document_lens, start=1)]).to(device) 39 | 40 | return and_masks(or_masks(causal_mask, full_and_noise_mask), remove_noise_mask, sample_mask) 41 | 42 | 43 | def patchify(image, patch_size): 44 | p = patch_size 45 | c, h, w = image.shape 46 | assert h % p == 0 and w % p == 0 47 | image = image.reshape(c, h // p, p, w // p, p) 48 | image = torch.einsum("chpwq->hwpqc", image) 49 | image = image.reshape(-1, p**2 * c) 50 | return image 51 | 52 | 53 | def get_flattened_position_ids_extrapolate(img_h, img_w, patch_size, max_num_patches_per_side): 54 | num_patches_h, num_patches_w = img_h // patch_size, img_w // patch_size 55 | coords_h = torch.arange(0, num_patches_h) 56 | coords_w = torch.arange(0, num_patches_w) 57 | pos_ids = (coords_h[:, None] * max_num_patches_per_side + coords_w).flatten() 58 | return pos_ids 59 | 60 | 61 | def get_flattened_position_ids_interpolate(img_h, img_w, patch_size, max_num_patches_per_side): 62 | num_patches_h, num_patches_w = img_h // patch_size, img_w // patch_size 63 | boundaries = torch.arange(1 / max_num_patches_per_side, 1.0, 1 / max_num_patches_per_side) 64 | fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / num_patches_h) 65 | fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / num_patches_w) 66 | bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True) 67 | bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True) 68 | pos_ids = (bucket_coords_h[:, None] * max_num_patches_per_side + bucket_coords_w).flatten() 69 | return pos_ids 70 | 71 | 72 | def prepare_attention_mask_per_sample(split_lens, attn_modes, device="cpu"): 73 | """ 74 | split_lens: A list of ints. Each int indicates the length of a split within 75 | the sample, where a sample contains multiple splits with different attn modes. 76 | attn_modes: whether to use full attn in each split.
77 | """ 78 | sample_len = sum(split_lens) 79 | attention_mask = torch.zeros((sample_len, sample_len), dtype=torch.bool, device=device) 80 | 81 | csum = 0 82 | for s, attn_mode in zip(split_lens, attn_modes): 83 | assert attn_mode in ['causal', 'full', 'noise'] 84 | if attn_mode == "causal": 85 | attention_mask[csum:csum + s, csum:csum + s] = torch.ones((s, s), device=device).tril() 86 | attention_mask[csum:csum + s, :csum] = 1 87 | else: 88 | attention_mask[csum:csum + s, csum:csum + s] = torch.ones((s, s)) 89 | attention_mask[csum:csum + s, :csum] = 1 90 | csum += s 91 | 92 | csum = 0 93 | for s, attn_mode in zip(split_lens, attn_modes): 94 | if attn_mode == "noise": 95 | attention_mask[:, csum : csum + s] = torch.zeros((sample_len, s)) 96 | attention_mask[csum : csum + s, csum : csum + s] = torch.ones((s, s)) 97 | csum += s 98 | 99 | attention_mask = torch.zeros_like(attention_mask, dtype=torch.float).masked_fill_( 100 | ~attention_mask, float("-inf") 101 | ) 102 | 103 | return attention_mask 104 | 105 | 106 | def split_integer_exp_decay(S, ng_sample_decay=1.0): 107 | if ng_sample_decay == 1.0: 108 | N = random.randint(1, S) 109 | else: 110 | base = (1 - ng_sample_decay) / (1 - math.pow(ng_sample_decay, S)) 111 | p = [base * math.pow(ng_sample_decay, i) for i in range(S)] 112 | N = random.choices(list(range(1, S + 1)), p, k=1)[0] 113 | cumsum = [0] + sorted(random.sample(range(1, S), N - 1)) + [S] 114 | result = [cumsum[i+1] - cumsum[i] for i in range(len(cumsum) - 1)] 115 | return result, cumsum 116 | 117 | 118 | def pil_img2rgb(image): 119 | if image.mode == "RGBA" or image.info.get("transparency", None) is not None: 120 | image = image.convert("RGBA") 121 | white = Image.new(mode="RGB", size=image.size, color=(255, 255, 255)) 122 | white.paste(image, mask=image.split()[3]) 123 | image = white 124 | else: 125 | image = image.convert("RGB") 126 | 127 | return image 128 | 129 | 130 | def add_special_tokens(tokenizer): 131 | all_special_tokens = [] 132 | for k, v in tokenizer.special_tokens_map.items(): 133 | if isinstance(v, str): 134 | all_special_tokens.append(v) 135 | elif isinstance(v, list): 136 | all_special_tokens += v 137 | 138 | new_tokens = [] 139 | 140 | if '<|im_start|>' not in all_special_tokens: 141 | new_tokens.append('<|im_start|>') 142 | 143 | if '<|im_end|>' not in all_special_tokens: 144 | new_tokens.append('<|im_end|>') 145 | 146 | if '<|vision_start|>' not in all_special_tokens: 147 | new_tokens.append('<|vision_start|>') 148 | 149 | if '<|vision_end|>' not in all_special_tokens: 150 | new_tokens.append('<|vision_end|>') 151 | 152 | num_new_tokens = tokenizer.add_tokens(new_tokens) 153 | bos_token_id = tokenizer.convert_tokens_to_ids('<|im_start|>') 154 | eos_token_id = tokenizer.convert_tokens_to_ids('<|im_end|>') 155 | start_of_image = tokenizer.convert_tokens_to_ids('<|vision_start|>') 156 | end_of_image = tokenizer.convert_tokens_to_ids('<|vision_end|>') 157 | 158 | new_token_ids = dict( 159 | bos_token_id=bos_token_id, 160 | eos_token_id=eos_token_id, 161 | start_of_image=start_of_image, 162 | end_of_image=end_of_image, 163 | ) 164 | 165 | return tokenizer, new_token_ids, num_new_tokens 166 | 167 | 168 | def len2weight(x, loss_reduction='square'): 169 | if x == 0: 170 | return x 171 | if loss_reduction == 'token': 172 | return 1 173 | if loss_reduction == 'sample': 174 | return 1 / x 175 | if loss_reduction == 'square': 176 | return 1 / (x ** 0.5) 177 | raise NotImplementedError(loss_reduction) 178 | 
-------------------------------------------------------------------------------- /modeling/siglip/processing_siglip.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Inc. team. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | """ 5 | Image/Text processor class for SigLIP. 6 | """ 7 | 8 | from typing import List, Optional, Union 9 | 10 | from transformers.feature_extraction_utils import BatchFeature 11 | from transformers.image_utils import ImageInput 12 | from transformers.processing_utils import ProcessorMixin 13 | from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy 14 | from transformers.utils import TensorType 15 | 16 | 17 | class SiglipProcessor(ProcessorMixin): 18 | r""" 19 | Constructs a Siglip processor which wraps a Siglip image processor and a Siglip tokenizer into a single processor. 20 | 21 | [`SiglipProcessor`] offers all the functionalities of [`SiglipImageProcessor`] and [`SiglipTokenizer`]. See the 22 | [`~SiglipProcessor.__call__`] and [`~SiglipProcessor.decode`] for more information. 23 | 24 | Args: 25 | image_processor ([`SiglipImageProcessor`]): 26 | The image processor is a required input. 27 | tokenizer ([`SiglipTokenizer`]): 28 | The tokenizer is a required input. 29 | """ 30 | 31 | attributes = ["image_processor", "tokenizer"] 32 | image_processor_class = "SiglipImageProcessor" 33 | tokenizer_class = "SiglipTokenizer" 34 | 35 | def __init__(self, image_processor, tokenizer): 36 | super().__init__(image_processor, tokenizer) 37 | 38 | def __call__( 39 | self, 40 | text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, 41 | images: ImageInput = None, 42 | padding: Union[bool, str, PaddingStrategy] = False, 43 | truncation: Union[bool, str, TruncationStrategy] = None, 44 | max_length: int = None, 45 | return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH, 46 | ) -> BatchFeature: 47 | """ 48 | Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` 49 | and `kwargs` arguments to SiglipTokenizer's [`~SiglipTokenizer.__call__`] if `text` is not `None` to encode 50 | the text. To prepare the image(s), this method forwards the `images` argument to 51 | SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring 52 | of the above two methods for more information. 53 | 54 | Args: 55 | text (`str`, `List[str]`, `List[List[str]]`): 56 | The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings 57 | (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set 58 | `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). 59 | images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): 60 | The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch 61 | tensor. Both channels-first and channels-last formats are supported. 62 | padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): 63 | Select a strategy to pad the returned sequences (according to the model's padding side and padding 64 | index) among: 65 | - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single 66 | sequence if provided). 
67 | - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum 68 | acceptable input length for the model if that argument is not provided. 69 | - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different 70 | lengths). 71 | max_length (`int`, *optional*): 72 | Maximum length of the returned list and optionally padding length (see above). 73 | truncation (`bool`, *optional*): 74 | Activates truncation to cut input sequences longer than `max_length` to `max_length`. 75 | return_tensors (`str` or [`~utils.TensorType`], *optional*): 76 | If set, will return tensors of a particular framework. Acceptable values are: 77 | 78 | - `'tf'`: Return TensorFlow `tf.constant` objects. 79 | - `'pt'`: Return PyTorch `torch.Tensor` objects. 80 | - `'np'`: Return NumPy `np.ndarray` objects. 81 | - `'jax'`: Return JAX `jnp.ndarray` objects. 82 | 83 | Returns: 84 | [`BatchFeature`]: A [`BatchFeature`] with the following fields: 85 | 86 | - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. 87 | - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when 88 | `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not 89 | `None`). 90 | - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. 91 | """ 92 | 93 | if text is None and images is None: 94 | raise ValueError("You have to specify either text or images. Both cannot be none.") 95 | 96 | if text is not None: 97 | encoding = self.tokenizer( 98 | text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length 99 | ) 100 | 101 | if images is not None: 102 | image_features = self.image_processor(images, return_tensors=return_tensors) 103 | 104 | if text is not None and images is not None: 105 | encoding["pixel_values"] = image_features.pixel_values 106 | return encoding 107 | elif text is not None: 108 | return encoding 109 | else: 110 | return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) 111 | 112 | def decode(self, *args, **kwargs): 113 | """ 114 | This method forwards all its arguments to SiglipTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to 115 | the docstring of this method for more information. 116 | """ 117 | return self.tokenizer.decode(*args, **kwargs) 118 | 119 | def batch_decode(self, *args, **kwargs): 120 | """ 121 | This method forwards all its arguments to SiglipTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please 122 | refer to the docstring of this method for more information. 123 | """ 124 | return self.tokenizer.batch_decode(*args, **kwargs) 125 | 126 | @property 127 | # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->Siglip, T5->Siglip 128 | def model_input_names(self): 129 | tokenizer_input_names = self.tokenizer.model_input_names 130 | image_processor_input_names = self.image_processor.model_input_names 131 | return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) 132 | -------------------------------------------------------------------------------- /eval/vlm/eval/mmmu/data_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 
3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 11 | 12 | """Utils for data load, save, and process (e.g., prompt construction)""" 13 | 14 | import json 15 | import os 16 | import re 17 | 18 | import yaml 19 | 20 | DOMAIN_CAT2SUB_CAT = { 21 | 'Art and Design': ['Art', 'Art_Theory', 'Design', 'Music'], 22 | 'Business': ['Accounting', 'Economics', 'Finance', 'Manage', 'Marketing'], 23 | 'Science': ['Biology', 'Chemistry', 'Geography', 'Math', 'Physics', ], 24 | 'Health and Medicine': ['Basic_Medical_Science', 'Clinical_Medicine', 'Diagnostics_and_Laboratory_Medicine', 25 | 'Pharmacy', 'Public_Health'], 26 | 'Humanities and Social Science': ['History', 'Literature', 'Sociology', 'Psychology'], 27 | 'Tech and Engineering': ['Agriculture', 'Architecture_and_Engineering', 'Computer_Science', 'Electronics', 28 | 'Energy_and_Power', 'Materials', 'Mechanical_Engineering'], 29 | } 30 | 31 | CAT_SHORT2LONG = { 32 | 'acc': 'Accounting', 33 | 'agri': 'Agriculture', 34 | 'arch': 'Architecture_and_Engineering', 35 | 'art': 'Art', 36 | 'art_theory': 'Art_Theory', 37 | 'bas_med': 'Basic_Medical_Science', 38 | 'bio': 'Biology', 39 | 'chem': 'Chemistry', 40 | 'cli_med': 'Clinical_Medicine', 41 | 'cs': 'Computer_Science', 42 | 'design': 'Design', 43 | 'diag_med': 'Diagnostics_and_Laboratory_Medicine', 44 | 'econ': 'Economics', 45 | 'elec': 'Electronics', 46 | 'ep': 'Energy_and_Power', 47 | 'fin': 'Finance', 48 | 'geo': 'Geography', 49 | 'his': 'History', 50 | 'liter': 'Literature', 51 | 'manage': 'Manage', 52 | 'mark': 'Marketing', 53 | 'mate': 'Materials', 54 | 'math': 'Math', 55 | 'mech': 'Mechanical_Engineering', 56 | 'music': 'Music', 57 | 'phar': 'Pharmacy', 58 | 'phys': 'Physics', 59 | 'psy': 'Psychology', 60 | 'pub_health': 'Public_Health', 61 | 'socio': 'Sociology' 62 | } 63 | 64 | 65 | # DATA SAVING 66 | def save_json(filename, ds): 67 | with open(filename, 'w') as f: 68 | json.dump(ds, f, indent=4) 69 | 70 | 71 | def get_multi_choice_info(options): 72 | """ 73 | Given the list of options for multiple choice question 74 | Return the index2ans and all_choices 75 | """ 76 | 77 | start_chr = 'A' 78 | all_choices = [] 79 | index2ans = {} 80 | for i, option in enumerate(options): 81 | index2ans[chr(ord(start_chr) + i)] = option 82 | all_choices.append(chr(ord(start_chr) + i)) 83 | 84 | return index2ans, all_choices 85 | 86 | 87 | def load_yaml(file_path): 88 | with open(file_path, 'r') as stream: 89 | try: 90 | yaml_dict = yaml.safe_load(stream) 91 | except yaml.YAMLError as exc: 92 | print(exc) 93 | 94 | return yaml_dict 95 | 96 | 97 | def parse_img_path(text): 98 | matches = re.findall("", text) 99 | return matches 100 | 101 | 102 | def process_single_sample(data): 103 | question = data['question'] 104 | o_imgs_paths = [] 105 | for option in data['options']: 106 | current_o_imgs_paths = parse_img_path(option) 107 | for img_path in current_o_imgs_paths: 108 | o_imgs_paths.append(img_path) 109 | images = [data['image_1'], data['image_2'], data['image_3'], data['image_4'], 110 | data['image_5'], data['image_6'], data['image_7']] 111 | return {'id': data['id'], 'question': question, 'options': data['options'], 'answer': data['answer'], 112 | 'image': images, 'question_type': data['question_type']} 113 | 114 | 
115 | # DATA SAVING 116 | def save_json(filename, ds): 117 | with open(filename, 'w') as f: 118 | json.dump(ds, f, indent=4) 119 | 120 | 121 | def save_jsonl(filename, data): 122 | """ 123 | Save a dictionary of data to a JSON Lines file with the filename as key and caption as value. 124 | 125 | Args: 126 | filename (str): The path to the file where the data should be saved. 127 | data (dict): The dictionary containing the data to save where key is the image path and value is the caption. 128 | """ 129 | with open(filename, 'w', encoding='utf-8') as f: 130 | for img_path, caption in data.items(): 131 | # Extract the base filename without the extension 132 | base_filename = os.path.basename(img_path) 133 | # Create a JSON object with the filename as the key and caption as the value 134 | json_record = json.dumps({base_filename: caption}, ensure_ascii=False) 135 | # Write the JSON object to the file, one per line 136 | f.write(json_record + '\n') 137 | 138 | 139 | def save_args(args, path_dir): 140 | argsDict = args.__dict__ 141 | with open(path_dir + 'setting.txt', 'w') as f: 142 | f.writelines('------------------ start ------------------' + '\n') 143 | for eachArg, value in argsDict.items(): 144 | f.writelines(eachArg + ' : ' + str(value) + '\n') 145 | f.writelines('------------------- end -------------------') 146 | 147 | 148 | # DATA PROCESSING 149 | def construct_prompt(sample, config): 150 | question = sample['question'] 151 | options = eval(sample['options']) 152 | example = '' 153 | if sample['question_type'] == 'multiple-choice': 154 | start_chr = 'A' 155 | prediction_range = [] 156 | index2ans = {} 157 | for option in options: 158 | prediction_range.append(start_chr) 159 | example += f'({start_chr}) {option}\n' 160 | index2ans[start_chr] = option 161 | start_chr = chr(ord(start_chr) + 1) 162 | empty_prompt_sample_structure = config['multi_choice_example_format'] 163 | empty_prompt = empty_prompt_sample_structure.format(question, example) 164 | res_dict = {} 165 | res_dict['index2ans'] = index2ans 166 | res_dict['correct_choice'] = sample['answer'] 167 | res_dict['all_choices'] = prediction_range 168 | res_dict['empty_prompt'] = empty_prompt 169 | if config['task_instructions']: 170 | res_dict['final_input_prompt'] = config['task_instructions'].strip() + '\n\n' + empty_prompt 171 | else: 172 | res_dict['final_input_prompt'] = empty_prompt 173 | 174 | res_dict['gt_content'] = options[ord(sample['answer'].upper()) - ord('A')] 175 | else: 176 | empty_prompt_sample_structure = config['short_ans_example_format'] 177 | empty_prompt = empty_prompt_sample_structure.format(question) 178 | res_dict = {} 179 | res_dict['empty_prompt'] = empty_prompt 180 | if config['task_instructions']: 181 | res_dict['final_input_prompt'] = config['task_instructions'].strip() + '\n\n' + empty_prompt 182 | else: 183 | res_dict['final_input_prompt'] = empty_prompt 184 | res_dict['gt_content'] = sample['answer'] 185 | 186 | res_dict.update(sample) 187 | return res_dict 188 | -------------------------------------------------------------------------------- /eval/gen/geneval/prompts/create_prompts.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Dhruba Ghosh 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 
6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/djghosh13/geneval/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 11 | 12 | """ 13 | Generate prompts for evaluation 14 | """ 15 | 16 | import argparse 17 | import json 18 | import os 19 | import yaml 20 | 21 | import numpy as np 22 | 23 | # Load classnames 24 | 25 | with open("object_names.txt") as cls_file: 26 | classnames = [line.strip() for line in cls_file] 27 | 28 | # Proper a vs an 29 | 30 | def with_article(name: str): 31 | if name[0] in "aeiou": 32 | return f"an {name}" 33 | return f"a {name}" 34 | 35 | # Proper plural 36 | 37 | def make_plural(name: str): 38 | if name[-1] in "s": 39 | return f"{name}es" 40 | return f"{name}s" 41 | 42 | # Generates single object samples 43 | 44 | def generate_single_object_sample(rng: np.random.Generator, size: int = None): 45 | TAG = "single_object" 46 | if size > len(classnames): 47 | size = len(classnames) 48 | print(f"Not enough distinct classes, generating only {size} samples") 49 | return_scalar = size is None 50 | size = size or 1 51 | idxs = rng.choice(len(classnames), size=size, replace=False) 52 | samples = [dict( 53 | tag=TAG, 54 | include=[ 55 | {"class": classnames[idx], "count": 1} 56 | ], 57 | prompt=f"a photo of {with_article(classnames[idx])}" 58 | ) for idx in idxs] 59 | if return_scalar: 60 | return samples[0] 61 | return samples 62 | 63 | # Generate two object samples 64 | 65 | def generate_two_object_sample(rng: np.random.Generator): 66 | TAG = "two_object" 67 | idx_a, idx_b = rng.choice(len(classnames), size=2, replace=False) 68 | return dict( 69 | tag=TAG, 70 | include=[ 71 | {"class": classnames[idx_a], "count": 1}, 72 | {"class": classnames[idx_b], "count": 1} 73 | ], 74 | prompt=f"a photo of {with_article(classnames[idx_a])} and {with_article(classnames[idx_b])}" 75 | ) 76 | 77 | # Generate counting samples 78 | 79 | numbers = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"] 80 | 81 | def generate_counting_sample(rng: np.random.Generator, max_count=4): 82 | TAG = "counting" 83 | idx = rng.choice(len(classnames)) 84 | num = int(rng.integers(2, max_count, endpoint=True)) 85 | return dict( 86 | tag=TAG, 87 | include=[ 88 | {"class": classnames[idx], "count": num} 89 | ], 90 | exclude=[ 91 | {"class": classnames[idx], "count": num + 1} 92 | ], 93 | prompt=f"a photo of {numbers[num]} {make_plural(classnames[idx])}" 94 | ) 95 | 96 | # Generate color samples 97 | 98 | colors = ["red", "orange", "yellow", "green", "blue", "purple", "pink", "brown", "black", "white"] 99 | 100 | def generate_color_sample(rng: np.random.Generator): 101 | TAG = "colors" 102 | idx = rng.choice(len(classnames) - 1) + 1 103 | idx = (idx + classnames.index("person")) % len(classnames) # No "[COLOR] person" prompts 104 | color = colors[rng.choice(len(colors))] 105 | return dict( 106 | tag=TAG, 107 | include=[ 108 | {"class": classnames[idx], "count": 1, "color": color} 109 | ], 110 | prompt=f"a photo of {with_article(color)} {classnames[idx]}" 111 | ) 112 | 113 | # Generate position samples 114 | 115 | positions = ["left of", "right of", "above", "below"] 116 | 117 | def generate_position_sample(rng: np.random.Generator): 118 | TAG = "position" 119 | idx_a, idx_b = rng.choice(len(classnames), size=2, replace=False) 120 | position = positions[rng.choice(len(positions))] 121 | return dict( 122 | tag=TAG, 123 | include=[ 124 | {"class": classnames[idx_b], "count": 1}, 
125 | {"class": classnames[idx_a], "count": 1, "position": (position, 0)} 126 | ], 127 | prompt=f"a photo of {with_article(classnames[idx_a])} {position} {with_article(classnames[idx_b])}" 128 | ) 129 | 130 | # Generate color attribution samples 131 | 132 | def generate_color_attribution_sample(rng: np.random.Generator): 133 | TAG = "color_attr" 134 | idxs = rng.choice(len(classnames) - 1, size=2, replace=False) + 1 135 | idx_a, idx_b = (idxs + classnames.index("person")) % len(classnames) # No "[COLOR] person" prompts 136 | cidx_a, cidx_b = rng.choice(len(colors), size=2, replace=False) 137 | return dict( 138 | tag=TAG, 139 | include=[ 140 | {"class": classnames[idx_a], "count": 1, "color": colors[cidx_a]}, 141 | {"class": classnames[idx_b], "count": 1, "color": colors[cidx_b]} 142 | ], 143 | prompt=f"a photo of {with_article(colors[cidx_a])} {classnames[idx_a]} and {with_article(colors[cidx_b])} {classnames[idx_b]}" 144 | ) 145 | 146 | 147 | # Generate evaluation suite 148 | 149 | def generate_suite(rng: np.random.Generator, n: int = 100, output_path: str = ""): 150 | samples = [] 151 | # Generate single object samples for all COCO classnames 152 | samples.extend(generate_single_object_sample(rng, size=len(classnames))) 153 | # Generate two object samples (~100) 154 | for _ in range(n): 155 | samples.append(generate_two_object_sample(rng)) 156 | # Generate counting samples 157 | for _ in range(n): 158 | samples.append(generate_counting_sample(rng, max_count=4)) 159 | # Generate color samples 160 | for _ in range(n): 161 | samples.append(generate_color_sample(rng)) 162 | # Generate position samples 163 | for _ in range(n): 164 | samples.append(generate_position_sample(rng)) 165 | # Generate color attribution samples 166 | for _ in range(n): 167 | samples.append(generate_color_attribution_sample(rng)) 168 | # De-duplicate 169 | unique_samples, used_samples = [], set() 170 | for sample in samples: 171 | sample_text = yaml.safe_dump(sample) 172 | if sample_text not in used_samples: 173 | unique_samples.append(sample) 174 | used_samples.add(sample_text) 175 | 176 | # Write to files 177 | os.makedirs(output_path, exist_ok=True) 178 | with open(os.path.join(output_path, "generation_prompts.txt"), "w") as fp: 179 | for sample in unique_samples: 180 | print(sample['prompt'], file=fp) 181 | with open(os.path.join(output_path, "evaluation_metadata.jsonl"), "w") as fp: 182 | for sample in unique_samples: 183 | print(json.dumps(sample), file=fp) 184 | 185 | 186 | if __name__ == "__main__": 187 | parser = argparse.ArgumentParser() 188 | parser.add_argument("--seed", type=int, default=43, help="generation seed (default: 43)") 189 | parser.add_argument("--num-prompts", "-n", type=int, default=100, help="number of prompts per task (default: 100)") 190 | parser.add_argument("--output-path", "-o", type=str, default="prompts", help="output folder for prompts and metadata (default: 'prompts/')") 191 | args = parser.parse_args() 192 | rng = np.random.default_rng(args.seed) 193 | generate_suite(rng, args.num_prompts, args.output_path) 194 | 195 | -------------------------------------------------------------------------------- /eval/vlm/eval/mathvista/utilities.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 
6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 11 | 12 | import json 13 | import os 14 | import pickle 15 | import re 16 | import time 17 | 18 | import cv2 19 | import openai 20 | from word2number import w2n 21 | 22 | openai_client = None 23 | 24 | 25 | def create_dir(output_dir): 26 | if not os.path.exists(output_dir): 27 | os.makedirs(output_dir) 28 | 29 | 30 | def read_csv(file): 31 | data = [] 32 | with open(file, 'r') as f: 33 | for line in f: 34 | data.append(line.strip()) 35 | return data 36 | 37 | 38 | def read_pandas_csv(csv_path): 39 | # read a pandas csv sheet 40 | import pandas as pd 41 | df = pd.read_csv(csv_path) 42 | return df 43 | 44 | 45 | def read_json(path): 46 | with open(path, 'r', encoding='utf-8') as f: 47 | return json.load(f) 48 | 49 | 50 | def read_jsonl(file): 51 | with open(file, 'r') as f: 52 | data = [json.loads(line) for line in f] 53 | return data 54 | 55 | 56 | def read_pickle(path): 57 | with open(path, 'rb') as f: 58 | return pickle.load(f) 59 | 60 | 61 | def save_json(data, path): 62 | with open(path, 'w') as f: 63 | json.dump(data, f, indent=4) 64 | 65 | 66 | def save_array_img(path, image): 67 | cv2.imwrite(path, image) 68 | 69 | 70 | def contains_digit(text): 71 | # check if text contains a digit 72 | if any(char.isdigit() for char in text): 73 | return True 74 | return False 75 | 76 | 77 | def contains_number_word(text): 78 | # check if text contains a number word 79 | ignore_words = ['a', 'an', 'point'] 80 | words = re.findall(r'\b\w+\b', text) # This regex pattern matches any word in the text 81 | for word in words: 82 | if word in ignore_words: 83 | continue 84 | try: 85 | w2n.word_to_num(word) 86 | return True # If the word can be converted to a number, return True 87 | except ValueError: 88 | continue # If the word can't be converted to a number, continue with the next word 89 | 90 | # check if text contains a digit 91 | if any(char.isdigit() for char in text): 92 | return True 93 | 94 | return False # If none of the words could be converted to a number, return False 95 | 96 | 97 | def contains_quantity_word(text, special_keep_words=[]): 98 | # check if text contains a quantity word 99 | quantity_words = ['most', 'least', 'fewest', 100 | 'more', 'less', 'fewer', 101 | 'largest', 'smallest', 'greatest', 102 | 'larger', 'smaller', 'greater', 103 | 'highest', 'lowest', 'higher', 'lower', 104 | 'increase', 'decrease', 105 | 'minimum', 'maximum', 'max', 'min', 106 | 'mean', 'average', 'median', 107 | 'total', 'sum', 'add', 'subtract', 108 | 'difference', 'quotient', 'gap', 109 | 'half', 'double', 'twice', 'triple', 110 | 'square', 'cube', 'root', 111 | 'approximate', 'approximation', 112 | 'triangle', 'rectangle', 'circle', 'square', 'cube', 'sphere', 'cylinder', 'cone', 'pyramid', 113 | 'multiply', 'divide', 114 | 'percentage', 'percent', 'ratio', 'proportion', 'fraction', 'rate', 115 | ] 116 | 117 | quantity_words += special_keep_words # dataset specific words 118 | 119 | words = re.findall(r'\b\w+\b', text) # This regex pattern matches any word in the text 120 | if any(word in quantity_words for word in words): 121 | return True 122 | 123 | return False # If none of the words could be converted to a number, return False 124 | 125 | 126 | def is_bool_word(text): 127 | if text in ['Yes', 'No', 'True', 'False', 128 | 'yes', 'no', 'true', 'false', 129 | 'YES', 'NO', 'TRUE',
'FALSE']: 130 | return True 131 | return False 132 | 133 | 134 | def is_digit_string(text): 135 | # remove ".0000" 136 | text = text.strip() 137 | text = re.sub(r'\.0+$', '', text) 138 | try: 139 | int(text) 140 | return True 141 | except ValueError: 142 | return False 143 | 144 | 145 | def is_float_string(text): 146 | # text is a float string if it contains a "." and can be converted to a float 147 | if '.' in text: 148 | try: 149 | float(text) 150 | return True 151 | except ValueError: 152 | return False 153 | return False 154 | 155 | 156 | def copy_image(image_path, output_image_path): 157 | from shutil import copyfile 158 | copyfile(image_path, output_image_path) 159 | 160 | 161 | def copy_dir(src_dir, dst_dir): 162 | from shutil import copytree 163 | 164 | # copy the source directory to the target directory 165 | copytree(src_dir, dst_dir) 166 | 167 | 168 | import PIL.Image as Image 169 | 170 | 171 | def get_image_size(img_path): 172 | img = Image.open(img_path) 173 | width, height = img.size 174 | return width, height 175 | 176 | 177 | def get_chat_response( 178 | promot="", api_key="", 179 | base_url="your_api_url", 180 | api_version="2024-03-01-preview", model="gpt-4-0613", 181 | temperature=0, max_tokens=256, n=1, patience=10000000, sleep_time=0 182 | ): 183 | openai_client = openai.AzureOpenAI( 184 | azure_endpoint=base_url, 185 | api_version=api_version, 186 | api_key=api_key, 187 | ) 188 | 189 | messages = [ 190 | {'role': 'user', 'content': promot}, 191 | ] 192 | while patience > 0: 193 | patience -= 1 194 | try: 195 | response = openai_client.chat.completions.create( 196 | model=model, 197 | messages=messages, 198 | # api_key=api_key, 199 | temperature=temperature, 200 | max_tokens=max_tokens, 201 | n=n, 202 | ) 203 | response = response.to_dict() 204 | if n == 1: 205 | prediction = response['choices'][0]['message']['content'].strip() 206 | if prediction != '' and prediction is not None: 207 | return prediction 208 | else: 209 | prediction = [choice['message']['content'].strip() for choice in response['choices']] 210 | if prediction[0] != '' and prediction[0] is not None: 211 | return prediction 212 | 213 | except Exception as e: 214 | if 'Rate limit' not in str(e): 215 | print(e) 216 | 217 | if 'Please reduce the length of the messages' in str(e): 218 | print('!!Reduce promot size') 219 | # reduce input prompt and keep the tail 220 | new_size = int(len(promot) * 0.9) 221 | new_start = len(promot) - new_size 222 | promot = promot[new_start:] 223 | messages = [ 224 | {'role': 'user', 'content': promot}, 225 | ] 226 | 227 | if sleep_time > 0: 228 | time.sleep(sleep_time) 229 | return '' 230 | -------------------------------------------------------------------------------- /eval/vlm/eval/mathvista/evaluate_mathvista.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 OpenGVLab 2 | # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20. 6 | # 7 | # Original file was released under MIT, with the full license text 8 | # available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE. 9 | # 10 | # This modified file is released under the same license. 
11 | 12 | import argparse 13 | import itertools 14 | import json 15 | import os 16 | import random 17 | 18 | import torch 19 | from datasets import load_dataset 20 | from eval.vlm.utils import load_model_and_tokenizer, build_transform, process_conversation 21 | from tqdm import tqdm 22 | 23 | ds_collections = { 24 | 'MathVista_testmini': { 25 | 'root': 'AI4Math/MathVista', 26 | 'max_new_tokens': 4096, 27 | 'min_new_tokens': 1, 28 | 'split': 'testmini' 29 | }, 30 | 'MathVista_test': { 31 | 'root': 'AI4Math/MathVista', 32 | 'max_new_tokens': 4096, 33 | 'min_new_tokens': 1, 34 | 'split': 'test' 35 | }, 36 | } 37 | 38 | 39 | COT_INSTRUCTION = ( 40 | 'Your task is to answer the question below. ' 41 | "Give step by step reasoning before you answer, and when you're ready to answer, " 42 | "please use the format \"Final answer: ..\"" 43 | '\n\n' 44 | 'Question:' 45 | '\n\n' 46 | '{question}' 47 | ) 48 | 49 | 50 | def collate_fn(batches): 51 | images = [_['images'] for _ in batches] 52 | data_items = [_['data_item'] for _ in batches] 53 | return images, data_items 54 | 55 | 56 | class MathVistaDataset(torch.utils.data.Dataset): 57 | 58 | def __init__(self, root, split): 59 | dataset = load_dataset(root, cache_dir=os.path.join(os.getcwd(), 'eval/vlm/data/MathVista/')) 60 | self.data = dataset[split] 61 | 62 | def __len__(self): 63 | return len(self.data) 64 | 65 | def __getitem__(self, idx): 66 | data_item = self.data[idx] 67 | image = data_item['decoded_image'] 68 | del data_item['decoded_image'] 69 | 70 | images = [image.convert('RGB') if image.mode != 'RGB' else image] 71 | 72 | return { 73 | 'images': images, 74 | 'data_item': data_item, 75 | } 76 | 77 | 78 | class InferenceSampler(torch.utils.data.sampler.Sampler): 79 | 80 | def __init__(self, size): 81 | self._size = int(size) 82 | assert size > 0 83 | self._rank = torch.distributed.get_rank() 84 | self._world_size = torch.distributed.get_world_size() 85 | self._local_indices = self._get_local_indices(size, self._world_size, self._rank) 86 | 87 | @staticmethod 88 | def _get_local_indices(total_size, world_size, rank): 89 | shard_size = total_size // world_size 90 | left = total_size % world_size 91 | shard_sizes = [shard_size + int(r < left) for r in range(world_size)] 92 | 93 | begin = sum(shard_sizes[:rank]) 94 | end = min(sum(shard_sizes[:rank + 1]), total_size) 95 | return range(begin, end) 96 | 97 | def __iter__(self): 98 | yield from self._local_indices 99 | 100 | def __len__(self): 101 | return len(self._local_indices) 102 | 103 | 104 | def evaluate_chat_model(): 105 | random.seed(args.seed) 106 | 107 | for ds_name in args.datasets: 108 | dataset = MathVistaDataset( 109 | root=ds_collections[ds_name]['root'], 110 | split=ds_collections[ds_name]['split'], 111 | ) 112 | dataloader = torch.utils.data.DataLoader( 113 | dataset=dataset, 114 | sampler=InferenceSampler(len(dataset)), 115 | batch_size=args.batch_size, 116 | num_workers=args.num_workers, 117 | pin_memory=True, 118 | drop_last=False, 119 | collate_fn=collate_fn, 120 | ) 121 | 122 | outputs = [] 123 | for _, (images, data_items) in tqdm(enumerate(dataloader)): 124 | if args.cot: 125 | question = COT_INSTRUCTION.format(question=data_items[0]['query']) 126 | else: 127 | question = data_items[0]['query'] 128 | 129 | images = images[0] 130 | images, conversation = process_conversation(images, question) 131 | 132 | pred = model.chat( 133 | tokenizer, 134 | new_token_ids, 135 | image_transform, 136 | images=images, 137 | prompt=conversation, 138 | 
max_length=ds_collections[ds_name]['max_new_tokens'] if not args.cot else 4096, # TODO: how to use ds_collections[ds_name]['min_new_tokens'] 139 | ) 140 | 141 | data_item = data_items[0] 142 | data_item['response'] = pred 143 | outputs.append(data_item) 144 | 145 | torch.distributed.barrier() 146 | 147 | world_size = torch.distributed.get_world_size() 148 | merged_outputs = [None for _ in range(world_size)] 149 | torch.distributed.all_gather_object(merged_outputs, json.dumps(outputs)) 150 | 151 | merged_outputs = [json.loads(_) for _ in merged_outputs] 152 | merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)] 153 | 154 | if torch.distributed.get_rank() == 0: 155 | temp = {} 156 | for data_item in merged_outputs: 157 | pid = data_item['pid'] 158 | temp[pid] = data_item 159 | 160 | print(f'Evaluating {ds_name} ...') 161 | results_file = 'results.json' 162 | output_path = os.path.join(args.out_dir, 'results.json') 163 | json.dump(temp, open(output_path, 'w'), indent=4) 164 | print('Results saved to {}'.format(output_path)) 165 | 166 | if args.cot: 167 | cmd = f'python eval/vlm/eval/mathvista/extract_answer_mp.py --output_file {results_file} --output_dir {args.out_dir}' 168 | else: 169 | cmd = f'python eval/vlm/eval/mathvista/extract_answer_mp.py --output_file {results_file} --output_dir {args.out_dir}' 170 | print(cmd) 171 | os.system(cmd) 172 | 173 | cmd = f'python eval/vlm/eval/mathvista/calculate_score.py --output_file {results_file} --output_dir {args.out_dir} --score_file score.json' 174 | print(cmd) 175 | os.system(cmd) 176 | 177 | 178 | if __name__ == '__main__': 179 | parser = argparse.ArgumentParser() 180 | parser.add_argument('--datasets', type=str, default='MathVista_testmini') 181 | parser.add_argument('--batch-size', type=int, default=1) 182 | parser.add_argument('--num-workers', type=int, default=1) 183 | parser.add_argument('--out-dir', type=str, default='results') 184 | parser.add_argument('--seed', type=int, default=0) 185 | parser.add_argument('--cot', action='store_true') 186 | parser.add_argument('--model-path', type=str, default='hf/BAGEL-7B-MoT/') 187 | args = parser.parse_args() 188 | 189 | if not os.path.exists(args.out_dir): 190 | os.makedirs(args.out_dir, exist_ok=True) 191 | 192 | args.datasets = args.datasets.split(',') 193 | print('datasets:', args.datasets) 194 | assert args.batch_size == 1, 'Only batch size 1 is supported' 195 | 196 | torch.distributed.init_process_group( 197 | backend='nccl', 198 | world_size=int(os.getenv('WORLD_SIZE', '1')), 199 | rank=int(os.getenv('RANK', '0')), 200 | ) 201 | 202 | torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) 203 | 204 | model, tokenizer, new_token_ids = load_model_and_tokenizer(args) 205 | image_transform = build_transform() 206 | 207 | total_params = sum(p.numel() for p in model.parameters()) / 1e9 208 | print(f'[test] total_params: {total_params}B') 209 | 210 | evaluate_chat_model() 211 | --------------------------------------------------------------------------------
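A small editor-added sketch (not part of the repository) of how the InferenceSampler in evaluate_mathvista.py shards a dataset across ranks; the dataset size and world size below are assumed, and torch.distributed is bypassed by calling the static helper directly:

# Illustrative only: shard 10 items across 3 ranks; any remainder goes to the lowest ranks.
for rank in range(3):
    shard = list(InferenceSampler._get_local_indices(total_size=10, world_size=3, rank=rank))
    print(rank, shard)
# 0 [0, 1, 2, 3]
# 1 [4, 5, 6]
# 2 [7, 8, 9]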