├── data
│   └── .gitkeep
├── .gitignore
├── caparena_metrics.py
├── caparena_auto_eval.py
├── caparena_auto_scores.py
├── cal_ranking.py
├── readme.md
└── vlm_as_a_judge.py

/data/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Ignore everything under the data/caparena_auto/ and data/eval/ directories
data/caparena_auto/*
data/eval/*

# Keep the data directory itself, even if it is empty
!data/

# Ignore .DS_Store files
.DS_Store

# Ignore Python cache files
__pycache__/
*.pyc
*.pyo
*.pyd

# Ignore virtual environments (if any)
venv/
.env

# Ignore temporary scripts
temp_script.py
metrics_vis.py

--------------------------------------------------------------------------------
/caparena_metrics.py:
--------------------------------------------------------------------------------
# Calculate caption-level agreement and model-level agreement based on metrics annotation results
# Usage: python caparena_metrics.py --eval_dir data/eval/caparena_annots_eval_gpt_ref.json
import json
from cal_ranking import calculate_elo_rankings
from vlm_as_a_judge import cal_agreement
from scipy.stats import spearmanr, kendalltau
import argparse


def cal_model_level_agreement(sorted_model_names, ranking_human=["GPT-4o-0806", "human", "Gemini-2.0-flash-exp", "InternVL2-26B", "Gemini-1.5-pro-002",
                                                                 "Claude-3.5-Sonnet-0620", "GPT-4o-mini-0718", "LLama-3.2-90B", "Qwen2-VL-72B-Instruct",
                                                                 "CogVLM2-llama3-chat-19B", "MiniCPM-V2.6-8B", "Qwen2-VL-7B-Instruct", "Qwen2-VL-2B-Instruct",
                                                                 "LLaVA-1.6-34B", "LLaVA-1.5-7B"]):
    print(f"Num models: {len(ranking_human)}")
    print("Human ranking:")
    print(ranking_human)

    if "human" in sorted_model_names:
        sorted_model_names.remove("human")
    print("Metrics ranking:")
    print(sorted_model_names)
    sorted_ranking = [i+1 for i in range(len(sorted_model_names))]  # Model ranking positions

    # Convert ranking_human to rankings
    human_ranking = [ranking_human.index(model) + 1 for model in sorted_model_names]

    # Calculate Spearman correlation coefficient
    rho, p_value = spearmanr(human_ranking, sorted_ranking)
    print(f"Spearman ρ: {rho}")

    # Calculate Kendall Tau correlation coefficient
    tau, kendall_p_value = kendalltau(human_ranking, sorted_ranking)
    print(f"Kendall Tau: {tau}")


def cal_metrics_agreement(eval_dir):
    metrics_annot = json.load(open(eval_dir, 'r'))

    # Calculate caption-level agreement
    print("Caption-level agreement:")
    cal_agreement(metrics_annot, include_tie=True, in_400=False)

    # Calculate Elo ranking
    print("Model-level agreement:")
    sorted_model_names = calculate_elo_rankings(eval_dir)
    print(sorted_model_names)

    # Calculate model-level agreement
    cal_model_level_agreement(sorted_model_names)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Calculate metrics agreement')
    parser.add_argument('--eval_dir', type=str, required=True, help='Path to JSON file containing caption evaluation candidates')
    args = parser.parse_args()
    eval_dir = args.eval_dir
    cal_metrics_agreement(eval_dir)

--------------------------------------------------------------------------------
/caparena_auto_eval.py:
-------------------------------------------------------------------------------- 1 | # Calculate CapArena-Auto Scores by performing pairwise evaluation 2 | # usage: python caparena_auto_eval.py --test_model Model-Test --result_path xxx/test_model_result.json --imgs_dir xxx/all_images 3 | import json 4 | import os 5 | import random 6 | import argparse 7 | from tqdm import tqdm 8 | from vlm_as_a_judge import mllm_judge_pairs 9 | 10 | 11 | def calculate_caparena_auto_scores(test_model, caparena_eval_dir, result_path, imgs_dir, caparena_auto_600_path): 12 | 13 | test_model_save = os.path.join(caparena_eval_dir, f"{test_model}.json") 14 | # Step1: convert the caption result to caparena_auto format 15 | caparena_auto_600 = json.load(open(caparena_auto_600_path, 'r')) 16 | test_model_data = json.load(open(result_path, 'r')) 17 | caparena_eval = [] 18 | 19 | for img_filename, img_data in tqdm(caparena_auto_600.items()): 20 | if img_data["ref_model"] == test_model: 21 | continue 22 | 23 | data_item = dict() 24 | data_item["img"] = img_filename 25 | 26 | ref_model = img_data["ref_model"] 27 | eval_model = test_model 28 | ref_caption = img_data["captions"][ref_model] 29 | eval_caption = test_model_data[img_filename] 30 | 31 | if random.randint(0, 1) == 0: 32 | data_item["source1"] = ref_model 33 | data_item["source2"] = eval_model 34 | data_item["caption1"] = ref_caption 35 | data_item["caption2"] = eval_caption 36 | else: 37 | data_item["source1"] = eval_model 38 | data_item["source2"] = ref_model 39 | data_item["caption1"] = eval_caption 40 | data_item["caption2"] = ref_caption 41 | 42 | data_item["ref"] = img_data["captions"]["human"] 43 | data_item["ref_model"] = ref_model 44 | 45 | caparena_eval.append(data_item) 46 | 47 | print(f"Num of eval for model {test_model}: {len(caparena_eval)}") 48 | json.dump(caparena_eval, open(test_model_save, "w")) 49 | 50 | # Step2: use GPT-4o-as-a-Judge to perform pairwise judgment for the model's generated results 51 | print(f"Evaluating {test_model} ...") 52 | mllm_judge_pairs( 53 | caption_eval_cand_dir=test_model_save, 54 | imgs_dir=imgs_dir, 55 | with_ref=True, 56 | cal_agree=False, 57 | eval_model_name=test_model 58 | ) 59 | 60 | def main(): 61 | # Set up argument parser 62 | parser = argparse.ArgumentParser(description="Calculate CapArena-Auto Scores and perform pairwise evaluation.") 63 | parser.add_argument('--caparena_eval_dir', type=str, default='data/caparena_auto', help="Directory to save evaluation data.") 64 | parser.add_argument('--caparena_auto_600_path', type=str, default='data/caparena_auto/caparena_auto_600.json', help="Path to the CapArena Auto 600 JSON file.") 65 | parser.add_argument('--test_model', type=str, required=True, help="The name of the model to test.") 66 | parser.add_argument('--result_path', type=str, required=True, help="Path to the result JSON file.") 67 | parser.add_argument('--imgs_dir', type=str, required=True, help="Directory containing the images.") 68 | 69 | args = parser.parse_args() 70 | 71 | calculate_caparena_auto_scores( 72 | test_model=args.test_model, 73 | caparena_eval_dir=args.caparena_eval_dir, 74 | result_path=args.result_path, 75 | imgs_dir=args.imgs_dir, 76 | caparena_auto_600_path=args.caparena_auto_600_path 77 | ) 78 | 79 | if __name__ == "__main__": 80 | main() 81 | -------------------------------------------------------------------------------- /caparena_auto_scores.py: -------------------------------------------------------------------------------- 1 | # Calculate CapArena-Auto Leaderboard scores 2 | # 
usage: python caparena_auto_scores.py --caparena_auto_dir data/caparena_auto 3 | # usage: python caparena_auto_scores.py --caparena_auto_dir data/caparena_auto --new_model_name Model-Test 4 | import json 5 | import os 6 | import argparse 7 | 8 | def calculate_caparena_scores(caparena_auto_dir, new_model_name=None): 9 | model_list_default = [ 10 | "GPT-4o-0806", "Claude-3.5-Sonnet-0620", "Gemini-1.5-pro-002", "InternVL2-26B", 11 | "Gemini-2.0-flash-exp", "Qwen2-VL-72B-Instruct", "CogVLM2-llama3-chat-19B", 12 | "GPT-4o-mini-0718", "LLama-3.2-90B", "MiniCPM-V2.6-8B", "LLaVA-1.6-34B", 13 | "Qwen2-VL-2B-Instruct", "Qwen2-VL-7B-Instruct", "LLaVA-1.5-7B", "cambrian-34b", 14 | "LLaVA-OV-72b", "Ovis-1_6-27b", "Ovis-2-34b", "Internvl2-5-8b", "Qwen2.5VL-72B", 15 | "Hunyuan-standard-vision", "GLM-4V-Plus" 16 | ] 17 | # model_list_default = [ 18 | # "GPT-4o-0806", "Claude-3.5-Sonnet-0620", "Gemini-1.5-pro-002", "InternVL2-26B", 19 | # "Gemini-2.0-flash-exp", "Qwen2-VL-72B-Instruct", "CogVLM2-llama3-chat-19B", 20 | # "GPT-4o-mini-0718", "LLama-3.2-90B", "MiniCPM-V2.6-8B", "LLaVA-1.6-34B", 21 | # "Qwen2-VL-2B-Instruct", "Qwen2-VL-7B-Instruct", "LLaVA-1.5-7B" 22 | # ] 23 | 24 | if new_model_name: 25 | new_model_list = [new_model_name] 26 | else: 27 | new_model_list = None 28 | 29 | model_list = model_list_default if new_model_list is None else model_list_default + new_model_list 30 | 31 | score_all = {} 32 | for model_name in model_list: 33 | caparena_auto_eval = json.load(open(os.path.join(caparena_auto_dir, model_name+".json"), "r")) 34 | score_refs = {"GPT-4o-0806": [0, 0], "CogVLM2-llama3-chat-19B": [0, 0], "MiniCPM-V2.6-8B": [0, 0]} 35 | caption_length = [] 36 | for data_item in caparena_auto_eval: 37 | 38 | if model_name == data_item["source1"]: 39 | caption_length.append(len(data_item["caption1"].split(' '))) 40 | else: 41 | caption_length.append(len(data_item["caption2"].split(' '))) 42 | 43 | if "judge" not in data_item: 44 | continue 45 | 46 | if data_item["judge"] not in ["Caption 1 is better.", "Caption 1 is better", "Caption 2 is better.", 47 | "Caption 2 is better", "Tie", "Tie."]: 48 | print("GPT judgment not 1 or 2 or tie") 49 | continue 50 | score_refs[data_item["ref_model"]][0] += 1 51 | 52 | if data_item["judge"] in "Caption 1 is better.": 53 | winner = data_item["source1"] 54 | elif data_item["judge"] in "Caption 2 is better.": 55 | winner = data_item["source2"] 56 | else: 57 | winner = "Tie" 58 | 59 | if winner == "Tie": 60 | pass 61 | elif winner == model_name: 62 | score_refs[data_item["ref_model"]][1] += 1 63 | else: 64 | score_refs[data_item["ref_model"]][1] -= 1 65 | 66 | avg_score = 0 67 | for k, v in score_refs.items(): 68 | score_refs[k][1] = score_refs[k][1]/2 69 | avg_score += score_refs[k][1] 70 | score_refs["Score_Avg"] = avg_score/3 71 | 72 | # Calculate average length 73 | score_refs["Length_Avg"] = sum(caption_length)/len(caption_length) 74 | 75 | score_all[model_name] = score_refs 76 | 77 | sorted_models = sorted(score_all.items(), key=lambda x: x[1]['Score_Avg'], reverse=True) 78 | 79 | print("CapArena-Auto Leaderboard:") 80 | print(f"{'Model':<30} | {'Score_avg':<8} | {'Score_gpt':<10} | {'Score_cog':<10} | {'Score_cpm':<10} | {'Length_Avg':<10} |") 81 | print("-" * 85) 82 | 83 | for model, data in sorted_models: 84 | score_gpt = data['GPT-4o-0806'][1] 85 | score_cog = data['CogVLM2-llama3-chat-19B'][1] 86 | score_cpm = data['MiniCPM-V2.6-8B'][1] 87 | 88 | # Format each line to align with columns 89 | print(f"{model:<30} | {data['Score_Avg']:<8.2f} | 
{score_gpt:<10.2f} | {score_cog:<10.2f} | {score_cpm:<10.2f} | {data['Length_Avg']:<10.2f} |") 90 | 91 | return [model for model, data in sorted_models] 92 | 93 | 94 | def main(): 95 | parser = argparse.ArgumentParser(description='Calculate CapArena Auto Scores') 96 | parser.add_argument('--caparena_auto_dir', type=str, default="data/caparena_auto", help='Directory containing CapArena auto evaluation files') 97 | parser.add_argument('--new_model_name', type=str, default=None, help='Name of new model to add to the leaderboard') 98 | args = parser.parse_args() 99 | 100 | sorted_model_names = calculate_caparena_scores(args.caparena_auto_dir, args.new_model_name) 101 | print("\nSorted model names:") 102 | print(sorted_model_names) 103 | 104 | 105 | if __name__ == "__main__": 106 | main() 107 | -------------------------------------------------------------------------------- /cal_ranking.py: -------------------------------------------------------------------------------- 1 | import json 2 | import csv 3 | from collections import defaultdict 4 | import json, math, gdown 5 | import numpy as np 6 | import pandas as pd 7 | import plotly.express as px 8 | from tqdm import tqdm 9 | 10 | pd.options.display.float_format = '{:.2f}'.format 11 | 12 | 13 | def compute_elo(battles, K=4, SCALE=400, BASE=10, INIT_RATING=1000): 14 | rating = defaultdict(lambda: INIT_RATING) 15 | 16 | for rd, source1, source2, winner in battles[['source1', 'source2', "winner_gpt"]].itertuples(): 17 | ra = rating[source1] 18 | rb = rating[source2] 19 | # if "human" in [source1, source2]: 20 | # continue 21 | ea = 1 / (1 + BASE ** ((rb - ra) / SCALE)) 22 | eb = 1 / (1 + BASE ** ((ra - rb) / SCALE)) 23 | if winner == source1: 24 | sa = 1 25 | elif winner == source2: 26 | sa = 0 27 | elif winner == "tie" or winner == "tie (bothbad)" or winner == "equal": 28 | sa = 0.5 29 | else: 30 | raise Exception(f"unexpected vote {winner}") 31 | rating[source1] += K * (sa - ea) 32 | rating[source2] += K * (1 - sa - eb) 33 | 34 | return rating 35 | 36 | 37 | def preety_print_elo_ratings(ratings): 38 | df = pd.DataFrame([ 39 | [n, ratings[n]] for n in ratings.keys() 40 | ], columns=["Model", "Elo rating"]).sort_values("Elo rating", ascending=False).reset_index(drop=True) 41 | df["Elo rating"] = (df["Elo rating"] + 0.5).astype(int) 42 | df.index = df.index + 1 43 | return df 44 | 45 | 46 | def get_bootstrap_result(battles, func_compute_elo, num_round): 47 | rows = [] 48 | for i in tqdm(range(num_round), desc="bootstrap"): 49 | rows.append(func_compute_elo(battles.sample(frac=1.0, replace=True))) 50 | df = pd.DataFrame(rows) 51 | return df[df.median().sort_values(ascending=False).index] 52 | 53 | 54 | def compute_elo_mle(df, SCALE=400, BASE=10, INIT_RATING=1000): 55 | from sklearn.linear_model import LogisticRegression 56 | models = pd.concat([df["source1"], df["source2"]]).unique() 57 | models = pd.Series(np.arange(len(models)), index=models) 58 | p = len(models.index) 59 | n = df.shape[0] 60 | 61 | X = np.zeros([n, p]) 62 | X[np.arange(n), models[df["source1"]]] = +math.log(BASE) 63 | X[np.arange(n), models[df["source2"]]] = -math.log(BASE) 64 | 65 | Y = np.zeros(n) 66 | Y[df["winner_gpt"] == df["source1"]] = 1.0 67 | 68 | lr = LogisticRegression(fit_intercept=False) 69 | lr.fit(X, Y) 70 | 71 | elo_scores = SCALE * lr.coef_[0] + INIT_RATING 72 | 73 | return pd.Series(elo_scores, index=models.index).sort_values(ascending=False) 74 | 75 | 76 | def visualize_bootstrap_scores(df, title): 77 | bars = pd.DataFrame(dict( 78 | lower=df.quantile(.025), 79 
| rating=df.quantile(.5), 80 | upper=df.quantile(.975))).reset_index(names="model").sort_values("rating", ascending=False) 81 | bars['error_y'] = bars['upper'] - bars["rating"] 82 | bars['error_y_minus'] = bars['rating'] - bars["lower"] 83 | bars['rating_rounded'] = np.round(bars['rating'], 2) 84 | fig = px.scatter(bars, x="model", y="rating", error_y="error_y", 85 | error_y_minus="error_y_minus", text="rating_rounded", 86 | title=title) 87 | print(list(bars["model"]), bars) 88 | fig.update_layout(xaxis_title="Model", yaxis_title="Rating") 89 | return fig 90 | 91 | 92 | def calculate_elo_rankings(json_path): 93 | # Read JSON file 94 | with open(json_path, 'r', encoding='utf-8') as json_file: 95 | data = json.load(json_file) # data is a list of dictionaries 96 | 97 | result = [] 98 | 99 | for item in data: 100 | if "human" in [item["source1"], item["source2"]]: 101 | continue 102 | if "judge" not in item: 103 | continue 104 | if "1" in item["judge"]: 105 | item["winner_gpt"] = item["source1"] 106 | elif "2" in item["judge"]: 107 | item["winner_gpt"] = item["source2"] 108 | else: 109 | item["winner_gpt"] = "equal" 110 | result.append(item) 111 | 112 | # Convert JSON data to DataFrame 113 | df = pd.DataFrame(result) 114 | 115 | # Extract required columns 116 | battles = df[['img', 'source1', 'source2', 'winner_gpt']] 117 | 118 | # # Save as CSV file 119 | # battles.to_csv('check.csv', index=False, encoding='utf-8') 120 | 121 | # Calculate ELO scores 122 | elo_ratings = compute_elo(battles) 123 | preety_print_elo_ratings(elo_ratings) 124 | 125 | # Calculate bootstrap results 126 | BOOTSTRAP_ROUNDS = 1000 127 | np.random.seed(42) 128 | bootstrap_elo_lu = get_bootstrap_result(battles, compute_elo, BOOTSTRAP_ROUNDS) 129 | bootstrap_lu_median = bootstrap_elo_lu.median().reset_index().set_axis(["model", "Elo rating"], axis=1) 130 | bootstrap_lu_median["Elo rating"] = (bootstrap_lu_median["Elo rating"] + 0.5).astype(int) 131 | 132 | print("Elo ranking by metrics:") 133 | print(bootstrap_lu_median) 134 | 135 | return list(bootstrap_lu_median["model"]) 136 | 137 | 138 | if __name__ == "__main__": 139 | sorted_model_names = calculate_elo_rankings('data/eval/caparena_annots_eval_qwen25vl72b.json') 140 | print(sorted_model_names) 141 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # CapArena: Benchmarking and Analyzing Detailed Image Captioning in the LLM Era 2 | 3 | [![arXiv](https://img.shields.io/badge/arXiv-2503.12329-b31b1b.svg)](https://arxiv.org/abs/2503.12329) 4 | [![project](https://img.shields.io/badge/Project-Page-blue?logo=github)](https://caparena.github.io/) 5 | [![huggingface](https://img.shields.io/badge/🤗%20HF-Leaderboard-orange)](https://huggingface.co/spaces/yan111222/CapArena_Auto) 6 | [![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://GitHub.com/Naereen/StrapDown.js/graphs/commit-activity) 7 | [![PR's Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg?style=flat)](http://makeapullrequest.com) 8 | 9 | This repository contains the data, code, and resources for the paper: **CapArena: Benchmarking and Analyzing Detailed Image Captioning in the LLM Era** 10 | [\[Arxiv Link\]](https://arxiv.org/abs/2503.12329) 11 | [\[Project Page\]](https://caparena.github.io/) 12 | 13 | News: This paper was accepted by ACL 2025 findings. 

Release Plans:

- [x] Usage of *CapArena-Auto*
- [x] Code and data to reproduce the results in the paper
- [x] Other resources


***
## 🏆 CapArena-Auto Benchmark

*CapArena-Auto* is an arena-style automated evaluation benchmark for detailed image captioning. It features:
- 600 evaluation images
- Pairwise battles against three baseline models
- GPT-4o-as-a-Judge scoring system
- High correlation with human rankings

**Current Leaderboard**: [🤗 CapArena-Auto Leaderboard](https://huggingface.co/spaces/yan111222/CapArena_Auto) (22 models evaluated)

---

## 🔍 Evaluation Steps

### Evaluate Your Own Model on CapArena-Auto

#### 📥 Step 1: Download Required Files
1. Download the [evaluation images](https://box.nju.edu.cn/f/a79c42c9c10e4acb83e7/) (600 images)
   - Contains all 600 images used for CapArena-Auto evaluation
   - These are the test images your model will need to caption
   - File structure: `all_images/` folder with `test_XXXXX.jpg` files

2. Download the [result template](https://box.nju.edu.cn/f/43eb761488734c638824/)
   - Example JSON file showing the required output format
   - Ensures compatibility with our evaluation scripts
3. Download [human reference captions & existing model results](https://box.nju.edu.cn/f/707c01ccdb724d2f925f/)
   - Contains two critical components:
     - `caparena_auto_600.json`:
       - Human-annotated reference captions for all 600 images
       - Structure: `{"image_id": ..., "captions": {"human": ..., "gpt": "...", "cog": "...", "cpm": "..."}, "ref_model": ...}`
     - Leaderboard models' results (e.g., `GPT-4o-0806.json`, `Claude-3.5-Sonnet-0620.json`, etc.)
       - Pre-computed pairwise battle results for current models in our leaderboard (used for results visualization)

#### 🗂️ Step 2: Prepare Your Environment
Create the following directory structure:
```
data/
└── caparena_auto/
    ├── GPT-4o-0806.json
    ├── Claude-3.5-Sonnet-0620.json
    ├── CogVLM2-llama3-chat-19B.json
    └── ...
```

#### 📊 Step 3: Generate Captions
Generate detailed captions for all 600 images using your model (a minimal generation sketch is shown after Step 5). Format your results as:
```json
{
    "test_01258.jpg": "The image features a cle… day with good weather.",
    "test_04765.jpg": "The image shows a small,…on the brick structure.",
    "test_02788.jpg": "The scene depicts a pair… and recycling efforts.",
    "test_02765.jpg": "The photo captures a str…al beauty to the scene.",
    ...
}
```

#### ⚖️ Step 4: Run Evaluation (Cost: ~$4)
1. Set your OpenAI API key:
```bash
export OPENAI_API_KEY="sk-xxxx"
```
2. Run the evaluation (`YourModelName` is any name you choose for your model):
```bash
python caparena_auto_eval.py \
    --test_model YourModelName \
    --result_path path/to/your_results.json \
    --imgs_dir path/to/images
```
#### 🏅 Step 5: View Leaderboard Results
```bash
python caparena_auto_scores.py \
    --caparena_auto_dir data/caparena_auto \
    --new_model_name YourModelName
```
> Note: If you would like to submit your results to the [online leaderboard](https://huggingface.co/spaces/yan111222/CapArena_Auto), please raise an issue or contact us!
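For Step 3, the captioning call itself depends on your model. The sketch below is a minimal example of producing the required results JSON; `generate_caption` is a hypothetical placeholder for your own model call, and the paths are placeholders as well:

```python
import json
import os

def generate_caption(image_path: str) -> str:
    # Placeholder: replace this with a call to your own captioning model.
    return "A detailed description of the image."

imgs_dir = "path/to/all_images"  # the 600 downloaded evaluation images
results = {}
for filename in sorted(os.listdir(imgs_dir)):
    if filename.endswith(".jpg"):
        # Keys are image filenames (e.g., "test_01258.jpg"); values are detailed captions.
        results[filename] = generate_caption(os.path.join(imgs_dir, filename))

with open("path/to/your_results.json", "w") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
```

Running Step 5 prints a leaderboard in the format shown below.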
111 | 112 | 113 | 114 | ``` 115 | Model | Score_avg | Score_gpt | Score_cog | Score_cpm | Length_Avg | 116 | ------------------------------------------------------------------------------------- 117 | Gemini-1.5-pro-002 | 56.17 | 29.00 | 61.00 | 78.50 | 168.56 | 118 | GPT-4o-0806 | 44.00 | 0.00 | 55.50 | 76.50 | 115.80 | 119 | Qwen2.5VL-72B | 35.33 | -1.00 | 49.00 | 58.00 | 163.67 | 120 | Gemini-2.0-flash-exp | 30.83 | -2.00 | 39.50 | 55.00 | 416.99 | 121 | Ovis-2-34b | 27.00 | -15.00 | 33.50 | 62.50 | 120.20 | 122 | Claude-3.5-Sonnet-0620 | 21.50 | -14.00 | 30.00 | 48.50 | 147.93 | 123 | InternVL2-26B | 13.00 | -38.50 | 20.00 | 57.50 | 236.32 | 124 | GPT-4o-mini-0718 | 9.33 | -36.00 | 17.00 | 47.00 | 139.83 | 125 | Ovis-1_6-27b | 3.00 | -49.50 | 14.50 | 44.00 | 94.16 | 126 | GLM-4V-Plus | -0.17 | -51.50 | 13.00 | 38.00 | 109.27 | 127 | CogVLM2-llama3-chat-19B | -8.50 | -56.50 | 0.00 | 31.00 | 115.87 | 128 | Qwen2-VL-72B-Instruct | -9.00 | -50.50 | -4.50 | 28.00 | 114.45 | 129 | LLaVA-OV-72b | -12.33 | -57.50 | -6.00 | 26.50 | 200.88 | 130 | LLama-3.2-90B | -25.67 | -72.00 | -13.00 | 8.00 | 160.25 | 131 | Hunyuan-standard-vision | -26.00 | -63.00 | -19.00 | 4.00 | 354.10 | 132 | Internvl2-5-8b | -29.83 | -71.00 | -29.00 | 10.50 | 117.77 | 133 | MiniCPM-V2.6-8B | -38.00 | -80.00 | -34.00 | 0.00 | 106.74 | 134 | Qwen2-VL-2B-Instruct | -48.67 | -86.00 | -49.50 | -10.50 | 116.84 | 135 | Qwen2-VL-7B-Instruct | -49.00 | -78.00 | -59.00 | -10.00 | 97.81 | 136 | LLaVA-1.6-34B | -67.50 | -92.00 | -53.50 | -57.00 | 124.81 | 137 | cambrian-34b | -75.00 | -93.00 | -76.00 | -56.00 | 120.23 | 138 | LLaVA-1.5-7B | -94.00 | -99.50 | -92.00 | -90.50 | 74.38 | 139 | ``` 140 | 141 | 142 | *** 143 | ### Reproduce Paper Results 144 | 145 | #### 📥 Step 1: Download Annotation Data 146 | 147 | Download the [human pair-wise battle annotation](https://box.nju.edu.cn/f/0fd0a0d3dce243ab8c12/) of *CapArena* and put them under `data/eval`. 148 | 149 | ``` 150 | data/ 151 | └── eval/ 152 | ├── caparena_annots_eval.json 153 | ├── caparena_annots_eval_gpt_ref.json 154 | ├── caparena_annots_eval_gpt.json 155 | └── ... 156 | ``` 157 | 158 | `caparena_annots_eval.json` is the human annotation results of *CapArena*, which contains 6523 pair-wise battle/judgment given by our human annotators. 159 | 160 | Other files are the results of the annotation of these 6523 pairs by captioning metrics (e.g., GPT-4o, GPT-4o with ref, LLaVA-OneVision). Each item in these files include a `judge` key to represent the judgment given by the metric. 161 | 162 | #### 🎯 Step 2: Calculate the caption-level agreement and model-level agreement 163 | 164 | Calculate caption-level agreement and model-level agreement based on metrics annotation results: 165 | 166 | ```bash 167 | python caparena_metrics.py \ 168 | --eval_dir data/eval/caparena_annots_eval_gpt_ref.json 169 | ``` 170 | 171 | ### ⚖️ VLM-as-a-Judge 172 | The above provides the VLM-as-a-Judge results that we have generated. 173 | If you want to reproduce our VLM-as-a-Judge process, first download the total [5100 images](https://box.nju.edu.cn/f/9d2b9ded47d54999926c/) from DOCCI. 
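Before running the judge, it can help to confirm that every image referenced in the annotation file is present in your image directory. A minimal sanity-check sketch (assuming the annotation file downloaded in Step 1 above and an `imgs_dir` pointing at the unpacked DOCCI images):

```python
import json
import os

imgs_dir = "xxx/images"  # wherever you unpacked the 5100 DOCCI images
annots = json.load(open("data/eval/caparena_annots_eval.json", "r"))

# Each annotated battle stores its image filename under the "img" key.
missing = {item["img"] for item in annots
           if not os.path.exists(os.path.join(imgs_dir, item["img"]))}
print(f"{len(annots)} annotated pairs, {len(missing)} referenced images missing")
```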
174 | Then you can conduct GPT-4o-as-a-Judge by: 175 | ``` 176 | python vlm_as_a_judge.py --caption_eval_cand_dir data/eval/caparena_annots_eval.json --eval_save_path data/eval/caparena_annots_eval_gpt_ref.json --imgs_dir xxx/images 177 | ``` 178 | #### 📊 Calculating Additional Metrics with Human References 179 | 180 | If you need human-annotated references for calculating traditional metrics (e.g., BLEU, CIDEr, SPICE), you can obtain the DOCCI human descriptions from: 181 | [docci_descriptions](https://storage.googleapis.com/docci/data/docci_descriptions.jsonlines) 182 | 183 | *** 184 | ### Acknowledge 185 | 186 | Thanks to [DOCCI](https://google.github.io/docci/) for their high-quality human annotation and wonderful open-sourced work. 187 | 188 | Thanks to all the annotators who participated in compiling our CapArena dataset. 189 | 190 | *** 191 | ### Citation 192 | If you find this work helpful, please consider to star 🌟 this repo and cite our paper. 193 | ``` 194 | @article{cheng2025caparena, 195 | title={CapArena: Benchmarking and Analyzing Detailed Image Captioning in the LLM Era}, 196 | author={Cheng, Kanzhi and Song, Wenpo and Fan, Jiaxin and Ma, Zheng and Sun, Qiushi and Xu, Fangzhi and Yan, Chenyang and Chen, Nuo and Zhang, Jianbing and Chen, Jiajun}, 197 | journal={arXiv preprint arXiv:2503.12329}, 198 | year={2025} 199 | } 200 | ``` -------------------------------------------------------------------------------- /vlm_as_a_judge.py: -------------------------------------------------------------------------------- 1 | # usage: python vlm_as_a_judge.py --caption_eval_cand_dir data/eval/caparena_annots_eval.json --eval_save_path data/eval/caparena_annots_eval_gpt_ref.json --imgs_dir data/eval/images 2 | import os 3 | import time 4 | import json 5 | import requests 6 | import base64 7 | from tqdm import tqdm 8 | import re 9 | import argparse 10 | import shutil 11 | 12 | 13 | # Helper functions for image encoding 14 | def encode_image(image_content): 15 | return base64.b64encode(image_content).decode('utf-8') 16 | 17 | 18 | def convert_image_to_base64(image_path): 19 | with open(image_path, 'rb') as f: 20 | image_bytes = f.read() 21 | return encode_image(image_bytes) 22 | 23 | 24 | # Function to call OpenAI API 25 | def call_llm(model_name, payload): 26 | headers = { 27 | "Content-Type": "application/json", 28 | "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}" 29 | } 30 | print("Generating content with GPT model: {}".format(model_name)) 31 | response = requests.post( 32 | "https://api.openai.com/v1/chat/completions", 33 | headers=headers, 34 | json={**payload, "temperature": 0.1} 35 | ) 36 | if response.status_code != 200: 37 | if response.json()['error']['code'] == "context_length_exceeded": 38 | print("Context length exceeded. 
Retrying with a smaller context.") 39 | payload["messages"] = [payload["messages"][0]] + payload["messages"][-1:] 40 | retry_response = requests.post( 41 | "https://api.openai.com/v1/chat/completions", 42 | headers=headers, 43 | json={**payload, "temperature": 0.1} 44 | ) 45 | if retry_response.status_code != 200: 46 | print( 47 | "Failed to call LLM even after attempt on shortening the history: " + retry_response.text) 48 | return "" 49 | 50 | print("Failed to call LLM: " + response.text) 51 | time.sleep(2) 52 | return "" 53 | else: 54 | return response.json()['choices'][0]['message']['content'] 55 | 56 | 57 | # Calculate agreement between model predictions and human judgments 58 | def cal_agreement(caption_eval_pair_list, include_tie=False, in_400=False): 59 | agreement_level = {"overall": [], "level 1": [], "level 2": [], "level 3": [], "level 4": []} 60 | tie_num = {"level 1": 0, "level 2": 0, "level 3": 0, "level 4": 0} 61 | for item in caption_eval_pair_list: 62 | if "judge" not in item: 63 | continue 64 | 65 | if item["source1"] == "human" or item["source2"] == "human": 66 | continue 67 | 68 | if in_400 == True: 69 | if not item["in-400"]: 70 | continue 71 | 72 | if item["judge"] not in ["Caption 1 is better.", "Caption 1 is better", "Caption 2 is better.", "Caption 2 is better", "Tie", "Tie."]: 73 | print("GPT judgment not 1 or 2") 74 | continue 75 | 76 | if item["winner"] == item["source1"]: 77 | judge_human = "Caption 1 is better." 78 | elif item["winner"] == item["source2"]: 79 | judge_human = "Caption 2 is better." 80 | else: 81 | judge_human = "Tie." 82 | 83 | if not include_tie: 84 | if judge_human != "Tie." and item["judge"] != "Tie." and item["judge"] != "Tie": 85 | agree = 1 if item["judge"] in judge_human else 0 86 | agreement_level["overall"].append(agree) 87 | agreement_level[item["cluster"]].append(agree) 88 | else: 89 | agree = 1 if item["judge"] in judge_human else 0 90 | agreement_level["overall"].append(agree) 91 | agreement_level[item["cluster"]].append(agree) 92 | 93 | if item["judge"] == "Tie" or item["judge"] == "Tie.": 94 | tie_num[item["cluster"]] += 1 95 | 96 | overall = sum(agreement_level["overall"]) / len(agreement_level["overall"]) if len( 97 | agreement_level["overall"]) > 0 else None 98 | level1 = sum(agreement_level["level 1"]) / len(agreement_level["level 1"]) if len( 99 | agreement_level["level 1"]) > 0 else None 100 | level2 = sum(agreement_level["level 2"]) / len(agreement_level["level 2"]) if len( 101 | agreement_level["level 2"]) > 0 else None 102 | level3 = sum(agreement_level["level 3"]) / len(agreement_level["level 3"]) if len( 103 | agreement_level["level 3"]) > 0 else None 104 | level4 = sum(agreement_level["level 4"]) / len(agreement_level["level 4"]) if len( 105 | agreement_level["level 4"]) > 0 else None 106 | overall_num = len(agreement_level["overall"]) 107 | level1_num = len(agreement_level["level 1"]) 108 | level2_num = len(agreement_level["level 2"]) 109 | level3_num = len(agreement_level["level 3"]) 110 | level4_num = len(agreement_level["level 4"]) 111 | 112 | result = ( 113 | f"Overall: {overall if overall is None else f'{overall:.3f}'} ({overall_num}), " 114 | f"Level 1: {level1 if level1 is None else f'{level1:.3f}'} ({level1_num}), " 115 | f"Level 2: {level2 if level2 is None else f'{level2:.3f}'} ({level2_num}), " 116 | f"Level 3: {level3 if level3 is None else f'{level3:.3f}'} ({level3_num}), " 117 | f"Level 4: {level4 if level4 is None else f'{level4:.3f}'} ({level4_num})" 118 | ) 119 | print(result) 120 | 121 | if 
include_tie: 122 | level1_num = tie_num["level 1"] 123 | level2_num = tie_num["level 2"] 124 | level3_num = tie_num["level 3"] 125 | level4_num = tie_num["level 4"] 126 | result_tie_num = f"Level 1: {level1_num}, Level 2: {level2_num}, Level 3: {level3_num}, Level 4: {level4_num}" 127 | print(result_tie_num) 128 | 129 | 130 | system_prompt_without_ref = """ 131 | You are a highly capable multimodal AI assistant tasked with evaluating image captions. 132 | 133 | Given an image and two candidate captions, you are require to determine which of the two captions is better. 134 | 135 | Below are some guidelines for your reference: 136 | 137 | 1. **Precision**: The caption should accurately correspond to the content of the image, providing precise information about it. Common examples of imprecision include errors in color, quantity, spatial relationships, or the posture of people. 138 | 139 | 2. **Informativeness**: Salient information in the image should be reflected in the caption. Since it is impossible to include every detail, you will need to subjectively judge which aspects of the image are important. For instance, describing an otter as "a small animal" is precise, but it is less informative than specifying "an otter". 140 | 141 | 3. **Hallucination**: Captions that include descriptions of objects or elements that are clearly absent from the image should be significantly penalized. 142 | 143 | 4. **Attention to detail**: Annotators should pay close attention to the details in the image to distinguish the quality of the descriptions. 144 | 145 | 5. **Assistive description**: Imagine a visually impaired person asking you to describe the image for them. How would you convey the image to them? 146 | 147 | 6. **Reverse thinking**: What image does the caption lead us to imagine? Does the caption effectively lead you to imagine the intended image? 148 | 149 | 7. **Ties are acceptable**: If you find it genuinely difficult to determine which caption is better (e.g., both captions are excellent), marking a tie is acceptable. 150 | 151 | While the above guidelines provide a framework, they cannot cover all possible cases. Therefore, we encourage you to make **subjective judgments** based on the specific circumstances and your own reasoning about which caption is better. 152 | 153 | ### Response Format: 154 | Format your response into two lines as shown below: 155 | Reason: 156 | Judgment: // 157 | """ 158 | 159 | system_prompt_with_ref = """ 160 | You are a highly capable multimodal AI assistant tasked with evaluating image captions. 161 | 162 | Given an image, two candidate captions and one reference caption annotated by human expert, you are require to determine which of the two captions is better. 163 | 164 | Below are some guidelines for your reference: 165 | 166 | 1. **Precision**: The caption should accurately correspond to the content of the image, providing precise information about it. Common examples of imprecision include errors in color, quantity, spatial relationships, or the posture of people. 167 | 168 | 2. **Informativeness**: Salient information in the image should be reflected in the caption. Since it is impossible to include every detail, you will need to subjectively judge which aspects of the image are important. For instance, describing an otter as "a small animal" is precise, but it is less informative than specifying "an otter". 169 | 170 | 3. 
**Hallucination**: Captions that include descriptions of objects or elements that are clearly absent from the image should be significantly penalized. 171 | 172 | 4. **Attention to detail**: Annotators should pay close attention to the details in the image to distinguish the quality of the descriptions. 173 | 174 | 5. **Assistive description**: Imagine a visually impaired person asking you to describe the image for them. How would you convey the image to them? 175 | 176 | 6. **Reverse thinking**: What image does the caption lead us to imagine? Does the caption effectively lead you to imagine the intended image? 177 | 178 | 7. **Ties are acceptable**: If you find it genuinely difficult to determine which caption is better (e.g., both captions are excellent), marking a tie is acceptable. 179 | 180 | While the above guidelines provide a framework, they cannot cover all possible cases. Therefore, we encourage you to make **subjective judgments** based on the specific circumstances and your own reasoning about which caption is better. 181 | 182 | **Reference caption**: The reference caption is annotated by a human expert. When you're uncertain about which description is better (e.g., when unsure about specific details in the image), you can use the reference caption to assist your judgment. The content in the reference caption can be considered correct; however, it is not perfect, and descriptions not included in the reference caption can still be reasonable. 183 | 184 | ### Response Format: 185 | Format your response into two lines as shown below: 186 | Reason: 187 | Judgment: // 188 | """ 189 | 190 | def mllm_judge_pairs(caption_eval_cand_dir, imgs_dir, with_ref=True, cal_agree=True, eval_model_name=None): 191 | 192 | caption_eval_cand = json.load(open(caption_eval_cand_dir, 'r')) 193 | print(f"Num of All Caption Pair: {len(caption_eval_cand)}") 194 | 195 | for i, item in tqdm(enumerate(caption_eval_cand)): 196 | 197 | if "judge" in item: 198 | print("processed") 199 | continue 200 | 201 | if i % 20 == 0: 202 | json.dump(caption_eval_cand, open(caption_eval_cand_dir, 'w')) 203 | 204 | if item["source1"] == "human" or item["source2"] == "human": 205 | continue 206 | 207 | # if not item["in-400"]: 208 | # continue 209 | 210 | img_filename = item["img"] 211 | img_path = os.path.join(imgs_dir, img_filename) 212 | if not os.path.exists(img_path): 213 | print("img not exist") 214 | image = convert_image_to_base64(img_path) 215 | 216 | caption_1 = item["caption1"] 217 | caption_2 = item["caption2"] 218 | caption_ref = item["ref"] 219 | 220 | if with_ref: 221 | compare_prompt = f"Caption 1:\n{caption_1}\nCaption 2:\n{caption_2}\nCaption Reference:\n{caption_ref}\nDetermine which is better and answer with the given format. Only mark a tie if it is truly difficult to decide which caption is better based on their quality, informativeness, and precision." 222 | else: 223 | compare_prompt = f"Caption 1:\n{caption_1}\nCaption 2:\n{caption_2}\nDetermine which is better and answer with the given format. Only mark a tie if there is no discernible difference in quality, informativeness, and precision after careful evaluation." 
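        # Build the OpenAI-style chat payload: a system message carrying the judging
        # guidelines (with or without the reference caption), followed by a user
        # message containing the base64-encoded image and the comparison prompt above.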
224 | 225 | messages = [] 226 | 227 | messages.append({ 228 | "role": "system", 229 | "content": [ 230 | { 231 | "type": "text", 232 | "text": system_prompt_with_ref if with_ref else system_prompt_without_ref 233 | }, 234 | ] 235 | }) 236 | 237 | action_text_image = [] 238 | action_text_image.append( 239 | { 240 | "type": "image_url", 241 | "image_url": { 242 | "url": f"data:image/png;base64,{image}", 243 | "detail": "high" 244 | } 245 | } 246 | ) 247 | action_text_image.append( 248 | { 249 | "type": "text", 250 | "text": compare_prompt 251 | } 252 | ) 253 | 254 | messages.append({ 255 | "role": "user", 256 | "content": action_text_image 257 | }) 258 | 259 | print(compare_prompt) 260 | 261 | model_name = "gpt-4o-2024-08-06" 262 | try_num = 0 263 | while try_num < 5: 264 | try_num += 1 265 | try: 266 | response = call_llm(model_name, { 267 | "model": model_name, 268 | "messages": messages, 269 | "max_tokens": 1500, 270 | "top_p": 0.9, 271 | "temperature": 0.5 272 | }) 273 | except: 274 | print("error call") 275 | time.sleep(1.0) 276 | continue 277 | try: 278 | print(response) 279 | reason_match = re.search(r"Reason:\s*(.+?)\s*Judgment:", response, re.DOTALL) 280 | judge_match = re.search(r"Judgment:\s*(.+)", response) 281 | reason = reason_match.group(1).strip() if reason_match else None 282 | judgment = judge_match.group(1).strip() if judge_match else None 283 | 284 | if reason and judgment: 285 | item["judge_reason"] = response 286 | item["judge"] = judgment 287 | break 288 | else: 289 | print("Invalid response format, retrying...") 290 | time.sleep(1.0) 291 | 292 | except json.JSONDecodeError: 293 | # If response is not valid JSON, continue generating 294 | print("Invalid response received, retrying...") 295 | time.sleep(1.0) 296 | 297 | num_processed = len([item for item in caption_eval_cand if ("judge" in item)]) 298 | if eval_model_name != None: 299 | print("Eval Model: {} Num of total: {} Num of success: {}".format(eval_model_name, len(caption_eval_cand), num_processed)) 300 | else: 301 | print("Num of total: {} Num of success: {}".format(len(caption_eval_cand), num_processed)) 302 | 303 | if cal_agree: 304 | cal_agreement(caption_eval_cand, include_tie=True) 305 | 306 | json.dump(caption_eval_cand, open(caption_eval_cand_dir, 'w')) 307 | print("Done") 308 | 309 | def main(): 310 | parser = argparse.ArgumentParser(description='Evaluate the quality of image captions') 311 | parser.add_argument('--caption_eval_cand_dir', type=str, required=True, help='Path to JSON file containing caption evaluation candidates') 312 | parser.add_argument('--eval_save_path', type=str, required=True, help='Path to save evaluation results') 313 | parser.add_argument('--imgs_dir', type=str, required=True, help='Path to directory containing images') 314 | parser.add_argument('--with_ref', type=bool, default=True, help='Whether to use reference captions for evaluation') 315 | parser.add_argument('--cal_agree', type=bool, default=True, help='Whether to calculate agreement') 316 | parser.add_argument('--eval_model_name', type=str, default=None, help='Name of evaluation model') 317 | 318 | args = parser.parse_args() 319 | # Copy original evaluation file to new save path 320 | shutil.copy(args.caption_eval_cand_dir, args.eval_save_path) 321 | print(f"Evaluation file copied to: {args.eval_save_path}") 322 | 323 | mllm_judge_pairs( 324 | caption_eval_cand_dir=args.eval_save_path, 325 | imgs_dir=args.imgs_dir, 326 | with_ref=args.with_ref, 327 | cal_agree=args.cal_agree, 328 | eval_model_name=args.eval_model_name 
329 | ) 330 | 331 | if __name__ == "__main__": 332 | main() --------------------------------------------------------------------------------