├── data
│   └── .gitkeep
├── .gitignore
├── caparena_metrics.py
├── caparena_auto_eval.py
├── caparena_auto_scores.py
├── cal_ranking.py
├── readme.md
└── vlm_as_a_judge.py

/data/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Ignore everything under the data/caparena_auto/ and data/eval/ directories
data/caparena_auto/*
data/eval/*

# Keep the data directory itself, even if it is empty
!data/

# Ignore .DS_Store files
.DS_Store

# Ignore Python cache files
__pycache__/
*.pyc
*.pyo
*.pyd

# Ignore virtual environments (if any)
venv/
.env

# Ignore temporary scripts
temp_script.py
metrics_vis.py

--------------------------------------------------------------------------------
/caparena_metrics.py:
--------------------------------------------------------------------------------
# Calculate caption-level agreement and model-level agreement based on metrics annotation results
# Usage: python caparena_metrics.py --eval_dir data/eval/caparena_annots_eval_gpt_ref.json
import json
from cal_ranking import calculate_elo_rankings
from vlm_as_a_judge import cal_agreement
from scipy.stats import spearmanr, kendalltau
import argparse


def cal_model_level_agreement(sorted_model_names, ranking_human=["GPT-4o-0806", "human", "Gemini-2.0-flash-exp", "InternVL2-26B", "Gemini-1.5-pro-002",
                                                                 "Claude-3.5-Sonnet-0620", "GPT-4o-mini-0718", "LLama-3.2-90B", "Qwen2-VL-72B-Instruct",
                                                                 "CogVLM2-llama3-chat-19B", "MiniCPM-V2.6-8B", "Qwen2-VL-7B-Instruct", "Qwen2-VL-2B-Instruct",
                                                                 "LLaVA-1.6-34B", "LLaVA-1.5-7B"]):
    print(f"Num models: {len(ranking_human)}")
    print("Human ranking:")
    print(ranking_human)

    if "human" in sorted_model_names:
        sorted_model_names.remove("human")
    print("Metrics ranking:")
    print(sorted_model_names)
    sorted_ranking = [i+1 for i in range(len(sorted_model_names))]  # Model ranking positions

    # Convert ranking_human to rankings
    human_ranking = [ranking_human.index(model) + 1 for model in sorted_model_names]

    # Calculate Spearman correlation coefficient
    rho, p_value = spearmanr(human_ranking, sorted_ranking)
    print(f"Spearman ρ: {rho}")

    # Calculate Kendall Tau correlation coefficient
    tau, kendall_p_value = kendalltau(human_ranking, sorted_ranking)
    print(f"Kendall Tau: {tau}")


def cal_metrics_agreement(eval_dir):
    metrics_annot = json.load(open(eval_dir, 'r'))

    # Calculate caption-level agreement
    print("Caption-level agreement:")
    cal_agreement(metrics_annot, include_tie=True, in_400=False)

    # Calculate Elo ranking
    print("Model-level agreement:")
    sorted_model_names = calculate_elo_rankings(eval_dir)
    print(sorted_model_names)

    # Calculate model-level agreement
    cal_model_level_agreement(sorted_model_names)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Calculate metrics agreement')
    parser.add_argument('--eval_dir', type=str, required=True, help='Path to JSON file containing caption evaluation candidates')
    args = parser.parse_args()
    eval_dir = args.eval_dir
    cal_metrics_agreement(eval_dir)

--------------------------------------------------------------------------------
/caparena_auto_eval.py:
-------------------------------------------------------------------------------- 1 | # Calculate CapArena-Auto Scores by performing pairwise evaluation 2 | # usage: python caparena_auto_eval.py --test_model Model-Test --result_path xxx/test_model_result.json --imgs_dir xxx/all_images 3 | import json 4 | import os 5 | import random 6 | import argparse 7 | from tqdm import tqdm 8 | from vlm_as_a_judge import mllm_judge_pairs 9 | 10 | 11 | def calculate_caparena_auto_scores(test_model, caparena_eval_dir, result_path, imgs_dir, caparena_auto_600_path): 12 | 13 | test_model_save = os.path.join(caparena_eval_dir, f"{test_model}.json") 14 | # Step1: convert the caption result to caparena_auto format 15 | caparena_auto_600 = json.load(open(caparena_auto_600_path, 'r')) 16 | test_model_data = json.load(open(result_path, 'r')) 17 | caparena_eval = [] 18 | 19 | for img_filename, img_data in tqdm(caparena_auto_600.items()): 20 | if img_data["ref_model"] == test_model: 21 | continue 22 | 23 | data_item = dict() 24 | data_item["img"] = img_filename 25 | 26 | ref_model = img_data["ref_model"] 27 | eval_model = test_model 28 | ref_caption = img_data["captions"][ref_model] 29 | eval_caption = test_model_data[img_filename] 30 | 31 | if random.randint(0, 1) == 0: 32 | data_item["source1"] = ref_model 33 | data_item["source2"] = eval_model 34 | data_item["caption1"] = ref_caption 35 | data_item["caption2"] = eval_caption 36 | else: 37 | data_item["source1"] = eval_model 38 | data_item["source2"] = ref_model 39 | data_item["caption1"] = eval_caption 40 | data_item["caption2"] = ref_caption 41 | 42 | data_item["ref"] = img_data["captions"]["human"] 43 | data_item["ref_model"] = ref_model 44 | 45 | caparena_eval.append(data_item) 46 | 47 | print(f"Num of eval for model {test_model}: {len(caparena_eval)}") 48 | json.dump(caparena_eval, open(test_model_save, "w")) 49 | 50 | # Step2: use GPT-4o-as-a-Judge to perform pairwise judgment for the model's generated results 51 | print(f"Evaluating {test_model} ...") 52 | mllm_judge_pairs( 53 | caption_eval_cand_dir=test_model_save, 54 | imgs_dir=imgs_dir, 55 | with_ref=True, 56 | cal_agree=False, 57 | eval_model_name=test_model 58 | ) 59 | 60 | def main(): 61 | # Set up argument parser 62 | parser = argparse.ArgumentParser(description="Calculate CapArena-Auto Scores and perform pairwise evaluation.") 63 | parser.add_argument('--caparena_eval_dir', type=str, default='data/caparena_auto', help="Directory to save evaluation data.") 64 | parser.add_argument('--caparena_auto_600_path', type=str, default='data/caparena_auto/caparena_auto_600.json', help="Path to the CapArena Auto 600 JSON file.") 65 | parser.add_argument('--test_model', type=str, required=True, help="The name of the model to test.") 66 | parser.add_argument('--result_path', type=str, required=True, help="Path to the result JSON file.") 67 | parser.add_argument('--imgs_dir', type=str, required=True, help="Directory containing the images.") 68 | 69 | args = parser.parse_args() 70 | 71 | calculate_caparena_auto_scores( 72 | test_model=args.test_model, 73 | caparena_eval_dir=args.caparena_eval_dir, 74 | result_path=args.result_path, 75 | imgs_dir=args.imgs_dir, 76 | caparena_auto_600_path=args.caparena_auto_600_path 77 | ) 78 | 79 | if __name__ == "__main__": 80 | main() 81 | -------------------------------------------------------------------------------- /caparena_auto_scores.py: -------------------------------------------------------------------------------- 1 | # Calculate CapArena-Auto Leaderboard scores 2 | # 
usage: python caparena_auto_scores.py --caparena_auto_dir data/caparena_auto 3 | # usage: python caparena_auto_scores.py --caparena_auto_dir data/caparena_auto --new_model_name Model-Test 4 | import json 5 | import os 6 | import argparse 7 | 8 | def calculate_caparena_scores(caparena_auto_dir, new_model_name=None): 9 | model_list_default = [ 10 | "GPT-4o-0806", "Claude-3.5-Sonnet-0620", "Gemini-1.5-pro-002", "InternVL2-26B", 11 | "Gemini-2.0-flash-exp", "Qwen2-VL-72B-Instruct", "CogVLM2-llama3-chat-19B", 12 | "GPT-4o-mini-0718", "LLama-3.2-90B", "MiniCPM-V2.6-8B", "LLaVA-1.6-34B", 13 | "Qwen2-VL-2B-Instruct", "Qwen2-VL-7B-Instruct", "LLaVA-1.5-7B", "cambrian-34b", 14 | "LLaVA-OV-72b", "Ovis-1_6-27b", "Ovis-2-34b", "Internvl2-5-8b", "Qwen2.5VL-72B", 15 | "Hunyuan-standard-vision", "GLM-4V-Plus" 16 | ] 17 | # model_list_default = [ 18 | # "GPT-4o-0806", "Claude-3.5-Sonnet-0620", "Gemini-1.5-pro-002", "InternVL2-26B", 19 | # "Gemini-2.0-flash-exp", "Qwen2-VL-72B-Instruct", "CogVLM2-llama3-chat-19B", 20 | # "GPT-4o-mini-0718", "LLama-3.2-90B", "MiniCPM-V2.6-8B", "LLaVA-1.6-34B", 21 | # "Qwen2-VL-2B-Instruct", "Qwen2-VL-7B-Instruct", "LLaVA-1.5-7B" 22 | # ] 23 | 24 | if new_model_name: 25 | new_model_list = [new_model_name] 26 | else: 27 | new_model_list = None 28 | 29 | model_list = model_list_default if new_model_list is None else model_list_default + new_model_list 30 | 31 | score_all = {} 32 | for model_name in model_list: 33 | caparena_auto_eval = json.load(open(os.path.join(caparena_auto_dir, model_name+".json"), "r")) 34 | score_refs = {"GPT-4o-0806": [0, 0], "CogVLM2-llama3-chat-19B": [0, 0], "MiniCPM-V2.6-8B": [0, 0]} 35 | caption_length = [] 36 | for data_item in caparena_auto_eval: 37 | 38 | if model_name == data_item["source1"]: 39 | caption_length.append(len(data_item["caption1"].split(' '))) 40 | else: 41 | caption_length.append(len(data_item["caption2"].split(' '))) 42 | 43 | if "judge" not in data_item: 44 | continue 45 | 46 | if data_item["judge"] not in ["Caption 1 is better.", "Caption 1 is better", "Caption 2 is better.", 47 | "Caption 2 is better", "Tie", "Tie."]: 48 | print("GPT judgment not 1 or 2 or tie") 49 | continue 50 | score_refs[data_item["ref_model"]][0] += 1 51 | 52 | if data_item["judge"] in "Caption 1 is better.": 53 | winner = data_item["source1"] 54 | elif data_item["judge"] in "Caption 2 is better.": 55 | winner = data_item["source2"] 56 | else: 57 | winner = "Tie" 58 | 59 | if winner == "Tie": 60 | pass 61 | elif winner == model_name: 62 | score_refs[data_item["ref_model"]][1] += 1 63 | else: 64 | score_refs[data_item["ref_model"]][1] -= 1 65 | 66 | avg_score = 0 67 | for k, v in score_refs.items(): 68 | score_refs[k][1] = score_refs[k][1]/2 69 | avg_score += score_refs[k][1] 70 | score_refs["Score_Avg"] = avg_score/3 71 | 72 | # Calculate average length 73 | score_refs["Length_Avg"] = sum(caption_length)/len(caption_length) 74 | 75 | score_all[model_name] = score_refs 76 | 77 | sorted_models = sorted(score_all.items(), key=lambda x: x[1]['Score_Avg'], reverse=True) 78 | 79 | print("CapArena-Auto Leaderboard:") 80 | print(f"{'Model':<30} | {'Score_avg':<8} | {'Score_gpt':<10} | {'Score_cog':<10} | {'Score_cpm':<10} | {'Length_Avg':<10} |") 81 | print("-" * 85) 82 | 83 | for model, data in sorted_models: 84 | score_gpt = data['GPT-4o-0806'][1] 85 | score_cog = data['CogVLM2-llama3-chat-19B'][1] 86 | score_cpm = data['MiniCPM-V2.6-8B'][1] 87 | 88 | # Format each line to align with columns 89 | print(f"{model:<30} | {data['Score_Avg']:<8.2f} | 
{score_gpt:<10.2f} | {score_cog:<10.2f} | {score_cpm:<10.2f} | {data['Length_Avg']:<10.2f} |") 90 | 91 | return [model for model, data in sorted_models] 92 | 93 | 94 | def main(): 95 | parser = argparse.ArgumentParser(description='Calculate CapArena Auto Scores') 96 | parser.add_argument('--caparena_auto_dir', type=str, default="data/caparena_auto", help='Directory containing CapArena auto evaluation files') 97 | parser.add_argument('--new_model_name', type=str, default=None, help='Name of new model to add to the leaderboard') 98 | args = parser.parse_args() 99 | 100 | sorted_model_names = calculate_caparena_scores(args.caparena_auto_dir, args.new_model_name) 101 | print("\nSorted model names:") 102 | print(sorted_model_names) 103 | 104 | 105 | if __name__ == "__main__": 106 | main() 107 | -------------------------------------------------------------------------------- /cal_ranking.py: -------------------------------------------------------------------------------- 1 | import json 2 | import csv 3 | from collections import defaultdict 4 | import json, math, gdown 5 | import numpy as np 6 | import pandas as pd 7 | import plotly.express as px 8 | from tqdm import tqdm 9 | 10 | pd.options.display.float_format = '{:.2f}'.format 11 | 12 | 13 | def compute_elo(battles, K=4, SCALE=400, BASE=10, INIT_RATING=1000): 14 | rating = defaultdict(lambda: INIT_RATING) 15 | 16 | for rd, source1, source2, winner in battles[['source1', 'source2', "winner_gpt"]].itertuples(): 17 | ra = rating[source1] 18 | rb = rating[source2] 19 | # if "human" in [source1, source2]: 20 | # continue 21 | ea = 1 / (1 + BASE ** ((rb - ra) / SCALE)) 22 | eb = 1 / (1 + BASE ** ((ra - rb) / SCALE)) 23 | if winner == source1: 24 | sa = 1 25 | elif winner == source2: 26 | sa = 0 27 | elif winner == "tie" or winner == "tie (bothbad)" or winner == "equal": 28 | sa = 0.5 29 | else: 30 | raise Exception(f"unexpected vote {winner}") 31 | rating[source1] += K * (sa - ea) 32 | rating[source2] += K * (1 - sa - eb) 33 | 34 | return rating 35 | 36 | 37 | def preety_print_elo_ratings(ratings): 38 | df = pd.DataFrame([ 39 | [n, ratings[n]] for n in ratings.keys() 40 | ], columns=["Model", "Elo rating"]).sort_values("Elo rating", ascending=False).reset_index(drop=True) 41 | df["Elo rating"] = (df["Elo rating"] + 0.5).astype(int) 42 | df.index = df.index + 1 43 | return df 44 | 45 | 46 | def get_bootstrap_result(battles, func_compute_elo, num_round): 47 | rows = [] 48 | for i in tqdm(range(num_round), desc="bootstrap"): 49 | rows.append(func_compute_elo(battles.sample(frac=1.0, replace=True))) 50 | df = pd.DataFrame(rows) 51 | return df[df.median().sort_values(ascending=False).index] 52 | 53 | 54 | def compute_elo_mle(df, SCALE=400, BASE=10, INIT_RATING=1000): 55 | from sklearn.linear_model import LogisticRegression 56 | models = pd.concat([df["source1"], df["source2"]]).unique() 57 | models = pd.Series(np.arange(len(models)), index=models) 58 | p = len(models.index) 59 | n = df.shape[0] 60 | 61 | X = np.zeros([n, p]) 62 | X[np.arange(n), models[df["source1"]]] = +math.log(BASE) 63 | X[np.arange(n), models[df["source2"]]] = -math.log(BASE) 64 | 65 | Y = np.zeros(n) 66 | Y[df["winner_gpt"] == df["source1"]] = 1.0 67 | 68 | lr = LogisticRegression(fit_intercept=False) 69 | lr.fit(X, Y) 70 | 71 | elo_scores = SCALE * lr.coef_[0] + INIT_RATING 72 | 73 | return pd.Series(elo_scores, index=models.index).sort_values(ascending=False) 74 | 75 | 76 | def visualize_bootstrap_scores(df, title): 77 | bars = pd.DataFrame(dict( 78 | lower=df.quantile(.025), 79 
| rating=df.quantile(.5), 80 | upper=df.quantile(.975))).reset_index(names="model").sort_values("rating", ascending=False) 81 | bars['error_y'] = bars['upper'] - bars["rating"] 82 | bars['error_y_minus'] = bars['rating'] - bars["lower"] 83 | bars['rating_rounded'] = np.round(bars['rating'], 2) 84 | fig = px.scatter(bars, x="model", y="rating", error_y="error_y", 85 | error_y_minus="error_y_minus", text="rating_rounded", 86 | title=title) 87 | print(list(bars["model"]), bars) 88 | fig.update_layout(xaxis_title="Model", yaxis_title="Rating") 89 | return fig 90 | 91 | 92 | def calculate_elo_rankings(json_path): 93 | # Read JSON file 94 | with open(json_path, 'r', encoding='utf-8') as json_file: 95 | data = json.load(json_file) # data is a list of dictionaries 96 | 97 | result = [] 98 | 99 | for item in data: 100 | if "human" in [item["source1"], item["source2"]]: 101 | continue 102 | if "judge" not in item: 103 | continue 104 | if "1" in item["judge"]: 105 | item["winner_gpt"] = item["source1"] 106 | elif "2" in item["judge"]: 107 | item["winner_gpt"] = item["source2"] 108 | else: 109 | item["winner_gpt"] = "equal" 110 | result.append(item) 111 | 112 | # Convert JSON data to DataFrame 113 | df = pd.DataFrame(result) 114 | 115 | # Extract required columns 116 | battles = df[['img', 'source1', 'source2', 'winner_gpt']] 117 | 118 | # # Save as CSV file 119 | # battles.to_csv('check.csv', index=False, encoding='utf-8') 120 | 121 | # Calculate ELO scores 122 | elo_ratings = compute_elo(battles) 123 | preety_print_elo_ratings(elo_ratings) 124 | 125 | # Calculate bootstrap results 126 | BOOTSTRAP_ROUNDS = 1000 127 | np.random.seed(42) 128 | bootstrap_elo_lu = get_bootstrap_result(battles, compute_elo, BOOTSTRAP_ROUNDS) 129 | bootstrap_lu_median = bootstrap_elo_lu.median().reset_index().set_axis(["model", "Elo rating"], axis=1) 130 | bootstrap_lu_median["Elo rating"] = (bootstrap_lu_median["Elo rating"] + 0.5).astype(int) 131 | 132 | print("Elo ranking by metrics:") 133 | print(bootstrap_lu_median) 134 | 135 | return list(bootstrap_lu_median["model"]) 136 | 137 | 138 | if __name__ == "__main__": 139 | sorted_model_names = calculate_elo_rankings('data/eval/caparena_annots_eval_qwen25vl72b.json') 140 | print(sorted_model_names) 141 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # CapArena: Benchmarking and Analyzing Detailed Image Captioning in the LLM Era 2 | 3 | [![arXiv](https://img.shields.io/badge/arXiv-2503.12329-b31b1b.svg)](https://arxiv.org/abs/2503.12329) 4 | [![project](https://img.shields.io/badge/Project-Page-blue?logo=github)](https://caparena.github.io/) 5 | [![huggingface](https://img.shields.io/badge/🤗%20HF-Leaderboard-orange)](https://huggingface.co/spaces/yan111222/CapArena_Auto) 6 | [![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://GitHub.com/Naereen/StrapDown.js/graphs/commit-activity) 7 | [![PR's Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg?style=flat)](http://makeapullrequest.com) 8 | 9 | This repository contains the data, code, and resources for the paper: **CapArena: Benchmarking and Analyzing Detailed Image Captioning in the LLM Era** 10 | [\[Arxiv Link\]](https://arxiv.org/abs/2503.12329) 11 | [\[Project Page\]](https://caparena.github.io/) 12 | 13 | News: This paper was accepted by ACL 2025 findings. 

Release Plans:

- [x] Usage of *CapArena-Auto*
- [x] Code and data to reproduce the results in the paper
- [x] Other resources


***
## 🏆 CapArena-Auto Benchmark

*CapArena-Auto* is an arena-style automated evaluation benchmark for detailed image captioning. It features:
- 600 evaluation images
- Pairwise battles against three baseline models
- GPT-4o-as-a-Judge scoring system
- High correlation with human rankings

**Current Leaderboard**: [🤗 CapArena-Auto Leaderboard](https://huggingface.co/spaces/yan111222/CapArena_Auto) (22 models evaluated)

---

## 🔍 Evaluation Steps

### Evaluate Your Own Model on CapArena-Auto

#### 📥 Step 1: Download Required Files
1. Download the [evaluation images](https://box.nju.edu.cn/f/a79c42c9c10e4acb83e7/) (600 images)
   - Contains all 600 images used for CapArena-Auto evaluation
   - These are the test images your model will need to caption
   - File structure: `all_images/` folder with `test_XXXXX.jpg` files

2. Download the [result template](https://box.nju.edu.cn/f/43eb761488734c638824/)
   - Example JSON file showing the required output format
   - Ensures compatibility with our evaluation scripts
3. Download [human reference captions & existing model results](https://box.nju.edu.cn/f/707c01ccdb724d2f925f/)
   - Contains two critical components:
     - `caparena_auto_600.json`:
       - Human-annotated reference captions for all 600 images
       - Structure: `{"image_id": ..., "captions": {"human": ..., "gpt": "...", "cog": "...", "cpm": "..."}, "ref_model": ...}`
     - Leaderboard models' results (e.g., `GPT-4o-0806.json`, `Claude-3.5-Sonnet-0620.json`, etc.)
       - Pre-computed pairwise battle results for current models in our leaderboard (used for results visualization)

#### 🗂️ Step 2: Prepare Your Environment
Create the following directory structure:
```
data/
└── caparena_auto/
    ├── GPT-4o-0806.json
    ├── Claude-3.5-Sonnet-0620.json
    ├── CogVLM2-llama3-chat-19B.json
    └── ...
```

#### 📊 Step 3: Generate Captions
Generate detailed captions for all 600 images using your model (a minimal generation sketch is shown after Step 5). Format your results as:
```json
{
    "test_01258.jpg": "The image features a cle… day with good weather.",
    "test_04765.jpg": "The image shows a small,…on the brick structure.",
    "test_02788.jpg": "The scene depicts a pair… and recycling efforts.",
    "test_02765.jpg": "The photo captures a str…al beauty to the scene.",
    ...
}
```

#### ⚖️ Step 4: Run Evaluation (Cost: ~$4)
1. Set your OpenAI API key:
```bash
export OPENAI_API_KEY="sk-xxxx"
```
2. Run the evaluation (`YourModelName` is any name you choose for your model):
```bash
python caparena_auto_eval.py \
    --test_model YourModelName \
    --result_path path/to/your_results.json \
    --imgs_dir path/to/images
```
#### 🏅 Step 5: View Leaderboard Results
```bash
python caparena_auto_scores.py \
    --caparena_auto_dir data/caparena_auto \
    --new_model_name YourModelName
```
> Note: If you would like to submit your results to the [online leaderboard](https://huggingface.co/spaces/yan111222/CapArena_Auto), please raise an issue or contact us!
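For Step 3, the captioning call itself depends on your model. The sketch below is a minimal example of producing the required results JSON; `generate_caption` is a hypothetical placeholder for your own model call, and the paths are placeholders as well:

```python
import json
import os

def generate_caption(image_path: str) -> str:
    # Placeholder: replace this with a call to your own captioning model.
    return "A detailed description of the image."

imgs_dir = "path/to/all_images"  # the 600 downloaded evaluation images
results = {}
for filename in sorted(os.listdir(imgs_dir)):
    if filename.endswith(".jpg"):
        # Keys are image filenames (e.g., "test_01258.jpg"); values are detailed captions.
        results[filename] = generate_caption(os.path.join(imgs_dir, filename))

with open("path/to/your_results.json", "w") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
```

Running Step 5 prints a leaderboard in the format shown below.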
111 | 112 | 113 | 114 | ``` 115 | Model | Score_avg | Score_gpt | Score_cog | Score_cpm | Length_Avg | 116 | ------------------------------------------------------------------------------------- 117 | Gemini-1.5-pro-002 | 56.17 | 29.00 | 61.00 | 78.50 | 168.56 | 118 | GPT-4o-0806 | 44.00 | 0.00 | 55.50 | 76.50 | 115.80 | 119 | Qwen2.5VL-72B | 35.33 | -1.00 | 49.00 | 58.00 | 163.67 | 120 | Gemini-2.0-flash-exp | 30.83 | -2.00 | 39.50 | 55.00 | 416.99 | 121 | Ovis-2-34b | 27.00 | -15.00 | 33.50 | 62.50 | 120.20 | 122 | Claude-3.5-Sonnet-0620 | 21.50 | -14.00 | 30.00 | 48.50 | 147.93 | 123 | InternVL2-26B | 13.00 | -38.50 | 20.00 | 57.50 | 236.32 | 124 | GPT-4o-mini-0718 | 9.33 | -36.00 | 17.00 | 47.00 | 139.83 | 125 | Ovis-1_6-27b | 3.00 | -49.50 | 14.50 | 44.00 | 94.16 | 126 | GLM-4V-Plus | -0.17 | -51.50 | 13.00 | 38.00 | 109.27 | 127 | CogVLM2-llama3-chat-19B | -8.50 | -56.50 | 0.00 | 31.00 | 115.87 | 128 | Qwen2-VL-72B-Instruct | -9.00 | -50.50 | -4.50 | 28.00 | 114.45 | 129 | LLaVA-OV-72b | -12.33 | -57.50 | -6.00 | 26.50 | 200.88 | 130 | LLama-3.2-90B | -25.67 | -72.00 | -13.00 | 8.00 | 160.25 | 131 | Hunyuan-standard-vision | -26.00 | -63.00 | -19.00 | 4.00 | 354.10 | 132 | Internvl2-5-8b | -29.83 | -71.00 | -29.00 | 10.50 | 117.77 | 133 | MiniCPM-V2.6-8B | -38.00 | -80.00 | -34.00 | 0.00 | 106.74 | 134 | Qwen2-VL-2B-Instruct | -48.67 | -86.00 | -49.50 | -10.50 | 116.84 | 135 | Qwen2-VL-7B-Instruct | -49.00 | -78.00 | -59.00 | -10.00 | 97.81 | 136 | LLaVA-1.6-34B | -67.50 | -92.00 | -53.50 | -57.00 | 124.81 | 137 | cambrian-34b | -75.00 | -93.00 | -76.00 | -56.00 | 120.23 | 138 | LLaVA-1.5-7B | -94.00 | -99.50 | -92.00 | -90.50 | 74.38 | 139 | ``` 140 | 141 | 142 | *** 143 | ### Reproduce Paper Results 144 | 145 | #### 📥 Step 1: Download Annotation Data 146 | 147 | Download the [human pair-wise battle annotation](https://box.nju.edu.cn/f/0fd0a0d3dce243ab8c12/) of *CapArena* and put them under `data/eval`. 148 | 149 | ``` 150 | data/ 151 | └── eval/ 152 | ├── caparena_annots_eval.json 153 | ├── caparena_annots_eval_gpt_ref.json 154 | ├── caparena_annots_eval_gpt.json 155 | └── ... 156 | ``` 157 | 158 | `caparena_annots_eval.json` is the human annotation results of *CapArena*, which contains 6523 pair-wise battle/judgment given by our human annotators. 159 | 160 | Other files are the results of the annotation of these 6523 pairs by captioning metrics (e.g., GPT-4o, GPT-4o with ref, LLaVA-OneVision). Each item in these files include a `judge` key to represent the judgment given by the metric. 161 | 162 | #### 🎯 Step 2: Calculate the caption-level agreement and model-level agreement 163 | 164 | Calculate caption-level agreement and model-level agreement based on metrics annotation results: 165 | 166 | ```bash 167 | python caparena_metrics.py \ 168 | --eval_dir data/eval/caparena_annots_eval_gpt_ref.json 169 | ``` 170 | 171 | ### ⚖️ VLM-as-a-Judge 172 | The above provides the VLM-as-a-Judge results that we have generated. 173 | If you want to reproduce our VLM-as-a-Judge process, first download the total [5100 images](https://box.nju.edu.cn/f/9d2b9ded47d54999926c/) from DOCCI. 
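Before running the judge, it can help to confirm that every image referenced in the annotation file is present in your image directory. A minimal sanity-check sketch (assuming the annotation file downloaded in Step 1 above and an `imgs_dir` pointing at the unpacked DOCCI images):

```python
import json
import os

imgs_dir = "xxx/images"  # wherever you unpacked the 5100 DOCCI images
annots = json.load(open("data/eval/caparena_annots_eval.json", "r"))

# Each annotated battle stores its image filename under the "img" key.
missing = {item["img"] for item in annots
           if not os.path.exists(os.path.join(imgs_dir, item["img"]))}
print(f"{len(annots)} annotated pairs, {len(missing)} referenced images missing")
```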
174 | Then you can conduct GPT-4o-as-a-Judge by: 175 | ``` 176 | python vlm_as_a_judge.py --caption_eval_cand_dir data/eval/caparena_annots_eval.json --eval_save_path data/eval/caparena_annots_eval_gpt_ref.json --imgs_dir xxx/images 177 | ``` 178 | #### 📊 Calculating Additional Metrics with Human References 179 | 180 | If you need human-annotated references for calculating traditional metrics (e.g., BLEU, CIDEr, SPICE), you can obtain the DOCCI human descriptions from: 181 | [docci_descriptions](https://storage.googleapis.com/docci/data/docci_descriptions.jsonlines) 182 | 183 | *** 184 | ### Acknowledge 185 | 186 | Thanks to [DOCCI](https://google.github.io/docci/) for their high-quality human annotation and wonderful open-sourced work. 187 | 188 | Thanks to all the annotators who participated in compiling our CapArena dataset. 189 | 190 | *** 191 | ### Citation 192 | If you find this work helpful, please consider to star 🌟 this repo and cite our paper. 193 | ``` 194 | @article{cheng2025caparena, 195 | title={CapArena: Benchmarking and Analyzing Detailed Image Captioning in the LLM Era}, 196 | author={Cheng, Kanzhi and Song, Wenpo and Fan, Jiaxin and Ma, Zheng and Sun, Qiushi and Xu, Fangzhi and Yan, Chenyang and Chen, Nuo and Zhang, Jianbing and Chen, Jiajun}, 197 | journal={arXiv preprint arXiv:2503.12329}, 198 | year={2025} 199 | } 200 | ``` -------------------------------------------------------------------------------- /vlm_as_a_judge.py: -------------------------------------------------------------------------------- 1 | # usage: python vlm_as_a_judge.py --caption_eval_cand_dir data/eval/caparena_annots_eval.json --eval_save_path data/eval/caparena_annots_eval_gpt_ref.json --imgs_dir data/eval/images 2 | import os 3 | import time 4 | import json 5 | import requests 6 | import base64 7 | from tqdm import tqdm 8 | import re 9 | import argparse 10 | import shutil 11 | 12 | 13 | # Helper functions for image encoding 14 | def encode_image(image_content): 15 | return base64.b64encode(image_content).decode('utf-8') 16 | 17 | 18 | def convert_image_to_base64(image_path): 19 | with open(image_path, 'rb') as f: 20 | image_bytes = f.read() 21 | return encode_image(image_bytes) 22 | 23 | 24 | # Function to call OpenAI API 25 | def call_llm(model_name, payload): 26 | headers = { 27 | "Content-Type": "application/json", 28 | "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}" 29 | } 30 | print("Generating content with GPT model: {}".format(model_name)) 31 | response = requests.post( 32 | "https://api.openai.com/v1/chat/completions", 33 | headers=headers, 34 | json={**payload, "temperature": 0.1} 35 | ) 36 | if response.status_code != 200: 37 | if response.json()['error']['code'] == "context_length_exceeded": 38 | print("Context length exceeded. 
Retrying with a smaller context.") 39 | payload["messages"] = [payload["messages"][0]] + payload["messages"][-1:] 40 | retry_response = requests.post( 41 | "https://api.openai.com/v1/chat/completions", 42 | headers=headers, 43 | json={**payload, "temperature": 0.1} 44 | ) 45 | if retry_response.status_code != 200: 46 | print( 47 | "Failed to call LLM even after attempt on shortening the history: " + retry_response.text) 48 | return "" 49 | 50 | print("Failed to call LLM: " + response.text) 51 | time.sleep(2) 52 | return "" 53 | else: 54 | return response.json()['choices'][0]['message']['content'] 55 | 56 | 57 | # Calculate agreement between model predictions and human judgments 58 | def cal_agreement(caption_eval_pair_list, include_tie=False, in_400=False): 59 | agreement_level = {"overall": [], "level 1": [], "level 2": [], "level 3": [], "level 4": []} 60 | tie_num = {"level 1": 0, "level 2": 0, "level 3": 0, "level 4": 0} 61 | for item in caption_eval_pair_list: 62 | if "judge" not in item: 63 | continue 64 | 65 | if item["source1"] == "human" or item["source2"] == "human": 66 | continue 67 | 68 | if in_400 == True: 69 | if not item["in-400"]: 70 | continue 71 | 72 | if item["judge"] not in ["Caption 1 is better.", "Caption 1 is better", "Caption 2 is better.", "Caption 2 is better", "Tie", "Tie."]: 73 | print("GPT judgment not 1 or 2") 74 | continue 75 | 76 | if item["winner"] == item["source1"]: 77 | judge_human = "Caption 1 is better." 78 | elif item["winner"] == item["source2"]: 79 | judge_human = "Caption 2 is better." 80 | else: 81 | judge_human = "Tie." 82 | 83 | if not include_tie: 84 | if judge_human != "Tie." and item["judge"] != "Tie." and item["judge"] != "Tie": 85 | agree = 1 if item["judge"] in judge_human else 0 86 | agreement_level["overall"].append(agree) 87 | agreement_level[item["cluster"]].append(agree) 88 | else: 89 | agree = 1 if item["judge"] in judge_human else 0 90 | agreement_level["overall"].append(agree) 91 | agreement_level[item["cluster"]].append(agree) 92 | 93 | if item["judge"] == "Tie" or item["judge"] == "Tie.": 94 | tie_num[item["cluster"]] += 1 95 | 96 | overall = sum(agreement_level["overall"]) / len(agreement_level["overall"]) if len( 97 | agreement_level["overall"]) > 0 else None 98 | level1 = sum(agreement_level["level 1"]) / len(agreement_level["level 1"]) if len( 99 | agreement_level["level 1"]) > 0 else None 100 | level2 = sum(agreement_level["level 2"]) / len(agreement_level["level 2"]) if len( 101 | agreement_level["level 2"]) > 0 else None 102 | level3 = sum(agreement_level["level 3"]) / len(agreement_level["level 3"]) if len( 103 | agreement_level["level 3"]) > 0 else None 104 | level4 = sum(agreement_level["level 4"]) / len(agreement_level["level 4"]) if len( 105 | agreement_level["level 4"]) > 0 else None 106 | overall_num = len(agreement_level["overall"]) 107 | level1_num = len(agreement_level["level 1"]) 108 | level2_num = len(agreement_level["level 2"]) 109 | level3_num = len(agreement_level["level 3"]) 110 | level4_num = len(agreement_level["level 4"]) 111 | 112 | result = ( 113 | f"Overall: {overall if overall is None else f'{overall:.3f}'} ({overall_num}), " 114 | f"Level 1: {level1 if level1 is None else f'{level1:.3f}'} ({level1_num}), " 115 | f"Level 2: {level2 if level2 is None else f'{level2:.3f}'} ({level2_num}), " 116 | f"Level 3: {level3 if level3 is None else f'{level3:.3f}'} ({level3_num}), " 117 | f"Level 4: {level4 if level4 is None else f'{level4:.3f}'} ({level4_num})" 118 | ) 119 | print(result) 120 | 121 | if 
include_tie: 122 | level1_num = tie_num["level 1"] 123 | level2_num = tie_num["level 2"] 124 | level3_num = tie_num["level 3"] 125 | level4_num = tie_num["level 4"] 126 | result_tie_num = f"Level 1: {level1_num}, Level 2: {level2_num}, Level 3: {level3_num}, Level 4: {level4_num}" 127 | print(result_tie_num) 128 | 129 | 130 | system_prompt_without_ref = """ 131 | You are a highly capable multimodal AI assistant tasked with evaluating image captions. 132 | 133 | Given an image and two candidate captions, you are require to determine which of the two captions is better. 134 | 135 | Below are some guidelines for your reference: 136 | 137 | 1. **Precision**: The caption should accurately correspond to the content of the image, providing precise information about it. Common examples of imprecision include errors in color, quantity, spatial relationships, or the posture of people. 138 | 139 | 2. **Informativeness**: Salient information in the image should be reflected in the caption. Since it is impossible to include every detail, you will need to subjectively judge which aspects of the image are important. For instance, describing an otter as "a small animal" is precise, but it is less informative than specifying "an otter". 140 | 141 | 3. **Hallucination**: Captions that include descriptions of objects or elements that are clearly absent from the image should be significantly penalized. 142 | 143 | 4. **Attention to detail**: Annotators should pay close attention to the details in the image to distinguish the quality of the descriptions. 144 | 145 | 5. **Assistive description**: Imagine a visually impaired person asking you to describe the image for them. How would you convey the image to them? 146 | 147 | 6. **Reverse thinking**: What image does the caption lead us to imagine? Does the caption effectively lead you to imagine the intended image? 148 | 149 | 7. **Ties are acceptable**: If you find it genuinely difficult to determine which caption is better (e.g., both captions are excellent), marking a tie is acceptable. 150 | 151 | While the above guidelines provide a framework, they cannot cover all possible cases. Therefore, we encourage you to make **subjective judgments** based on the specific circumstances and your own reasoning about which caption is better. 152 | 153 | ### Response Format: 154 | Format your response into two lines as shown below: 155 | Reason: 156 | Judgment: // 157 | """ 158 | 159 | system_prompt_with_ref = """ 160 | You are a highly capable multimodal AI assistant tasked with evaluating image captions. 161 | 162 | Given an image, two candidate captions and one reference caption annotated by human expert, you are require to determine which of the two captions is better. 163 | 164 | Below are some guidelines for your reference: 165 | 166 | 1. **Precision**: The caption should accurately correspond to the content of the image, providing precise information about it. Common examples of imprecision include errors in color, quantity, spatial relationships, or the posture of people. 167 | 168 | 2. **Informativeness**: Salient information in the image should be reflected in the caption. Since it is impossible to include every detail, you will need to subjectively judge which aspects of the image are important. For instance, describing an otter as "a small animal" is precise, but it is less informative than specifying "an otter". 169 | 170 | 3. 
**Hallucination**: Captions that include descriptions of objects or elements that are clearly absent from the image should be significantly penalized. 171 | 172 | 4. **Attention to detail**: Annotators should pay close attention to the details in the image to distinguish the quality of the descriptions. 173 | 174 | 5. **Assistive description**: Imagine a visually impaired person asking you to describe the image for them. How would you convey the image to them? 175 | 176 | 6. **Reverse thinking**: What image does the caption lead us to imagine? Does the caption effectively lead you to imagine the intended image? 177 | 178 | 7. **Ties are acceptable**: If you find it genuinely difficult to determine which caption is better (e.g., both captions are excellent), marking a tie is acceptable. 179 | 180 | While the above guidelines provide a framework, they cannot cover all possible cases. Therefore, we encourage you to make **subjective judgments** based on the specific circumstances and your own reasoning about which caption is better. 181 | 182 | **Reference caption**: The reference caption is annotated by a human expert. When you're uncertain about which description is better (e.g., when unsure about specific details in the image), you can use the reference caption to assist your judgment. The content in the reference caption can be considered correct; however, it is not perfect, and descriptions not included in the reference caption can still be reasonable. 183 | 184 | ### Response Format: 185 | Format your response into two lines as shown below: 186 | Reason: 187 | Judgment: // 188 | """ 189 | 190 | def mllm_judge_pairs(caption_eval_cand_dir, imgs_dir, with_ref=True, cal_agree=True, eval_model_name=None): 191 | 192 | caption_eval_cand = json.load(open(caption_eval_cand_dir, 'r')) 193 | print(f"Num of All Caption Pair: {len(caption_eval_cand)}") 194 | 195 | for i, item in tqdm(enumerate(caption_eval_cand)): 196 | 197 | if "judge" in item: 198 | print("processed") 199 | continue 200 | 201 | if i % 20 == 0: 202 | json.dump(caption_eval_cand, open(caption_eval_cand_dir, 'w')) 203 | 204 | if item["source1"] == "human" or item["source2"] == "human": 205 | continue 206 | 207 | # if not item["in-400"]: 208 | # continue 209 | 210 | img_filename = item["img"] 211 | img_path = os.path.join(imgs_dir, img_filename) 212 | if not os.path.exists(img_path): 213 | print("img not exist") 214 | image = convert_image_to_base64(img_path) 215 | 216 | caption_1 = item["caption1"] 217 | caption_2 = item["caption2"] 218 | caption_ref = item["ref"] 219 | 220 | if with_ref: 221 | compare_prompt = f"Caption 1:\n{caption_1}\nCaption 2:\n{caption_2}\nCaption Reference:\n{caption_ref}\nDetermine which is better and answer with the given format. Only mark a tie if it is truly difficult to decide which caption is better based on their quality, informativeness, and precision." 222 | else: 223 | compare_prompt = f"Caption 1:\n{caption_1}\nCaption 2:\n{caption_2}\nDetermine which is better and answer with the given format. Only mark a tie if there is no discernible difference in quality, informativeness, and precision after careful evaluation." 
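        # Build the OpenAI-style chat payload: a system message carrying the judging
        # guidelines (with or without the reference caption), followed by a user
        # message containing the base64-encoded image and the comparison prompt above.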
224 | 225 | messages = [] 226 | 227 | messages.append({ 228 | "role": "system", 229 | "content": [ 230 | { 231 | "type": "text", 232 | "text": system_prompt_with_ref if with_ref else system_prompt_without_ref 233 | }, 234 | ] 235 | }) 236 | 237 | action_text_image = [] 238 | action_text_image.append( 239 | { 240 | "type": "image_url", 241 | "image_url": { 242 | "url": f"data:image/png;base64,{image}", 243 | "detail": "high" 244 | } 245 | } 246 | ) 247 | action_text_image.append( 248 | { 249 | "type": "text", 250 | "text": compare_prompt 251 | } 252 | ) 253 | 254 | messages.append({ 255 | "role": "user", 256 | "content": action_text_image 257 | }) 258 | 259 | print(compare_prompt) 260 | 261 | model_name = "gpt-4o-2024-08-06" 262 | try_num = 0 263 | while try_num < 5: 264 | try_num += 1 265 | try: 266 | response = call_llm(model_name, { 267 | "model": model_name, 268 | "messages": messages, 269 | "max_tokens": 1500, 270 | "top_p": 0.9, 271 | "temperature": 0.5 272 | }) 273 | except: 274 | print("error call") 275 | time.sleep(1.0) 276 | continue 277 | try: 278 | print(response) 279 | reason_match = re.search(r"Reason:\s*(.+?)\s*Judgment:", response, re.DOTALL) 280 | judge_match = re.search(r"Judgment:\s*(.+)", response) 281 | reason = reason_match.group(1).strip() if reason_match else None 282 | judgment = judge_match.group(1).strip() if judge_match else None 283 | 284 | if reason and judgment: 285 | item["judge_reason"] = response 286 | item["judge"] = judgment 287 | break 288 | else: 289 | print("Invalid response format, retrying...") 290 | time.sleep(1.0) 291 | 292 | except json.JSONDecodeError: 293 | # If response is not valid JSON, continue generating 294 | print("Invalid response received, retrying...") 295 | time.sleep(1.0) 296 | 297 | num_processed = len([item for item in caption_eval_cand if ("judge" in item)]) 298 | if eval_model_name != None: 299 | print("Eval Model: {} Num of total: {} Num of success: {}".format(eval_model_name, len(caption_eval_cand), num_processed)) 300 | else: 301 | print("Num of total: {} Num of success: {}".format(len(caption_eval_cand), num_processed)) 302 | 303 | if cal_agree: 304 | cal_agreement(caption_eval_cand, include_tie=True) 305 | 306 | json.dump(caption_eval_cand, open(caption_eval_cand_dir, 'w')) 307 | print("Done") 308 | 309 | def main(): 310 | parser = argparse.ArgumentParser(description='Evaluate the quality of image captions') 311 | parser.add_argument('--caption_eval_cand_dir', type=str, required=True, help='Path to JSON file containing caption evaluation candidates') 312 | parser.add_argument('--eval_save_path', type=str, required=True, help='Path to save evaluation results') 313 | parser.add_argument('--imgs_dir', type=str, required=True, help='Path to directory containing images') 314 | parser.add_argument('--with_ref', type=bool, default=True, help='Whether to use reference captions for evaluation') 315 | parser.add_argument('--cal_agree', type=bool, default=True, help='Whether to calculate agreement') 316 | parser.add_argument('--eval_model_name', type=str, default=None, help='Name of evaluation model') 317 | 318 | args = parser.parse_args() 319 | # Copy original evaluation file to new save path 320 | shutil.copy(args.caption_eval_cand_dir, args.eval_save_path) 321 | print(f"Evaluation file copied to: {args.eval_save_path}") 322 | 323 | mllm_judge_pairs( 324 | caption_eval_cand_dir=args.eval_save_path, 325 | imgs_dir=args.imgs_dir, 326 | with_ref=args.with_ref, 327 | cal_agree=args.cal_agree, 328 | eval_model_name=args.eval_model_name 
329 | ) 330 | 331 | if __name__ == "__main__": 332 | main() --------------------------------------------------------------------------------