├── requirements.txt
├── math24o.xlsx
├── model_answers.xlsx
├── README.md
└── auto_evaluation.py
/requirements.txt:
--------------------------------------------------------------------------------
pandas>=2.2.3
openpyxl  # required by pandas.read_excel / DataFrame.to_excel for .xlsx files
--------------------------------------------------------------------------------
/math24o.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CLUEbenchmark/Math24o/HEAD/math24o.xlsx
--------------------------------------------------------------------------------
/model_answers.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CLUEbenchmark/Math24o/HEAD/model_answers.xlsx
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Math24o

Math24o is a Chinese-language benchmark for evaluating the mathematical-reasoning ability of large language models at the level of the Chinese high-school Mathematical Olympiad.

The benchmark uses the problems from the 2024 preliminary round. A script automatically checks whether each model answer matches the reference answer, so a model's accuracy can be measured objectively.

The benchmark is intended as a reference for future model development and for improving model reliability on complex mathematical tasks.


# Obtaining Model Responses and Prompts

Full Input: question + "\n" + special prompt

Special prompt used (in Chinese, exactly as sent to the model):

请把你的最终答案放在\boxed{}内,即使用\boxed{你的最终答案}这个格式,注意\boxed{}里只能是整数或小数。

English translation of the special prompt:

Please put your final answer in \boxed{}, using the format \boxed{your final answer}. Note that only integers or decimals are allowed inside \boxed{}.

Full example of an input (question followed by the special prompt):

设函数 $$f : \{1, 2, 3 \} \to\{2, 3, 4 \}$$ 满足 $$f \left( f \left( x \right)-1 \right)=f \left( x \right)$$ ,则这样的函数有多少个?

请把你的最终答案放在\boxed{}内,即使用\boxed{你的最终答案}这个格式,注意\boxed{}里只能是整数或小数。
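The full input for each question can also be assembled programmatically from math24o.xlsx, which (as auto_evaluation.py shows) has `id`, `questions`, and `ref_answers` columns. The sketch below is illustrative only; the `SPECIAL_PROMPT` constant and the `build_full_inputs` helper are not part of this repository:

```python
import pandas as pd

# The exact prompt string from the section above.
SPECIAL_PROMPT = "请把你的最终答案放在\\boxed{}内,即使用\\boxed{你的最终答案}这个格式,注意\\boxed{}里只能是整数或小数。"

def build_full_inputs(questions_path="math24o.xlsx"):
    """Return a list of (id, full_input) pairs, one per benchmark question."""
    questions = pd.read_excel(questions_path)
    # Full Input = question + "\n" + special prompt, as described above.
    return [(row["id"], row["questions"] + "\n" + SPECIAL_PROMPT)
            for _, row in questions.iterrows()]

if __name__ == "__main__":
    for qid, full_input in build_full_inputs():
        print(f"--- question {qid} ---")
        print(full_input)
```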

# 🏆 Main Results

| Rank | Model | Organization | Score | Access | Date |
|------|-------|--------------|-------|--------|------|
| 1 | o3-mini (high) | OpenAI | 85.71 | API | 2025.03.12 |
| 2 | Gemini-2.0-Flash-Thinking-Exp-01-21 | Google | 71.43 | API | 2025.03.12 |
| 3 | QwQ-Max-Preview | Alibaba Cloud | 66.67 | Official web UI | 2025.03.12 |
| 3 | QwQ-32B | Alibaba Cloud | 66.67 | Open weights | 2025.03.12 |
| 3 | o1 | OpenAI | 66.67 | API | 2025.03.12 |
| 4 | DeepSeek-R1 | DeepSeek | 57.14 | API | 2025.03.12 |
| 4 | Claude 3.7 Sonnet | Anthropic | 57.14 | POE | 2025.03.12 |

Note: the scores above are single-attempt accuracies, i.e. each model generated only one answer per question. Users can re-run the evaluation themselves using the questions and answers.

# ✨ Auto Evaluation

Once all answers from the model under test have been pasted into model_answers.xlsx, save the file. Then return to the terminal and run the following commands in order.

## Install the required Python packages

    pip install -r requirements.txt

## Run the evaluation script

    python auto_evaluation.py

The terminal then prints the average score of the model under test.

You can also inspect the detailed, per-question results:

## Open output.xlsx (you can also open it manually)

    output.xlsx
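auto_evaluation.py reads a single `model_answers` column and scores it row by row against math24o.xlsx, so the two workbooks must have the same number of rows in the same order. A minimal sanity check before running the script (illustrative only, not part of this repository):

```python
import pandas as pd

questions = pd.read_excel("math24o.xlsx")
answers = pd.read_excel("model_answers.xlsx")

# auto_evaluation.py zips the two files row by row, so the answer sheet must
# contain a 'model_answers' column with exactly one row per question.
assert "model_answers" in answers.columns
assert len(answers) == len(questions)
print("model_answers.xlsx looks ready for auto_evaluation.py")
```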
--------------------------------------------------------------------------------
/auto_evaluation.py:
--------------------------------------------------------------------------------
import os

import pandas as pd


def extract_final_answer(each_mod_answer):
    """Extract the content of the last \\boxed{} in the model's answer.

    Returns an empty string if the model did not follow the required
    \\boxed{} output format.
    """
    # Empty cells in Excel come through as NaN (a float), so anything that is
    # not a non-empty string means there is no usable answer.
    if not isinstance(each_mod_answer, str) or each_mod_answer == '':
        return ''

    # Locate the last occurrence of "boxed{" and its closing brace.
    start_index = each_mod_answer.rfind('boxed{')
    if start_index == -1:
        return ''
    start_index += len('boxed{')

    end_index = each_mod_answer.find('}', start_index)
    if end_index == -1:
        return ''

    return each_mod_answer[start_index:end_index].strip()


def auto_gene_evaluation_result():

    #---------- initial config ----------#

    current_dir = os.path.dirname(os.path.realpath(__file__))

    questions_file_path = os.path.join(current_dir, 'math24o.xlsx')
    questions_dataframe = pd.read_excel(questions_file_path)

    mod_answers_file_path = os.path.join(current_dir, 'model_answers.xlsx')
    mod_answers_dataframe = pd.read_excel(mod_answers_file_path)

    id_list = questions_dataframe['id']
    question_list = questions_dataframe['questions']
    ref_answer_list = questions_dataframe['ref_answers']
    mod_answer_list = mod_answers_dataframe['model_answers']

    output_dataframe = pd.DataFrame({})
    output_file_path = os.path.join(current_dir, 'output.xlsx')

    count = 1

    #---------- essential loop ----------#

    # The two workbooks are aligned row by row: the n-th model answer is
    # scored against the n-th reference answer.
    for each_id, each_question, each_ref_answer, each_mod_answer in zip(
            id_list, question_list, ref_answer_list, mod_answer_list):

        each_final_answer = extract_final_answer(each_mod_answer)

        # Convert the extracted answer to a number when possible so that a
        # string such as "42" matches a reference answer stored as 42 or 42.0.
        try:
            each_final_answer = float(each_final_answer)
        except (TypeError, ValueError):
            pass

        each_score = 1 if each_final_answer == each_ref_answer else 0

        #---------- for developer ----------#

        new_dataframe = pd.DataFrame({'id': [each_id],
                                      'questions': [each_question],
                                      'ref_answers': [each_ref_answer],
                                      'mod_answers': [each_mod_answer],
                                      'final_answers': [each_final_answer],
                                      'scores': [each_score]})

        #---------- for user ----------#

        # new_dataframe = pd.DataFrame({'id': [each_id], 'mod_answers': [each_mod_answer], 'scores': [each_score]})

        #---------- preparation of output ----------#

        output_dataframe = pd.concat([output_dataframe, new_dataframe], ignore_index=True)

        # Write after every question so partial results survive an interruption.
        output_dataframe.to_excel(output_file_path, index=False)

        #---------- output success message ----------#

        if count % 100 in (11, 12, 13):
            suffix = 'th'
        else:
            suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(count % 10, 'th')
        print(">>> The model's " + str(count) + suffix
              + " answer has been evaluated successfully.")

        #---------- after an evaluation, add 1 to count ----------#

        count += 1

    #----------------------------------------------#

    score_list = output_dataframe['scores']

    average = sum(score_list) / len(score_list)

    # Append the final percentage score as an extra row of output.xlsx.
    average_dataframe = pd.DataFrame({'scores': [100 * average]})
    output_dataframe = pd.concat([output_dataframe, average_dataframe], ignore_index=True)

    output_dataframe.to_excel(output_file_path, index=False)

    print(">>> All answers have been evaluated successfully! The final score is "
          + str(100 * average)
          + '. For more evaluation details, please check the output.xlsx file.')


if __name__ == '__main__':

    auto_gene_evaluation_result()
--------------------------------------------------------------------------------