├── BenchBuilder ├── README.md ├── category.py ├── config.yaml ├── embed.py ├── filter.py ├── label.py ├── requirements.txt └── topic_clustering.py ├── LICENSE ├── README.md ├── config ├── api_config.yaml ├── api_config_bedrock_models.yaml ├── arena-hard-v0.1.yaml ├── arena-hard-v2.0.yaml └── gen_answer_config.yaml ├── data ├── arena-hard-v0.1 │ ├── model_answer │ │ ├── gpt-3.5-turbo-0125.jsonl │ │ ├── gpt-4-0314.jsonl │ │ └── gpt-4-0613.jsonl │ ├── model_judgment │ │ └── gpt-4-1106-preview │ │ │ ├── gpt-3.5-turbo-0125.jsonl │ │ │ └── gpt-4-0613.jsonl │ └── question.jsonl └── arena-hard-v2.0 │ ├── model_answer │ ├── deepseek-r1.jsonl │ ├── gemini-2.0-flash-001.jsonl │ ├── o3-mini-2025-01-31.jsonl │ └── qwq-32b.jsonl │ ├── model_judgment │ ├── gemini-2.5 │ │ ├── deepseek-r1.jsonl │ │ └── qwq-32b.jsonl │ └── gpt-4.1 │ │ ├── deepseek-r1.jsonl │ │ └── qwq-32b.jsonl │ └── question.jsonl ├── gen_answer.py ├── gen_judgment.py ├── leaderboard └── arena_hard_leaderboard_20240731.csv ├── misc ├── past_leaderboards.md ├── pipeline_method.png ├── qa_browser.png └── sglang_setup.bash ├── qa_browser.py ├── requirements-optional.txt ├── requirements.txt ├── show_result.py └── utils ├── add_markdown_info.py ├── bedrock_utils.py ├── completion.py ├── judge_utils.py ├── math_utils.py └── sglang_server.py /BenchBuilder/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/BenchBuilder/README.md -------------------------------------------------------------------------------- /BenchBuilder/category.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/BenchBuilder/category.py -------------------------------------------------------------------------------- /BenchBuilder/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/BenchBuilder/config.yaml -------------------------------------------------------------------------------- /BenchBuilder/embed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/BenchBuilder/embed.py -------------------------------------------------------------------------------- /BenchBuilder/filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/BenchBuilder/filter.py -------------------------------------------------------------------------------- /BenchBuilder/label.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/BenchBuilder/label.py -------------------------------------------------------------------------------- /BenchBuilder/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/BenchBuilder/requirements.txt -------------------------------------------------------------------------------- /BenchBuilder/topic_clustering.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/BenchBuilder/topic_clustering.py -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/README.md -------------------------------------------------------------------------------- /config/api_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/config/api_config.yaml -------------------------------------------------------------------------------- /config/api_config_bedrock_models.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/config/api_config_bedrock_models.yaml -------------------------------------------------------------------------------- /config/arena-hard-v0.1.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/config/arena-hard-v0.1.yaml -------------------------------------------------------------------------------- /config/arena-hard-v2.0.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/config/arena-hard-v2.0.yaml -------------------------------------------------------------------------------- /config/gen_answer_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/config/gen_answer_config.yaml -------------------------------------------------------------------------------- /data/arena-hard-v0.1/model_answer/gpt-3.5-turbo-0125.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/data/arena-hard-v0.1/model_answer/gpt-3.5-turbo-0125.jsonl -------------------------------------------------------------------------------- /data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl -------------------------------------------------------------------------------- /data/arena-hard-v0.1/model_answer/gpt-4-0613.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/data/arena-hard-v0.1/model_answer/gpt-4-0613.jsonl -------------------------------------------------------------------------------- /data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/gpt-3.5-turbo-0125.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/gpt-3.5-turbo-0125.jsonl -------------------------------------------------------------------------------- /data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/gpt-4-0613.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/gpt-4-0613.jsonl -------------------------------------------------------------------------------- /data/arena-hard-v0.1/question.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/data/arena-hard-v0.1/question.jsonl -------------------------------------------------------------------------------- /data/arena-hard-v2.0/model_answer/deepseek-r1.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/data/arena-hard-v2.0/model_answer/deepseek-r1.jsonl -------------------------------------------------------------------------------- /data/arena-hard-v2.0/model_answer/gemini-2.0-flash-001.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/data/arena-hard-v2.0/model_answer/gemini-2.0-flash-001.jsonl -------------------------------------------------------------------------------- /data/arena-hard-v2.0/model_answer/o3-mini-2025-01-31.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/data/arena-hard-v2.0/model_answer/o3-mini-2025-01-31.jsonl -------------------------------------------------------------------------------- /data/arena-hard-v2.0/model_answer/qwq-32b.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/data/arena-hard-v2.0/model_answer/qwq-32b.jsonl -------------------------------------------------------------------------------- /data/arena-hard-v2.0/model_judgment/gemini-2.5/deepseek-r1.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/data/arena-hard-v2.0/model_judgment/gemini-2.5/deepseek-r1.jsonl -------------------------------------------------------------------------------- /data/arena-hard-v2.0/model_judgment/gemini-2.5/qwq-32b.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/data/arena-hard-v2.0/model_judgment/gemini-2.5/qwq-32b.jsonl -------------------------------------------------------------------------------- /data/arena-hard-v2.0/model_judgment/gpt-4.1/deepseek-r1.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/data/arena-hard-v2.0/model_judgment/gpt-4.1/deepseek-r1.jsonl -------------------------------------------------------------------------------- /data/arena-hard-v2.0/model_judgment/gpt-4.1/qwq-32b.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/data/arena-hard-v2.0/model_judgment/gpt-4.1/qwq-32b.jsonl -------------------------------------------------------------------------------- /data/arena-hard-v2.0/question.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/data/arena-hard-v2.0/question.jsonl -------------------------------------------------------------------------------- /gen_answer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/gen_answer.py -------------------------------------------------------------------------------- /gen_judgment.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/gen_judgment.py -------------------------------------------------------------------------------- /leaderboard/arena_hard_leaderboard_20240731.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/leaderboard/arena_hard_leaderboard_20240731.csv -------------------------------------------------------------------------------- /misc/past_leaderboards.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/misc/past_leaderboards.md -------------------------------------------------------------------------------- /misc/pipeline_method.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/misc/pipeline_method.png -------------------------------------------------------------------------------- /misc/qa_browser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/misc/qa_browser.png -------------------------------------------------------------------------------- /misc/sglang_setup.bash: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/misc/sglang_setup.bash -------------------------------------------------------------------------------- /qa_browser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/qa_browser.py -------------------------------------------------------------------------------- /requirements-optional.txt: -------------------------------------------------------------------------------- 1 | mistralai 2 | anthropic 3 | cohere 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/requirements.txt -------------------------------------------------------------------------------- /show_result.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/show_result.py -------------------------------------------------------------------------------- /utils/add_markdown_info.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/utils/add_markdown_info.py -------------------------------------------------------------------------------- /utils/bedrock_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/utils/bedrock_utils.py -------------------------------------------------------------------------------- /utils/completion.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/utils/completion.py -------------------------------------------------------------------------------- /utils/judge_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/utils/judge_utils.py -------------------------------------------------------------------------------- /utils/math_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/utils/math_utils.py -------------------------------------------------------------------------------- /utils/sglang_server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmarena/arena-hard-auto/HEAD/utils/sglang_server.py --------------------------------------------------------------------------------