├── .gitignore ├── README.md ├── analyze_results.py ├── dataset ├── AQuA │ └── test.jsonl ├── MATH-500 │ └── test.jsonl ├── grade-school-math │ └── test.jsonl └── hotpotqa │ ├── hotpot_dev_distractor_v1.json │ └── hotpot_dev_distractor_v1_500_sample.json ├── evaluation ├── __init__.py ├── base.py ├── datasets │ ├── aqua.py │ ├── gsm8k.py │ ├── hotpotqa.py │ ├── math.py │ └── mme_realworld_lite.py ├── metrics │ └── pass_rate.py └── multi_evaluator.py ├── example └── gsm8k_results_cot.json ├── figs ├── AQuA_llm_comparison.png ├── AQuA_llm_operator_comparison.png ├── AQuA_operator_comparison.png ├── AQuA_operator_llm_comparison.png ├── MATH-500_llm_comparison.png ├── MATH-500_llm_operator_comparison.png ├── MATH-500_operator_comparison.png ├── MATH-500_operator_llm_comparison.png ├── OPS5.png ├── average_score_vs_cost_by_algorithm_llm.png ├── average_score_vs_cost_by_algorithm_llm_2.png ├── gsm8k_llm_comparison.png ├── gsm8k_llm_operator_comparison.png ├── gsm8k_operator_comparison.png ├── gsm8k_operator_llm_comparison.png ├── overall_llm_comparison.png ├── overall_operator_comparison.png ├── overall_results.csv ├── plot.py ├── record.csv └── score_vs_cost.png ├── main.py ├── requirements.txt └── utils.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/.gitignore -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/README.md -------------------------------------------------------------------------------- /analyze_results.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/analyze_results.py -------------------------------------------------------------------------------- /dataset/AQuA/test.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/dataset/AQuA/test.jsonl -------------------------------------------------------------------------------- /dataset/MATH-500/test.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/dataset/MATH-500/test.jsonl -------------------------------------------------------------------------------- /dataset/grade-school-math/test.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/dataset/grade-school-math/test.jsonl -------------------------------------------------------------------------------- /dataset/hotpotqa/hotpot_dev_distractor_v1.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/dataset/hotpotqa/hotpot_dev_distractor_v1.json -------------------------------------------------------------------------------- /dataset/hotpotqa/hotpot_dev_distractor_v1_500_sample.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/dataset/hotpotqa/hotpot_dev_distractor_v1_500_sample.json -------------------------------------------------------------------------------- /evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /evaluation/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/evaluation/base.py -------------------------------------------------------------------------------- /evaluation/datasets/aqua.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/evaluation/datasets/aqua.py -------------------------------------------------------------------------------- /evaluation/datasets/gsm8k.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/evaluation/datasets/gsm8k.py -------------------------------------------------------------------------------- /evaluation/datasets/hotpotqa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/evaluation/datasets/hotpotqa.py -------------------------------------------------------------------------------- /evaluation/datasets/math.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/evaluation/datasets/math.py -------------------------------------------------------------------------------- /evaluation/datasets/mme_realworld_lite.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/evaluation/datasets/mme_realworld_lite.py -------------------------------------------------------------------------------- /evaluation/metrics/pass_rate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/evaluation/metrics/pass_rate.py -------------------------------------------------------------------------------- /evaluation/multi_evaluator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/evaluation/multi_evaluator.py -------------------------------------------------------------------------------- /example/gsm8k_results_cot.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/example/gsm8k_results_cot.json -------------------------------------------------------------------------------- /figs/AQuA_llm_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/figs/AQuA_llm_comparison.png -------------------------------------------------------------------------------- /figs/AQuA_llm_operator_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/figs/AQuA_llm_operator_comparison.png -------------------------------------------------------------------------------- /figs/AQuA_operator_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/figs/AQuA_operator_comparison.png -------------------------------------------------------------------------------- /figs/AQuA_operator_llm_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/figs/AQuA_operator_llm_comparison.png -------------------------------------------------------------------------------- /figs/MATH-500_llm_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/figs/MATH-500_llm_comparison.png -------------------------------------------------------------------------------- /figs/MATH-500_llm_operator_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/figs/MATH-500_llm_operator_comparison.png -------------------------------------------------------------------------------- /figs/MATH-500_operator_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/figs/MATH-500_operator_comparison.png -------------------------------------------------------------------------------- /figs/MATH-500_operator_llm_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/figs/MATH-500_operator_llm_comparison.png -------------------------------------------------------------------------------- /figs/OPS5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/figs/OPS5.png -------------------------------------------------------------------------------- /figs/average_score_vs_cost_by_algorithm_llm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/figs/average_score_vs_cost_by_algorithm_llm.png -------------------------------------------------------------------------------- /figs/average_score_vs_cost_by_algorithm_llm_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/figs/average_score_vs_cost_by_algorithm_llm_2.png -------------------------------------------------------------------------------- /figs/gsm8k_llm_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/figs/gsm8k_llm_comparison.png -------------------------------------------------------------------------------- /figs/gsm8k_llm_operator_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/figs/gsm8k_llm_operator_comparison.png -------------------------------------------------------------------------------- /figs/gsm8k_operator_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/figs/gsm8k_operator_comparison.png -------------------------------------------------------------------------------- /figs/gsm8k_operator_llm_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/figs/gsm8k_operator_llm_comparison.png -------------------------------------------------------------------------------- /figs/overall_llm_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/figs/overall_llm_comparison.png -------------------------------------------------------------------------------- /figs/overall_operator_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/figs/overall_operator_comparison.png -------------------------------------------------------------------------------- /figs/overall_results.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/figs/overall_results.csv -------------------------------------------------------------------------------- /figs/plot.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/figs/plot.py -------------------------------------------------------------------------------- /figs/record.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/figs/record.csv -------------------------------------------------------------------------------- /figs/score_vs_cost.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/figs/score_vs_cost.png -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/main.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | argparse 2 | 3 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/om-ai-lab/open-agent-leaderboard/HEAD/utils.py --------------------------------------------------------------------------------