├── .env ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── TODO.md ├── benchmark_results ├── 3_digits │ ├── benchmark_4o-3d │ │ ├── full_conversations.json │ │ ├── results.json │ │ └── run.log │ ├── benchmark_4o-mini-3d │ │ ├── full_conversations.json │ │ ├── results.json │ │ └── run.log │ ├── benchmark_4t-3d │ │ ├── full_conversations.json │ │ ├── results.json │ │ └── run.log │ ├── benchmark_dschat25-3d │ │ ├── full_conversations.json │ │ ├── results.json │ │ └── run.log │ ├── benchmark_flash-20-exp-3d │ │ ├── full_conversations.json │ │ ├── results.json │ │ └── run.log │ ├── benchmark_flash15-002-3d │ │ ├── full_conversations.json │ │ ├── results.json │ │ └── run.log │ ├── benchmark_geminipro15-002-3d │ │ ├── full_conversations.json │ │ ├── results.json │ │ └── run.log │ ├── benchmark_haiku3-3d │ │ ├── full_conversations.json │ │ ├── results.json │ │ └── run.log │ ├── benchmark_llama31-405bi-3d │ │ ├── full_conversations.json │ │ ├── results.json │ │ └── run.log │ ├── benchmark_llama31-8bi-3d │ │ ├── full_conversations.json │ │ ├── results.json │ │ └── run.log │ ├── benchmark_llama33-70b-3d │ │ ├── full_conversations.json │ │ ├── results.json │ │ └── run.log │ ├── benchmark_o1mini-3d │ │ ├── full_conversations.json │ │ ├── results.json │ │ └── run.log │ ├── benchmark_qwq-pr-3d │ │ ├── full_conversations.json │ │ ├── results.json │ │ └── run.log │ ├── benchmark_sonnet35new-3d │ │ ├── full_conversations.json │ │ ├── results.json │ │ └── run.log │ ├── results_table.md │ └── visualization.html └── 4_digits │ ├── benchmark_4o-4d │ ├── full_conversations.json │ ├── results.json │ └── run.log │ ├── benchmark_4o-500-4d │ ├── full_conversations.json │ ├── results.json │ └── run.log │ ├── benchmark_4o-500-temp0-4d │ ├── full_conversations.json │ ├── results.json │ └── run.log │ ├── benchmark_4o-mini-4d │ ├── full_conversations.json │ ├── results.json │ └── run.log │ ├── benchmark_4o-mini-500-4d │ ├── full_conversations.json │ ├── results.json │ └── run.log │ ├── benchmark_4o-mini-500-temp0-4d │ ├── full_conversations.json │ ├── results.json │ └── run.log │ ├── benchmark_dschat25-4d │ ├── full_conversations.json │ ├── results.json │ └── run.log │ ├── benchmark_dschat30-temp0-4d │ ├── full_conversations.json │ ├── results.json │ └── run.log │ ├── benchmark_dsr1-50-4d │ ├── full_conversations.json │ ├── results.json │ └── run.log │ ├── benchmark_flash-20-exp-4d │ ├── full_conversations.json │ ├── results.json │ └── run.log │ ├── benchmark_flash15-002-4d │ ├── full_conversations.json │ ├── results.json │ └── run.log │ ├── benchmark_geminipro15-002-4d │ ├── full_conversations.json │ ├── results.json │ └── run.log │ ├── benchmark_haiku35-4d │ ├── full_conversations.json │ ├── results.json │ └── run.log │ ├── benchmark_llama31-405bi-4d │ ├── full_conversations.json │ ├── results.json │ └── run.log │ ├── benchmark_o1mini-4d │ ├── full_conversations.json │ ├── results.json │ └── run.log │ ├── benchmark_o1mini-500-4d │ ├── full_conversations.json │ ├── results.json │ └── run.log │ ├── benchmark_o3mini-4d │ ├── full_conversations.json │ ├── results.json │ └── run.log │ ├── benchmark_qwq-pr-4d │ ├── full_conversations.json │ ├── results.json │ └── run.log │ ├── benchmark_sonnet35new-4d │ ├── full_conversations.json │ ├── results.json │ └── run.log │ ├── results_table.md │ └── visualization.html ├── config └── default_config.yaml ├── pyproject.toml ├── requirements.txt ├── run_benchmark.py ├── scripts └── visualize_results.py ├── src ├── benchmark.py ├── game.py ├── llm_player.py ├── logger.py └── prompts.py ├── static └── images │ ├── progress_demo.png │ ├── results.png │ └── results_2.png └── tests ├── test_benchmark.py ├── test_game.py ├── test_llm_player.py └── test_visualization.py /.env: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/.env -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/.gitignore -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/.pre-commit-config.yaml -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/README.md -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/TODO.md -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_4o-3d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_4o-3d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_4o-3d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_4o-3d/results.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_4o-3d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_4o-3d/run.log -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_4o-mini-3d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_4o-mini-3d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_4o-mini-3d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_4o-mini-3d/results.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_4o-mini-3d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_4o-mini-3d/run.log -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_4t-3d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_4t-3d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_4t-3d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_4t-3d/results.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_4t-3d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_4t-3d/run.log -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_dschat25-3d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_dschat25-3d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_dschat25-3d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_dschat25-3d/results.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_dschat25-3d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_dschat25-3d/run.log -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_flash-20-exp-3d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_flash-20-exp-3d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_flash-20-exp-3d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_flash-20-exp-3d/results.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_flash-20-exp-3d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_flash-20-exp-3d/run.log -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_flash15-002-3d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_flash15-002-3d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_flash15-002-3d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_flash15-002-3d/results.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_flash15-002-3d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_flash15-002-3d/run.log -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_geminipro15-002-3d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_geminipro15-002-3d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_geminipro15-002-3d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_geminipro15-002-3d/results.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_geminipro15-002-3d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_geminipro15-002-3d/run.log -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_haiku3-3d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_haiku3-3d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_haiku3-3d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_haiku3-3d/results.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_haiku3-3d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_haiku3-3d/run.log -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_llama31-405bi-3d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_llama31-405bi-3d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_llama31-405bi-3d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_llama31-405bi-3d/results.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_llama31-405bi-3d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_llama31-405bi-3d/run.log -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_llama31-8bi-3d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_llama31-8bi-3d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_llama31-8bi-3d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_llama31-8bi-3d/results.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_llama31-8bi-3d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_llama31-8bi-3d/run.log -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_llama33-70b-3d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_llama33-70b-3d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_llama33-70b-3d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_llama33-70b-3d/results.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_llama33-70b-3d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_llama33-70b-3d/run.log -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_o1mini-3d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_o1mini-3d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_o1mini-3d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_o1mini-3d/results.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_o1mini-3d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_o1mini-3d/run.log -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_qwq-pr-3d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_qwq-pr-3d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_qwq-pr-3d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_qwq-pr-3d/results.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_qwq-pr-3d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_qwq-pr-3d/run.log -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_sonnet35new-3d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_sonnet35new-3d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_sonnet35new-3d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_sonnet35new-3d/results.json -------------------------------------------------------------------------------- /benchmark_results/3_digits/benchmark_sonnet35new-3d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/benchmark_sonnet35new-3d/run.log -------------------------------------------------------------------------------- /benchmark_results/3_digits/results_table.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/results_table.md -------------------------------------------------------------------------------- /benchmark_results/3_digits/visualization.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/3_digits/visualization.html -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_4o-4d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_4o-4d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_4o-4d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_4o-4d/results.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_4o-4d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_4o-4d/run.log -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_4o-500-4d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_4o-500-4d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_4o-500-4d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_4o-500-4d/results.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_4o-500-4d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_4o-500-4d/run.log -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_4o-500-temp0-4d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_4o-500-temp0-4d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_4o-500-temp0-4d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_4o-500-temp0-4d/results.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_4o-500-temp0-4d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_4o-500-temp0-4d/run.log -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_4o-mini-4d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_4o-mini-4d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_4o-mini-4d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_4o-mini-4d/results.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_4o-mini-4d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_4o-mini-4d/run.log -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_4o-mini-500-4d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_4o-mini-500-4d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_4o-mini-500-4d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_4o-mini-500-4d/results.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_4o-mini-500-4d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_4o-mini-500-4d/run.log -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_4o-mini-500-temp0-4d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_4o-mini-500-temp0-4d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_4o-mini-500-temp0-4d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_4o-mini-500-temp0-4d/results.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_4o-mini-500-temp0-4d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_4o-mini-500-temp0-4d/run.log -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_dschat25-4d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_dschat25-4d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_dschat25-4d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_dschat25-4d/results.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_dschat25-4d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_dschat25-4d/run.log -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_dschat30-temp0-4d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_dschat30-temp0-4d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_dschat30-temp0-4d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_dschat30-temp0-4d/results.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_dschat30-temp0-4d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_dschat30-temp0-4d/run.log -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_dsr1-50-4d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_dsr1-50-4d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_dsr1-50-4d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_dsr1-50-4d/results.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_dsr1-50-4d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_dsr1-50-4d/run.log -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_flash-20-exp-4d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_flash-20-exp-4d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_flash-20-exp-4d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_flash-20-exp-4d/results.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_flash-20-exp-4d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_flash-20-exp-4d/run.log -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_flash15-002-4d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_flash15-002-4d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_flash15-002-4d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_flash15-002-4d/results.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_flash15-002-4d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_flash15-002-4d/run.log -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_geminipro15-002-4d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_geminipro15-002-4d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_geminipro15-002-4d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_geminipro15-002-4d/results.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_geminipro15-002-4d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_geminipro15-002-4d/run.log -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_haiku35-4d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_haiku35-4d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_haiku35-4d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_haiku35-4d/results.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_haiku35-4d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_haiku35-4d/run.log -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_llama31-405bi-4d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_llama31-405bi-4d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_llama31-405bi-4d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_llama31-405bi-4d/results.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_llama31-405bi-4d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_llama31-405bi-4d/run.log -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_o1mini-4d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_o1mini-4d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_o1mini-4d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_o1mini-4d/results.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_o1mini-4d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_o1mini-4d/run.log -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_o1mini-500-4d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_o1mini-500-4d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_o1mini-500-4d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_o1mini-500-4d/results.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_o1mini-500-4d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_o1mini-500-4d/run.log -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_o3mini-4d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_o3mini-4d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_o3mini-4d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_o3mini-4d/results.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_o3mini-4d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_o3mini-4d/run.log -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_qwq-pr-4d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_qwq-pr-4d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_qwq-pr-4d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_qwq-pr-4d/results.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_qwq-pr-4d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_qwq-pr-4d/run.log -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_sonnet35new-4d/full_conversations.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_sonnet35new-4d/full_conversations.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_sonnet35new-4d/results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_sonnet35new-4d/results.json -------------------------------------------------------------------------------- /benchmark_results/4_digits/benchmark_sonnet35new-4d/run.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/benchmark_sonnet35new-4d/run.log -------------------------------------------------------------------------------- /benchmark_results/4_digits/results_table.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/results_table.md -------------------------------------------------------------------------------- /benchmark_results/4_digits/visualization.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/benchmark_results/4_digits/visualization.html -------------------------------------------------------------------------------- /config/default_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/config/default_config.yaml -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/pyproject.toml -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/requirements.txt -------------------------------------------------------------------------------- /run_benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/run_benchmark.py -------------------------------------------------------------------------------- /scripts/visualize_results.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/scripts/visualize_results.py -------------------------------------------------------------------------------- /src/benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/src/benchmark.py -------------------------------------------------------------------------------- /src/game.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/src/game.py -------------------------------------------------------------------------------- /src/llm_player.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/src/llm_player.py -------------------------------------------------------------------------------- /src/logger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/src/logger.py -------------------------------------------------------------------------------- /src/prompts.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/src/prompts.py -------------------------------------------------------------------------------- /static/images/progress_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/static/images/progress_demo.png -------------------------------------------------------------------------------- /static/images/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/static/images/results.png -------------------------------------------------------------------------------- /static/images/results_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/static/images/results_2.png -------------------------------------------------------------------------------- /tests/test_benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/tests/test_benchmark.py -------------------------------------------------------------------------------- /tests/test_game.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/tests/test_game.py -------------------------------------------------------------------------------- /tests/test_llm_player.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/tests/test_llm_player.py -------------------------------------------------------------------------------- /tests/test_visualization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stalkermustang/llm-bulls-and-cows-benchmark/HEAD/tests/test_visualization.py --------------------------------------------------------------------------------