├── .python-version ├── docs └── github-repo-banner.png ├── pyproject.toml ├── LICENSE ├── llm_benchmark_plotting.py ├── .github └── workflows │ └── llm-eval-benchmarking.yaml ├── .gitignore └── README.md /.python-version: -------------------------------------------------------------------------------- 1 | 3.12 2 | -------------------------------------------------------------------------------- /docs/github-repo-banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ceshine/machine-language-model-arena/main/docs/github-repo-banner.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "machine-language-model-arena" 3 | version = "0.1.0" 4 | description = "Compare open-weight language models with GPU-accelerated lm-evaluation-harness benchmarks in GitHub Actions" 5 | requires-python = ">=3.12" 6 | dependencies = [ 7 | "hf-transfer>=0.1.9", 8 | "kernels>=0.9.0", 9 | "lm-eval[ifeval]>=0.4.9.1", 10 | "matplotlib>=3.10.5", 11 | "torch>=2.8.0", 12 | "transformers>=4.55.2", 13 | ] 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Machine 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /llm_benchmark_plotting.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | import glob 7 | 8 | # File paths for the results of the two models 9 | current_dir = Path(__file__).parent 10 | 11 | model_results = { 12 | "Model 1": sorted( 13 | (current_dir / "benchmarks/model_1/").glob("*/"), 14 | key=os.path.getctime, reverse=True 15 | )[0], # Selects the latest matching directory 16 | "Model 2": sorted( 17 | (current_dir / "benchmarks/model_2/").glob("*/"), 18 | key=os.path.getctime, reverse=True 19 | )[0] # Selects the latest matching directory 20 | } 21 | 22 | metrics = {model: {} for model in model_results} 23 | tasks = set() 24 | 25 | # Extract metrics from JSON files 26 | for model, dir_path in model_results.items(): 27 | result_files = glob.glob(os.path.join(dir_path, "results_*.json")) 28 | if result_files: 29 | latest_file = max(result_files, key=os.path.getctime) 30 | with open(latest_file) as f: 31 | data = json.load(f) 32 | for task, task_metrics in data['results'].items(): 33 | tasks.add(task) 34 | metrics[model][task] = task_metrics 35 | 36 | for task in sorted(tasks): 37 | plt.figure(figsize=(12, 7)) 38 | plt.title(f'{task} Comparison: Model 1 vs Model 2') 39 | 40 | model_metrics = [metrics[m].get(task, {}) for m in model_results] 41 | shared_metrics = set(model_metrics[0].keys()) & set(model_metrics[1].keys()) 42 | 43 | # Ensure GSM8K plots correctly by using specific known keys 44 | if task == "gsm8k": 45 | shared_metrics = {"exact_match,strict-match", "exact_match,flexible-extract"} 46 | 47 | metric_names = sorted({m.split(',')[0] for m in shared_metrics}) 48 | 49 | if not metric_names: 50 | plt.close() 51 | continue 52 | 53 | x = range(len(metric_names)) 54 | width = 0.35 55 | 56 | for i, model in enumerate(model_results): 57 | values = [metrics[model].get(task, {}).get(f'{metric},strict-match', 58 | metrics[model].get(task, {}).get(f'{metric},none', 0)) for metric in metric_names] 59 | 60 | errors = [metrics[model].get(task, {}).get(f'{metric}_stderr,strict-match', 61 | metrics[model].get(task, {}).get(f'{metric}_stderr,none', 0)) for metric in metric_names] 62 | 63 | bars = plt.bar([p + i * width for p in x], values, width, yerr=errors, capsize=5, label=model) 64 | 65 | # Show values below the bars 66 | for bar, value in zip(bars, values): 67 | plt.text(bar.get_x() + bar.get_width() / 2, bar.get_y() - 0.02, 68 | f'{value:.5f}', ha='center', va='top', fontsize=8, fontweight='bold') 69 | 70 | plt.axhline(0, color='grey', linestyle='--') 71 | plt.xticks([p + width / 2 for p in x], metric_names, rotation=45, ha="right") 72 | plt.ylabel('Scores') 73 | plt.legend() 74 | plt.grid(True, linestyle='--', alpha=0.7) 75 | plt.tight_layout() 76 | 77 | output_path = current_dir / f'benchmarks/{task}_comparison.png' 78 | plt.savefig(output_path) 79 | print(f"Generated comparison plot for {task} at {output_path}") 80 | plt.close() 81 | -------------------------------------------------------------------------------- /.github/workflows/llm-eval-benchmarking.yaml: -------------------------------------------------------------------------------- 1 | name: LLM Eval Benchmarking 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | model_1: 7 | type: string 8 | required: false 9 | description: "The first model to benchmark" 10 | default: "openai/gpt-oss-20b" 11 | model_1_revision: 12 
| type: string 13 | required: false 14 | description: "The first model revision to benchmark" 15 | default: "main" 16 | model_1_dtype: 17 | type: string 18 | required: false 19 | description: "The dtype for the first model" 20 | default: "auto" 21 | model_2: 22 | type: string 23 | required: false 24 | description: "The second model to benchmark" 25 | default: "google/gemma-3-12b-it" 26 | model_2_revision: 27 | type: string 28 | required: false 29 | description: "The second model revision to benchmark" 30 | default: "main" 31 | model_2_dtype: 32 | type: string 33 | required: false 34 | description: "The dtype for the second model" 35 | default: "auto" 36 | batch_size: 37 | type: string 38 | required: false 39 | description: "The batch size to use" 40 | default: "auto" 41 | tasks: 42 | type: string 43 | required: false 44 | description: "The tasks to benchmark" 45 | default: "hellaswag,arc_easy,mathqa,truthfulqa,drop,arc_challenge,gsm8k,mmlu_abstract_algebra,mmlu_college_mathematics" 46 | examples_limit: 47 | type: string 48 | required: false 49 | description: "The number of examples to use for benchmarking" 50 | default: "100" 51 | tenancy: 52 | type: choice 53 | required: false 54 | description: "The tenancy of the machine" 55 | default: "spot" 56 | options: 57 | - "spot" 58 | - "on_demand" 59 | 60 | jobs: 61 | benchmark: 62 | name: LLM Eval Benchmarking 63 | runs-on: 64 | - machine 65 | - gpu=L40S 66 | - cpu=4 67 | - ram=32 68 | - architecture=x64 69 | - tenancy=${{ inputs.tenancy }} 70 | timeout-minutes: 120 71 | env: 72 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 73 | HF_HUB_ENABLE_HF_TRANSFER: 1 74 | HF_HUB_DOWNLOAD_TIMEOUT: 120 75 | 76 | steps: 77 | - uses: actions/checkout@v4 78 | 79 | - name: Install uv 80 | uses: astral-sh/setup-uv@v5 81 | 82 | - name: Install dependencies 83 | run: | 84 | uv sync --frozen --no-dev 85 | mkdir -p ./benchmarks/ 86 | 87 | - name: Benchmark Model 1 88 | run: | 89 | uv run lm_eval --model hf \ 90 | --model_args pretrained=${{ inputs.model_1 }},revision=${{ inputs.model_1_revision }},dtype=${{ inputs.model_1_dtype }} \ 91 | --tasks ${{ inputs.tasks }} \ 92 | --trust_remote_code \ 93 | --device cuda:0 \ 94 | --batch_size ${{ inputs.batch_size }} \ 95 | --limit ${{ inputs.examples_limit }} \ 96 | --output_path ./benchmarks/model_1/ 97 | 98 | - name: Benchmark Model 2 99 | run: | 100 | uv run lm_eval --model hf \ 101 | --model_args pretrained=${{ inputs.model_2 }},revision=${{ inputs.model_2_revision }},dtype=${{ inputs.model_2_dtype }} \ 102 | --tasks ${{ inputs.tasks }} \ 103 | --trust_remote_code \ 104 | --device cuda:0 \ 105 | --batch_size ${{ inputs.batch_size }} \ 106 | --limit ${{ inputs.examples_limit }} \ 107 | --output_path ./benchmarks/model_2/ 108 | 109 | - name: Generate Benchmark Comparison Chart 110 | run: | 111 | ls -l ./benchmarks/ 112 | uv run python ./llm_benchmark_plotting.py 113 | 114 | - name: Upload Benchmark Artifacts 115 | uses: actions/upload-artifact@v4 116 | with: 117 | name: benchmark-results 118 | path: benchmarks/ 119 | retention-days: 90 120 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | 10 | # Airflow 11 | airflow_settings.yaml 12 | **__pycache__ 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 
| lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | cover/ 58 | test-results/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | .pybuilder/ 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | # For a library or package, you might want to ignore these files since the code is 93 | # intended to run in multiple environments; otherwise, check them in: 94 | # .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 100 | # install all needed dependencies. 101 | #Pipfile.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/#use-with-ide 116 | .pdm.toml 117 | 118 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 119 | __pypackages__/ 120 | 121 | # Celery stuff 122 | celerybeat-schedule 123 | celerybeat.pid 124 | 125 | # SageMath parsed files 126 | *.sage.py 127 | 128 | # Environments 129 | .env 130 | .venv 131 | env/ 132 | venv/ 133 | ENV/ 134 | env.bak/ 135 | venv.bak/ 136 | 137 | # Spyder project settings 138 | .spyderproject 139 | .spyproject 140 | 141 | # Rope project settings 142 | .ropeproject 143 | 144 | # mkdocs documentation 145 | /site 146 | 147 | # mypy 148 | .mypy_cache/ 149 | .dmypy.json 150 | dmypy.json 151 | 152 | # Pyre type checker 153 | .pyre/ 154 | 155 | # pytype static type analyzer 156 | .pytype/ 157 | 158 | # Cython debug symbols 159 | cython_debug/ 160 | 161 | # PyCharm 162 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 163 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 164 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear 165 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 166 | .idea/ 167 | *.iml 168 | 169 | # VSCode 170 | .vscode/ 171 | 172 | # Zip files 173 | *.zip 174 | 175 | # Wheel files 176 | *.whl 177 | 178 | # Mac things 179 | .DS_Store 180 | 181 | requirements.txt 182 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Machine](./docs/github-repo-banner.png)](https://machine.dev/) 2 | 3 | Machine supercharges your GitHub Workflows with seamless GPU acceleration. Say goodbye to 4 | the tedious overhead of managing GPU runners and hello to streamlined efficiency. With Machine, 5 | developers and organizations can effortlessly scale their AI and machine learning projects, 6 | shifting focus from infrastructure headaches to innovation and speed. 7 | 8 | # Language Model Arena 9 | 10 | This repository enables easy comparison of open-weight language models using GPU-accelerated benchmarks via GitHub Actions powered by Machine. It leverages the [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) to evaluate model performance across multiple reasoning and language tasks. 11 | 12 | --- 13 | 14 | ### ✨ **Key Features** 15 | 16 | - **⚡ GPU Acceleration:** Quickly benchmark large language models using GPU power. 17 | - **📊 Automated Benchmarking:** Easily compare two language models on a configurable set of tasks. 18 | - **📈 Visualized Results:** Generate clear comparison charts automatically for straightforward analysis. 19 | - **🌎 Global Efficiency:** Utilize spot-priced GPU instances globally, optimizing performance and cost. 20 | - **🚀 Easy Customization:** Configure benchmarks, models, and computational resources through GitHub workflow inputs. 21 | 22 | --- 23 | 24 | ### 📁 **Repository Structure** 25 | 26 | ``` 27 | ├── .github/workflows/ 28 | │ └── llm-eval-benchmarking.yaml # Workflow configuration 29 | └── llm_benchmark_plotting.py # Script for plotting benchmark comparisons 30 | ``` 31 | 32 | --- 33 | 34 | ### ▶️ **Getting Started** 35 | 36 | #### 1. **Use This Repository as a Template** 37 | Click the **Use this template** button at the top of this page to quickly create your own benchmarking project. 38 | 39 | #### 2. **Configure Your Benchmarking** 40 | You can run the benchmarking workflow manually via GitHub Actions using the `workflow_dispatch` trigger.
This allows you to input parameters such as which models to compare, which tasks to run, and how many examples to evaluate: 41 | 42 | ```yaml 43 | on: 44 | workflow_dispatch: 45 | inputs: 46 | model_1: 47 | type: string 48 | required: false 49 | description: 'The first model to benchmark' 50 | default: 'openai/gpt-oss-20b' 51 | model_1_revision: 52 | type: string 53 | required: false 54 | description: 'The first model revision to benchmark' 55 | default: 'main' 56 | model_2: 57 | type: string 58 | required: false 59 | description: 'The second model to benchmark' 60 | default: 'google/gemma-3-12b-it' 61 | model_2_revision: 62 | type: string 63 | required: false 64 | description: 'The second model revision to benchmark' 65 | default: 'main' 66 | tasks: 67 | type: string 68 | required: false 69 | description: 'The tasks to benchmark' 70 | default: 'hellaswag,arc_easy,mathqa,truthfulqa,drop,arc_challenge,gsm8k,mmlu_abstract_algebra,mmlu_college_mathematics' 71 | examples_limit: 72 | type: string 73 | required: false 74 | description: 'The number of examples to use for benchmarking' 75 | default: '100' 76 | ``` 77 | 78 | These inputs are configurable directly in the GitHub Actions UI when manually triggering the workflow; the full workflow file additionally exposes per-model `dtype`, `batch_size`, and `tenancy` inputs. 79 | 80 | #### 3. **Run the Workflow with GPU Resources** 81 | The benchmarking job is configured to run on Machine GPU-powered runners. Instead of using standard GitHub-hosted runners, it provisions custom GPU instances with the desired hardware specs: 82 | 83 | ```yaml 84 | jobs: 85 | benchmark: 86 | name: LLM Eval Benchmarking 87 | runs-on: 88 | - machine 89 | - gpu=L40S 90 | - cpu=4 91 | - ram=32 92 | - architecture=x64 93 | - tenancy=spot 94 | ``` 95 | 96 | This setup runs on a Machine runner with an L40S GPU, 4 vCPUs, and 32 GB RAM. By specifying `tenancy=spot`, you can take advantage of lower-cost spot pricing. Machine automatically searches globally for the best available spot instance. 97 | 98 | To further control where runners are provisioned, you can specify allowed regions: 99 | 100 | ```yaml 101 | jobs: 102 | benchmark: 103 | name: LLM Eval Benchmarking 104 | runs-on: 105 | - machine 106 | - gpu=L40S 107 | - cpu=4 108 | - ram=32 109 | - architecture=x64 110 | - tenancy=spot 111 | - regions=us-east-1,us-east-2 112 | ``` 113 | 114 | This limits provisioning to the listed AWS regions. 115 | 116 | #### 4. **Generate and Review Benchmark Results** 117 | The workflow automatically runs evaluation scripts and generates visual comparison charts: 118 | 119 | ```yaml 120 | - name: Generate Benchmark Comparison Chart 121 | run: | 122 | ls -l ./benchmarks/ 123 | uv run python ./llm_benchmark_plotting.py 124 | ``` 125 | 126 | These charts compare the performance of both models across the selected tasks. 127 | 128 | #### 5. **Export Results as Artifacts** 129 | All benchmark outputs, including raw JSON and charts, are saved as GitHub Actions artifacts: 130 | 131 | ```yaml 132 | - name: Upload Benchmark Artifacts 133 | uses: actions/upload-artifact@v4 134 | with: 135 | name: benchmark-results 136 | path: benchmarks/ 137 | retention-days: 90 138 | ``` 139 | 140 | You can download these results after the workflow completes for further analysis or sharing. 141 | 142 | --- 143 | 144 | ### 🔑 **Prerequisites** 145 | 146 | - GitHub account 147 | - Access to [Machine](https://machine.dev) GPU-powered runners 148 | - A Hugging Face token for model access, saved to repository secrets as `HF_TOKEN` (see the snippet below).
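The included workflow (`.github/workflows/llm-eval-benchmarking.yaml`) already wires this secret into the job environment, so creating the `HF_TOKEN` repository secret is the only manual step. For reference, the relevant `env` block from that workflow is shown below (comments added here for clarity):

```yaml
env:
  HF_TOKEN: ${{ secrets.HF_TOKEN }}  # read from the repository secret you created
  HF_HUB_ENABLE_HF_TRANSFER: 1       # use hf-transfer for faster model downloads
  HF_HUB_DOWNLOAD_TIMEOUT: 120       # seconds before a Hub download request times out
```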
149 | 150 | _No local setup required; all benchmarks run seamlessly through GitHub Actions._ 151 | 152 | --- 153 | 154 | ### 📄 **License** 155 | 156 | This repository is available under the [MIT License](LICENSE). 157 | 158 | --- 159 | 160 | ### 📌 **Notes** 161 | 162 | - Benchmarks provided are designed to test reasoning capabilities across tasks like: 163 | - `hellaswag`, `arc_easy`, `mathqa`, `truthfulqa`, `drop`, `arc_challenge`, `gsm8k`, `mmlu_abstract_algebra`, and `mmlu_college_mathematics`. 164 | 165 | - This repository is currently open for use as a template. While public forks are encouraged, we are not accepting Pull Requests at this time. 166 | 167 | _For questions or concerns, please open an issue._ 168 | --------------------------------------------------------------------------------