├── .github └── workflows │ └── deploy.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── api ├── .env.example ├── llm_bench │ ├── __init__.py │ ├── api.py │ ├── cloud │ │ ├── __init__.py │ │ ├── logging.py │ │ ├── providers │ │ │ ├── anthropic.py │ │ │ ├── anyscale.py │ │ │ ├── azure.py │ │ │ ├── bedrock.py │ │ │ ├── databricks.py │ │ │ ├── deepinfra.py │ │ │ ├── fireworks.py │ │ │ ├── groq.py │ │ │ ├── lambda.py │ │ │ ├── openai.py │ │ │ ├── openrouter.py │ │ │ ├── runpod.py │ │ │ ├── together.py │ │ │ └── vertex.py │ │ └── server.py │ ├── config.py │ ├── local │ │ ├── __init__.py │ │ ├── gguf │ │ │ ├── __init__.py │ │ │ ├── create_model.py │ │ │ ├── server.py │ │ │ └── utils.py │ │ └── hf │ │ │ ├── __init__.py │ │ │ ├── server.py │ │ │ ├── tgi │ │ │ ├── __init__.py │ │ │ ├── generate.py │ │ │ └── tgi_docker.py │ │ │ └── transformers │ │ │ ├── __init__.py │ │ │ └── generate.py │ ├── logging.py │ ├── types.py │ └── utils.py ├── run_cloud.py ├── run_gguf.py ├── run_hf.py ├── run_test.py └── run_vllm.py ├── cloud ├── .env.example ├── Dockerfile-cloud ├── README.md ├── docker-compose.cloud.yml └── models.json ├── local ├── .env.example ├── README.md ├── docker-compose.local.yml ├── gguf │ ├── Dockerfile-gguf │ ├── create_models.sh │ └── pyproject.toml ├── huggingface │ ├── Dockerfile-huggingface │ ├── pyproject.toml │ └── tgi_server.sh └── vllm │ ├── Dockerfile-vllm │ ├── llm_bench_vllm │ ├── __init__.py │ ├── generate.py │ └── server.py │ └── pyproject.toml ├── models_config.yaml ├── pyproject.toml ├── static ├── benchmarks_all_models.png ├── benchmarks_large_models.png ├── dolly2_compare_size_and_quant_inference.png ├── falcon_compare_quantization_inference.png ├── ggml-hf-llama-compare.png ├── gpt2_compare_quantization_inference.png └── llama_compare_size_and_quant_inference.png └── tox.ini /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Deploy - Clifford 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - name: Checkout code 14 | uses: actions/checkout@v3 15 | 16 | - name: Set up SSH 17 | run: | 18 | mkdir -p ~/.ssh 19 | echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_rsa 20 | chmod 600 ~/.ssh/id_rsa 21 | ssh-keyscan -H 5.161.97.53 >> ~/.ssh/known_hosts 22 | 23 | - name: Deploy to VPS 24 | run: | 25 | ssh -i ~/.ssh/id_rsa drose@5.161.97.53 << EOF 26 | cd llm-benchmarks 27 | 28 | git reset --hard HEAD 29 | git pull origin main --force 30 | 31 | docker compose down 32 | docker compose -f ./cloud/docker-compose.cloud.yml up --build --remove-orphans -d 33 | EOF 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | wandb/ 3 | __pycache__/ 4 | results/ 5 | old/ 6 | *.lock 7 | *.ipynb 8 | *.env 9 | logs/ 10 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # Apply to all files without commiting: 2 | # pre-commit run --all-files 3 | # Update this file: 4 | # pre-commit autoupdate 5 | repos: 6 | - repo: https://github.com/pre-commit/pre-commit-hooks 7 | rev: v4.6.0 8 | hooks: 9 | - id: fix-byte-order-marker 10 | - id: check-case-conflict 11 | - id: check-executables-have-shebangs 12 | - repo: https://github.com/astral-sh/ruff-pre-commit 13 | 
rev: v0.4.5 14 | hooks: 15 | - id: ruff 16 | args: [--fix] 17 | - id: ruff-format 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 David Rose 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![llmbenchmarkscom](https://cronitor.io/badges/G8yp5e/production/VnmBXHNorcpEyvbg9ASvxeGp8zU.svg) 2 | 3 | # LLM Benchmarks 4 | 5 | ### 🌐 Live at: [llm-benchmarks.com](https://llm-benchmarks.com) 6 | [![Status](https://img.shields.io/uptimerobot/status/m797914664-fefc15fb1a5bba071a8a5c91)](https://stats.uptimerobot.com/m797914664-fefc15fb1a5bba071a8a5c91) 7 | [![Uptime](https://img.shields.io/uptimerobot/ratio/30/m797914664-fefc15fb1a5bba071a8a5c91)](https://stats.uptimerobot.com/m797914664-fefc15fb1a5bba071a8a5c91) 8 | 9 | [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/) 10 | [![Docker](https://img.shields.io/badge/docker-compose-2496ED.svg?logo=docker&logoColor=white)](https://www.docker.com/) 11 | [![MongoDB](https://img.shields.io/badge/MongoDB-4EA94B.svg?logo=mongodb&logoColor=white)](https://www.mongodb.com/) 12 | [![Redis](https://img.shields.io/badge/Redis-DC382D.svg?logo=redis&logoColor=white)](https://redis.io/) 13 | [![NVIDIA CUDA](https://img.shields.io/badge/NVIDIA-CUDA-76B900.svg?logo=nvidia&logoColor=white)](https://developer.nvidia.com/cuda-toolkit) 14 | [![vLLM](https://img.shields.io/badge/vLLM-Accelerated_Inference-orange.svg)](https://github.com/vllm-project/vllm) 15 | [![Hugging Face](https://img.shields.io/badge/🤗_Hugging_Face-Transformers-yellow.svg)](https://huggingface.co/docs/transformers/index) 16 | [![License](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE) 17 | 18 | A comprehensive framework for benchmarking LLM inference speeds across various models and frameworks. 19 | 20 | ## Overview 21 | 22 | This project provides tools to benchmark Large Language Model (LLM) inference speeds across different frameworks, model sizes, and quantization methods. The benchmarks are designed to run both locally and in cloud environments, with results displayed on a dashboard at [llm-benchmarks.com](https://llm-benchmarks.com). 
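Each benchmark run, whether local or cloud, is reduced to a common set of timing metrics before being logged or stored. The field names below are taken from the provider implementations under `api/llm_bench`; the values themselves are purely illustrative:

```python
# Representative shape of a single benchmark result (values are made up).
metrics = {
    "gen_ts": "2024-05-01 12:00:00",         # timestamp of the generation
    "requested_tokens": 256,                 # max_tokens requested
    "output_tokens": 248,                    # tokens actually produced
    "generate_time": 3.1,                    # total wall-clock seconds
    "tokens_per_second": 80.0,               # output_tokens / generate_time
    "time_to_first_token": 0.42,             # seconds until first streamed token
    "times_between_tokens": [0.011, 0.012],  # inter-token latencies (truncated)
}
```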
23 | 24 | The system uses Docker with various frameworks (vLLM, Transformers, Text-Generation-Inference, llama-cpp) to automate benchmarks and upload results to a MongoDB database. Most frameworks fetch models from the HuggingFace Hub and cache them for on-demand loading, with the exception of llama-cpp/GGUF which requires specially compiled model formats. 25 | 26 | ## Repository Structure 27 | 28 | - **`/api`**: Core benchmarking logic and API clients for different frameworks 29 | - **`/cloud`**: Configuration and Docker setup for cloud-based benchmarks (OpenAI, Anthropic, etc.) 30 | - **`/local`**: Configuration and Docker setup for local benchmarks (Hugging Face, vLLM, GGUF) 31 | - **`/local/huggingface`**: Transformers and Text-Generation-Inference benchmarks 32 | - **`/local/vllm`**: vLLM benchmarks 33 | - **`/local/gguf`**: GGUF/llama-cpp benchmarks 34 | - **`/scripts`**: Utility scripts and notebooks 35 | - **`/static`**: Static assets like benchmark result images 36 | - **`models_config.yaml`**: Configuration for model groups used in benchmarks 37 | 38 | ## Getting Started 39 | 40 | ### Prerequisites 41 | 42 | - Docker and Docker Compose 43 | - NVIDIA GPU with CUDA support 44 | - Python 3.9+ 45 | - MongoDB (optional, for result storage) 46 | - Redis (for task queuing) 47 | 48 | ### Setup 49 | 50 | 1. Clone the repository: 51 | ```bash 52 | git clone https://github.com/cipher982/llm-benchmarks.git 53 | cd llm-benchmarks 54 | ``` 55 | 56 | 2. Set up environment variables: 57 | ```bash 58 | # For local benchmarks 59 | cp local/.env.example local/.env 60 | # For cloud benchmarks 61 | cp cloud/.env.example cloud/.env 62 | ``` 63 | 64 | 3. Edit the `.env` files with your configuration: 65 | - Set `HF_HUB_CACHE` to your Hugging Face model cache directory 66 | - Configure MongoDB connection if using (`MONGODB_URI`, `MONGODB_DB`, etc.) 67 | - Set API keys for cloud providers if benchmarking them 68 | - Configure Redis connection details 69 | 70 | ### Running Benchmarks 71 | 72 | #### Local Benchmarks 73 | 74 | 1. Start the local benchmark containers: 75 | ```bash 76 | cd local 77 | docker compose -f docker-compose.local.yml up --build 78 | ``` 79 | 80 | 2. Run benchmarks for specific frameworks: 81 | 82 | - Hugging Face Transformers: 83 | ```bash 84 | python api/run_hf.py --framework transformers --limit 5 --max-size-billion 10 --run-always 85 | ``` 86 | 87 | - Hugging Face Text-Generation-Inference: 88 | ```bash 89 | python api/run_hf.py --framework hf-tgi --limit 5 --max-size-billion 10 --run-always 90 | ``` 91 | 92 | - vLLM: 93 | ```bash 94 | python api/run_vllm.py --framework vllm --limit 5 --max-size-billion 10 --run-always 95 | ``` 96 | 97 | - GGUF/llama-cpp: 98 | ```bash 99 | python api/run_gguf.py --limit 5 --run-always --log-level DEBUG 100 | ``` 101 | 102 | #### Cloud Benchmarks 103 | 104 | 1. Start the cloud benchmark container: 105 | ```bash 106 | cd cloud 107 | docker compose -f docker-compose.cloud.yml up --build 108 | ``` 109 | 110 | 2. Run benchmarks for cloud providers: 111 | ```bash 112 | python api/run_cloud.py --providers openai 113 | # Or run all configured providers 114 | python api/run_cloud.py --providers all 115 | ``` 116 | 117 | ## Viewing Results 118 | 119 | Results can be viewed in several ways: 120 | 121 | 1. **Dashboard**: Visit [llm-benchmarks.com](https://llm-benchmarks.com) to see the latest benchmark results 122 | 2. 
**Log Files**: Check the `logs/` directory for: 123 | - `benchmarks_local.log` and `benchmarks_cloud.log`: Text logs with detailed metrics 124 | - `benchmarks_local.json` and `benchmarks_cloud.json`: JSON-formatted logs 125 | 3. **MongoDB**: If configured, results are stored in MongoDB collections 126 | 127 | ## Benchmark Results 128 | 129 | The benchmarks measure inference speed across different models, quantization methods, and output token counts. Results indicate that even the slowest performing combinations still handily beat GPT-4 and almost always match or beat GPT-3.5, sometimes significantly. 130 | 131 | ### Framework Comparisons 132 | 133 | Different frameworks show significant performance variations. For example, GGML with cuBLAS significantly outperforms Hugging Face Transformers with BitsAndBytes quantization: 134 | 135 | ![GGML v HF](https://github.com/cipher982/llm-benchmarks/blob/main/static/ggml-hf-llama-compare.png?raw=true) 136 | 137 | ### Model Size and Quantization Impact 138 | 139 | Benchmarks show how model size and quantization affect inference speed: 140 | 141 | #### LLaMA Models 142 | ![LLaMA Models](https://github.com/cipher982/llm-benchmarks/blob/main/static/llama_compare_size_and_quant_inference.png?raw=true) 143 | 144 | #### Dolly-2 Models 145 | ![Dolly2 Models](https://github.com/cipher982/llm-benchmarks/blob/main/static/dolly2_compare_size_and_quant_inference.png?raw=true) 146 | 147 | #### Falcon Models 148 | ![Falcon Models](https://github.com/cipher982/llm-benchmarks/blob/main/static/falcon_compare_quantization_inference.png?raw=true) 149 | 150 | ## Hardware Considerations 151 | 152 | Benchmarks have been run on various GPUs including: 153 | - NVIDIA RTX 3090 154 | - NVIDIA A10 155 | - NVIDIA A100 156 | - NVIDIA H100 157 | 158 | The H100 consistently delivers the fastest performance but at a higher cost (~$2.40/hour). Surprisingly, the A10 performed below expectations despite its higher tensor core count, possibly due to memory bandwidth limitations. 159 | 160 | ## Contributing 161 | 162 | Contributions are welcome! To add new models or frameworks: 163 | 164 | 1. Fork the repository 165 | 2. Create a feature branch 166 | 3. Add your implementation 167 | 4. Submit a pull request 168 | 169 | For more details, see the individual README files in the `/local` and `/cloud` directories. 170 | 171 | ## License 172 | 173 | This project is licensed under the MIT License - see the LICENSE file for details. 
174 | -------------------------------------------------------------------------------- /api/.env.example: -------------------------------------------------------------------------------- 1 | LOGS_DIR="./logs" 2 | FASTAPI_PORT_CLOUD="5004" 3 | -------------------------------------------------------------------------------- /api/llm_bench/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cipher982/llm-benchmarks/39a1519086c0f993ccc340b875d4da09e3e22cf6/api/llm_bench/__init__.py -------------------------------------------------------------------------------- /api/llm_bench/api.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from enum import Enum 4 | from enum import auto 5 | from typing import Optional 6 | 7 | import requests 8 | from requests.exceptions import HTTPError 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | FLASK_URL = "http://localhost:{}/benchmark" 13 | 14 | CACHE_DIR = os.environ.get("HF_HUB_CACHE") 15 | assert CACHE_DIR, "HF_HUB_CACHE environment variable not set" 16 | 17 | 18 | class ModelType(Enum): 19 | GPTQ = auto() 20 | AWQ = auto() 21 | OTHER = auto() 22 | 23 | 24 | class BenchmarkConfig: 25 | def __init__( 26 | self, 27 | framework: str, 28 | model: str, 29 | quant_types: list, 30 | limit: int, 31 | run_always: bool, 32 | query: str, 33 | max_tokens: int, 34 | temperature: float, 35 | flask_port: int, 36 | ): 37 | self.framework = framework 38 | self.model = model 39 | self.quant_types = quant_types 40 | self.limit = limit 41 | self.run_always = run_always 42 | self.query = query 43 | self.max_tokens = max_tokens 44 | self.temperature = temperature 45 | self.flask_port = flask_port 46 | 47 | 48 | class CloudConfig: 49 | def __init__( 50 | self, 51 | provider: str, 52 | model: str, 53 | query: str, 54 | max_tokens: int, 55 | temperature: float, 56 | limit: int, 57 | run_always: bool, 58 | ): 59 | self.provider = provider 60 | self.model = model 61 | self.query = query 62 | self.max_tokens = max_tokens 63 | self.temperature = temperature 64 | self.limit = limit 65 | self.run_always = run_always 66 | 67 | 68 | def determine_model_type(model_name: str) -> ModelType: 69 | if "GPTQ" in model_name: 70 | return ModelType.GPTQ 71 | elif "AWQ" in model_name: 72 | return ModelType.AWQ 73 | else: 74 | return ModelType.OTHER 75 | 76 | 77 | def bench_all_models( 78 | framework: str, 79 | quant_types: list, 80 | model_names: list[str], 81 | model_status: dict[str, dict], 82 | limit: int, 83 | run_always: bool, 84 | query: str, 85 | max_tokens: int, 86 | temperature: float, 87 | flask_port: int, 88 | ) -> None: 89 | for model in model_names[:limit]: 90 | model_type = determine_model_type(model) 91 | is_limit_reached = run_benchmark_for_type( 92 | framework, 93 | model, 94 | quant_types, 95 | model_status, 96 | model_type, 97 | limit, 98 | run_always, 99 | query, 100 | max_tokens, 101 | temperature, 102 | flask_port, 103 | ) 104 | if is_limit_reached: 105 | break 106 | 107 | 108 | def run_benchmark_for_type( 109 | framework: str, 110 | model: str, 111 | quant_types: list, 112 | model_status: dict[str, dict], 113 | model_type: ModelType, 114 | limit: int, 115 | run_always: bool, 116 | query: str, 117 | max_tokens: int, 118 | temperature: float, 119 | flask_port: int, 120 | ) -> bool: 121 | config = BenchmarkConfig( 122 | framework, 123 | model, 124 | quant_types, 125 | limit, 126 | run_always, 127 | query, 128 | max_tokens, 129 | 
temperature, 130 | flask_port, 131 | ) 132 | if model_type == ModelType.GPTQ: 133 | return run_benchmark(config, model_status, "gptq", "4bit") 134 | elif model_type == ModelType.AWQ: 135 | return run_benchmark(config, model_status, "awq", "4bit") 136 | else: 137 | for quant in quant_types: 138 | quant_method = "bitsandbytes" if quant is not None else None 139 | if run_benchmark(config, model_status, quant_method, quant): 140 | return True 141 | return False 142 | 143 | 144 | def run_benchmark( 145 | config: BenchmarkConfig, 146 | model_status: dict[str, dict], 147 | quant_method: Optional[str], 148 | quant_bits: Optional[str], 149 | ) -> bool: 150 | """ 151 | Run benchmark for a given model and quantization type. 152 | Returns True if the limit is reached, False otherwise. 153 | """ 154 | quant_str = f"{quant_method}_{quant_bits}" if quant_method is not None else "none" 155 | print(f"Running benchmark: {config.model}, quant: {quant_str}") 156 | 157 | flask_data = { 158 | "framework": config.framework, 159 | "model_name": config.model, 160 | "query": config.query, 161 | "quant_method": quant_method, 162 | "quant_bits": quant_bits, 163 | "max_tokens": config.max_tokens, 164 | "temperature": config.temperature, 165 | "run_always": config.run_always, 166 | } 167 | try: 168 | response = requests.post(FLASK_URL.format(config.flask_port), data=flask_data) 169 | response.raise_for_status() 170 | except HTTPError as http_err: 171 | print(f"HTTP error occurred: {http_err}") 172 | model_status[f"{config.model}_{quant_str}"] = {"status_code": 500, "json": {}} 173 | return False 174 | except Exception as err: 175 | print(f"Other error occurred: {err}") 176 | model_status[f"{config.model}_{quant_str}"] = {"status_code": 500, "json": {}} 177 | return False 178 | else: 179 | response_code = response.status_code 180 | response_json = response.json() 181 | print(f"Finished benchmark: {config.model}, quant: {quant_str} with Status Code: {response_code}") 182 | 183 | model_status[f"{config.model}_{quant_str}"] = {"status_code": response_code, "json": response_json} 184 | return len(model_status) >= config.limit 185 | 186 | 187 | def print_summary(model_status: dict[str, dict]) -> None: 188 | """ 189 | Print a summary of the benchmark runs. 
190 | """ 191 | print("Summary of benchmark runs:") 192 | skipped_models = [] 193 | for model, response in model_status.items(): 194 | status = response["json"]["status"] if "json" in response and "status" in response["json"] else "unknown" 195 | if status == "skipped": 196 | skipped_models.append(model) 197 | continue 198 | 199 | if skipped_models: 200 | print(f"Skipped models: {', '.join(skipped_models)} ⏭️") 201 | 202 | for model, response in model_status.items(): 203 | status = response["json"]["status"] if "json" in response and "status" in response["json"] else "unknown" 204 | if status == "skipped": 205 | continue 206 | elif response["status_code"] == 200: 207 | print(f"Model: {model}, {response['status_code']} ✅ (Benchmark Successful)") 208 | elif response["status_code"] == 500: 209 | print(f"Model: {model}, {response['status_code']} ❌ (Benchmark Failed)") 210 | else: 211 | print(f"Model: {model}, {response['status_code']} ❓ (Unknown Status)") 212 | print("🎊 Done 🎊") 213 | -------------------------------------------------------------------------------- /api/llm_bench/cloud/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cipher982/llm-benchmarks/39a1519086c0f993ccc340b875d4da09e3e22cf6/api/llm_bench/cloud/__init__.py -------------------------------------------------------------------------------- /api/llm_bench/cloud/logging.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from collections import defaultdict 4 | from datetime import datetime 5 | from typing import Any 6 | from typing import Dict 7 | from typing import List 8 | 9 | import redis 10 | 11 | 12 | class CustomJSONEncoder(json.JSONEncoder): 13 | def default(self, obj: Any) -> Any: 14 | if isinstance(obj, datetime): 15 | return obj.isoformat() 16 | return super().default(obj) 17 | 18 | 19 | class Logger: 20 | def __init__(self, logs_dir: str, redis_url: str, max_runs: int = 10): 21 | if not redis_url: 22 | raise ValueError("redis_url must be provided") 23 | if not redis_url.startswith("redis://"): 24 | raise ValueError("redis_url must start with 'redis://'") 25 | 26 | self.redis_url = redis_url 27 | self.max_runs = max_runs # Number of runs to keep in history 28 | 29 | self.logger = logging.getLogger(__name__) 30 | self.logger.setLevel(logging.INFO) 31 | 32 | stream_handler = logging.StreamHandler() 33 | stream_handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")) 34 | 35 | self.logger.addHandler(stream_handler) 36 | 37 | def log_info(self, message: str) -> None: 38 | self.logger.info(message) 39 | 40 | def log_error(self, message: str) -> None: 41 | self.logger.error(message) 42 | 43 | def log_benchmark_request(self, request: Any) -> None: 44 | self.log_info(f"Benchmark Request - Provider: {request.provider}, Model: {request.model}") 45 | 46 | def get_run_outcome(self, status: Dict[str, Any]) -> bool: 47 | return status.get("status") == "success" 48 | 49 | def log_benchmark_status(self, model_status: List[Dict[str, Any]]) -> None: 50 | try: 51 | with redis.Redis.from_url(self.redis_url) as redis_client: 52 | # Get current data from Redis 53 | current_data = redis_client.get("cloud_log_status") 54 | existing_data = defaultdict(lambda: {"runs": []}, json.loads(current_data) if current_data else {}) 55 | 56 | # Track which providers we're updating in this run 57 | current_providers = {status.get("provider") for status in model_status if 
status.get("provider")} 58 | self.log_info(f"Updating status for providers: {current_providers}") 59 | 60 | # Process new results 61 | for status in model_status: 62 | try: 63 | # self.log_info(f"Processing status: {json.dumps(status, default=str)}") 64 | 65 | model = status["model"] 66 | provider = status.get("provider") 67 | if not provider: 68 | self.log_error(f"No provider found in status: {json.dumps(status, default=str)}") 69 | continue 70 | 71 | composite_key = f"{provider}:{model}" 72 | existing_data[composite_key].update( 73 | { 74 | "provider": provider, 75 | "model": model, 76 | "last_run_timestamp": status.get("timestamp", datetime.now().isoformat()), 77 | } 78 | ) 79 | 80 | # Add new run to the end of the list 81 | existing_data[composite_key]["runs"].append(self.get_run_outcome(status)) 82 | # If we exceed max_runs, remove oldest entries (from the beginning) 83 | if len(existing_data[composite_key]["runs"]) > self.max_runs: 84 | existing_data[composite_key]["runs"] = existing_data[composite_key]["runs"][ 85 | -self.max_runs : 86 | ] 87 | 88 | except KeyError as e: 89 | self.log_error(f"KeyError processing status: {e}, Status: {json.dumps(status, default=str)}") 90 | continue 91 | except Exception as e: 92 | self.log_error( 93 | f"Error processing individual status: {str(e)}, Status: {json.dumps(status, default=str)}" 94 | ) 95 | continue 96 | 97 | # Update Redis with new data 98 | redis_client.set("cloud_log_status", json.dumps(existing_data, cls=CustomJSONEncoder)) 99 | self.log_info("Successfully updated api status to redis") 100 | 101 | except redis.ConnectionError as e: 102 | self.log_error(f"Redis connection error: {str(e)}") 103 | except redis.RedisError as e: 104 | self.log_error(f"Redis error: {str(e)}") 105 | except json.JSONDecodeError as e: 106 | self.log_error(f"Error decoding Redis data: {str(e)}") 107 | except Exception as e: 108 | self.log_error(f"Unexpected error occurred: {str(e)}") 109 | -------------------------------------------------------------------------------- /api/llm_bench/cloud/providers/anthropic.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | from datetime import datetime 4 | 5 | from anthropic import Anthropic 6 | from llm_bench.config import CloudConfig 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | NON_CHAT_MODELS = [] 12 | 13 | 14 | def generate(config: CloudConfig, run_config: dict) -> dict: 15 | """Run Anthropic inference using the new Messages format and return metrics, with streaming.""" 16 | 17 | assert config.provider == "anthropic", "provider must be anthropic" 18 | assert "query" in run_config, "query must be in run_config" 19 | assert "max_tokens" in run_config, "max_tokens must be in run_config" 20 | 21 | # Set up connection 22 | anthropic = Anthropic() 23 | 24 | # Generate 25 | time_0 = time.time() 26 | first_token_received = False 27 | previous_token_time = None 28 | output_tokens = 0 29 | times_between_tokens = [] 30 | 31 | if config.model_name in NON_CHAT_MODELS: 32 | raise NotImplementedError 33 | else: 34 | with anthropic.messages.stream( 35 | model=config.model_name, 36 | max_tokens=run_config["max_tokens"], 37 | messages=[{"role": "user", "content": run_config["query"]}], 38 | ) as stream: 39 | time_to_first_token = None 40 | for event in stream: 41 | current_time = time.time() 42 | event_type = type(event).__name__ 43 | if event_type == "RawMessageStartEvent": 44 | first_token_received = False 45 | elif event_type == 
"RawContentBlockDeltaEvent": 46 | if not first_token_received: 47 | time_to_first_token = current_time - time_0 48 | first_token_received = True 49 | else: 50 | assert previous_token_time is not None 51 | times_between_tokens.append(current_time - previous_token_time) 52 | previous_token_time = current_time 53 | elif event_type == "MessageStopEvent": 54 | output_tokens = event.message.usage.output_tokens # type: ignore 55 | 56 | time_1 = time.time() 57 | generate_time = time_1 - time_0 58 | tokens_per_second = output_tokens / generate_time if generate_time > 0 else 0 59 | 60 | metrics = { 61 | "gen_ts": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 62 | "requested_tokens": run_config["max_tokens"], 63 | "output_tokens": output_tokens, 64 | "generate_time": generate_time, 65 | "tokens_per_second": tokens_per_second, 66 | "time_to_first_token": time_to_first_token, 67 | "times_between_tokens": times_between_tokens, 68 | } 69 | 70 | return metrics 71 | -------------------------------------------------------------------------------- /api/llm_bench/cloud/providers/anyscale.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | from datetime import datetime 5 | 6 | from llm_bench.config import CloudConfig 7 | from openai import OpenAI 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | def generate(config: CloudConfig, run_config: dict) -> dict: 13 | """Run Anyscale inference and return metrics.""" 14 | 15 | assert config.provider == "anyscale", "provider must be Anyscale" 16 | assert "query" in run_config, "query must be in run_config" 17 | assert "max_tokens" in run_config, "max_tokens must be in run_config" 18 | 19 | # Set up connection 20 | client = OpenAI( 21 | base_url=os.environ["ANYSCALE_BASE_URL"], 22 | api_key=os.environ["ANYSCALE_API_KEY"], 23 | ) 24 | 25 | # Generate 26 | time_0 = time.time() 27 | first_token_received = False 28 | previous_token_time = None 29 | output_chunks = 0 30 | times_between_tokens = [] 31 | time_to_first_token = 0 32 | response_str = "" 33 | 34 | stream = client.chat.completions.create( 35 | model=config.model_name, 36 | messages=[{"role": "user", "content": run_config["query"]}], 37 | max_tokens=run_config["max_tokens"], 38 | stream=True, 39 | ) 40 | 41 | for chunk in stream: 42 | response = chunk.choices[0].delta # type: ignore 43 | response_content = response.content if response is not None else None 44 | 45 | if response_content is not None: 46 | current_time = time.time() 47 | if not first_token_received: 48 | time_to_first_token = current_time - time_0 49 | first_token_received = True 50 | else: 51 | assert previous_token_time is not None 52 | times_between_tokens.append(current_time - previous_token_time) 53 | previous_token_time = current_time 54 | response_str += response_content 55 | output_chunks += 1 56 | 57 | time_1 = time.time() 58 | generate_time = time_1 - time_0 59 | 60 | # Calculate tokens 61 | output_tokens = chunk.usage.completion_tokens # type: ignore 62 | tokens_per_second = output_tokens / generate_time if generate_time > 0 else 0 63 | 64 | metrics = { 65 | "gen_ts": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 66 | "requested_tokens": run_config["max_tokens"], 67 | "output_tokens": output_tokens, 68 | "generate_time": generate_time, 69 | "tokens_per_second": tokens_per_second, 70 | "time_to_first_token": time_to_first_token, 71 | "times_between_tokens": times_between_tokens, 72 | } 73 | 74 | return metrics 75 | 
-------------------------------------------------------------------------------- /api/llm_bench/cloud/providers/azure.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | from datetime import datetime 5 | 6 | from llm_bench.config import CloudConfig 7 | from openai import OpenAI 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | # Azure-specific model mappings to OpenAI parameters 12 | MODEL_NAME_MAPPING = { 13 | "llama-2-7b-chat": { 14 | "api_key_env": "AZURE_L7_API_KEY", 15 | "post_url_env": "AZURE_L7_POST_URL", 16 | }, 17 | "llama-2-13b-chat": { 18 | "api_key_env": "AZURE_L13_API_KEY", 19 | "post_url_env": "AZURE_L13_POST_URL", 20 | }, 21 | "llama-2-70b-chat": { 22 | "api_key_env": "AZURE_L70_API_KEY", 23 | "post_url_env": "AZURE_L70_POST_URL", 24 | }, 25 | "mistral-large": { 26 | "api_key_env": "AZURE_MISTRAL_L_API_KEY", 27 | "post_url_env": "AZURE_MISTRAL_L_POST_URL", 28 | }, 29 | "cohere-cmd-r-plus": { 30 | "api_key_env": "AZURE_COHERE_CMD_R_PLUS_API_KEY", 31 | "post_url_env": "AZURE_COHERE_CMD_R_PLUS_POST_URL", 32 | }, 33 | } 34 | 35 | 36 | def generate(config: CloudConfig, run_config: dict) -> dict: 37 | """Run Azure inference using OpenAI format and return metrics.""" 38 | 39 | assert config.provider == "azure", "provider must be 'azure'" 40 | assert "query" in run_config, "query must be in run_config" 41 | assert "max_tokens" in run_config, "max_tokens must be in run_config" 42 | 43 | model_mapping = MODEL_NAME_MAPPING.get(config.model_name) 44 | if not model_mapping: 45 | raise ValueError(f"Unsupported model_name: {config.model_name}") 46 | 47 | client = OpenAI( 48 | base_url=os.environ[model_mapping["post_url_env"]], 49 | api_key=os.environ[model_mapping["api_key_env"]], 50 | ) 51 | 52 | # Generate 53 | time_0 = time.time() 54 | first_token_received = False 55 | previous_token_time = None 56 | output_tokens = 0 57 | times_between_tokens = [] 58 | time_to_first_token = 0 59 | 60 | completion = client.chat.completions.create( 61 | model="azureai", 62 | messages=[ 63 | {"role": "system", "content": "You are a friendly AI."}, 64 | {"role": "user", "content": run_config["query"]}, 65 | ], 66 | max_tokens=run_config["max_tokens"], 67 | stream=True, 68 | ) 69 | logger.debug(f"Completion: {completion}") 70 | 71 | for chunk in completion: 72 | logger.debug(f"Chunk: {chunk}") 73 | current_time = time.time() 74 | if not first_token_received: 75 | time_to_first_token = current_time - time_0 76 | first_token_received = True 77 | else: 78 | assert previous_token_time is not None 79 | times_between_tokens.append(current_time - previous_token_time) 80 | previous_token_time = current_time 81 | output_tokens += 1 82 | 83 | time_1 = time.time() 84 | generate_time = time_1 - time_0 85 | tokens_per_second = output_tokens / generate_time if generate_time > 0 else 0 86 | 87 | metrics = { 88 | "gen_ts": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 89 | "requested_tokens": run_config["max_tokens"], 90 | "output_tokens": output_tokens, 91 | "generate_time": generate_time, 92 | "tokens_per_second": tokens_per_second, 93 | "time_to_first_token": time_to_first_token, 94 | "times_between_tokens": times_between_tokens, 95 | } 96 | 97 | return metrics 98 | -------------------------------------------------------------------------------- /api/llm_bench/cloud/providers/bedrock.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | from datetime import 
datetime 4 | 5 | import boto3 6 | from botocore.exceptions import ClientError 7 | from llm_bench.config import CloudConfig 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | def generate(config: CloudConfig, run_config: dict) -> dict: 13 | """Run BedRock inference and return metrics.""" 14 | 15 | assert config.provider == "bedrock", "provider must be bedrock" 16 | assert "query" in run_config, "query must be in run_config" 17 | assert "max_tokens" in run_config, "max_tokens must be in run_config" 18 | 19 | # For some reason newer or bigger models start only in us-west-2 20 | REGION_MAP = {"opus": "us-west-2", "llama3-1": "us-west-2"} 21 | region_name = next((REGION_MAP[k] for k in REGION_MAP if k in config.model_name.lower()), "us-east-1") 22 | 23 | # Set up connection 24 | bedrock_client = boto3.client( 25 | service_name="bedrock-runtime", 26 | region_name=region_name, 27 | ) 28 | 29 | # Prepare the messages 30 | messages = [{"role": "user", "content": [{"text": run_config["query"]}]}] 31 | 32 | # Prepare system prompts (if needed) 33 | system_prompts = [] 34 | 35 | # Prepare inference config 36 | inference_config = {"temperature": config.temperature, "maxTokens": run_config["max_tokens"]} 37 | 38 | # Additional model fields 39 | additional_model_fields = {} 40 | 41 | # Generate 42 | time_0 = time.time() 43 | first_token_received = False 44 | previous_token_time = None 45 | time_to_first_token = None 46 | output_tokens = 0 47 | times_between_tokens = [] 48 | 49 | try: 50 | response = bedrock_client.converse_stream( 51 | modelId=config.model_name, 52 | messages=messages, 53 | system=system_prompts, 54 | inferenceConfig=inference_config, 55 | additionalModelRequestFields=additional_model_fields, 56 | ) 57 | 58 | stream = response.get("stream") 59 | if stream: 60 | for event in stream: 61 | current_time = time.time() 62 | if "contentBlockDelta" in event: 63 | if not first_token_received: 64 | time_to_first_token = current_time - time_0 65 | first_token_received = True 66 | else: 67 | assert previous_token_time is not None 68 | times_between_tokens.append(current_time - previous_token_time) 69 | previous_token_time = current_time 70 | output_tokens += 1 71 | elif "metadata" in event: 72 | metadata = event["metadata"] 73 | if "usage" in metadata and "outputTokens" in metadata["usage"]: 74 | output_tokens = metadata["usage"]["outputTokens"] 75 | 76 | except ClientError as err: 77 | message = err.response["Error"]["Message"] 78 | logger.error(f"Error: {message}") 79 | raise 80 | 81 | time_1 = time.time() 82 | generate_time = time_1 - time_0 83 | tokens_per_second = output_tokens / generate_time if generate_time > 0 else 0 84 | 85 | metrics = { 86 | "gen_ts": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 87 | "requested_tokens": run_config["max_tokens"], 88 | "output_tokens": output_tokens, 89 | "generate_time": generate_time, 90 | "tokens_per_second": tokens_per_second, 91 | "time_to_first_token": time_to_first_token, 92 | "times_between_tokens": times_between_tokens, 93 | } 94 | 95 | return metrics 96 | -------------------------------------------------------------------------------- /api/llm_bench/cloud/providers/databricks.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | from datetime import datetime 5 | 6 | from llm_bench.config import CloudConfig 7 | from openai import OpenAI 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | def generate(config: CloudConfig, run_config: dict) -> dict: 
13 | """Run Databricks inference and return metrics.""" 14 | 15 | assert config.provider == "databricks", "provider must be 'databricks'" 16 | assert "query" in run_config, "query must be in run_config" 17 | assert "max_tokens" in run_config, "max_tokens must be in run_config" 18 | 19 | client = OpenAI( 20 | base_url=os.environ["DATABRICKS_BASE_URL"], 21 | api_key=os.environ["DATABRICKS_API_KEY"], 22 | ) 23 | 24 | # Generate 25 | time_0 = time.time() 26 | first_token_received = False 27 | previous_token_time = None 28 | output_chunks = 0 29 | output_tokens = 0 30 | times_between_tokens = [] 31 | time_to_first_token = 0 32 | response_str = "" 33 | 34 | stream = client.chat.completions.create( 35 | model=config.model_name, 36 | messages=[{"role": "user", "content": run_config["query"]}], 37 | stream=True, 38 | max_tokens=run_config["max_tokens"], 39 | ) 40 | 41 | for chunk in stream: 42 | response = chunk.choices[0].delta 43 | response_content = response.content if response is not None else None 44 | 45 | if response_content is not None: 46 | current_time = time.time() 47 | if not first_token_received: 48 | time_to_first_token = current_time - time_0 49 | first_token_received = True 50 | else: 51 | assert previous_token_time is not None 52 | times_between_tokens.append(current_time - previous_token_time) 53 | previous_token_time = current_time 54 | response_str += response_content 55 | output_chunks += 1 56 | if len(chunk.choices) == 1: 57 | output_tokens += 1 58 | else: 59 | raise ValueError("Unexpected number of choices") 60 | 61 | time_1 = time.time() 62 | generate_time = time_1 - time_0 63 | tokens_per_second = output_tokens / generate_time if generate_time > 0 else 0 64 | 65 | metrics = { 66 | "gen_ts": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 67 | "requested_tokens": run_config["max_tokens"], 68 | "output_tokens": output_tokens, 69 | "generate_time": generate_time, 70 | "tokens_per_second": tokens_per_second, 71 | "time_to_first_token": time_to_first_token, 72 | "times_between_tokens": times_between_tokens, 73 | } 74 | 75 | return metrics 76 | -------------------------------------------------------------------------------- /api/llm_bench/cloud/providers/deepinfra.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | from datetime import datetime 5 | 6 | from llm_bench.config import CloudConfig 7 | from openai import OpenAI 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | def generate(config: CloudConfig, run_config: dict) -> dict: 13 | """Run Deep Infra inference and return metrics.""" 14 | 15 | assert config.provider == "deepinfra", "provider must be 'deepinfra'" 16 | assert "query" in run_config, "query must be in run_config" 17 | assert "max_tokens" in run_config, "max_tokens must be in run_config" 18 | 19 | client = OpenAI( 20 | base_url=os.environ["DEEPINFRA_BASE_URL"], 21 | api_key=os.environ["DEEPINFRA_API_KEY"], 22 | ) 23 | 24 | # Generate 25 | time_0 = time.time() 26 | first_token_received = False 27 | previous_token_time = None 28 | output_chunks = 0 29 | output_tokens = 0 30 | times_between_tokens = [] 31 | time_to_first_token = 0 32 | response_str = "" 33 | 34 | stream = client.chat.completions.create( 35 | model=config.model_name, 36 | messages=[{"role": "user", "content": run_config["query"]}], 37 | stream=True, 38 | max_tokens=run_config["max_tokens"], 39 | ) 40 | 41 | for chunk in stream: 42 | response = chunk.choices[0].delta 43 | response_content = response.content if 
response is not None else None 44 | 45 | if response_content is not None: 46 | current_time = time.time() 47 | if not first_token_received: 48 | time_to_first_token = current_time - time_0 49 | first_token_received = True 50 | else: 51 | assert previous_token_time is not None 52 | times_between_tokens.append(current_time - previous_token_time) 53 | previous_token_time = current_time 54 | response_str += response_content 55 | output_chunks += 1 56 | if len(chunk.choices) == 1: 57 | output_tokens += 1 58 | else: 59 | raise ValueError("Unexpected number of choices") 60 | 61 | time_1 = time.time() 62 | generate_time = time_1 - time_0 63 | tokens_per_second = output_tokens / generate_time if generate_time > 0 else 0 64 | 65 | metrics = { 66 | "gen_ts": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 67 | "requested_tokens": run_config["max_tokens"], 68 | "output_tokens": output_tokens, 69 | "generate_time": generate_time, 70 | "tokens_per_second": tokens_per_second, 71 | "time_to_first_token": time_to_first_token, 72 | "times_between_tokens": times_between_tokens, 73 | } 74 | 75 | return metrics 76 | -------------------------------------------------------------------------------- /api/llm_bench/cloud/providers/fireworks.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | from datetime import datetime 5 | 6 | from llm_bench.config import CloudConfig 7 | from openai import OpenAI 8 | from tiktoken import get_encoding 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def generate(config: CloudConfig, run_config: dict) -> dict: 14 | """Run Fireworks inference and return metrics.""" 15 | 16 | assert config.provider == "fireworks", "provider must be 'fireworks'" 17 | assert "query" in run_config, "query must be in run_config" 18 | assert "max_tokens" in run_config, "max_tokens must be in run_config" 19 | 20 | # Set up connection 21 | client = OpenAI( 22 | base_url=os.environ["FIREWORKS_BASE_URL"], 23 | api_key=os.environ["FIREWORKS_API_KEY"], 24 | ) 25 | 26 | # Generate 27 | time_0 = time.time() 28 | first_token_received = False 29 | previous_token_time = None 30 | output_chunks = 0 31 | output_tokens = 0 32 | times_between_tokens = [] 33 | time_to_first_token = 0 34 | response_str = "" 35 | 36 | response = client.completions.create( 37 | model=config.model_name, 38 | prompt=run_config["query"], 39 | max_tokens=run_config["max_tokens"], 40 | stream=True, 41 | ) 42 | 43 | for chunk in response: 44 | response_content = chunk.choices[0].text 45 | if response_content is not None: 46 | current_time = time.time() 47 | if not first_token_received: 48 | time_to_first_token = current_time - time_0 49 | first_token_received = True 50 | else: 51 | assert previous_token_time is not None 52 | times_between_tokens.append(current_time - previous_token_time) 53 | previous_token_time = current_time 54 | response_str += response_content 55 | output_chunks += 1 56 | if len(chunk.choices) != 1: 57 | raise ValueError("Unexpected number of choices") 58 | else: 59 | logger.warning(f"Received empty content chunk: {chunk}") 60 | 61 | output_tokens = count_tokens(run_config["max_tokens"], response_str) 62 | 63 | time_1 = time.time() 64 | generate_time = time_1 - time_0 65 | tokens_per_second = output_tokens / generate_time if generate_time > 0 else 0 66 | 67 | metrics = { 68 | "gen_ts": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 69 | "requested_tokens": run_config["max_tokens"], 70 | "output_tokens": output_tokens, 71 | 
"generate_time": generate_time, 72 | "tokens_per_second": tokens_per_second, 73 | "time_to_first_token": time_to_first_token, 74 | "times_between_tokens": times_between_tokens, 75 | "output_text": response_str, 76 | } 77 | 78 | return metrics 79 | 80 | 81 | def count_tokens(max_tokens: int, response_str: str) -> int: 82 | encoder = get_encoding("cl100k_base") 83 | n_tokens = len(encoder.encode(response_str)) 84 | if not 0.8 * max_tokens <= n_tokens <= 1.2 * max_tokens: 85 | raise ValueError(f"N Tokens {n_tokens} not within 20% of max tokens {max_tokens}") 86 | return max_tokens 87 | -------------------------------------------------------------------------------- /api/llm_bench/cloud/providers/groq.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | from datetime import datetime 5 | 6 | from groq import Groq 7 | from llm_bench.config import CloudConfig 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | def generate(config: CloudConfig, run_config: dict) -> dict: 13 | """Run Groq inference and return metrics.""" 14 | 15 | assert config.provider == "groq", "provider must be 'groq'" 16 | assert "query" in run_config, "query must be in run_config" 17 | assert "max_tokens" in run_config, "max_tokens must be in run_config" 18 | 19 | client = Groq(api_key=os.environ.get("GROQ_API_KEY")) 20 | 21 | # Generate 22 | time_0 = time.time() 23 | first_token_received = False 24 | previous_token_time = None 25 | output_chunks = 0 26 | output_tokens = 0 27 | times_between_tokens = [] 28 | time_to_first_token = 0 29 | response_str = "" 30 | 31 | stream = client.chat.completions.create( 32 | model=config.model_name, 33 | messages=[ 34 | {"role": "system", "content": "You are a helpful assistant."}, 35 | {"role": "user", "content": run_config["query"]}, 36 | ], 37 | stream=True, 38 | max_tokens=run_config["max_tokens"], 39 | ) 40 | 41 | for chunk in stream: 42 | response = chunk.choices[0].delta 43 | response_content = response.content if response is not None else None 44 | 45 | if response_content is not None: 46 | current_time = time.time() 47 | if not first_token_received: 48 | time_to_first_token = current_time - time_0 49 | first_token_received = True 50 | else: 51 | assert previous_token_time is not None 52 | times_between_tokens.append(current_time - previous_token_time) 53 | previous_token_time = current_time 54 | response_str += response_content 55 | output_chunks += 1 56 | if len(chunk.choices) == 1: 57 | output_tokens += 1 58 | else: 59 | raise ValueError("Unexpected number of choices") 60 | 61 | time_1 = time.time() 62 | generate_time = time_1 - time_0 63 | tokens_per_second = output_tokens / generate_time if generate_time > 0 else 0 64 | 65 | metrics = { 66 | "gen_ts": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 67 | "requested_tokens": run_config["max_tokens"], 68 | "output_tokens": output_tokens, 69 | "generate_time": generate_time, 70 | "tokens_per_second": tokens_per_second, 71 | "time_to_first_token": time_to_first_token, 72 | "times_between_tokens": times_between_tokens, 73 | } 74 | 75 | return metrics 76 | -------------------------------------------------------------------------------- /api/llm_bench/cloud/providers/lambda.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | from datetime import datetime 5 | from typing import Any 6 | from typing import Dict 7 | from typing import Tuple 8 | 9 | from 
llm_bench.config import CloudConfig 10 | from openai import OpenAI 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def process_stream_response(stream, start_time: float, max_tokens: int) -> Tuple[str, Dict[str, Any]]: 16 | response_text = "" 17 | output_tokens = 0 18 | time_to_first_token = None 19 | times_between_tokens = [] 20 | last_token_time = None 21 | 22 | for chunk in stream: 23 | current_time = time.time() 24 | output_tokens += 1 25 | 26 | if time_to_first_token is None: 27 | time_to_first_token = current_time - start_time 28 | elif last_token_time: 29 | times_between_tokens.append(current_time - last_token_time) 30 | 31 | last_token_time = current_time 32 | 33 | if chunk.choices and chunk.choices[0].delta.content: 34 | response_text += chunk.choices[0].delta.content 35 | 36 | # Check if tokens received is within 20% of requested 37 | if abs(output_tokens - max_tokens) > (max_tokens * 0.2): 38 | raise ValueError(f"Received {output_tokens} tokens, expected around {max_tokens}") 39 | 40 | metrics = { 41 | "gen_ts": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 42 | "output_tokens": output_tokens, 43 | "generate_time": time.time() - start_time, 44 | "tokens_per_second": output_tokens / (time.time() - start_time), 45 | "time_to_first_token": time_to_first_token, 46 | "times_between_tokens": times_between_tokens, 47 | } 48 | 49 | return response_text, metrics 50 | 51 | 52 | def generate(config: CloudConfig, run_config: dict) -> dict: 53 | """Run Lambda inference and return metrics.""" 54 | assert config.provider == "lambda", "provider must be 'lambda'" 55 | assert "query" in run_config, "query must be in run_config" 56 | assert "max_tokens" in run_config, "max_tokens must be in run_config" 57 | 58 | client = OpenAI( 59 | base_url=os.environ["LAMBDA_BASE_URL"], 60 | api_key=os.environ["LAMBDA_API_KEY"], 61 | ) 62 | 63 | start_time = time.time() 64 | stream = client.chat.completions.create( 65 | model=config.model_name, 66 | messages=[{"role": "user", "content": run_config["query"]}], 67 | max_tokens=run_config["max_tokens"], 68 | stream=True, 69 | ) 70 | 71 | _, metrics = process_stream_response(stream, start_time, run_config["max_tokens"]) 72 | metrics["requested_tokens"] = run_config["max_tokens"] 73 | 74 | return metrics 75 | -------------------------------------------------------------------------------- /api/llm_bench/cloud/providers/openai.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | from datetime import datetime 4 | 5 | import tiktoken 6 | from llm_bench.config import CloudConfig 7 | from openai import OpenAI 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | NON_CHAT_MODELS = ["gpt-3.5-turbo-instruct"] 13 | 14 | 15 | def process_non_chat_model(client, config, run_config): 16 | return ( 17 | client.completions.create( 18 | model=config.model_name, 19 | prompt=run_config["query"], 20 | max_tokens=run_config["max_tokens"], 21 | stream=True, 22 | ), 23 | "text", 24 | ) 25 | 26 | 27 | def process_chat_model(client, config, run_config): 28 | return ( 29 | client.chat.completions.create( 30 | model=config.model_name, 31 | messages=[{"role": "user", "content": run_config["query"]}], 32 | max_tokens=run_config["max_tokens"], 33 | stream=True, 34 | ), 35 | "choices", 36 | ) 37 | 38 | 39 | def generate(config: CloudConfig, run_config: dict) -> dict: 40 | """Run OpenAI inference and return metrics.""" 41 | 42 | assert config.provider == "openai", "provider must be openai" 43 | assert 
"query" in run_config, "query must be in run_config" 44 | assert "max_tokens" in run_config, "max_tokens must be in run_config" 45 | 46 | # Set up connection 47 | client = OpenAI() 48 | 49 | # Generate 50 | time_0 = time.time() 51 | first_token_received = False 52 | previous_token_time = None 53 | output_chunks = 0 54 | times_between_tokens = [] 55 | time_to_first_token = 0 56 | response_str = "" 57 | 58 | process_func = process_non_chat_model if config.model_name in NON_CHAT_MODELS else process_chat_model 59 | stream, response_key = process_func(client, config, run_config) 60 | 61 | for chunk in stream: 62 | if config.model_name in NON_CHAT_MODELS: 63 | response = chunk.choices[0] 64 | response_content = getattr(response, response_key) 65 | else: 66 | response = chunk.choices[0].delta # type: ignore 67 | response_content = response.content if response is not None else None 68 | 69 | if response_content is not None: 70 | current_time = time.time() 71 | if not first_token_received: 72 | time_to_first_token = current_time - time_0 73 | first_token_received = True 74 | else: 75 | assert previous_token_time is not None 76 | times_between_tokens.append(current_time - previous_token_time) 77 | previous_token_time = current_time 78 | response_str += response_content 79 | output_chunks += 1 80 | 81 | time_1 = time.time() 82 | generate_time = time_1 - time_0 83 | 84 | # Calculate tokens 85 | encoder = tiktoken.encoding_for_model(config.model_name) 86 | output_tokens = len(encoder.encode(response_str)) 87 | tokens_per_second = output_tokens / generate_time if generate_time > 0 else 0 88 | 89 | metrics = { 90 | "gen_ts": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 91 | "requested_tokens": run_config["max_tokens"], 92 | "output_tokens": output_tokens, 93 | "generate_time": generate_time, 94 | "tokens_per_second": tokens_per_second, 95 | "time_to_first_token": time_to_first_token, 96 | "times_between_tokens": times_between_tokens, 97 | } 98 | 99 | return metrics 100 | -------------------------------------------------------------------------------- /api/llm_bench/cloud/providers/openrouter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | from datetime import datetime 5 | 6 | from llm_bench.config import CloudConfig 7 | from openai import OpenAI 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | NON_CHAT_MODELS = [] 13 | 14 | 15 | def process_non_chat_model(client, config, run_config): 16 | raise NotImplementedError 17 | 18 | 19 | def process_chat_model(client, config, run_config): 20 | return ( 21 | client.chat.completions.create( 22 | model=config.model_name, 23 | messages=[{"role": "user", "content": run_config["query"]}], 24 | max_tokens=run_config["max_tokens"], 25 | stream=True, 26 | extra_headers={ 27 | "HTTP-Referer": "llm-benchmarks.com", 28 | "X-Title": "LLM Benchmarks", 29 | }, 30 | ), 31 | "choices", 32 | ) 33 | 34 | 35 | def generate(config: CloudConfig, run_config: dict) -> dict: 36 | """Run OpenRouter inference and return metrics.""" 37 | 38 | assert config.provider == "openrouter", "provider must be 'openrouter'" 39 | assert "query" in run_config, "query must be in run_config" 40 | assert "max_tokens" in run_config, "max_tokens must be in run_config" 41 | 42 | # Set up connection 43 | client = OpenAI( 44 | base_url=os.environ["OPENROUTER_BASE_URL"], 45 | api_key=os.environ["OPENROUTER_API_KEY"], 46 | ) 47 | 48 | # Generate 49 | time_0 = time.time() 50 | first_token_received = False 51 | 
previous_token_time = None 52 | output_chunks = 0 53 | output_tokens = 0 54 | times_between_tokens = [] 55 | time_to_first_token = 0 56 | response_str = "" 57 | 58 | process_func = process_non_chat_model if config.model_name in NON_CHAT_MODELS else process_chat_model 59 | stream, response_key = process_func(client, config, run_config) 60 | 61 | for chunk in stream: 62 | if config.model_name in NON_CHAT_MODELS: 63 | response = chunk.choices[0] 64 | response_content = getattr(response, response_key) 65 | else: 66 | response = chunk.choices[0].delta # type: ignore 67 | response_content = response.content if response is not None else None 68 | 69 | if response_content is not None: 70 | current_time = time.time() 71 | if not first_token_received: 72 | time_to_first_token = current_time - time_0 73 | first_token_received = True 74 | else: 75 | assert previous_token_time is not None 76 | times_between_tokens.append(current_time - previous_token_time) 77 | previous_token_time = current_time 78 | response_str += response_content 79 | output_chunks += 1 80 | if len(chunk.choices) == 1: 81 | output_tokens += 1 82 | else: 83 | raise ValueError("Unexpected number of choices") 84 | 85 | time_1 = time.time() 86 | generate_time = time_1 - time_0 87 | tokens_per_second = output_tokens / generate_time if generate_time > 0 else 0 88 | 89 | metrics = { 90 | "gen_ts": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 91 | "requested_tokens": run_config["max_tokens"], 92 | "output_tokens": output_tokens, 93 | "generate_time": generate_time, 94 | "tokens_per_second": tokens_per_second, 95 | "time_to_first_token": time_to_first_token, 96 | "times_between_tokens": times_between_tokens, 97 | } 98 | 99 | return metrics 100 | -------------------------------------------------------------------------------- /api/llm_bench/cloud/providers/runpod.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import time 5 | from datetime import datetime 6 | 7 | import requests 8 | from llm_bench.config import CloudConfig 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def process_model(config, run_config): 14 | url = f"https://api.runpod.ai/v2/{config.model_name}/run" 15 | headers = {"Authorization": os.environ["RUNPOD_API_KEY"], "Content-Type": "application/json"} 16 | payload = { 17 | "input": { 18 | "prompt": run_config["query"], 19 | "sampling_params": { 20 | "max_tokens": run_config["max_tokens"], 21 | "n": 1, 22 | "temperature": 0.0, 23 | }, 24 | } 25 | } 26 | response = requests.post(url, headers=headers, json=payload) 27 | response_json = json.loads(response.text) 28 | status_url = f"https://api.runpod.ai/v2/{config.model_name}/stream/{response_json['id']}" 29 | return status_url, headers 30 | 31 | 32 | def generate(config: CloudConfig, run_config: dict) -> dict: 33 | """Run RunPod inference and return metrics.""" 34 | 35 | assert config.provider == "runpod", "provider must be 'runpod'" 36 | assert "query" in run_config, "query must be in run_config" 37 | assert "max_tokens" in run_config, "max_tokens must be in run_config" 38 | 39 | # Generate 40 | time_0 = time.time() 41 | first_token_received = False 42 | previous_token_time = None 43 | output_tokens = 0 44 | times_between_tokens = [] 45 | time_to_first_token = 0 46 | 47 | status_url, headers = process_model(config, run_config) 48 | 49 | while True: 50 | get_status = requests.get(status_url, headers=headers) 51 | status_data = get_status.json() 52 | if status_data["status"] == 
"COMPLETED": 53 | break 54 | elif get_status.status_code != 200: 55 | raise ValueError("An error occurred.") 56 | else: 57 | current_time = time.time() 58 | if not first_token_received: 59 | time_to_first_token = current_time - time_0 60 | first_token_received = True 61 | else: 62 | assert previous_token_time is not None 63 | times_between_tokens.append(current_time - previous_token_time) 64 | previous_token_time = current_time 65 | output_tokens += status_data["stream"][0]["metrics"]["output_tokens"] 66 | 67 | time_1 = time.time() 68 | generate_time = time_1 - time_0 69 | tokens_per_second = output_tokens / generate_time if generate_time > 0 else 0 70 | 71 | metrics = { 72 | "gen_ts": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 73 | "requested_tokens": run_config["max_tokens"], 74 | "output_tokens": output_tokens, 75 | "generate_time": generate_time, 76 | "tokens_per_second": tokens_per_second, 77 | "time_to_first_token": time_to_first_token, 78 | "times_between_tokens": times_between_tokens, 79 | } 80 | 81 | return metrics 82 | -------------------------------------------------------------------------------- /api/llm_bench/cloud/providers/together.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | from datetime import datetime 5 | from typing import Any 6 | from typing import Dict 7 | from typing import Tuple 8 | 9 | from llm_bench.config import CloudConfig 10 | from openai import OpenAI 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def process_stream_response(stream, start_time: float, max_tokens: int) -> Tuple[str, Dict[str, Any]]: 16 | response_text = "" 17 | output_tokens = 0 18 | time_to_first_token = None 19 | times_between_tokens = [] 20 | last_token_time = None 21 | 22 | for chunk in stream: 23 | current_time = time.time() 24 | output_tokens += 1 25 | 26 | if time_to_first_token is None: 27 | time_to_first_token = current_time - start_time 28 | elif last_token_time: 29 | times_between_tokens.append(current_time - last_token_time) 30 | 31 | last_token_time = current_time 32 | 33 | if chunk.choices and chunk.choices[0].delta.content: 34 | response_text += chunk.choices[0].delta.content 35 | 36 | # Check if tokens received is within 20% of requested 37 | if abs(output_tokens - max_tokens) > (max_tokens * 0.2): 38 | raise ValueError(f"Received {output_tokens} tokens, expected around {max_tokens}") 39 | 40 | metrics = { 41 | "gen_ts": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 42 | "output_tokens": output_tokens, 43 | "generate_time": time.time() - start_time, 44 | "tokens_per_second": output_tokens / (time.time() - start_time), 45 | "time_to_first_token": time_to_first_token, 46 | "times_between_tokens": times_between_tokens, 47 | } 48 | 49 | return response_text, metrics 50 | 51 | 52 | def generate(config: CloudConfig, run_config: dict) -> dict: 53 | """Run TogetherAI inference and return metrics.""" 54 | assert config.provider == "together", "provider must be 'together'" 55 | assert "query" in run_config, "query must be in run_config" 56 | assert "max_tokens" in run_config, "max_tokens must be in run_config" 57 | 58 | client = OpenAI( 59 | base_url=os.environ["TOGETHER_BASE_URL"], 60 | api_key=os.environ["TOGETHER_API_KEY"], 61 | ) 62 | 63 | start_time = time.time() 64 | stream = client.chat.completions.create( 65 | model=config.model_name, 66 | messages=[{"role": "user", "content": run_config["query"]}], 67 | max_tokens=run_config["max_tokens"], 68 | stream=True, 69 | ) 70 | 
71 | _, metrics = process_stream_response(stream, start_time, run_config["max_tokens"]) 72 | metrics["requested_tokens"] = run_config["max_tokens"] 73 | 74 | return metrics 75 | -------------------------------------------------------------------------------- /api/llm_bench/cloud/providers/vertex.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | from datetime import datetime 5 | 6 | import openai 7 | import vertexai 8 | from anthropic import AnthropicVertex 9 | from google.auth import default 10 | from google.auth import transport 11 | from llm_bench.config import CloudConfig 12 | from vertexai.generative_models import GenerationConfig 13 | from vertexai.generative_models import GenerativeModel 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | PROJECT_ID = "llm-bench" 18 | REGION = "us-central1" 19 | SECONDARY_REGION = "us-east5" 20 | MAAS_ENDPOINT = f"{REGION}-aiplatform.googleapis.com" 21 | 22 | os.environ["GOOGLE_CLOUD_PROJECT"] = PROJECT_ID 23 | 24 | 25 | def _get_openai_client(): 26 | """Get an OpenAI client configured for Vertex AI.""" 27 | credentials, _ = default() 28 | auth_request = transport.requests.Request() 29 | credentials.refresh(auth_request) 30 | 31 | return openai.OpenAI( 32 | base_url=f"https://{MAAS_ENDPOINT}/v1beta1/projects/{PROJECT_ID}/locations/{REGION}/endpoints/openapi", 33 | api_key=credentials.token, 34 | ) 35 | 36 | 37 | def generate(config: CloudConfig, run_config: dict) -> dict: 38 | assert config.provider == "vertex", "provider must be Vertex" 39 | assert "query" in run_config, "query must be in run_config" 40 | assert "max_tokens" in run_config, "max_tokens must be in run_config" 41 | 42 | vertexai.init(project=PROJECT_ID) 43 | 44 | if "llama" in config.model_name.lower(): 45 | logger.debug("Using Vertex/OpenAI API for Llama model") 46 | client = _get_openai_client() 47 | time_0 = time.time() 48 | stream = client.chat.completions.create( 49 | model=config.model_name, 50 | messages=[{"role": "user", "content": run_config["query"]}], 51 | max_tokens=run_config["max_tokens"], 52 | stream=True, 53 | ) 54 | ttft, tbts, n_tokens = generate_tokens(stream, time_0, False, is_openai=True) 55 | generate_time = time.time() - time_0 56 | elif "claude" not in config.model_name.lower(): 57 | logger.debug("Using Vertex/GenerativeModel") 58 | model = GenerativeModel(config.model_name) 59 | time_0 = time.time() 60 | stream = model.generate_content( 61 | contents=run_config["query"], 62 | generation_config=GenerationConfig(max_output_tokens=run_config["max_tokens"]), 63 | stream=True, 64 | ) 65 | ttft, tbts, n_tokens = generate_tokens(stream, time_0, False) 66 | generate_time = time.time() - time_0 67 | else: 68 | logger.debug("Using Vertex/AnthropicVertex") 69 | keywords = ["opus", "3-5"] 70 | region = SECONDARY_REGION if any(keyword in config.model_name.lower() for keyword in keywords) else REGION 71 | client = AnthropicVertex(region=region, project_id=PROJECT_ID) 72 | time_0 = time.time() 73 | with client.messages.stream( 74 | max_tokens=run_config["max_tokens"], 75 | messages=[{"role": "user", "content": run_config["query"]}], 76 | model=config.model_name, 77 | ) as stream: 78 | ttft, tbts, n_tokens = generate_tokens(stream, time_0, True) 79 | generate_time = time.time() - time_0 80 | 81 | return calculate_metrics(run_config, n_tokens, generate_time, ttft, tbts) 82 | 83 | 84 | def generate_tokens(stream, time_0, is_anthropic=False, is_openai=False): 85 | first_token_received = 
False 86 | previous_token_time = None 87 | time_to_first_token = None 88 | times_between_tokens = [] 89 | token_count = 0 90 | 91 | stream_iter = stream if is_anthropic or is_openai else stream 92 | 93 | item = None 94 | for item in stream_iter: 95 | current_time = time.time() 96 | if not first_token_received: 97 | time_to_first_token = current_time - time_0 98 | first_token_received = True 99 | else: 100 | assert previous_token_time is not None 101 | times_between_tokens.append(current_time - previous_token_time) 102 | previous_token_time = current_time 103 | 104 | assert item, "No tokens received" 105 | 106 | if is_anthropic: 107 | token_count = item.message.usage.output_tokens 108 | elif is_openai: 109 | token_count = item.usage.completion_tokens 110 | else: 111 | token_count = item._raw_response.usage_metadata.candidates_token_count 112 | 113 | return time_to_first_token, times_between_tokens, token_count 114 | 115 | 116 | def calculate_metrics(run_config, output_tokens, generate_time, time_to_first_token, times_between_tokens): 117 | tokens_per_second = output_tokens / generate_time if generate_time > 0 else 0 118 | 119 | return { 120 | "gen_ts": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 121 | "requested_tokens": run_config["max_tokens"], 122 | "output_tokens": output_tokens, 123 | "generate_time": generate_time, 124 | "tokens_per_second": tokens_per_second, 125 | "time_to_first_token": time_to_first_token, 126 | "times_between_tokens": times_between_tokens, 127 | } 128 | -------------------------------------------------------------------------------- /api/llm_bench/cloud/server.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from datetime import datetime 4 | 5 | from fastapi import FastAPI 6 | 7 | from llm_bench.config import CloudConfig 8 | from llm_bench.config import MongoConfig 9 | from llm_bench.logging import log_metrics 10 | from llm_bench.types import BenchmarkRequest 11 | from llm_bench.types import BenchmarkResponse 12 | from llm_bench.utils import has_existing_run 13 | 14 | LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO") 15 | LOG_DIR = os.environ.get("LOG_DIR", "/var/log") 16 | LOG_FILE_TXT = os.path.join(LOG_DIR, "benchmarks_cloud.log") 17 | LOG_FILE_JSON = os.path.join(LOG_DIR, "benchmarks_cloud.json") 18 | LOG_TO_MONGO = os.getenv("LOG_TO_MONGO", "False").lower() in ("true", "1", "t") 19 | MONGODB_URI = os.environ.get("MONGODB_URI") 20 | MONGODB_DB = os.environ.get("MONGODB_DB") 21 | MONGODB_COLLECTION_CLOUD = os.environ.get("MONGODB_COLLECTION_CLOUD") 22 | 23 | FASTAPI_PORT_CLOUD = os.environ.get("FASTAPI_PORT_CLOUD") 24 | assert FASTAPI_PORT_CLOUD, "FASTAPI_PORT_CLOUD environment variable not set" 25 | 26 | logging.basicConfig( 27 | filename=os.path.join(LOG_DIR, LOG_FILE_TXT), 28 | level=LOG_LEVEL, 29 | format="%(asctime)s|%(name)s|%(levelname)s|%(funcName)s:%(lineno)d|%(message)s", 30 | datefmt="%m-%d %H:%M:%S", 31 | ) 32 | logger = logging.getLogger(__name__) 33 | 34 | PROVIDER_MODULES = { 35 | "openai": "llm_bench.cloud.providers.openai", 36 | "anthropic": "llm_bench.cloud.providers.anthropic", 37 | "bedrock": "llm_bench.cloud.providers.bedrock", 38 | "vertex": "llm_bench.cloud.providers.vertex", 39 | "anyscale": "llm_bench.cloud.providers.anyscale", 40 | "together": "llm_bench.cloud.providers.together", 41 | "openrouter": "llm_bench.cloud.providers.openrouter", 42 | "azure": "llm_bench.cloud.providers.azure", 43 | "runpod": "llm_bench.cloud.providers.runpod", 44 | "fireworks": 
"llm_bench.cloud.providers.fireworks", 45 | "deepinfra": "llm_bench.cloud.providers.deepinfra", 46 | "groq": "llm_bench.cloud.providers.groq", 47 | "databricks": "llm_bench.cloud.providers.databricks", 48 | "lambda": "llm_bench.cloud.providers.lambda", 49 | } 50 | 51 | app = FastAPI( 52 | title="LLM Benchmarking API", 53 | description="API for benchmarking LLMs on the cloud", 54 | port=FASTAPI_PORT_CLOUD, 55 | ) 56 | 57 | 58 | @app.post("/benchmark", response_model=BenchmarkResponse) 59 | async def call_cloud(request: BenchmarkRequest): 60 | logger.info(f"Received benchmark request: Provider={request.provider}, Model={request.model}") 61 | provider = request.provider 62 | model_name = request.model 63 | query = request.query 64 | max_tokens = request.max_tokens 65 | temperature = request.temperature 66 | run_always = request.run_always 67 | debug = request.debug 68 | 69 | if provider not in PROVIDER_MODULES: 70 | error_message = f"Invalid provider: {provider}" 71 | logger.error(error_message) 72 | return {"status": "error", "message": error_message} 73 | 74 | if not model_name: 75 | error_message = "model_name must be set" 76 | logger.error(error_message) 77 | return {"status": "error", "message": error_message} 78 | 79 | logger.info(f"Received request for model: {model_name}") 80 | 81 | run_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 82 | 83 | # Create model config 84 | model_config = CloudConfig( 85 | provider=provider, 86 | model_name=model_name, 87 | run_ts=run_ts, 88 | temperature=temperature, 89 | misc={}, 90 | ) 91 | 92 | # Create run config 93 | run_config = { 94 | "query": query, 95 | "max_tokens": max_tokens, 96 | } 97 | 98 | # Check if model has been benchmarked before 99 | if LOG_TO_MONGO: 100 | logger.debug("Logging to MongoDB") 101 | mongo_config = MongoConfig( 102 | uri=MONGODB_URI, # type: ignore 103 | db=MONGODB_DB, # type: ignore 104 | collection=MONGODB_COLLECTION_CLOUD, # type: ignore 105 | ) 106 | existing_run = has_existing_run(model_name, model_config, mongo_config) 107 | if existing_run: 108 | if run_always: 109 | logger.info(f"Model has been benchmarked before: {model_name}") 110 | logger.info("Re-running benchmark anyway because run_always is True") 111 | else: 112 | logger.info(f"Model has been benchmarked before: {model_name}") 113 | return {"status": "skipped", "reason": "model has been benchmarked before"} 114 | else: 115 | logger.info(f"Model has not been benchmarked before: {model_name}") 116 | else: 117 | logger.debug("Not logging to MongoDB") 118 | 119 | # Load provider module 120 | module_name = PROVIDER_MODULES[provider] 121 | module = __import__(module_name, fromlist=["generate"]) 122 | generate = module.generate 123 | 124 | # Run benchmark 125 | try: 126 | result = generate(model_config, run_config) 127 | 128 | if isinstance(result, dict) and "status" in result and result["status"] == "error": 129 | return result 130 | 131 | metrics = result 132 | 133 | if not metrics: 134 | error_message = "metrics is empty" 135 | logger.error(error_message) 136 | return {"status": "error", "message": error_message} 137 | 138 | if metrics["tokens_per_second"] <= 0: 139 | error_message = "tokens_per_second must be greater than 0" 140 | logger.error(error_message) 141 | return {"status": "error", "message": error_message} 142 | 143 | if abs(metrics["output_tokens"] - max_tokens) > max_tokens * 0.1: 144 | error_message = f"Token count not within 10% of max tokens: {metrics['output_tokens']}" 145 | logger.error(error_message) 146 | return {"status": "error", 
"message": error_message} 147 | 148 | except Exception as e: 149 | error_message = f"An error occurred during benchmark: {str(e)}" 150 | logger.error(error_message) 151 | return {"status": "error", "message": error_message} 152 | 153 | if debug: 154 | logger.info(f"Debug mode: {debug}") 155 | logger.info(f"Metrics: {metrics}") 156 | else: 157 | # Log metrics 158 | log_metrics( 159 | model_type="cloud", 160 | config=model_config, 161 | metrics=metrics, 162 | file_path=os.path.join(LOG_DIR, LOG_FILE_JSON), 163 | log_to_mongo=LOG_TO_MONGO, 164 | mongo_uri=MONGODB_URI, 165 | mongo_db=MONGODB_DB, 166 | mongo_collection=MONGODB_COLLECTION_CLOUD, 167 | ) 168 | 169 | # Print metrics 170 | logger.info(f"===== Model: {provider}/{model_name} =====") 171 | logger.info(f"provider: {model_config.provider}") 172 | logger.info(f"Output tokens: {metrics['output_tokens']}") 173 | logger.info(f"Generate time: {metrics['generate_time']:.2f} s") 174 | logger.info(f"Tokens per second: {metrics['tokens_per_second']:.2f}") 175 | 176 | return {"status": "success"} 177 | -------------------------------------------------------------------------------- /api/llm_bench/config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Optional 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class ModelConfig: 8 | """ 9 | Configuration for a local model run. 10 | """ 11 | 12 | def __init__( 13 | self, 14 | framework: str, 15 | model_name: str, 16 | run_ts: str, 17 | model_dtype: str, 18 | temperature: float, 19 | quantization_method: Optional[str] = None, 20 | quantization_bits: Optional[str] = None, 21 | misc: dict = {}, 22 | ): 23 | self.framework = framework 24 | self.model_name = model_name 25 | self.run_ts = run_ts 26 | self.model_dtype = model_dtype 27 | self.temperature = temperature 28 | self.quantization_method = quantization_method 29 | self.quantization_bits = quantization_bits 30 | self.misc = misc 31 | 32 | @property 33 | def framework(self): 34 | return self._framework 35 | 36 | @framework.setter 37 | def framework(self, value): 38 | if value not in ["transformers", "gguf", "hf-tgi", "vllm"]: 39 | raise ValueError("framework must be: 'transformers', 'gguf', 'hf-tgi', 'vllm'") 40 | self._framework = value 41 | 42 | @property 43 | def quantization_method(self): 44 | return self._quantization_method 45 | 46 | @quantization_method.setter 47 | def quantization_method(self, value): 48 | if value not in ["bitsandbytes", "gptq", "awq", "gguf", None]: 49 | raise ValueError(f"quant method not in ['bitsandbytes', 'gptq', 'awq', 'gguf', None]. 
Got {value}") 50 | self._quantization_method = value 51 | 52 | @property 53 | def load_in_4bit(self) -> bool: 54 | return self.quantization_bits == "4bit" if self.quantization_bits is not None else False 55 | 56 | @property 57 | def load_in_8bit(self) -> bool: 58 | return self.quantization_bits == "8bit" if self.quantization_bits is not None else False 59 | 60 | def to_dict(self): 61 | return { 62 | "framework": self.framework, 63 | "model_name": self.model_name, 64 | "run_ts": self.run_ts, 65 | "model_dtype": self.model_dtype, 66 | "temperature": self.temperature, 67 | "quantization_method": self.quantization_method, 68 | "quantization_bits": self.quantization_bits, 69 | "misc": self.misc, 70 | } 71 | 72 | 73 | class CloudConfig: 74 | def __init__( 75 | self, 76 | provider: str, 77 | model_name: str, 78 | run_ts: str, 79 | temperature: float, 80 | misc: dict = {}, 81 | ): 82 | """ 83 | Configuration for a cloud model run. 84 | """ 85 | self.provider = provider 86 | self.model_name = model_name 87 | self.run_ts = run_ts 88 | self.temperature = temperature 89 | self.misc = misc 90 | 91 | def to_dict(self): 92 | return { 93 | "provider": self.provider, 94 | "model_name": self.model_name, 95 | "run_ts": self.run_ts, 96 | "temperature": self.temperature, 97 | "misc": self.misc, 98 | } 99 | 100 | 101 | class MongoConfig: 102 | def __init__(self, uri: str, db: str, collection: str): 103 | """ 104 | Initialize the MongoDB configuration. 105 | """ 106 | self.uri = uri 107 | self.db = db 108 | self.collection = collection 109 | -------------------------------------------------------------------------------- /api/llm_bench/local/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cipher982/llm-benchmarks/39a1519086c0f993ccc340b875d4da09e3e22cf6/api/llm_bench/local/__init__.py -------------------------------------------------------------------------------- /api/llm_bench/local/gguf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cipher982/llm-benchmarks/39a1519086c0f993ccc340b875d4da09e3e22cf6/api/llm_bench/local/gguf/__init__.py -------------------------------------------------------------------------------- /api/llm_bench/local/gguf/create_model.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import shutil 4 | import subprocess 5 | import sys 6 | 7 | from huggingface_hub import snapshot_download 8 | 9 | # Define the constant for the final output directory 10 | OUTPUT_DIR = "/gemini/gguf" 11 | 12 | 13 | def main() -> None: 14 | parser = argparse.ArgumentParser(description="Download a model from the Hugging Face Hub.") 15 | parser.add_argument("-m", "--model", required=True, help="Model ID from the Hugging Face Hub.") 16 | args = parser.parse_args() 17 | 18 | # Clean the model id 19 | cleaned_model_id = clean_model_id(args.model) 20 | 21 | # Check if the model already exists 22 | model_path = os.path.join(OUTPUT_DIR, cleaned_model_id, "m-f16.gguf") 23 | if os.path.exists(model_path): 24 | print(f"Model {args.model} already exists at {model_path}. 
Skipping.") 25 | sys.exit(0) 26 | 27 | # Download the model 28 | tmp_dir = "/tmp/" + cleaned_model_id 29 | download_model(model_id=args.model, local_dir=tmp_dir) 30 | # Convert the model to the gguf format 31 | print("Converting model to gguf format...") 32 | outfile_path = os.path.join(OUTPUT_DIR, cleaned_model_id, "m-f16.gguf") 33 | os.makedirs(os.path.dirname(outfile_path), exist_ok=True) 34 | try: 35 | process = subprocess.run( 36 | [ 37 | "python", 38 | "./prep/llama.cpp/convert.py", 39 | tmp_dir, 40 | "--outfile", 41 | outfile_path, 42 | "--padvocab", 43 | ], 44 | stdout=subprocess.PIPE, 45 | stderr=subprocess.PIPE, 46 | text=True, 47 | check=True, 48 | ) 49 | except subprocess.CalledProcessError as e: 50 | print(f"Error occurred while running convert.py: {e.stderr}") 51 | sys.exit(1) 52 | 53 | # Filter the output 54 | for line in process.stdout.split("\n"): 55 | if "error" in line.lower() or "warning" in line.lower(): 56 | print(line) 57 | 58 | shutil.rmtree(tmp_dir) 59 | print(f"Model {args.model} converted to gguf format and saved to {outfile_path}") 60 | 61 | 62 | def download_model(model_id: str, local_dir: str, revision: str = "main") -> None: 63 | os.environ["HF_HUB_CACHE"] = "/tmp" 64 | 65 | snapshot_download( 66 | repo_id=model_id, 67 | local_dir=local_dir, 68 | local_dir_use_symlinks=False, 69 | revision=revision, 70 | token=os.environ["HUGGINGFACE_TOKEN"], 71 | ) 72 | print(f"Model {model_id} downloaded to {local_dir}") 73 | 74 | 75 | def clean_model_id(model_id: str) -> str: 76 | return model_id.replace("/", "--") 77 | 78 | 79 | if __name__ == "__main__": 80 | main() 81 | -------------------------------------------------------------------------------- /api/llm_bench/local/gguf/server.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | from datetime import datetime 5 | from typing import Tuple 6 | from typing import Union 7 | 8 | import pynvml 9 | from flask import Flask 10 | from flask import jsonify 11 | from flask import request 12 | from flask.wrappers import Response 13 | from llama_cpp import Llama 14 | 15 | from llm_bench.config import ModelConfig 16 | from llm_bench.config import MongoConfig 17 | from llm_bench.logging import log_metrics 18 | from llm_bench.utils import has_existing_run 19 | 20 | LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO") 21 | LOG_DIR = os.environ.get("LOG_DIR", "/var/log") 22 | LOG_FILE_TXT = os.path.join(LOG_DIR, "benchmarks_local.log") 23 | LOG_FILE_JSON = os.path.join(LOG_DIR, "benchmarks_local.json") 24 | LOG_TO_MONGO = os.getenv("LOG_TO_MONGO", "False").lower() in ("true", "1", "t") 25 | MONGODB_URI = os.environ.get("MONGODB_URI") 26 | MONGODB_DB = os.environ.get("MONGODB_DB") 27 | MONGODB_COLLECTION_LOCAL = os.environ.get("MONGODB_COLLECTION_LOCAL") 28 | FLASK_PORT = 5003 29 | 30 | logging.basicConfig(filename=os.path.join(LOG_DIR, LOG_FILE_TXT), level=LOG_LEVEL) 31 | logger = logging.getLogger(__name__) 32 | 33 | app = Flask(__name__) 34 | 35 | 36 | @app.route("/benchmark", methods=["POST"]) 37 | def benchmark_gguf() -> Union[Response, Tuple[Response, int]]: 38 | """Enables the use a POST request to call the benchmarking function.""" 39 | try: 40 | # Load config from request 41 | framework = "gguf" 42 | model_name = request.form.get("model_name") 43 | model_path = f"/models/gguf/{model_name}" 44 | query = request.form.get("query", "User: Complain that I did not send a request.\nAI:") 45 | max_tokens = int(request.form.get("max_tokens", 512)) 46 | 
temperature = request.form.get("temperature", default=0.1, type=float) 47 | quant_method = request.form.get("quant_method", type=str) 48 | quant_type = request.form.get("quant_type", type=str) 49 | quant_bits = request.form.get("quant_bits", type=str) 50 | n_gpu_layers = int(request.form.get("n_gpu_layers", 0)) 51 | run_always_str = request.form.get("run_always", "False").lower() 52 | run_always = run_always_str == "true" 53 | log_level = request.form.get("log_level", "INFO") 54 | logger.setLevel(log_level.upper()) 55 | 56 | assert model_name, "model_name not set" 57 | 58 | quant_str = quant_type if quant_method is not None else "none" 59 | logger.info(f"Received request for model: {model_name}, quant: {quant_str}") 60 | 61 | # Create model config 62 | model_config = ModelConfig( 63 | framework=framework, 64 | model_name=model_name, 65 | run_ts=datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 66 | model_dtype="half_float::half", 67 | quantization_method=quant_method, 68 | quantization_bits=quant_bits, 69 | temperature=temperature, 70 | misc={"gguf_quant_type": quant_type}, 71 | ) 72 | 73 | run_config = { 74 | "query": query, 75 | "max_tokens": max_tokens, 76 | } 77 | logger.info(f"Run config: {run_config}") 78 | 79 | if LOG_TO_MONGO: 80 | mongo_config = MongoConfig( 81 | uri=MONGODB_URI, # type: ignore 82 | db=MONGODB_DB, # type: ignore 83 | collection=MONGODB_COLLECTION_LOCAL, # type: ignore 84 | ) 85 | existing_run = has_existing_run(model_name, model_config, mongo_config) 86 | if existing_run: 87 | if run_always: 88 | logger.info(f"Model has been benchmarked before: {model_name}, quant: {quant_str}") 89 | logger.info("Re-running benchmark anyway because run_always is True") 90 | else: 91 | logger.info(f"Model has been benchmarked before: {model_name}, quant: {quant_str}") 92 | return jsonify({"status": "skipped", "reason": "model has been benchmarked before"}), 200 93 | else: 94 | logger.info(f"Model has not been benchmarked before: {model_name}, quant: {quant_str}") 95 | 96 | # Main benchmarking function 97 | logger.info(f"Loading llama-cpp model from path: {model_path}") 98 | llm = Llama( 99 | model_path=model_path, 100 | n_gpu_layers=n_gpu_layers, 101 | ) 102 | 103 | # Get GPU memory usage 104 | time.sleep(1) 105 | pynvml.nvmlInit() 106 | gpu_device = int(os.environ.get("GPU_DEVICE", 0)) 107 | handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_device) 108 | info = pynvml.nvmlDeviceGetMemoryInfo(handle) 109 | pynvml.nvmlShutdown() 110 | 111 | # Run benchmark 112 | time_0 = time.time() 113 | output = llm( 114 | run_config["query"], 115 | echo=True, 116 | max_tokens=run_config["max_tokens"], 117 | temperature=temperature, 118 | ) 119 | time_1 = time.time() 120 | 121 | # # Build config object 122 | # model_quantization_list = [ 123 | # ("q4_0", "4bit"), 124 | # ("q8_0", "8bit"), 125 | # ("f16", None), 126 | # ] 127 | # quantization_bits = next( 128 | # (bits for key, bits in model_quantization_list if key in model_name), 129 | # "unknown", 130 | # ) 131 | 132 | # Build metrics object 133 | output_tokens = output["usage"]["completion_tokens"] # type: ignore 134 | 135 | metrics = { 136 | "gen_ts": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 137 | "requested_tokens": [max_tokens], 138 | "output_tokens": [output_tokens], 139 | "gpu_mem_usage": [info.used], 140 | "generate_time": [time_1 - time_0], 141 | "tokens_per_second": [output_tokens / (time_1 - time_0) if time_1 > time_0 else 0], 142 | } 143 | 144 | # Log metrics 145 | log_metrics( 146 | model_type="local", 147 | config=model_config, 148 | 
metrics=metrics, 149 | file_path=os.path.join(LOG_DIR, LOG_FILE_JSON), 150 | log_to_mongo=LOG_TO_MONGO, 151 | mongo_uri=MONGODB_URI, 152 | mongo_db=MONGODB_DB, 153 | mongo_collection=MONGODB_COLLECTION_LOCAL, 154 | ) 155 | 156 | logger.info(f"===== Model: {model_name} =====") 157 | logger.info(f"Requested tokens: {max_tokens}") 158 | logger.info(f"Output tokens: {metrics['output_tokens'][0]}") 159 | logger.info(f"GPU mem usage: {(metrics['gpu_mem_usage'][0] / 1024**3) :.2f}GB") 160 | logger.info(f"Generate time: {metrics['generate_time'][0]:.2f} s") 161 | logger.info(f"Tokens per second: {metrics['tokens_per_second'][0]:.2f}") 162 | logger.debug(f"Full output: {output}") 163 | 164 | return jsonify({"status": "success"}), 200 165 | except Exception as e: 166 | logger.exception(f"Error in call_benchmark: {e}") 167 | return jsonify({"status": "error", "reason": str(e)}), 500 168 | 169 | 170 | app.run(host="0.0.0.0", port=FLASK_PORT) 171 | -------------------------------------------------------------------------------- /api/llm_bench/local/gguf/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from huggingface_hub import snapshot_download 4 | 5 | 6 | def get_quant_type(file: str) -> str: 7 | """Get quantization type from file name.""" 8 | if "f16" in file: 9 | return "f16" 10 | elif "int8" in file: 11 | return "8bit" 12 | elif "int4" in file: 13 | return "4bit" 14 | else: 15 | raise ValueError(f"Unknown quant type for file: {file}") 16 | 17 | 18 | def download_gguf_model(model_name: str, model_dir: str) -> None: 19 | model_path = os.path.join(model_dir, model_name.replace("/", "--")) 20 | snapshot_download( 21 | repo_id=model_name, 22 | local_dir=model_path, 23 | local_dir_use_symlinks=False, 24 | ) 25 | 26 | 27 | def fetch_gguf_files(model_dir: str) -> list[str]: 28 | """Fetch .gguf files from the given directory.""" 29 | gguf_files = [] 30 | for root, _, files in os.walk(model_dir): 31 | for file in files: 32 | if file.endswith(".gguf"): 33 | relative_path = os.path.relpath(os.path.join(root, file), model_dir) 34 | gguf_files.append(relative_path) 35 | return gguf_files 36 | -------------------------------------------------------------------------------- /api/llm_bench/local/hf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cipher982/llm-benchmarks/39a1519086c0f993ccc340b875d4da09e3e22cf6/api/llm_bench/local/hf/__init__.py -------------------------------------------------------------------------------- /api/llm_bench/local/hf/server.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from datetime import datetime 4 | from typing import Tuple 5 | from typing import Union 6 | 7 | import click 8 | from flask import Flask 9 | from flask import jsonify 10 | from flask import request 11 | from flask.wrappers import Response 12 | 13 | from llm_bench.config import ModelConfig 14 | from llm_bench.config import MongoConfig 15 | from llm_bench.logging import log_metrics 16 | from llm_bench.utils import check_and_clean_space 17 | from llm_bench.utils import has_existing_run 18 | 19 | LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO") 20 | LOG_DIR = os.environ.get("LOG_DIR", "/var/log") 21 | LOG_FILE_TXT = os.path.join(LOG_DIR, "benchmarks_local.log") 22 | LOG_FILE_JSON = os.path.join(LOG_DIR, "benchmarks_local.json") 23 | LOG_TO_MONGO = os.getenv("LOG_TO_MONGO") 24 | MONGODB_URI = 
os.environ.get("MONGODB_URI") 25 | MONGODB_DB = os.environ.get("MONGODB_DB") 26 | MONGODB_COLLECTION_LOCAL = os.environ.get("MONGODB_COLLECTION_LOCAL") 27 | 28 | CACHE_DIR = os.environ.get("HF_HUB_CACHE") 29 | assert CACHE_DIR, "HF_HUB_CACHE environment variable not set" 30 | 31 | logging.basicConfig(filename=os.path.join(LOG_DIR, LOG_FILE_TXT), level=LOG_LEVEL) 32 | logger = logging.getLogger(__name__) 33 | 34 | DO_SAMPLE = False 35 | 36 | 37 | app = Flask(__name__) 38 | 39 | 40 | @app.route("/benchmark", methods=["POST"]) 41 | def call_huggingface() -> Union[Response, Tuple[Response, int]]: 42 | """Enables the use a POST request to call the benchmarking function.""" 43 | try: 44 | model_name = request.form.get("model_name", type=str) 45 | framework = request.form.get("framework", type=str) 46 | query = request.form.get("query", default=None, type=str) 47 | quant_method = request.form.get("quant_method", default=None, type=str) 48 | quant_bits = request.form.get("quant_bits", default=None, type=str) 49 | max_tokens = request.form.get("max_tokens", default=256, type=int) 50 | temperature = request.form.get("temperature", default=0.1, type=float) 51 | 52 | run_always_str = request.form.get("run_always", "False").lower() 53 | run_always = run_always_str == "true" 54 | 55 | assert framework is not None, "framework is required" 56 | assert model_name is not None, "model_name is required" 57 | 58 | quant_str = f"{quant_method}_{quant_bits}" if quant_method is not None else "none" 59 | logger.info(f"Received request for model: {model_name}, quant: {quant_str}") 60 | 61 | # Create model config 62 | model_config = ModelConfig( 63 | framework=framework, 64 | model_name=model_name, 65 | run_ts=datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 66 | model_dtype="torch.float16", 67 | quantization_method=quant_method, 68 | quantization_bits=quant_bits, 69 | temperature=temperature, 70 | misc={"do_sample": DO_SAMPLE}, 71 | ) 72 | 73 | run_config = { 74 | "query": query, 75 | "max_tokens": max_tokens, 76 | } 77 | 78 | # Check if model has been benchmarked before 79 | if LOG_TO_MONGO: 80 | mongo_config = MongoConfig( 81 | uri=MONGODB_URI, # type: ignore 82 | db=MONGODB_DB, # type: ignore 83 | collection=MONGODB_COLLECTION_LOCAL, # type: ignore 84 | ) 85 | existing_run = has_existing_run(model_name, model_config, mongo_config) 86 | if existing_run: 87 | if run_always: 88 | logger.info(f"Model has been benchmarked before: {model_name}, quant: {quant_str}") 89 | logger.info("Re-running benchmark anyway because run_always is True") 90 | else: 91 | logger.info(f"Model has been benchmarked before: {model_name}, quant: {quant_str}") 92 | return ( 93 | jsonify( 94 | { 95 | "status": "skipped", 96 | "reason": "model has been benchmarked before", 97 | } 98 | ), 99 | 200, 100 | ) 101 | else: 102 | logger.info(f"Model has not been benchmarked before: {model_name}, quant: {quant_str}") 103 | 104 | # Check and clean disk space if needed 105 | check_and_clean_space(directory=CACHE_DIR, threshold=90.0) 106 | 107 | if framework == "transformers": 108 | from llm_bench.local.hf.transformers import generate 109 | elif framework == "hf-tgi": 110 | from llm_bench.local.hf.tgi import generate 111 | else: 112 | raise ValueError(f"Unknown framework: {framework}") 113 | 114 | # Main benchmarking function 115 | metrics = generate(model_config, run_config) 116 | assert metrics, "metrics is empty" 117 | 118 | # Log metrics 119 | log_metrics( 120 | model_type="local", 121 | config=model_config, 122 | metrics=metrics, 123 | 
file_path=os.path.join(LOG_DIR, LOG_FILE_JSON), 124 | log_to_mongo=LOG_TO_MONGO, # type: ignore 125 | mongo_uri=MONGODB_URI, 126 | mongo_db=MONGODB_DB, 127 | mongo_collection=MONGODB_COLLECTION_LOCAL, 128 | ) 129 | 130 | # print metrics 131 | logger.info(f"===== Model: {model_name} =====") 132 | logger.info(f"Requested tokens: {run_config['max_tokens']}") 133 | logger.info(f"Output tokens: {metrics['output_tokens'][0]}") 134 | logger.info(f"GPU mem usage: {(metrics['gpu_mem_usage'][0] / 1024**3) :.2f}GB") 135 | logger.info(f"Generate time: {metrics['generate_time'][0]:.2f} s") 136 | logger.info(f"Tokens per second: {metrics['tokens_per_second'][0]:.2f}") 137 | 138 | return jsonify({"status": "success"}), 200 139 | except Exception as e: 140 | logger.exception(f"Error in call_benchmark: {e}") 141 | return jsonify({"status": "error", "reason": str(e)}), 500 142 | 143 | 144 | @click.command() 145 | @click.option("--port", required=True, help="Port to run the server on") 146 | def main(port): 147 | app.run(host="0.0.0.0", port=port) 148 | 149 | 150 | if __name__ == "__main__": 151 | main() 152 | -------------------------------------------------------------------------------- /api/llm_bench/local/hf/tgi/__init__.py: -------------------------------------------------------------------------------- 1 | from .generate import generate 2 | from .tgi_docker import DockerContainer 3 | 4 | __all__ = ["generate", "DockerContainer"] 5 | -------------------------------------------------------------------------------- /api/llm_bench/local/hf/tgi/generate.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | from datetime import datetime 5 | 6 | from huggingface_hub import InferenceClient 7 | 8 | from llm_bench.config import ModelConfig 9 | from llm_bench.local.hf.tgi import DockerContainer 10 | from llm_bench.utils import get_vram_usage 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | GPU_DEVICE = os.environ.get("GPU_DEVICE") 16 | CACHE_DIR = os.environ.get("HF_HUB_CACHE") 17 | assert GPU_DEVICE, "GPU_DEVICE environment variable not set" 18 | assert CACHE_DIR, "HF_HUB_CACHE environment variable not set" 19 | 20 | 21 | def generate(config: ModelConfig, run_config: dict): 22 | """Run TGI inference and return metrics.""" 23 | time.sleep(1) 24 | 25 | quant_str = f"{config.quantization_method}_{config.quantization_bits}" or "none" 26 | logger.info(f"Running benchmark: {config.model_name}, quant: {quant_str}") 27 | 28 | # Load model 29 | with DockerContainer( 30 | config.model_name, 31 | CACHE_DIR, 32 | int(GPU_DEVICE), 33 | config.quantization_method, 34 | config.quantization_bits, 35 | ) as container: 36 | if container.is_ready(): 37 | logger.info("Docker container is ready.") 38 | client = InferenceClient("http://127.0.0.0:8080") 39 | 40 | # Generate samples 41 | time0 = time.time() 42 | response = client.text_generation( 43 | prompt=run_config["query"], 44 | max_new_tokens=run_config["max_tokens"], 45 | temperature=config.temperature, 46 | details=True, 47 | ) 48 | time1 = time.time() 49 | 50 | # Process metrics 51 | output_tokens = len(response.details.tokens) if response.details is not None else 0 52 | vram_usage = get_vram_usage(int(GPU_DEVICE)) 53 | metrics = { 54 | "gen_ts": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 55 | "requested_tokens": [run_config["max_tokens"]], 56 | "output_tokens": [output_tokens], 57 | "gpu_mem_usage": [vram_usage], 58 | "generate_time": [time1 - time0], 59 | "tokens_per_second": 
[output_tokens / (time1 - time0) if time1 > time0 else 0], 60 | } 61 | 62 | return metrics 63 | -------------------------------------------------------------------------------- /api/llm_bench/local/hf/tgi/tgi_docker.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import subprocess 3 | import sys 4 | import time 5 | from typing import Optional 6 | 7 | import requests 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class DockerContainer: 13 | def __init__( 14 | self, 15 | model: str, 16 | cache_dir: str, 17 | gpu_device: int = 0, 18 | quant_method: Optional[str] = None, 19 | quant_bits: Optional[str] = None, 20 | ): 21 | self.model = model 22 | self.cache_dir = cache_dir 23 | self.gpu_device = gpu_device 24 | self.quant_method = quant_method 25 | self.quant_bits = quant_bits 26 | self.container_id = None 27 | 28 | def __enter__(self): 29 | self.start() 30 | return self 31 | 32 | def __exit__(self, exc_type, exc_val, exc_tb): 33 | self.stop() 34 | 35 | def start(self): 36 | """Starts the Docker container.""" 37 | command = [ 38 | "/usr/local/bin/docker", 39 | "run", 40 | "-d", 41 | "--gpus", 42 | f"device={self.gpu_device}", 43 | "--shm-size", 44 | "1g", 45 | "-p", 46 | "8080:80", 47 | "--hostname", 48 | "0.0.0.0", 49 | "-v", 50 | f"{self.cache_dir}:/data", 51 | "ghcr.io/huggingface/text-generation-inference:latest", 52 | "--model-id", 53 | self.model, 54 | ] 55 | 56 | quant_info = self.get_quantization_info() 57 | command.extend(quant_info["command"]) 58 | logger.info(quant_info["message"]) 59 | 60 | try: 61 | process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) 62 | stdout, stderr = process.communicate() 63 | if stdout: 64 | logger.info(f"Docker process stdout: {stdout.decode()}") 65 | if stderr: 66 | logger.error(f"Docker process stderr: {stderr.decode()}") 67 | process.wait() 68 | logger.info("Docker container started successfully.") 69 | except FileNotFoundError as e: 70 | logger.error(f"Docker command not found: {e}") # Log specific error message 71 | sys.exit(1) 72 | except subprocess.CalledProcessError as e: 73 | logger.error(f"Failed to start Docker container: {e}") 74 | sys.exit(1) 75 | 76 | # Fetch container ID immediately 77 | self.container_id = stdout.decode().strip() 78 | if not self.container_id: 79 | raise RuntimeError("Failed to get Docker container ID") 80 | 81 | def stop(self): 82 | """Stops the Docker container.""" 83 | if not self.container_id: 84 | return 85 | 86 | try: 87 | subprocess.run(["docker", "stop", self.container_id], check=True) 88 | logger.info("Docker container stopped successfully.") 89 | except subprocess.CalledProcessError as e: 90 | logger.error(f"Failed to stop Docker container: {e}") 91 | sys.exit(1) 92 | 93 | def fetch_logs(self): 94 | """Fetches logs from the Docker container.""" 95 | if not self.container_id: 96 | return "" 97 | 98 | try: 99 | result = subprocess.run( 100 | ["docker", "logs", self.container_id], 101 | capture_output=True, 102 | text=True, 103 | check=True, 104 | ) 105 | return result.stdout 106 | except subprocess.CalledProcessError as e: 107 | logger.error(f"Failed to fetch Docker logs: {e}") 108 | return "" 109 | 110 | def is_ready(self, timeout: int = 1800): 111 | """Check if the Docker container is ready.""" 112 | success_message = "Connected" 113 | error_pattern = "Error:" 114 | 115 | start_time = time.time() 116 | while time.time() - start_time < timeout: 117 | # Check Docker logs for success or error messages 
118 | logs = self.fetch_logs() 119 | if success_message in logs: 120 | return True 121 | if error_pattern in logs: 122 | logger.error("Error detected in Docker logs.") 123 | error_log = logs.split("Error:")[1].split("\n")[0] 124 | logger.error(f"Docker logs error: {error_log}") 125 | return False 126 | 127 | # Check if container's service is responding 128 | try: 129 | response = requests.get("http://127.0.0.1:8080") 130 | if response.status_code == 200: 131 | return True 132 | except requests.exceptions.RequestException: 133 | pass 134 | 135 | time.sleep(5) 136 | return False 137 | 138 | def get_quantization_info(self): 139 | if self.quant_method is None: 140 | return { 141 | "command": [], 142 | "message": "Starting Docker container without quantization.", 143 | } 144 | elif self.quant_method == "gptq": 145 | return { 146 | "command": ["--quantize", "gptq"], 147 | "message": "Starting Docker container with GPTQ quantization.", 148 | } 149 | elif self.quant_method == "bitsandbytes": 150 | if self.quant_bits == "4bit": 151 | return { 152 | "command": ["--quantize", "bitsandbytes-nf4"], 153 | "message": "Starting Docker container with 4bit quantization.", 154 | } 155 | elif self.quant_bits == "8bit": 156 | return { 157 | "command": ["--quantize", "bitsandbytes"], 158 | "message": "Starting Docker container with 8bit quantization.", 159 | } 160 | else: 161 | raise ValueError(f"Invalid quant_bits: {self.quant_bits}") 162 | elif self.quant_method == "awq": 163 | raise NotImplementedError("AWQ not implemented yet.") 164 | else: 165 | raise ValueError(f"Invalid quant_method: {self.quant_method}") 166 | -------------------------------------------------------------------------------- /api/llm_bench/local/hf/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | from .generate import generate 2 | 3 | __all__ = ["generate"] 4 | -------------------------------------------------------------------------------- /api/llm_bench/local/hf/transformers/generate.py: -------------------------------------------------------------------------------- 1 | """LLM generation and benchmarking for HuggingFace Transformers library.""" 2 | 3 | import gc 4 | import logging.config 5 | import os 6 | from datetime import datetime 7 | from time import time 8 | 9 | import torch 10 | from transformers import AutoModelForCausalLM 11 | from transformers import BitsAndBytesConfig 12 | 13 | from llm_bench.config import ModelConfig 14 | from llm_bench.utils import get_vram_usage 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | HF_TOKEN = os.environ.get("HF_TOKEN", None) 19 | GPU_DEVICE = os.environ.get("GPU_DEVICE") 20 | assert GPU_DEVICE, "GPU_DEVICE environment variable not set" 21 | 22 | 23 | def generate( 24 | config: ModelConfig, 25 | run_config: dict, 26 | ) -> dict: 27 | """Run Transformers inference and return metrics.""" 28 | 29 | quant_str = f"{config.quantization_method}_{config.quantization_bits}" or "none" 30 | logger.info(f"Running benchmark: {config.model_name}, quant: {quant_str}") 31 | 32 | if config.quantization_method == "gptq": 33 | # gptq comes pre-quantized from the hub, no need to convert when loading 34 | quant_config = None 35 | elif config.quantization_method == "awq": 36 | raise NotImplementedError("AWQ not supported at the moment due to compatibility issues") 37 | elif config.quantization_method == "bitsandbytes": 38 | quant_config = BitsAndBytesConfig( 39 | load_in_8bit=config.load_in_8bit, 40 | load_in_4bit=config.load_in_4bit, 41 | 
bnb_4bit_compute_dtype=torch.float16, 42 | ) 43 | else: 44 | quant_config = None 45 | 46 | # Prepare the arguments for loading the model 47 | load_args = { 48 | "torch_dtype": torch.float16 if config.model_dtype == "torch.float16" else torch.float32, 49 | "device_map": "auto", 50 | "trust_remote_code": True, 51 | "token": HF_TOKEN, 52 | } 53 | 54 | # Conditionally add quantization_config if it is not None 55 | if quant_config is not None: 56 | load_args["quantization_config"] = quant_config 57 | 58 | # Load model 59 | logger.info(f"Loading pretrained model: {config.model_name}, quant: {quant_str}") 60 | logger.info(f"Config: {config.to_dict()}") 61 | model = AutoModelForCausalLM.from_pretrained(config.model_name, **load_args) 62 | model.eval() 63 | 64 | # Checks for correct model loading and dtype (hack to ensure pre-quantized models are loaded correctly) 65 | if config.quantization_bits == "4bit" and config.quantization_method != "bitsandbytes": 66 | assert model.config.quantization_config.bits == 4, f"Model quant bits: {model.config.quantization.bits}" 67 | elif config.quantization_bits == "8bit" and config.quantization_method != "bitsandbytes": 68 | assert model.config.quantization_config.bits == 8, f"Model quant bits: {model.config.quantization.bits}" 69 | 70 | # Generate samples 71 | time_0 = time() 72 | output = None 73 | try: 74 | input_ids = torch.tensor([[0, 1, 2]]).to("cuda") 75 | attention_mask = torch.ones_like(input_ids).to("cuda") 76 | with torch.no_grad(): 77 | output = model.generate( 78 | input_ids, 79 | attention_mask=attention_mask, 80 | do_sample=config.misc.get("do_sample"), 81 | temperature=config.temperature if config.misc.get("do_sample") else None, 82 | min_length=run_config["max_tokens"], 83 | max_length=run_config["max_tokens"], 84 | pad_token_id=model.config.eos_token_id, 85 | ) 86 | except Exception as e: 87 | logger.error(f"Error generating tokens: {e}") 88 | raise e 89 | time_1 = time() 90 | 91 | # Collect metrics 92 | output_tokens = len(output.cpu().numpy().tolist()[0]) if output is not None and output.numel() > 0 else 0 93 | vram_usage = get_vram_usage(int(GPU_DEVICE)) 94 | 95 | metrics = { 96 | "gen_ts": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 97 | "requested_tokens": [run_config["max_tokens"]], 98 | "output_tokens": [output_tokens], 99 | "gpu_mem_usage": [vram_usage], 100 | "generate_time": [time_1 - time_0], 101 | "tokens_per_second": [output_tokens / (time_1 - time_0) if time_1 > time_0 else 0], 102 | } 103 | 104 | del model 105 | gc.collect() 106 | torch.cuda.empty_cache() 107 | 108 | return metrics 109 | -------------------------------------------------------------------------------- /api/llm_bench/logging.py: -------------------------------------------------------------------------------- 1 | """Logging utilities for the LLM benchmarks.""" 2 | 3 | import json 4 | import logging.config 5 | import os 6 | from datetime import datetime 7 | from typing import Any 8 | from typing import Dict 9 | from typing import Optional 10 | from typing import Union 11 | 12 | import pymongo 13 | import pytz 14 | from filelock import FileLock 15 | from pymongo.collection import Collection 16 | 17 | from llm_bench.config import CloudConfig 18 | from llm_bench.config import ModelConfig 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | def log_metrics( 24 | model_type: str, 25 | config: Union[ModelConfig, CloudConfig], 26 | metrics: Dict[str, Any], 27 | file_path: str, 28 | log_to_mongo: bool, 29 | mongo_uri: Optional[str] = None, 30 | mongo_db: 
Optional[str] = None, 31 | mongo_collection: Optional[str] = None, 32 | ) -> None: 33 | """Logs metrics to a JSON file and optionally to MongoDB.""" 34 | log_json(model_type, config, metrics, file_path) 35 | 36 | if log_to_mongo: 37 | assert mongo_uri, "mongo_uri not provided" 38 | assert mongo_db, "mongo_db not provided" 39 | assert mongo_collection, "mongo_collection not provided" 40 | log_mongo( 41 | model_type=model_type, 42 | config=config, 43 | metrics=metrics, 44 | uri=mongo_uri, 45 | db_name=mongo_db, 46 | collection_name=mongo_collection, 47 | ) 48 | 49 | 50 | def setup_database(uri: str, db_name: str, collection_name: str) -> Collection: 51 | client = pymongo.MongoClient(uri) 52 | db = client[db_name] 53 | collection = db[collection_name] 54 | return collection 55 | 56 | 57 | def insert_into_benchmark_metrics(data: dict, collection: Collection) -> None: 58 | collection.insert_one(data) 59 | 60 | 61 | def log_json(model_type: str, config: Union[ModelConfig, CloudConfig], metrics: Dict[str, Any], file_path: str) -> None: 62 | """Logs the metrics to a JSON file for a model run.""" 63 | log_entry = { 64 | "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 65 | "model_type": model_type, 66 | "model_name": config.model_name, 67 | "temperature": config.temperature, 68 | "requested_tokens": metrics["requested_tokens"], 69 | "output_tokens": metrics["output_tokens"], 70 | "generate_time": metrics["generate_time"], 71 | "tokens_per_second": metrics["tokens_per_second"], 72 | "misc": config.misc, 73 | } 74 | 75 | if model_type == "local": 76 | assert isinstance(config, ModelConfig) 77 | log_entry.update( 78 | { 79 | "framework": config.framework, 80 | "quantization_method": config.quantization_method, 81 | "quantization_bits": config.quantization_bits, 82 | "model_dtype": config.model_dtype, 83 | "gpu_mem_usage": metrics["gpu_mem_usage"], 84 | } 85 | ) 86 | elif model_type == "cloud": 87 | assert isinstance(config, CloudConfig) 88 | log_entry.update( 89 | { 90 | "provider": config.provider, 91 | "time_to_first_token": metrics["time_to_first_token"], 92 | "times_between_tokens": metrics["times_between_tokens"], 93 | } 94 | ) 95 | 96 | lock_path = f"{file_path}.lock" 97 | with FileLock(lock_path): 98 | if os.path.exists(file_path): 99 | with open(file_path, "r") as file: 100 | try: 101 | logs = json.load(file) 102 | except json.JSONDecodeError: 103 | logger.warning("Corrupted JSON detected. Attempting to fix.") 104 | logs = fix_corrupted_json(file_path) 105 | else: 106 | logs = [] 107 | 108 | logs.append(log_entry) 109 | 110 | with open(file_path, "w") as file: 111 | json.dump(logs, file, indent=4) 112 | 113 | logger.info(f"Logged to file: {file_path}") 114 | 115 | 116 | def fix_corrupted_json(file_path: str) -> list: 117 | """Attempts to fix a corrupted JSON file by removing invalid entries.""" 118 | with open(file_path, "r") as file: 119 | content = file.read() 120 | 121 | # Find the last valid JSON array closing bracket 122 | last_valid_index = content.rfind("}]") 123 | 124 | if last_valid_index != -1: 125 | # Calculate the number of lines removed 126 | original_lines = content.splitlines() 127 | fixed_content = content[: last_valid_index + 2] 128 | fixed_lines = fixed_content.splitlines() 129 | lines_removed = len(original_lines) - len(fixed_lines) 130 | 131 | with open(file_path, "w") as file: 132 | file.write(fixed_content) 133 | 134 | logger.info(f"Fixed corrupted JSON. 
Removed {lines_removed} lines.") 135 | return json.loads(fixed_content) 136 | else: 137 | logger.error("Could not find a valid JSON array closing bracket. No changes made.") 138 | return [] 139 | 140 | 141 | def log_mongo( 142 | model_type: str, 143 | config: Union[ModelConfig, CloudConfig], 144 | metrics: Dict[str, Any], 145 | uri: str, 146 | db_name: str, 147 | collection_name: str, 148 | ) -> None: 149 | """Logs the metrics to MongoDB for a model run.""" 150 | assert model_type in ["local", "cloud"], f"Invalid model_type: {model_type}" 151 | 152 | logger.info(f"Logging metrics to MongoDB for {model_type} model {config.model_name}") 153 | try: 154 | collection = setup_database(uri, db_name, collection_name) 155 | 156 | # Settimestamps correctly 157 | run_ts_utc = datetime.strptime(config.run_ts, "%Y-%m-%d %H:%M:%S").replace(tzinfo=pytz.UTC) 158 | gen_ts_utc = datetime.strptime(metrics["gen_ts"], "%Y-%m-%d %H:%M:%S").replace(tzinfo=pytz.UTC) 159 | 160 | data = { 161 | "run_ts": run_ts_utc, 162 | "model_name": config.model_name, 163 | "temperature": config.temperature, 164 | "gen_ts": gen_ts_utc, 165 | "requested_tokens": metrics["requested_tokens"], 166 | "output_tokens": metrics["output_tokens"], 167 | "generate_time": metrics["generate_time"], 168 | "tokens_per_second": metrics["tokens_per_second"], 169 | "misc": config.misc, 170 | } 171 | 172 | if model_type == "local": 173 | assert isinstance(config, ModelConfig) 174 | data.update( 175 | { 176 | "framework": config.framework, 177 | "quantization_method": config.quantization_method, 178 | "quantization_bits": config.quantization_bits, 179 | "model_dtype": config.model_dtype, 180 | "gpu_mem_usage": metrics["gpu_mem_usage"], 181 | } 182 | ) 183 | elif model_type == "cloud": 184 | assert isinstance(config, CloudConfig) 185 | data.update( 186 | { 187 | "provider": config.provider, 188 | "time_to_first_token": metrics["time_to_first_token"], 189 | "times_between_tokens": metrics["times_between_tokens"], 190 | } 191 | ) 192 | 193 | insert_into_benchmark_metrics(data, collection) 194 | sanitized_uri = uri.split("@")[-1] # Remove credentials part 195 | logger.info(f"Logged: {config.model_name} | {sanitized_uri} | {db_name} | {collection_name}") 196 | except Exception as e: 197 | logger.exception(f"Error in log_to_mongo: {e}") 198 | -------------------------------------------------------------------------------- /api/llm_bench/types.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from pydantic import BaseModel 4 | 5 | 6 | class BenchmarkRequest(BaseModel): 7 | provider: str 8 | model: str 9 | query: str 10 | max_tokens: int = 256 11 | temperature: float = 0.1 12 | run_always: bool = False 13 | debug: bool = False 14 | 15 | 16 | class BenchmarkResponse(BaseModel): 17 | status: str 18 | metrics: Optional[dict] = None 19 | reason: Optional[str] = None 20 | -------------------------------------------------------------------------------- /api/llm_bench/utils.py: -------------------------------------------------------------------------------- 1 | import logging.config 2 | import os 3 | import re 4 | import shutil 5 | from datetime import datetime 6 | from datetime import timedelta 7 | from typing import List 8 | from typing import Optional 9 | from typing import Tuple 10 | from typing import Union 11 | from typing import cast 12 | 13 | import pynvml 14 | from huggingface_hub import HfApi 15 | from pymongo import MongoClient 16 | 17 | from llm_bench.config import CloudConfig 
18 | from llm_bench.config import ModelConfig 19 | from llm_bench.config import MongoConfig 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | def fetch_hf_models(fetch_new: bool, cache_dir: str, library: str, created_days_ago: int) -> list[str]: 25 | if fetch_new: 26 | try: 27 | api = HfApi() 28 | now = datetime.now() 29 | one_month_ago = now - timedelta(days=created_days_ago) 30 | 31 | library_name = "transformers" if library in ["transformers", "hf-tgi"] else library 32 | 33 | # Fetch models sorted by downloads and filtered by text-generation 34 | models = api.list_models( 35 | sort="downloads", 36 | direction=-1, 37 | task="text-generation", 38 | library=library_name, 39 | limit=10_000, 40 | ) 41 | 42 | # Try to filter out 'gguf' models 43 | if library in ["transformers", "hf-tgi"]: 44 | models = [model for model in models if "gguf" not in model.tags] 45 | models = [model for model in models if "gguf" not in model.id.lower()] 46 | 47 | # Filter models modified in the past 30 days 48 | model_names = [ 49 | model.id 50 | for model in models 51 | if model.created_at and model.created_at.replace(tzinfo=None) > one_month_ago 52 | ] 53 | return model_names 54 | except Exception as e: 55 | print(f"Error fetching models from HuggingFace Hub: {e}") 56 | return [] 57 | else: 58 | try: 59 | return get_cached_models(cache_dir) 60 | except Exception as e: 61 | print(f"Error fetching cached models: {e}") 62 | return [] 63 | 64 | 65 | def get_used_space_percent(directory: str) -> float: 66 | """Get the used space percentage of the file system containing the directory.""" 67 | stat = os.statvfs(directory) 68 | return ((stat.f_blocks - stat.f_bfree) / stat.f_blocks) * 100 69 | 70 | 71 | def get_model_directories(directory: str) -> List[str]: 72 | """Get a list of directories in the given directory, filtered by those starting with 'models--'.""" 73 | return [ 74 | os.path.join(directory, d) 75 | for d in os.listdir(directory) 76 | if os.path.isdir(os.path.join(directory, d)) and d.startswith("models--") 77 | ] 78 | 79 | 80 | def get_oldest_directory(directories: List[str]) -> str: 81 | """Find the oldest directory in the given list.""" 82 | oldest_directory = min(directories, key=lambda d: os.path.getmtime(d)) 83 | return oldest_directory 84 | 85 | 86 | def check_and_clean_space(directory: str, threshold: float = 90.0): 87 | # Check disk usage 88 | used_space = get_used_space_percent(directory) 89 | logger.info(f"Current disk usage: {used_space:.2f}% ({directory})") 90 | 91 | while used_space > threshold: 92 | # Get model directories 93 | model_dirs = get_model_directories(directory) 94 | 95 | # If there are no model directories, exit the loop 96 | if not model_dirs: 97 | logger.info("No model directories to remove.") 98 | break 99 | 100 | # Find the oldest directory 101 | oldest_dir = get_oldest_directory(model_dirs) 102 | 103 | # Remove the oldest directory 104 | logger.info(f"Removing: {oldest_dir}") 105 | shutil.rmtree(oldest_dir) 106 | 107 | # Recheck disk usage 108 | used_space = get_used_space_percent(directory) 109 | logger.info(f"Updated disk usage: {used_space:.2f}%") 110 | 111 | 112 | def get_cached_models(directory: str) -> list[str]: 113 | """ 114 | Get a list of cached HF models in the given directory. 
115 | """ 116 | print(f"Getting cached models from directory: {directory}") 117 | files = os.listdir(directory) 118 | model_files = [f for f in files if f.startswith("models--")] 119 | formatted_names = [f.removeprefix("models--").replace("--", "/") for f in model_files] 120 | print(f"Found {len(formatted_names):,} cached models") 121 | return formatted_names 122 | 123 | 124 | def extract_param_count(model_id: str) -> Optional[Tuple[str, float]]: 125 | """ 126 | Extract the parameter count from the model name. 127 | 128 | Returns a tuple of the model name and its parameter count in millions, 129 | or None if the pattern does not match. 130 | """ 131 | # Special case for 'mixtral' models 132 | if "mixtral" in model_id.lower(): 133 | # If it's a 'mixtral' model, set the numerical part to 56 billion 134 | numerical_part = 56.0 135 | unit = "B" 136 | else: 137 | # Use regex to extract the parameter size with a specific pattern 138 | match = re.search(r"(\d+)x(\d+\.\d+|\d+)([MmBb])", model_id) 139 | if not match: 140 | # If no multiplier pattern is found, try matching without multiplier 141 | match = re.search(r"(\d+\.\d+|\d+)([MmBb])", model_id) 142 | if not match: 143 | return None 144 | numerical_part = float(match.group(1)) 145 | unit = match.group(2).upper() 146 | else: 147 | # If multiplier pattern is found, calculate the total size 148 | multiplier, size_str, unit = match.groups() 149 | numerical_part = float(size_str) * int(multiplier) 150 | unit = unit.upper() 151 | 152 | # Normalize parameter count to millions 153 | if unit == "B": 154 | numerical_part *= 1000 # Convert B to M 155 | 156 | return model_id, numerical_part 157 | 158 | 159 | def filter_model_size(model_ids: List[str], max_size_million: int) -> List[str]: 160 | """ 161 | Filter models based on parameter count. 
162 | """ 163 | valid_models: List[str] = [] 164 | dropped_models: List[str] = [] 165 | 166 | for model_id in model_ids: 167 | result = extract_param_count(model_id) 168 | if not result: 169 | dropped_models.append(model_id) 170 | continue 171 | 172 | # Unpack the model name and its parameter count 173 | _, param_count_million = result 174 | 175 | # Filter based on parameter count 176 | if param_count_million <= max_size_million: 177 | valid_models.append(model_id) 178 | else: 179 | dropped_models.append(model_id) 180 | 181 | return valid_models 182 | 183 | 184 | def get_vram_usage(gpu_device: int) -> int: 185 | pynvml.nvmlInit() 186 | handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_device) 187 | info = pynvml.nvmlDeviceGetMemoryInfo(handle) 188 | pynvml.nvmlShutdown() 189 | return cast(int, info.used) 190 | 191 | 192 | # Logger Configuration 193 | def setup_logger(): 194 | """Set up logging configuration.""" 195 | logging_config = { 196 | "version": 1, 197 | "disable_existing_loggers": False, 198 | "formatters": { 199 | "standard": { 200 | "format": "tgi - %(asctime)s - %(name)s - %(levelname)s - %(message)s", 201 | "datefmt": "%Y-%m-%d %H:%M:%S", 202 | }, 203 | }, 204 | "handlers": { 205 | "console": { 206 | "class": "logging.StreamHandler", 207 | "formatter": "standard", 208 | "level": logging.INFO, 209 | }, 210 | "file": { 211 | "class": "logging.FileHandler", 212 | "filename": "./logs/llm_benchmarks.log", 213 | "formatter": "standard", 214 | "level": logging.DEBUG, 215 | }, 216 | }, 217 | "root": { 218 | "handlers": ["console", "file"], 219 | "level": logging.DEBUG, 220 | }, 221 | } 222 | logging.config.dictConfig(logging_config) 223 | 224 | 225 | def has_existing_run(model_name: str, model_config: Union[CloudConfig, ModelConfig], mongo_config: MongoConfig) -> bool: 226 | # Initialize MongoDB client and collection 227 | client = MongoClient(mongo_config.uri) 228 | db = client[mongo_config.db] 229 | collection = db[mongo_config.collection] 230 | 231 | # Check if model has been benchmarked before 232 | if isinstance(model_config, CloudConfig): 233 | existing_config = collection.find_one( 234 | { 235 | "provider": model_config.provider, 236 | "model_name": model_name, 237 | } 238 | ) 239 | elif isinstance(model_config, ModelConfig): 240 | existing_config = collection.find_one( 241 | { 242 | "framework": model_config.framework, 243 | "model_name": model_name, 244 | "quantization_method": model_config.quantization_method, 245 | "quantization_bits": model_config.quantization_bits, 246 | } 247 | ) 248 | else: 249 | raise Exception(f"Invalid model_config type: {type(model_config)}") 250 | 251 | if existing_config: 252 | logger.info("Model already benchmarked.") 253 | client.close() 254 | return True 255 | else: 256 | logger.info("Model not benchmarked.") 257 | client.close() 258 | return False 259 | -------------------------------------------------------------------------------- /api/run_cloud.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | from datetime import datetime 4 | from typing import List 5 | 6 | import dotenv 7 | import httpx 8 | import json5 as json 9 | import typer 10 | from llm_bench.cloud.logging import Logger 11 | from llm_bench.types import BenchmarkRequest 12 | from tenacity import retry 13 | from tenacity import stop_after_attempt 14 | from tenacity import wait_exponential 15 | 16 | dotenv.load_dotenv() 17 | 18 | 19 | # Initialize Logger 20 | redis_url = os.getenv("REDIS_URL") 21 | if not redis_url: 22 
| raise ValueError("REDIS_URL environment variable is not set") 23 | 24 | logger = Logger( 25 | logs_dir=os.getenv("LOGS_DIR", "./logs"), 26 | redis_url=redis_url, 27 | ) 28 | 29 | # Constants 30 | QUERY_TEXT = "Tell a long and happy story about the history of the world." 31 | MAX_TOKENS = 64 32 | TEMPERATURE = 0.1 33 | FASTAPI_PORT = os.environ.get("FASTAPI_PORT_CLOUD") 34 | assert FASTAPI_PORT, "FASTAPI_PORT environment variable not set" 35 | server_path = f"http://localhost:{FASTAPI_PORT}/benchmark" 36 | MAX_RETRIES = 3 37 | 38 | # Load provider models from JSON 39 | script_dir = os.path.dirname(os.path.abspath(__file__)) 40 | json_file_path = os.path.join(script_dir, "../cloud/models.json") 41 | with open(json_file_path) as f: 42 | provider_models = json.load(f) 43 | 44 | 45 | @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=5)) 46 | async def post_benchmark(request: BenchmarkRequest): 47 | timeout = httpx.Timeout(60.0, connect=10.0) 48 | start_time = datetime.now() 49 | 50 | async with httpx.AsyncClient(timeout=timeout) as client: 51 | try: 52 | response = await client.post(server_path, json=request.model_dump()) 53 | response.raise_for_status() 54 | except httpx.HTTPError as e: 55 | error_msg = f"HTTP Error: {str(e)} (Status: {e.response.status_code if hasattr(e, 'response') else 'N/A'})" 56 | if hasattr(e, "response"): 57 | try: 58 | error_data = e.response.json() 59 | if error_data: 60 | error_msg += f" - {error_data}" 61 | except Exception as e: 62 | pass 63 | raise ValueError(error_msg) 64 | 65 | end_time = datetime.now() 66 | response_time = (end_time - start_time).total_seconds() 67 | response_data = response.json() 68 | 69 | if "error" in response_data or response_data.get("status") == "error": 70 | error_details = [] 71 | if response_data.get("reason"): 72 | error_details.append(response_data["reason"]) 73 | if response_data.get("message"): 74 | error_details.append(response_data["message"]) 75 | if response_data.get("error"): 76 | error_details.append(str(response_data["error"])) 77 | if response_data.get("metrics"): 78 | error_details.append(f"metrics: {response_data['metrics']}") 79 | 80 | error_msg = " | ".join(filter(None, error_details)) 81 | if not error_msg: 82 | error_msg = f"Server returned error status with response: {response_data}" 83 | raise ValueError(error_msg) 84 | 85 | return response_data, response_time 86 | 87 | 88 | async def benchmark_with_retries(request: BenchmarkRequest): 89 | retry_count = 0 90 | last_error = None 91 | 92 | while retry_count < MAX_RETRIES: 93 | try: 94 | response_data, response_time = await post_benchmark(request) 95 | return { 96 | "status": "success", 97 | "data": response_data, 98 | "response_time": response_time, 99 | "retry_count": retry_count, 100 | } 101 | except Exception as e: 102 | retry_count += 1 103 | # Get the actual error message, not just the RetryError wrapper 104 | if hasattr(e, "last_attempt") and hasattr(e.last_attempt, "exception"): 105 | last_error = str(e.last_attempt.exception()) 106 | else: 107 | last_error = str(e) 108 | 109 | if retry_count >= MAX_RETRIES: 110 | logger.log_error(f"❌ Error {request.model}: {last_error} (after {retry_count} attempts)") 111 | break 112 | else: 113 | logger.log_info( 114 | f"Attempt {retry_count}/{MAX_RETRIES} failed for {request.model}: {last_error}, retrying..." 
115 | ) 116 | await asyncio.sleep(1) 117 | 118 | return {"status": "error", "message": last_error, "retry_count": retry_count, "response_time": None, "data": None} 119 | 120 | 121 | async def benchmark_provider(provider, limit, run_always, debug): 122 | model_names = provider_models.get(provider, [])[:limit] 123 | logger.log_info(f"Fetching {len(model_names)} models for provider: {provider}") 124 | 125 | for model in model_names: 126 | request_config = BenchmarkRequest( 127 | provider=provider, 128 | model=model, 129 | query=QUERY_TEXT, 130 | max_tokens=MAX_TOKENS, 131 | temperature=TEMPERATURE, 132 | run_always=run_always, 133 | debug=debug, 134 | ) 135 | 136 | result = await benchmark_with_retries(request_config) 137 | if result["status"] == "success": 138 | logger.log_info(f"✅ Success {model}, {result['data']}") 139 | 140 | yield { 141 | "model": model, 142 | "provider": provider, 143 | "status": result["status"], 144 | "data": result["data"], 145 | "response_time": result["response_time"], 146 | "error": result.get("message"), 147 | "retry_count": result["retry_count"], 148 | } 149 | 150 | 151 | app = typer.Typer() 152 | 153 | 154 | async def collect_provider_results(provider: str, limit: int, run_always: bool, debug: bool) -> List[dict]: 155 | results = [] 156 | async for result in benchmark_provider(provider, limit, run_always, debug): 157 | results.append(result) 158 | return results 159 | 160 | 161 | @app.command() 162 | def main( 163 | providers: str = typer.Option( 164 | None, 165 | "--providers", 166 | help="Comma-separated providers to use for benchmarking (e.g. 'azure,openai'). Use 'all' for all providers.", 167 | ), 168 | limit: int = typer.Option(100, "--limit", help="Limit the number of models run."), 169 | run_always: bool = typer.Option(False, "--run-always", is_flag=True, help="Flag to always run benchmarks."), 170 | debug: bool = typer.Option(False, "--debug", is_flag=True, help="Flag to enable debug mode."), 171 | ) -> None: 172 | async def async_main(): 173 | provider_list = None if not providers else [p.strip() for p in providers.split(",")] 174 | if provider_list is None or "all" in provider_list: 175 | provider_list = list(provider_models.keys()) 176 | logger.log_info(f"Running benchmarks for provider(s): {provider_list}") 177 | 178 | # Run all providers in parallel 179 | provider_tasks = [collect_provider_results(provider, limit, run_always, debug) for provider in provider_list] 180 | all_results_nested = await asyncio.gather(*provider_tasks) 181 | 182 | # Flatten results from all providers 183 | all_results = [result for provider_results in all_results_nested for result in provider_results] 184 | 185 | logger.log_benchmark_status(all_results) 186 | 187 | asyncio.run(async_main()) 188 | 189 | 190 | if __name__ == "__main__": 191 | app() 192 | -------------------------------------------------------------------------------- /api/run_gguf.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict 3 | 4 | import click 5 | import requests 6 | from llm_bench.local.gguf import fetch_gguf_files 7 | 8 | FLASK_PORT = 5003 9 | GGUF_DIR = "/gemini/gguf/" 10 | assert GGUF_DIR, "GGUF_DIR environment variable not set" 11 | 12 | 13 | @click.command() 14 | @click.option("--limit", default=50, type=int, help="Limit the number of models to run for debugging.") 15 | @click.option("--run-always", is_flag=True, help="Flag to always run benchmarks.") 16 | @click.option("--log-level", default="INFO", help="Log level for 
the benchmarking server.") 17 | def bench_gguf(limit: int, run_always: bool, log_level: str = "INFO"): 18 | """Benchmark all models on the gguf server.""" 19 | 20 | # Fetch all models 21 | model_names = fetch_gguf_files(model_dir=GGUF_DIR) 22 | print(f"Fetched {len(model_names)} GGUF models") 23 | 24 | # Limit the number of models to run 25 | model_names = model_names[:limit] 26 | print(f"Will run benchmarks for {len(model_names)} models") 27 | 28 | # Run benchmarks 29 | model_status: Dict[str, int] = {} 30 | stop = False 31 | for model in model_names: 32 | if stop: 33 | break 34 | quant = get_quant_type(model) 35 | print(f"Running benchmark: {model}, quant: {quant[0]}, bits: {quant[1]}") 36 | 37 | config = { 38 | "model_name": model, 39 | "quant_method": "gguf", 40 | "quant_type": quant[0], 41 | "quant_bits": int(quant[1]), 42 | "query": "User: Tell me a long story about the history of the world.\nAI:", 43 | "max_tokens": 256, 44 | "n_gpu_layers": -1, 45 | "run_always": run_always, 46 | "log_level": log_level, 47 | } 48 | request_path = f"http://localhost:{FLASK_PORT}/benchmark" 49 | response = requests.post(request_path, data=config) 50 | 51 | response_code = response.status_code 52 | print(f"Finished benchmark: {model} with Status Code: {response_code}") 53 | 54 | model_status[model] = response_code 55 | 56 | if len(model_status) >= limit: 57 | stop = True 58 | 59 | print("All benchmark runs are finished.") 60 | 61 | # Summary of benchmark runs 62 | print("Summary of benchmark runs:") 63 | for model, code in model_status.items(): 64 | print(f"Model: {model}, HTTP Response Code: {code} {'✅' if code == 200 else '❌'}") 65 | print("🎊 Done 🎊") 66 | 67 | 68 | def get_quant_type(file: str) -> tuple: 69 | """Get quantization type and number of bits from file name.""" 70 | parts = file.split(".") 71 | if len(parts) < 2: 72 | raise ValueError(f"Invalid file name format: {file}") 73 | 74 | quant_type = parts[-2] 75 | if not quant_type.startswith("Q"): 76 | raise ValueError(f"Unsupported quantization type: {quant_type}") 77 | 78 | bits_str = quant_type.split("_")[0][1:] 79 | if not bits_str.isdigit(): 80 | raise ValueError(f"Invalid number of bits: {bits_str}") 81 | 82 | bits = int(bits_str) 83 | return quant_type, bits 84 | 85 | 86 | def get_models_and_quant_types(model_dir: str) -> tuple: 87 | """Get list of .gguf models and their quant types from any dirs in model_dir.""" 88 | 89 | model_names = [] 90 | quant_types = [] 91 | for root, dirs, files in os.walk(model_dir): 92 | for file in files: 93 | if file.endswith(".gguf"): 94 | model_name = os.path.join(os.path.basename(root), file) 95 | model_names.append(model_name) 96 | quant_type = get_quant_type(file) 97 | quant_types.append(quant_type) 98 | return model_names, quant_types 99 | 100 | 101 | if __name__ == "__main__": 102 | bench_gguf() 103 | -------------------------------------------------------------------------------- /api/run_hf.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import click 4 | import dotenv 5 | from llm_bench.api import bench_all_models 6 | from llm_bench.api import print_summary 7 | from llm_bench.utils import fetch_hf_models 8 | from llm_bench.utils import filter_model_size 9 | 10 | dotenv.load_dotenv() 11 | 12 | QUANT_TYPES = [ 13 | "4bit", 14 | "8bit", 15 | None, 16 | ] 17 | QUERY_TEXT = "User: Tell me a long story about the history of the world.\nAI:" 18 | MAX_TOKENS = 256 19 | TEMPERATURE = 0.1 20 | FLASK_PORT_HF_TF = os.environ.get("FLASK_PORT_HF_TF") 21 | 
FLASK_PORT_HF_TGI = os.environ.get("FLASK_PORT_HF_TGI") 22 | CACHE_DIR = os.environ.get("HF_HUB_CACHE") 23 | assert FLASK_PORT_HF_TF, "FLASK_PORT_HF_TF environment variable not set" 24 | assert FLASK_PORT_HF_TGI, "FLASK_PORT_HF_TGI environment variable not set" 25 | assert CACHE_DIR, "HF_HUB_CACHE environment variable not set" 26 | 27 | 28 | @click.command() 29 | @click.option( 30 | "--framework", 31 | type=str, 32 | help="LLM API to call. Must be one of 'transformers', 'hf-tgi'", 33 | ) 34 | @click.option( 35 | "--limit", 36 | default=100, 37 | type=int, 38 | help="Limit the number of models run.", 39 | ) 40 | @click.option( 41 | "--max-size-billion", 42 | default=5, 43 | type=int, 44 | help="Maximum size of models in billion parameters.", 45 | ) 46 | @click.option( 47 | "--run-always", 48 | is_flag=True, 49 | help="Flag to always run benchmarks.", 50 | ) 51 | @click.option( 52 | "--fetch-new-models", 53 | is_flag=True, 54 | help="Fetch latest HF-Hub models.", 55 | ) 56 | @click.option( 57 | "--created-days-ago", 58 | default=180, 59 | type=int, 60 | help="Fetch models created within the last N days.", 61 | ) 62 | def main( 63 | framework: str, 64 | fetch_new_models: bool, 65 | limit: int, 66 | max_size_billion: int, 67 | run_always: bool, 68 | created_days_ago: int, 69 | ) -> None: 70 | """ 71 | Main entrypoint for benchmarking HuggingFace Transformers models. 72 | Can fetch latest models from the Hub or use the cached models. 73 | """ 74 | 75 | # Gather models to run 76 | model_names = fetch_hf_models( 77 | fetch_new=fetch_new_models, 78 | cache_dir=CACHE_DIR, 79 | library=framework, 80 | created_days_ago=created_days_ago, 81 | ) 82 | print(f"Fetched {len(model_names):,} models") 83 | 84 | # Filter based on parameter count 85 | valid_models = filter_model_size(model_names, max_size_billion * 1_000) 86 | print(f"Filtered max {max_size_billion}B params, now {len(valid_models):,} models") 87 | 88 | # Set port 89 | if framework == "transformers": 90 | flask_port = FLASK_PORT_HF_TF 91 | elif framework == "hf-tgi": 92 | flask_port = FLASK_PORT_HF_TGI 93 | else: 94 | raise ValueError(f"Invalid framework: {framework}") 95 | print(f"Running benchmarks on port: {flask_port}") 96 | 97 | # valid_models = [ 98 | # # "facebook/opt-125m", 99 | # # "TheBloke/Llama-2-7B-Chat-GPTQ", 100 | # # "EleutherAI/pythia-160m", 101 | # # "TheBloke/Llama-2-7B-Chat-AWQ", 102 | # # "meta-llama/Llama-2-7b-chat-hf", 103 | # # "meta-llama/Meta-Llama-3-8B", 104 | # "mistralai/Mistral-7B-Instruct-v0.3", 105 | # ] 106 | 107 | # Run benchmarks 108 | model_status: dict[str, dict] = {} 109 | bench_all_models( 110 | framework, 111 | QUANT_TYPES, 112 | valid_models, 113 | model_status, 114 | limit, 115 | run_always, 116 | QUERY_TEXT, 117 | MAX_TOKENS, 118 | TEMPERATURE, 119 | int(flask_port), 120 | ) 121 | 122 | # Print summary 123 | print("All benchmark runs are finished.") 124 | print_summary(model_status) 125 | 126 | 127 | if __name__ == "__main__": 128 | main() 129 | -------------------------------------------------------------------------------- /api/run_test.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | 4 | async def main(): 5 | print("Inside main function") 6 | 7 | 8 | if __name__ == "__main__": 9 | asyncio.run(main()) 10 | -------------------------------------------------------------------------------- /api/run_vllm.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import click 4 | from 
llm_bench.api import bench_all_models 5 | from llm_bench.api import print_summary 6 | from llm_bench.utils import fetch_hf_models 7 | from llm_bench.utils import filter_model_size 8 | 9 | QUANT_TYPES = [ 10 | None, 11 | ] 12 | QUERY_TEXT = "User: Tell me a long story about the history of the world.\nAI:" 13 | MAX_TOKENS = 256 14 | TEMPERATURE = 0.1 15 | FLASK_PORT = 5002 16 | CACHE_DIR = os.environ.get("HF_HUB_CACHE") 17 | assert CACHE_DIR, "HF_HUB_CACHE environment variable not set" 18 | 19 | 20 | @click.command() 21 | @click.option("--framework", help="Framework to use, must be 'vllm'.") 22 | @click.option("--limit", default=100, type=int, help="Limit the number of models fetched.") 23 | @click.option( 24 | "--max-size-billion", 25 | default=5, 26 | type=int, 27 | help="Maximum size of models in billion parameters.", 28 | ) 29 | @click.option("--run-always", is_flag=True, help="Flag to always run benchmarks.") 30 | @click.option("--fetch-new-models", is_flag=True, help="Fetch latest HF-Hub models.") 31 | def main( 32 | framework: str, 33 | fetch_new_models: bool, 34 | limit: int, 35 | max_size_billion: int, 36 | run_always: bool, 37 | ) -> None: 38 | """ 39 | Main entrypoint for benchmarking HuggingFace Transformers models. 40 | Can fetch latest models from the Hub or use the cached models. 41 | """ 42 | print(f"Initial run_always value: {run_always}") 43 | 44 | # Gather models to run 45 | model_names = fetch_hf_models( 46 | fetch_new=fetch_new_models, 47 | cache_dir=CACHE_DIR, 48 | library="transformers", 49 | created_days_ago=30, 50 | ) 51 | print(f"Fetched {len(model_names)} models") 52 | 53 | # Filter based on parameter count 54 | valid_models = filter_model_size(model_names, max_size_billion * 1_000) 55 | print(f"Filtered down to {len(valid_models)} models") 56 | 57 | valid_models = [ 58 | # "facebook/opt-125m", 59 | # "TheBloke/Llama-2-7B-Chat-GPTQ", 60 | # "EleutherAI/pythia-160m", 61 | # "TheBloke/Llama-2-7B-Chat-AWQ", 62 | "meta-llama/Meta-Llama-3-8B", 63 | ] 64 | 65 | # Run benchmarks 66 | model_status: dict[str, dict] = {} 67 | bench_all_models( 68 | framework, 69 | QUANT_TYPES, 70 | valid_models, 71 | model_status, 72 | limit, 73 | run_always, 74 | QUERY_TEXT, 75 | MAX_TOKENS, 76 | TEMPERATURE, 77 | FLASK_PORT, 78 | ) 79 | 80 | # Print summary 81 | print("All benchmark runs are finished.") 82 | print_summary(model_status) 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | -------------------------------------------------------------------------------- /cloud/.env.example: -------------------------------------------------------------------------------- 1 | # Misc 2 | HF_HUB_CACHE="" 3 | TOKENIZERS_PARALLELISM=False 4 | MONGODB_URI="" 5 | MONGODB_DB="" 6 | MONGODB_COLLECTION_LOCAL="" 7 | MONGODB_COLLECTION_CLOUD="" 8 | 9 | # Port for cloud models server 10 | FASTAPI_PORT_CLOUD="5004" 11 | 12 | # Redis 13 | REDIS_HOST="localhost" 14 | REDIS_PORT=6379 15 | REDIS_DB=0 16 | REDIS_PASSWORD="" 17 | 18 | # Keys and tokens 19 | HF_TOKEN="" 20 | OPENAI_API_KEY="" 21 | ANTHROPIC_API_KEY="" 22 | GROQ_API_KEY="" 23 | 24 | # AWS/Bedrock stuff 25 | AWS_PROFILE="" 26 | 27 | # Anyscale 28 | ANYSCALE_BASE_URL="" 29 | ANYSCALE_API_KEY="" 30 | 31 | # Together 32 | TOGETHER_BASE_URL="" 33 | TOGETHER_API_KEY="" 34 | 35 | # Openrouter 36 | OPENROUTER_API_KEY="" 37 | OPENROUTER_BASE_URL="" 38 | 39 | # Databricks 40 | DATABRICKS_API_KEY="" 41 | DATABRICKS_BASE_URL="" 42 | 43 | # Azure Stuff (will need a key/url for each model you want to run) 44 | AZURE_L7_API_KEY="" 45 | AZURE_L7_POST_URL="" 
46 | AZURE_L13_API_KEY="" 47 | AZURE_L13_POST_URL="" 48 | AZURE_L70_API_KEY="" 49 | AZURE_L70_POST_URL="" 50 | AZURE_MISTRAL_L_API_KEY="" 51 | AZURE_MISTRAL_L_POST_URL="" 52 | AZURE_COHERE_CMD_R_PLUS_API_KEY="" 53 | AZURE_COHERE_CMD_R_PLUS_POST_URL="" 54 | 55 | # Runpod stuff 56 | RUNPOD_API_KEY="" 57 | RUNPOD_L13_URL="" 58 | 59 | # Fireworks stuff 60 | FIREWORKS_BASE_URL="" 61 | FIREWORKS_API_KEY="" 62 | 63 | # deepinfra stuff 64 | DEEPINFRA_BASE_URL="" 65 | DEEPINFRA_API_KEY="" 66 | -------------------------------------------------------------------------------- /cloud/Dockerfile-cloud: -------------------------------------------------------------------------------- 1 | FROM ubuntu:22.04 2 | 3 | # Set environment variables 4 | ENV PYTHONDONTWRITEBYTECODE 1 5 | ENV PYTHONUNBUFFERED 1 6 | # ENV GOOGLE_APPLICATION_CREDENTIALS=/service-account.json 7 | 8 | # Set up GCloud SDK 9 | # RUN curl -sSL https://sdk.cloud.google.com | bash 10 | 11 | # Install system dependencies 12 | RUN apt-get update && \ 13 | apt-get install -y \ 14 | git \ 15 | python3-pip 16 | 17 | # Install api stuff 18 | RUN pip3 install pymongo pynvml flask pytz fastapi uvicorn httpx typer redis gunicorn tenacity filelock huggingface_hub 19 | 20 | # Install cloud providers 21 | RUN pip3 install boto3 google-cloud-aiplatform openai tiktoken anthropic groq 22 | 23 | # Set the working directory 24 | WORKDIR /app 25 | 26 | # Copy over repo code 27 | COPY . /app 28 | 29 | # Set the entrypoint 30 | CMD ["sh", "-c", "gunicorn -w 8 -k uvicorn.workers.UvicornWorker llm_bench.cloud.server:app --bind 0.0.0.0:${FASTAPI_PORT_CLOUD}"] -------------------------------------------------------------------------------- /cloud/README.md: -------------------------------------------------------------------------------- 1 | # How to Run 2 | 1. Create an `.env` file from `.env.example` and fill in the necessary details. You only need to fill in the specific providers you want. Some providers such as Azure annoyingly give you a unique `base_url` and `api_key` for each model you want to use. You can find these details in the Azure portal. 3 | 2. Run `docker compose -f docker-compose.cloud.yml up --build` to start the server. Everything should build and start up correctly, but I have only tested this on my VPS running Ubuntu 22.04. If you run into any issues, please let me know. 4 | 3. The FastAPI server will now be accepting requests on the port set by the env var `FASTAPI_PORT_CLOUD`, which I have set to `5004`. You can change this in the `.env` file. The simplest way to build the requests is using the API I put together in `api/llm_bench`. Go into `api/`, create a new Python environment, and install via `poetry install` using the `pyproject.toml` file. 5 | 4. Once the environment is installed and activated, you can run benchmarks with `run_cloud.py`. For example, to run `openai` you can call `python run_cloud.py --providers openai`. You can optionally use `--providers all` and it will run all providers concurrently, limited by how many `uvicorn` workers you have set in the `Dockerfile-cloud` file. I have set this to `8` by default. You can also skip the client script and POST to the `/benchmark` endpoint yourself, as shown in the example after this list. 6 | 5. The results will be primarily logged to MongoDB. This isn't my ideal method, but the initial stages involved a lot of schema changes and this was simple to manage at the beginning. I will be moving to a more structured database in the future. You can also check the `logs/` folder for a printed-out version of the results. This is useful for debugging and checking the results of the runs as they happen.
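For reference, `run_cloud.py` is only a thin client: it serializes a `BenchmarkRequest` (defined in `api/llm_bench/types.py`) and POSTs it as JSON to the server's `/benchmark` endpoint, so you can also exercise the server directly. The snippet below is a minimal sketch of one such request; it assumes the container is up, `FASTAPI_PORT_CLOUD` is left at `5004`, and the matching provider key is set in `.env`. Adjust the port, provider, and model to match your `.env` and `cloud/models.json`.

```python
# Minimal sketch: send a single benchmark request straight to the FastAPI server.
import httpx

payload = {
    "provider": "openai",
    "model": "gpt-4o-mini",
    "query": "Tell a long and happy story about the history of the world.",
    "max_tokens": 64,
    "temperature": 0.1,
    "run_always": True,  # re-run even if this provider/model was benchmarked before
    "debug": False,
}

response = httpx.post("http://localhost:5004/benchmark", json=payload, timeout=60.0)
response.raise_for_status()
print(response.json())  # e.g. {"status": "success", "metrics": {...}} per BenchmarkResponse
```

This is essentially what `run_cloud.py` sends for each model, minus its retry and logging wrappers.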
7 | 8 | 9 | ## Logging 10 | There are a few ways to view progress: 11 | - `run_cloud.py` output: this simply shows a pass or fail for each request with some basic information. 12 | - `./logs/benchmarks_cloud.log`: shows all the metrics and information for each request as a simple text file. 13 | - `./logs/benchmarks_cloud.json`: same as above but formats into a more machine parsable json file. 14 | - MongoDB: if you set the `.env` variable `LOG_TO_MONGO` as True, the logs will be stored in a MongoDB database. If this is the case, you will be required to also provide `MONGODB_URI`, `MONGODB_DB`, and `MONGODB_COLLECTION_CLOUD` in the `.env` file. This is my primary logging system as it enables me to view realtime results in my react frontend I also built for this project at [llm-benchmarks.com](https://llm-benchmarks.com). It also enables the feature where the benchmarking script will check if a particular model config has been run before and skip it unless you set the run param `--run-always`. This is useful for debugging and testing new models. 15 | -------------------------------------------------------------------------------- /cloud/docker-compose.cloud.yml: -------------------------------------------------------------------------------- 1 | services: 2 | bench_cloud: 3 | image: bench_cloud_img 4 | build: 5 | context: . 6 | dockerfile: Dockerfile-cloud 7 | env_file: 8 | - .env 9 | volumes: 10 | - ../api/llm_bench:/app/llm_bench 11 | - ./logs/:/var/log 12 | - /home/drose/.aws:/root/.aws 13 | - /home/drose/.config/gcloud:/root/.config/gcloud 14 | network_mode: host 15 | -------------------------------------------------------------------------------- /cloud/models.json: -------------------------------------------------------------------------------- 1 | { 2 | "openai": [ 3 | "gpt-4", 4 | "gpt-4-turbo", 5 | "gpt-4o", 6 | "gpt-4o-2024-05-13", 7 | "gpt-4o-2024-08-06", 8 | "gpt-4o-2024-11-20", 9 | "gpt-4o-mini", 10 | "gpt-4o-mini-2024-07-18", 11 | "gpt-4.5-preview", 12 | "gpt-3.5-turbo", 13 | "gpt-3.5-turbo-instruct" 14 | ], 15 | "anthropic": [ 16 | "claude-2.1", 17 | "claude-3-haiku-20240307", 18 | "claude-3-5-haiku-20241022", 19 | "claude-3-sonnet-20240229", 20 | "claude-3-5-sonnet-20240620", 21 | "claude-3-7-sonnet-20250219", 22 | "claude-3-opus-20240229" 23 | ], 24 | "bedrock": [ 25 | "anthropic.claude-3-haiku-20240307-v1:0", 26 | "us.anthropic.claude-3-5-haiku-20241022-v1:0", 27 | "anthropic.claude-3-sonnet-20240229-v1:0", 28 | "us.anthropic.claude-3-5-sonnet-20240620-v1:0", 29 | "us.anthropic.claude-3-7-sonnet-20250219-v1:0", 30 | "us.anthropic.claude-3-opus-20240229-v1:0", 31 | "amazon.titan-text-lite-v1", 32 | "meta.llama3-8b-instruct-v1:0", 33 | "meta.llama3-70b-instruct-v1:0", 34 | "meta.llama3-1-8b-instruct-v1:0", 35 | "meta.llama3-1-70b-instruct-v1:0", 36 | "meta.llama3-1-405b-instruct-v1:0", 37 | "us.meta.llama3-2-1b-instruct-v1:0", 38 | "us.meta.llama3-2-3b-instruct-v1:0", 39 | "us.meta.llama3-2-11b-instruct-v1:0", 40 | "us.meta.llama3-2-90b-instruct-v1:0", 41 | "mistral.mistral-7b-instruct-v0:2", 42 | "mistral.mixtral-8x7b-instruct-v0:1", 43 | "mistral.mistral-small-2402-v1:0", 44 | "mistral.mistral-large-2402-v1:0", 45 | "cohere.command-r-v1:0", 46 | "cohere.command-r-plus-v1:0", 47 | "amazon.nova-pro-v1:0", 48 | "amazon.nova-lite-v1:0", 49 | "amazon.nova-micro-v1:0" 50 | ], 51 | "vertex": [ 52 | "gemini-1.0-pro", 53 | "gemini-1.5-pro-002", 54 | "gemini-1.5-flash-002", 55 | "claude-3-haiku@20240307", 56 | "claude-3-5-haiku@20241022", 57 | "claude-3-sonnet@20240229", 58 | 
"claude-3-5-sonnet@20240620", 59 | "claude-3-opus@20240229", 60 | "meta/llama-3.2-90b-vision-instruct-maas" 61 | ], 62 | "together": [ 63 | "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", 64 | "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo", 65 | "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo", 66 | "meta-llama/Llama-Vision-Free", // 1b 67 | "meta-llama/Llama-3.2-3B-Instruct-Turbo", 68 | "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo", 69 | "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo", 70 | "mistralai/Mistral-7B-Instruct-v0.2", 71 | "mistralai/Mixtral-8x7B-Instruct-v0.1", 72 | "codellama/CodeLlama-34b-Instruct-hf", 73 | "qwen/Qwen2.5-7B-Instruct-Turbo", 74 | "qwen/Qwen2.5-72B-Instruct-Turbo", 75 | "google/gemma-2-9b-it", 76 | "google/gemma-2-27b-it", 77 | "mistralai/Mistral-7B-Instruct-v0.3", 78 | "deepseek-ai/deepseek-llm-67b-chat", 79 | "deepseek-ai/DeepSeek-V3", 80 | "deepseek-ai/DeepSeek-R1" 81 | ], 82 | "azure": [ 83 | "llama-2-7b-chat", 84 | "llama-2-13b-chat", 85 | "llama-2-70b-chat", 86 | "mistral-large", 87 | "cohere-cmd-r-plus" 88 | ], 89 | "fireworks": [ 90 | "accounts/fireworks/models/llama-v3-8b-instruct", 91 | "accounts/fireworks/models/llama-v3-8b-instruct-hf", 92 | "accounts/fireworks/models/llama-v3-70b-instruct", 93 | "accounts/fireworks/models/llama-v3-70b-instruct-hf", 94 | "accounts/fireworks/models/llama-v3p1-8b-instruct", 95 | "accounts/fireworks/models/llama-v3p1-70b-instruct", 96 | "accounts/fireworks/models/llama-v3p1-405b-instruct", 97 | "accounts/fireworks/models/llama-v3p2-1b-instruct", 98 | "accounts/fireworks/models/llama-v3p2-3b-instruct", 99 | "accounts/fireworks/models/llama-v3p2-11b-vision-instruct", 100 | "accounts/fireworks/models/llama-v3p2-90b-vision-instruct", 101 | "accounts/fireworks/models/mixtral-8x7b-instruct", 102 | "accounts/fireworks/models/mixtral-8x7b-instruct-hf", 103 | "accounts/fireworks/models/mixtral-8x22b-instruct", 104 | "accounts/fireworks/models/qwen2p5-72b-instruct", 105 | "accounts/fireworks/models/qwen2p5-coder-32b-instruct", 106 | "accounts/fireworks/models/starcoder-7b", 107 | "accounts/fireworks/models/starcoder-16b", 108 | "accounts/fireworks/models/gemma2-9b-it", 109 | "accounts/yi-01-ai/models/yi-large", 110 | "accounts/fireworks/models/deepseek-v3", 111 | "accounts/fireworks/models/deepseek-r1" 112 | ], 113 | "deepinfra": [ 114 | "meta-llama/Llama-2-70b-chat-hf", 115 | "meta-llama/Llama-2-7b-chat-hf", 116 | "codellama/CodeLlama-34b-Instruct-hf", 117 | "meta-llama/Meta-Llama-3-8B-Instruct", 118 | "meta-llama/Meta-Llama-3-70B-Instruct", 119 | "meta-llama/Meta-Llama-3.1-8B-Instruct", 120 | "meta-llama/Meta-Llama-3.1-70B-Instruct", 121 | "meta-llama/Meta-Llama-3.1-405B-Instruct", 122 | "meta-llama/Llama-3.2-1B-Instruct", 123 | "meta-llama/Llama-3.2-3B-Instruct", 124 | "meta-llama/Llama-3.2-11B-Vision-Instruct", 125 | "meta-llama/Llama-3.2-90B-Vision-Instruct", 126 | "mistralai/Mixtral-8x22B-Instruct-v0.1", 127 | "mistralai/Mistral-7B-Instruct-v0.2", 128 | "databricks/dbrx-instruct", 129 | "bigcode/starcoder2-15b", 130 | "Qwen/Qwen2.5-72B-Instruct", 131 | "deepseek-ai/DeepSeek-R1" 132 | ], 133 | "groq": [ 134 | "llama3-8b-8192", 135 | "llama3-70b-8192", 136 | "llama-3.1-8b-instant", 137 | "llama-3.3-70b-versatile", 138 | "llama-guard-3-8b", 139 | "llama-3.2-1b-preview", 140 | "llama-3.2-3b-preview", 141 | "llama-3.2-11b-vision-preview", 142 | "llama-3.2-90b-vision-preview", 143 | "llama-3.3-70b-specdec", 144 | "llama-3.1-70b-specdec", 145 | "mixtral-8x7b-32768", 146 | "gemma2-9b-it" 147 | ], 148 | "lambda":[ 149 | 
"llama3.1-8b-instruct", 150 | "llama3.1-70b-instruct-fp8", 151 | "llama3.1-405b-instruct-fp8", 152 | "llama3.2-3b-instruct" 153 | ] 154 | } 155 | -------------------------------------------------------------------------------- /local/.env.example: -------------------------------------------------------------------------------- 1 | # HuggingFace 2 | HF_TOKEN="" 3 | HF_HUB_CACHE="" 4 | 5 | # Logging 6 | LOG_TO_MONGO=True 7 | MONGODB_URI="" 8 | MONGODB_DB="" 9 | MONGODB_COLLECTION_LOCAL="" 10 | 11 | # GPU 12 | GPU_DEVICE="0" 13 | 14 | # Flask Ports 15 | FLASK_PORT_HF_TF=5000 16 | FLASK_PORT_HF_TGI=5001 17 | 18 | # Redis 19 | REDIS_HOST="" 20 | REDIS_PORT="6379" 21 | REDIS_DB=0 22 | REDIS_PASSWORD="" -------------------------------------------------------------------------------- /local/README.md: -------------------------------------------------------------------------------- 1 | # Local Benchmarks 2 | 3 | ## Overview 4 | This directory contains benchmarks that are run locally on the machine. Each framework runs within a standalone container that are all integrated into a single docker compose file. The benchmarks are run using a relevant script from the /scripts directory. The current options are: 5 | - run_hf.py 6 | - can be used for both Transformers and Text-Generation-Inference benchmarks 7 | - **Options:** 8 | - `--framework TEXT`: LLM API to call. Must be one of 'transformers', 'hf-tgi' 9 | - `--limit INTEGER`: Limit the number of models run. 10 | - `--max-size-billion INTEGER`: Maximum size of models in billion parameters. 11 | - `--run-always`: Flag to always run benchmarks. 12 | - `--fetch-new-models`: Fetch latest HF-Hub models. 13 | - `--help`: Show this message and exit. 14 | - run_vllm.py 15 | - Used for the VLLM benchmarks 16 | - **Options:** 17 | - `--framework TEXT`: Framework to use, must be 'vllm'. 18 | - `--limit INTEGER`: Limit the number of models fetched. 19 | - `--max-size-billion INTEGER`: Maximum size of models in billion parameters. 20 | - `--run-always`: Flag to always run benchmarks. 21 | - `--fetch-new-models`: Fetch latest HF-Hub models. 22 | - `--help`: Show this message and exit. 23 | - run_gguf.py 24 | - Used for the GGUF/llama-cpp benchmarks 25 | - **Options:** 26 | - `--limit INTEGER`: Limit the number of models to run for debugging. 27 | - `--run-always`: Flag to always run benchmarks. 28 | - `--log-level TEXT`: Log level for the benchmarking server. 29 | - `--help`: Show this message and exit. 
30 | 31 | 32 | ## Getting Started 33 | It should be as simple as setting the correct `.env` variables and building the docker containers with the following commands: 34 | ```bash 35 | cp .env.example .env # fill out the .env file with the correct values 36 | docker compose -f docker-compose.local.yml up --build 37 | ``` 38 | 39 | ## Example Usage 40 | 41 | To run the Huggingface Transformers benchmark, use the following command: 42 | ```bash 43 | python local/run_hf.py --framework transformers --limit 5 --max-size-billion 10 --run-always 44 | ``` 45 | 46 | To run the Huggingface Text-Generation-Inference benchmark, use the following command: 47 | ```bash 48 | python run_hf.py --framework hf-tgi --limit 5 --max-size-billion 10 --run-always 49 | ``` 50 | 51 | To run the VLLM benchmark, use the following command: 52 | ```bash 53 | python run_vllm.py --framework vllm --limit 5 --max-size-billion 10 --run-always 54 | ``` 55 | 56 | To run the GGUF/llama-cpp benchmark, use the following command: 57 | ```bash 58 | python run_gguf.py --limit 5 --run-always --log-level DEBUG 59 | ``` 60 | 61 | ## Logging 62 | There are a few ways to view progress: 63 | - `run_{framework}.py` output: this simply shows a pass or fail for each request with some basic information. 64 | - `./logs/benchmarks_local.log`: shows all the metrics and information for each request as a simple text file. 65 | - `./logs/benchmarks_local.json`: same as above but formats into a more machine parsable json file. 66 | - MongoDB: if you set the `.env` variable `LOG_TO_MONGO` as True, the logs will be stored in a MongoDB database. If this is the case, you will be required to also provide `MONGODB_URI`, `MONGODB_DB`, and `MONGODB_COLLECTION_LOCAL` in the `.env` file. This is my primary logging system as it enables me to view realtime results in my react frontend I also built for this project at [llm-benchmarks.com](https://llm-benchmarks.com). It also enables the feature where the benchmarking script will check if a particular model config has been run before and skip it unless you set the run param `--run-always`. This is useful for debugging and testing new models. 67 | -------------------------------------------------------------------------------- /local/docker-compose.local.yml: -------------------------------------------------------------------------------- 1 | services: 2 | bench_transformers: 3 | image: bench_transformers_img 4 | build: 5 | context: . 6 | dockerfile: huggingface/Dockerfile-huggingface 7 | runtime: nvidia 8 | env_file: 9 | - .env 10 | deploy: 11 | resources: 12 | reservations: 13 | devices: 14 | - driver: nvidia 15 | count: 1 16 | capabilities: [gpu] 17 | volumes: 18 | - ../api/llm_bench:/app/llm_bench 19 | - ./huggingface/llm_bench_hf:/app/llm_bench_hf 20 | - ./logs/:/var/log 21 | - /gemini/hf:/models/hf 22 | network_mode: host 23 | environment: 24 | - CUDA_VISIBLE_DEVICES=${GPU_DEVICE} 25 | bench_gguf: 26 | image: bench_gguf_img 27 | build: 28 | context: . 29 | dockerfile: gguf/Dockerfile-gguf 30 | runtime: nvidia 31 | env_file: 32 | - .env 33 | deploy: 34 | resources: 35 | reservations: 36 | devices: 37 | - driver: nvidia 38 | count: 1 39 | capabilities: [gpu] 40 | volumes: 41 | - ../api/llm_bench:/app/llm_bench 42 | - ./gguf/llm_bench_gguf:/app/llm_bench_gguf 43 | - ./logs/:/var/log 44 | - /gemini/gguf:/models/gguf 45 | network_mode: host 46 | environment: 47 | - CUDA_VISIBLE_DEVICES=${GPU_DEVICE} 48 | bench_vllm: 49 | image: bench_vllm_img 50 | build: 51 | context: . 
52 | dockerfile: vllm/Dockerfile-vllm 53 | runtime: nvidia 54 | env_file: 55 | - .env 56 | deploy: 57 | resources: 58 | reservations: 59 | devices: 60 | - driver: nvidia 61 | count: 1 62 | capabilities: [gpu] 63 | volumes: 64 | - ../api/llm_bench:/app/llm_bench 65 | - ./vllm/llm_bench_vllm:/app/llm_bench_vllm 66 | - ./logs/:/var/log 67 | - /gemini/hf:/models/hf 68 | network_mode: host 69 | environment: 70 | - CUDA_VISIBLE_DEVICES=${GPU_DEVICE} 71 | -------------------------------------------------------------------------------- /local/gguf/Dockerfile-gguf: -------------------------------------------------------------------------------- 1 | # Base image 2 | FROM nvidia/cuda:12.1.1-base-ubuntu22.04 3 | 4 | # Set environment variables 5 | ENV PYTHONDONTWRITEBYTECODE 1 6 | ENV PYTHONUNBUFFERED 1 7 | 8 | # Install necessary packages and dependencies 9 | RUN apt-get update && apt-get upgrade -y && \ 10 | apt-get install -y --no-install-recommends \ 11 | git \ 12 | build-essential \ 13 | python3 \ 14 | python3-pip \ 15 | cuda-cudart-12-1 \ 16 | cuda-libraries-12-1 \ 17 | cuda-nvtx-12-1 && \ 18 | rm -rf /var/lib/apt/lists/* 19 | 20 | # Set up pip, poetry, and install project dependencies 21 | WORKDIR /app 22 | COPY ./gguf/pyproject.toml /app/ 23 | RUN pip3 install --upgrade pip poetry && \ 24 | poetry config virtualenvs.create false && \ 25 | poetry install && \ 26 | pip3 install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121 27 | 28 | # Create log directory and file 29 | RUN mkdir -p /var/log_gguf && touch /var/log_gguf/gguf.log 30 | 31 | # Run Flask 32 | ENTRYPOINT ["python3", "-m", "llm_bench_gguf.server"] -------------------------------------------------------------------------------- /local/gguf/create_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Define the model names 4 | model_names=( 5 | "Llama-2-7b-chat-hf" 6 | "Llama-2-13b-chat-hf" 7 | "Llama-2-70b-chat-hf" 8 | ) 9 | # Activate the conda environment 10 | activate bench 11 | 12 | # Set the environment variable 13 | export HF_HUB_CACHE="/gemini/tmp" 14 | 15 | # Loop over the model names 16 | for model in "${model_names[@]}" 17 | do 18 | python ./llm_bench_gguf/create_model.py -m meta-llama/${model} 19 | done 20 | -------------------------------------------------------------------------------- /local/gguf/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ "poetry-core",] 3 | build-backend = "poetry.core.masonry.api" 4 | 5 | [tool.poetry] 6 | description = "API and generation for benchmarking llama-cpp / GGUF models" 7 | authors = ["David Rose "] 8 | package-mode = false 9 | 10 | [tool.poetry.dependencies] 11 | python = ">=3.10,<3.13" 12 | flask = "^3.0.3" 13 | numpy = "^1.26.4" 14 | click = "^8.1.7" 15 | pymongo = "^4.6.3" 16 | pynvml = "^11.5.0" 17 | pytz = "^2024.1" 18 | -------------------------------------------------------------------------------- /local/huggingface/Dockerfile-huggingface: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:12.1.1-base-ubuntu22.04 2 | 3 | # Set environment variables to avoid Python creating .pyc files and buffering stdout and stderr 4 | ENV PYTHONDONTWRITEBYTECODE 1 5 | ENV PYTHONUNBUFFERED 1 6 | ENV DEBIAN_FRONTEND=noninteractive 7 | 8 | # Install system dependencies 9 | RUN apt-get update && \ 10 | apt-get install -y --no-install-recommends python3-pip 
python3-distutils git && \ 11 | apt-get clean 12 | 13 | # Upgrade pip and install poetry 14 | RUN pip3 install --upgrade pip poetry 15 | 16 | # Set the working directory 17 | WORKDIR /app 18 | 19 | # Disable Poetry's virtual environment 20 | RUN poetry config virtualenvs.create false 21 | 22 | # Install dependencies using Poetry 23 | COPY ./huggingface/pyproject.toml /app/ 24 | RUN poetry install 25 | 26 | # Install AutoGPTQ (had issue with poetry install) 27 | RUN pip3 install auto-gptq 28 | 29 | # Set the default command to run the Flask app 30 | ENTRYPOINT python3 -m llm_bench.local.hf.server --port=${FLASK_PORT_HF_TF} -------------------------------------------------------------------------------- /local/huggingface/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ "poetry-core",] 3 | build-backend = "poetry.core.masonry.api" 4 | 5 | [tool.poetry] 6 | description = "API and generation for benchmarking HuggingFace (TGI, Transformers)" 7 | authors = ["David Rose "] 8 | package-mode = false 9 | 10 | [tool.poetry.dependencies] 11 | python = ">=3.10,<3.13" 12 | torch = "^2.3.1" 13 | transformers = "^4.41.2" 14 | accelerate = "^0.31.0" 15 | bitsandbytes = "^0.43.1" 16 | optimum = "^1.20.0" 17 | flask = "^3.0.3" 18 | numpy = "^1.26.4" 19 | click = "^8.1.7" 20 | pymongo = "^4.7.3" 21 | pynvml = "^11.5.0" 22 | pytz = "^2024.1" 23 | -------------------------------------------------------------------------------- /local/huggingface/tgi_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Get the directory of the current script 4 | script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)" 5 | 6 | # Load environment variables from .env file in the parent directory 7 | env $(grep -v '^#' "$script_dir/../.env" | xargs) 8 | 9 | # Get the port from the environment variable or use default value 10 | port=${FLASK_PORT_HF_TGI:5001} 11 | 12 | # Activate the conda environment 13 | source "$(conda info --base)/etc/profile.d/conda.sh" 14 | conda activate bench_hf 15 | 16 | # set env for LOG_DIR 17 | export LOG_DIR="/home/drose/git/llm-benchmarks/local/logs/" 18 | 19 | # Run the Python script 20 | python "/home/drose/git/llm-benchmarks/api/llm_bench/local/hf/server.py" --port "$port" -------------------------------------------------------------------------------- /local/vllm/Dockerfile-vllm: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:12.1.1-devel-ubuntu22.04 2 | 3 | # Set environment variables 4 | ENV PYTHONDONTWRITEBYTECODE 1 5 | ENV PYTHONUNBUFFERED 1 6 | 7 | # Install system dependencies 8 | RUN apt-get update && apt-get upgrade -y && \ 9 | apt-get install -y --no-install-recommends \ 10 | git \ 11 | build-essential \ 12 | python3 \ 13 | python3-pip \ 14 | cuda-cudart-12-1 \ 15 | cuda-libraries-12-1 \ 16 | cuda-nvtx-12-1 && \ 17 | rm -rf /var/lib/apt/lists/* 18 | 19 | # Upgrade pip 20 | WORKDIR /app 21 | COPY ./vllm/pyproject.toml /app/ 22 | RUN pip3 install --upgrade pip poetry && \ 23 | poetry config virtualenvs.create false && \ 24 | poetry install 25 | 26 | # Install flash-attn using pip 27 | RUN pip3 install packaging ninja && \ 28 | pip3 install flash-attn --no-build-isolation 29 | 30 | # Run Flask 31 | ENTRYPOINT [ "python3", "-m", "llm_bench_vllm.api" ] 32 | -------------------------------------------------------------------------------- /local/vllm/llm_bench_vllm/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .generate import generate 2 | from .server import call_vllm 3 | 4 | __all__ = [ 5 | "call_vllm", 6 | "generate", 7 | ] 8 | -------------------------------------------------------------------------------- /local/vllm/llm_bench_vllm/generate.py: -------------------------------------------------------------------------------- 1 | """LLM generation and benchmarking for vLLM library.""" 2 | 3 | import gc 4 | import logging.config 5 | import os 6 | import time 7 | from datetime import datetime 8 | 9 | import torch 10 | from llm_bench.config import ModelConfig 11 | from llm_bench.utils import get_vram_usage 12 | from vllm import LLM 13 | from vllm import SamplingParams 14 | from vllm.distributed.parallel_state import destroy_model_parallel 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | GPU_DEVICE = os.environ.get("GPU_DEVICE") 19 | assert GPU_DEVICE, "GPU_DEVICE environment variable not set" 20 | 21 | 22 | def generate(config: ModelConfig, run_config: dict) -> dict: 23 | """Run vLLM inference and return metrics.""" 24 | 25 | quant_str = f"{config.quantization_method}_{config.quantization_bits}" or "none" 26 | logger.info(f"Running benchmark: {config.model_name}, quant: {quant_str}") 27 | 28 | output_tokens, vram_usage, time_0, time_1 = 0, 0, 0, 0 29 | model = None 30 | 31 | with torch.no_grad(): 32 | try: 33 | # Load model 34 | model = LLM( 35 | model=config.model_name, 36 | download_dir=os.environ.get("HF_HUB_CACHE"), 37 | trust_remote_code=True, 38 | ) 39 | # Set params 40 | sampling_params = SamplingParams(temperature=0.1, top_p=0.95) 41 | 42 | # Generate tokens 43 | time_0 = time.time() 44 | output = model.generate(run_config["query"], sampling_params) 45 | time_1 = time.time() 46 | 47 | # Collect metrics 48 | output_tokens = len(output[0].outputs[0].token_ids) 49 | vram_usage = get_vram_usage(int(GPU_DEVICE)) 50 | except Exception as e: 51 | logger.error(f"Error during vLLM generation: {e}") 52 | raise e 53 | finally: 54 | # Ensure model and CUDA memory is cleaned up 55 | destroy_model_parallel() 56 | if model is not None: 57 | del model 58 | gc.collect() 59 | if torch.cuda.is_available(): 60 | torch.cuda.empty_cache() 61 | 62 | time.sleep(3) 63 | 64 | metrics = { 65 | "gen_ts": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 66 | "requested_tokens": [run_config["max_tokens"]], 67 | "output_tokens": [output_tokens], 68 | "gpu_mem_usage": [vram_usage], 69 | "generate_time": [time_1 - time_0], 70 | "tokens_per_second": [output_tokens / (time_1 - time_0) if time_1 > time_0 else 0], 71 | } 72 | 73 | return metrics 74 | -------------------------------------------------------------------------------- /local/vllm/llm_bench_vllm/server.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from datetime import datetime 4 | from typing import Tuple 5 | from typing import Union 6 | 7 | from flask import Flask 8 | from flask import jsonify 9 | from flask import request 10 | from flask.wrappers import Response 11 | from llm_bench.config import ModelConfig 12 | from llm_bench.config import MongoConfig 13 | from llm_bench.logging import log_metrics 14 | from llm_bench.utils import check_and_clean_space 15 | from llm_bench.utils import has_existing_run 16 | 17 | from llm_bench_vllm.generate import generate 18 | 19 | LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO") 20 | LOG_DIR = os.environ.get("LOG_DIR", "/var/log") 21 | LOG_FILE_TXT = 
os.path.join(LOG_DIR, "benchmarks_local.log") 22 | LOG_FILE_JSON = os.path.join(LOG_DIR, "benchmarks_local.json") 23 | LOG_TO_MONGO = os.getenv("LOG_TO_MONGO", "False").lower() in ("true", "1", "t") 24 | MONGODB_URI = os.environ.get("MONGODB_URI") 25 | MONGODB_DB = os.environ.get("MONGODB_DB") 26 | MONGODB_COLLECTION_LOCAL = os.environ.get("MONGODB_COLLECTION_LOCAL") 27 | 28 | CACHE_DIR = os.environ.get("HF_HUB_CACHE") 29 | assert CACHE_DIR, "HF_HUB_CACHE environment variable not set" 30 | 31 | logging.basicConfig(filename=os.path.join(LOG_DIR, LOG_FILE_TXT), level=LOG_LEVEL) 32 | logger = logging.getLogger(__name__) 33 | 34 | DO_SAMPLE = False 35 | FRAMEWORK = "vllm" 36 | FLASK_PORT = 5002 37 | 38 | app = Flask(__name__) 39 | 40 | 41 | @app.route("/benchmark", methods=["POST"]) 42 | def call_vllm() -> Union[Response, Tuple[Response, int]]: 43 | """Enables the use a POST request to call the benchmarking function.""" 44 | try: 45 | model_name = request.form.get("model_name", type=str) 46 | query = request.form.get("query", default=None, type=str) 47 | quant_method = request.form.get("quant_method", default=None, type=str) 48 | quant_bits = request.form.get("quant_bits", default=None, type=str) 49 | max_tokens = request.form.get("max_tokens", default=256, type=int) 50 | temperature = request.form.get("temperature", default=0.1, type=float) 51 | 52 | run_always_str = request.form.get("run_always", "False").lower() 53 | run_always = run_always_str == "true" 54 | 55 | quant_str = f"{quant_method}_{quant_bits}" if quant_method is not None else "none" 56 | logger.info(f"Received request for model: {model_name}, quant: {quant_str}") 57 | logger.info(f"run_always: {run_always}") 58 | 59 | assert model_name, "model_name not set" 60 | 61 | # Create model config 62 | model_config = ModelConfig( 63 | framework=FRAMEWORK, 64 | model_name=model_name, 65 | run_ts=datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 66 | model_dtype="torch.float16", 67 | quantization_method=quant_method, 68 | quantization_bits=quant_bits, 69 | temperature=temperature, 70 | misc={"do_sample": DO_SAMPLE}, 71 | ) 72 | 73 | run_config = { 74 | "query": query, 75 | "max_tokens": max_tokens, 76 | } 77 | 78 | # Check if model has been benchmarked before 79 | if LOG_TO_MONGO: 80 | mongo_config = MongoConfig( 81 | uri=MONGODB_URI, # type: ignore 82 | db=MONGODB_DB, # type: ignore 83 | collection=MONGODB_COLLECTION_LOCAL, # type: ignore 84 | ) 85 | existing_run = has_existing_run(model_name, model_config, mongo_config) 86 | if existing_run: 87 | if run_always: 88 | logger.info(f"Model has been benchmarked before: {model_name}, quant: {quant_str}") 89 | logger.info("Re-running benchmark anyway because run_always is True") 90 | else: 91 | logger.info(f"Model has been benchmarked before: {model_name}, quant: {quant_str}") 92 | return jsonify({"status": "skipped", "reason": "model has been benchmarked before"}), 200 93 | else: 94 | logger.info(f"Model has not been benchmarked before: {model_name}, quant: {quant_str}") 95 | 96 | # Check and clean disk space if needed 97 | check_and_clean_space(directory=CACHE_DIR, threshold=90.0) 98 | 99 | metrics = generate(model_config, run_config) 100 | assert metrics, "metrics is empty" 101 | 102 | # Log metrics 103 | log_metrics( 104 | model_type="local", 105 | config=model_config, 106 | metrics=metrics, 107 | file_path=os.path.join(LOG_DIR, LOG_FILE_JSON), 108 | log_to_mongo=LOG_TO_MONGO, 109 | mongo_uri=MONGODB_URI, 110 | mongo_db=MONGODB_DB, 111 | mongo_collection=MONGODB_COLLECTION_LOCAL, 112 | ) 113 | 
114 | # print metrics 115 | logger.info(f"===== Model: {model_name} =====") 116 | logger.info(f"Requested tokens: {run_config['max_tokens']}") 117 | logger.info(f"Output tokens: {metrics['output_tokens'][0]}") 118 | logger.info(f"GPU mem usage: {(metrics['gpu_mem_usage'][0] / 1024**3) :.2f}GB") 119 | logger.info(f"Generate time: {metrics['generate_time'][0]:.2f} s") 120 | logger.info(f"Tokens per second: {metrics['tokens_per_second'][0]:.2f}") 121 | 122 | return jsonify({"status": "success"}), 200 123 | except Exception as e: 124 | logger.exception(f"Error in call_benchmark: {e}") 125 | return jsonify({"status": "error", "reason": str(e)}), 500 126 | 127 | 128 | app.run(host="0.0.0.0", port=FLASK_PORT) 129 | -------------------------------------------------------------------------------- /local/vllm/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "poetry-core", 4 | "packaging", 5 | "wheel" 6 | ] 7 | build-backend = "poetry.core.masonry.api" 8 | 9 | [tool.poetry] 10 | name = "llm-bench-vllm" 11 | description = "API and generation for benchmarking VLLM" 12 | version = "0.1.0" 13 | authors = ["David Rose "] 14 | package-mode = false 15 | 16 | [tool.poetry.dependencies] 17 | python = ">=3.10,<3.13" 18 | vllm = "^0.4.0.post1" 19 | flask = "^3.0.3" 20 | numpy = "^1.26.4" 21 | click = "^8.1.7" 22 | pymongo = "^4.6.3" 23 | pynvml = "^11.5.0" 24 | pytz = "^2024.1" 25 | einops = "^0.7.0" -------------------------------------------------------------------------------- /models_config.yaml: -------------------------------------------------------------------------------- 1 | gpt2_models: 2 | - gpt2 3 | - gpt2-medium 4 | - gpt2-large 5 | - gpt2-xl 6 | 7 | dolly_models: 8 | - databricks/dolly-v2-3b 9 | - databricks/dolly-v2-7b 10 | - databricks/dolly-v2-12b 11 | 12 | llama_models: 13 | - decapoda-research/llama-7b-hf 14 | - decapoda-research/llama-13b-hf 15 | - decapoda-research/llama-30b-hf 16 | - decapoda-research/llama-65b-hf 17 | - chainyo/alpaca-lora-7b 18 | - chavinlo/alpaca-13b 19 | 20 | openai_text_models: 21 | - text-curie-001 22 | - text-babbage-001 23 | - text-ada-001 24 | - text-davinci-002 25 | - text-davinci-003 26 | 27 | openai_chat_models: 28 | - gpt-3.5-turbo 29 | - gpt-4 30 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "llm-benchmarks" 3 | version = "0.1.0" 4 | description = "Benchmarking local and cloud LLMs" 5 | authors = [ 6 | { name = "David Rose", email = "david@drose.io" } 7 | ] 8 | requires-python = ">=3.11,<3.13" 9 | 10 | dependencies = [ 11 | "httpx>=0.27.0", 12 | "typer>=0.12.3", 13 | "tenacity>=9.0.0", 14 | "python-dotenv>=1.0.1", 15 | "json5>=0.9.25", 16 | "fastapi>=0.110.2", 17 | "uvicorn>=0.29.0", 18 | "flask>=3.0.3", 19 | "pymongo>=4.6.3", 20 | "pynvml>=11.5.0", 21 | "redis>=5.0.3", 22 | "pytz>=2024.1", 23 | "gunicorn>=23.0.0", 24 | "filelock>=3.17.0", 25 | "openai>=1.23.2", 26 | "huggingface-hub>=0.29.0", 27 | "boto3>=1.36.24", 28 | "google-cloud-aiplatform>=1.81.0", 29 | "openai>=1.63.2", 30 | "tiktoken>=0.9.0", 31 | "anthropic>=0.46.0", 32 | "groq>=0.18.0", 33 | "pre-commit>=3.7.0", 34 | "ipykernel>=6.29.4", 35 | ] 36 | 37 | [build-system] 38 | requires = ["hatchling"] 39 | build-backend = "hatchling.build" 40 | 41 | [tool.hatch.build.targets.wheel] 42 | packages = ["api"] 43 | 44 | [tool.ruff] 45 | line-length = 120 46 | 47 
| [tool.ruff.lint] 48 | select = ["I", "E", "F", "Q000"] 49 | 50 | [tool.ruff.lint.isort] 51 | force-single-line = true 52 | 53 | [tool.ruff.lint.flake8-quotes] 54 | docstring-quotes = "double" 55 | inline-quotes = "double" 56 | multiline-quotes = "double" 57 | 58 | [tool.ruff.format] 59 | quote-style = "double" 60 | indent-style = "space" 61 | line-ending = "lf" 62 | 63 | -------------------------------------------------------------------------------- /static/benchmarks_all_models.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cipher982/llm-benchmarks/39a1519086c0f993ccc340b875d4da09e3e22cf6/static/benchmarks_all_models.png -------------------------------------------------------------------------------- /static/benchmarks_large_models.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cipher982/llm-benchmarks/39a1519086c0f993ccc340b875d4da09e3e22cf6/static/benchmarks_large_models.png -------------------------------------------------------------------------------- /static/dolly2_compare_size_and_quant_inference.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cipher982/llm-benchmarks/39a1519086c0f993ccc340b875d4da09e3e22cf6/static/dolly2_compare_size_and_quant_inference.png -------------------------------------------------------------------------------- /static/falcon_compare_quantization_inference.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cipher982/llm-benchmarks/39a1519086c0f993ccc340b875d4da09e3e22cf6/static/falcon_compare_quantization_inference.png -------------------------------------------------------------------------------- /static/ggml-hf-llama-compare.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cipher982/llm-benchmarks/39a1519086c0f993ccc340b875d4da09e3e22cf6/static/ggml-hf-llama-compare.png -------------------------------------------------------------------------------- /static/gpt2_compare_quantization_inference.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cipher982/llm-benchmarks/39a1519086c0f993ccc340b875d4da09e3e22cf6/static/gpt2_compare_quantization_inference.png -------------------------------------------------------------------------------- /static/llama_compare_size_and_quant_inference.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cipher982/llm-benchmarks/39a1519086c0f993ccc340b875d4da09e3e22cf6/static/llama_compare_size_and_quant_inference.png -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | max-complexity = 10 4 | --------------------------------------------------------------------------------