├── .gitignore ├── .python-version ├── Dockerfile ├── LICENSE ├── README.md ├── compose.yaml ├── proxy_server.py ├── pyproject.toml └── src └── llama_cpp_runner ├── __init__.py ├── main.py └── py.typed /.gitignore: -------------------------------------------------------------------------------- 1 | llama_cpp_cache/ 2 | cache/ 3 | dist/ 4 | models/* -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.11 2 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim 2 | 3 | WORKDIR /app 4 | 5 | # Install only essential packages and clean up in one layer to reduce image size 6 | RUN apt-get update && apt-get install -y --no-install-recommends \ 7 | curl \ 8 | wget \ 9 | git \ 10 | build-essential \ 11 | && apt-get clean \ 12 | && rm -rf /var/lib/apt/lists/* 13 | 14 | # Copy only necessary files 15 | COPY pyproject.toml README.md LICENSE /app/ 16 | COPY src/ /app/src/ 17 | 18 | # Install the package in development mode and required dependencies 19 | RUN pip install --no-cache-dir -e . && pip install --no-cache-dir requests fastapi uvicorn 20 | 21 | # Create volume mount points 22 | VOLUME /models 23 | VOLUME /cache 24 | 25 | # Create proxy server script directly in the Dockerfile 26 | RUN echo 'import os\n\ 27 | import uvicorn\n\ 28 | from fastapi import FastAPI, Request\n\ 29 | from fastapi.responses import StreamingResponse, JSONResponse\n\ 30 | from llama_cpp_runner.main import LlamaCpp\n\ 31 | \n\ 32 | app = FastAPI(title="LlamaCpp Proxy")\n\ 33 | \n\ 34 | # Initialize the LlamaCpp class\n\ 35 | models_dir = os.environ.get("MODELS_DIR", "/models")\n\ 36 | cache_dir = os.environ.get("CACHE_DIR", "/cache")\n\ 37 | verbose = os.environ.get("VERBOSE", "true").lower() == "true"\n\ 38 | timeout = int(os.environ.get("TIMEOUT_MINUTES", "30"))\n\ 39 | \n\ 40 | print(f"Models directory: {models_dir}")\n\ 41 | print(f"Cache directory: {cache_dir}")\n\ 42 | \n\ 43 | # Create the LlamaCpp instance\n\ 44 | llama_runner = LlamaCpp(\n\ 45 | models_dir=models_dir,\n\ 46 | cache_dir=cache_dir, \n\ 47 | verbose=verbose, \n\ 48 | timeout_minutes=timeout\n\ 49 | )\n\ 50 | \n\ 51 | @app.get("/")\n\ 52 | def read_root():\n\ 53 | """Get server status and list of available models."""\n\ 54 | return {"status": "running", "models": llama_runner.list_models()}\n\ 55 | \n\ 56 | @app.post("/v1/chat/completions")\n\ 57 | async def chat_completions(request: Request):\n\ 58 | """Forward chat completion requests to the LlamaCpp server."""\n\ 59 | try:\n\ 60 | body = await request.json()\n\ 61 | \n\ 62 | if "model" not in body:\n\ 63 | return JSONResponse(\n\ 64 | status_code=400,\n\ 65 | content={"error": "Model not specified in request"}\n\ 66 | )\n\ 67 | \n\ 68 | try:\n\ 69 | result = llama_runner.chat_completion(body)\n\ 70 | \n\ 71 | # Handle streaming responses\n\ 72 | if body.get("stream", False):\n\ 73 | async def generate():\n\ 74 | for line in result:\n\ 75 | if line:\n\ 76 | yield f"data: {line}\\n\\n"\n\ 77 | yield "data: [DONE]\\n\\n"\n\ 78 | \n\ 79 | return StreamingResponse(generate(), media_type="text/event-stream")\n\ 80 | else:\n\ 81 | return result\n\ 82 | except Exception as e:\n\ 83 | return JSONResponse(\n\ 84 | status_code=500,\n\ 85 | content={"error": str(e)}\n\ 86 | )\n\ 87 | except Exception as e:\n\ 88 | return 
JSONResponse(\n\ 89 | status_code=400,\n\ 90 | content={"error": f"Invalid request: {str(e)}"}\n\ 91 | )\n\ 92 | \n\ 93 | @app.get("/models")\n\ 94 | def list_models():\n\ 95 | """List all available models."""\n\ 96 | return {"models": llama_runner.list_models()}\n\ 97 | \n\ 98 | if __name__ == "__main__":\n\ 99 | print("Starting LlamaCpp Proxy Server on port 3636")\n\ 100 | models = llama_runner.list_models()\n\ 101 | print(f"Available models: {models}")\n\ 102 | if not models:\n\ 103 | print("WARNING: No models found in the models directory.")\n\ 104 | uvicorn.run(app, host="0.0.0.0", port=3636)' > /app/proxy_server.py 105 | 106 | # Expose the proxy server port 107 | EXPOSE 3636 108 | 109 | # Set environment variables 110 | ENV PYTHONUNBUFFERED=1 111 | ENV MODELS_DIR=/models 112 | ENV CACHE_DIR=/cache 113 | ENV VERBOSE=true 114 | ENV TIMEOUT_MINUTES=30 115 | 116 | # Command to run when the container starts 117 | CMD ["python", "/app/proxy_server.py"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Open WebUI (Timothy Jaeryang Baek) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🦙 llama-cpp-runner 2 | 3 | `llama-cpp-runner` is the ultimate Python library for running [llama.cpp](https://github.com/ggerganov/llama.cpp) with zero hassle. It automates the process of downloading prebuilt binaries from the upstream repo, keeping you always **up to date** with the latest developments. All while requiring no complicated setups—everything works **out-of-the-box**. 4 | 5 | ## Key Features 🌟 6 | 7 | 1. **Always Up-to-Date**: Automatically fetches the latest prebuilt binaries from the upstream llama.cpp GitHub repo. No need to worry about staying current. 8 | 2. **Zero Dependencies**: No need to manually install compilers or build binaries. Everything is handled for you during installation. 9 | 3. **Model Flexibility**: Seamlessly load and serve **GGUF** models stored locally or from Hugging Face with ease. 10 | 4. **Built-in HTTP Server**: Automatically spins up a server for chat interactions and manages idle timeouts to save resources. 11 | 5. 
**Cross-Platform Support**: Works on **Windows**, **Linux**, and **macOS** with automatic detection for AVX/AVX2/AVX512/ARM architectures. 12 | 13 | 14 | ## Why Use `llama-cpp-runner`? 15 | 16 | - **Out-of-the-box experience**: Forget about setting up complex environments for building. Just install and get started! 🛠️ 17 | - **Streamlined Model Serving**: Effortlessly manage multiple models and serve them with an integrated HTTP server. 18 | - **Fast Integration**: Use prebuilt binaries from upstream so you can spend more time building and less time troubleshooting. 19 | 20 | ## Installation 🚀 21 | 22 | Installing `llama-cpp-runner` is quick and easy! Just use pip: 23 | 24 | ```bash 25 | pip install llama-cpp-runner 26 | ``` 27 | 28 | ## Optional Installation (Docker) 29 | 30 | Clone the repository 31 | 32 | ```bash 33 | git clone https://github.com/open-webui/llama-cpp-runner 34 | ``` 35 | 36 | Build and run 37 | 38 | ```bash 39 | docker compose up -d 40 | ``` 41 | 42 | ## Usage 📖 43 | 44 | ### Initialize the Runner 45 | 46 | ```python 47 | from llama_cpp_runner import LlamaCpp 48 | 49 | llama_runner = LlamaCpp(models_dir="path/to/models", verbose=True) 50 | 51 | # List all available GGUF models 52 | models = llama_runner.list_models() 53 | print("Available Models:", models) 54 | ``` 55 | 56 | ### Chat Completion 57 | 58 | ```python 59 | response = llama_runner.chat_completion({ 60 | "model": "your-model-name.gguf", 61 | "messages": [{"role": "user", "content": "Hello, Llama!"}], 62 | "stream": False 63 | }) 64 | 65 | print(response) 66 | ``` 67 | 68 | ## How It Works 🛠️ 69 | 70 | 1. Automatically detects your system architecture (e.g., AVX, AVX2, ARM) and platform. 71 | 2. Downloads and extracts the prebuilt llama.cpp binaries from the official repo. 72 | 3. Spins up a lightweight HTTP server for chat interactions. 73 | 74 | ## Advantages 👍 75 | 76 | - **Hassle-Free**: No need to compile binaries or manage system-specific dependencies. 77 | - **Latest Features, Always**: Stay up to date with llama.cpp’s improvements with every release. 78 | - **Optimized for Your System**: Automatically fetches the best binary for your architecture. 79 | 80 | ## Supported Platforms 🖥️ 81 | 82 | - Windows 83 | - macOS 84 | - Linux 85 | 86 | ## Contributing 💻 87 | 88 | We’d love your contributions! Bug reports, feature requests, and pull requests are all welcome. 89 | 90 | ## License 📜 91 | 92 | This library is open-source and distributed under the MIT license. 93 | 94 | Happy chatting with llama.cpp! 🚀 -------------------------------------------------------------------------------- /compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | owui-llama-cpp-runner: 3 | build: . 4 | container_name: owui-llama-cpp-runner 5 | ports: 6 | - "3636:3636" 7 | volumes: 8 | - ./models:/models # local mount 9 | - ./cache:/cache # local mount 10 | # Remove . 
from the paths above to use native docker volumes 11 | environment: 12 | - MODELS_DIR=/models 13 | - CACHE_DIR=/cache 14 | - VERBOSE=true 15 | - TIMEOUT_MINUTES=30 16 | - LD_LIBRARY_PATH=/cache/llama_cpp/build/bin 17 | restart: unless-stopped 18 | healthcheck: 19 | test: ["CMD", "curl", "-f", "http://localhost:3636/"] 20 | interval: 30s 21 | timeout: 10s 22 | retries: 3 23 | start_period: 40s -------------------------------------------------------------------------------- /proxy_server.py: -------------------------------------------------------------------------------- 1 | import os 2 | import uvicorn 3 | from fastapi import FastAPI, Request 4 | from fastapi.responses import StreamingResponse, JSONResponse 5 | from llama_cpp_runner.main import LlamaCpp 6 | 7 | app = FastAPI(title="LlamaCpp Proxy") 8 | 9 | # Initialize the LlamaCpp class 10 | models_dir = os.environ.get("MODELS_DIR", "/models") 11 | cache_dir = os.environ.get("CACHE_DIR", "/cache") 12 | verbose = os.environ.get("VERBOSE", "true").lower() == "true" 13 | timeout = int(os.environ.get("TIMEOUT_MINUTES", "30")) 14 | 15 | print(f"Models directory: {models_dir}") 16 | print(f"Cache directory: {cache_dir}") 17 | 18 | # Create the LlamaCpp instance 19 | llama_runner = LlamaCpp( 20 | models_dir=models_dir, 21 | cache_dir=cache_dir, 22 | verbose=verbose, 23 | timeout_minutes=timeout 24 | ) 25 | 26 | @app.get("/") 27 | def read_root(): 28 | """Get server status and list of available models.""" 29 | return {"status": "running", "models": llama_runner.list_models()} 30 | 31 | @app.post("/v1/chat/completions") 32 | async def chat_completions(request: Request): 33 | """Forward chat completion requests to the LlamaCpp server.""" 34 | try: 35 | body = await request.json() 36 | 37 | if "model" not in body: 38 | return JSONResponse( 39 | status_code=400, 40 | content={"error": "Model not specified in request"} 41 | ) 42 | 43 | try: 44 | result = llama_runner.chat_completion(body) 45 | 46 | # Handle streaming responses 47 | if body.get("stream", False): 48 | async def generate(): 49 | for line in result: 50 | if line: 51 | yield f"data: {line}\n\n" 52 | yield "data: [DONE]\n\n" 53 | 54 | return StreamingResponse(generate(), media_type="text/event-stream") 55 | else: 56 | return result 57 | except Exception as e: 58 | return JSONResponse( 59 | status_code=500, 60 | content={"error": str(e)} 61 | ) 62 | except Exception as e: 63 | return JSONResponse( 64 | status_code=400, 65 | content={"error": f"Invalid request: {str(e)}"} 66 | ) 67 | 68 | @app.get("/models") 69 | def list_models(): 70 | """List all available models.""" 71 | return {"models": llama_runner.list_models()} 72 | 73 | if __name__ == "__main__": 74 | print("Starting LlamaCpp Proxy Server on port 3636") 75 | models = llama_runner.list_models() 76 | print(f"Available models: {models}") 77 | if not models: 78 | print("WARNING: No models found in the models directory.") 79 | uvicorn.run(app, host="0.0.0.0", port=3636) -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "llama-cpp-runner" 3 | version = "0.0.1" 4 | description = "Quick and easy way to run large language models (LLMs) with llama.cpp" 5 | readme = "README.md" 6 | authors = [ 7 | { name = "Timothy Jaeryang Baek", email = "tim@openwebui.com" } 8 | ] 9 | requires-python = ">=3.11" 10 | dependencies = [] 11 | 12 | [build-system] 13 | requires = ["hatchling"] 14 | build-backend = 
"hatchling.build" 15 | -------------------------------------------------------------------------------- /src/llama_cpp_runner/__init__.py: -------------------------------------------------------------------------------- 1 | from llama_cpp_runner.main import LlamaCppServer 2 | 3 | 4 | def hello() -> str: 5 | return "Hello from llama-cpp-runner!" 6 | -------------------------------------------------------------------------------- /src/llama_cpp_runner/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import requests 4 | import zipfile 5 | import json 6 | import subprocess 7 | import threading 8 | import stat 9 | import time 10 | import socket 11 | 12 | import os 13 | import platform 14 | import requests 15 | import zipfile 16 | import json 17 | 18 | 19 | class LlamaCpp: 20 | def __init__( 21 | self, 22 | models_dir, 23 | cache_dir="~/.llama_cpp_runner", 24 | verbose=False, 25 | timeout_minutes=5, 26 | pinned_version=None, 27 | ): 28 | """ 29 | Initialize the LlamaCpp class. 30 | Args: 31 | models_dir (str): Directory where GGUF models are stored. 32 | cache_dir (str): Directory to store llama.cpp binaries and related assets. Defaults to '~/.llama_cpp_runner'. 33 | verbose (bool): Whether to enable verbose logging. 34 | timeout_minutes (int): Timeout for shutting down idle servers. 35 | pinned_version (str or None): Pinned release version of llama.cpp binaries. 36 | """ 37 | self.models_dir = models_dir 38 | self.cache_dir = os.path.expanduser( 39 | cache_dir 40 | ) # Ensure cache is in a fixed location 41 | self.verbose = verbose 42 | self.timeout_minutes = timeout_minutes 43 | self.pinned_version = pinned_version # Optional pinned version 44 | self.llama_cpp_path = ( 45 | self._install_llama_cpp_binaries() 46 | ) # Install the required binaries 47 | self.servers = ( 48 | {} 49 | ) # Maintain a mapping of model names to LlamaCppServer instances 50 | 51 | def list_models(self): 52 | """ 53 | List all GGUF models available in the `models_dir`. 54 | Returns: 55 | list: A list of model names (files ending in ".gguf"). 56 | """ 57 | if not os.path.exists(self.models_dir): 58 | self._log(f"Models directory does not exist: {self.models_dir}") 59 | return [] 60 | models = [f for f in os.listdir(self.models_dir) if f.endswith(".gguf")] 61 | self._log(f"Available models: {models}") 62 | return models 63 | 64 | def chat_completion(self, body): 65 | """ 66 | Handle chat completion requests. 67 | Args: 68 | body (dict): The payload for the chat completion request. It must contain the "model" key. 69 | Returns: 70 | dict or generator: Response from the server (non-streaming or streaming mode). 71 | """ 72 | if "model" not in body: 73 | raise ValueError("The request body must contain a 'model' key.") 74 | model_name = body["model"] 75 | gguf_path = os.path.join(self.models_dir, model_name) 76 | if not os.path.exists(gguf_path): 77 | raise FileNotFoundError(f"Model file not found: {gguf_path}") 78 | # Check if the server for this model is already running 79 | if model_name not in self.servers or not self.servers[model_name]._server_url: 80 | self._log(f"Initializing a new server for model: {model_name}") 81 | self.servers[model_name] = self._create_server(gguf_path) 82 | server = self.servers[model_name] 83 | return server.chat_completion(body) 84 | 85 | def _create_server(self, gguf_path): 86 | """ 87 | Create a new LlamaCppServer instance for the given model. 88 | Args: 89 | gguf_path (str): Path to the GGUF model file. 
90 | Returns: 91 | LlamaCppServer: A new server instance. 92 | """ 93 | return LlamaCppServer( 94 | llama_cpp_path=self.llama_cpp_path, 95 | gguf_path=gguf_path, 96 | cache_dir=self.cache_dir, 97 | verbose=self.verbose, 98 | timeout_minutes=self.timeout_minutes, 99 | ) 100 | 101 | def _install_llama_cpp_binaries(self): 102 | """ 103 | Download and install llama.cpp binaries. 104 | Returns: 105 | str: Path to the installed llama.cpp binaries. 106 | """ 107 | self._log("Installing llama.cpp binaries...") 108 | try: 109 | # Use pinned version if provided, otherwise fetch the latest release 110 | release_info = self._get_release_info() 111 | assets = release_info["assets"] 112 | asset = self._get_appropriate_asset(assets) 113 | if not asset: 114 | raise RuntimeError("No appropriate binary found for your system.") 115 | asset_name = asset["name"] 116 | 117 | # Check if cached binaries match the required version 118 | if self._check_cache(release_info, asset): 119 | self._log("Using cached llama.cpp binaries.") 120 | else: 121 | if not self._internet_available(): 122 | raise RuntimeError( 123 | "No cached binary available and unable to fetch from the internet." 124 | ) 125 | self._download_and_unzip(asset["browser_download_url"], asset_name) 126 | self._update_cache_info(release_info, asset) 127 | 128 | except Exception as e: 129 | self._log(f"Error during binary installation: {e}") 130 | raise 131 | 132 | return os.path.join(self.cache_dir, "llama_cpp") 133 | 134 | def _get_release_info(self): 135 | """ 136 | Fetch metadata of the specified release (pinned or latest) from GitHub. 137 | Returns: 138 | dict: Release information. 139 | """ 140 | if self.pinned_version: 141 | api_url = f"https://api.github.com/repos/ggerganov/llama.cpp/releases/tags/{self.pinned_version}" 142 | else: 143 | api_url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest" 144 | 145 | if not self._internet_available(): 146 | # Fall back to cache if no internet access 147 | raise RuntimeError("No internet access and no cached version available.") 148 | 149 | response = requests.get(api_url) 150 | if response.status_code == 200: 151 | return response.json() 152 | else: 153 | error_reason = f"Failed to fetch release info: HTTP {response.status_code}" 154 | raise RuntimeError(error_reason) 155 | 156 | def _get_appropriate_asset(self, assets): 157 | """ 158 | Select the appropriate binary asset for the current system. 159 | Args: 160 | assets (list): List of asset metadata from the release. 161 | Returns: 162 | dict or None: Matching asset metadata, or None if no match found. 
163 | """ 164 | system = platform.system().lower() 165 | machine = platform.machine().lower() 166 | processor = platform.processor() 167 | if system == "windows": 168 | if "arm" in machine: 169 | return next((a for a in assets if "win-arm64" in a["name"]), None) 170 | elif "avx512" in processor: 171 | return next((a for a in assets if "win-avx512-x64" in a["name"]), None) 172 | elif "avx2" in processor: 173 | return next((a for a in assets if "win-avx2-x64" in a["name"]), None) 174 | elif "avx" in processor: 175 | return next((a for a in assets if "win-avx-x64" in a["name"]), None) 176 | else: 177 | return next((a for a in assets if "win-noavx-x64" in a["name"]), None) 178 | elif system == "darwin": 179 | if "arm" in machine: 180 | return next((a for a in assets if "macos-arm64" in a["name"]), None) 181 | else: 182 | return next((a for a in assets if "macos-x64" in a["name"]), None) 183 | elif system == "linux": 184 | return next((a for a in assets if "ubuntu-x64" in a["name"]), None) 185 | return None 186 | 187 | def _check_cache(self, release_info, asset): 188 | """ 189 | Check whether the latest binaries are already cached. 190 | Args: 191 | release_info (dict): Metadata of the latest release. 192 | asset (dict): Metadata of the selected asset. 193 | Returns: 194 | bool: True if the cached binary matches the required release, False otherwise. 195 | """ 196 | cache_info_path = os.path.join(self.cache_dir, "cache_info.json") 197 | if os.path.exists(cache_info_path): 198 | with open(cache_info_path, "r") as f: 199 | cache_info = json.load(f) 200 | if ( 201 | cache_info.get("tag_name") == release_info["tag_name"] 202 | and cache_info.get("asset_name") == asset["name"] 203 | ): 204 | return True 205 | return False 206 | 207 | def _download_and_unzip(self, url, asset_name): 208 | """ 209 | Download and extract llama.cpp binaries. 210 | Args: 211 | url (str): URL of the asset to download. 212 | asset_name (str): Name of the asset file. 213 | """ 214 | os.makedirs(self.cache_dir, exist_ok=True) 215 | zip_path = os.path.join(self.cache_dir, asset_name) 216 | self._log(f"Downloading binary from: {url}") 217 | response = requests.get(url) 218 | if response.status_code == 200: 219 | with open(zip_path, "wb") as file: 220 | file.write(response.content) 221 | self._log(f"Successfully downloaded: {asset_name}") 222 | else: 223 | raise RuntimeError(f"Failed to download binary: {url}") 224 | extract_dir = os.path.join(self.cache_dir, "llama_cpp") 225 | with zipfile.ZipFile(zip_path, "r") as zip_ref: 226 | zip_ref.extractall(extract_dir) 227 | self._log(f"Extracted binaries to: {extract_dir}") 228 | 229 | def _update_cache_info(self, release_info, asset): 230 | """ 231 | Update cache metadata with the downloaded release info. 232 | Args: 233 | release_info (dict): Metadata of the latest release. 234 | asset (dict): Metadata of the downloaded asset. 235 | """ 236 | cache_info = {"tag_name": release_info["tag_name"], "asset_name": asset["name"]} 237 | cache_info_path = os.path.join(self.cache_dir, "cache_info.json") 238 | with open(cache_info_path, "w") as f: 239 | json.dump(cache_info, f) 240 | 241 | def _internet_available(self): 242 | """ 243 | Check for internet connectivity. 244 | Returns: 245 | bool: True if the internet is accessible, False otherwise. 246 | """ 247 | try: 248 | requests.get("https://api.github.com", timeout=3) 249 | return True 250 | except requests.ConnectionError: 251 | return False 252 | 253 | def _log(self, message): 254 | """ 255 | Print a log message if verbosity is enabled. 
256 | Args: 257 | message (str): Log message to print. 258 | """ 259 | if self.verbose: 260 | print(f"[LlamaCpp] {message}") 261 | 262 | 263 | class LlamaCppServer: 264 | def __init__( 265 | self, 266 | llama_cpp_path=None, 267 | gguf_path=None, 268 | cache_dir="./cache", 269 | hugging_face=False, 270 | verbose=False, 271 | timeout_minutes=5, 272 | ): 273 | """ 274 | Initialize the LlamaCppServer. 275 | 276 | Args: 277 | llama_cpp_path (str): Path to the llama.cpp binaries. 278 | gguf_path (str): Path to the GGUF model file. 279 | cache_dir (str): Directory to store llama.cpp binaries and related files. 280 | hugging_face (bool): Whether the model is hosted on Hugging Face. 281 | verbose (bool): Enable verbose logging. 282 | timeout_minutes (int): Timeout duration for shutting down idle servers. 283 | """ 284 | self.verbose = verbose 285 | self.hugging_face = hugging_face 286 | self.cache_dir = cache_dir 287 | self.llama_cpp_path = llama_cpp_path 288 | self.gguf_path = gguf_path 289 | self.server_process = None 290 | self._server_url = None 291 | self._server_thread = None 292 | self.port = None 293 | self.last_used = time.time() # Tracks the last time the server was used 294 | self.timeout_minutes = timeout_minutes 295 | self._auto_terminate_thread = None 296 | 297 | # Validate llama_cpp_path 298 | if llama_cpp_path is None: 299 | raise ValueError("llama_cpp_path must be provided.") 300 | elif not os.path.exists(llama_cpp_path): 301 | raise FileNotFoundError( 302 | f"Specified llama_cpp_path not found: {llama_cpp_path}" 303 | ) 304 | 305 | # Validate gguf_path 306 | if gguf_path and not os.path.exists(gguf_path) and not hugging_face: 307 | raise FileNotFoundError(f"Specified gguf_path not found: {gguf_path}") 308 | 309 | # Start the server if gguf_path is provided 310 | if gguf_path: 311 | self._start_server_in_thread() 312 | self._start_auto_terminate_thread() 313 | 314 | @property 315 | def url(self): 316 | """Return the URL where the server is running.""" 317 | if self._server_url is None: 318 | # If the server URL is not available, ensure the server spins up again 319 | self._log("Server is off. Restarting the server...") 320 | self._start_server_in_thread() 321 | self._start_auto_terminate_thread() 322 | # Wait for the thread to start the server 323 | while self._server_url is None: 324 | time.sleep(1) 325 | 326 | # Update the last-used timestamp whenever this property is accessed 327 | self.last_used = time.time() 328 | return self._server_url 329 | 330 | def kill(self): 331 | """Kill the server process and clean up.""" 332 | if self.server_process and self.server_process.poll() is None: 333 | self.server_process.terminate() 334 | self.server_process.wait() 335 | self.server_process = None 336 | self._server_url = None 337 | self.port = None 338 | self._log("Llama server successfully killed.") 339 | 340 | if self._server_thread and self._server_thread.is_alive(): 341 | self._server_thread.join() 342 | 343 | if self._auto_terminate_thread and self._auto_terminate_thread.is_alive(): 344 | self._auto_terminate_thread.join() 345 | 346 | def chat_completion(self, payload): 347 | """ 348 | Send a chat completion request to the server. 349 | 350 | Args: 351 | payload (dict): Payload for the chat completion request. 352 | 353 | Returns: 354 | dict or generator: Response from the server (non-streaming or streaming mode). 355 | """ 356 | if self._server_url is None: 357 | self._log( 358 | "Server is off. Restarting the server before making the request..." 
359 | ) 360 | self._start_server_in_thread() 361 | self._start_auto_terminate_thread() 362 | # Wait for the thread to start the server 363 | while self._server_url is None: 364 | time.sleep(1) 365 | 366 | # Reset the last-used timestamp 367 | self.last_used = time.time() 368 | endpoint = f"{self._server_url}/v1/chat/completions" 369 | self._log(f"Sending chat completion request to {endpoint}...") 370 | 371 | # Check if streaming is enabled in the payload 372 | if payload.get("stream", False): 373 | self._log(f"Streaming mode enabled. Returning a generator.") 374 | response = requests.post(endpoint, json=payload, stream=True) 375 | if response.status_code == 200: 376 | # Return a generator for streaming responses 377 | def stream_response(): 378 | for line in response.iter_lines(decode_unicode=True): 379 | yield line 380 | 381 | return stream_response() 382 | else: 383 | self._log( 384 | f"Request failed with status code: {response.status_code} - {response.text}" 385 | ) 386 | response.raise_for_status() 387 | else: 388 | # Non-streaming mode 389 | response = requests.post(endpoint, json=payload) 390 | if response.status_code == 200: 391 | self._log("Request successful.") 392 | return response.json() 393 | else: 394 | self._log( 395 | f"Request failed with status code: {response.status_code} - {response.text}" 396 | ) 397 | response.raise_for_status() 398 | 399 | def _start_server_in_thread(self): 400 | """Start the server in a separate thread.""" 401 | 402 | def target(): 403 | try: 404 | self._start_server() 405 | except Exception as e: 406 | self._log(f"Failed to start server: {e}") 407 | 408 | self._server_thread = threading.Thread(target=target, daemon=True) 409 | self._server_thread.start() 410 | 411 | def _start_auto_terminate_thread(self): 412 | """Start the auto-terminate thread that monitors idle time.""" 413 | 414 | def monitor_idle_time(): 415 | while True: 416 | time.sleep(10) 417 | if ( 418 | self.server_process and self.server_process.poll() is None 419 | ): # Server is running 420 | elapsed_time = time.time() - self.last_used 421 | if elapsed_time > self.timeout_minutes * 60: 422 | self._log( 423 | "Server has been idle for too long. Auto-terminating..." 
424 | ) 425 | self.kill() 426 | break 427 | 428 | self._auto_terminate_thread = threading.Thread( 429 | target=monitor_idle_time, daemon=True 430 | ) 431 | self._auto_terminate_thread.start() 432 | 433 | def _start_server(self): 434 | """Start the llama-server.""" 435 | if not self.gguf_path or ( 436 | not self.hugging_face and not os.path.exists(self.gguf_path) 437 | ): 438 | raise ValueError( 439 | f"GGUF model path is not specified or invalid: {self.gguf_path}" 440 | ) 441 | 442 | server_binary = os.path.join( 443 | self.llama_cpp_path, "build", "bin", "llama-server" 444 | ) 445 | if not os.path.exists(server_binary): 446 | raise FileNotFoundError(f"Server binary not found: {server_binary}") 447 | 448 | # Ensure the binary is executable 449 | self._set_executable(server_binary) 450 | 451 | # Find an available port 452 | self.port = self._find_available_port(start_port=10000) 453 | if self.port is None: 454 | raise RuntimeError("No available port found between 10000 and 11000.") 455 | 456 | self._log(f"Starting server with binary: {server_binary}") 457 | self._log(f"Using GGUF path: {self.gguf_path}") 458 | self._log(f"Using port: {self.port}") 459 | 460 | commands = [server_binary] 461 | if self.hugging_face: 462 | commands.extend(["-hf", self.gguf_path, "--port", str(self.port)]) 463 | else: 464 | commands.extend(["-m", self.gguf_path, "--port", str(self.port)]) 465 | 466 | self.server_process = subprocess.Popen( 467 | commands, 468 | stdout=subprocess.PIPE, 469 | stderr=subprocess.STDOUT, 470 | universal_newlines=True, 471 | ) 472 | 473 | # Wait for the server to confirm it is ready by monitoring its output 474 | self._server_url = None 475 | for line in iter(self.server_process.stdout.readline, ""): 476 | self._log(line.strip()) 477 | if "listening on" in line: 478 | self._server_url = f"http://localhost:{self.port}" 479 | self._log(f"Server is now accessible at {self._server_url}") 480 | break 481 | 482 | if not self._server_url: 483 | raise RuntimeError("Failed to confirm server is running.") 484 | 485 | def _find_available_port(self, start_port=10000, end_port=11000): 486 | """Find an available port between `start_port` and `end_port`.""" 487 | for port in range(start_port, end_port): 488 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: 489 | if sock.connect_ex(("localhost", port)) != 0: 490 | return port 491 | return None 492 | 493 | def _set_executable(self, file_path): 494 | """Ensure the file at `file_path` is executable.""" 495 | if platform.system() != "Windows": 496 | current_mode = os.stat(file_path).st_mode 497 | os.chmod( 498 | file_path, current_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH 499 | ) 500 | 501 | def _log(self, message): 502 | """Print a log message if verbosity is enabled.""" 503 | if self.verbose: 504 | print(f"[LlamaCppServer] {message}") 505 | -------------------------------------------------------------------------------- /src/llama_cpp_runner/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-webui/llama-cpp-runner/300cf7f5713807ad50cd61a374a95207dfd06cd8/src/llama_cpp_runner/py.typed --------------------------------------------------------------------------------
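
The Docker image built from this repository serves the FastAPI proxy defined in `proxy_server.py` on port 3636 (published in `compose.yaml`). Below is a minimal client sketch for that proxy, assuming the stack is running on `localhost:3636`, that `requests` is installed on the client side, and that the model file name is a placeholder for a GGUF file dropped into `./models`; the response shape assumed here is the OpenAI-compatible JSON that llama.cpp's `llama-server` returns.

```python
import requests

BASE_URL = "http://localhost:3636"  # host port published in compose.yaml

# GET / reports server status plus the GGUF models discovered in /models.
print(requests.get(f"{BASE_URL}/", timeout=10).json())

# POST /v1/chat/completions is forwarded to a per-model llama.cpp server.
response = requests.post(
    f"{BASE_URL}/v1/chat/completions",
    json={
        "model": "your-model-name.gguf",  # placeholder GGUF file name
        "messages": [{"role": "user", "content": "Hello, Llama!"}],
        "stream": False,
    },
    timeout=300,  # the first request may be slow while the model server starts
)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])
```

The first request for a given model makes `LlamaCpp` spawn a dedicated `llama-server` process, so it can take noticeably longer than later calls; idle servers are shut down after `TIMEOUT_MINUTES` and restarted on demand.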
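
When `"stream": True` is set, `LlamaCpp.chat_completion` returns a generator of raw server-sent-event lines proxied from the underlying `llama-server` process rather than a parsed dictionary. Here is a minimal sketch of consuming that stream, assuming the OpenAI-style `data: {...}` / `data: [DONE]` framing emitted by llama.cpp's server; the model file name and the `pinned_version` tag are placeholders, and the import path mirrors `proxy_server.py` (the package `__init__` currently re-exports only `LlamaCppServer`).

```python
import json

from llama_cpp_runner.main import LlamaCpp

llama_runner = LlamaCpp(
    models_dir="path/to/models",   # directory containing your *.gguf files
    verbose=True,
    timeout_minutes=10,            # idle per-model servers are terminated after this
    pinned_version="b4823",        # hypothetical release tag; omit to track the latest release
)

stream = llama_runner.chat_completion({
    "model": "your-model-name.gguf",  # placeholder GGUF file name
    "messages": [{"role": "user", "content": "Hello, Llama!"}],
    "stream": True,
})

# Each yielded item is one SSE line such as 'data: {...}'; empty keep-alive
# lines are skipped and 'data: [DONE]' marks the end of the stream.
for line in stream:
    if not line or not line.startswith("data: "):
        continue
    payload = line[len("data: "):]
    if payload.strip() == "[DONE]":
        break
    chunk = json.loads(payload)
    delta = chunk["choices"][0].get("delta", {}).get("content", "")
    print(delta, end="", flush=True)
print()
```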