├── .gitignore ├── .python-version ├── Dockerfile ├── LICENSE ├── README.md ├── compose.yaml ├── proxy_server.py ├── pyproject.toml └── src └── llama_cpp_runner ├── __init__.py ├── main.py └── py.typed /.gitignore: -------------------------------------------------------------------------------- 1 | llama_cpp_cache/ 2 | cache/ 3 | dist/ 4 | models/* -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.11 2 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim 2 | 3 | WORKDIR /app 4 | 5 | # Install only essential packages and clean up in one layer to reduce image size 6 | RUN apt-get update && apt-get install -y --no-install-recommends \ 7 | curl \ 8 | wget \ 9 | git \ 10 | build-essential \ 11 | && apt-get clean \ 12 | && rm -rf /var/lib/apt/lists/* 13 | 14 | # Copy only necessary files 15 | COPY pyproject.toml README.md LICENSE /app/ 16 | COPY src/ /app/src/ 17 | 18 | # Install the package in development mode and required dependencies 19 | RUN pip install --no-cache-dir -e . && pip install --no-cache-dir requests fastapi uvicorn 20 | 21 | # Create volume mount points 22 | VOLUME /models 23 | VOLUME /cache 24 | 25 | # Create proxy server script directly in the Dockerfile 26 | RUN echo 'import os\n\ 27 | import uvicorn\n\ 28 | from fastapi import FastAPI, Request\n\ 29 | from fastapi.responses import StreamingResponse, JSONResponse\n\ 30 | from llama_cpp_runner.main import LlamaCpp\n\ 31 | \n\ 32 | app = FastAPI(title="LlamaCpp Proxy")\n\ 33 | \n\ 34 | # Initialize the LlamaCpp class\n\ 35 | models_dir = os.environ.get("MODELS_DIR", "/models")\n\ 36 | cache_dir = os.environ.get("CACHE_DIR", "/cache")\n\ 37 | verbose = os.environ.get("VERBOSE", "true").lower() == "true"\n\ 38 | timeout = int(os.environ.get("TIMEOUT_MINUTES", "30"))\n\ 39 | \n\ 40 | print(f"Models directory: {models_dir}")\n\ 41 | print(f"Cache directory: {cache_dir}")\n\ 42 | \n\ 43 | # Create the LlamaCpp instance\n\ 44 | llama_runner = LlamaCpp(\n\ 45 | models_dir=models_dir,\n\ 46 | cache_dir=cache_dir, \n\ 47 | verbose=verbose, \n\ 48 | timeout_minutes=timeout\n\ 49 | )\n\ 50 | \n\ 51 | @app.get("/")\n\ 52 | def read_root():\n\ 53 | """Get server status and list of available models."""\n\ 54 | return {"status": "running", "models": llama_runner.list_models()}\n\ 55 | \n\ 56 | @app.post("/v1/chat/completions")\n\ 57 | async def chat_completions(request: Request):\n\ 58 | """Forward chat completion requests to the LlamaCpp server."""\n\ 59 | try:\n\ 60 | body = await request.json()\n\ 61 | \n\ 62 | if "model" not in body:\n\ 63 | return JSONResponse(\n\ 64 | status_code=400,\n\ 65 | content={"error": "Model not specified in request"}\n\ 66 | )\n\ 67 | \n\ 68 | try:\n\ 69 | result = llama_runner.chat_completion(body)\n\ 70 | \n\ 71 | # Handle streaming responses\n\ 72 | if body.get("stream", False):\n\ 73 | async def generate():\n\ 74 | for line in result:\n\ 75 | if line:\n\ 76 | yield f"data: {line}\\n\\n"\n\ 77 | yield "data: [DONE]\\n\\n"\n\ 78 | \n\ 79 | return StreamingResponse(generate(), media_type="text/event-stream")\n\ 80 | else:\n\ 81 | return result\n\ 82 | except Exception as e:\n\ 83 | return JSONResponse(\n\ 84 | status_code=500,\n\ 85 | content={"error": str(e)}\n\ 86 | )\n\ 87 | except Exception as e:\n\ 88 | return 
JSONResponse(\n\ 89 | status_code=400,\n\ 90 | content={"error": f"Invalid request: {str(e)}"}\n\ 91 | )\n\ 92 | \n\ 93 | @app.get("/models")\n\ 94 | def list_models():\n\ 95 | """List all available models."""\n\ 96 | return {"models": llama_runner.list_models()}\n\ 97 | \n\ 98 | if __name__ == "__main__":\n\ 99 | print("Starting LlamaCpp Proxy Server on port 3636")\n\ 100 | models = llama_runner.list_models()\n\ 101 | print(f"Available models: {models}")\n\ 102 | if not models:\n\ 103 | print("WARNING: No models found in the models directory.")\n\ 104 | uvicorn.run(app, host="0.0.0.0", port=3636)' > /app/proxy_server.py 105 | 106 | # Expose the proxy server port 107 | EXPOSE 3636 108 | 109 | # Set environment variables 110 | ENV PYTHONUNBUFFERED=1 111 | ENV MODELS_DIR=/models 112 | ENV CACHE_DIR=/cache 113 | ENV VERBOSE=true 114 | ENV TIMEOUT_MINUTES=30 115 | 116 | # Command to run when the container starts 117 | CMD ["python", "/app/proxy_server.py"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Open WebUI (Timothy Jaeryang Baek) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🦙 llama-cpp-runner 2 | 3 | `llama-cpp-runner` is the ultimate Python library for running [llama.cpp](https://github.com/ggerganov/llama.cpp) with zero hassle. It automates the process of downloading prebuilt binaries from the upstream repo, keeping you always **up to date** with the latest developments. All while requiring no complicated setups—everything works **out-of-the-box**. 4 | 5 | ## Key Features 🌟 6 | 7 | 1. **Always Up-to-Date**: Automatically fetches the latest prebuilt binaries from the upstream llama.cpp GitHub repo. No need to worry about staying current. 8 | 2. **Zero Dependencies**: No need to manually install compilers or build binaries. Everything is handled for you during installation. 9 | 3. **Model Flexibility**: Seamlessly load and serve **GGUF** models stored locally or from Hugging Face with ease. 10 | 4. **Built-in HTTP Server**: Automatically spins up a server for chat interactions and manages idle timeouts to save resources. 11 | 5. 
**Cross-Platform Support**: Works on **Windows**, **Linux**, and **macOS** with automatic detection for AVX/AVX2/AVX512/ARM architectures. 12 | 13 | 14 | ## Why Use `llama-cpp-runner`? 15 | 16 | - **Out-of-the-box experience**: Forget about setting up complex environments for building. Just install and get started! 🛠️ 17 | - **Streamlined Model Serving**: Effortlessly manage multiple models and serve them with an integrated HTTP server. 18 | - **Fast Integration**: Use prebuilt binaries from upstream so you can spend more time building and less time troubleshooting. 19 | 20 | ## Installation 🚀 21 | 22 | Installing `llama-cpp-runner` is quick and easy! Just use pip: 23 | 24 | ```bash 25 | pip install llama-cpp-runner 26 | ``` 27 | 28 | ## Optional Installation (Docker) 29 | 30 | Clone the repository 31 | 32 | ```bash 33 | git clone https://github.com/open-webui/llama-cpp-runner 34 | ``` 35 | 36 | Build and run 37 | 38 | ```bash 39 | docker compose up -d 40 | ``` 41 | 42 | ## Usage 📖 43 | 44 | ### Initialize the Runner 45 | 46 | ```python 47 | from llama_cpp_runner import LlamaCpp 48 | 49 | llama_runner = LlamaCpp(models_dir="path/to/models", verbose=True) 50 | 51 | # List all available GGUF models 52 | models = llama_runner.list_models() 53 | print("Available Models:", models) 54 | ``` 55 | 56 | ### Chat Completion 57 | 58 | ```python 59 | response = llama_runner.chat_completion({ 60 | "model": "your-model-name.gguf", 61 | "messages": [{"role": "user", "content": "Hello, Llama!"}], 62 | "stream": False 63 | }) 64 | 65 | print(response) 66 | ``` 67 | 68 | ## How It Works 🛠️ 69 | 70 | 1. Automatically detects your system architecture (e.g., AVX, AVX2, ARM) and platform. 71 | 2. Downloads and extracts the prebuilt llama.cpp binaries from the official repo. 72 | 3. Spins up a lightweight HTTP server for chat interactions. 73 | 74 | ## Advantages 👍 75 | 76 | - **Hassle-Free**: No need to compile binaries or manage system-specific dependencies. 77 | - **Latest Features, Always**: Stay up to date with llama.cpp’s improvements with every release. 78 | - **Optimized for Your System**: Automatically fetches the best binary for your architecture. 79 | 80 | ## Supported Platforms 🖥️ 81 | 82 | - Windows 83 | - macOS 84 | - Linux 85 | 86 | ## Contributing 💻 87 | 88 | We’d love your contributions! Bug reports, feature requests, and pull requests are all welcome. 89 | 90 | ## License 📜 91 | 92 | This library is open-source and distributed under the MIT license. 93 | 94 | Happy chatting with llama.cpp! 🚀 -------------------------------------------------------------------------------- /compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | owui-llama-cpp-runner: 3 | build: . 4 | container_name: owui-llama-cpp-runner 5 | ports: 6 | - "3636:3636" 7 | volumes: 8 | - ./models:/models # local mount 9 | - ./cache:/cache # local mount 10 | # Remove . 
from the paths above to use native docker volumes 11 | environment: 12 | - MODELS_DIR=/models 13 | - CACHE_DIR=/cache 14 | - VERBOSE=true 15 | - TIMEOUT_MINUTES=30 16 | - LD_LIBRARY_PATH=/cache/llama_cpp/build/bin 17 | restart: unless-stopped 18 | healthcheck: 19 | test: ["CMD", "curl", "-f", "http://localhost:3636/"] 20 | interval: 30s 21 | timeout: 10s 22 | retries: 3 23 | start_period: 40s -------------------------------------------------------------------------------- /proxy_server.py: -------------------------------------------------------------------------------- 1 | import os 2 | import uvicorn 3 | from fastapi import FastAPI, Request 4 | from fastapi.responses import StreamingResponse, JSONResponse 5 | from llama_cpp_runner.main import LlamaCpp 6 | 7 | app = FastAPI(title="LlamaCpp Proxy") 8 | 9 | # Initialize the LlamaCpp class 10 | models_dir = os.environ.get("MODELS_DIR", "/models") 11 | cache_dir = os.environ.get("CACHE_DIR", "/cache") 12 | verbose = os.environ.get("VERBOSE", "true").lower() == "true" 13 | timeout = int(os.environ.get("TIMEOUT_MINUTES", "30")) 14 | 15 | print(f"Models directory: {models_dir}") 16 | print(f"Cache directory: {cache_dir}") 17 | 18 | # Create the LlamaCpp instance 19 | llama_runner = LlamaCpp( 20 | models_dir=models_dir, 21 | cache_dir=cache_dir, 22 | verbose=verbose, 23 | timeout_minutes=timeout 24 | ) 25 | 26 | @app.get("/") 27 | def read_root(): 28 | """Get server status and list of available models.""" 29 | return {"status": "running", "models": llama_runner.list_models()} 30 | 31 | @app.post("/v1/chat/completions") 32 | async def chat_completions(request: Request): 33 | """Forward chat completion requests to the LlamaCpp server.""" 34 | try: 35 | body = await request.json() 36 | 37 | if "model" not in body: 38 | return JSONResponse( 39 | status_code=400, 40 | content={"error": "Model not specified in request"} 41 | ) 42 | 43 | try: 44 | result = llama_runner.chat_completion(body) 45 | 46 | # Handle streaming responses 47 | if body.get("stream", False): 48 | async def generate(): 49 | for line in result: 50 | if line: 51 | yield f"data: {line}\n\n" 52 | yield "data: [DONE]\n\n" 53 | 54 | return StreamingResponse(generate(), media_type="text/event-stream") 55 | else: 56 | return result 57 | except Exception as e: 58 | return JSONResponse( 59 | status_code=500, 60 | content={"error": str(e)} 61 | ) 62 | except Exception as e: 63 | return JSONResponse( 64 | status_code=400, 65 | content={"error": f"Invalid request: {str(e)}"} 66 | ) 67 | 68 | @app.get("/models") 69 | def list_models(): 70 | """List all available models.""" 71 | return {"models": llama_runner.list_models()} 72 | 73 | if __name__ == "__main__": 74 | print("Starting LlamaCpp Proxy Server on port 3636") 75 | models = llama_runner.list_models() 76 | print(f"Available models: {models}") 77 | if not models: 78 | print("WARNING: No models found in the models directory.") 79 | uvicorn.run(app, host="0.0.0.0", port=3636) -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "llama-cpp-runner" 3 | version = "0.0.1" 4 | description = "Quick and easy way to run large language models (LLMs) with llama.cpp" 5 | readme = "README.md" 6 | authors = [ 7 | { name = "Timothy Jaeryang Baek", email = "tim@openwebui.com" } 8 | ] 9 | requires-python = ">=3.11" 10 | dependencies = [] 11 | 12 | [build-system] 13 | requires = ["hatchling"] 14 | build-backend = 
"hatchling.build" 15 | -------------------------------------------------------------------------------- /src/llama_cpp_runner/__init__.py: -------------------------------------------------------------------------------- 1 | from llama_cpp_runner.main import LlamaCppServer 2 | 3 | 4 | def hello() -> str: 5 | return "Hello from llama-cpp-runner!" 6 | -------------------------------------------------------------------------------- /src/llama_cpp_runner/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import requests 4 | import zipfile 5 | import json 6 | import subprocess 7 | import threading 8 | import stat 9 | import time 10 | import socket 11 | 12 | import os 13 | import platform 14 | import requests 15 | import zipfile 16 | import json 17 | 18 | 19 | class LlamaCpp: 20 | def __init__( 21 | self, 22 | models_dir, 23 | cache_dir="~/.llama_cpp_runner", 24 | verbose=False, 25 | timeout_minutes=5, 26 | pinned_version=None, 27 | ): 28 | """ 29 | Initialize the LlamaCpp class. 30 | Args: 31 | models_dir (str): Directory where GGUF models are stored. 32 | cache_dir (str): Directory to store llama.cpp binaries and related assets. Defaults to '~/.llama_cpp_runner'. 33 | verbose (bool): Whether to enable verbose logging. 34 | timeout_minutes (int): Timeout for shutting down idle servers. 35 | pinned_version (str or None): Pinned release version of llama.cpp binaries. 36 | """ 37 | self.models_dir = models_dir 38 | self.cache_dir = os.path.expanduser( 39 | cache_dir 40 | ) # Ensure cache is in a fixed location 41 | self.verbose = verbose 42 | self.timeout_minutes = timeout_minutes 43 | self.pinned_version = pinned_version # Optional pinned version 44 | self.llama_cpp_path = ( 45 | self._install_llama_cpp_binaries() 46 | ) # Install the required binaries 47 | self.servers = ( 48 | {} 49 | ) # Maintain a mapping of model names to LlamaCppServer instances 50 | 51 | def list_models(self): 52 | """ 53 | List all GGUF models available in the `models_dir`. 54 | Returns: 55 | list: A list of model names (files ending in ".gguf"). 56 | """ 57 | if not os.path.exists(self.models_dir): 58 | self._log(f"Models directory does not exist: {self.models_dir}") 59 | return [] 60 | models = [f for f in os.listdir(self.models_dir) if f.endswith(".gguf")] 61 | self._log(f"Available models: {models}") 62 | return models 63 | 64 | def chat_completion(self, body): 65 | """ 66 | Handle chat completion requests. 67 | Args: 68 | body (dict): The payload for the chat completion request. It must contain the "model" key. 69 | Returns: 70 | dict or generator: Response from the server (non-streaming or streaming mode). 71 | """ 72 | if "model" not in body: 73 | raise ValueError("The request body must contain a 'model' key.") 74 | model_name = body["model"] 75 | gguf_path = os.path.join(self.models_dir, model_name) 76 | if not os.path.exists(gguf_path): 77 | raise FileNotFoundError(f"Model file not found: {gguf_path}") 78 | # Check if the server for this model is already running 79 | if model_name not in self.servers or not self.servers[model_name]._server_url: 80 | self._log(f"Initializing a new server for model: {model_name}") 81 | self.servers[model_name] = self._create_server(gguf_path) 82 | server = self.servers[model_name] 83 | return server.chat_completion(body) 84 | 85 | def _create_server(self, gguf_path): 86 | """ 87 | Create a new LlamaCppServer instance for the given model. 88 | Args: 89 | gguf_path (str): Path to the GGUF model file. 
90 | Returns: 91 | LlamaCppServer: A new server instance. 92 | """ 93 | return LlamaCppServer( 94 | llama_cpp_path=self.llama_cpp_path, 95 | gguf_path=gguf_path, 96 | cache_dir=self.cache_dir, 97 | verbose=self.verbose, 98 | timeout_minutes=self.timeout_minutes, 99 | ) 100 | 101 | def _install_llama_cpp_binaries(self): 102 | """ 103 | Download and install llama.cpp binaries. 104 | Returns: 105 | str: Path to the installed llama.cpp binaries. 106 | """ 107 | self._log("Installing llama.cpp binaries...") 108 | try: 109 | # Use pinned version if provided, otherwise fetch the latest release 110 | release_info = self._get_release_info() 111 | assets = release_info["assets"] 112 | asset = self._get_appropriate_asset(assets) 113 | if not asset: 114 | raise RuntimeError("No appropriate binary found for your system.") 115 | asset_name = asset["name"] 116 | 117 | # Check if cached binaries match the required version 118 | if self._check_cache(release_info, asset): 119 | self._log("Using cached llama.cpp binaries.") 120 | else: 121 | if not self._internet_available(): 122 | raise RuntimeError( 123 | "No cached binary available and unable to fetch from the internet." 124 | ) 125 | self._download_and_unzip(asset["browser_download_url"], asset_name) 126 | self._update_cache_info(release_info, asset) 127 | 128 | except Exception as e: 129 | self._log(f"Error during binary installation: {e}") 130 | raise 131 | 132 | return os.path.join(self.cache_dir, "llama_cpp") 133 | 134 | def _get_release_info(self): 135 | """ 136 | Fetch metadata of the specified release (pinned or latest) from GitHub. 137 | Returns: 138 | dict: Release information. 139 | """ 140 | if self.pinned_version: 141 | api_url = f"https://api.github.com/repos/ggerganov/llama.cpp/releases/tags/{self.pinned_version}" 142 | else: 143 | api_url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest" 144 | 145 | if not self._internet_available(): 146 | # Fall back to cache if no internet access 147 | raise RuntimeError("No internet access and no cached version available.") 148 | 149 | response = requests.get(api_url) 150 | if response.status_code == 200: 151 | return response.json() 152 | else: 153 | error_reason = f"Failed to fetch release info: HTTP {response.status_code}" 154 | raise RuntimeError(error_reason) 155 | 156 | def _get_appropriate_asset(self, assets): 157 | """ 158 | Select the appropriate binary asset for the current system. 159 | Args: 160 | assets (list): List of asset metadata from the release. 161 | Returns: 162 | dict or None: Matching asset metadata, or None if no match found. 
163 | """ 164 | system = platform.system().lower() 165 | machine = platform.machine().lower() 166 | processor = platform.processor() 167 | if system == "windows": 168 | if "arm" in machine: 169 | return next((a for a in assets if "win-arm64" in a["name"]), None) 170 | elif "avx512" in processor: 171 | return next((a for a in assets if "win-avx512-x64" in a["name"]), None) 172 | elif "avx2" in processor: 173 | return next((a for a in assets if "win-avx2-x64" in a["name"]), None) 174 | elif "avx" in processor: 175 | return next((a for a in assets if "win-avx-x64" in a["name"]), None) 176 | else: 177 | return next((a for a in assets if "win-noavx-x64" in a["name"]), None) 178 | elif system == "darwin": 179 | if "arm" in machine: 180 | return next((a for a in assets if "macos-arm64" in a["name"]), None) 181 | else: 182 | return next((a for a in assets if "macos-x64" in a["name"]), None) 183 | elif system == "linux": 184 | return next((a for a in assets if "ubuntu-x64" in a["name"]), None) 185 | return None 186 | 187 | def _check_cache(self, release_info, asset): 188 | """ 189 | Check whether the latest binaries are already cached. 190 | Args: 191 | release_info (dict): Metadata of the latest release. 192 | asset (dict): Metadata of the selected asset. 193 | Returns: 194 | bool: True if the cached binary matches the required release, False otherwise. 195 | """ 196 | cache_info_path = os.path.join(self.cache_dir, "cache_info.json") 197 | if os.path.exists(cache_info_path): 198 | with open(cache_info_path, "r") as f: 199 | cache_info = json.load(f) 200 | if ( 201 | cache_info.get("tag_name") == release_info["tag_name"] 202 | and cache_info.get("asset_name") == asset["name"] 203 | ): 204 | return True 205 | return False 206 | 207 | def _download_and_unzip(self, url, asset_name): 208 | """ 209 | Download and extract llama.cpp binaries. 210 | Args: 211 | url (str): URL of the asset to download. 212 | asset_name (str): Name of the asset file. 213 | """ 214 | os.makedirs(self.cache_dir, exist_ok=True) 215 | zip_path = os.path.join(self.cache_dir, asset_name) 216 | self._log(f"Downloading binary from: {url}") 217 | response = requests.get(url) 218 | if response.status_code == 200: 219 | with open(zip_path, "wb") as file: 220 | file.write(response.content) 221 | self._log(f"Successfully downloaded: {asset_name}") 222 | else: 223 | raise RuntimeError(f"Failed to download binary: {url}") 224 | extract_dir = os.path.join(self.cache_dir, "llama_cpp") 225 | with zipfile.ZipFile(zip_path, "r") as zip_ref: 226 | zip_ref.extractall(extract_dir) 227 | self._log(f"Extracted binaries to: {extract_dir}") 228 | 229 | def _update_cache_info(self, release_info, asset): 230 | """ 231 | Update cache metadata with the downloaded release info. 232 | Args: 233 | release_info (dict): Metadata of the latest release. 234 | asset (dict): Metadata of the downloaded asset. 235 | """ 236 | cache_info = {"tag_name": release_info["tag_name"], "asset_name": asset["name"]} 237 | cache_info_path = os.path.join(self.cache_dir, "cache_info.json") 238 | with open(cache_info_path, "w") as f: 239 | json.dump(cache_info, f) 240 | 241 | def _internet_available(self): 242 | """ 243 | Check for internet connectivity. 244 | Returns: 245 | bool: True if the internet is accessible, False otherwise. 246 | """ 247 | try: 248 | requests.get("https://api.github.com", timeout=3) 249 | return True 250 | except requests.ConnectionError: 251 | return False 252 | 253 | def _log(self, message): 254 | """ 255 | Print a log message if verbosity is enabled. 
256 | Args: 257 | message (str): Log message to print. 258 | """ 259 | if self.verbose: 260 | print(f"[LlamaCpp] {message}") 261 | 262 | 263 | class LlamaCppServer: 264 | def __init__( 265 | self, 266 | llama_cpp_path=None, 267 | gguf_path=None, 268 | cache_dir="./cache", 269 | hugging_face=False, 270 | verbose=False, 271 | timeout_minutes=5, 272 | ): 273 | """ 274 | Initialize the LlamaCppServer. 275 | 276 | Args: 277 | llama_cpp_path (str): Path to the llama.cpp binaries. 278 | gguf_path (str): Path to the GGUF model file. 279 | cache_dir (str): Directory to store llama.cpp binaries and related files. 280 | hugging_face (bool): Whether the model is hosted on Hugging Face. 281 | verbose (bool): Enable verbose logging. 282 | timeout_minutes (int): Timeout duration for shutting down idle servers. 283 | """ 284 | self.verbose = verbose 285 | self.hugging_face = hugging_face 286 | self.cache_dir = cache_dir 287 | self.llama_cpp_path = llama_cpp_path 288 | self.gguf_path = gguf_path 289 | self.server_process = None 290 | self._server_url = None 291 | self._server_thread = None 292 | self.port = None 293 | self.last_used = time.time() # Tracks the last time the server was used 294 | self.timeout_minutes = timeout_minutes 295 | self._auto_terminate_thread = None 296 | 297 | # Validate llama_cpp_path 298 | if llama_cpp_path is None: 299 | raise ValueError("llama_cpp_path must be provided.") 300 | elif not os.path.exists(llama_cpp_path): 301 | raise FileNotFoundError( 302 | f"Specified llama_cpp_path not found: {llama_cpp_path}" 303 | ) 304 | 305 | # Validate gguf_path 306 | if gguf_path and not os.path.exists(gguf_path) and not hugging_face: 307 | raise FileNotFoundError(f"Specified gguf_path not found: {gguf_path}") 308 | 309 | # Start the server if gguf_path is provided 310 | if gguf_path: 311 | self._start_server_in_thread() 312 | self._start_auto_terminate_thread() 313 | 314 | @property 315 | def url(self): 316 | """Return the URL where the server is running.""" 317 | if self._server_url is None: 318 | # If the server URL is not available, ensure the server spins up again 319 | self._log("Server is off. Restarting the server...") 320 | self._start_server_in_thread() 321 | self._start_auto_terminate_thread() 322 | # Wait for the thread to start the server 323 | while self._server_url is None: 324 | time.sleep(1) 325 | 326 | # Update the last-used timestamp whenever this property is accessed 327 | self.last_used = time.time() 328 | return self._server_url 329 | 330 | def kill(self): 331 | """Kill the server process and clean up.""" 332 | if self.server_process and self.server_process.poll() is None: 333 | self.server_process.terminate() 334 | self.server_process.wait() 335 | self.server_process = None 336 | self._server_url = None 337 | self.port = None 338 | self._log("Llama server successfully killed.") 339 | 340 | if self._server_thread and self._server_thread.is_alive(): 341 | self._server_thread.join() 342 | 343 | if self._auto_terminate_thread and self._auto_terminate_thread.is_alive(): 344 | self._auto_terminate_thread.join() 345 | 346 | def chat_completion(self, payload): 347 | """ 348 | Send a chat completion request to the server. 349 | 350 | Args: 351 | payload (dict): Payload for the chat completion request. 352 | 353 | Returns: 354 | dict or generator: Response from the server (non-streaming or streaming mode). 355 | """ 356 | if self._server_url is None: 357 | self._log( 358 | "Server is off. Restarting the server before making the request..." 
359 | ) 360 | self._start_server_in_thread() 361 | self._start_auto_terminate_thread() 362 | # Wait for the thread to start the server 363 | while self._server_url is None: 364 | time.sleep(1) 365 | 366 | # Reset the last-used timestamp 367 | self.last_used = time.time() 368 | endpoint = f"{self._server_url}/v1/chat/completions" 369 | self._log(f"Sending chat completion request to {endpoint}...") 370 | 371 | # Check if streaming is enabled in the payload 372 | if payload.get("stream", False): 373 | self._log(f"Streaming mode enabled. Returning a generator.") 374 | response = requests.post(endpoint, json=payload, stream=True) 375 | if response.status_code == 200: 376 | # Return a generator for streaming responses 377 | def stream_response(): 378 | for line in response.iter_lines(decode_unicode=True): 379 | yield line 380 | 381 | return stream_response() 382 | else: 383 | self._log( 384 | f"Request failed with status code: {response.status_code} - {response.text}" 385 | ) 386 | response.raise_for_status() 387 | else: 388 | # Non-streaming mode 389 | response = requests.post(endpoint, json=payload) 390 | if response.status_code == 200: 391 | self._log("Request successful.") 392 | return response.json() 393 | else: 394 | self._log( 395 | f"Request failed with status code: {response.status_code} - {response.text}" 396 | ) 397 | response.raise_for_status() 398 | 399 | def _start_server_in_thread(self): 400 | """Start the server in a separate thread.""" 401 | 402 | def target(): 403 | try: 404 | self._start_server() 405 | except Exception as e: 406 | self._log(f"Failed to start server: {e}") 407 | 408 | self._server_thread = threading.Thread(target=target, daemon=True) 409 | self._server_thread.start() 410 | 411 | def _start_auto_terminate_thread(self): 412 | """Start the auto-terminate thread that monitors idle time.""" 413 | 414 | def monitor_idle_time(): 415 | while True: 416 | time.sleep(10) 417 | if ( 418 | self.server_process and self.server_process.poll() is None 419 | ): # Server is running 420 | elapsed_time = time.time() - self.last_used 421 | if elapsed_time > self.timeout_minutes * 60: 422 | self._log( 423 | "Server has been idle for too long. Auto-terminating..." 
424 | ) 425 | self.kill() 426 | break 427 | 428 | self._auto_terminate_thread = threading.Thread( 429 | target=monitor_idle_time, daemon=True 430 | ) 431 | self._auto_terminate_thread.start() 432 | 433 | def _start_server(self): 434 | """Start the llama-server.""" 435 | if not self.gguf_path or ( 436 | not self.hugging_face and not os.path.exists(self.gguf_path) 437 | ): 438 | raise ValueError( 439 | f"GGUF model path is not specified or invalid: {self.gguf_path}" 440 | ) 441 | 442 | server_binary = os.path.join( 443 | self.llama_cpp_path, "build", "bin", "llama-server" 444 | ) 445 | if not os.path.exists(server_binary): 446 | raise FileNotFoundError(f"Server binary not found: {server_binary}") 447 | 448 | # Ensure the binary is executable 449 | self._set_executable(server_binary) 450 | 451 | # Find an available port 452 | self.port = self._find_available_port(start_port=10000) 453 | if self.port is None: 454 | raise RuntimeError("No available port found between 10000 and 11000.") 455 | 456 | self._log(f"Starting server with binary: {server_binary}") 457 | self._log(f"Using GGUF path: {self.gguf_path}") 458 | self._log(f"Using port: {self.port}") 459 | 460 | commands = [server_binary] 461 | if self.hugging_face: 462 | commands.extend(["-hf", self.gguf_path, "--port", str(self.port)]) 463 | else: 464 | commands.extend(["-m", self.gguf_path, "--port", str(self.port)]) 465 | 466 | self.server_process = subprocess.Popen( 467 | commands, 468 | stdout=subprocess.PIPE, 469 | stderr=subprocess.STDOUT, 470 | universal_newlines=True, 471 | ) 472 | 473 | # Wait for the server to confirm it is ready by monitoring its output 474 | self._server_url = None 475 | for line in iter(self.server_process.stdout.readline, ""): 476 | self._log(line.strip()) 477 | if "listening on" in line: 478 | self._server_url = f"http://localhost:{self.port}" 479 | self._log(f"Server is now accessible at {self._server_url}") 480 | break 481 | 482 | if not self._server_url: 483 | raise RuntimeError("Failed to confirm server is running.") 484 | 485 | def _find_available_port(self, start_port=10000, end_port=11000): 486 | """Find an available port between `start_port` and `end_port`.""" 487 | for port in range(start_port, end_port): 488 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: 489 | if sock.connect_ex(("localhost", port)) != 0: 490 | return port 491 | return None 492 | 493 | def _set_executable(self, file_path): 494 | """Ensure the file at `file_path` is executable.""" 495 | if platform.system() != "Windows": 496 | current_mode = os.stat(file_path).st_mode 497 | os.chmod( 498 | file_path, current_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH 499 | ) 500 | 501 | def _log(self, message): 502 | """Print a log message if verbosity is enabled.""" 503 | if self.verbose: 504 | print(f"[LlamaCppServer] {message}") 505 | -------------------------------------------------------------------------------- /src/llama_cpp_runner/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-webui/llama-cpp-runner/300cf7f5713807ad50cd61a374a95207dfd06cd8/src/llama_cpp_runner/py.typed --------------------------------------------------------------------------------
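
The Docker image built from this repository serves the FastAPI proxy defined in `proxy_server.py` on port 3636 (published in `compose.yaml`). Below is a minimal client sketch for that proxy, assuming the stack is running on `localhost:3636`, that `requests` is installed on the client side, and that the model file name is a placeholder for a GGUF file dropped into `./models`; the response shape assumed here is the OpenAI-compatible JSON that llama.cpp's `llama-server` returns.

```python
import requests

BASE_URL = "http://localhost:3636"  # host port published in compose.yaml

# GET / reports server status plus the GGUF models discovered in /models.
print(requests.get(f"{BASE_URL}/", timeout=10).json())

# POST /v1/chat/completions is forwarded to a per-model llama.cpp server.
response = requests.post(
    f"{BASE_URL}/v1/chat/completions",
    json={
        "model": "your-model-name.gguf",  # placeholder GGUF file name
        "messages": [{"role": "user", "content": "Hello, Llama!"}],
        "stream": False,
    },
    timeout=300,  # the first request may be slow while the model server starts
)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])
```

The first request for a given model makes `LlamaCpp` spawn a dedicated `llama-server` process, so it can take noticeably longer than later calls; idle servers are shut down after `TIMEOUT_MINUTES` and restarted on demand.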
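
When `"stream": True` is set, `LlamaCpp.chat_completion` returns a generator of raw server-sent-event lines proxied from the underlying `llama-server` process rather than a parsed dictionary. Here is a minimal sketch of consuming that stream, assuming the OpenAI-style `data: {...}` / `data: [DONE]` framing emitted by llama.cpp's server; the model file name and the `pinned_version` tag are placeholders, and the import path mirrors `proxy_server.py` (the package `__init__` currently re-exports only `LlamaCppServer`).

```python
import json

from llama_cpp_runner.main import LlamaCpp

llama_runner = LlamaCpp(
    models_dir="path/to/models",   # directory containing your *.gguf files
    verbose=True,
    timeout_minutes=10,            # idle per-model servers are terminated after this
    pinned_version="b4823",        # hypothetical release tag; omit to track the latest release
)

stream = llama_runner.chat_completion({
    "model": "your-model-name.gguf",  # placeholder GGUF file name
    "messages": [{"role": "user", "content": "Hello, Llama!"}],
    "stream": True,
})

# Each yielded item is one SSE line such as 'data: {...}'; empty keep-alive
# lines are skipped and 'data: [DONE]' marks the end of the stream.
for line in stream:
    if not line or not line.startswith("data: "):
        continue
    payload = line[len("data: "):]
    if payload.strip() == "[DONE]":
        break
    chunk = json.loads(payload)
    delta = chunk["choices"][0].get("delta", {}).get("content", "")
    print(delta, end="", flush=True)
print()
```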