├── src ├── xllamacpp │ ├── __init__.pxd │ ├── __init__.py │ ├── server.h │ ├── server.pxd │ ├── memory.py │ └── server.cpp └── llama.cpp │ ├── src │ └── server.cpp │ └── include │ └── ggml-backend.h ├── .gitattributes ├── assets ├── logo.png ├── logo-white.png ├── logo.svg └── logo-white.svg ├── tests ├── dummy.gguf ├── data │ └── 11_truck.png ├── conftest.py ├── test_memory.py ├── bge-m3-metadata.json ├── test_params.py ├── test_server.py └── test_server_http.py ├── .gitmodules ├── MANIFEST.in ├── requirements.txt ├── pyproject.toml ├── LICENSE ├── Makefile ├── scripts ├── get-releases.sh ├── releases-to-pep-503.sh ├── setup.sh ├── copy_libs.py └── .clang-format ├── .github └── workflows │ ├── release-github-pypi.yaml │ ├── ci.yaml │ ├── build-wheel.yaml │ └── build-wheel-cuda-hip.yaml ├── .gitignore ├── setup.py └── README.md /src/xllamacpp/__init__.pxd: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | src/xllamacpp/_version.py export-subst 2 | -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xorbitsai/xllamacpp/HEAD/assets/logo.png -------------------------------------------------------------------------------- /tests/dummy.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xorbitsai/xllamacpp/HEAD/tests/dummy.gguf -------------------------------------------------------------------------------- /assets/logo-white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xorbitsai/xllamacpp/HEAD/assets/logo-white.png -------------------------------------------------------------------------------- /tests/data/11_truck.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xorbitsai/xllamacpp/HEAD/tests/data/11_truck.png -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "thirdparty/llama.cpp"] 2 | path = thirdparty/llama.cpp 3 | url = https://github.com/ggml-org/llama.cpp.git 4 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | ROOT = Path(__file__).parent.parent 4 | 5 | 6 | import pytest 7 | 8 | 9 | @pytest.fixture(scope="module") 10 | def model_path(): 11 | return str(ROOT / "models") 12 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | global-include *.pxd 2 | global-include *.pyx 3 | global-include *.pxi 4 | include scripts/setup.sh 5 | include scripts/copy_libs.py 6 | include requirements.txt 7 | include Makefile 8 | recursive-include thirdparty/llama.cpp * 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # build requirements 2 | setuptools 3 
| cython 4 | wheel 5 | 6 | # # macos fix wheel tool 7 | # delocate; sys_platform == 'darwin' 8 | 9 | # # macos fix wheel tool 10 | # auditwheel; sys_platform == 'linux' 11 | 12 | # # windows fix wheel tool 13 | # delvewheel; sys_platform == 'win32' 14 | 15 | # runtime requirements (optional) 16 | # numpy 17 | 18 | # testing tools (optional) 19 | # pytest pytest-cov pytest-memray 20 | -------------------------------------------------------------------------------- /src/xllamacpp/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025 XProbe Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .xllamacpp import * 16 | from .memory import estimate_gpu_layers 17 | 18 | from . import _version 19 | 20 | __version__ = _version.get_versions()["version"] 21 | if __version__ == "0+unknown": 22 | print(_version.get_versions()) 23 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "xllamacpp" 3 | dynamic = ["version", "license"] 4 | description = "A Python wrapper of llama.cpp" 5 | readme = "README.md" 6 | authors = [ 7 | { name = "codingl2k1", email = "codingl2k1@outlook.com" } 8 | ] 9 | requires-python = ">3.8" 10 | dependencies = [] 11 | 12 | [project.optional-dependencies] 13 | test = ['pytest', 'pytest-cov'] 14 | all = ['gguf', 'orjson'] 15 | 16 | [build-system] 17 | requires = ["setuptools >= 61", "cython", "versioneer[toml]"] 18 | build-backend = "setuptools.build_meta" 19 | 20 | [tool.setuptools] 21 | include-package-data = false 22 | 23 | [tool.pytest.ini_options] 24 | pythonpath = ["src"] 25 | testpaths = ["tests"] 26 | 27 | [tool.versioneer] 28 | VCS = "git" 29 | style = "pep440" 30 | versionfile_source = "src/xllamacpp/_version.py" 31 | versionfile_build = "xllamacpp/_version.py" 32 | tag_prefix = "v" 33 | parentdir_prefix = "xllamacpp-" 34 | 35 | [tool.black] 36 | required-version = "25.1.0" 37 | include = '\.pyi?$' 38 | exclude = '_version.py' 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Shakeeb Alireza 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # set path so `llama-cli` etc.. be in path 2 | export PATH := $(PWD)/bin:$(PATH) 3 | export MACOSX_DEPLOYMENT_TARGET := 12 4 | 5 | # models 6 | MODEL := bge-reranker-v2-m3-Q2_K.gguf 7 | 8 | THIRDPARTY := $(PWD)/thirdparty 9 | LLAMACPP := $(THIRDPARTY)/llama.cpp 10 | 11 | .PHONY: all build wheel clean test download 12 | 13 | all: build 14 | 15 | build: 16 | @bash scripts/setup.sh 17 | python setup.py build_ext --inplace 18 | 19 | wheel: 20 | @python setup.py bdist_wheel 21 | 22 | clean: 23 | @rm -rf build dist src/llama.cpp src/*.egg-inf thirdparty/llama.cpp/build o .pytest_cache .coverage 24 | 25 | test: build 26 | @pytest 27 | 28 | $(MODEL): 29 | @mkdir -p models && cd models && \ 30 | curl --output Llama-3.2-1B-Instruct-Q8_0.gguf -L https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q8_0.gguf && \ 31 | curl --output tinygemma3-Q8_0.gguf -L https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/tinygemma3-Q8_0.gguf && \ 32 | curl --output mmproj-tinygemma3.gguf -L https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/mmproj-tinygemma3.gguf && \ 33 | curl --output Qwen3-Embedding-0.6B-Q8_0.gguf -L https://huggingface.co/Qwen/Qwen3-Embedding-0.6B-GGUF/resolve/main/Qwen3-Embedding-0.6B-Q8_0.gguf && \ 34 | curl --output bge-reranker-v2-m3-Q2_K.gguf -L https://modelscope.cn/models/gpustack/bge-reranker-v2-m3-GGUF/resolve/master/bge-reranker-v2-m3-Q2_K.gguf 35 | 36 | download: $(MODEL) 37 | @echo "minimal model downloaded to models directory" 38 | -------------------------------------------------------------------------------- /scripts/get-releases.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Function to get all releases 4 | get_all_releases() { 5 | local page=1 6 | local per_page=100 7 | local releases="" 8 | local new_releases 9 | 10 | # Prepare headers 11 | local headers=(-H "Accept: application/vnd.github.v3+json") 12 | if [ -n "$GITHUB_TOKEN" ]; then 13 | headers+=(-H "Authorization: Bearer $GITHUB_TOKEN") 14 | fi 15 | 16 | while true; do 17 | response=$(curl -s "${headers[@]}" \ 18 | "https://api.github.com/repos/xorbitsai/xllamacpp/releases?page=$page&per_page=$per_page") 19 | 20 | # Check if the response is valid JSON 21 | if ! echo "$response" | jq empty > /dev/null 2>&1; then 22 | echo "Error: Invalid response from GitHub API" >&2 23 | echo "Response: $response" >&2 24 | return 1 25 | fi 26 | 27 | new_releases=$(echo "$response" | jq -r '.[].tag_name') 28 | if [ -z "$new_releases" ]; then 29 | break 30 | fi 31 | releases="$releases $new_releases" 32 | ((page++)) 33 | done 34 | 35 | echo $releases 36 | } 37 | 38 | # Get all releases and save to file 39 | releases=$(get_all_releases) 40 | if [ $? -ne 0 ]; then 41 | echo "Failed to fetch releases. 
Please check your internet connection and try again later." >&2 42 | exit 1 43 | fi 44 | 45 | echo "$releases" | tr ' ' '\n' > all_releases.txt 46 | 47 | echo "All releases have been saved to all_releases.txt" 48 | -------------------------------------------------------------------------------- /src/xllamacpp/server.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "common.h" 7 | 8 | struct server_context; 9 | struct server_routes; 10 | 11 | namespace xllamacpp { 12 | 13 | std::string get_system_info(); 14 | 15 | std::vector get_device_info(); 16 | 17 | typedef bool (*Callback)(std::string &&, void *py_cb); 18 | 19 | // Convert a JSON schema string into a llama.cpp grammar string for structured 20 | // outputs 21 | std::string json_schema_to_grammar_str(const std::string &schema_json_str); 22 | 23 | class Server { 24 | public: 25 | Server(const common_params ¶ms); 26 | ~Server(); 27 | 28 | std::string listening_address() const; 29 | 30 | std::string handle_metrics(); 31 | 32 | std::string handle_embeddings(const std::string &input_json_str); 33 | 34 | std::string handle_rerank(const std::string &input_json_str); 35 | 36 | void handle_completions(const std::string &prompt_json_str, 37 | Callback res_error, void *py_cb_error, 38 | Callback res_ok, void *py_cb_ok); 39 | 40 | void handle_chat_completions(const std::string &prompt_json_str, 41 | Callback res_error, void *py_cb_error, 42 | Callback res_ok, void *py_cb_ok); 43 | 44 | private: 45 | common_params _params; 46 | std::string _listening_address; 47 | // Incomplete type of server_context 48 | std::shared_ptr _ctx_server; 49 | std::shared_ptr _routes; 50 | std::thread _loop_thread; 51 | }; 52 | 53 | void parse_tensor_buffer_overrides( 54 | const std::string &value, 55 | std::vector &overrides); 56 | void build_tensor_buffer_overrides( 57 | const std::vector &overrides, 58 | std::string &value); 59 | } // namespace xllamacpp 60 | -------------------------------------------------------------------------------- /src/xllamacpp/server.pxd: -------------------------------------------------------------------------------- 1 | # distutils: language=c++ 2 | 3 | from xllamacpp.xllamacpp cimport common_params, ggml_backend_dev_props, llama_model_tensor_buft_override 4 | from libcpp cimport bool as c_bool 5 | from libcpp.string cimport string as std_string 6 | from libcpp.vector cimport vector as std_vector 7 | 8 | cdef extern from "server.h" namespace "xllamacpp" nogil: 9 | std_string c_get_system_info "xllamacpp::get_system_info" () 10 | 11 | std_vector[ggml_backend_dev_props] c_get_device_info "xllamacpp::get_device_info" () 12 | 13 | std_string c_json_schema_to_grammar_str "xllamacpp::json_schema_to_grammar_str" (const std_string & schema_json_str) except + 14 | 15 | ctypedef c_bool (*Callback "xllamacpp::Callback")(std_string &&, void *py_cb) 16 | cdef cppclass CServer "xllamacpp::Server": 17 | 18 | CServer(const common_params& params) except + 19 | 20 | std_string listening_address() except + 21 | 22 | std_string handle_metrics() except + 23 | 24 | std_string handle_embeddings(const std_string &input_json_str) except + 25 | 26 | std_string handle_rerank(const std_string &input_json_str) except + 27 | 28 | void handle_completions(const std_string &prompt_json_str, 29 | Callback res_error, 30 | void *py_cb_error, 31 | Callback res_ok, 32 | void *py_cb_ok) except + 33 | 34 | void handle_chat_completions(const std_string &prompt_json_str, 35 | 
Callback res_error, 36 | void *py_cb_error, 37 | Callback res_ok, 38 | void *py_cb_ok) except + 39 | 40 | void c_parse_tensor_buffer_overrides "xllamacpp::parse_tensor_buffer_overrides" ( 41 | const std_string & value, std_vector[llama_model_tensor_buft_override] & overrides) except + 42 | void c_build_tensor_buffer_overrides "xllamacpp::build_tensor_buffer_overrides" ( 43 | const std_vector[llama_model_tensor_buft_override] & overrides, std_string & value) except + 44 | -------------------------------------------------------------------------------- /.github/workflows/release-github-pypi.yaml: -------------------------------------------------------------------------------- 1 | name: Wheels Index 2 | 3 | on: 4 | # Trigger on new release 5 | workflow_run: 6 | workflows: ["Build Wheels (CUDA & HIP)"] 7 | types: 8 | - completed 9 | 10 | # Allows you to run this workflow manually from the Actions tab 11 | workflow_dispatch: 12 | 13 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 14 | permissions: 15 | contents: read 16 | pages: write 17 | id-token: write 18 | 19 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 20 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 21 | concurrency: 22 | group: "pages" 23 | cancel-in-progress: false 24 | 25 | jobs: 26 | # Single deploy job since we're just deploying 27 | deploy: 28 | if: ${{ github.event.workflow_run.conclusion == 'success' }} 29 | environment: 30 | name: github-pages 31 | url: ${{ steps.deployment.outputs.page_url }} 32 | runs-on: ubuntu-latest 33 | steps: 34 | - name: Checkout 35 | uses: actions/checkout@v4 36 | - name: Setup Pages 37 | uses: actions/configure-pages@v5 38 | - name: Build 39 | env: 40 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 41 | run: | 42 | ./scripts/get-releases.sh 43 | ./scripts/releases-to-pep-503.sh index/whl/cu124 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$' 44 | ./scripts/releases-to-pep-503.sh index/whl/cu128 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu128$' 45 | ./scripts/releases-to-pep-503.sh index/whl/rocm-6.3.4 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-rocm-6.3.4$' 46 | ./scripts/releases-to-pep-503.sh index/whl/rocm-6.4.1 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-rocm-6.4.1$' 47 | ./scripts/releases-to-pep-503.sh index/whl/vulkan '^[v]?[0-9]+\.[0-9]+\.[0-9]+-vulkan*' 48 | - name: Upload artifact 49 | uses: actions/upload-pages-artifact@v3 50 | with: 51 | # Upload entire repository 52 | path: 'index' 53 | - name: Deploy to GitHub Pages 54 | id: deployment 55 | uses: actions/deploy-pages@v4 56 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: Python CI 2 | 3 | # on: 4 | # push: 5 | # tags: 6 | # - '*' 7 | # workflow_dispatch: 8 | 9 | on: 10 | push: 11 | branches: 12 | - '*' 13 | pull_request: 14 | types: ['opened', 'reopened', 'synchronize'] 15 | 16 | concurrency: 17 | group: ${{ github.workflow }}-${{ github.ref }} 18 | cancel-in-progress: true 19 | 20 | jobs: 21 | lint: 22 | runs-on: ${{ matrix.os }} 23 | strategy: 24 | fail-fast: false 25 | matrix: 26 | os: [ "ubuntu-latest" ] 27 | python-version: [ "3.11" ] 28 | steps: 29 | - name: Check out code 30 | uses: actions/checkout@v3 31 | with: 32 | fetch-depth: 0 33 | submodules: recursive 34 | - name: Set up Python environment 35 | uses: actions/setup-python@v4 36 | with: 37 | python-version: "3.11" 38 | - name: black 
39 | uses: psf/black@stable 40 | with: 41 | src: "src/xllamacpp" 42 | options: "--check --verbose" 43 | use_pyproject: true 44 | - name: clang-format 45 | uses: jidicula/clang-format-action@v4.15.0 46 | with: 47 | clang-format-version: '16' 48 | check-path: 'src/xllamacpp' 49 | 50 | build_test_job: 51 | runs-on: ${{ matrix.os }} 52 | needs: lint 53 | defaults: 54 | run: 55 | shell: bash -l {0} 56 | strategy: 57 | fail-fast: false 58 | matrix: 59 | os: [ "ubuntu-latest", "macos-latest", "windows-latest" ] 60 | python-version: [ "3.11", "3.13" ] 61 | 62 | steps: 63 | - name: Check out code 64 | uses: actions/checkout@v3 65 | with: 66 | fetch-depth: 0 67 | submodules: recursive 68 | 69 | - name: Set up conda ${{ matrix.python-version }} 70 | uses: conda-incubator/setup-miniconda@v3 71 | with: 72 | python-version: ${{ matrix.python-version }} 73 | 74 | # Fix "version `GLIBCXX_3.4.30' not found (required by xoscar_store.cpython-311-x86_64-linux-gnu.so)" issue 75 | - name: Install libstdcxx-ng 76 | if: ${{ matrix.os == 'ubuntu-latest' }} 77 | run: | 78 | conda install -c conda-forge libstdcxx-ng 79 | 80 | - name: Install dependencies 81 | env: 82 | OS: ${{ matrix.os }} 83 | run: | 84 | pip install -r requirements.txt 85 | pip install pytest pytest-timeout requests gguf orjson 86 | make clean 87 | make 88 | make download 89 | working-directory: . 90 | 91 | - name: Test with pytest 92 | run: | 93 | pytest --timeout=1500 -W ignore::PendingDeprecationWarning tests 94 | working-directory: . 95 | -------------------------------------------------------------------------------- /scripts/releases-to-pep-503.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Enable exit on error 4 | set -e 5 | 6 | # Function for logging 7 | log_error() { 8 | echo "ERROR: $1" >&2 9 | } 10 | 11 | log_info() { 12 | echo "INFO: $1" 13 | } 14 | 15 | # Get output directory or default to index/whl/cpu 16 | output_dir=${1:-"index/whl/cpu"} 17 | 18 | # Get pattern from second arg or default to valid python package version pattern 19 | pattern=${2:-"^[v]?[0-9]+\.[0-9]+\.[0-9]+$"} 20 | 21 | # Get the current directory (where the script is run from) 22 | current_dir="$(pwd)" 23 | 24 | # Check if all_releases.txt exists 25 | if [ ! -f "$current_dir/all_releases.txt" ]; then 26 | log_error "all_releases.txt not found in the current directory." 27 | exit 1 28 | fi 29 | 30 | # Create output directory 31 | mkdir -p "$output_dir" 32 | 33 | # Create an index html file 34 | cat << EOF > "$output_dir/index.html" 35 | 36 | 37 | 38 | 39 | xllamacpp 40 |
41 | 42 | 43 | 44 | EOF 45 | 46 | # Create xllamacpp directory 47 | mkdir -p "$output_dir/xllamacpp" 48 | 49 | # Create an index html file in xllamacpp directory 50 | cat << EOF > "$output_dir/xllamacpp/index.html" 51 | <!DOCTYPE html> 52 | <html> 53 | <head><title>Links for xllamacpp</title></head> 54 | <body>
55 | EOF 56 | 57 | # Temporary aggregation directory for per-version links 58 | tmp_dir="$output_dir/.tmp_xllamacpp_links" 59 | rm -rf "$tmp_dir" 60 | mkdir -p "$tmp_dir" 61 | 62 | # Filter releases by pattern 63 | releases=$(grep -E "$pattern" "$current_dir/all_releases.txt") 64 | 65 | # Prepare curl headers 66 | headers=('--header' 'Accept: application/vnd.github.v3+json') 67 | if [ -n "$GITHUB_TOKEN" ]; then 68 | headers+=('--header' "authorization: Bearer $GITHUB_TOKEN") 69 | fi 70 | headers+=('--header' 'content-type: application/json') 71 | 72 | # For each release, get all assets 73 | for release in $releases; do 74 | log_info "Processing release: $release" 75 | response=$(curl -s "${headers[@]}" \ 76 | "https://api.github.com/repos/xorbitsai/xllamacpp/releases/tags/$release") 77 | 78 | if [ -z "$response" ]; then 79 | log_error "Empty response from GitHub API for release $release" 80 | continue 81 | fi 82 | 83 | if ! echo "$response" | jq -e '.assets' > /dev/null 2>&1; then 84 | log_error "Invalid or unexpected response from GitHub API for release $release" 85 | log_error "Response: $response" 86 | continue 87 | fi 88 | 89 | # Get release version from release ie v0.1.0-cu121 -> v0.1.0 90 | release_version=$(echo "$release" | grep -oE "^[v]?[0-9]+\.[0-9]+\.[0-9]+") 91 | # Track first-seen order of versions 92 | if [ ! -f "$tmp_dir/.${release_version}.seen" ]; then 93 | echo "$release_version" >> "$tmp_dir/order.txt" 94 | : > "$tmp_dir/${release_version}.html" 95 | touch "$tmp_dir/.${release_version}.seen" 96 | fi 97 | 98 | wheel_urls=$(echo "$response" | jq -r '.assets[] | select(.name | endswith(".whl")) | .browser_download_url') 99 | if [ -z "$wheel_urls" ]; then 100 | log_error "No wheel files found for release $release" 101 | continue 102 | fi 103 | 104 | echo "$wheel_urls" | while read -r asset; do 105 | echo "    <a href=\"$asset\">$asset</a>" >> "$tmp_dir/${release_version}.html" 106 | echo "    <br/>" >> "$tmp_dir/${release_version}.html"
107 | done 108 | done 109 | 110 | if [ -f "$tmp_dir/order.txt" ]; then 111 | while IFS= read -r ver; do 112 | echo "    <h2>$ver</h2>
" >> "$output_dir/xllamacpp/index.html" 113 | cat "$tmp_dir/${ver}.html" >> "$output_dir/xllamacpp/index.html" 114 | done < "$tmp_dir/order.txt" 115 | fi 116 | 117 | # Close HTML and clean up 118 | echo " " >> "$output_dir/xllamacpp/index.html" 119 | echo "" >> "$output_dir/xllamacpp/index.html" 120 | echo "" >> "$output_dir/xllamacpp/index.html" 121 | 122 | rm -rf "$tmp_dir" 123 | 124 | log_info "Index generation complete. Output directory: $output_dir" 125 | -------------------------------------------------------------------------------- /scripts/setup.sh: -------------------------------------------------------------------------------- 1 | # scripts/setup.sh [download_last_working] [release-tag] 2 | # 3 | # setup.sh : (default run) downloads, builds and install last working release of llama.cpp 4 | # setup.sh 1 : like default 5 | # setup.sh 0 : downloads, builds and install bleeding edge llama.cpp from repo 6 | # setup.sh 1 : downloads, builds and install release of llama.cpp 7 | 8 | CWD=$(pwd) 9 | THIRDPARTY=${CWD}/thirdparty 10 | 11 | build_llamacpp() { 12 | echo "update from llama.cpp main repo" 13 | PROJECT=${THIRDPARTY}/llama.cpp 14 | PREFIX=${CWD}/src/llama.cpp 15 | NPROC=2 16 | cd ${PROJECT} && \ 17 | mkdir -p build && 18 | cd build 19 | # Base CMake arguments 20 | local cmake_args=( 21 | "-DBUILD_SHARED_LIBS=OFF" 22 | "-DCMAKE_POSITION_INDEPENDENT_CODE=ON" 23 | "-DCMAKE_INSTALL_LIBDIR=lib" 24 | "-DLLAMA_CURL=OFF" 25 | "-DLLAMA_LLGUIDANCE=ON" 26 | ) 27 | 28 | # Add any additional CMake arguments from environment 29 | if [ -n "${CMAKE_ARGS}" ]; then 30 | cmake_args+=(${CMAKE_ARGS}) 31 | fi 32 | 33 | # Build targets 34 | local targets=("common" "llama" "ggml" "ggml-cpu" "mtmd" "cpp-httplib" "server-context" "llama-server") 35 | 36 | if [[ -n "${XLLAMACPP_BUILD_CUDA}" ]]; then 37 | echo "Building for CUDA" 38 | cmake_args+=( 39 | "-DGGML_NATIVE=OFF" 40 | "-DGGML_CUDA=ON" 41 | "-DGGML_CUDA_FORCE_MMQ=ON" 42 | "-DCMAKE_CUDA_ARCHITECTURES=all" 43 | ) 44 | targets+=("ggml-cuda") 45 | elif [[ -n "${XLLAMACPP_BUILD_HIP}" ]]; then 46 | echo "Building for AMD GPU" 47 | cmake_args+=( 48 | "-DGGML_NATIVE=OFF" 49 | "-DAMDGPU_TARGETS=gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" 50 | "-DCMAKE_HIP_COMPILER=$(hipconfig -l)/clang" 51 | "-DGGML_HIP_ROCWMMA_FATTN=ON" 52 | "-DGGML_HIP=ON" 53 | ) 54 | targets+=("ggml-hip") 55 | elif [[ -n "${XLLAMACPP_BUILD_VULKAN}" ]]; then 56 | if [[ "$(uname -s)" == "Darwin" ]]; then 57 | cmake_args+=("-DCMAKE_BUILD_RPATH=@loader_path") 58 | if [[ "$(uname -m)" == "x86_64" ]]; then 59 | echo "Building for Intel with Vulkan" 60 | cmake_args+=( 61 | "-DGGML_METAL=OFF" 62 | "-DGGML_VULKAN=ON" 63 | ) 64 | targets+=("ggml-blas" "ggml-vulkan") 65 | else 66 | echo "Building for Apple Silicon with Vulkan is not supported" 67 | exit 1 68 | fi 69 | else 70 | echo "Building with Vulkan" 71 | cmake_args+=( 72 | "-DGGML_NATIVE=OFF" 73 | "-DGGML_VULKAN=ON" 74 | ) 75 | targets+=("ggml-vulkan") 76 | fi 77 | elif [[ -n "${XLLAMACPP_BUILD_AARCH64}" ]]; then 78 | echo "Building for aarch64" 79 | cmake_args+=( 80 | "-DGGML_NATIVE=OFF" 81 | "-DGGML_CPU_ARM_ARCH=armv8-a" 82 | ) 83 | # Add ggml-blas target if BLAS is enabled via CMAKE_ARGS 84 | if [[ "${CMAKE_ARGS:-}" == *"-DGGML_BLAS=ON"* ]]; then 85 | echo "BLAS is enabled via CMAKE_ARGS, adding ggml-blas to build targets" 86 | targets+=("ggml-blas") 87 | fi 88 | else 89 | if [[ "$(uname -s)" == "Darwin" ]]; then 90 | cmake_args+=("-DCMAKE_BUILD_RPATH=@loader_path") 91 | if [[ "$(uname -m)" == "x86_64" ]]; then 92 | echo "Building for 
Intel" 93 | cmake_args+=("-DGGML_METAL=OFF") 94 | targets+=("ggml-blas") 95 | else 96 | echo "Building for Apple Silicon" 97 | cmake_args+=("-DGGML_METAL_EMBED_LIBRARY=ON") 98 | targets+=("ggml-blas" "ggml-metal") 99 | fi 100 | else 101 | echo "Building for non-MacOS CPU (optimize for native CPU)" 102 | # Let CMake handle GGML_BLAS from environment 103 | if [[ "${CMAKE_ARGS:-}" == *"-DGGML_BLAS=ON"* ]]; then 104 | echo "BLAS is enabled via CMAKE_ARGS, adding ggml-blas to build targets" 105 | targets+=("ggml-blas") 106 | fi 107 | fi 108 | fi 109 | 110 | # Run CMake and build 111 | echo "Running CMake with arguments: ${cmake_args[*]}" 112 | echo "Building targets: ${targets[*]}" 113 | 114 | cmake .. "${cmake_args[@]}" && \ 115 | cmake --build . --config Release -j ${NPROC} --target "${targets[@]}" 116 | rm -rf ${PREFIX} 117 | python ${CWD}/scripts/copy_libs.py 118 | cd ${CWD} 119 | } 120 | 121 | build_llamacpp 122 | -------------------------------------------------------------------------------- /scripts/copy_libs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Script to copy: 4 | 1. All .a and .lib files from thirdparty/llama.cpp/build to src/llama.cpp/lib 5 | 2. All .h, .hpp files from thirdparty/llama.cpp to src/llama.cpp/include 6 | 3. All .cpp, .cc files from thirdparty/llama.cpp to src/llama.cpp/src 7 | """ 8 | 9 | import os 10 | import shutil 11 | import glob 12 | import logging 13 | 14 | ROOT = os.path.join(os.path.dirname(__file__), "..") 15 | 16 | 17 | def copy_library_files(): 18 | # Define source and destination directories 19 | src_dir = os.path.join(ROOT, "thirdparty", "llama.cpp", "build") 20 | dst_dir = os.path.join(ROOT, "src", "llama.cpp", "lib") 21 | 22 | # Create destination directory if it doesn't exist 23 | if not os.path.exists(dst_dir): 24 | logging.info(f"Creating destination directory: {dst_dir}") 25 | os.makedirs(dst_dir, exist_ok=True) 26 | 27 | # Recursively find all static library files 28 | lib_files = [] 29 | for ext in (".a", ".lib"): 30 | pattern = os.path.join(src_dir, "**", f"*{ext}") 31 | lib_files.extend(glob.glob(pattern, recursive=True)) 32 | 33 | if not lib_files: 34 | logging.warning(f"No .a or .lib files found in {src_dir}") 35 | return 36 | 37 | linked_count = 0 38 | skipped_count = 0 39 | 40 | for lib_file in lib_files: 41 | filename = os.path.basename(lib_file) 42 | dst_file = os.path.join(dst_dir, filename) 43 | 44 | # Skip if link or file already exists 45 | if os.path.exists(dst_file): 46 | logging.info(f"Skipping {filename} - already exists at {dst_file}") 47 | skipped_count += 1 48 | continue 49 | 50 | logging.info(f"Linking {lib_file} -> {dst_file}") 51 | os.symlink(lib_file, dst_file) 52 | linked_count += 1 53 | 54 | logging.info( 55 | f"Successfully linked {linked_count} libraries to {dst_dir} " 56 | f"({skipped_count} skipped)" 57 | ) 58 | 59 | 60 | def copy_source_files(target, source_paths): 61 | # Define source base directory and destination directory 62 | src_base = os.path.join(ROOT, "thirdparty", "llama.cpp") 63 | dst_dir = os.path.join(ROOT, "src", "llama.cpp", target) 64 | 65 | # Create destination directory if it doesn't exist 66 | if not os.path.exists(dst_dir): 67 | logging.info(f"Creating destination directory: {dst_dir}") 68 | os.makedirs(dst_dir, exist_ok=True) 69 | 70 | # Copy each file to destination 71 | copied_count = 0 72 | skipped_count = 0 73 | for rel_path in source_paths: 74 | src_file = os.path.join(src_base, rel_path) 75 | 76 | # Skip if 
source file doesn't exist 77 | if not os.path.exists(src_file): 78 | logging.warning(f"Source file not found: {src_file}") 79 | continue 80 | 81 | # Create subdirectories in destination if needed 82 | filename = os.path.basename(rel_path) 83 | dst_file = os.path.join(dst_dir, filename) 84 | 85 | # Skip if destination file already exists 86 | if os.path.exists(dst_file): 87 | logging.info(f"Skipping {filename} - already exists at {dst_file}") 88 | skipped_count += 1 89 | continue 90 | 91 | logging.info(f"Copying {src_file} to {dst_file}") 92 | shutil.copy2(src_file, dst_file) 93 | copied_count += 1 94 | 95 | logging.info( 96 | f"Successfully copied {copied_count} source files to {dst_dir} ({skipped_count} skipped)" 97 | ) 98 | 99 | 100 | def main(): 101 | # Setup logging 102 | logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s") 103 | 104 | # Copy library files 105 | copy_library_files() 106 | 107 | # Copy header files 108 | copy_source_files( 109 | "include", 110 | [ 111 | "common/common.h", 112 | "ggml/include/ggml.h", 113 | "ggml/include/ggml-backend.h", 114 | "include/llama.h", 115 | ], 116 | ) 117 | copy_source_files( 118 | "src", 119 | [ 120 | "tools/server/server.cpp", 121 | ], 122 | ) 123 | 124 | 125 | if __name__ == "__main__": 126 | main() 127 | -------------------------------------------------------------------------------- /tests/test_memory.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025 XProbe Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
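# A rough usage sketch of the memory-planning API exercised in this file (the device
# dictionaries and model path below are illustrative only; the keys mirror the ones
# used in the tests, and real free/min memory values would come from the active GPU
# backend):
#
#     from xllamacpp import estimate_gpu_layers
#     gpus = [{"name": "cuda", "memory_min": 2048, "memory_free": 8 * 1024**3}]
#     est = estimate_gpu_layers(gpus, "some-model.gguf", [], context_length=2048,
#                               batch_size=512, num_parallel=1, kv_cache_type="")
#     # est.layers, est.tensor_split, est.gpu_sizes, est.vram_size, est.total_size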
14 | import os.path 15 | import json 16 | from dataclasses import dataclass 17 | 18 | from xllamacpp import estimate_gpu_layers 19 | from xllamacpp.memory import graph_size 20 | 21 | TEST_GGUF = os.path.join(os.path.dirname(os.path.abspath(__file__)), "dummy.gguf") 22 | TEST_METADATA_JSON = os.path.join( 23 | os.path.dirname(os.path.abspath(__file__)), "bge-m3-metadata.json" 24 | ) 25 | 26 | 27 | def test_estimate_gpu_layers(): 28 | estimate = estimate_gpu_layers( 29 | [{"name": "CPU", "memory_free": 0}], 30 | TEST_GGUF, 31 | [], 32 | context_length=2048, 33 | batch_size=512, 34 | num_parallel=1, 35 | kv_cache_type="", 36 | ) 37 | assert estimate.layers == 0 38 | assert estimate.graph == 0 39 | 40 | graph_partial_offload = 202377216 41 | graph_full_offload = 171968512 42 | layer_size = 33554436 43 | projector_size = 0 44 | memory_layer_output = 4 45 | gpu_minimum_memory = 2048 46 | 47 | gpus = [ 48 | {"name": "cuda", "memory_min": gpu_minimum_memory}, 49 | {"name": "cuda", "memory_min": gpu_minimum_memory}, 50 | ] 51 | 52 | @dataclass 53 | class _TestInfo: 54 | layer0: int # type: ignore 55 | layer1: int # type: ignore 56 | expect0: int # type: ignore 57 | expect1: int # type: ignore 58 | 59 | test_data = [ 60 | _TestInfo(*v) 61 | for v in [ 62 | [1, 1, 1, 1], 63 | [2, 1, 2, 1], 64 | [2, 2, 2, 2], 65 | [1, 2, 1, 2], 66 | [3, 3, 3, 3], 67 | [4, 4, 3, 3], 68 | [6, 6, 3, 3], 69 | [0, 3, 0, 3], 70 | ] 71 | ] 72 | for i, s in enumerate(test_data): 73 | gpus[0]["memory_free"] = 0 74 | gpus[1]["memory_free"] = 0 75 | gpus[0]["memory_free"] += projector_size 76 | if s.layer0 > 0: 77 | gpus[0]["memory_free"] += memory_layer_output 78 | else: 79 | gpus[1]["memory_free"] += memory_layer_output 80 | gpus[0]["memory_free"] += ( 81 | gpu_minimum_memory + layer_size + s.layer0 * layer_size + 1 82 | ) 83 | gpus[1]["memory_free"] += ( 84 | gpu_minimum_memory + layer_size + s.layer1 * layer_size + 1 85 | ) 86 | gpus[0]["memory_free"] += max(graph_full_offload, graph_partial_offload) 87 | gpus[1]["memory_free"] += max(graph_full_offload, graph_partial_offload) 88 | estimate = estimate_gpu_layers( 89 | gpus, 90 | TEST_GGUF, 91 | [], 92 | context_length=2048, 93 | batch_size=512, 94 | num_parallel=1, 95 | kv_cache_type="", 96 | ) 97 | assert s.expect0 + s.expect1 == estimate.layers 98 | assert [ 99 | s.expect0 / estimate.layers, 100 | s.expect1 / estimate.layers, 101 | ] == estimate.tensor_split 102 | layer_sums = sum(estimate.gpu_sizes) 103 | if estimate.layers < 6: 104 | assert estimate.vram_size < estimate.total_size 105 | assert estimate.vram_size == layer_sums 106 | else: 107 | assert estimate.vram_size == estimate.total_size 108 | assert estimate.total_size == layer_sums 109 | 110 | 111 | def test_missing_keys(): 112 | with open(TEST_METADATA_JSON, "r") as f: 113 | metadata = json.load(f) 114 | kv, partial_offload, full_offload = graph_size( 115 | metadata, context_length=4096, batch_size=2048, num_parallel=8, kv_cache_type="" 116 | ) 117 | assert full_offload == 67108864.0 118 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | 3 | vscode/ 4 | build/ 5 | models/ 6 | 7 | src/xllamacpp/xllamacpp.cpp 8 | src/llama.cpp/bin/ 9 | src/llama.cpp/lib/ 10 | src/llama.cpp/include/* 11 | !src/llama.cpp/include/common.h 12 | !src/llama.cpp/include/ggml.h 13 | !src/llama.cpp/include/ggml-backend.h 14 | !src/llama.cpp/include/llama.h 15 | tests/*.cpp 16 | changes.diff 17 | 18 | 
# editor 19 | .vscode 20 | 21 | # project detritus 22 | .ruff_cache 23 | 24 | 25 | # Byte-compiled / optimized / DLL files 26 | __pycache__/ 27 | *.py[cod] 28 | *$py.class 29 | 30 | # C extensions 31 | *.so 32 | 33 | # Distribution / packaging 34 | .Python 35 | build/ 36 | develop-eggs/ 37 | dist/ 38 | downloads/ 39 | eggs/ 40 | .eggs/ 41 | lib/ 42 | lib64/ 43 | parts/ 44 | sdist/ 45 | var/ 46 | wheels/ 47 | share/python-wheels/ 48 | *.egg-info/ 49 | .installed.cfg 50 | *.egg 51 | MANIFEST 52 | 53 | # PyInstaller 54 | # Usually these files are written by a python script from a template 55 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 56 | *.manifest 57 | *.spec 58 | 59 | # Installer logs 60 | pip-log.txt 61 | pip-delete-this-directory.txt 62 | 63 | # Unit test / coverage reports 64 | htmlcov/ 65 | .tox/ 66 | .nox/ 67 | .coverage 68 | .coverage.* 69 | .cache 70 | nosetests.xml 71 | coverage.xml 72 | *.cover 73 | *.py,cover 74 | .hypothesis/ 75 | .pytest_cache/ 76 | cover/ 77 | 78 | # Translations 79 | *.mo 80 | *.pot 81 | 82 | # Django stuff: 83 | *.log 84 | local_settings.py 85 | db.sqlite3 86 | db.sqlite3-journal 87 | 88 | # Flask stuff: 89 | instance/ 90 | .webassets-cache 91 | 92 | # Scrapy stuff: 93 | .scrapy 94 | 95 | # Sphinx documentation 96 | docs/_build/ 97 | 98 | # PyBuilder 99 | .pybuilder/ 100 | target/ 101 | 102 | # Jupyter Notebook 103 | .ipynb_checkpoints 104 | 105 | # IPython 106 | profile_default/ 107 | ipython_config.py 108 | 109 | # pyenv 110 | # For a library or package, you might want to ignore these files since the code is 111 | # intended to run in multiple environments; otherwise, check them in: 112 | # .python-version 113 | 114 | # pipenv 115 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 116 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 117 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 118 | # install all needed dependencies. 119 | #Pipfile.lock 120 | 121 | # poetry 122 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 123 | # This is especially recommended for binary packages to ensure reproducibility, and is more 124 | # commonly ignored for libraries. 125 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 126 | #poetry.lock 127 | 128 | # pdm 129 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 130 | #pdm.lock 131 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 132 | # in version control. 133 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 134 | .pdm.toml 135 | .pdm-python 136 | .pdm-build/ 137 | 138 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 139 | __pypackages__/ 140 | 141 | # Celery stuff 142 | celerybeat-schedule 143 | celerybeat.pid 144 | 145 | # SageMath parsed files 146 | *.sage.py 147 | 148 | # Environments 149 | .env 150 | .venv 151 | env/ 152 | venv/ 153 | ENV/ 154 | env.bak/ 155 | venv.bak/ 156 | 157 | # Spyder project settings 158 | .spyderproject 159 | .spyproject 160 | 161 | # Rope project settings 162 | .ropeproject 163 | 164 | # mkdocs documentation 165 | /site 166 | 167 | # mypy 168 | .mypy_cache/ 169 | .dmypy.json 170 | dmypy.json 171 | 172 | # Pyre type checker 173 | .pyre/ 174 | 175 | # pytype static type analyzer 176 | .pytype/ 177 | 178 | # Cython debug symbols 179 | cython_debug/ 180 | 181 | # PyCharm 182 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 183 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 184 | # and can be added to the global gitignore or merged into this file. For a more nuclear 185 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 186 | .idea/ 187 | -------------------------------------------------------------------------------- /assets/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /assets/logo-white.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /tests/bge-m3-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "GGUF.version": { 3 | "index": 0, 4 | "type": "UINT32", 5 | "offset": 4, 6 | "value": 3 7 | }, 8 | "GGUF.tensor_count": { 9 | "index": 1, 10 | "type": "UINT64", 11 | "offset": 8, 12 | "value": 389 13 | }, 14 | "GGUF.kv_count": { 15 | "index": 2, 16 | "type": "UINT64", 17 | "offset": 16, 18 | "value": 33 19 | }, 20 | "general.architecture": { 21 | "index": 3, 22 | "type": "STRING", 23 | "offset": 24, 24 | "value": "bert" 25 | }, 26 | "general.type": { 27 | "index": 4, 28 | "type": "STRING", 29 | "offset": 68, 30 | "value": "model" 31 | }, 32 | "general.size_label": { 33 | "index": 5, 34 | "type": "STRING", 35 | "offset": 105, 36 | "value": "567M" 37 | }, 38 | "general.license": { 39 | "index": 6, 40 | "type": "STRING", 41 | "offset": 147, 42 | "value": "mit" 43 | }, 44 | "general.tags": { 45 | "index": 7, 46 | "type": "ARRAY", 47 | "offset": 185, 48 | "array_types": [ 49 | "STRING" 50 | ], 51 | "value": [] 52 | }, 53 | "bert.block_count": { 54 | "index": 8, 55 | "type": "UINT32", 56 | "offset": 330, 57 | "value": 24 58 | }, 59 | "bert.context_length": { 60 | "index": 9, 61 | "type": "UINT32", 62 | "offset": 362, 63 | "value": 8192 64 | }, 65 | "bert.embedding_length": { 66 | "index": 10, 67 | "type": "UINT32", 68 | "offset": 397, 69 | "value": 1024 70 | }, 71 | "bert.feed_forward_length": { 72 | "index": 11, 73 | "type": "UINT32", 74 | "offset": 434, 75 | "value": 4096 76 | }, 77 | "bert.attention.head_count": { 78 | "index": 12, 79 | "type": "UINT32", 80 | "offset": 474, 81 | "value": 16 82 | }, 83 | "bert.attention.layer_norm_epsilon": { 84 | "index": 13, 85 | "type": "FLOAT32", 86 | "offset": 515, 87 | "value": 0.000009999999747378752 88 | 
}, 89 | "general.file_type": { 90 | "index": 14, 91 | "type": "UINT32", 92 | "offset": 564, 93 | "value": 15 94 | }, 95 | "bert.attention.causal": { 96 | "index": 15, 97 | "type": "BOOL", 98 | "offset": 597, 99 | "value": false 100 | }, 101 | "bert.pooling_type": { 102 | "index": 16, 103 | "type": "UINT32", 104 | "offset": 631, 105 | "value": 2 106 | }, 107 | "tokenizer.ggml.model": { 108 | "index": 17, 109 | "type": "STRING", 110 | "offset": 664, 111 | "value": "t5" 112 | }, 113 | "tokenizer.ggml.pre": { 114 | "index": 18, 115 | "type": "STRING", 116 | "offset": 706, 117 | "value": "default" 118 | }, 119 | "tokenizer.ggml.tokens": { 120 | "index": 19, 121 | "type": "ARRAY", 122 | "offset": 751, 123 | "array_types": [ 124 | "STRING" 125 | ], 126 | "value": [] 127 | }, 128 | "tokenizer.ggml.scores": { 129 | "index": 20, 130 | "type": "ARRAY", 131 | "offset": 4582162, 132 | "array_types": [ 133 | "FLOAT32" 134 | ], 135 | "value": [] 136 | }, 137 | "tokenizer.ggml.token_type": { 138 | "index": 21, 139 | "type": "ARRAY", 140 | "offset": 5582215, 141 | "array_types": [ 142 | "INT32" 143 | ], 144 | "value": [] 145 | }, 146 | "tokenizer.ggml.add_space_prefix": { 147 | "index": 22, 148 | "type": "BOOL", 149 | "offset": 6582272, 150 | "value": true 151 | }, 152 | "tokenizer.ggml.token_type_count": { 153 | "index": 23, 154 | "type": "UINT32", 155 | "offset": 6582316, 156 | "value": 1 157 | }, 158 | "tokenizer.ggml.remove_extra_whitespaces": { 159 | "index": 24, 160 | "type": "BOOL", 161 | "offset": 6582363, 162 | "value": true 163 | }, 164 | "tokenizer.ggml.precompiled_charsmap": { 165 | "index": 25, 166 | "type": "ARRAY", 167 | "offset": 6582415, 168 | "array_types": [ 169 | "UINT8" 170 | ], 171 | "value": [] 172 | }, 173 | "tokenizer.ggml.bos_token_id": { 174 | "index": 26, 175 | "type": "UINT32", 176 | "offset": 6820013, 177 | "value": 0 178 | }, 179 | "tokenizer.ggml.eos_token_id": { 180 | "index": 27, 181 | "type": "UINT32", 182 | "offset": 6820056, 183 | "value": 2 184 | }, 185 | "tokenizer.ggml.unknown_token_id": { 186 | "index": 28, 187 | "type": "UINT32", 188 | "offset": 6820099, 189 | "value": 3 190 | }, 191 | "tokenizer.ggml.seperator_token_id": { 192 | "index": 29, 193 | "type": "UINT32", 194 | "offset": 6820146, 195 | "value": 2 196 | }, 197 | "tokenizer.ggml.padding_token_id": { 198 | "index": 30, 199 | "type": "UINT32", 200 | "offset": 6820195, 201 | "value": 1 202 | }, 203 | "tokenizer.ggml.cls_token_id": { 204 | "index": 31, 205 | "type": "UINT32", 206 | "offset": 6820242, 207 | "value": 0 208 | }, 209 | "tokenizer.ggml.mask_token_id": { 210 | "index": 32, 211 | "type": "UINT32", 212 | "offset": 6820285, 213 | "value": 250001 214 | }, 215 | "tokenizer.ggml.add_bos_token": { 216 | "index": 33, 217 | "type": "BOOL", 218 | "offset": 6820329, 219 | "value": true 220 | }, 221 | "tokenizer.ggml.add_eos_token": { 222 | "index": 34, 223 | "type": "BOOL", 224 | "offset": 6820370, 225 | "value": true 226 | }, 227 | "general.quantization_version": { 228 | "index": 35, 229 | "type": "UINT32", 230 | "offset": 6820411, 231 | "value": 2 232 | } 233 | } 234 | -------------------------------------------------------------------------------- /.github/workflows/build-wheel.yaml: -------------------------------------------------------------------------------- 1 | name: Build and upload to PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | workflow_dispatch: 8 | 9 | concurrency: 10 | group: ${{ github.workflow }}-${{ github.ref }} 11 | cancel-in-progress: true 12 | 13 | jobs: 14 | build_wheels: 15 | 
name: Build wheels on ${{ matrix.os }} for Python ${{ matrix.python }} ${{ matrix.arch }} 16 | runs-on: ${{ matrix.os }} 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | include: 21 | # Linux x86_64 22 | - os: ubuntu-latest 23 | arch: auto 24 | platform-id: manylinux_x86_64 25 | python: 310 26 | requires-python: ">=3.10,<3.11" 27 | - os: ubuntu-latest 28 | arch: auto 29 | platform-id: manylinux_x86_64 30 | python: 311 31 | requires-python: ">=3.11,<3.12" 32 | - os: ubuntu-latest 33 | arch: auto 34 | platform-id: manylinux_x86_64 35 | python: 312 36 | requires-python: ">=3.12,<3.13" 37 | - os: ubuntu-latest 38 | arch: auto 39 | platform-id: manylinux_x86_64 40 | python: 313 41 | requires-python: ">=3.13,<3.14" 42 | 43 | # Linux aarch64 44 | - os: ubuntu-22.04-arm 45 | arch: aarch64 46 | platform-id: manylinux_aarch64 47 | python: 310 48 | requires-python: ">=3.10,<3.11" 49 | - os: ubuntu-22.04-arm 50 | arch: aarch64 51 | platform-id: manylinux_aarch64 52 | python: 311 53 | requires-python: ">=3.11,<3.12" 54 | - os: ubuntu-22.04-arm 55 | arch: aarch64 56 | platform-id: manylinux_aarch64 57 | python: 312 58 | requires-python: ">=3.12,<3.13" 59 | - os: ubuntu-22.04-arm 60 | arch: aarch64 61 | platform-id: manylinux_aarch64 62 | python: 313 63 | requires-python: ">=3.13,<3.14" 64 | 65 | # macOS x86_64 66 | - os: macos-15-intel 67 | arch: x86_64 68 | platform-id: macosx_x86_64 69 | python: 310 70 | requires-python: ">=3.10,<3.11" 71 | - os: macos-15-intel 72 | arch: x86_64 73 | platform-id: macosx_x86_64 74 | python: 311 75 | requires-python: ">=3.11,<3.12" 76 | - os: macos-15-intel 77 | arch: x86_64 78 | platform-id: macosx_x86_64 79 | python: 312 80 | requires-python: ">=3.12,<3.13" 81 | - os: macos-15-intel 82 | arch: x86_64 83 | platform-id: macosx_x86_64 84 | python: 313 85 | requires-python: ">=3.13,<3.14" 86 | 87 | # macOS arm64 88 | - os: macos-14 89 | arch: arm64 90 | platform-id: macosx_arm64 91 | python: 310 92 | requires-python: ">=3.10,<3.11" 93 | - os: macos-14 94 | arch: arm64 95 | platform-id: macosx_arm64 96 | python: 311 97 | requires-python: ">=3.11,<3.12" 98 | - os: macos-14 99 | arch: arm64 100 | platform-id: macosx_arm64 101 | python: 312 102 | requires-python: ">=3.12,<3.13" 103 | - os: macos-14 104 | arch: arm64 105 | platform-id: macosx_arm64 106 | python: 313 107 | requires-python: ">=3.13,<3.14" 108 | 109 | # Windows AMD64 110 | - os: windows-2022 111 | arch: AMD64 112 | platform-id: win_amd64 113 | python: 310 114 | requires-python: ">=3.10,<3.11" 115 | - os: windows-2022 116 | arch: AMD64 117 | platform-id: win_amd64 118 | python: 311 119 | requires-python: ">=3.11,<3.12" 120 | - os: windows-2022 121 | arch: AMD64 122 | platform-id: win_amd64 123 | python: 312 124 | requires-python: ">=3.12,<3.13" 125 | - os: windows-2022 126 | arch: AMD64 127 | platform-id: win_amd64 128 | python: 313 129 | requires-python: ">=3.13,<3.14" 130 | 131 | steps: 132 | - uses: actions/checkout@v4 133 | with: 134 | fetch-depth: 0 135 | submodules: recursive 136 | 137 | - name: Add msbuild to PATH 138 | if: ${{ matrix.os == 'windows-latest'}} 139 | uses: microsoft/setup-msbuild@v2 140 | with: 141 | vs-version: '[17.13,17.14)' 142 | 143 | - name: Build wheels 144 | uses: pypa/cibuildwheel@v2.22.0 145 | env: 146 | VERSIONEER_CLOSEST_TAG_ONLY: 1 147 | CIBW_SKIP: pp* *i686 148 | CIBW_ARCHS: ${{ matrix.arch }} 149 | CIBW_PROJECT_REQUIRES_PYTHON: ${{ matrix.requires-python }} 150 | CIBW_TEST_REQUIRES: pytest requests pytest-asyncio pytest-timeout 151 | CIBW_BEFORE_ALL_LINUX: curl --proto 
'=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && source $HOME/.cargo/env 152 | CIBW_BEFORE_BUILD: pip install -r requirements.txt && make 153 | CIBW_BUILD_VERBOSITY: 1 154 | CIBW_ENVIRONMENT_LINUX: "PATH=$HOME/.cargo/bin:$PATH XLLAMACPP_BUILD_AARCH64=${{ matrix.arch == 'aarch64' && '1' || '' }}" 155 | CIBW_ENVIRONMENT_MACOS: "XLLAMACPP_BUILD_AARCH64=${{ matrix.arch == 'aarch64' && '1' || '' }}" 156 | CIBW_ENVIRONMENT_WINDOWS: "XLLAMACPP_BUILD_AARCH64=${{ matrix.arch == 'aarch64' && '1' || '' }}" 157 | with: 158 | package-dir: ./ 159 | 160 | - uses: actions/upload-artifact@v4 161 | with: 162 | name: wheel-${{ matrix.python }}-${{ matrix.platform-id }} 163 | path: wheelhouse/*.whl 164 | 165 | build_sdist: 166 | name: Build source distribution 167 | runs-on: ubuntu-latest 168 | steps: 169 | - uses: actions/checkout@v4 170 | with: 171 | fetch-depth: 0 172 | submodules: recursive 173 | 174 | - name: Build sdist 175 | run: pipx run build --sdist 176 | 177 | - uses: actions/upload-artifact@v4 178 | with: 179 | name: artifacts 180 | path: ./dist/*.tar.gz 181 | 182 | upload_pypi: 183 | needs: [build_wheels, build_sdist] 184 | runs-on: ubuntu-latest 185 | # upload to PyPI on every tag starting with 'v' 186 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') 187 | steps: 188 | - uses: actions/download-artifact@v4 189 | with: 190 | path: dist 191 | pattern: 'wheel-*' 192 | merge-multiple: true 193 | 194 | - uses: actions/download-artifact@v4 195 | with: 196 | path: dist 197 | name: artifacts 198 | 199 | - name: Publish to PyPI 200 | if: github.repository == 'xorbitsai/xllamacpp' 201 | uses: pypa/gh-action-pypi-publish@v1.12.4 202 | with: 203 | user: __token__ 204 | password: ${{ secrets.PYPI_PASSWORD }} 205 | 206 | - name: Publish to Test PyPI 207 | if: github.repository != 'xorbitsai/xllamacpp' 208 | uses: pypa/gh-action-pypi-publish@v1.12.4 209 | with: 210 | user: __token__ 211 | password: ${{ secrets.TEST_PYPI_PASSWORD }} 212 | verbose: true 213 | repository_url: https://test.pypi.org/legacy/ 214 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import os 4 | import sys 5 | import platform 6 | import subprocess 7 | from setuptools import Extension, setup 8 | 9 | from Cython.Build import cythonize 10 | 11 | # ----------------------------------------------------------------------------- 12 | # constants 13 | 14 | BUILD_CUDA = os.getenv("XLLAMACPP_BUILD_CUDA") 15 | BUILD_HIP = os.getenv("XLLAMACPP_BUILD_HIP") 16 | BUILD_VULKAN = os.getenv("XLLAMACPP_BUILD_VULKAN") 17 | NAME = "xllamacpp" 18 | # NAME = "xllamacpp-cuda12x" if BUILD_CUDA else "xllamacpp" 19 | CWD = os.path.dirname(os.path.abspath(__file__)) 20 | 21 | sys.path.insert(0, CWD) 22 | import versioneer 23 | 24 | VERSION = versioneer.get_version() 25 | 26 | PLATFORM = platform.system() 27 | 28 | LLAMACPP_LIBS_DIR = os.path.join(CWD, "src/llama.cpp/lib") 29 | 30 | DEFINE_MACROS = [] 31 | if PLATFORM == "Windows": 32 | EXTRA_COMPILE_ARGS = ["/std:c++17"] 33 | else: 34 | EXTRA_COMPILE_ARGS = ["-std=c++17"] 35 | if PLATFORM == "Darwin": 36 | EXTRA_COMPILE_ARGS.append("-mmacosx-version-min=12.0") 37 | EXTRA_LINK_ARGS = [] 38 | EXTRA_OBJECTS = [] 39 | INCLUDE_DIRS = [ 40 | "src/xllamacpp", 41 | os.path.join(CWD, "thirdparty/llama.cpp/include"), 42 | os.path.join(CWD, "thirdparty/llama.cpp/common"), 43 | os.path.join(CWD, "thirdparty/llama.cpp/ggml/include"), 44 
| os.path.join( 45 | CWD, "thirdparty/llama.cpp" 46 | ), # For including 'common/base64.hpp' in server/utils.hpp 47 | os.path.join( 48 | CWD, "thirdparty/llama.cpp/build/tools/server" 49 | ), # For including index.html.gz.hpp and loading.html.hpp 50 | os.path.join(CWD, "thirdparty/llama.cpp/tools/server"), 51 | os.path.join(CWD, "thirdparty/llama.cpp/tools/mtmd"), 52 | os.path.join(CWD, "thirdparty/llama.cpp/vendor"), 53 | ] 54 | LIBRARY_DIRS = [ 55 | LLAMACPP_LIBS_DIR, 56 | ] 57 | LIBRARIES = [] 58 | 59 | if PLATFORM == "Windows": 60 | LIBRARIES.extend( 61 | [ 62 | "common", 63 | "llama", 64 | "ggml", 65 | "ggml-base", 66 | "ggml-cpu", 67 | "mtmd", 68 | "cpp-httplib", 69 | "server-context", 70 | "llguidance", 71 | "Advapi32", 72 | "userenv", 73 | "ntdll", 74 | ] 75 | ) 76 | if BUILD_CUDA: 77 | LIBRARY_DIRS.extend([os.getenv("CUDA_PATH", "") + "\\Lib\\x64"]) 78 | LIBRARIES.extend(["ggml-cuda", "cudart", "cublas", "cublasLt", "cuda"]) 79 | if BUILD_VULKAN: 80 | LIBRARY_DIRS.extend([os.getenv("VULKAN_SDK", "") + "\\Lib"]) 81 | LIBRARIES.extend(["ggml-vulkan", "vulkan-1"]) 82 | else: 83 | LIBRARIES.extend(["pthread"]) 84 | EXTRA_OBJECTS.extend( 85 | [ 86 | f"{LLAMACPP_LIBS_DIR}/libserver-context.a", 87 | f"{LLAMACPP_LIBS_DIR}/libcpp-httplib.a", 88 | f"{LLAMACPP_LIBS_DIR}/libmtmd.a", 89 | f"{LLAMACPP_LIBS_DIR}/libcommon.a", 90 | f"{LLAMACPP_LIBS_DIR}/libllguidance.a", 91 | f"{LLAMACPP_LIBS_DIR}/libllama.a", 92 | f"{LLAMACPP_LIBS_DIR}/libggml.a", 93 | f"{LLAMACPP_LIBS_DIR}/libggml-cpu.a", 94 | f"{LLAMACPP_LIBS_DIR}/libggml-base.a", 95 | ] 96 | ) 97 | if BUILD_CUDA: 98 | EXTRA_OBJECTS.extend( 99 | [ 100 | f"{LLAMACPP_LIBS_DIR}/libggml-cuda.a", 101 | ] 102 | ) 103 | LIBRARY_DIRS.extend( 104 | [ 105 | os.getenv("CUDA_PATH", "") + "/lib/stubs", 106 | os.getenv("CUDA_PATH", "") + "/lib", 107 | ], 108 | ) 109 | LIBRARIES.extend(["cudart", "cublas", "cublasLt", "cuda"]) 110 | if BUILD_HIP: 111 | EXTRA_OBJECTS.extend( 112 | [ 113 | f"{LLAMACPP_LIBS_DIR}/libggml-hip.a", 114 | ] 115 | ) 116 | LIBRARY_DIRS.extend(["/opt/rocm/lib"]) 117 | LIBRARIES.extend(["amdhip64", "hipblas", "rocblas"]) 118 | if BUILD_VULKAN: 119 | EXTRA_OBJECTS.extend( 120 | [ 121 | f"{LLAMACPP_LIBS_DIR}/libggml-vulkan.a", 122 | ] 123 | ) 124 | LIBRARIES.extend(["vulkan"]) 125 | 126 | if PLATFORM == "Darwin": 127 | EXTRA_LINK_ARGS.append("-Wl,-rpath," + LLAMACPP_LIBS_DIR) 128 | os.environ["LDFLAGS"] = " ".join( 129 | [ 130 | "-framework Accelerate", 131 | "-framework Foundation", 132 | "-framework Metal", 133 | "-framework MetalKit", 134 | ] 135 | ) 136 | # Both the Intel and ARM platforms need to be linked with BLAS. 
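# The ggml libraries are produced as static archives by scripts/setup.sh
# (BUILD_SHARED_LIBS=OFF), so on macOS they are handed to the linker explicitly via
# extra_objects rather than through `libraries`; the BLAS archive is added for both
# Intel and Apple Silicon, and the Metal backend archive is appended just below for
# Apple Silicon only.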
137 | EXTRA_OBJECTS.extend( 138 | [ 139 | f"{LLAMACPP_LIBS_DIR}/libggml-blas.a", 140 | ] 141 | ) 142 | if platform.processor() == "arm": 143 | EXTRA_OBJECTS.extend( 144 | [ 145 | f"{LLAMACPP_LIBS_DIR}/libggml-metal.a", 146 | ] 147 | ) 148 | elif PLATFORM == "Linux": 149 | EXTRA_LINK_ARGS.extend(["-fopenmp", "-static-libgcc"]) 150 | # Check if BLAS is enabled in environment 151 | if os.path.exists(f"{LLAMACPP_LIBS_DIR}/libggml-blas.a"): 152 | print("BLAS is enabled, adding ggml-blas to link targets") 153 | EXTRA_OBJECTS.extend([f"{LLAMACPP_LIBS_DIR}/libggml-blas.a"]) 154 | EXTRA_LINK_ARGS.extend(["-lopenblas"]) 155 | 156 | INCLUDE_DIRS.append(os.path.join(CWD, "src/xllamacpp")) 157 | 158 | 159 | def mk_extension(name, sources, define_macros=None): 160 | return Extension( 161 | name=name, 162 | sources=sources, 163 | define_macros=define_macros if define_macros else [], 164 | include_dirs=INCLUDE_DIRS, 165 | libraries=LIBRARIES, 166 | library_dirs=LIBRARY_DIRS, 167 | extra_objects=EXTRA_OBJECTS, 168 | extra_compile_args=EXTRA_COMPILE_ARGS, 169 | extra_link_args=EXTRA_LINK_ARGS, 170 | language="c++", 171 | ) 172 | 173 | 174 | # ---------------------------------------------------------------------------- 175 | # COMMON SETUP CONFIG 176 | 177 | common = { 178 | "name": NAME, 179 | "version": VERSION, 180 | "description": "A cython wrapper of the llama.cpp inference engine.", 181 | "python_requires": ">=3.9", 182 | "cmdclass": versioneer.get_cmdclass(), 183 | "license": "MIT", 184 | # "include_package_data": True, 185 | } 186 | 187 | 188 | # forces cythonize in this case 189 | subprocess.call("cythonize *.pyx", cwd="src/xllamacpp", shell=True) 190 | 191 | if not os.path.exists("MANIFEST.in"): 192 | with open("MANIFEST.in", "w") as f: 193 | f.write("exclude src/xllamacpp/*.pxd\n") 194 | f.write("exclude src/xllamacpp/*.pyx\n") 195 | f.write("exclude src/xllamacpp/*.cpp\n") 196 | f.write("exclude src/xllamacpp/*.h\n") 197 | f.write("exclude src/xllamacpp/py.typed\n") 198 | 199 | extensions = [ 200 | mk_extension( 201 | "xllamacpp.xllamacpp", 202 | sources=[ 203 | "src/xllamacpp/xllamacpp.pyx", 204 | "src/xllamacpp/server.cpp", 205 | "thirdparty/llama.cpp/tools/server/server-models.cpp", 206 | "thirdparty/llama.cpp/tools/server/server-http.cpp", 207 | ], 208 | ), 209 | ] 210 | 211 | setup( 212 | **common, 213 | ext_modules=cythonize( 214 | extensions, 215 | compiler_directives={ 216 | "language_level": "3", 217 | "embedsignature": False, # default: False 218 | "emit_code_comments": False, # default: True 219 | "warn.unused": True, # default: False 220 | }, 221 | ), 222 | package_dir={"": "src"}, 223 | ) 224 | -------------------------------------------------------------------------------- /scripts/.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | AccessModifierOffset: -4 4 | AlignAfterOpenBracket: DontAlign 5 | AlignArrayOfStructures: None 6 | AlignConsecutiveAssignments: 7 | Enabled: false 8 | AcrossEmptyLines: false 9 | AcrossComments: false 10 | AlignCompound: false 11 | AlignFunctionPointers: false 12 | PadOperators: true 13 | AlignConsecutiveBitFields: 14 | Enabled: false 15 | AcrossEmptyLines: false 16 | AcrossComments: false 17 | AlignCompound: false 18 | AlignFunctionPointers: false 19 | PadOperators: false 20 | AlignConsecutiveDeclarations: 21 | Enabled: false 22 | AcrossEmptyLines: false 23 | AcrossComments: false 24 | AlignCompound: false 25 | AlignFunctionPointers: false 26 | PadOperators: false 27 | 
AlignConsecutiveMacros: 28 | Enabled: false 29 | AcrossEmptyLines: false 30 | AcrossComments: false 31 | AlignCompound: false 32 | AlignFunctionPointers: false 33 | PadOperators: false 34 | AlignConsecutiveShortCaseStatements: 35 | Enabled: false 36 | AcrossEmptyLines: false 37 | AcrossComments: false 38 | AlignCaseArrows: false 39 | AlignCaseColons: false 40 | AlignConsecutiveTableGenBreakingDAGArgColons: 41 | Enabled: false 42 | AcrossEmptyLines: false 43 | AcrossComments: false 44 | AlignCompound: false 45 | AlignFunctionPointers: false 46 | PadOperators: false 47 | AlignConsecutiveTableGenCondOperatorColons: 48 | Enabled: false 49 | AcrossEmptyLines: false 50 | AcrossComments: false 51 | AlignCompound: false 52 | AlignFunctionPointers: false 53 | PadOperators: false 54 | AlignConsecutiveTableGenDefinitionColons: 55 | Enabled: false 56 | AcrossEmptyLines: false 57 | AcrossComments: false 58 | AlignCompound: false 59 | AlignFunctionPointers: false 60 | PadOperators: false 61 | AlignEscapedNewlines: Left 62 | AlignOperands: DontAlign 63 | AlignTrailingComments: 64 | Kind: Never 65 | OverEmptyLines: 0 66 | # AllowAllArgumentsOnNextLine: false 67 | # AllowAllParametersOfDeclarationOnNextLine: false 68 | AllowAllArgumentsOnNextLine: true 69 | AllowAllParametersOfDeclarationOnNextLine: true 70 | AllowBreakBeforeNoexceptSpecifier: Never 71 | AllowShortBlocksOnASingleLine: Empty 72 | AllowShortCaseExpressionOnASingleLine: true 73 | AllowShortCaseLabelsOnASingleLine: false 74 | AllowShortCompoundRequirementOnASingleLine: true 75 | AllowShortEnumsOnASingleLine: true 76 | AllowShortFunctionsOnASingleLine: None 77 | AllowShortIfStatementsOnASingleLine: Never 78 | AllowShortLambdasOnASingleLine: All 79 | AllowShortLoopsOnASingleLine: false 80 | AlwaysBreakAfterDefinitionReturnType: None 81 | AlwaysBreakBeforeMultilineStrings: false 82 | AttributeMacros: 83 | - __capability 84 | BinPackArguments: true 85 | BinPackParameters: true 86 | BitFieldColonSpacing: Both 87 | BraceWrapping: 88 | AfterCaseLabel: false 89 | AfterClass: false 90 | AfterControlStatement: Never 91 | AfterEnum: false 92 | AfterExternBlock: false 93 | AfterFunction: true 94 | AfterNamespace: false 95 | AfterObjCDeclaration: false 96 | AfterStruct: false 97 | AfterUnion: false 98 | BeforeCatch: false 99 | BeforeElse: false 100 | BeforeLambdaBody: false 101 | BeforeWhile: false 102 | IndentBraces: false 103 | SplitEmptyFunction: true 104 | SplitEmptyRecord: true 105 | SplitEmptyNamespace: true 106 | BreakAdjacentStringLiterals: true 107 | BreakAfterAttributes: Leave 108 | BreakAfterJavaFieldAnnotations: false 109 | BreakAfterReturnType: None 110 | BreakArrays: true 111 | BreakBeforeBinaryOperators: All 112 | BreakBeforeConceptDeclarations: Always 113 | BreakBeforeBraces: WebKit 114 | BreakBeforeInlineASMColon: OnlyMultiline 115 | BreakBeforeTernaryOperators: true 116 | BreakConstructorInitializers: BeforeComma 117 | BreakFunctionDefinitionParameters: false 118 | BreakInheritanceList: BeforeColon 119 | BreakStringLiterals: true 120 | BreakTemplateDeclarations: MultiLine 121 | ColumnLimit: 300 122 | CommentPragmas: '^ IWYU pragma:' 123 | CompactNamespaces: false 124 | ConstructorInitializerIndentWidth: 4 125 | ContinuationIndentWidth: 4 126 | Cpp11BracedListStyle: false 127 | DerivePointerAlignment: false 128 | DisableFormat: false 129 | EmptyLineAfterAccessModifier: Never 130 | EmptyLineBeforeAccessModifier: LogicalBlock 131 | ExperimentalAutoDetectBinPacking: false 132 | FixNamespaceComments: false 133 | ForEachMacros: 134 | - foreach 
135 | - Q_FOREACH 136 | - BOOST_FOREACH 137 | IfMacros: 138 | - KJ_IF_MAYBE 139 | IncludeBlocks: Preserve 140 | IncludeCategories: 141 | - Regex: '^"(llvm|llvm-c|clang|clang-c)/' 142 | Priority: 2 143 | SortPriority: 0 144 | CaseSensitive: false 145 | - Regex: '^(<|"(gtest|gmock|isl|json)/)' 146 | Priority: 3 147 | SortPriority: 0 148 | CaseSensitive: false 149 | - Regex: '.*' 150 | Priority: 1 151 | SortPriority: 0 152 | CaseSensitive: false 153 | IncludeIsMainRegex: '(Test)?$' 154 | IncludeIsMainSourceRegex: '' 155 | IndentAccessModifiers: false 156 | IndentCaseBlocks: false 157 | IndentCaseLabels: false 158 | IndentExternBlock: AfterExternBlock 159 | IndentGotoLabels: true 160 | IndentPPDirectives: None 161 | IndentRequiresClause: true 162 | IndentWidth: 4 163 | IndentWrappedFunctionNames: false 164 | InsertBraces: false 165 | InsertNewlineAtEOF: false 166 | InsertTrailingCommas: None 167 | IntegerLiteralSeparator: 168 | Binary: 0 169 | BinaryMinDigits: 0 170 | Decimal: 0 171 | DecimalMinDigits: 0 172 | Hex: 0 173 | HexMinDigits: 0 174 | JavaScriptQuotes: Leave 175 | JavaScriptWrapImports: true 176 | KeepEmptyLines: 177 | AtEndOfFile: false 178 | AtStartOfBlock: true 179 | AtStartOfFile: true 180 | LambdaBodyIndentation: Signature 181 | LineEnding: DeriveLF 182 | MacroBlockBegin: '' 183 | MacroBlockEnd: '' 184 | MainIncludeChar: Quote 185 | MaxEmptyLinesToKeep: 1 186 | NamespaceIndentation: Inner 187 | ObjCBinPackProtocolList: Auto 188 | ObjCBlockIndentWidth: 4 189 | ObjCBreakBeforeNestedBlockParam: true 190 | ObjCSpaceAfterProperty: true 191 | ObjCSpaceBeforeProtocolList: true 192 | PackConstructorInitializers: BinPack 193 | PenaltyBreakAssignment: 2 194 | PenaltyBreakBeforeFirstCallParameter: 19 195 | PenaltyBreakComment: 300 196 | PenaltyBreakFirstLessLess: 120 197 | PenaltyBreakOpenParenthesis: 0 198 | PenaltyBreakScopeResolution: 500 199 | PenaltyBreakString: 1000 200 | PenaltyBreakTemplateDeclaration: 10 201 | PenaltyExcessCharacter: 1000000 202 | PenaltyIndentedWhitespace: 0 203 | PenaltyReturnTypeOnItsOwnLine: 60 204 | PointerAlignment: Left 205 | PPIndentWidth: -1 206 | QualifierAlignment: Leave 207 | ReferenceAlignment: Pointer 208 | ReflowComments: true 209 | RemoveBracesLLVM: false 210 | RemoveParentheses: Leave 211 | RemoveSemicolon: false 212 | RequiresClausePosition: OwnLine 213 | RequiresExpressionIndentation: OuterScope 214 | SeparateDefinitionBlocks: Leave 215 | ShortNamespaceLines: 1 216 | SkipMacroDefinitionBody: false 217 | SortIncludes: CaseSensitive 218 | SortJavaStaticImport: Before 219 | SortUsingDeclarations: LexicographicNumeric 220 | SpaceAfterCStyleCast: false 221 | SpaceAfterLogicalNot: false 222 | SpaceAfterTemplateKeyword: true 223 | SpaceAroundPointerQualifiers: Default 224 | SpaceBeforeAssignmentOperators: true 225 | SpaceBeforeCaseColon: false 226 | SpaceBeforeCpp11BracedList: true 227 | SpaceBeforeCtorInitializerColon: true 228 | SpaceBeforeInheritanceColon: true 229 | SpaceBeforeJsonColon: false 230 | SpaceBeforeParens: ControlStatements 231 | SpaceBeforeParensOptions: 232 | AfterControlStatements: true 233 | AfterForeachMacros: true 234 | AfterFunctionDefinitionName: false 235 | AfterFunctionDeclarationName: false 236 | AfterIfMacros: true 237 | AfterOverloadedOperator: false 238 | AfterPlacementOperator: true 239 | AfterRequiresInClause: false 240 | AfterRequiresInExpression: false 241 | BeforeNonEmptyParentheses: false 242 | SpaceBeforeRangeBasedForLoopColon: true 243 | SpaceBeforeSquareBrackets: false 244 | SpaceInEmptyBlock: true 245 | 
SpacesBeforeTrailingComments: 1 246 | SpacesInAngles: Never 247 | SpacesInContainerLiterals: true 248 | SpacesInLineCommentPrefix: 249 | Minimum: 1 250 | Maximum: -1 251 | SpacesInParens: Never 252 | SpacesInParensOptions: 253 | ExceptDoubleParentheses: false 254 | InCStyleCasts: false 255 | InConditionalStatements: false 256 | InEmptyParentheses: false 257 | Other: false 258 | SpacesInSquareBrackets: false 259 | Standard: Latest 260 | StatementAttributeLikeMacros: 261 | - Q_EMIT 262 | StatementMacros: 263 | - Q_UNUSED 264 | - QT_REQUIRE_VERSION 265 | TableGenBreakInsideDAGArg: DontBreak 266 | TabWidth: 4 267 | UseTab: Never 268 | VerilogBreakBetweenInstancePorts: true 269 | WhitespaceSensitiveMacros: 270 | - BOOST_PP_STRINGIZE 271 | - CF_SWIFT_NAME 272 | - NS_SWIFT_NAME 273 | - PP_STRINGIZE 274 | - STRINGIZE 275 | ... 276 | 277 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | xorbits 3 | 4 | # xllamacpp - a Python wrapper of llama.cpp 5 | 6 | [![PyPI Latest Release](https://img.shields.io/pypi/v/xllamacpp.svg?style=for-the-badge)](https://pypi.org/project/xllamacpp/) 7 | [![License](https://img.shields.io/pypi/l/xllamacpp.svg?style=for-the-badge)](https://github.com/xorbitsai/inference/blob/main/LICENSE) 8 | [![Discord](https://img.shields.io/badge/join_Discord-5462eb.svg?logo=discord&style=for-the-badge&logoColor=%23f5f5f5)](https://discord.gg/Xw9tszSkr5) 9 | [![Twitter](https://img.shields.io/twitter/follow/xorbitsio?logo=x&style=for-the-badge)](https://twitter.com/xorbitsio) 10 | 11 |
12 |
13 | 14 | This project forks from [cyllama](https://github.com/shakfu/cyllama) and provides a Python wrapper for @ggerganov's [llama.cpp](https://github.com/ggerganov/llama.cpp), which is likely the most active open-source compiled LLM inference engine. 15 | 16 | ## Comparison with llama-cpp-python 17 | 18 | The following table provides an overview of the current implementations / features: 19 | 20 | | implementations / features | xllamacpp | llama-cpp-python | 21 | |:---------------------------|:-------------------:|:--------------------------------:| 22 | | Wrapper-type | cython | ctypes | 23 | | API | Server & Params API | Llama API | 24 | | Server implementation | C++ | Python through wrapped Llama API | 25 | | Continuous batching | yes | no | 26 | | Thread safe | yes | no | 27 | | Release package | prebuilt | build during installation | 28 | 29 | It goes without saying that any help / collaboration / contributions to accelerate the above would be welcome! 30 | 31 | ## Wrapping Guidelines 32 | 33 | As the intent is to provide a very thin wrapping layer and play to the strengths of the original C++ library as well as Python, the approach to wrapping intentionally adopts the following guidelines: 34 | 35 | - In general, key structs are implemented as cython extension classes with related functions implemented as methods of said classes. 36 | 37 | - Be as consistent as possible with llama.cpp's naming of its API elements, except when it makes sense to shorten function names which are used as methods. 38 | 39 | - Minimize non-wrapper Python code. 40 | 41 | ## Usage 42 | 43 | Here is a simple example of how to use `xllamacpp` to get embeddings for a list of texts. For this example, you'll need an embedding model like [Qwen3-Embedding-0.6B-Q8_0.gguf](https://huggingface.co/Qwen/Qwen3-Embedding-0.6B-GGUF/resolve/main/Qwen3-Embedding-0.6B-Q8_0.gguf). 44 | 45 | ```python 46 | import xllamacpp as xlc 47 | 48 | params = xlc.CommonParams() 49 | 50 | params.model.path = "Qwen3-Embedding-0.6B-Q8_0.gguf" 51 | params.embedding = True 52 | params.pooling_type = xlc.llama_pooling_type.LLAMA_POOLING_TYPE_LAST 53 | 54 | server = xlc.Server(params) 55 | 56 | embedding_input = { 57 | "input": [ 58 | "I believe the meaning of life is", 59 | "This is a test", 60 | ], 61 | "model": "My Qwen3 Model", 62 | } 63 | 64 | result = server.handle_embeddings(embedding_input) 65 | 66 | print(result) 67 | 68 | ``` 69 | 70 | Output: 71 | 72 | ```python 73 | {'data': [{'embedding': [-0.006413215305656195, 74 | -0.05906733125448227, 75 | ... 76 | -0.05887744203209877], 77 | 'index': 0, 78 | 'object': 'embedding'}, 79 | {'embedding': [0.041170503944158554, 80 | -0.004472420550882816, 81 | ... 82 | 0.008314250037074089], 83 | 'index': 1, 84 | 'object': 'embedding'}], 85 | 'model': 'My Qwen3 Model', 86 | 'object': 'list', 87 | 'usage': {'prompt_tokens': 11, 'total_tokens': 11}} 88 | ``` 89 | 90 | ## OpenAI API Compatible HTTP Server 91 | 92 | The server provides OpenAI API compatible endpoints. For a complete list of available API endpoints, see the [llama.cpp server documentation](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md#api-endpoints).
You can use the OpenAI Python client: 93 | 94 | ```python 95 | import xllamacpp as xlc 96 | from openai import OpenAI 97 | 98 | # Start server 99 | params = xlc.CommonParams() 100 | params.model.path = "Llama-3.2-1B-Instruct-Q8_0.gguf" 101 | server = xlc.Server(params) 102 | 103 | # Connect using OpenAI client 104 | client = OpenAI( 105 | base_url=server.listening_address + "/v1", 106 | api_key="not-required" # No API key needed for local server 107 | ) 108 | 109 | # Make chat completion request 110 | response = client.chat.completions.create( 111 | model="local-model", 112 | messages=[{"role": "user", "content": "What is the capital of France?"}], 113 | max_tokens=10 114 | ) 115 | 116 | print(response.choices[0].message.content) 117 | ``` 118 | 119 | ## Prerequisites for Prebuilt Wheels 120 | 121 | Before pip installing `xllamacpp`, please ensure your system meets the following requirements based on your build type: 122 | 123 | - **CPU (aarch64)**: 124 | - Requires ARMv8-A or later architecture 125 | - For best performance, build from source if your CPU supports advanced instruction sets 126 | 127 | - **CUDA (Linux)**: 128 | - Requires glibc 2.35 or later 129 | - Compatible NVIDIA GPU with appropriate drivers (CUDA 12.4 or 12.8) 130 | 131 | - **ROCm (Linux)**: 132 | - Requires glibc 2.35 or later 133 | - Requires gcc 10 or later (ROCm libraries have this dependency) 134 | - Compatible AMD GPU with ROCm support (ROCm 6.3.4 or 6.4.1) 135 | 136 | - **Vulkan (Linux/Windows, Intel/AMD/NVIDIA where supported)**: 137 | - Install the Vulkan SDK and GPU drivers with Vulkan support 138 | - Linux users may need distro packages and the LunarG SDK 139 | - macOS Intel is supported via Vulkan; Apple Silicon Vulkan is not supported in this project 140 | 141 | ## Install 142 | 143 | **Note on Performance and Compatibility** 144 | 145 | For maximum performance, you can build `xllamacpp` from source to optimize for your specific native CPU architecture. The pre-built wheels are designed for broad compatibility. 146 | 147 | Specifically, the `aarch64` wheels are built for the `armv8-a` architecture. This ensures they run on a wide range of ARM64 devices, but it means that more advanced CPU instruction sets (like SVE) are not enabled. If your CPU supports these advanced features, building from source will provide better performance. 
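Once you have installed a wheel (see the install commands below), you can verify which backends and devices the build actually detects. This is a minimal sketch based on the `get_system_info` and `get_device_info` helpers exercised in this repo's test suite; apart from the `name` field, the exact keys in each device entry may vary by backend and version.

```python
import xllamacpp as xlc

# Compiled-in capabilities (CPU features, BLAS, CUDA/HIP/Vulkan/Metal, ...)
print(xlc.get_system_info())

# Devices visible to this build; each entry is a dict with at least a "name"
for device in xlc.get_device_info():
    print(device["name"], device)
```

If a GPU wheel was installed correctly, the corresponding device should appear in this list; otherwise you are running the CPU-only build.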
148 | 149 | - From PyPI for `CPU` or `Mac`: 150 | 151 | ```sh 152 | pip install -U xllamacpp 153 | ``` 154 | 155 | - From the GitHub-hosted PyPI index for `CUDA` (use `--force-reinstall` to replace the installed CPU version): 156 | 157 | - CUDA 12.4 158 | ```sh 159 | pip install xllamacpp --force-reinstall --index-url https://xorbitsai.github.io/xllamacpp/whl/cu124 160 | ``` 161 | 162 | - CUDA 12.8 163 | ```sh 164 | pip install xllamacpp --force-reinstall --index-url https://xorbitsai.github.io/xllamacpp/whl/cu128 165 | ``` 166 | 167 | - From the GitHub-hosted PyPI index for `HIP` on AMD GPUs (use `--force-reinstall` to replace the installed CPU version): 168 | 169 | - ROCm 6.3.4 170 | ```sh 171 | pip install xllamacpp --force-reinstall --index-url https://xorbitsai.github.io/xllamacpp/whl/rocm-6.3.4 172 | ``` 173 | 174 | - ROCm 6.4.1 175 | ```sh 176 | pip install xllamacpp --force-reinstall --index-url https://xorbitsai.github.io/xllamacpp/whl/rocm-6.4.1 177 | ``` 178 | 179 | - From the GitHub-hosted PyPI index for `Vulkan` (use `--force-reinstall` to replace the installed CPU version): 180 | 181 | ```sh 182 | pip install xllamacpp --force-reinstall --index-url https://xorbitsai.github.io/xllamacpp/whl/vulkan 183 | ``` 184 | 185 | ## Build from Source 186 | 187 | ### (Optional) Preparation 188 | 189 | - CUDA 190 | 191 | This provides GPU acceleration using an NVIDIA GPU. Make sure to have the [CUDA toolkit](https://developer.nvidia.com/cuda-toolkit) installed. 192 | 193 | #### Download directly from NVIDIA 194 | You may find the official downloads here: [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads). 195 | 196 | 197 | #### Compile and run inside a Fedora Toolbox Container 198 | We also have a [guide](./backend/CUDA-FEDORA.md) for setting up the CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/). 199 | 200 | **Recommended for:** 201 | - ***Necessary*** for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/), such as [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/). 202 | - (there are no supported CUDA packages for these systems) 203 | - ***Necessary*** for users whose host is not a [Supported Nvidia CUDA Release Platform](https://developer.nvidia.com/cuda-downloads). 204 | - (for example, you may have [Fedora 42 Beta](https://fedoramagazine.org/announcing-fedora-linux-42-beta/) as your host operating system) 205 | - ***Convenient*** for those running [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde) who want to keep their host system clean. 206 | - *Optionally*, toolbox packages are available for: [Arch Linux](https://archlinux.org/), [Red Hat Enterprise Linux >= 8.5](https://www.redhat.com/en/technologies/linux-platforms/enterprise-linux), or [Ubuntu](https://ubuntu.com/download) 207 | 208 | - HIP 209 | 210 | This provides GPU acceleration on HIP-supported AMD GPUs. 211 | Make sure to have ROCm installed. 212 | You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick). 213 | 214 | 215 | Or you can try to build inside the [ROCm docker container](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html). 216 | 217 | - Vulkan 218 | 219 | Install the Vulkan SDK and drivers for your platform.
220 | - Linux: use your distro packages and/or the [LunarG Vulkan SDK](https://vulkan.lunarg.com/sdk/home). 221 | - Windows: install [LunarG Vulkan SDK](https://vulkan.lunarg.com/sdk/home) and vendor GPU drivers. 222 | - macOS: Intel only; Apple Silicon is not supported for Vulkan in this project. 223 | 224 | ### Build `xllamacpp` 225 | 226 | 1. Make sure you have a recent version of `python3` (tested on Python 3.12). 227 | 228 | 2. Install the Rust toolchain (required for building): 229 | 230 | ```sh 231 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh 232 | ``` 233 | 234 | For more installation options, see the [rustup installation guide](https://rustup.rs/). 235 | 236 | 3. Clone the latest version of `xllamacpp` and initialize its submodules: 237 | 238 | ```sh 239 | git clone git@github.com:xorbitsai/xllamacpp.git 240 | cd xllamacpp 241 | git submodule init 242 | git submodule update 243 | ``` 244 | 245 | 4. Install the dependencies `cython` and `setuptools`, plus `pytest` for testing: 246 | 247 | ```sh 248 | pip install -r requirements.txt 249 | ``` 250 | 251 | 5. Select a backend via environment variables and build. Examples: 252 | 253 | - CPU (default): 254 | ```sh 255 | make 256 | ``` 257 | 258 | - CUDA: 259 | ```sh 260 | export XLLAMACPP_BUILD_CUDA=1 261 | make 262 | ``` 263 | 264 | - HIP (AMD): 265 | ```sh 266 | export XLLAMACPP_BUILD_HIP=1 267 | make 268 | ``` 269 | 270 | - Vulkan: 271 | ```sh 272 | export XLLAMACPP_BUILD_VULKAN=1 273 | make 274 | ``` 275 | 276 | - Enable BLAS (optional): 277 | ```sh 278 | export CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" 279 | make 280 | ``` 281 | 282 | ## Testing 283 | 284 | The `tests` directory in this repo provides extensive examples of using `xllamacpp`. 285 | 286 | However, as a first step, you should download a smallish LLM in the `.gguf` format from [huggingface](https://huggingface.co/models?search=gguf). A good model to start with, and the one assumed by the tests, is [Llama-3.2-1B-Instruct-Q8_0.gguf](https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q8_0.gguf). `xllamacpp` expects models to be stored in a `models` folder in the cloned `xllamacpp` directory. To create the `models` directory if it doesn't exist and download this model, you can just type: 287 | 288 | ```sh 289 | make download 290 | ``` 291 | 292 | This basically just does: 293 | 294 | ```sh 295 | cd xllamacpp 296 | mkdir models && cd models 297 | wget https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q8_0.gguf 298 | ``` 299 | 300 | Now you can test it using `llama-cli` or `llama-simple`: 301 | 302 | ```sh 303 | bin/llama-cli -c 512 -n 32 -m models/Llama-3.2-1B-Instruct-Q8_0.gguf \ 304 | -p "Is mathematics discovered or invented?"
305 | ``` 306 | 307 | You can also run the test suite with `pytest` by typing `pytest` or: 308 | 309 | ```sh 310 | make test 311 | ``` 312 | -------------------------------------------------------------------------------- /tests/test_params.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pytest import approx 3 | 4 | import xllamacpp as xlc 5 | 6 | 7 | def test_common_params_sampling(): 8 | with pytest.raises(Exception, match="construct"): 9 | xlc.CommonParamsSampling() 10 | params = xlc.CommonParams() 11 | assert params.sampling.timing_per_token is False 12 | assert params.sampling.user_sampling_config == 0 13 | # assert params.seed == xlc.LLAMA_DEFAULT_SEED 14 | # assert params.n_prev == 64 15 | # assert params.n_probs == 0 16 | # assert params.min_keep == 0 17 | # assert params.top_k == 40 18 | # assert params.top_p == approx(0.95) 19 | # assert params.min_p == approx(0.05) 20 | # assert params.xtc_probability == 0.00 21 | # assert params.xtc_threshold == approx(0.10) 22 | # assert params.typ_p == approx(1.00) 23 | # assert params.temp == approx(0.80) 24 | # assert params.dynatemp_range == 0.00 25 | # assert params.dynatemp_exponent == approx(1.00) 26 | # assert params.penalty_last_n == 64 27 | # assert params.penalty_repeat == approx(1.00) 28 | # assert params.penalty_freq == 0.00 29 | # assert params.penalty_present == 0.00 30 | # assert params.dry_multiplier == 0.0 31 | # assert params.dry_base == approx(1.75) 32 | # assert params.dry_allowed_length == 2 33 | # assert params.dry_penalty_last_n == -1 34 | # assert params.mirostat == 0 35 | # assert params.mirostat_tau == approx(5.00) 36 | # assert params.mirostat_eta == approx(0.10) 37 | # assert params.ignore_eos is False 38 | # assert params.no_perf is False 39 | 40 | 41 | def test_enum_values(): 42 | assert xlc.GGML_MAX_N_THREADS == 512 43 | assert xlc.GGML_ROPE_TYPE_VISION == 24 44 | assert xlc.ggml_sched_priority.GGML_SCHED_PRIO_REALTIME == 3 45 | assert xlc.ggml_numa_strategy.GGML_NUMA_STRATEGY_COUNT == 5 46 | assert xlc.ggml_type.GGML_TYPE_COUNT == 40 47 | assert xlc.ggml_backend_dev_type.GGML_BACKEND_DEVICE_TYPE_ACCEL == 3 48 | assert xlc.llama_rope_scaling_type.LLAMA_ROPE_SCALING_TYPE_MAX_VALUE == 3 49 | assert xlc.llama_pooling_type.LLAMA_POOLING_TYPE_RANK == 4 50 | assert xlc.llama_attention_type.LLAMA_ATTENTION_TYPE_NON_CAUSAL == 1 51 | assert xlc.llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_ENABLED == 1 52 | assert xlc.llama_split_mode.LLAMA_SPLIT_MODE_ROW == 2 53 | assert xlc.llama_model_kv_override_type.LLAMA_KV_OVERRIDE_TYPE_STR == 3 54 | assert xlc.dimre_method.DIMRE_METHOD_MEAN == 1 55 | assert xlc.common_conversation_mode.COMMON_CONVERSATION_MODE_AUTO == 2 56 | assert xlc.common_grammar_trigger_type.COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL == 3 57 | assert xlc.common_reasoning_format.COMMON_REASONING_FORMAT_DEEPSEEK == 3 58 | assert xlc.common_params_sampling_config.COMMON_PARAMS_SAMPLING_CONFIG_TEMP == 64 59 | 60 | 61 | def test_common_params(): 62 | params = xlc.CommonParams() 63 | assert params.n_predict == -1 64 | assert params.n_ctx == 4096 65 | assert params.n_batch == 2048 66 | assert params.n_ubatch == 512 67 | assert params.n_keep == 0 68 | assert params.n_chunks == -1 69 | assert params.n_parallel == 1 70 | assert params.n_sequences == 1 71 | # assert params.p_split == approx(0.1) 72 | assert params.n_gpu_layers == -1 73 | # assert params.n_gpu_layers_draft == -1 74 | assert params.main_gpu == 0 75 | assert params.tensor_split == [0] * 128 76 | 
assert params.grp_attn_n == 1 77 | assert params.grp_attn_w == 512 78 | assert params.n_print == -1 79 | assert params.rope_freq_base == 0.0 80 | assert params.rope_freq_scale == 0.0 81 | assert params.yarn_ext_factor == approx(-1.0) 82 | assert params.yarn_attn_factor == approx(-1.0) 83 | assert params.yarn_beta_fast == approx(-1.0) 84 | assert params.yarn_beta_slow == approx(-1.0) 85 | assert params.yarn_orig_ctx == 0 86 | 87 | assert params.cpuparams.n_threads == -1 88 | assert params.cpuparams.cpumask == [False] * xlc.GGML_MAX_N_THREADS 89 | assert params.cpuparams.mask_valid is False 90 | assert params.cpuparams.priority == xlc.ggml_sched_priority.GGML_SCHED_PRIO_NORMAL 91 | assert params.cpuparams.strict_cpu is False 92 | assert params.cpuparams.poll == 50 93 | 94 | # assert params.cpuparams_batch == 95 | # assert params.draft_cpuparams == 96 | # assert params.draft_cpuparams_batch === 97 | 98 | # assert params.cb_eval == nullptr; 99 | # assert params.cb_eval_user_data == nullptr; 100 | 101 | assert params.numa == xlc.ggml_numa_strategy.GGML_NUMA_STRATEGY_DISABLED 102 | assert params.split_mode == xlc.llama_split_mode.LLAMA_SPLIT_MODE_LAYER 103 | assert ( 104 | params.rope_scaling_type 105 | == xlc.llama_rope_scaling_type.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED 106 | ) 107 | assert params.pooling_type == xlc.llama_pooling_type.LLAMA_POOLING_TYPE_UNSPECIFIED 108 | assert ( 109 | params.attention_type 110 | == xlc.llama_attention_type.LLAMA_ATTENTION_TYPE_UNSPECIFIED 111 | ) 112 | assert ( 113 | params.flash_attn_type == xlc.llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_AUTO 114 | ) 115 | 116 | # common_sampler_params sparams 117 | 118 | assert params.model.path == "" 119 | assert params.model.url == "" 120 | assert params.model.hf_repo == "" 121 | assert params.model.hf_file == "" 122 | assert params.model.docker_repo == "" 123 | assert params.model.name == "" 124 | assert params.model_alias == "" 125 | assert params.hf_token == "" 126 | assert params.prompt == "" 127 | assert params.prompt_file == "" 128 | assert params.path_prompt_cache == "" 129 | assert params.input_prefix == "" 130 | assert params.input_suffix == "" 131 | assert params.lookup_cache_static == "" 132 | assert params.lookup_cache_dynamic == "" 133 | assert params.logits_file == "" 134 | 135 | assert params.verbosity == 3 136 | assert params.control_vector_layer_start == -1 137 | assert params.control_vector_layer_end == -1 138 | assert params.ppl_stride == 0 139 | assert params.ppl_output_type == 0 140 | 141 | assert params.hellaswag is False 142 | assert params.hellaswag_tasks == 400 143 | assert params.winogrande is False 144 | assert params.winogrande_tasks == 0 145 | assert params.multiple_choice is False 146 | assert params.multiple_choice_tasks == 0 147 | assert params.kl_divergence is False 148 | assert params.usage is False 149 | assert params.use_color is False 150 | assert params.special is False 151 | assert params.interactive is False 152 | assert params.prompt_cache_all is False 153 | assert params.prompt_cache_ro is False 154 | assert params.escape is True 155 | assert params.multiline_input is False 156 | assert params.simple_io is False 157 | assert params.cont_batching is True 158 | assert params.no_perf is False 159 | assert params.show_timings is True 160 | assert params.ctx_shift is False 161 | assert params.swa_full is False 162 | assert params.kv_unified is False 163 | assert params.input_prefix_bos is False 164 | assert params.use_mmap is True 165 | assert params.use_mlock is False 166 | assert 
params.verbose_prompt is False 167 | assert params.display_prompt is True 168 | assert params.no_kv_offload is False 169 | assert params.warmup is True 170 | assert params.check_tensors is False 171 | assert params.no_op_offload is False 172 | assert params.no_extra_bufts is False 173 | assert params.no_host is False 174 | 175 | assert params.cache_type_k == xlc.ggml_type.GGML_TYPE_F16 176 | assert params.cache_type_v == xlc.ggml_type.GGML_TYPE_F16 177 | 178 | assert params.mmproj.path == "" 179 | assert params.mmproj_use_gpu is True 180 | assert params.no_mmproj is False 181 | assert params.image == [] 182 | assert params.image_min_tokens == -1 183 | assert params.image_max_tokens == -1 184 | 185 | assert params.embedding is False 186 | assert params.embd_normalize == 2 187 | assert params.embd_out == "" 188 | assert params.embd_sep == "\n" 189 | 190 | assert params.port == 0 191 | assert params.timeout_read == 600 192 | assert params.timeout_write == 600 193 | assert params.n_threads_http == -1 194 | assert params.n_cache_reuse == 0 195 | assert params.n_ctx_checkpoints == 8 196 | assert params.cache_ram_mib == 8192 197 | 198 | assert params.hostname == "127.0.0.1" 199 | assert params.public_path == "" 200 | assert params.api_prefix == "" 201 | assert params.chat_template == "" 202 | assert params.use_jinja is True 203 | params.use_jinja = False 204 | assert params.use_jinja is False 205 | assert params.enable_chat_template is True 206 | assert ( 207 | params.reasoning_format 208 | == xlc.common_reasoning_format.COMMON_REASONING_FORMAT_DEEPSEEK 209 | ) 210 | assert params.prefill_assistant is True 211 | 212 | assert params.api_keys == [] 213 | assert params.ssl_file_key == "" 214 | assert params.ssl_file_cert == "" 215 | 216 | params.default_template_kwargs = {"abc": "def"} 217 | assert params.default_template_kwargs == {"abc": "def"} 218 | 219 | assert params.webui is True 220 | assert params.endpoint_slots is True 221 | assert params.endpoint_props is False 222 | assert params.endpoint_metrics is False 223 | 224 | assert params.log_json is False 225 | 226 | assert params.slot_save_path == "" 227 | assert params.media_path == "" 228 | 229 | assert params.slot_prompt_similarity == approx(0.1) 230 | 231 | assert params.is_pp_shared is False 232 | assert params.is_tg_separate is False 233 | 234 | assert params.n_pp == [] 235 | assert params.n_tg == [] 236 | assert params.n_pl == [] 237 | 238 | assert params.context_files == [] 239 | assert params.chunk_size == 64 240 | assert params.chunk_separator == "\n" 241 | 242 | assert params.n_junk == 250 243 | assert params.i_pos == -1 244 | assert params.out_file == "" 245 | 246 | assert params.n_out_freq == 10 247 | assert params.n_save_freq == 0 248 | assert params.i_chunk == 0 249 | assert params.imat_dat == 0 250 | 251 | assert params.process_output is False 252 | assert params.compute_ppl is True 253 | assert params.parse_special is False 254 | 255 | assert params.n_pca_batch == 100 256 | assert params.n_pca_iterations == 1000 257 | 258 | sp = params.sampling.samplers 259 | assert sp 260 | params.sampling.samplers = sp 261 | assert params.sampling.samplers == sp 262 | params.sampling.samplers = "top_k;top_p;min_p;temperature;dry;typ_p;xtc" 263 | assert params.sampling.samplers == "top_k;top_p;min_p;temperature;dry;typ_p;xtc" 264 | assert params.speculative.cache_type_k == xlc.ggml_type.GGML_TYPE_F16 265 | assert params.speculative.cache_type_v == xlc.ggml_type.GGML_TYPE_F16 266 | assert params.speculative.replacements == [] 267 | 
params.speculative.replacements = [("a", "b")] 268 | assert params.speculative.replacements == [("a", "b")] 269 | 270 | assert params.cls_sep == "\t" 271 | assert params.offline is False 272 | assert params.reasoning_budget == -1 273 | 274 | assert params.diffusion.steps == 128 275 | params.diffusion.steps = 13 276 | assert params.diffusion.steps == 13 277 | assert params.diffusion.visual_mode is False 278 | params.diffusion.visual_mode = True 279 | assert params.diffusion.visual_mode is True 280 | assert params.diffusion.eps < 0.01 281 | params.diffusion.eps = 1.2 282 | assert 1.19 < params.diffusion.eps < 1.21 283 | assert params.diffusion.block_length == 0 284 | params.diffusion.block_length = 13 285 | assert params.diffusion.block_length == 13 286 | assert params.diffusion.algorithm == 4 287 | params.diffusion.algorithm = 1 288 | assert params.diffusion.algorithm == 1 289 | assert params.diffusion.alg_temp == 0.0 290 | params.diffusion.alg_temp = 1.1 291 | assert 1.09 < params.diffusion.alg_temp < 1.11 292 | assert params.diffusion.cfg_scale == 0.0 293 | params.diffusion.cfg_scale = 1.1 294 | assert 1.09 < params.diffusion.cfg_scale < 1.11 295 | assert params.diffusion.add_gumbel_noise is False 296 | params.diffusion.add_gumbel_noise = True 297 | assert params.diffusion.add_gumbel_noise is True 298 | 299 | assert params.tensor_buft_overrides == "" 300 | with pytest.raises(ValueError, match="unknown buffer type"): 301 | params.tensor_buft_overrides = ( 302 | "blk\\.([0-3])\\.ffn_.*=GPU0,blk\\.4\\.ffn_(down|up)_exps\\..*=GPU0" 303 | ) 304 | params.tensor_buft_overrides = ( 305 | "blk\\.([0-3])\\.ffn_.*=CPU,blk\\.4\\.ffn_(down|up)_exps\\..*=CPU" 306 | ) 307 | assert ( 308 | params.tensor_buft_overrides 309 | == "blk\\.([0-3])\\.ffn_.*=CPU,blk\\.4\\.ffn_(down|up)_exps\\..*=CPU" 310 | ) 311 | 312 | # assert params.cvector_dimre_method == cy.DIMRE_METHOD_PCA 313 | # assert params.cvector_outfile == "control_vector.gguf" 314 | # assert params.cvector_positive_file == "examples/cvector-generator/positive.txt" 315 | # assert params.cvector_negative_file == "examples/cvector-generator/negative.txt" 316 | 317 | # assert params.spm_infill is False 318 | 319 | # assert params.lora_outfile == "ggml-lora-merged-f16.gguf" 320 | 321 | # assert params.batched_bench_output_jsonl is False 322 | 323 | # ... 
rest not yet implemented 324 | 325 | 326 | def test_json_schema_to_grammar(): 327 | schema = { 328 | "type": "object", 329 | "properties": { 330 | "answer": {"type": "string"}, 331 | "score": {"type": "number"}, 332 | }, 333 | "required": ["answer"], 334 | } 335 | grammar = xlc.json_schema_to_grammar(schema) 336 | assert isinstance(grammar, str) 337 | assert grammar.strip() 338 | 339 | with pytest.raises(ValueError): 340 | xlc.json_schema_to_grammar("{not json}") 341 | -------------------------------------------------------------------------------- /tests/test_server.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | import os 3 | import sys 4 | import base64 5 | import pytest 6 | import json 7 | import orjson 8 | 9 | import xllamacpp as xlc 10 | 11 | 12 | def test_get_system_info(): 13 | assert "CPU :" in xlc.get_system_info() 14 | 15 | 16 | def test_get_device_info(): 17 | xlc.get_device_info() 18 | info = xlc.get_device_info() 19 | assert len(info) > 0 20 | assert "CPU" in [i["name"] for i in info] 21 | print(info) 22 | 23 | 24 | def test_llama_server(model_path): 25 | params = xlc.CommonParams() 26 | 27 | params.model.path = os.path.join(model_path, "Llama-3.2-1B-Instruct-Q8_0.gguf") 28 | params.prompt = "When did the universe begin?" 29 | params.warmup = False 30 | params.n_predict = 32 31 | params.n_ctx = 256 32 | params.n_parallel = 1 33 | params.cpuparams.n_threads = 2 34 | params.cpuparams_batch.n_threads = 2 35 | params.endpoint_metrics = True 36 | params.cache_ram_mib = 0 37 | 38 | server = xlc.Server(params) 39 | 40 | complete_prompt = { 41 | "max_tokens": 128, 42 | "prompt": "Write the fibonacci function in c++.", 43 | } 44 | 45 | server.handle_completions( 46 | complete_prompt, 47 | lambda v: pprint.pprint(v), 48 | ) 49 | v = server.handle_completions(complete_prompt) 50 | assert isinstance(v, dict) 51 | assert "code" not in v 52 | pprint.pprint(v) 53 | 54 | # If the prompt is a str or bytes, a callback is required. 55 | with pytest.raises(ValueError, match="non dict prompt"): 56 | server.handle_chat_completions(orjson.dumps(complete_prompt)) 57 | 58 | complete_prompt["stream"] = True 59 | 60 | # If the prompt is streaming, a callback is required. 
61 | with pytest.raises(ValueError, match="requires a callback for streaming"): 62 | server.handle_completions(complete_prompt) 63 | 64 | server.handle_completions( 65 | complete_prompt, 66 | lambda v: pprint.pprint(v), 67 | ) 68 | 69 | # Test handle_completions with a str or bytes prompt 70 | ok = False 71 | 72 | def _cb_str(v): 73 | nonlocal ok 74 | assert type(v) is str 75 | json.loads(v) 76 | ok = True 77 | 78 | complete_prompt_str = json.dumps(complete_prompt) 79 | server.handle_completions( 80 | complete_prompt_str, 81 | _cb_str, 82 | ) 83 | assert ok 84 | 85 | ok = False 86 | 87 | def _cb_bytes(v): 88 | nonlocal ok 89 | assert type(v) is bytes 90 | orjson.loads(v) 91 | ok = True 92 | 93 | complete_prompt_bytes = orjson.dumps(complete_prompt) 94 | server.handle_completions( 95 | complete_prompt_bytes, 96 | _cb_bytes, 97 | ) 98 | assert ok 99 | 100 | chat_complete_prompt = { 101 | "max_tokens": 128, 102 | "messages": [ 103 | {"role": "system", "content": "You are a coding assistant."}, 104 | {"role": "user", "content": "Write the fibonacci function in c++."}, 105 | ], 106 | } 107 | 108 | server.handle_chat_completions( 109 | chat_complete_prompt, 110 | lambda v: pprint.pprint(v), 111 | ) 112 | v = server.handle_chat_completions(chat_complete_prompt) 113 | assert isinstance(v, dict) 114 | assert "code" not in v 115 | pprint.pprint(v) 116 | 117 | # If the prompt is a str or bytes, a callback is required. 118 | with pytest.raises(ValueError, match="non dict prompt"): 119 | server.handle_chat_completions(json.dumps(chat_complete_prompt)) 120 | 121 | chat_complete_prompt["stream"] = True 122 | 123 | # If the prompt is streaming, a callback is required. 124 | with pytest.raises(ValueError, match="requires a callback for streaming"): 125 | server.handle_chat_completions(chat_complete_prompt) 126 | 127 | server.handle_chat_completions( 128 | chat_complete_prompt, 129 | lambda v: pprint.pprint(v), 130 | ) 131 | 132 | # Test handle_chat_completions with a str or bytes prompt 133 | ok = False 134 | 135 | def _cb_str(v): 136 | nonlocal ok 137 | assert type(v) is str 138 | json.loads(v) 139 | ok = True 140 | 141 | chat_complete_prompt_str = json.dumps(chat_complete_prompt) 142 | server.handle_chat_completions( 143 | chat_complete_prompt_str, 144 | _cb_str, 145 | ) 146 | assert ok 147 | 148 | ok = False 149 | 150 | def _cb_bytes(v): 151 | nonlocal ok 152 | assert type(v) is bytes 153 | orjson.loads(v) 154 | ok = True 155 | 156 | chat_complete_prompt_bytes = orjson.dumps(chat_complete_prompt) 157 | server.handle_chat_completions( 158 | chat_complete_prompt_bytes, 159 | _cb_bytes, 160 | ) 161 | assert ok 162 | 163 | # Test handle_metrics() 164 | result = server.handle_metrics() 165 | assert type(result) is str 166 | assert "llamacpp:prompt_seconds_total" in result 167 | 168 | 169 | def test_llama_server_stream_callback_stop(model_path): 170 | params = xlc.CommonParams() 171 | 172 | params.model.path = os.path.join(model_path, "Llama-3.2-1B-Instruct-Q8_0.gguf") 173 | params.prompt = "When did the universe begin?" 
174 | params.warmup = False 175 | params.n_predict = 64 176 | params.n_ctx = 256 177 | params.n_parallel = 1 178 | params.cpuparams.n_threads = 2 179 | params.cpuparams_batch.n_threads = 2 180 | 181 | server = xlc.Server(params) 182 | 183 | # Test handle_completions streaming stop via callback return value 184 | complete_prompt = { 185 | "max_tokens": 128, 186 | "prompt": "Write a long story about the history of the universe.", 187 | "stream": True, 188 | } 189 | 190 | all_chunks = 0 191 | 192 | def _cb_all(v): 193 | nonlocal all_chunks 194 | all_chunks += 1 195 | 196 | stop_chunks = 0 197 | 198 | def _cb_stop(v): 199 | nonlocal stop_chunks 200 | stop_chunks += 1 201 | return True 202 | 203 | server.handle_completions(complete_prompt, _cb_all) 204 | assert all_chunks >= 1 205 | 206 | server.handle_completions(complete_prompt, _cb_stop) 207 | assert stop_chunks == 1 208 | assert all_chunks > stop_chunks 209 | 210 | # Test handle_chat_completions streaming stop via callback return value 211 | chat_complete_prompt = { 212 | "max_tokens": 128, 213 | "messages": [ 214 | {"role": "system", "content": "You are a coding assistant."}, 215 | { 216 | "role": "user", 217 | "content": "Tell me in detail about the history of programming languages.", 218 | }, 219 | ], 220 | "stream": True, 221 | } 222 | 223 | chat_all_chunks = 0 224 | 225 | def _chat_cb_all(v): 226 | nonlocal chat_all_chunks 227 | chat_all_chunks += 1 228 | 229 | chat_stop_chunks = 0 230 | 231 | def _chat_cb_stop(v): 232 | nonlocal chat_stop_chunks 233 | chat_stop_chunks += 1 234 | return True 235 | 236 | server.handle_chat_completions(chat_complete_prompt, _chat_cb_all) 237 | assert chat_all_chunks >= 1 238 | 239 | server.handle_chat_completions(chat_complete_prompt, _chat_cb_stop) 240 | assert chat_stop_chunks == 1 241 | assert chat_all_chunks > chat_stop_chunks 242 | 243 | 244 | def test_llama_server_chat_with_grammar(model_path): 245 | schema = { 246 | "type": "object", 247 | "properties": { 248 | "answer": {"type": "string"}, 249 | "score": {"type": "number"}, 250 | }, 251 | "required": ["answer"], 252 | } 253 | grammar = xlc.json_schema_to_grammar(schema) 254 | 255 | params = xlc.CommonParams() 256 | 257 | params.model.path = os.path.join(model_path, "Llama-3.2-1B-Instruct-Q8_0.gguf") 258 | params.warmup = False 259 | params.n_predict = 64 260 | params.n_ctx = 256 261 | params.cpuparams.n_threads = 2 262 | params.cpuparams_batch.n_threads = 2 263 | params.sampling.temp = 0 264 | params.sampling.top_k = 1 265 | params.sampling.grammar = grammar 266 | 267 | server = xlc.Server(params) 268 | 269 | chat_complete_prompt = { 270 | "max_tokens": 64, 271 | "messages": [ 272 | { 273 | "role": "system", 274 | "content": "Respond with a JSON object matching the provided schema.", 275 | }, 276 | { 277 | "role": "user", 278 | "content": "Provide an answer string and an optional numeric score.", 279 | }, 280 | ], 281 | } 282 | 283 | result = server.handle_chat_completions(chat_complete_prompt) 284 | 285 | assert isinstance(result, dict) 286 | content = result["choices"][0]["message"]["content"] 287 | parsed = json.loads(content) 288 | 289 | assert parsed["answer"] 290 | assert isinstance(parsed["answer"], str) 291 | if "score" in parsed: 292 | assert isinstance(parsed["score"], (int, float)) 293 | 294 | 295 | def test_llama_server_multimodal(model_path): 296 | with open(os.path.join(os.path.dirname(__file__), "data/11_truck.png"), "rb") as f: 297 | content = f.read() 298 | IMG_BASE64_0 = "data:image/png;base64," + 
base64.b64encode(content).decode("utf-8") 299 | 300 | params = xlc.CommonParams() 301 | 302 | params.model.path = os.path.join(model_path, "tinygemma3-Q8_0.gguf") 303 | params.mmproj.path = os.path.join(model_path, "mmproj-tinygemma3.gguf") 304 | params.sampling.seed = 42 305 | params.sampling.top_k = 1 306 | params.sampling.temp = 0 307 | params.n_predict = 4 308 | params.n_ctx = 1024 309 | params.cpuparams.n_threads = 4 310 | params.cpuparams_batch.n_threads = 2 311 | 312 | server = xlc.Server(params) 313 | 314 | chat_complete_prompt = { 315 | "max_tokens": 128, 316 | "messages": [ 317 | { 318 | "role": "user", 319 | "content": [ 320 | {"type": "text", "text": "What is this:\n"}, 321 | { 322 | "type": "image_url", 323 | "image_url": { 324 | "url": IMG_BASE64_0, 325 | }, 326 | }, 327 | ], 328 | }, 329 | ], 330 | } 331 | 332 | server.handle_chat_completions( 333 | chat_complete_prompt, 334 | lambda v: pprint.pprint(v), 335 | ) 336 | 337 | 338 | def test_llama_server_embedding(model_path): 339 | params = xlc.CommonParams() 340 | 341 | params.model.path = os.path.join(model_path, "Qwen3-Embedding-0.6B-Q8_0.gguf") 342 | params.embedding = True 343 | params.n_predict = -1 344 | params.n_ctx = 512 345 | params.n_batch = 128 346 | params.n_ubatch = 128 347 | params.sampling.seed = 42 348 | params.cpuparams.n_threads = 2 349 | params.cpuparams_batch.n_threads = 2 350 | params.pooling_type = xlc.llama_pooling_type.LLAMA_POOLING_TYPE_LAST 351 | 352 | server = xlc.Server(params) 353 | 354 | embedding_input = { 355 | "input": [ 356 | "I believe the meaning of life is", 357 | "Write a joke about AI from a very long prompt which will not be truncated", 358 | "This is a test", 359 | "This is another test", 360 | ], 361 | } 362 | 363 | result = server.handle_embeddings(embedding_input) 364 | 365 | assert type(result) is dict 366 | assert len(result["data"]) == 4 367 | for d in result["data"]: 368 | assert len(d["embedding"]) == 1024 369 | 370 | embedding_input_str = json.dumps(embedding_input) 371 | assert type(embedding_input_str) is str 372 | result_str = server.handle_embeddings(embedding_input_str) 373 | assert type(result_str) is str 374 | result = json.loads(result_str) 375 | 376 | assert type(result) is dict 377 | assert len(result["data"]) == 4 378 | for d in result["data"]: 379 | assert len(d["embedding"]) == 1024 380 | 381 | embedding_input_bytes = orjson.dumps(embedding_input) 382 | assert type(embedding_input_bytes) is bytes 383 | result_bytes = server.handle_embeddings(embedding_input_bytes) 384 | assert type(result_bytes) is bytes 385 | result = orjson.loads(result_bytes) 386 | 387 | assert type(result) is dict 388 | assert len(result["data"]) == 4 389 | for d in result["data"]: 390 | assert len(d["embedding"]) == 1024 391 | 392 | 393 | @pytest.mark.skipif(sys.platform == "darwin", reason="Rerank test crashes on macOS CI") 394 | def test_llama_server_rerank(model_path): 395 | params = xlc.CommonParams() 396 | 397 | params.model.path = os.path.join(model_path, "bge-reranker-v2-m3-Q2_K.gguf") 398 | params.embedding = True 399 | params.n_predict = -1 400 | params.n_ctx = 512 401 | params.n_batch = 128 402 | params.n_ubatch = 128 403 | params.sampling.seed = 42 404 | params.cpuparams.n_threads = 2 405 | params.cpuparams_batch.n_threads = 2 406 | params.pooling_type = xlc.llama_pooling_type.LLAMA_POOLING_TYPE_RANK 407 | 408 | server = xlc.Server(params) 409 | 410 | rerank_input = { 411 | "query": "What is the capital of France?", 412 | "documents": [ 413 | "Paris is the capital of France.", 414 | 
"The Eiffel Tower is in Paris.", 415 | "Germany is located in Europe.", 416 | ], 417 | } 418 | 419 | result = server.handle_rerank(rerank_input) 420 | 421 | assert type(result) is dict 422 | assert len(result["results"]) == 3 423 | 424 | rerank_input_str = json.dumps(rerank_input) 425 | result_str = server.handle_rerank(rerank_input_str) 426 | assert type(result_str) is str 427 | result = json.loads(result_str) 428 | 429 | assert type(result) is dict 430 | assert len(result["results"]) == 3 431 | 432 | rerank_input_bytes = orjson.dumps(rerank_input) 433 | result_bytes = server.handle_rerank(rerank_input_bytes) 434 | assert type(result_bytes) is bytes 435 | result = orjson.loads(result_bytes) 436 | 437 | assert type(result) is dict 438 | assert len(result["results"]) == 3 439 | -------------------------------------------------------------------------------- /src/llama.cpp/src/server.cpp: -------------------------------------------------------------------------------- 1 | #include "server-context.h" 2 | #include "server-http.h" 3 | #include "server-models.h" 4 | 5 | #include "arg.h" 6 | #include "common.h" 7 | #include "llama.h" 8 | #include "log.h" 9 | 10 | #include 11 | #include 12 | #include // for std::thread::hardware_concurrency 13 | 14 | #if defined(_WIN32) 15 | #include 16 | #endif 17 | 18 | static std::function shutdown_handler; 19 | static std::atomic_flag is_terminating = ATOMIC_FLAG_INIT; 20 | 21 | static inline void signal_handler(int signal) { 22 | if (is_terminating.test_and_set()) { 23 | // in case it hangs, we can force terminate the server by hitting Ctrl+C twice 24 | // this is for better developer experience, we can remove when the server is stable enough 25 | fprintf(stderr, "Received second interrupt, terminating immediately.\n"); 26 | exit(1); 27 | } 28 | 29 | shutdown_handler(signal); 30 | } 31 | 32 | // wrapper function that handles exceptions and logs errors 33 | // this is to make sure handler_t never throws exceptions; instead, it returns an error response 34 | static server_http_context::handler_t ex_wrapper(server_http_context::handler_t func) { 35 | return [func = std::move(func)](const server_http_req & req) -> server_http_res_ptr { 36 | std::string message; 37 | error_type error; 38 | try { 39 | return func(req); 40 | } catch (const std::invalid_argument & e) { 41 | // treat invalid_argument as invalid request (400) 42 | error = ERROR_TYPE_INVALID_REQUEST; 43 | message = e.what(); 44 | } catch (const std::exception & e) { 45 | // treat other exceptions as server error (500) 46 | error = ERROR_TYPE_SERVER; 47 | message = e.what(); 48 | } catch (...) { 49 | error = ERROR_TYPE_SERVER; 50 | message = "unknown error"; 51 | } 52 | 53 | auto res = std::make_unique(); 54 | res->status = 500; 55 | try { 56 | json error_data = format_error_response(message, error); 57 | res->status = json_value(error_data, "code", 500); 58 | res->data = safe_json_to_str({{ "error", error_data }}); 59 | SRV_WRN("got exception: %s\n", res->data.c_str()); 60 | } catch (const std::exception & e) { 61 | SRV_ERR("got another exception: %s | while handling exception: %s\n", e.what(), message.c_str()); 62 | res->data = "Internal Server Error"; 63 | } 64 | return res; 65 | }; 66 | } 67 | 68 | int main(int argc, char ** argv, char ** envp) { 69 | // own arguments required by this example 70 | common_params params; 71 | 72 | if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { 73 | return 1; 74 | } 75 | 76 | // TODO: should we have a separate n_parallel parameter for the server? 
77 | // https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177 78 | // TODO: this is a common configuration that is suitable for most local use cases 79 | // however, overriding the parameters is a bit confusing - figure out something more intuitive 80 | if (params.n_parallel == 1 && params.kv_unified == false && !params.has_speculative()) { 81 | LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true (add -kvu to disable this)\n", __func__); 82 | 83 | params.n_parallel = 4; 84 | params.kv_unified = true; 85 | } 86 | 87 | // for consistency between server router mode and single-model mode, we set the same model name as alias 88 | if (params.model_alias.empty() && !params.model.name.empty()) { 89 | params.model_alias = params.model.name; 90 | } 91 | 92 | common_init(); 93 | 94 | // struct that contains llama context and inference 95 | server_context ctx_server; 96 | 97 | llama_backend_init(); 98 | llama_numa_init(params.numa); 99 | 100 | LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency()); 101 | LOG_INF("\n"); 102 | LOG_INF("%s\n", common_params_get_system_info(params).c_str()); 103 | LOG_INF("\n"); 104 | 105 | server_http_context ctx_http; 106 | if (!ctx_http.init(params)) { 107 | LOG_ERR("%s: failed to initialize HTTP server\n", __func__); 108 | return 1; 109 | } 110 | 111 | // 112 | // Router 113 | // 114 | 115 | // register API routes 116 | server_routes routes(params, ctx_server, [&ctx_http]() { return ctx_http.is_ready.load(); }); 117 | 118 | bool is_router_server = params.model.path.empty(); 119 | std::optional models_routes{}; 120 | if (is_router_server) { 121 | // setup server instances manager 122 | models_routes.emplace(params, argc, argv, envp); 123 | 124 | // proxy handlers 125 | // note: routes.get_health stays the same 126 | routes.get_metrics = models_routes->proxy_get; 127 | routes.post_props = models_routes->proxy_post; 128 | routes.get_api_show = models_routes->proxy_get; 129 | routes.post_completions = models_routes->proxy_post; 130 | routes.post_completions_oai = models_routes->proxy_post; 131 | routes.post_chat_completions = models_routes->proxy_post; 132 | routes.post_anthropic_messages = models_routes->proxy_post; 133 | routes.post_anthropic_count_tokens = models_routes->proxy_post; 134 | routes.post_infill = models_routes->proxy_post; 135 | routes.post_embeddings = models_routes->proxy_post; 136 | routes.post_embeddings_oai = models_routes->proxy_post; 137 | routes.post_rerank = models_routes->proxy_post; 138 | routes.post_tokenize = models_routes->proxy_post; 139 | routes.post_detokenize = models_routes->proxy_post; 140 | routes.post_apply_template = models_routes->proxy_post; 141 | routes.get_lora_adapters = models_routes->proxy_get; 142 | routes.post_lora_adapters = models_routes->proxy_post; 143 | routes.get_slots = models_routes->proxy_get; 144 | routes.post_slots = models_routes->proxy_post; 145 | 146 | // custom routes for router 147 | routes.get_props = models_routes->get_router_props; 148 | routes.get_models = models_routes->get_router_models; 149 | ctx_http.post("/models/load", ex_wrapper(models_routes->post_router_models_load)); 150 | ctx_http.post("/models/unload", ex_wrapper(models_routes->post_router_models_unload)); 151 | ctx_http.post("/models/status", ex_wrapper(models_routes->post_router_models_status)); 152 | } 153 | 154 | ctx_http.get ("/health", ex_wrapper(routes.get_health)); // public endpoint (no 
API key check) 155 | ctx_http.get ("/v1/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check) 156 | ctx_http.get ("/metrics", ex_wrapper(routes.get_metrics)); 157 | ctx_http.get ("/props", ex_wrapper(routes.get_props)); 158 | ctx_http.post("/props", ex_wrapper(routes.post_props)); 159 | ctx_http.post("/api/show", ex_wrapper(routes.get_api_show)); 160 | ctx_http.get ("/models", ex_wrapper(routes.get_models)); // public endpoint (no API key check) 161 | ctx_http.get ("/v1/models", ex_wrapper(routes.get_models)); // public endpoint (no API key check) 162 | ctx_http.get ("/api/tags", ex_wrapper(routes.get_models)); // ollama specific endpoint. public endpoint (no API key check) 163 | ctx_http.post("/completion", ex_wrapper(routes.post_completions)); // legacy 164 | ctx_http.post("/completions", ex_wrapper(routes.post_completions)); 165 | ctx_http.post("/v1/completions", ex_wrapper(routes.post_completions_oai)); 166 | ctx_http.post("/chat/completions", ex_wrapper(routes.post_chat_completions)); 167 | ctx_http.post("/v1/chat/completions", ex_wrapper(routes.post_chat_completions)); 168 | ctx_http.post("/api/chat", ex_wrapper(routes.post_chat_completions)); // ollama specific endpoint 169 | ctx_http.post("/v1/messages", ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API 170 | ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting 171 | ctx_http.post("/infill", ex_wrapper(routes.post_infill)); 172 | ctx_http.post("/embedding", ex_wrapper(routes.post_embeddings)); // legacy 173 | ctx_http.post("/embeddings", ex_wrapper(routes.post_embeddings)); 174 | ctx_http.post("/v1/embeddings", ex_wrapper(routes.post_embeddings_oai)); 175 | ctx_http.post("/rerank", ex_wrapper(routes.post_rerank)); 176 | ctx_http.post("/reranking", ex_wrapper(routes.post_rerank)); 177 | ctx_http.post("/v1/rerank", ex_wrapper(routes.post_rerank)); 178 | ctx_http.post("/v1/reranking", ex_wrapper(routes.post_rerank)); 179 | ctx_http.post("/tokenize", ex_wrapper(routes.post_tokenize)); 180 | ctx_http.post("/detokenize", ex_wrapper(routes.post_detokenize)); 181 | ctx_http.post("/apply-template", ex_wrapper(routes.post_apply_template)); 182 | // LoRA adapters hotswap 183 | ctx_http.get ("/lora-adapters", ex_wrapper(routes.get_lora_adapters)); 184 | ctx_http.post("/lora-adapters", ex_wrapper(routes.post_lora_adapters)); 185 | // Save & load slots 186 | ctx_http.get ("/slots", ex_wrapper(routes.get_slots)); 187 | ctx_http.post("/slots/:id_slot", ex_wrapper(routes.post_slots)); 188 | 189 | // 190 | // Start the server 191 | // 192 | 193 | std::function clean_up; 194 | 195 | if (is_router_server) { 196 | LOG_INF("%s: starting router server, no model will be loaded in this process\n", __func__); 197 | 198 | clean_up = [&models_routes]() { 199 | SRV_INF("%s: cleaning up before exit...\n", __func__); 200 | if (models_routes.has_value()) { 201 | models_routes->models.unload_all(); 202 | } 203 | llama_backend_free(); 204 | }; 205 | 206 | if (!ctx_http.start()) { 207 | clean_up(); 208 | LOG_ERR("%s: exiting due to HTTP server error\n", __func__); 209 | return 1; 210 | } 211 | ctx_http.is_ready.store(true); 212 | 213 | shutdown_handler = [&](int) { 214 | ctx_http.stop(); 215 | }; 216 | 217 | } else { 218 | // setup clean up function, to be called before exit 219 | clean_up = [&ctx_http, &ctx_server]() { 220 | SRV_INF("%s: cleaning up before exit...\n", __func__); 221 | ctx_http.stop(); 222 | ctx_server.terminate(); 223 | 
llama_backend_free(); 224 | }; 225 | 226 | // start the HTTP server before loading the model to be able to serve /health requests 227 | if (!ctx_http.start()) { 228 | clean_up(); 229 | LOG_ERR("%s: exiting due to HTTP server error\n", __func__); 230 | return 1; 231 | } 232 | 233 | // load the model 234 | LOG_INF("%s: loading model\n", __func__); 235 | 236 | if (!ctx_server.load_model(params)) { 237 | clean_up(); 238 | if (ctx_http.thread.joinable()) { 239 | ctx_http.thread.join(); 240 | } 241 | LOG_ERR("%s: exiting due to model loading error\n", __func__); 242 | return 1; 243 | } 244 | 245 | ctx_server.init(); 246 | ctx_http.is_ready.store(true); 247 | 248 | LOG_INF("%s: model loaded\n", __func__); 249 | 250 | shutdown_handler = [&](int) { 251 | // this will unblock start_loop() 252 | ctx_server.terminate(); 253 | }; 254 | } 255 | 256 | // TODO: refactor in common/console 257 | #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) 258 | struct sigaction sigint_action; 259 | sigint_action.sa_handler = signal_handler; 260 | sigemptyset (&sigint_action.sa_mask); 261 | sigint_action.sa_flags = 0; 262 | sigaction(SIGINT, &sigint_action, NULL); 263 | sigaction(SIGTERM, &sigint_action, NULL); 264 | #elif defined (_WIN32) 265 | auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { 266 | return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false; 267 | }; 268 | SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); 269 | #endif 270 | 271 | if (is_router_server) { 272 | LOG_INF("%s: router server is listening on %s\n", __func__, ctx_http.listening_address.c_str()); 273 | LOG_INF("%s: NOTE: router mode is experimental\n", __func__); 274 | LOG_INF("%s: it is not recommended to use this mode in untrusted environments\n", __func__); 275 | if (ctx_http.thread.joinable()) { 276 | ctx_http.thread.join(); // keep the main thread alive 277 | } 278 | 279 | // when the HTTP server stops, clean up and exit 280 | clean_up(); 281 | } else { 282 | LOG_INF("%s: server is listening on %s\n", __func__, ctx_http.listening_address.c_str()); 283 | LOG_INF("%s: starting the main loop...\n", __func__); 284 | 285 | // optionally, notify router server that this instance is ready 286 | const char * router_port = std::getenv("LLAMA_SERVER_ROUTER_PORT"); 287 | std::thread monitor_thread; 288 | if (router_port != nullptr) { 289 | monitor_thread = server_models::setup_child_server(params, std::atoi(router_port), params.model_alias, shutdown_handler); 290 | } 291 | 292 | // this call blocks the main thread until queue_tasks.terminate() is called 293 | ctx_server.start_loop(); 294 | 295 | clean_up(); 296 | if (ctx_http.thread.joinable()) { 297 | ctx_http.thread.join(); 298 | } 299 | if (monitor_thread.joinable()) { 300 | monitor_thread.join(); 301 | } 302 | llama_memory_breakdown_print(ctx_server.get_llama_context()); 303 | } 304 | 305 | return 0; 306 | } 307 | -------------------------------------------------------------------------------- /tests/test_server_http.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import time 5 | import base64 6 | import pytest 7 | import requests 8 | import threading 9 | from typing import Dict, Any 10 | 11 | import xllamacpp as xlc 12 | 13 | 14 | class TestServerHTTP: 15 | """Test suite for xllamacpp HTTP server endpoints""" 16 | 17 | @pytest.fixture(scope="class") 18 | def server_url(self): 19 | """Start HTTP server using xllamacpp.Server and return base URL""" 
20 | # Configure server parameters 21 | params = xlc.CommonParams() 22 | params.model.path = os.path.join( 23 | os.path.dirname(__file__), "../models/Llama-3.2-1B-Instruct-Q8_0.gguf" 24 | ) 25 | params.n_parallel = 1 26 | params.n_ctx = 256 27 | params.cpuparams.n_threads = 2 28 | params.cpuparams_batch.n_threads = 2 29 | params.endpoint_metrics = True 30 | 31 | # Create server instance - this automatically starts the HTTP server 32 | server = xlc.Server(params) 33 | 34 | # Wait for server to be ready - default port is likely 8080 35 | base_url = server.listening_address 36 | max_wait = 5 # seconds 37 | wait_interval = 0.5 38 | 39 | for _ in range(int(max_wait / wait_interval)): 40 | try: 41 | response = requests.get(f"{base_url}/health", timeout=1) 42 | if response.status_code == 200: 43 | yield base_url 44 | break 45 | except requests.exceptions.RequestException: 46 | time.sleep(wait_interval) 47 | else: 48 | pytest.fail("Server failed to start within timeout period") 49 | 50 | # Server will be automatically cleaned up when the object goes out of scope 51 | 52 | def test_health_endpoints(self, server_url): 53 | """Test health check endpoints""" 54 | # Test /health 55 | response = requests.get(f"{server_url}/health") 56 | assert response.status_code == 200 57 | data = response.json() 58 | assert "status" in data 59 | assert data["status"] == "ok" 60 | 61 | # Test /v1/health 62 | response = requests.get(f"{server_url}/v1/health") 63 | assert response.status_code == 200 64 | data = response.json() 65 | assert "status" in data 66 | assert data["status"] == "ok" 67 | 68 | def test_models_endpoints(self, server_url): 69 | """Test model listing endpoints""" 70 | # Test /models 71 | response = requests.get(f"{server_url}/models") 72 | assert response.status_code == 200 73 | data = response.json() 74 | assert "data" in data 75 | assert len(data["data"]) > 0 76 | model = data["data"][0] 77 | assert "id" in model 78 | assert "object" in model 79 | assert model["object"] == "model" 80 | 81 | # Test /v1/models 82 | response = requests.get(f"{server_url}/v1/models") 83 | assert response.status_code == 200 84 | data = response.json() 85 | assert "data" in data 86 | 87 | # Test /api/tags (ollama compatible) 88 | response = requests.get(f"{server_url}/api/tags") 89 | assert response.status_code == 200 90 | data = response.json() 91 | assert "models" in data 92 | 93 | def test_props_endpoints(self, server_url): 94 | """Test server properties endpoints""" 95 | # Test GET /props 96 | response = requests.get(f"{server_url}/props") 97 | assert response.status_code == 200 98 | data = response.json() 99 | assert "build_info" in data 100 | 101 | def test_metrics_endpoint(self, server_url): 102 | """Test metrics endpoint""" 103 | response = requests.get(f"{server_url}/metrics") 104 | assert response.status_code == 200 105 | # Metrics should be in Prometheus format 106 | assert "llamacpp:" in response.text 107 | 108 | def test_completion_endpoints(self, server_url): 109 | """Test text completion endpoints""" 110 | completion_data = { 111 | "prompt": "The capital of France is", 112 | "max_tokens": 10, 113 | "temperature": 0.1, 114 | } 115 | 116 | # Test /v1/completions (OpenAI compatible) 117 | response = requests.post(f"{server_url}/v1/completions", json=completion_data) 118 | assert response.status_code == 200 119 | data = response.json() 120 | assert "choices" in data 121 | 122 | def test_chat_completion_endpoints(self, server_url): 123 | """Test chat completion endpoints""" 124 | chat_data = { 125 | "messages": 
[ 126 | {"role": "system", "content": "You are a helpful assistant."}, 127 | {"role": "user", "content": "What is the capital of France?"}, 128 | ], 129 | "max_tokens": 10, 130 | "temperature": 0.1, 131 | } 132 | 133 | # Test /chat/completions 134 | response = requests.post(f"{server_url}/chat/completions", json=chat_data) 135 | assert response.status_code == 200 136 | data = response.json() 137 | assert "choices" in data 138 | assert len(data["choices"]) > 0 139 | assert "message" in data["choices"][0] 140 | assert "content" in data["choices"][0]["message"] 141 | 142 | # Test /v1/chat/completions (OpenAI compatible) 143 | response = requests.post(f"{server_url}/v1/chat/completions", json=chat_data) 144 | assert response.status_code == 200 145 | data = response.json() 146 | assert "choices" in data 147 | 148 | # Test /api/chat (ollama compatible) 149 | response = requests.post(f"{server_url}/api/chat", json=chat_data) 150 | assert response.status_code == 200 151 | 152 | def test_tokenize_endpoints(self, server_url): 153 | """Test tokenization endpoints""" 154 | tokenize_data = {"content": "Hello world, how are you?"} 155 | 156 | # Test /tokenize 157 | response = requests.post(f"{server_url}/tokenize", json=tokenize_data) 158 | assert response.status_code == 200 159 | data = response.json() 160 | assert "tokens" in data 161 | 162 | # Test /detokenize 163 | detokenize_data = {"tokens": [1, 2, 3, 4, 5]} 164 | response = requests.post(f"{server_url}/detokenize", json=detokenize_data) 165 | assert response.status_code == 200 166 | data = response.json() 167 | assert "content" in data 168 | 169 | def test_apply_template_endpoint(self, server_url): 170 | """Test template application endpoint""" 171 | template_data = { 172 | "messages": [ 173 | {"role": "system", "content": "You are a test."}, 174 | {"role": "user", "content": "Hi there"}, 175 | ] 176 | } 177 | 178 | response = requests.post(f"{server_url}/apply-template", json=template_data) 179 | assert response.status_code == 200 180 | body = response.json() 181 | assert "prompt" in body 182 | assert "You are a test." 
in body["prompt"] 183 | 184 | def test_slots_endpoints(self, server_url): 185 | """Test slots management endpoints""" 186 | # Test GET /slots 187 | response = requests.get(f"{server_url}/slots") 188 | assert response.status_code == 200 189 | data = response.json() 190 | assert isinstance(data, list) 191 | 192 | def test_streaming_completions(self, server_url): 193 | """Test streaming completion endpoints""" 194 | completion_data = { 195 | "prompt": "The capital of France is", 196 | "max_tokens": 10, 197 | "stream": True, 198 | } 199 | 200 | response = requests.post( 201 | f"{server_url}/completions", json=completion_data, stream=True 202 | ) 203 | assert response.status_code == 200 204 | 205 | # Read streaming response 206 | lines = response.iter_lines() 207 | first_line = next(lines, None) 208 | assert first_line is not None 209 | assert first_line.startswith(b"data: ") 210 | 211 | def test_streaming_chat_completions(self, server_url): 212 | """Test streaming chat completion endpoints""" 213 | chat_data = { 214 | "messages": [{"role": "user", "content": "What is the capital of France?"}], 215 | "max_tokens": 10, 216 | "stream": True, 217 | } 218 | 219 | response = requests.post( 220 | f"{server_url}/chat/completions", json=chat_data, stream=True 221 | ) 222 | assert response.status_code == 200 223 | 224 | # Read streaming response 225 | lines = response.iter_lines() 226 | first_line = next(lines, None) 227 | assert first_line is not None 228 | assert first_line.startswith(b"data: ") 229 | 230 | def test_error_handling(self, server_url): 231 | """Test error handling for invalid requests""" 232 | # Test invalid JSON 233 | response = requests.post(f"{server_url}/completions", data="invalid json") 234 | assert response.status_code == 500 235 | 236 | # Test missing required fields 237 | response = requests.post(f"{server_url}/completions", json={}) 238 | assert response.status_code in [400, 422] 239 | 240 | # Test invalid endpoint 241 | response = requests.get(f"{server_url}/invalid_endpoint") 242 | assert response.status_code == 404 243 | 244 | def test_concurrent_requests(self, server_url): 245 | """Test handling of concurrent requests""" 246 | import concurrent.futures 247 | 248 | def make_request(): 249 | response = requests.get(f"{server_url}/health") 250 | return response.status_code 251 | 252 | with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: 253 | futures = [executor.submit(make_request) for _ in range(10)] 254 | results = [future.result() for future in futures] 255 | 256 | # All requests should succeed 257 | assert all(status == 200 for status in results) 258 | 259 | 260 | # Test with embedding model 261 | class TestServerHTTPEmbedding: 262 | """Test suite for embedding-specific HTTP endpoints""" 263 | 264 | @pytest.fixture(scope="class") 265 | def embedding_server_url(self): 266 | """Start HTTP server using xllamacpp.Server with embedding model""" 267 | # Configure server parameters for embedding model 268 | params = xlc.CommonParams() 269 | params.model.path = os.path.join( 270 | os.path.dirname(__file__), "../models/Qwen3-Embedding-0.6B-Q8_0.gguf" 271 | ) 272 | params.embedding = True 273 | params.n_predict = -1 274 | params.n_ctx = 512 275 | params.n_batch = 128 276 | params.n_ubatch = 128 277 | params.cpuparams.n_threads = 2 278 | params.cpuparams_batch.n_threads = 2 279 | params.pooling_type = xlc.llama_pooling_type.LLAMA_POOLING_TYPE_LAST 280 | 281 | # Create server instance - this automatically starts the HTTP server 282 | server = xlc.Server(params) 283 
| 284 | # Wait for server to be ready - use different port to avoid conflicts 285 | base_url = server.listening_address 286 | max_wait = 5 287 | wait_interval = 0.5 288 | 289 | for _ in range(int(max_wait / wait_interval)): 290 | try: 291 | response = requests.get(f"{base_url}/health", timeout=1) 292 | if response.status_code == 200: 293 | yield base_url 294 | break 295 | except requests.exceptions.RequestException: 296 | time.sleep(wait_interval) 297 | else: 298 | pytest.fail("Embedding server failed to start within timeout period") 299 | 300 | # Server will be automatically cleaned up when the object goes out of scope 301 | 302 | def test_embedding_model_specific(self, embedding_server_url): 303 | """Test embedding-specific functionality""" 304 | embedding_data = { 305 | "input": [ 306 | "I believe the meaning of life is", 307 | "This is a test", 308 | "This is another test", 309 | ] 310 | } 311 | 312 | response = requests.post( 313 | f"{embedding_server_url}/v1/embeddings", json=embedding_data 314 | ) 315 | assert response.status_code == 200 316 | data = response.json() 317 | assert len(data["data"]) == 3 318 | 319 | # Check embedding dimensions (should be consistent) 320 | first_embedding = data["data"][0]["embedding"] 321 | assert len(first_embedding) > 0 322 | 323 | for item in data["data"]: 324 | assert len(item["embedding"]) == len(first_embedding) 325 | 326 | 327 | # Test with rerank model 328 | class TestServerHTTPRerank: 329 | """Test suite for rerank-specific HTTP endpoints""" 330 | 331 | @pytest.fixture(scope="class") 332 | def rerank_server_url(self): 333 | """Start HTTP server using xllamacpp.Server with rerank model""" 334 | # Configure server parameters for rerank model 335 | params = xlc.CommonParams() 336 | params.model.path = os.path.join( 337 | os.path.dirname(__file__), "../models/bge-reranker-v2-m3-Q2_K.gguf" 338 | ) 339 | params.embedding = True 340 | params.n_predict = -1 341 | params.n_ctx = 512 342 | params.n_batch = 128 343 | params.n_ubatch = 128 344 | params.cpuparams.n_threads = 2 345 | params.cpuparams_batch.n_threads = 2 346 | params.pooling_type = xlc.llama_pooling_type.LLAMA_POOLING_TYPE_RANK 347 | 348 | # Create server instance - this automatically starts the HTTP server 349 | server = xlc.Server(params) 350 | 351 | # Wait for server to be ready - use different port to avoid conflicts 352 | base_url = server.listening_address 353 | max_wait = 5 354 | wait_interval = 0.5 355 | 356 | for _ in range(int(max_wait / wait_interval)): 357 | try: 358 | response = requests.get(f"{base_url}/health", timeout=1) 359 | if response.status_code == 200: 360 | yield base_url 361 | break 362 | except requests.exceptions.RequestException: 363 | time.sleep(wait_interval) 364 | else: 365 | pytest.fail("Rerank server failed to start within timeout period") 366 | 367 | # Server will be automatically cleaned up when the object goes out of scope 368 | 369 | def test_rerank_model_specific(self, rerank_server_url): 370 | """Test rerank-specific functionality""" 371 | TEST_DOCUMENTS = [ 372 | "A machine is a physical system that uses power to apply forces and control movement to perform an action. The term is commonly applied to artificial devices, such as those employing engines or motors, but also to natural biological macromolecules, such as molecular machines.", 373 | "Learning is the process of acquiring new understanding, knowledge, behaviors, skills, values, attitudes, and preferences. 
The ability to learn is possessed by humans, non-human animals, and some machines; there is also evidence for some kind of learning in certain plants.", 374 | "Machine learning is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.", 375 | "Paris, capitale de la France, est une grande ville européenne et un centre mondial de l'art, de la mode, de la gastronomie et de la culture. Son paysage urbain du XIXe siècle est traversé par de larges boulevards et la Seine.", 376 | ] 377 | 378 | response = requests.post( 379 | f"{rerank_server_url}/rerank", 380 | json={ 381 | "query": "Machine learning is", 382 | "documents": TEST_DOCUMENTS, 383 | }, 384 | ) 385 | assert response.status_code == 200 386 | body = response.json() 387 | assert len(body["results"]) == 4 388 | 389 | most_relevant = body["results"][0] 390 | least_relevant = body["results"][0] 391 | for doc in body["results"]: 392 | if doc["relevance_score"] > most_relevant["relevance_score"]: 393 | most_relevant = doc 394 | if doc["relevance_score"] < least_relevant["relevance_score"]: 395 | least_relevant = doc 396 | 397 | assert most_relevant["relevance_score"] > least_relevant["relevance_score"] 398 | assert most_relevant["index"] == 2 399 | assert least_relevant["index"] == 3 400 | -------------------------------------------------------------------------------- /src/xllamacpp/memory.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2023 XProbe Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import logging 16 | from collections.abc import Sequence 17 | from dataclasses import dataclass 18 | from typing import Any, TYPE_CHECKING 19 | 20 | if TYPE_CHECKING: 21 | from gguf import GGUFReader, GGUFValueType # type: ignore 22 | 23 | 24 | def _import_gguf(): 25 | try: 26 | from gguf import GGUFReader, GGUFValueType # type: ignore 27 | 28 | return GGUFReader, GGUFValueType 29 | except Exception as e: 30 | raise RuntimeError( 31 | "Optional dependency 'gguf' is required for xllamacpp.memory. Install it with `pip install gguf`." 
32 | ) from e 33 | 34 | 35 | logger = logging.getLogger(__name__) 36 | 37 | 38 | def get_file_host_endian(reader: "GGUFReader") -> tuple[str, str]: 39 | file_endian = reader.endianess.name # codespell:ignore 40 | if reader.byte_order == "S": 41 | host_endian = "BIG" if file_endian == "LITTLE" else "LITTLE" 42 | else: 43 | host_endian = file_endian 44 | return host_endian, file_endian 45 | 46 | 47 | def dump_metadata_json(reader: "GGUFReader", model_path: str) -> dict: 48 | _, GGUFValueType = _import_gguf() 49 | host_endian, file_endian = get_file_host_endian(reader) 50 | metadata: dict[str, Any] = {} 51 | tensors: dict[str, Any] = {} 52 | result = { 53 | "filename": model_path, 54 | "endian": file_endian, 55 | "metadata": metadata, 56 | "tensors": tensors, 57 | } 58 | for idx, field in enumerate(reader.fields.values()): 59 | curr: dict[str, Any] = { 60 | "index": idx, 61 | "type": field.types[0].name if field.types else "UNKNOWN", 62 | "offset": field.offset, 63 | } 64 | metadata[field.name] = curr 65 | if field.types[:1] == [GGUFValueType.ARRAY]: 66 | curr["array_types"] = [t.name for t in field.types][1:] 67 | curr["value"] = field.contents() 68 | else: 69 | curr["value"] = field.contents() 70 | for i, tensor in enumerate(reader.tensors): 71 | tensors[tensor.name] = { 72 | "index": i, 73 | "shape": tensor.shape.tolist(), 74 | "type": tensor.tensor_type.name, 75 | "offset": tensor.field.offset, 76 | "n_bytes": tensor.n_bytes, 77 | } 78 | return result 79 | 80 | 81 | @dataclass 82 | class MemoryEstimate: 83 | # How many layers we predict we can load 84 | layers: int 85 | # The size of the graph which occupies the main GPU 86 | graph: int 87 | # How much VRAM will be allocated given the number of layers we predict 88 | vram_size: int 89 | # The total size of the model if loaded into VRAM. 
If all layers are loaded, vram_size == total_size 90 | total_size: int 91 | # For multi-GPU scenarios, this provides the tensor split parameter 92 | tensor_split: list[float] 93 | # For multi-GPU scenarios, this is the size in bytes per GPU 94 | gpu_sizes: list[int] 95 | 96 | 97 | def _get_max_min(value): 98 | if isinstance(value, Sequence): 99 | return max(value), min(value) 100 | else: 101 | return value, value 102 | 103 | 104 | def graph_size( 105 | metadata: dict, 106 | context_length: int, 107 | batch_size: int, 108 | num_parallel: int, 109 | kv_cache_type: str, 110 | ): 111 | """ 112 | Most of the logic comes from `GraphSize` in https://github.com/ollama/ollama/blob/main/fs/ggml/ggml.go 113 | """ 114 | if context_length < batch_size: 115 | batch_size = context_length 116 | 117 | architecture = metadata["general.architecture"]["value"] 118 | embedding_length = metadata[f"{architecture}.embedding_length"]["value"] 119 | block_count = metadata[f"{architecture}.block_count"]["value"] 120 | head_count_max, head_count_min = _get_max_min( 121 | metadata.get(f"{architecture}.attention.head_count", {}).get("value", 1) 122 | ) 123 | head_count_kv_max, head_count_kv_min = _get_max_min( 124 | metadata.get(f"{architecture}.attention.head_count_kv", {}).get("value", 1) 125 | ) 126 | vocab = len(metadata["tokenizer.ggml.tokens"]["value"]) 127 | embedding_head_count_max = ( 128 | (embedding_length // head_count_min) if head_count_min > 0 else 0 129 | ) 130 | embedding_head_count_k = metadata.get( 131 | f"{architecture}.attention.key_length", {} 132 | ).get("value", embedding_head_count_max) 133 | embedding_head_count_v = metadata.get( 134 | f"{architecture}.attention.value_length", {} 135 | ).get("value", embedding_head_count_max) 136 | 137 | # f16(default) 138 | bytes_per_kv_element = { 139 | "q8_0": 1, # 1/2 of fp16 140 | "q4_0": 0.5, # 1/4 of fp16 141 | }.get(kv_cache_type, 2) 142 | 143 | kv = [0] * block_count 144 | for i in range(block_count): 145 | kv[i] = ( 146 | context_length 147 | * (embedding_head_count_k + embedding_head_count_v) 148 | * head_count_kv_max 149 | * bytes_per_kv_element 150 | ) 151 | 152 | full_offload = 0 153 | partial_offload = 0 154 | if architecture in ["llama", "llama4"]: 155 | full_offload = max( 156 | 4 157 | * batch_size 158 | * (1 + 4 * embedding_length + context_length * (1 + head_count_max)), 159 | 4 * batch_size * (embedding_length + vocab), 160 | ) 161 | partial_offload = 4 * batch_size * embedding_length 162 | partial_offload += max( 163 | 4 164 | * batch_size 165 | * (1 + embedding_length + max(context_length, embedding_length)) 166 | + embedding_length * embedding_length * 9 / 16 167 | + 4 168 | * context_length 169 | * ( 170 | batch_size * head_count_max 171 | + embedding_head_count_max * head_count_kv_max 172 | ), 173 | 4 * batch_size * (embedding_length + vocab) 174 | + embedding_length * vocab * 105 / 128, 175 | ) 176 | elif architecture in ["gemma", "gemma2", "gemma3"]: 177 | full_offload = max( 178 | 4 * batch_size * (embedding_length + vocab), 179 | 4 180 | * batch_size 181 | * ( 182 | 2 183 | + context_length 184 | + context_length * head_count_max 185 | + 2 * embedding_length 186 | + 2 * embedding_head_count_k * head_count_max 187 | ), 188 | ) 189 | partial_offload = max( 190 | 4 * embedding_length * batch_size 191 | + embedding_length * vocab * 105 / 128 192 | + 4 * vocab * batch_size, 193 | 4 194 | * batch_size 195 | * ( 196 | 2 * embedding_length 197 | + 1 198 | + 2 * embedding_head_count_k * head_count_max 199 | + context_length 200 | + 
context_length * head_count_max 201 | ) 202 | + 4 * embedding_head_count_k * context_length * 8 203 | + embedding_length * embedding_head_count_k * head_count_max * 9 / 16, 204 | ) 205 | if architecture == "gemma3": 206 | gemma3_global_cache_count = 6 207 | sliding_window = ( 208 | num_parallel 209 | * metadata[f"{architecture}.attention.sliding_window"]["value"] 210 | + batch_size 211 | ) 212 | for i in range(block_count): 213 | if (i + 1) % gemma3_global_cache_count != 0: 214 | kv[i] = ( 215 | sliding_window 216 | * (embedding_head_count_k + embedding_head_count_v) 217 | * head_count_kv_max 218 | * bytes_per_kv_element 219 | ) 220 | elif architecture == "qwen2": 221 | full_offload = max( 222 | 4 * batch_size * (embedding_length + vocab), 223 | 4 224 | * batch_size 225 | * ( 226 | 1 227 | + 2 * embedding_length 228 | + context_length 229 | + context_length * head_count_max 230 | ), 231 | ) 232 | 233 | partial_offload = max( 234 | 4 * batch_size * (embedding_length + vocab) 235 | + embedding_length * vocab * 105 / 128, 236 | 4 237 | * ( 238 | batch_size 239 | * (1 + 2 * embedding_length + context_length * (1 + head_count_max)) 240 | + embedding_length * (1 + context_length) 241 | ), 242 | ) 243 | elif architecture == "stablelm": 244 | full_offload = ( 245 | 4 246 | * batch_size 247 | * (context_length * (1 + head_count_max) + 3 * embedding_length + 2) 248 | ) 249 | partial_offload = max( 250 | 4 * batch_size * (vocab + 2 * embedding_length), full_offload 251 | ) 252 | elif architecture == "deepseek2": 253 | full_offload = max( 254 | 4 * batch_size * (3 * embedding_length + vocab), 255 | 4 256 | * batch_size 257 | * ( 258 | 3 * embedding_length 259 | + 2 260 | + context_length * (1 + head_count_kv_max) 261 | + 2 * embedding_head_count_k * head_count_kv_max 262 | ), 263 | ) 264 | 265 | partial_offload = max( 266 | 4 * batch_size * (3 * embedding_length + vocab) 267 | + embedding_length * vocab * 105 / 128, 268 | 4 269 | * batch_size 270 | * ( 271 | 2 * embedding_length 272 | + 1 273 | + 2 * embedding_head_count_k * head_count_kv_max 274 | + context_length 275 | + context_length * head_count_kv_max 276 | ) 277 | + 4 * embedding_head_count_k * context_length * head_count_kv_max 278 | + embedding_length * embedding_head_count_k * head_count_kv_max * 9 / 16, 279 | ) 280 | 281 | kv_total = sum(kv) 282 | if partial_offload == 0: 283 | partial_offload = ( 284 | head_count_max 285 | / (1 if head_count_kv_min <= 0 else head_count_kv_min) 286 | * kv_total 287 | / 6 288 | ) 289 | if full_offload == 0: 290 | full_offload = partial_offload 291 | 292 | return kv, partial_offload, full_offload 293 | 294 | 295 | def projector_memory_requirements(projector: str): 296 | GGUFReader, _ = _import_gguf() 297 | reader = GGUFReader(projector, "r") 298 | data = dump_metadata_json(reader, projector) 299 | return sum(t["n_bytes"] for t in data["tensors"].values()) 300 | 301 | 302 | def estimate_gpu_layers( 303 | gpus: list[dict], 304 | model_path: str, 305 | projectors: list[str], 306 | context_length: int, 307 | batch_size: int, 308 | num_parallel: int, 309 | kv_cache_type: str, 310 | ): 311 | """ 312 | Most of the logic comes from `EstimateGPULayers` in https://github.com/ollama/ollama/blob/main/llm/memory.go 313 | """ 314 | # Projectors loaded into GPU0 only 315 | projector_weights = sum(map(projector_memory_requirements, projectors)) 316 | if projector_weights > 0: 317 | # Multimodal models require at least 2048 context 318 | context_length = max(context_length, 2048) 319 | GGUFReader, _ = _import_gguf() 320 | 
reader = GGUFReader(model_path, "r") 321 | data = dump_metadata_json(reader, model_path) 322 | metadata = data["metadata"] 323 | kv, graph_partial_offload, graph_full_offload = graph_size( 324 | metadata, 325 | context_length=context_length, 326 | batch_size=batch_size, 327 | num_parallel=num_parallel, 328 | kv_cache_type=kv_cache_type, 329 | ) 330 | # Get all layer sizes 331 | architecture = metadata["general.architecture"]["value"] 332 | block_count = metadata[f"{architecture}.block_count"]["value"] 333 | layer_sizes = [0] * block_count 334 | for name, layer in data["tensors"].items(): 335 | if name.startswith("blk."): 336 | index = int(name[len("blk.") :].split(".")[0]) 337 | layer_sizes[index] += layer["n_bytes"] 338 | layer_size = layer_sizes[0] if layer_sizes else 0 339 | 340 | if len(kv) > 0: 341 | layer_size += kv[0] 342 | # On metal there's no partial offload overhead 343 | if gpus[0]["name"] == "Metal": 344 | graph_partial_offload = graph_full_offload 345 | elif len(gpus) > 1: 346 | # Multi gpu should always use the partial graph size 347 | graph_full_offload = graph_partial_offload 348 | 349 | # Get output layer size 350 | memory_layer_output = 0 351 | # Output layer handled at the end if we have space 352 | for name, layer in data["tensors"].items(): 353 | if any( 354 | name.startswith(prefix) 355 | for prefix in ["output_norm", "output", "token_embd"] 356 | ): 357 | memory_layer_output += layer["n_bytes"] 358 | 359 | # Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer 360 | default_memory_min = 512 * 1024**2 361 | gpu_allocations = [0] * len(gpus) 362 | gpus_with_space: list[int] = [] 363 | for i in range(len(gpus)): 364 | gpu0_overhead = projector_weights if len(gpus_with_space) == 0 else 0 365 | minimum_memory = gpus[i].get("memory_min", default_memory_min) 366 | if ( 367 | gpus[i]["memory_free"] 368 | < gpu0_overhead 369 | + max(graph_partial_offload, graph_full_offload) 370 | + minimum_memory 371 | + 2 * layer_size 372 | ): 373 | continue 374 | gpus_with_space.append(i) 375 | gpu_allocations[i] += gpu0_overhead + minimum_memory + layer_size 376 | 377 | overflow = 0 378 | if len(gpus_with_space) == 0: 379 | overflow = projector_weights 380 | 381 | # For all the layers, find where they can fit on the GPU(s) 382 | layer_count = 0 383 | layer_counts = [0] * len(gpus) 384 | for i in range(block_count - 1, -1, -1): 385 | layer_size = layer_sizes[i] 386 | layer_size += kv[i] 387 | 388 | # Distribute the layers across the GPU(s) that have space 389 | for j in range(len(gpus_with_space), 0, -1): 390 | g = gpus_with_space[i % j] 391 | used = gpu_allocations[g] + max(graph_partial_offload, graph_full_offload) 392 | if gpus[g]["memory_free"] > used + layer_size: 393 | gpu_allocations[g] += layer_size 394 | layer_counts[g] += 1 395 | layer_count += 1 396 | break 397 | else: 398 | gpus_with_space = ( 399 | gpus_with_space[: i % j] + gpus_with_space[i % j + 1 :] 400 | ) 401 | 402 | if len(gpus_with_space) == 0: 403 | overflow += layer_size 404 | 405 | fully_loaded = False 406 | if layer_count >= block_count: 407 | fully_loaded = True 408 | 409 | # Determine if we need to consider output then find where it fits 410 | if memory_layer_output > 0: 411 | for j in range(len(gpus_with_space), 0, -1): 412 | g = gpus_with_space[layer_count % j] 413 | used = gpu_allocations[g] + max(graph_partial_offload, graph_full_offload) 414 | if gpus[g]["memory_free"] > used + memory_layer_output: 415 | gpu_allocations[g] += memory_layer_output 416 | 
layer_counts[g] += 1 417 | layer_count += 1 418 | break 419 | else: 420 | gpus_with_space = ( 421 | gpus_with_space[: layer_count % j] 422 | + gpus_with_space[layer_count % j + 1 :] 423 | ) 424 | 425 | if layer_count < block_count + 1: 426 | fully_loaded = False 427 | overflow += memory_layer_output 428 | 429 | # Add the applicable (full or partial) graph allocations 430 | for i in range(len(gpus)): 431 | if layer_counts[i] <= 0: 432 | continue 433 | if fully_loaded: 434 | gpu_allocations[i] += graph_full_offload 435 | else: 436 | gpu_allocations[i] += graph_partial_offload 437 | 438 | if fully_loaded: 439 | graph_offload = graph_full_offload 440 | else: 441 | graph_offload = graph_partial_offload 442 | 443 | # Normalize splits 444 | tensor_split = layer_counts 445 | if layer_count != 0: 446 | for i in range(len(tensor_split)): 447 | tensor_split[i] /= layer_count 448 | 449 | # Summaries 450 | memory_required_partial = sum(gpu_allocations) 451 | memory_required_total = memory_required_partial + overflow 452 | 453 | estimate = MemoryEstimate( 454 | layers=0, 455 | graph=0, 456 | vram_size=0, 457 | total_size=int(memory_required_total), 458 | tensor_split=tensor_split, 459 | gpu_sizes=[], 460 | ) 461 | if gpus[0]["name"] == "CPU": 462 | return estimate 463 | if layer_count == 0: 464 | return estimate 465 | 466 | estimate.layers = layer_count 467 | estimate.graph = int(graph_offload) 468 | estimate.vram_size = int(memory_required_partial) 469 | estimate.total_size = int(memory_required_total) 470 | estimate.tensor_split = tensor_split 471 | estimate.gpu_sizes = [int(i) for i in gpu_allocations] 472 | return estimate 473 | -------------------------------------------------------------------------------- /.github/workflows/build-wheel-cuda-hip.yaml: -------------------------------------------------------------------------------- 1 | name: Build Wheels (CUDA & HIP) 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | workflow_dispatch: 8 | 9 | # on: 10 | # push: 11 | # branches: 12 | # - '*' 13 | # pull_request: 14 | # types: ['opened', 'reopened', 'synchronize'] 15 | 16 | concurrency: 17 | group: ${{ github.workflow }}-${{ github.ref }} 18 | cancel-in-progress: true 19 | 20 | permissions: 21 | contents: write 22 | 23 | jobs: 24 | build_wheels_hip_linux: 25 | name: Build Wheel HIP Linux ${{ matrix.pyver }} ${{matrix.hip}} 26 | runs-on: ubuntu-22.04 27 | strategy: 28 | matrix: 29 | pyver: ["3.10", "3.11", "3.12", "3.13"] 30 | hip: ["6.3.4", "6.4.1"] 31 | steps: 32 | - name: Free Disk Space (Ubuntu) 33 | uses: jlumbroso/free-disk-space@main 34 | with: 35 | # this might remove tools that are actually needed, 36 | # if set to "true" but frees about 6 GB 37 | tool-cache: false 38 | 39 | # all of these default to true, but feel free to set to 40 | # "false" if necessary for your workflow 41 | android: true 42 | dotnet: true 43 | haskell: true 44 | large-packages: false 45 | docker-images: true 46 | swap-storage: true 47 | 48 | - name: Clone 49 | id: checkout 50 | uses: actions/checkout@v4 51 | with: 52 | submodules: "recursive" 53 | 54 | - name: Set up Python ${{ matrix.pyver }} 55 | id: setup-python 56 | uses: actions/setup-python@v5 57 | with: 58 | python-version: ${{ matrix.pyver }} 59 | 60 | - name: Start ROCm container 61 | run: | 62 | # Get Python location from setup-python 63 | PYTHON_PATH=$(which python) 64 | PYTHON_HOME=$(dirname $(dirname $PYTHON_PATH)) 65 | 66 | # Start the container with Python from host mounted 67 | docker run -d \ 68 | --name rocm-container \ 69 | -v ${{ github.workspace 
}}:/workspace \ 70 | -v $PYTHON_HOME:$PYTHON_HOME \ 71 | -e PATH=$PYTHON_HOME/bin:$PATH \ 72 | -w /workspace \ 73 | rocm/dev-ubuntu-22.04:${{ matrix.hip }} \ 74 | sleep infinity 75 | 76 | # Verify Python installation in container 77 | docker exec rocm-container python --version 78 | docker exec rocm-container pip --version 79 | 80 | - name: System Dependencies 81 | run: | 82 | docker exec rocm-container bash -c ' 83 | df -ha 84 | apt-get update 85 | apt-get install -y build-essential git cmake libcurl4-openssl-dev patchelf rocblas-dev hipblas-dev rocwmma-dev curl 86 | apt-get clean 87 | df -ha 88 | hipconfig --full 89 | ls -alh /opt/rocm/lib 90 | ' 91 | 92 | - name: Install Rust 93 | run: | 94 | docker exec rocm-container bash -c ' 95 | curl --proto "=https" --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y 96 | source $HOME/.cargo/env 97 | rustc --version 98 | cargo --version 99 | # Add cargo to PATH permanently for this container 100 | echo "export PATH=\$HOME/.cargo/bin:\$PATH" >> $HOME/.bashrc 101 | ' 102 | 103 | - name: Setup Python environment 104 | run: | 105 | docker exec rocm-container bash -c ' 106 | python -m pip install --upgrade pip 107 | python -m pip install build wheel auditwheel 108 | python -m pip install -r requirements.txt 109 | ' 110 | 111 | - name: Build with native CMake HIP support 112 | env: 113 | XLLAMACPP_BUILD_HIP: "1" 114 | VERSIONEER_CLOSEST_TAG_ONLY: "1" 115 | VERBOSE: "1" 116 | run: | 117 | docker exec -e XLLAMACPP_BUILD_HIP=$XLLAMACPP_BUILD_HIP \ 118 | -e VERSIONEER_CLOSEST_TAG_ONLY=$VERSIONEER_CLOSEST_TAG_ONLY \ 119 | -e VERBOSE=$VERBOSE \ 120 | rocm-container bash -l -c ' 121 | python --version 122 | gcc -v 123 | cargo --version 124 | printenv 125 | git config --global --add safe.directory "*" 126 | make 127 | python -m build --wheel 128 | df -ha 129 | echo "Clean up" 130 | rm -rf build 131 | rm -rf thirdparty 132 | df -ha 133 | auditwheel show dist/*.whl 134 | auditwheel repair --plat manylinux_2_35_x86_64 dist/*.whl -w dist 135 | rm dist/*-linux_x86_64.whl 136 | ls -alh dist 137 | ' 138 | 139 | - name: Stop ROCm container and verify wheel files 140 | if: always() 141 | run: | 142 | # Always stop and remove the container 143 | docker stop rocm-container || true 144 | docker rm rocm-container || true 145 | 146 | # Check if any wheel files exist in the dist directory 147 | if [ -z "$(ls -A dist/*.whl 2>/dev/null)" ]; then 148 | echo "❌ No wheel files found in dist directory!" 
149 | echo "Current directory contents:" 150 | ls -la dist/ 2>/dev/null || echo "No dist directory found" 151 | exit 1 152 | fi 153 | 154 | echo "✅ Wheel files found in dist directory" 155 | 156 | # - uses: actions/upload-artifact@v4 157 | # with: 158 | # name: artifacts 159 | # path: ./dist/*.whl 160 | # overwrite: true 161 | 162 | - uses: softprops/action-gh-release@v2 163 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') 164 | with: 165 | files: dist/* 166 | tag_name: ${{ github.ref_name }}-rocm-${{matrix.hip}} 167 | env: 168 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 169 | 170 | 171 | build_wheels_vulkan: 172 | name: Build Wheel Vulkan ${{ matrix.os }} ${{ matrix.pyver }} 173 | runs-on: ${{ matrix.os }} 174 | strategy: 175 | matrix: 176 | os: ["ubuntu-22.04", "macos-15-intel", "windows-2022"] 177 | pyver: ["3.10", "3.11", "3.12", "3.13"] 178 | vulkan: ["1.4.313.2"] 179 | env: 180 | VULKAN_VERSION: ${{ matrix.vulkan }} 181 | steps: 182 | - name: Clone 183 | id: checkout 184 | uses: actions/checkout@v4 185 | with: 186 | submodules: "recursive" 187 | 188 | - name: Set up Python ${{ matrix.pyver }} 189 | id: setup-python 190 | uses: actions/setup-python@v5 191 | with: 192 | python-version: ${{ matrix.pyver }} 193 | 194 | - name: System Dependencies (Linux) 195 | if: runner.os == 'Linux' 196 | run: | 197 | sudo wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add - 198 | sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list 199 | sudo apt-get update -y 200 | sudo apt-get install -y build-essential git cmake libcurl4-openssl-dev patchelf vulkan-sdk mesa-vulkan-drivers 201 | 202 | - name: System Dependencies (Windows) 203 | if: runner.os == 'Windows' 204 | run: | 205 | curl -L https://github.com/skeeto/w64devkit/releases/download/v1.22.0/w64devkit-1.22.0.zip --output w64devkit.zip 206 | unzip -q w64devkit.zip -d . 
207 | echo "$(pwd)/w64devkit/bin" >> $GITHUB_PATH 208 | curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe" 209 | & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install 210 | Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" 211 | Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" 212 | 213 | - name: System Dependencies (macOS) 214 | if: runner.os == 'macOS' 215 | run: | 216 | # Download and install Vulkan SDK 217 | brew install cmake libomp vulkan-headers glslang molten-vk shaderc vulkan-loader vulkan-tools 218 | vulkaninfo 219 | 220 | - name: Setup Python environment 221 | run: | 222 | python -m pip install --upgrade pip 223 | python -m pip install build wheel 224 | python -m pip install -r requirements.txt 225 | 226 | - name: Install wheel repair tools (Linux) 227 | if: runner.os == 'Linux' 228 | run: python -m pip install auditwheel 229 | 230 | - name: Install wheel repair tools (macOS) 231 | if: runner.os == 'macOS' 232 | run: python -m pip install delocate 233 | 234 | - name: Install wheel repair tools (Windows) 235 | if: runner.os == 'Windows' 236 | run: python -m pip install delvewheel 237 | 238 | - name: Build Wheel 239 | env: 240 | XLLAMACPP_BUILD_VULKAN: "1" 241 | VERSIONEER_CLOSEST_TAG_ONLY: "1" 242 | VERBOSE: "1" 243 | run: | 244 | python --version 245 | printenv 246 | git config --global --add safe.directory "*" 247 | make 248 | python -m build --wheel 249 | 250 | - name: Repair Wheel (Linux) 251 | if: runner.os == 'Linux' 252 | run: | 253 | auditwheel show dist/*.whl 254 | auditwheel repair --plat manylinux_2_35_x86_64 dist/*.whl -w dist 255 | rm dist/*-linux_x86_64.whl 256 | ls -alh dist 257 | 258 | - name: Repair Wheel (macOS) 259 | if: runner.os == 'macOS' 260 | run: | 261 | delocate-wheel -v dist/*.whl 262 | ls -alh dist 263 | 264 | - name: Repair Wheel (Windows) 265 | if: runner.os == 'Windows' 266 | run: | 267 | python -m delvewheel repair dist/*.whl -w dist 268 | dir dist 269 | 270 | # - uses: actions/upload-artifact@v4 271 | # with: 272 | # name: artifacts-${{ runner.os }} 273 | # path: ./dist/*.whl 274 | # overwrite: true 275 | 276 | - uses: softprops/action-gh-release@v2 277 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') 278 | with: 279 | files: dist/* 280 | tag_name: ${{ github.ref_name }}-vulkan-${{ matrix.os }} 281 | env: 282 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 283 | 284 | build_wheels_cuda_linux: 285 | name: Build Wheel CUDA Linux ${{ matrix.platform }} ${{ matrix.pyver }} ${{ matrix.cuda }} 286 | runs-on: ${{ matrix.platform }} 287 | strategy: 288 | matrix: 289 | platform: ["ubuntu-22.04", "ubuntu-22.04-arm"] 290 | pyver: ["3.10", "3.11", "3.12", "3.13"] 291 | cuda: ["12.4.1", "12.8.1"] 292 | env: 293 | CUDAVER: ${{ matrix.cuda }} 294 | 295 | steps: 296 | - name: Free Disk Space (Ubuntu) 297 | uses: jlumbroso/free-disk-space@main 298 | with: 299 | tool-cache: false 300 | android: true 301 | dotnet: true 302 | haskell: true 303 | large-packages: false 304 | docker-images: true 305 | swap-storage: true 306 | 307 | - name: Clone 308 | id: checkout 309 | uses: actions/checkout@v4 310 | with: 311 | submodules: "recursive" 312 | fetch-depth: 0 313 | 314 | - name: Setup Python 315 | uses: actions/setup-python@v5 316 | with: 317 | python-version: ${{ matrix.pyver }} 318 | cache: 'pip' 319 | 320 | - name: Setup Mamba 321 
| uses: conda-incubator/setup-miniconda@v3.1.1 322 | with: 323 | activate-environment: "llamacpp" 324 | python-version: ${{ matrix.pyver }} 325 | miniforge-version: latest 326 | add-pip-as-python-dependency: true 327 | auto-activate-base: false 328 | 329 | - name: Install Dependencies 330 | env: 331 | MAMBA_DOWNLOAD_FAILFAST: "0" 332 | MAMBA_NO_LOW_SPEED_LIMIT: "1" 333 | run: | 334 | # Echo glibc version 335 | ldd --version 336 | 337 | # First install basic build tools 338 | sudo apt-get update 339 | sudo apt-get install -y build-essential 340 | 341 | # Initialize conda for shell 342 | source $CONDA/etc/profile.d/conda.sh 343 | 344 | # Activate the conda environment 345 | conda activate llamacpp 346 | 347 | echo "CONDA_PREFIX after activation: $CONDA_PREFIX" 348 | 349 | # Try different CUDA package names and channels 350 | echo "Attempting to install CUDA ${{ matrix.cuda }}..." 351 | 352 | # Install using the cuda meta-package from the official channel 353 | mamba install -y -c conda-forge 'cuda==${{ matrix.cuda }}' 354 | echo "Successfully installed CUDA ${{ matrix.cuda }}" 355 | 356 | # Verify CONDA_PREFIX is set 357 | if [ -z "$CONDA_PREFIX" ]; then 358 | echo "ERROR: CONDA_PREFIX is not set after conda activation" 359 | exit 1 360 | fi 361 | 362 | # Install build dependencies 363 | python -m pip install build wheel 364 | python -m pip install -r requirements.txt 365 | 366 | # Verify CUDA installation 367 | echo "=== CUDA Installation Check ===" 368 | echo "CONDA_PREFIX: $CONDA_PREFIX" 369 | echo "=== CUDA Files in CONDA_PREFIX ===" 370 | find $CONDA_PREFIX -name "*cuda*" -o -name "*nvcc*" | sort 371 | echo "=== NVCC Version ===" 372 | which nvcc || echo "nvcc not found in PATH" 373 | nvcc --version || echo "nvcc version check failed" 374 | 375 | # Set CUDA environment variables 376 | echo "=== Setting CUDA Environment Variables ===" 377 | export CUDA_HOME=$CONDA_PREFIX 378 | export CUDA_PATH=$CONDA_PREFIX 379 | export CUDA_TOOLKIT_ROOT_DIR=$CONDA_PREFIX 380 | export LD_LIBRARY_PATH="$CONDA_PREFIX/lib:$LD_LIBRARY_PATH" 381 | export PATH="$CONDA_PREFIX/bin:$PATH" 382 | 383 | # Save to GITHUB_ENV for subsequent steps 384 | echo "CUDA_HOME=$CUDA_HOME" >> $GITHUB_ENV 385 | echo "CUDA_PATH=$CUDA_PATH" >> $GITHUB_ENV 386 | echo "CUDA_TOOLKIT_ROOT_DIR=$CUDA_TOOLKIT_ROOT_DIR" >> $GITHUB_ENV 387 | echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> $GITHUB_ENV 388 | echo "PATH=$PATH" >> $GITHUB_ENV 389 | 390 | - name: Verify CUDA Version 391 | run: | 392 | source $CONDA/etc/profile.d/conda.sh 393 | conda activate llamacpp 394 | nvcc --version | grep "release $(echo ${{ matrix.cuda }} | cut -d. 
-f1,2)" 395 | 396 | - name: Build Wheel 397 | run: | 398 | # Echo glibc version 399 | ldd --version 400 | 401 | # Initialize conda for shell 402 | source $CONDA/etc/profile.d/conda.sh 403 | conda activate llamacpp 404 | 405 | echo "=== Build Environment ===" 406 | echo "System: ${{ matrix.platform }}" 407 | echo "Python: $(which python)" 408 | echo "Pip: $(which pip)" 409 | pip list 410 | echo "CUDA_HOME: $CUDA_HOME" 411 | echo "PATH: $PATH" 412 | echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH" 413 | echo "CONDA_PREFIX: $CONDA_PREFIX" 414 | which nvcc 415 | nvcc --version 416 | 417 | # Set build environment 418 | cuda_version=${CUDAVER//./} 419 | cuda_version=${cuda_version:0:${#cuda_version}-1} 420 | 421 | # Export build variables 422 | export XLLAMACPP_BUILD_CUDA=1 423 | export VERSIONEER_CLOSEST_TAG_ONLY=1 424 | export VERBOSE=1S 425 | 426 | make 427 | python -m build --wheel 428 | 429 | echo "CUDA_VERSION=$cuda_version" >> $GITHUB_ENV 430 | 431 | - uses: softprops/action-gh-release@v2 432 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') 433 | with: 434 | files: dist/* 435 | tag_name: ${{ github.ref_name }}-cu${{ env.CUDA_VERSION }} 436 | env: 437 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 438 | 439 | build_wheels_cuda_windows: 440 | name: Build Wheel CUDA Windows ${{ matrix.pyver }} ${{ matrix.cuda }} 441 | runs-on: windows-2022 442 | strategy: 443 | matrix: 444 | pyver: ["3.10", "3.11", "3.12", "3.13"] 445 | cuda: ["12.4.1", "12.8.1"] 446 | defaults: 447 | run: 448 | shell: bash 449 | env: 450 | CUDAVER: ${{ matrix.cuda }} 451 | 452 | steps: 453 | - name: Clone 454 | id: checkout 455 | uses: actions/checkout@v4 456 | with: 457 | submodules: "recursive" 458 | fetch-depth: 0 459 | 460 | - name: Setup Python 461 | uses: actions/setup-python@v5 462 | with: 463 | python-version: ${{ matrix.pyver }} 464 | cache: 'pip' 465 | 466 | - name: Install python dependencies 467 | run: | 468 | python -m pip install --upgrade pip 469 | python -m pip install build wheel delvewheel 470 | python -m pip install -r requirements.txt 471 | 472 | - name: Download and install win64devkit 473 | run: | 474 | curl -L https://github.com/skeeto/w64devkit/releases/download/v1.22.0/w64devkit-1.22.0.zip --output w64devkit.zip 475 | unzip -q w64devkit.zip -d . 
476 | 477 | - name: Add w64devkit to PATH 478 | run: | 479 | echo "$(pwd)/w64devkit/bin" >> $GITHUB_PATH 480 | 481 | - name: Setup CUDA 482 | uses: Jimver/cuda-toolkit@v0.2.24 483 | id: cuda-toolkit 484 | with: 485 | use-github-cache: false 486 | cuda: ${{ matrix.cuda }} 487 | 488 | - name: Build Wheel 489 | run: | 490 | cuda_version=${CUDAVER//./} 491 | cuda_version=${cuda_version:0:${#cuda_version}-1} 492 | 493 | export XLLAMACPP_BUILD_CUDA=1 494 | export VERSIONEER_CLOSEST_TAG_ONLY=1 495 | export VERBOSE=1 496 | 497 | make 498 | python -m build --wheel 499 | 500 | # On Windows, we use delvewheel for wheel repair 501 | python -m delvewheel repair --exclude nvcuda.dll dist/*.whl -w dist 502 | 503 | echo "CUDA_VERSION=$cuda_version" >> $GITHUB_ENV 504 | 505 | - uses: softprops/action-gh-release@v2 506 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') 507 | with: 508 | files: dist/* 509 | tag_name: ${{ github.ref_name }}-cu${{ env.CUDA_VERSION }} 510 | env: 511 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 512 | -------------------------------------------------------------------------------- /src/llama.cpp/include/ggml-backend.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "ggml.h" 4 | #include "ggml-alloc.h" 5 | 6 | #ifdef GGML_BACKEND_SHARED 7 | # if defined(_WIN32) && !defined(__MINGW32__) 8 | # ifdef GGML_BACKEND_BUILD 9 | # define GGML_BACKEND_API __declspec(dllexport) extern 10 | # else 11 | # define GGML_BACKEND_API __declspec(dllimport) extern 12 | # endif 13 | # else 14 | # define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern 15 | # endif 16 | #else 17 | # define GGML_BACKEND_API extern 18 | #endif 19 | 20 | #ifdef __cplusplus 21 | extern "C" { 22 | #endif 23 | 24 | typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; 25 | typedef struct ggml_backend_buffer * ggml_backend_buffer_t; 26 | typedef struct ggml_backend_event * ggml_backend_event_t; 27 | typedef struct ggml_backend * ggml_backend_t; 28 | typedef void * ggml_backend_graph_plan_t; 29 | typedef struct ggml_backend_reg * ggml_backend_reg_t; 30 | typedef struct ggml_backend_device * ggml_backend_dev_t; 31 | 32 | 33 | // 34 | // Backend buffer type 35 | // 36 | 37 | GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft); 38 | GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); 39 | GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); 40 | GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft); 41 | GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); 42 | GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); 43 | GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft); 44 | 45 | // 46 | // Backend buffer 47 | // 48 | 49 | enum ggml_backend_buffer_usage { 50 | GGML_BACKEND_BUFFER_USAGE_ANY = 0, 51 | GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1, 52 | GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2, 53 | }; 54 | 55 | GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer); 56 | GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); 57 | GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); 58 | GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); 59 | GGML_API enum ggml_status 
ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); 60 | GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); 61 | GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer); 62 | GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor); 63 | GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); 64 | GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); 65 | GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); 66 | GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer); 67 | GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer); 68 | GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer); 69 | 70 | // tensor copy between different backends 71 | GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst); 72 | 73 | // 74 | // Backend (stream) 75 | // 76 | 77 | GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend); 78 | GGML_API const char * ggml_backend_name(ggml_backend_t backend); 79 | GGML_API void ggml_backend_free(ggml_backend_t backend); 80 | 81 | GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend); 82 | GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size); 83 | GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend); 84 | GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend); 85 | 86 | GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); 87 | GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); 88 | 89 | // "offset" refers to the offset in tensor->data for setting/getting data 90 | GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); 91 | GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); 92 | GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size); 93 | 94 | GGML_API void ggml_backend_synchronize(ggml_backend_t backend); 95 | 96 | GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph); 97 | GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); 98 | 99 | GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan); 100 | GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); 101 | GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); 102 | 103 | // NOTE: will be removed, use device version instead 104 | GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); 105 | GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft); 106 | GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op); 107 | 108 | // asynchronous copy 109 | // the 
copy is performed after all the currently queued operations in backend_src 110 | // backend_dst will wait for the copy to complete before performing other operations 111 | // automatic fallback to sync copy if async is not supported 112 | GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst); 113 | 114 | GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend); 115 | 116 | // 117 | // Events 118 | // 119 | 120 | GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device); 121 | GGML_API void ggml_backend_event_free(ggml_backend_event_t event); 122 | GGML_API void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend); 123 | GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event); 124 | GGML_API void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event); 125 | 126 | // 127 | // Backend device 128 | // 129 | 130 | enum ggml_backend_dev_type { 131 | // CPU device using system memory 132 | GGML_BACKEND_DEVICE_TYPE_CPU, 133 | // GPU device using dedicated memory 134 | GGML_BACKEND_DEVICE_TYPE_GPU, 135 | // integrated GPU device using host memory 136 | GGML_BACKEND_DEVICE_TYPE_IGPU, 137 | // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX) 138 | GGML_BACKEND_DEVICE_TYPE_ACCEL 139 | }; 140 | 141 | // functionality supported by the device 142 | struct ggml_backend_dev_caps { 143 | // asynchronous operations 144 | bool async; 145 | // pinned host buffer 146 | bool host_buffer; 147 | // creating buffers from host ptr 148 | bool buffer_from_host_ptr; 149 | // event synchronization 150 | bool events; 151 | }; 152 | 153 | // all the device properties 154 | struct ggml_backend_dev_props { 155 | // device name 156 | const char * name; 157 | // device description 158 | const char * description; 159 | // device free memory in bytes 160 | size_t memory_free; 161 | // device total memory in bytes 162 | size_t memory_total; 163 | // device type 164 | enum ggml_backend_dev_type type; 165 | // device id 166 | // for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. 
"0000:01:00.0") 167 | // if the id is unknown, this should be NULL 168 | const char * device_id; 169 | // device capabilities 170 | struct ggml_backend_dev_caps caps; 171 | }; 172 | 173 | GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device); 174 | GGML_API const char * ggml_backend_dev_description(ggml_backend_dev_t device); 175 | GGML_API void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total); 176 | GGML_API enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device); 177 | GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props); 178 | GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device); 179 | GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params); 180 | GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device); 181 | GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device); 182 | GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size); 183 | 184 | GGML_API bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op); 185 | GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft); 186 | GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op); 187 | 188 | // 189 | // Backend (reg) 190 | // 191 | 192 | GGML_API const char * ggml_backend_reg_name(ggml_backend_reg_t reg); 193 | GGML_API size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg); 194 | GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index); 195 | GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name); 196 | 197 | // Common functions that may be obtained using ggml_backend_reg_get_proc_address 198 | 199 | // Split buffer type for tensor parallelism 200 | typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split); 201 | // Set the number of threads for the backend 202 | typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads); 203 | // Get additional buffer types provided by the device (returns a NULL-terminated array) 204 | typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device); 205 | // Set the abort callback for the backend 206 | typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data); 207 | // Get a list of feature flags supported by the backend (returns a NULL-terminated array) 208 | struct ggml_backend_feature { 209 | const char * name; 210 | const char * value; 211 | }; 212 | typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg); 213 | 214 | // 215 | // Backend registry 216 | // 217 | 218 | GGML_API void ggml_backend_register(ggml_backend_reg_t reg); 219 | 220 | GGML_API void ggml_backend_device_register(ggml_backend_dev_t device); 221 | 222 | // Backend (reg) enumeration 223 | GGML_API size_t ggml_backend_reg_count(void); 224 | GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index); 225 | GGML_API ggml_backend_reg_t ggml_backend_reg_by_name(const char * name); 226 | 227 | // Device enumeration 228 | GGML_API size_t 
ggml_backend_dev_count(void);
229 | GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index);
230 | GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
231 | GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);
232 | 
233 | // Direct backend (stream) initialization
234 | // = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
235 | GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
236 | // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
237 | GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
238 | // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
239 | GGML_API ggml_backend_t ggml_backend_init_best(void);
240 | 
241 | // Load a backend from a dynamic library and register it
242 | GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
243 | // Unload a backend if loaded dynamically and unregister it
244 | GGML_API void ggml_backend_unload(ggml_backend_reg_t reg);
245 | // Load all known backends from dynamic libraries
246 | GGML_API void ggml_backend_load_all(void);
247 | GGML_API void ggml_backend_load_all_from_path(const char * dir_path);
248 | 
249 | //
250 | // Backend scheduler
251 | //
252 | 
253 | // The backend scheduler allows for multiple backend devices to be used together
254 | // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
255 | // The backends are selected based on:
256 | // - the backend that supports the operation
257 | // - the location of the pre-allocated tensors (e.g. the weights)
258 | /*
259 | Example usage:
260 | 
261 | // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
262 | // preferably to run on the same backend as the buffer
263 | ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
264 | 
265 | sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true);
266 | 
267 | // initialize buffers from a max size graph (optional)
268 | reserve_graph = build_graph(sched, max_batch_size);
269 | 
270 | // manually assign nodes to a backend (optional, should not be needed in most cases)
271 | struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
272 | ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);
273 | 
274 | ggml_backend_sched_reserve(sched, reserve_graph);
275 | 
276 | // compute
277 | graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
278 | for (int i = 0; i < 10; ++i) {
279 | ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
280 | }
281 | 
282 | // if there are graph inputs:
283 | graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called)
284 | ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
285 | ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
286 | ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
287 | ggml_backend_sched_graph_compute(sched, graph); // execute the graph
288 | 
289 | // as an alternative to the above it is also possible to assign the inputs to a dedicated context and
290 | // allocate them statically via ggml_backend_alloc_ctx_tensors
291 | }
292 | */
293 | 
294 | typedef struct ggml_backend_sched * ggml_backend_sched_t;
295 | 
296 | // Evaluation callback for each node in the graph (set with ggml_backend_sched_set_eval_callback)
297 | // when ask == true, the scheduler wants to know if the user wants to observe this node
298 | // this allows the scheduler to batch nodes together in order to evaluate them in a single call
299 | //
300 | // when ask == false, the scheduler is passing the node tensor to the user for observation
301 | // if the user returns false, the scheduler will cancel the graph compute
302 | //
303 | typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
304 | 
305 | // Initialize a backend scheduler, backends with low index are given priority over backends with high index
306 | GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
307 | GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
308 | 
309 | // Initialize backend buffers from a measure graph
310 | GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
311 | 
312 | GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
313 | GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
314 | 
315 | // Get the number of splits of the last graph
316 | GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
317 | GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
318 | 
319 | GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
320 | GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
321 | 
322 | GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
323 | GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
324 | 
325 | // Split graph without allocating it
326 | GGML_API void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
327 | 
328 | // Allocate and compute graph on the backend scheduler
329 | GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
330 | GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
331 | GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
332 | GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
333 | 
334 | // Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
335 | // This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
336 | // The correct way to use this API is to discard the deallocated tensors and create new ones.
337 | GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
338 | 
339 | // Set a callback to be called for each resulting node during graph compute
340 | GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
341 | 
342 | //
343 | // Utils
344 | //
345 | 
346 | struct ggml_backend_graph_copy {
347 |   ggml_backend_buffer_t buffer;
348 |   struct ggml_context * ctx_allocated;
349 |   struct ggml_context * ctx_unallocated;
350 |   struct ggml_cgraph * graph;
351 | };
352 | 
353 | // Copy a graph to a different backend
354 | GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
355 | GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
356 | 
357 | typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
358 | 
359 | // Compare the output of two backends
360 | GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node);
361 | 
362 | // Tensor initialization
363 | GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
364 | GGML_API enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor);
365 | 
366 | // CPU buffer types are always available
367 | GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
368 | GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
369 | 
370 | #ifdef __cplusplus
371 | }
372 | #endif
373 | 
--------------------------------------------------------------------------------
/src/xllamacpp/server.cpp:
--------------------------------------------------------------------------------
1 | #include "json-schema-to-grammar.h"
2 | #include "server-context.h"
3 | #include "server-http.h"
4 | #include "server-models.h"
5 | 
6 | #include "arg.h"
7 | #include "common.h"
8 | #include "llama.h"
9 | #include "log.h"
10 | 
11 | #include
12 | #include
13 | #include
14 | #include <thread> // for std::thread::hardware_concurrency
15 | 
16 | #if defined(_WIN32)
17 | #include <windows.h>
18 | #endif
19 | 
20 | static std::function<void(int)> shutdown_handler;
21 | static std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;
22 | 
23 | static inline void signal_handler(int signal) {
24 |   if (is_terminating.test_and_set()) {
25 |     // in case it hangs, we can force-terminate the server by hitting Ctrl+C
26 |     // twice. This is for better developer experience; we can remove it when the
27 |     // server is stable enough.
28 |     fprintf(stderr, "Received second interrupt, terminating immediately.\n");
29 |     exit(1);
30 |   }
31 | 
32 |   shutdown_handler(signal);
33 | }
34 | 
35 | // wrapper function that handles exceptions and logs errors
36 | // this is to make sure handler_t never throws exceptions; instead, it returns
37 | // an error response
38 | static server_http_context::handler_t
39 | ex_wrapper(server_http_context::handler_t func) {
40 |   return [func = std::move(func)](
41 |              const server_http_req &req) -> server_http_res_ptr {
42 |     std::string message;
43 |     error_type error;
44 |     try {
45 |       return func(req);
46 |     } catch (const std::invalid_argument &e) {
47 |       // treat invalid_argument as invalid request (400)
48 |       error = ERROR_TYPE_INVALID_REQUEST;
49 |       message = e.what();
50 |     } catch (const std::exception &e) {
51 | // treat other exceptions as server error (500) 52 | error = ERROR_TYPE_SERVER; 53 | message = e.what(); 54 | } catch (...) { 55 | error = ERROR_TYPE_SERVER; 56 | message = "unknown error"; 57 | } 58 | 59 | auto res = std::make_unique(); 60 | res->status = 500; 61 | try { 62 | json error_data = format_error_response(message, error); 63 | res->status = json_value(error_data, "code", 500); 64 | res->data = safe_json_to_str({{"error", error_data}}); 65 | SRV_WRN("got exception: %s\n", res->data.c_str()); 66 | } catch (const std::exception &e) { 67 | SRV_ERR("got another exception: %s | while handling exception: %s\n", 68 | e.what(), message.c_str()); 69 | res->data = "Internal Server Error"; 70 | } 71 | return res; 72 | }; 73 | } 74 | 75 | static void init(common_params ¶ms, server_context &ctx_server, 76 | std::string &listening_address, std::promise out) { 77 | common_log_set_verbosity_thold(params.verbosity); 78 | 79 | // TODO: should we have a separate n_parallel parameter for the server? 80 | // https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177 81 | // TODO: this is a common configuration that is suitable for most local use 82 | // cases 83 | // however, overriding the parameters is a bit confusing - figure out 84 | // something more intuitive 85 | if (params.n_parallel == 1 && params.kv_unified == false && 86 | !params.has_speculative()) { 87 | LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true (add -kvu to " 88 | "disable this)\n", 89 | __func__); 90 | 91 | params.n_parallel = 4; 92 | params.kv_unified = true; 93 | } 94 | 95 | // for consistency between server router mode and single-model mode, we set 96 | // the same model name as alias 97 | if (params.model_alias.empty() && !params.model.name.empty()) { 98 | params.model_alias = params.model.name; 99 | } 100 | 101 | common_init(); 102 | llama_backend_init(); 103 | llama_numa_init(params.numa); 104 | 105 | LOG_INF( 106 | "system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", 107 | params.cpuparams.n_threads, params.cpuparams_batch.n_threads, 108 | std::thread::hardware_concurrency()); 109 | LOG_INF("\n"); 110 | LOG_INF("%s\n", common_params_get_system_info(params).c_str()); 111 | LOG_INF("\n"); 112 | 113 | server_http_context ctx_http; 114 | if (!ctx_http.init(params)) { 115 | LOG_ERR("%s: failed to initialize HTTP server\n", __func__); 116 | out.set_value(1); 117 | return; 118 | } 119 | 120 | // 121 | // Router 122 | // 123 | 124 | // register API routes 125 | server_routes routes(params, ctx_server, 126 | [&ctx_http]() { return ctx_http.is_ready.load(); }); 127 | 128 | constexpr bool is_router_server = false; 129 | std::optional models_routes{}; 130 | if (is_router_server) { 131 | // setup server instances manager 132 | models_routes.emplace(params, 0, nullptr, nullptr); 133 | 134 | // proxy handlers 135 | // note: routes.get_health stays the same 136 | routes.get_metrics = models_routes->proxy_get; 137 | routes.post_props = models_routes->proxy_post; 138 | routes.get_api_show = models_routes->proxy_get; 139 | routes.post_completions = models_routes->proxy_post; 140 | routes.post_completions_oai = models_routes->proxy_post; 141 | routes.post_chat_completions = models_routes->proxy_post; 142 | routes.post_anthropic_messages = models_routes->proxy_post; 143 | routes.post_anthropic_count_tokens = models_routes->proxy_post; 144 | routes.post_infill = models_routes->proxy_post; 145 | routes.post_embeddings = models_routes->proxy_post; 146 | routes.post_embeddings_oai = 
models_routes->proxy_post; 147 | routes.post_rerank = models_routes->proxy_post; 148 | routes.post_tokenize = models_routes->proxy_post; 149 | routes.post_detokenize = models_routes->proxy_post; 150 | routes.post_apply_template = models_routes->proxy_post; 151 | routes.get_lora_adapters = models_routes->proxy_get; 152 | routes.post_lora_adapters = models_routes->proxy_post; 153 | routes.get_slots = models_routes->proxy_get; 154 | routes.post_slots = models_routes->proxy_post; 155 | 156 | // custom routes for router 157 | routes.get_props = models_routes->get_router_props; 158 | routes.get_models = models_routes->get_router_models; 159 | ctx_http.post("/models/load", 160 | ex_wrapper(models_routes->post_router_models_load)); 161 | ctx_http.post("/models/unload", 162 | ex_wrapper(models_routes->post_router_models_unload)); 163 | ctx_http.post("/models/status", 164 | ex_wrapper(models_routes->post_router_models_status)); 165 | } 166 | 167 | ctx_http.get( 168 | "/health", 169 | ex_wrapper(routes.get_health)); // public endpoint (no API key check) 170 | ctx_http.get( 171 | "/v1/health", 172 | ex_wrapper(routes.get_health)); // public endpoint (no API key check) 173 | ctx_http.get("/metrics", ex_wrapper(routes.get_metrics)); 174 | ctx_http.get("/props", ex_wrapper(routes.get_props)); 175 | ctx_http.post("/props", ex_wrapper(routes.post_props)); 176 | ctx_http.post("/api/show", ex_wrapper(routes.get_api_show)); 177 | ctx_http.get( 178 | "/models", 179 | ex_wrapper(routes.get_models)); // public endpoint (no API key check) 180 | ctx_http.get( 181 | "/v1/models", 182 | ex_wrapper(routes.get_models)); // public endpoint (no API key check) 183 | ctx_http.get( 184 | "/api/tags", 185 | ex_wrapper(routes.get_models)); // ollama specific endpoint. public 186 | // endpoint (no API key check) 187 | ctx_http.post("/completion", ex_wrapper(routes.post_completions)); // legacy 188 | ctx_http.post("/completions", ex_wrapper(routes.post_completions)); 189 | ctx_http.post("/v1/completions", ex_wrapper(routes.post_completions_oai)); 190 | ctx_http.post("/chat/completions", ex_wrapper(routes.post_chat_completions)); 191 | ctx_http.post("/v1/chat/completions", 192 | ex_wrapper(routes.post_chat_completions)); 193 | ctx_http.post( 194 | "/api/chat", 195 | ex_wrapper(routes.post_chat_completions)); // ollama specific endpoint 196 | ctx_http.post( 197 | "/v1/messages", 198 | ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API 199 | ctx_http.post( 200 | "/v1/messages/count_tokens", 201 | ex_wrapper( 202 | routes.post_anthropic_count_tokens)); // anthropic token counting 203 | ctx_http.post("/infill", ex_wrapper(routes.post_infill)); 204 | ctx_http.post("/embedding", ex_wrapper(routes.post_embeddings)); // legacy 205 | ctx_http.post("/embeddings", ex_wrapper(routes.post_embeddings)); 206 | ctx_http.post("/v1/embeddings", ex_wrapper(routes.post_embeddings_oai)); 207 | ctx_http.post("/rerank", ex_wrapper(routes.post_rerank)); 208 | ctx_http.post("/reranking", ex_wrapper(routes.post_rerank)); 209 | ctx_http.post("/v1/rerank", ex_wrapper(routes.post_rerank)); 210 | ctx_http.post("/v1/reranking", ex_wrapper(routes.post_rerank)); 211 | ctx_http.post("/tokenize", ex_wrapper(routes.post_tokenize)); 212 | ctx_http.post("/detokenize", ex_wrapper(routes.post_detokenize)); 213 | ctx_http.post("/apply-template", ex_wrapper(routes.post_apply_template)); 214 | // LoRA adapters hotswap 215 | ctx_http.get("/lora-adapters", ex_wrapper(routes.get_lora_adapters)); 216 | ctx_http.post("/lora-adapters", 
ex_wrapper(routes.post_lora_adapters)); 217 | // Save & load slots 218 | ctx_http.get("/slots", ex_wrapper(routes.get_slots)); 219 | ctx_http.post("/slots/:id_slot", ex_wrapper(routes.post_slots)); 220 | 221 | // 222 | // Start the server 223 | // 224 | 225 | std::function clean_up; 226 | 227 | if (is_router_server) { 228 | LOG_INF( 229 | "%s: starting router server, no model will be loaded in this process\n", 230 | __func__); 231 | 232 | clean_up = [&models_routes]() { 233 | SRV_INF("%s: cleaning up before exit...\n", __func__); 234 | if (models_routes.has_value()) { 235 | models_routes->models.unload_all(); 236 | } 237 | llama_backend_free(); 238 | }; 239 | 240 | if (!ctx_http.start()) { 241 | clean_up(); 242 | LOG_ERR("%s: exiting due to HTTP server error\n", __func__); 243 | out.set_value(1); 244 | return; 245 | } 246 | ctx_http.is_ready.store(true); 247 | 248 | shutdown_handler = [&](int) { ctx_http.stop(); }; 249 | 250 | } else { 251 | // setup clean up function, to be called before exit 252 | clean_up = [&ctx_http, &ctx_server]() { 253 | SRV_INF("%s: cleaning up before exit...\n", __func__); 254 | ctx_http.stop(); 255 | ctx_server.terminate(); 256 | llama_backend_free(); 257 | }; 258 | 259 | // start the HTTP server before loading the model to be able to serve 260 | // /health requests 261 | if (!ctx_http.start()) { 262 | clean_up(); 263 | LOG_ERR("%s: exiting due to HTTP server error\n", __func__); 264 | out.set_value(1); 265 | return; 266 | } 267 | 268 | // load the model 269 | LOG_INF("%s: loading model\n", __func__); 270 | 271 | if (!ctx_server.load_model(params)) { 272 | clean_up(); 273 | if (ctx_http.thread.joinable()) { 274 | ctx_http.thread.join(); 275 | } 276 | LOG_ERR("%s: exiting due to model loading error\n", __func__); 277 | out.set_value(1); 278 | return; 279 | } 280 | 281 | ctx_server.init(); 282 | ctx_http.is_ready.store(true); 283 | 284 | LOG_INF("%s: model loaded\n", __func__); 285 | 286 | shutdown_handler = [&](int) { 287 | // this will unblock start_loop() 288 | ctx_server.terminate(); 289 | }; 290 | } 291 | 292 | // TODO: refactor in common/console 293 | #if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) 294 | struct sigaction sigint_action; 295 | sigint_action.sa_handler = signal_handler; 296 | sigemptyset(&sigint_action.sa_mask); 297 | sigint_action.sa_flags = 0; 298 | sigaction(SIGINT, &sigint_action, NULL); 299 | sigaction(SIGTERM, &sigint_action, NULL); 300 | #elif defined(_WIN32) 301 | auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { 302 | return (ctrl_type == CTRL_C_EVENT) ? 
(signal_handler(SIGINT), true) : false; 303 | }; 304 | SetConsoleCtrlHandler( 305 | reinterpret_cast(console_ctrl_handler), true); 306 | #endif 307 | 308 | if (is_router_server) { 309 | LOG_INF("%s: router server is listening on %s\n", __func__, 310 | ctx_http.listening_address.c_str()); 311 | LOG_INF("%s: NOTE: router mode is experimental\n", __func__); 312 | LOG_INF("%s: it is not recommended to use this mode in untrusted " 313 | "environments\n", 314 | __func__); 315 | if (ctx_http.thread.joinable()) { 316 | ctx_http.thread.join(); // keep the main thread alive 317 | } 318 | 319 | // when the HTTP server stops, clean up and exit 320 | clean_up(); 321 | } else { 322 | LOG_INF("%s: server is listening on %s\n", __func__, 323 | ctx_http.listening_address.c_str()); 324 | LOG_INF("%s: starting the main loop...\n", __func__); 325 | 326 | // optionally, notify router server that this instance is ready 327 | const char *router_port = std::getenv("LLAMA_SERVER_ROUTER_PORT"); 328 | std::thread monitor_thread; 329 | if (router_port != nullptr) { 330 | monitor_thread = server_models::setup_child_server( 331 | params, std::atoi(router_port), params.model_alias, shutdown_handler); 332 | } 333 | 334 | // write the listening_address 335 | listening_address = ctx_http.listening_address; 336 | 337 | out.set_value(0); 338 | 339 | // this call blocks the main thread until queue_tasks.terminate() is called 340 | ctx_server.start_loop(); 341 | 342 | clean_up(); 343 | if (ctx_http.thread.joinable()) { 344 | ctx_http.thread.join(); 345 | } 346 | if (monitor_thread.joinable()) { 347 | monitor_thread.join(); 348 | } 349 | // crash during llama_memory_breakdown_print if the model is rerank. 350 | if (params.pooling_type != LLAMA_POOLING_TYPE_RANK) { 351 | llama_memory_breakdown_print(ctx_server.get_llama_context()); 352 | } 353 | } 354 | } 355 | 356 | static void ggml_log_callback_default(enum ggml_log_level level, 357 | const char *text, void *user_data) { 358 | (void)level; 359 | (void)text; 360 | (void)user_data; 361 | // if (level == GGML_LOG_LEVEL_INFO || level == GGML_LOG_LEVEL_ERROR) { 362 | // fputs(text, stderr); 363 | // fflush(stderr); 364 | // } 365 | } 366 | 367 | std::function not_stop = [] { return false; }; 368 | 369 | static std::vector parse_oai_sse(const std::string &sse) { 370 | std::vector out; 371 | 372 | std::size_t start = 0; 373 | while (start < sse.size()) { 374 | std::size_t end = sse.find('\n', start); 375 | if (end == std::string::npos) { 376 | break; 377 | } 378 | 379 | // Empty line = event separator, skip 380 | if (end > start) { 381 | // Guaranteed format: "data: " 382 | out.emplace_back(sse.substr(start + 6, end - start - 6)); 383 | } 384 | 385 | start = end + 1; 386 | } 387 | 388 | return out; 389 | } 390 | 391 | static void 392 | process_handler_response(server_http_res_ptr &response, 393 | std::function res_err, 394 | std::function res_ok) { 395 | static const std::string sse_prefix("data: "); 396 | auto res = response->status == 200 ? 
res_ok : res_err; 397 | if (response->is_stream()) { 398 | std::string chunk; 399 | 400 | while (true) { 401 | const bool has_next = response->next(chunk); 402 | if (!chunk.empty() && chunk.size() >= sse_prefix.size()) { 403 | if (!has_next && chunk == "data: [DONE]\n\n") { 404 | return; 405 | } 406 | auto parsed = parse_oai_sse(chunk); 407 | for (auto &&json_str : parsed) { 408 | if (res(std::move(json_str))) { 409 | return; 410 | } 411 | } 412 | } 413 | if (!has_next) { 414 | return; 415 | } 416 | } 417 | } else { 418 | res(std::move(response->data)); 419 | } 420 | } 421 | 422 | #include "server.h" 423 | 424 | namespace xllamacpp { 425 | 426 | std::string get_system_info() { return llama_print_system_info(); } 427 | 428 | std::vector get_device_info() { 429 | ggml_log_set(ggml_log_callback_default, nullptr); 430 | 431 | const size_t dev_count = ggml_backend_dev_count(); 432 | 433 | std::vector result; 434 | std::vector devs; 435 | std::vector backends; 436 | 437 | for (size_t i = 0; i < dev_count; ++i) { 438 | devs.push_back(ggml_backend_dev_get(i)); 439 | 440 | ggml_backend_t backend = ggml_backend_dev_init(devs[i], NULL); 441 | GGML_ASSERT(backend != NULL); 442 | 443 | auto *reg = ggml_backend_dev_backend_reg(devs[i]); 444 | auto ggml_backend_set_n_threads_fn = 445 | (ggml_backend_set_n_threads_t)ggml_backend_reg_get_proc_address( 446 | reg, "ggml_backend_set_n_threads"); 447 | if (ggml_backend_set_n_threads_fn) { 448 | ggml_backend_set_n_threads_fn(backend, 449 | std::thread::hardware_concurrency() / 2); 450 | } 451 | 452 | backends.push_back(backend); 453 | } 454 | 455 | for (size_t i = 0; i < dev_count; ++i) { 456 | // Put the backend to be tested in front so that it's prioritized: 457 | std::vector backends_modded = {backends[i]}; 458 | backends_modded.insert(backends_modded.end(), backends.begin(), 459 | backends.end()); 460 | 461 | ggml_backend_dev_props prop; 462 | ggml_backend_dev_get_props(devs[i], &prop); 463 | // Avoid crash when converting the prop struct to Python dict by Cython. 
464 | if (prop.device_id == nullptr) { 465 | prop.device_id = ""; 466 | } 467 | 468 | result.push_back(prop); 469 | } 470 | 471 | for (ggml_backend_t backend : backends) { 472 | ggml_backend_free(backend); 473 | } 474 | 475 | return result; 476 | } 477 | 478 | Server::Server(const common_params ¶ms) 479 | : _params(params), _ctx_server(new server_context()) { 480 | std::promise out; 481 | std::future fut = out.get_future(); 482 | _loop_thread = std::thread(init, std::ref(_params), std::ref(*_ctx_server), 483 | std::ref(_listening_address), std::move(out)); 484 | if (fut.get() != 0) { 485 | if (_loop_thread.joinable()) { 486 | _loop_thread.join(); 487 | } 488 | throw std::runtime_error( 489 | "Failed to init server, please check the input params."); 490 | } 491 | _routes = std::make_shared(_params, *_ctx_server, 492 | []() { return true; }); 493 | } 494 | 495 | Server::~Server() { 496 | _ctx_server->terminate(); 497 | LOG_INF("%s: waiting for main loop exit\n", __func__); 498 | if (_loop_thread.joinable()) { 499 | _loop_thread.join(); 500 | } 501 | LOG_INF("%s: main loop exited\n", __func__); 502 | } 503 | 504 | std::string Server::listening_address() const { return _listening_address; } 505 | 506 | std::string Server::handle_metrics() { 507 | server_http_req req{{}, {}, "", "", not_stop}; 508 | auto res = _routes->get_metrics(req); 509 | return res->data; 510 | } 511 | 512 | std::string Server::handle_embeddings(const std::string &input_json_str) { 513 | server_http_req req{{}, {}, "", input_json_str, not_stop}; 514 | auto res = _routes->post_embeddings_oai(req); 515 | return res->data; 516 | } 517 | 518 | std::string Server::handle_rerank(const std::string &input_json_str) { 519 | server_http_req req{{}, {}, "", input_json_str, not_stop}; 520 | auto res = _routes->post_rerank(req); 521 | return res->data; 522 | } 523 | 524 | void Server::handle_completions(const std::string &prompt_json_str, 525 | Callback res_err, void *py_cb_err, 526 | Callback res_ok, void *py_cb_ok) { 527 | server_http_req req{{}, {}, "", prompt_json_str, not_stop}; 528 | auto res = _routes->post_completions_oai(req); 529 | process_handler_response( 530 | res, 531 | [res_err, py_cb_err](std::string &&err) { 532 | return res_err(std::move(err), py_cb_err); 533 | }, 534 | [res_ok, py_cb_ok](std::string &&ok) { 535 | return res_ok(std::move(ok), py_cb_ok); 536 | }); 537 | } 538 | 539 | void Server::handle_chat_completions(const std::string &prompt_json_str, 540 | Callback res_err, void *py_cb_err, 541 | Callback res_ok, void *py_cb_ok) { 542 | server_http_req req{{}, {}, "", prompt_json_str, not_stop}; 543 | auto res = _routes->post_chat_completions(req); 544 | process_handler_response( 545 | res, 546 | [res_err, py_cb_err](std::string &&err) { 547 | return res_err(std::move(err), py_cb_err); 548 | }, 549 | [res_ok, py_cb_ok](std::string &&ok) { 550 | return res_ok(std::move(ok), py_cb_ok); 551 | }); 552 | } 553 | 554 | std::string json_schema_to_grammar_str(const std::string &schema_json_str) { 555 | try { 556 | auto schema = json::parse(schema_json_str); 557 | return json_schema_to_grammar(schema); 558 | } catch (const std::exception &e) { 559 | throw std::runtime_error(std::string("json_schema_to_grammar: ") + 560 | e.what()); 561 | } 562 | } 563 | 564 | // Helper function to parse tensor buffer override strings 565 | void parse_tensor_buffer_overrides( 566 | const std::string &value, 567 | std::vector &overrides) { 568 | std::map buft_list; 569 | for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { 570 | auto *dev 
= ggml_backend_dev_get(i); 571 | auto *buft = ggml_backend_dev_buffer_type(dev); 572 | if (buft) { 573 | buft_list[ggml_backend_buft_name(buft)] = buft; 574 | } 575 | } 576 | 577 | for (const auto &override : string_split(value, ',')) { 578 | std::string::size_type pos = override.find('='); 579 | if (pos == std::string::npos) { 580 | throw std::invalid_argument("invalid value"); 581 | } 582 | std::string tensor_name = override.substr(0, pos); 583 | std::string buffer_type = override.substr(pos + 1); 584 | 585 | if (buft_list.find(buffer_type) == buft_list.end()) { 586 | printf("Available buffer types:\n"); 587 | for (const auto &it : buft_list) { 588 | printf(" %s\n", ggml_backend_buft_name(it.second)); 589 | } 590 | throw std::invalid_argument("unknown buffer type"); 591 | } 592 | // keep strings alive and avoid leaking memory by storing them in a static 593 | // vector 594 | static std::list buft_overrides; 595 | buft_overrides.push_back(tensor_name); 596 | overrides.push_back( 597 | {buft_overrides.back().c_str(), buft_list.at(buffer_type)}); 598 | } 599 | } 600 | 601 | // Helper function to build tensor buffer override strings 602 | void build_tensor_buffer_overrides( 603 | const std::vector &overrides, 604 | std::string &value) { 605 | std::map buft_list; 606 | for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { 607 | auto *dev = ggml_backend_dev_get(i); 608 | auto *buft = ggml_backend_dev_buffer_type(dev); 609 | if (buft) { 610 | buft_list[buft] = ggml_backend_buft_name(buft); 611 | } 612 | } 613 | 614 | std::vector parts; 615 | for (auto &override : overrides) { 616 | std::string ov_str = 617 | std::string(override.pattern) + "=" + buft_list[override.buft]; 618 | parts.emplace_back(ov_str); 619 | } 620 | 621 | value = string_join(parts, ","); 622 | } 623 | 624 | } // namespace xllamacpp 625 | --------------------------------------------------------------------------------
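
Two short C++ sketches follow; they are written for illustration and are not files from the repository. The first mirrors the device-enumeration loop that get_device_info() in src/xllamacpp/server.cpp performs, using only declarations from the bundled ggml-backend.h above; it assumes ggml-backend.h is on the include path and that the program links against ggml:

#include "ggml-backend.h"

#include <cstdio>

int main() {
    // load any dynamically available backends (CPU, CUDA, Metal, ...) before enumerating devices
    ggml_backend_load_all();

    const size_t n_dev = ggml_backend_dev_count();
    for (size_t i = 0; i < n_dev; ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);

        struct ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);

        // name, description, memory_* and caps are the fields declared in ggml_backend_dev_props
        printf("%zu: %s (%s) free=%zu total=%zu async=%d events=%d\n",
               i, props.name, props.description,
               props.memory_free, props.memory_total,
               (int) props.caps.async, (int) props.caps.events);
    }
    return 0;
}

The second sketches the ask/observe protocol of the scheduler evaluation callback documented in ggml-backend.h. The tensor fields t->op and t->name and the GGML_OP_MUL_MAT constant come from ggml.h; sched stands for a scheduler created earlier with ggml_backend_sched_new:

#include "ggml.h"
#include "ggml-backend.h"

#include <cstdio>

// only ask to observe matmul nodes; all other nodes stay batched by the scheduler
static bool observe_mul_mat(struct ggml_tensor * t, bool ask, void * user_data) {
    (void) user_data;
    if (ask) {
        return t->op == GGML_OP_MUL_MAT;
    }
    // ask == false: the node has been computed and its tensor is handed to us
    fprintf(stderr, "computed %s\n", t->name);
    return true; // returning false would cancel the graph compute
}

// registered on an existing scheduler with:
//   ggml_backend_sched_set_eval_callback(sched, observe_mul_mat, NULL);

Answering the ask phase with true only for the ops of interest keeps the scheduler free to batch every other node, which is the behavior the header's comment describes.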