├── src ├── xllamacpp │ ├── __init__.pxd │ ├── __init__.py │ ├── server.h │ ├── server.pxd │ ├── memory.py │ └── server.cpp └── llama.cpp │ ├── src │ └── server.cpp │ └── include │ └── ggml-backend.h ├── .gitattributes ├── assets ├── logo.png ├── logo-white.png ├── logo.svg └── logo-white.svg ├── tests ├── dummy.gguf ├── data │ └── 11_truck.png ├── conftest.py ├── test_memory.py ├── bge-m3-metadata.json ├── test_params.py ├── test_server.py └── test_server_http.py ├── .gitmodules ├── MANIFEST.in ├── requirements.txt ├── pyproject.toml ├── LICENSE ├── Makefile ├── scripts ├── get-releases.sh ├── releases-to-pep-503.sh ├── setup.sh ├── copy_libs.py └── .clang-format ├── .github └── workflows │ ├── release-github-pypi.yaml │ ├── ci.yaml │ ├── build-wheel.yaml │ └── build-wheel-cuda-hip.yaml ├── .gitignore ├── setup.py └── README.md /src/xllamacpp/__init__.pxd: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | src/xllamacpp/_version.py export-subst 2 | -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xorbitsai/xllamacpp/HEAD/assets/logo.png -------------------------------------------------------------------------------- /tests/dummy.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xorbitsai/xllamacpp/HEAD/tests/dummy.gguf -------------------------------------------------------------------------------- /assets/logo-white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xorbitsai/xllamacpp/HEAD/assets/logo-white.png -------------------------------------------------------------------------------- /tests/data/11_truck.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xorbitsai/xllamacpp/HEAD/tests/data/11_truck.png -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "thirdparty/llama.cpp"] 2 | path = thirdparty/llama.cpp 3 | url = https://github.com/ggml-org/llama.cpp.git 4 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | ROOT = Path(__file__).parent.parent 4 | 5 | 6 | import pytest 7 | 8 | 9 | @pytest.fixture(scope="module") 10 | def model_path(): 11 | return str(ROOT / "models") 12 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | global-include *.pxd 2 | global-include *.pyx 3 | global-include *.pxi 4 | include scripts/setup.sh 5 | include scripts/copy_libs.py 6 | include requirements.txt 7 | include Makefile 8 | recursive-include thirdparty/llama.cpp * 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # build requirements 2 | setuptools 3 
| cython 4 | wheel 5 | 6 | # # macos fix wheel tool 7 | # delocate; sys_platform == 'darwin' 8 | 9 | # # macos fix wheel tool 10 | # auditwheel; sys_platform == 'linux' 11 | 12 | # # windows fix wheel tool 13 | # delvewheel; sys_platform == 'win32' 14 | 15 | # runtime requirements (optional) 16 | # numpy 17 | 18 | # testing tools (optional) 19 | # pytest pytest-cov pytest-memray 20 | -------------------------------------------------------------------------------- /src/xllamacpp/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025 XProbe Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .xllamacpp import * 16 | from .memory import estimate_gpu_layers 17 | 18 | from . import _version 19 | 20 | __version__ = _version.get_versions()["version"] 21 | if __version__ == "0+unknown": 22 | print(_version.get_versions()) 23 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "xllamacpp" 3 | dynamic = ["version", "license"] 4 | description = "A Python wrapper of llama.cpp" 5 | readme = "README.md" 6 | authors = [ 7 | { name = "codingl2k1", email = "codingl2k1@outlook.com" } 8 | ] 9 | requires-python = ">3.8" 10 | dependencies = [] 11 | 12 | [project.optional-dependencies] 13 | test = ['pytest', 'pytest-cov'] 14 | all = ['gguf', 'orjson'] 15 | 16 | [build-system] 17 | requires = ["setuptools >= 61", "cython", "versioneer[toml]"] 18 | build-backend = "setuptools.build_meta" 19 | 20 | [tool.setuptools] 21 | include-package-data = false 22 | 23 | [tool.pytest.ini_options] 24 | pythonpath = ["src"] 25 | testpaths = ["tests"] 26 | 27 | [tool.versioneer] 28 | VCS = "git" 29 | style = "pep440" 30 | versionfile_source = "src/xllamacpp/_version.py" 31 | versionfile_build = "xllamacpp/_version.py" 32 | tag_prefix = "v" 33 | parentdir_prefix = "xllamacpp-" 34 | 35 | [tool.black] 36 | required-version = "25.1.0" 37 | include = '\.pyi?$' 38 | exclude = '_version.py' 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Shakeeb Alireza 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # set path so `llama-cli` etc.. be in path 2 | export PATH := $(PWD)/bin:$(PATH) 3 | export MACOSX_DEPLOYMENT_TARGET := 12 4 | 5 | # models 6 | MODEL := bge-reranker-v2-m3-Q2_K.gguf 7 | 8 | THIRDPARTY := $(PWD)/thirdparty 9 | LLAMACPP := $(THIRDPARTY)/llama.cpp 10 | 11 | .PHONY: all build wheel clean test download 12 | 13 | all: build 14 | 15 | build: 16 | @bash scripts/setup.sh 17 | python setup.py build_ext --inplace 18 | 19 | wheel: 20 | @python setup.py bdist_wheel 21 | 22 | clean: 23 | @rm -rf build dist src/llama.cpp src/*.egg-inf thirdparty/llama.cpp/build o .pytest_cache .coverage 24 | 25 | test: build 26 | @pytest 27 | 28 | $(MODEL): 29 | @mkdir -p models && cd models && \ 30 | curl --output Llama-3.2-1B-Instruct-Q8_0.gguf -L https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q8_0.gguf && \ 31 | curl --output tinygemma3-Q8_0.gguf -L https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/tinygemma3-Q8_0.gguf && \ 32 | curl --output mmproj-tinygemma3.gguf -L https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/mmproj-tinygemma3.gguf && \ 33 | curl --output Qwen3-Embedding-0.6B-Q8_0.gguf -L https://huggingface.co/Qwen/Qwen3-Embedding-0.6B-GGUF/resolve/main/Qwen3-Embedding-0.6B-Q8_0.gguf && \ 34 | curl --output bge-reranker-v2-m3-Q2_K.gguf -L https://modelscope.cn/models/gpustack/bge-reranker-v2-m3-GGUF/resolve/master/bge-reranker-v2-m3-Q2_K.gguf 35 | 36 | download: $(MODEL) 37 | @echo "minimal model downloaded to models directory" 38 | -------------------------------------------------------------------------------- /scripts/get-releases.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Function to get all releases 4 | get_all_releases() { 5 | local page=1 6 | local per_page=100 7 | local releases="" 8 | local new_releases 9 | 10 | # Prepare headers 11 | local headers=(-H "Accept: application/vnd.github.v3+json") 12 | if [ -n "$GITHUB_TOKEN" ]; then 13 | headers+=(-H "Authorization: Bearer $GITHUB_TOKEN") 14 | fi 15 | 16 | while true; do 17 | response=$(curl -s "${headers[@]}" \ 18 | "https://api.github.com/repos/xorbitsai/xllamacpp/releases?page=$page&per_page=$per_page") 19 | 20 | # Check if the response is valid JSON 21 | if ! echo "$response" | jq empty > /dev/null 2>&1; then 22 | echo "Error: Invalid response from GitHub API" >&2 23 | echo "Response: $response" >&2 24 | return 1 25 | fi 26 | 27 | new_releases=$(echo "$response" | jq -r '.[].tag_name') 28 | if [ -z "$new_releases" ]; then 29 | break 30 | fi 31 | releases="$releases $new_releases" 32 | ((page++)) 33 | done 34 | 35 | echo $releases 36 | } 37 | 38 | # Get all releases and save to file 39 | releases=$(get_all_releases) 40 | if [ $? -ne 0 ]; then 41 | echo "Failed to fetch releases. 
Please check your internet connection and try again later." >&2 42 | exit 1 43 | fi 44 | 45 | echo "$releases" | tr ' ' '\n' > all_releases.txt 46 | 47 | echo "All releases have been saved to all_releases.txt" 48 | -------------------------------------------------------------------------------- /src/xllamacpp/server.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "common.h" 7 | 8 | struct server_context; 9 | struct server_routes; 10 | 11 | namespace xllamacpp { 12 | 13 | std::string get_system_info(); 14 | 15 | std::vector get_device_info(); 16 | 17 | typedef bool (*Callback)(std::string &&, void *py_cb); 18 | 19 | // Convert a JSON schema string into a llama.cpp grammar string for structured 20 | // outputs 21 | std::string json_schema_to_grammar_str(const std::string &schema_json_str); 22 | 23 | class Server { 24 | public: 25 | Server(const common_params ¶ms); 26 | ~Server(); 27 | 28 | std::string listening_address() const; 29 | 30 | std::string handle_metrics(); 31 | 32 | std::string handle_embeddings(const std::string &input_json_str); 33 | 34 | std::string handle_rerank(const std::string &input_json_str); 35 | 36 | void handle_completions(const std::string &prompt_json_str, 37 | Callback res_error, void *py_cb_error, 38 | Callback res_ok, void *py_cb_ok); 39 | 40 | void handle_chat_completions(const std::string &prompt_json_str, 41 | Callback res_error, void *py_cb_error, 42 | Callback res_ok, void *py_cb_ok); 43 | 44 | private: 45 | common_params _params; 46 | std::string _listening_address; 47 | // Incomplete type of server_context 48 | std::shared_ptr _ctx_server; 49 | std::shared_ptr _routes; 50 | std::thread _loop_thread; 51 | }; 52 | 53 | void parse_tensor_buffer_overrides( 54 | const std::string &value, 55 | std::vector &overrides); 56 | void build_tensor_buffer_overrides( 57 | const std::vector &overrides, 58 | std::string &value); 59 | } // namespace xllamacpp 60 | -------------------------------------------------------------------------------- /src/xllamacpp/server.pxd: -------------------------------------------------------------------------------- 1 | # distutils: language=c++ 2 | 3 | from xllamacpp.xllamacpp cimport common_params, ggml_backend_dev_props, llama_model_tensor_buft_override 4 | from libcpp cimport bool as c_bool 5 | from libcpp.string cimport string as std_string 6 | from libcpp.vector cimport vector as std_vector 7 | 8 | cdef extern from "server.h" namespace "xllamacpp" nogil: 9 | std_string c_get_system_info "xllamacpp::get_system_info" () 10 | 11 | std_vector[ggml_backend_dev_props] c_get_device_info "xllamacpp::get_device_info" () 12 | 13 | std_string c_json_schema_to_grammar_str "xllamacpp::json_schema_to_grammar_str" (const std_string & schema_json_str) except + 14 | 15 | ctypedef c_bool (*Callback "xllamacpp::Callback")(std_string &&, void *py_cb) 16 | cdef cppclass CServer "xllamacpp::Server": 17 | 18 | CServer(const common_params& params) except + 19 | 20 | std_string listening_address() except + 21 | 22 | std_string handle_metrics() except + 23 | 24 | std_string handle_embeddings(const std_string &input_json_str) except + 25 | 26 | std_string handle_rerank(const std_string &input_json_str) except + 27 | 28 | void handle_completions(const std_string &prompt_json_str, 29 | Callback res_error, 30 | void *py_cb_error, 31 | Callback res_ok, 32 | void *py_cb_ok) except + 33 | 34 | void handle_chat_completions(const std_string &prompt_json_str, 35 | 
Callback res_error, 36 | void *py_cb_error, 37 | Callback res_ok, 38 | void *py_cb_ok) except + 39 | 40 | void c_parse_tensor_buffer_overrides "xllamacpp::parse_tensor_buffer_overrides" ( 41 | const std_string & value, std_vector[llama_model_tensor_buft_override] & overrides) except + 42 | void c_build_tensor_buffer_overrides "xllamacpp::build_tensor_buffer_overrides" ( 43 | const std_vector[llama_model_tensor_buft_override] & overrides, std_string & value) except + 44 | -------------------------------------------------------------------------------- /.github/workflows/release-github-pypi.yaml: -------------------------------------------------------------------------------- 1 | name: Wheels Index 2 | 3 | on: 4 | # Trigger on new release 5 | workflow_run: 6 | workflows: ["Build Wheels (CUDA & HIP)"] 7 | types: 8 | - completed 9 | 10 | # Allows you to run this workflow manually from the Actions tab 11 | workflow_dispatch: 12 | 13 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 14 | permissions: 15 | contents: read 16 | pages: write 17 | id-token: write 18 | 19 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 20 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 21 | concurrency: 22 | group: "pages" 23 | cancel-in-progress: false 24 | 25 | jobs: 26 | # Single deploy job since we're just deploying 27 | deploy: 28 | if: ${{ github.event.workflow_run.conclusion == 'success' }} 29 | environment: 30 | name: github-pages 31 | url: ${{ steps.deployment.outputs.page_url }} 32 | runs-on: ubuntu-latest 33 | steps: 34 | - name: Checkout 35 | uses: actions/checkout@v4 36 | - name: Setup Pages 37 | uses: actions/configure-pages@v5 38 | - name: Build 39 | env: 40 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 41 | run: | 42 | ./scripts/get-releases.sh 43 | ./scripts/releases-to-pep-503.sh index/whl/cu124 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$' 44 | ./scripts/releases-to-pep-503.sh index/whl/cu128 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu128$' 45 | ./scripts/releases-to-pep-503.sh index/whl/rocm-6.3.4 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-rocm-6.3.4$' 46 | ./scripts/releases-to-pep-503.sh index/whl/rocm-6.4.1 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-rocm-6.4.1$' 47 | ./scripts/releases-to-pep-503.sh index/whl/vulkan '^[v]?[0-9]+\.[0-9]+\.[0-9]+-vulkan*' 48 | - name: Upload artifact 49 | uses: actions/upload-pages-artifact@v3 50 | with: 51 | # Upload entire repository 52 | path: 'index' 53 | - name: Deploy to GitHub Pages 54 | id: deployment 55 | uses: actions/deploy-pages@v4 56 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: Python CI 2 | 3 | # on: 4 | # push: 5 | # tags: 6 | # - '*' 7 | # workflow_dispatch: 8 | 9 | on: 10 | push: 11 | branches: 12 | - '*' 13 | pull_request: 14 | types: ['opened', 'reopened', 'synchronize'] 15 | 16 | concurrency: 17 | group: ${{ github.workflow }}-${{ github.ref }} 18 | cancel-in-progress: true 19 | 20 | jobs: 21 | lint: 22 | runs-on: ${{ matrix.os }} 23 | strategy: 24 | fail-fast: false 25 | matrix: 26 | os: [ "ubuntu-latest" ] 27 | python-version: [ "3.11" ] 28 | steps: 29 | - name: Check out code 30 | uses: actions/checkout@v3 31 | with: 32 | fetch-depth: 0 33 | submodules: recursive 34 | - name: Set up Python environment 35 | uses: actions/setup-python@v4 36 | with: 37 | python-version: "3.11" 38 | - name: black 
39 | uses: psf/black@stable 40 | with: 41 | src: "src/xllamacpp" 42 | options: "--check --verbose" 43 | use_pyproject: true 44 | - name: clang-format 45 | uses: jidicula/clang-format-action@v4.15.0 46 | with: 47 | clang-format-version: '16' 48 | check-path: 'src/xllamacpp' 49 | 50 | build_test_job: 51 | runs-on: ${{ matrix.os }} 52 | needs: lint 53 | defaults: 54 | run: 55 | shell: bash -l {0} 56 | strategy: 57 | fail-fast: false 58 | matrix: 59 | os: [ "ubuntu-latest", "macos-latest", "windows-latest" ] 60 | python-version: [ "3.11", "3.13" ] 61 | 62 | steps: 63 | - name: Check out code 64 | uses: actions/checkout@v3 65 | with: 66 | fetch-depth: 0 67 | submodules: recursive 68 | 69 | - name: Set up conda ${{ matrix.python-version }} 70 | uses: conda-incubator/setup-miniconda@v3 71 | with: 72 | python-version: ${{ matrix.python-version }} 73 | 74 | # Fix "version `GLIBCXX_3.4.30' not found (required by xoscar_store.cpython-311-x86_64-linux-gnu.so)" issue 75 | - name: Install libstdcxx-ng 76 | if: ${{ matrix.os == 'ubuntu-latest' }} 77 | run: | 78 | conda install -c conda-forge libstdcxx-ng 79 | 80 | - name: Install dependencies 81 | env: 82 | OS: ${{ matrix.os }} 83 | run: | 84 | pip install -r requirements.txt 85 | pip install pytest pytest-timeout requests gguf orjson 86 | make clean 87 | make 88 | make download 89 | working-directory: . 90 | 91 | - name: Test with pytest 92 | run: | 93 | pytest --timeout=1500 -W ignore::PendingDeprecationWarning tests 94 | working-directory: . 95 | -------------------------------------------------------------------------------- /scripts/releases-to-pep-503.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Enable exit on error 4 | set -e 5 | 6 | # Function for logging 7 | log_error() { 8 | echo "ERROR: $1" >&2 9 | } 10 | 11 | log_info() { 12 | echo "INFO: $1" 13 | } 14 | 15 | # Get output directory or default to index/whl/cpu 16 | output_dir=${1:-"index/whl/cpu"} 17 | 18 | # Get pattern from second arg or default to valid python package version pattern 19 | pattern=${2:-"^[v]?[0-9]+\.[0-9]+\.[0-9]+$"} 20 | 21 | # Get the current directory (where the script is run from) 22 | current_dir="$(pwd)" 23 | 24 | # Check if all_releases.txt exists 25 | if [ ! -f "$current_dir/all_releases.txt" ]; then 26 | log_error "all_releases.txt not found in the current directory." 27 | exit 1 28 | fi 29 | 30 | # Create output directory 31 | mkdir -p "$output_dir" 32 | 33 | # Create an index html file 34 | cat << EOF > "$output_dir/index.html" 35 | 36 | 37 | 38 | 39 | xllamacpp 40 |
41 | 42 | 43 | 44 | EOF 45 | 46 | # Create xllamacpp directory 47 | mkdir -p "$output_dir/xllamacpp" 48 | 49 | # Create an index html file in xllamacpp directory 50 | cat << EOF > "$output_dir/xllamacpp/index.html" 51 | <!DOCTYPE html> 52 | <html> 53 | <head><title>Links for xllamacpp</title></head> 54 | <body>
55 | EOF 56 | 57 | # Temporary aggregation directory for per-version links 58 | tmp_dir="$output_dir/.tmp_xllamacpp_links" 59 | rm -rf "$tmp_dir" 60 | mkdir -p "$tmp_dir" 61 | 62 | # Filter releases by pattern 63 | releases=$(grep -E "$pattern" "$current_dir/all_releases.txt") 64 | 65 | # Prepare curl headers 66 | headers=('--header' 'Accept: application/vnd.github.v3+json') 67 | if [ -n "$GITHUB_TOKEN" ]; then 68 | headers+=('--header' "authorization: Bearer $GITHUB_TOKEN") 69 | fi 70 | headers+=('--header' 'content-type: application/json') 71 | 72 | # For each release, get all assets 73 | for release in $releases; do 74 | log_info "Processing release: $release" 75 | response=$(curl -s "${headers[@]}" \ 76 | "https://api.github.com/repos/xorbitsai/xllamacpp/releases/tags/$release") 77 | 78 | if [ -z "$response" ]; then 79 | log_error "Empty response from GitHub API for release $release" 80 | continue 81 | fi 82 | 83 | if ! echo "$response" | jq -e '.assets' > /dev/null 2>&1; then 84 | log_error "Invalid or unexpected response from GitHub API for release $release" 85 | log_error "Response: $response" 86 | continue 87 | fi 88 | 89 | # Get release version from release ie v0.1.0-cu121 -> v0.1.0 90 | release_version=$(echo "$release" | grep -oE "^[v]?[0-9]+\.[0-9]+\.[0-9]+") 91 | # Track first-seen order of versions 92 | if [ ! -f "$tmp_dir/.${release_version}.seen" ]; then 93 | echo "$release_version" >> "$tmp_dir/order.txt" 94 | : > "$tmp_dir/${release_version}.html" 95 | touch "$tmp_dir/.${release_version}.seen" 96 | fi 97 | 98 | wheel_urls=$(echo "$response" | jq -r '.assets[] | select(.name | endswith(".whl")) | .browser_download_url') 99 | if [ -z "$wheel_urls" ]; then 100 | log_error "No wheel files found for release $release" 101 | continue 102 | fi 103 | 104 | echo "$wheel_urls" | while read -r asset; do 105 | echo "    <a href=\"$asset\">$asset</a>" >> "$tmp_dir/${release_version}.html" 106 | echo "    <br/>" >> "$tmp_dir/${release_version}.html"
107 | done 108 | done 109 | 110 | if [ -f "$tmp_dir/order.txt" ]; then 111 | while IFS= read -r ver; do 112 | echo "    <h2>$ver</h2>
" >> "$output_dir/xllamacpp/index.html" 113 | cat "$tmp_dir/${ver}.html" >> "$output_dir/xllamacpp/index.html" 114 | done < "$tmp_dir/order.txt" 115 | fi 116 | 117 | # Close HTML and clean up 118 | echo " " >> "$output_dir/xllamacpp/index.html" 119 | echo "" >> "$output_dir/xllamacpp/index.html" 120 | echo "" >> "$output_dir/xllamacpp/index.html" 121 | 122 | rm -rf "$tmp_dir" 123 | 124 | log_info "Index generation complete. Output directory: $output_dir" 125 | -------------------------------------------------------------------------------- /scripts/setup.sh: -------------------------------------------------------------------------------- 1 | # scripts/setup.sh [download_last_working] [release-tag] 2 | # 3 | # setup.sh : (default run) downloads, builds and install last working release of llama.cpp 4 | # setup.sh 1 : like default 5 | # setup.sh 0 : downloads, builds and install bleeding edge llama.cpp from repo 6 | # setup.sh 1 : downloads, builds and install release of llama.cpp 7 | 8 | CWD=$(pwd) 9 | THIRDPARTY=${CWD}/thirdparty 10 | 11 | build_llamacpp() { 12 | echo "update from llama.cpp main repo" 13 | PROJECT=${THIRDPARTY}/llama.cpp 14 | PREFIX=${CWD}/src/llama.cpp 15 | NPROC=2 16 | cd ${PROJECT} && \ 17 | mkdir -p build && 18 | cd build 19 | # Base CMake arguments 20 | local cmake_args=( 21 | "-DBUILD_SHARED_LIBS=OFF" 22 | "-DCMAKE_POSITION_INDEPENDENT_CODE=ON" 23 | "-DCMAKE_INSTALL_LIBDIR=lib" 24 | "-DLLAMA_CURL=OFF" 25 | "-DLLAMA_LLGUIDANCE=ON" 26 | ) 27 | 28 | # Add any additional CMake arguments from environment 29 | if [ -n "${CMAKE_ARGS}" ]; then 30 | cmake_args+=(${CMAKE_ARGS}) 31 | fi 32 | 33 | # Build targets 34 | local targets=("common" "llama" "ggml" "ggml-cpu" "mtmd" "cpp-httplib" "server-context" "llama-server") 35 | 36 | if [[ -n "${XLLAMACPP_BUILD_CUDA}" ]]; then 37 | echo "Building for CUDA" 38 | cmake_args+=( 39 | "-DGGML_NATIVE=OFF" 40 | "-DGGML_CUDA=ON" 41 | "-DGGML_CUDA_FORCE_MMQ=ON" 42 | "-DCMAKE_CUDA_ARCHITECTURES=all" 43 | ) 44 | targets+=("ggml-cuda") 45 | elif [[ -n "${XLLAMACPP_BUILD_HIP}" ]]; then 46 | echo "Building for AMD GPU" 47 | cmake_args+=( 48 | "-DGGML_NATIVE=OFF" 49 | "-DAMDGPU_TARGETS=gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" 50 | "-DCMAKE_HIP_COMPILER=$(hipconfig -l)/clang" 51 | "-DGGML_HIP_ROCWMMA_FATTN=ON" 52 | "-DGGML_HIP=ON" 53 | ) 54 | targets+=("ggml-hip") 55 | elif [[ -n "${XLLAMACPP_BUILD_VULKAN}" ]]; then 56 | if [[ "$(uname -s)" == "Darwin" ]]; then 57 | cmake_args+=("-DCMAKE_BUILD_RPATH=@loader_path") 58 | if [[ "$(uname -m)" == "x86_64" ]]; then 59 | echo "Building for Intel with Vulkan" 60 | cmake_args+=( 61 | "-DGGML_METAL=OFF" 62 | "-DGGML_VULKAN=ON" 63 | ) 64 | targets+=("ggml-blas" "ggml-vulkan") 65 | else 66 | echo "Building for Apple Silicon with Vulkan is not supported" 67 | exit 1 68 | fi 69 | else 70 | echo "Building with Vulkan" 71 | cmake_args+=( 72 | "-DGGML_NATIVE=OFF" 73 | "-DGGML_VULKAN=ON" 74 | ) 75 | targets+=("ggml-vulkan") 76 | fi 77 | elif [[ -n "${XLLAMACPP_BUILD_AARCH64}" ]]; then 78 | echo "Building for aarch64" 79 | cmake_args+=( 80 | "-DGGML_NATIVE=OFF" 81 | "-DGGML_CPU_ARM_ARCH=armv8-a" 82 | ) 83 | # Add ggml-blas target if BLAS is enabled via CMAKE_ARGS 84 | if [[ "${CMAKE_ARGS:-}" == *"-DGGML_BLAS=ON"* ]]; then 85 | echo "BLAS is enabled via CMAKE_ARGS, adding ggml-blas to build targets" 86 | targets+=("ggml-blas") 87 | fi 88 | else 89 | if [[ "$(uname -s)" == "Darwin" ]]; then 90 | cmake_args+=("-DCMAKE_BUILD_RPATH=@loader_path") 91 | if [[ "$(uname -m)" == "x86_64" ]]; then 92 | echo "Building for 
Intel" 93 | cmake_args+=("-DGGML_METAL=OFF") 94 | targets+=("ggml-blas") 95 | else 96 | echo "Building for Apple Silicon" 97 | cmake_args+=("-DGGML_METAL_EMBED_LIBRARY=ON") 98 | targets+=("ggml-blas" "ggml-metal") 99 | fi 100 | else 101 | echo "Building for non-MacOS CPU (optimize for native CPU)" 102 | # Let CMake handle GGML_BLAS from environment 103 | if [[ "${CMAKE_ARGS:-}" == *"-DGGML_BLAS=ON"* ]]; then 104 | echo "BLAS is enabled via CMAKE_ARGS, adding ggml-blas to build targets" 105 | targets+=("ggml-blas") 106 | fi 107 | fi 108 | fi 109 | 110 | # Run CMake and build 111 | echo "Running CMake with arguments: ${cmake_args[*]}" 112 | echo "Building targets: ${targets[*]}" 113 | 114 | cmake .. "${cmake_args[@]}" && \ 115 | cmake --build . --config Release -j ${NPROC} --target "${targets[@]}" 116 | rm -rf ${PREFIX} 117 | python ${CWD}/scripts/copy_libs.py 118 | cd ${CWD} 119 | } 120 | 121 | build_llamacpp 122 | -------------------------------------------------------------------------------- /scripts/copy_libs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Script to copy: 4 | 1. All .a and .lib files from thirdparty/llama.cpp/build to src/llama.cpp/lib 5 | 2. All .h, .hpp files from thirdparty/llama.cpp to src/llama.cpp/include 6 | 3. All .cpp, .cc files from thirdparty/llama.cpp to src/llama.cpp/src 7 | """ 8 | 9 | import os 10 | import shutil 11 | import glob 12 | import logging 13 | 14 | ROOT = os.path.join(os.path.dirname(__file__), "..") 15 | 16 | 17 | def copy_library_files(): 18 | # Define source and destination directories 19 | src_dir = os.path.join(ROOT, "thirdparty", "llama.cpp", "build") 20 | dst_dir = os.path.join(ROOT, "src", "llama.cpp", "lib") 21 | 22 | # Create destination directory if it doesn't exist 23 | if not os.path.exists(dst_dir): 24 | logging.info(f"Creating destination directory: {dst_dir}") 25 | os.makedirs(dst_dir, exist_ok=True) 26 | 27 | # Recursively find all static library files 28 | lib_files = [] 29 | for ext in (".a", ".lib"): 30 | pattern = os.path.join(src_dir, "**", f"*{ext}") 31 | lib_files.extend(glob.glob(pattern, recursive=True)) 32 | 33 | if not lib_files: 34 | logging.warning(f"No .a or .lib files found in {src_dir}") 35 | return 36 | 37 | linked_count = 0 38 | skipped_count = 0 39 | 40 | for lib_file in lib_files: 41 | filename = os.path.basename(lib_file) 42 | dst_file = os.path.join(dst_dir, filename) 43 | 44 | # Skip if link or file already exists 45 | if os.path.exists(dst_file): 46 | logging.info(f"Skipping {filename} - already exists at {dst_file}") 47 | skipped_count += 1 48 | continue 49 | 50 | logging.info(f"Linking {lib_file} -> {dst_file}") 51 | os.symlink(lib_file, dst_file) 52 | linked_count += 1 53 | 54 | logging.info( 55 | f"Successfully linked {linked_count} libraries to {dst_dir} " 56 | f"({skipped_count} skipped)" 57 | ) 58 | 59 | 60 | def copy_source_files(target, source_paths): 61 | # Define source base directory and destination directory 62 | src_base = os.path.join(ROOT, "thirdparty", "llama.cpp") 63 | dst_dir = os.path.join(ROOT, "src", "llama.cpp", target) 64 | 65 | # Create destination directory if it doesn't exist 66 | if not os.path.exists(dst_dir): 67 | logging.info(f"Creating destination directory: {dst_dir}") 68 | os.makedirs(dst_dir, exist_ok=True) 69 | 70 | # Copy each file to destination 71 | copied_count = 0 72 | skipped_count = 0 73 | for rel_path in source_paths: 74 | src_file = os.path.join(src_base, rel_path) 75 | 76 | # Skip if 
source file doesn't exist 77 | if not os.path.exists(src_file): 78 | logging.warning(f"Source file not found: {src_file}") 79 | continue 80 | 81 | # Create subdirectories in destination if needed 82 | filename = os.path.basename(rel_path) 83 | dst_file = os.path.join(dst_dir, filename) 84 | 85 | # Skip if destination file already exists 86 | if os.path.exists(dst_file): 87 | logging.info(f"Skipping {filename} - already exists at {dst_file}") 88 | skipped_count += 1 89 | continue 90 | 91 | logging.info(f"Copying {src_file} to {dst_file}") 92 | shutil.copy2(src_file, dst_file) 93 | copied_count += 1 94 | 95 | logging.info( 96 | f"Successfully copied {copied_count} source files to {dst_dir} ({skipped_count} skipped)" 97 | ) 98 | 99 | 100 | def main(): 101 | # Setup logging 102 | logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s") 103 | 104 | # Copy library files 105 | copy_library_files() 106 | 107 | # Copy header files 108 | copy_source_files( 109 | "include", 110 | [ 111 | "common/common.h", 112 | "ggml/include/ggml.h", 113 | "ggml/include/ggml-backend.h", 114 | "include/llama.h", 115 | ], 116 | ) 117 | copy_source_files( 118 | "src", 119 | [ 120 | "tools/server/server.cpp", 121 | ], 122 | ) 123 | 124 | 125 | if __name__ == "__main__": 126 | main() 127 | -------------------------------------------------------------------------------- /tests/test_memory.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025 XProbe Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
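# A rough usage sketch of the memory-planning API exercised in this file (the device
# dictionaries and model path below are illustrative only; the keys mirror the ones
# used in the tests, and real free/min memory values would come from the active GPU
# backend):
#
#     from xllamacpp import estimate_gpu_layers
#     gpus = [{"name": "cuda", "memory_min": 2048, "memory_free": 8 * 1024**3}]
#     est = estimate_gpu_layers(gpus, "some-model.gguf", [], context_length=2048,
#                               batch_size=512, num_parallel=1, kv_cache_type="")
#     # est.layers, est.tensor_split, est.gpu_sizes, est.vram_size, est.total_size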
14 | import os.path 15 | import json 16 | from dataclasses import dataclass 17 | 18 | from xllamacpp import estimate_gpu_layers 19 | from xllamacpp.memory import graph_size 20 | 21 | TEST_GGUF = os.path.join(os.path.dirname(os.path.abspath(__file__)), "dummy.gguf") 22 | TEST_METADATA_JSON = os.path.join( 23 | os.path.dirname(os.path.abspath(__file__)), "bge-m3-metadata.json" 24 | ) 25 | 26 | 27 | def test_estimate_gpu_layers(): 28 | estimate = estimate_gpu_layers( 29 | [{"name": "CPU", "memory_free": 0}], 30 | TEST_GGUF, 31 | [], 32 | context_length=2048, 33 | batch_size=512, 34 | num_parallel=1, 35 | kv_cache_type="", 36 | ) 37 | assert estimate.layers == 0 38 | assert estimate.graph == 0 39 | 40 | graph_partial_offload = 202377216 41 | graph_full_offload = 171968512 42 | layer_size = 33554436 43 | projector_size = 0 44 | memory_layer_output = 4 45 | gpu_minimum_memory = 2048 46 | 47 | gpus = [ 48 | {"name": "cuda", "memory_min": gpu_minimum_memory}, 49 | {"name": "cuda", "memory_min": gpu_minimum_memory}, 50 | ] 51 | 52 | @dataclass 53 | class _TestInfo: 54 | layer0: int # type: ignore 55 | layer1: int # type: ignore 56 | expect0: int # type: ignore 57 | expect1: int # type: ignore 58 | 59 | test_data = [ 60 | _TestInfo(*v) 61 | for v in [ 62 | [1, 1, 1, 1], 63 | [2, 1, 2, 1], 64 | [2, 2, 2, 2], 65 | [1, 2, 1, 2], 66 | [3, 3, 3, 3], 67 | [4, 4, 3, 3], 68 | [6, 6, 3, 3], 69 | [0, 3, 0, 3], 70 | ] 71 | ] 72 | for i, s in enumerate(test_data): 73 | gpus[0]["memory_free"] = 0 74 | gpus[1]["memory_free"] = 0 75 | gpus[0]["memory_free"] += projector_size 76 | if s.layer0 > 0: 77 | gpus[0]["memory_free"] += memory_layer_output 78 | else: 79 | gpus[1]["memory_free"] += memory_layer_output 80 | gpus[0]["memory_free"] += ( 81 | gpu_minimum_memory + layer_size + s.layer0 * layer_size + 1 82 | ) 83 | gpus[1]["memory_free"] += ( 84 | gpu_minimum_memory + layer_size + s.layer1 * layer_size + 1 85 | ) 86 | gpus[0]["memory_free"] += max(graph_full_offload, graph_partial_offload) 87 | gpus[1]["memory_free"] += max(graph_full_offload, graph_partial_offload) 88 | estimate = estimate_gpu_layers( 89 | gpus, 90 | TEST_GGUF, 91 | [], 92 | context_length=2048, 93 | batch_size=512, 94 | num_parallel=1, 95 | kv_cache_type="", 96 | ) 97 | assert s.expect0 + s.expect1 == estimate.layers 98 | assert [ 99 | s.expect0 / estimate.layers, 100 | s.expect1 / estimate.layers, 101 | ] == estimate.tensor_split 102 | layer_sums = sum(estimate.gpu_sizes) 103 | if estimate.layers < 6: 104 | assert estimate.vram_size < estimate.total_size 105 | assert estimate.vram_size == layer_sums 106 | else: 107 | assert estimate.vram_size == estimate.total_size 108 | assert estimate.total_size == layer_sums 109 | 110 | 111 | def test_missing_keys(): 112 | with open(TEST_METADATA_JSON, "r") as f: 113 | metadata = json.load(f) 114 | kv, partial_offload, full_offload = graph_size( 115 | metadata, context_length=4096, batch_size=2048, num_parallel=8, kv_cache_type="" 116 | ) 117 | assert full_offload == 67108864.0 118 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | 3 | vscode/ 4 | build/ 5 | models/ 6 | 7 | src/xllamacpp/xllamacpp.cpp 8 | src/llama.cpp/bin/ 9 | src/llama.cpp/lib/ 10 | src/llama.cpp/include/* 11 | !src/llama.cpp/include/common.h 12 | !src/llama.cpp/include/ggml.h 13 | !src/llama.cpp/include/ggml-backend.h 14 | !src/llama.cpp/include/llama.h 15 | tests/*.cpp 16 | changes.diff 17 | 18 | 
# editor 19 | .vscode 20 | 21 | # project detritus 22 | .ruff_cache 23 | 24 | 25 | # Byte-compiled / optimized / DLL files 26 | __pycache__/ 27 | *.py[cod] 28 | *$py.class 29 | 30 | # C extensions 31 | *.so 32 | 33 | # Distribution / packaging 34 | .Python 35 | build/ 36 | develop-eggs/ 37 | dist/ 38 | downloads/ 39 | eggs/ 40 | .eggs/ 41 | lib/ 42 | lib64/ 43 | parts/ 44 | sdist/ 45 | var/ 46 | wheels/ 47 | share/python-wheels/ 48 | *.egg-info/ 49 | .installed.cfg 50 | *.egg 51 | MANIFEST 52 | 53 | # PyInstaller 54 | # Usually these files are written by a python script from a template 55 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 56 | *.manifest 57 | *.spec 58 | 59 | # Installer logs 60 | pip-log.txt 61 | pip-delete-this-directory.txt 62 | 63 | # Unit test / coverage reports 64 | htmlcov/ 65 | .tox/ 66 | .nox/ 67 | .coverage 68 | .coverage.* 69 | .cache 70 | nosetests.xml 71 | coverage.xml 72 | *.cover 73 | *.py,cover 74 | .hypothesis/ 75 | .pytest_cache/ 76 | cover/ 77 | 78 | # Translations 79 | *.mo 80 | *.pot 81 | 82 | # Django stuff: 83 | *.log 84 | local_settings.py 85 | db.sqlite3 86 | db.sqlite3-journal 87 | 88 | # Flask stuff: 89 | instance/ 90 | .webassets-cache 91 | 92 | # Scrapy stuff: 93 | .scrapy 94 | 95 | # Sphinx documentation 96 | docs/_build/ 97 | 98 | # PyBuilder 99 | .pybuilder/ 100 | target/ 101 | 102 | # Jupyter Notebook 103 | .ipynb_checkpoints 104 | 105 | # IPython 106 | profile_default/ 107 | ipython_config.py 108 | 109 | # pyenv 110 | # For a library or package, you might want to ignore these files since the code is 111 | # intended to run in multiple environments; otherwise, check them in: 112 | # .python-version 113 | 114 | # pipenv 115 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 116 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 117 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 118 | # install all needed dependencies. 119 | #Pipfile.lock 120 | 121 | # poetry 122 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 123 | # This is especially recommended for binary packages to ensure reproducibility, and is more 124 | # commonly ignored for libraries. 125 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 126 | #poetry.lock 127 | 128 | # pdm 129 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 130 | #pdm.lock 131 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 132 | # in version control. 133 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 134 | .pdm.toml 135 | .pdm-python 136 | .pdm-build/ 137 | 138 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 139 | __pypackages__/ 140 | 141 | # Celery stuff 142 | celerybeat-schedule 143 | celerybeat.pid 144 | 145 | # SageMath parsed files 146 | *.sage.py 147 | 148 | # Environments 149 | .env 150 | .venv 151 | env/ 152 | venv/ 153 | ENV/ 154 | env.bak/ 155 | venv.bak/ 156 | 157 | # Spyder project settings 158 | .spyderproject 159 | .spyproject 160 | 161 | # Rope project settings 162 | .ropeproject 163 | 164 | # mkdocs documentation 165 | /site 166 | 167 | # mypy 168 | .mypy_cache/ 169 | .dmypy.json 170 | dmypy.json 171 | 172 | # Pyre type checker 173 | .pyre/ 174 | 175 | # pytype static type analyzer 176 | .pytype/ 177 | 178 | # Cython debug symbols 179 | cython_debug/ 180 | 181 | # PyCharm 182 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 183 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 184 | # and can be added to the global gitignore or merged into this file. For a more nuclear 185 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 186 | .idea/ 187 | -------------------------------------------------------------------------------- /assets/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /assets/logo-white.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /tests/bge-m3-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "GGUF.version": { 3 | "index": 0, 4 | "type": "UINT32", 5 | "offset": 4, 6 | "value": 3 7 | }, 8 | "GGUF.tensor_count": { 9 | "index": 1, 10 | "type": "UINT64", 11 | "offset": 8, 12 | "value": 389 13 | }, 14 | "GGUF.kv_count": { 15 | "index": 2, 16 | "type": "UINT64", 17 | "offset": 16, 18 | "value": 33 19 | }, 20 | "general.architecture": { 21 | "index": 3, 22 | "type": "STRING", 23 | "offset": 24, 24 | "value": "bert" 25 | }, 26 | "general.type": { 27 | "index": 4, 28 | "type": "STRING", 29 | "offset": 68, 30 | "value": "model" 31 | }, 32 | "general.size_label": { 33 | "index": 5, 34 | "type": "STRING", 35 | "offset": 105, 36 | "value": "567M" 37 | }, 38 | "general.license": { 39 | "index": 6, 40 | "type": "STRING", 41 | "offset": 147, 42 | "value": "mit" 43 | }, 44 | "general.tags": { 45 | "index": 7, 46 | "type": "ARRAY", 47 | "offset": 185, 48 | "array_types": [ 49 | "STRING" 50 | ], 51 | "value": [] 52 | }, 53 | "bert.block_count": { 54 | "index": 8, 55 | "type": "UINT32", 56 | "offset": 330, 57 | "value": 24 58 | }, 59 | "bert.context_length": { 60 | "index": 9, 61 | "type": "UINT32", 62 | "offset": 362, 63 | "value": 8192 64 | }, 65 | "bert.embedding_length": { 66 | "index": 10, 67 | "type": "UINT32", 68 | "offset": 397, 69 | "value": 1024 70 | }, 71 | "bert.feed_forward_length": { 72 | "index": 11, 73 | "type": "UINT32", 74 | "offset": 434, 75 | "value": 4096 76 | }, 77 | "bert.attention.head_count": { 78 | "index": 12, 79 | "type": "UINT32", 80 | "offset": 474, 81 | "value": 16 82 | }, 83 | "bert.attention.layer_norm_epsilon": { 84 | "index": 13, 85 | "type": "FLOAT32", 86 | "offset": 515, 87 | "value": 0.000009999999747378752 88 | 
}, 89 | "general.file_type": { 90 | "index": 14, 91 | "type": "UINT32", 92 | "offset": 564, 93 | "value": 15 94 | }, 95 | "bert.attention.causal": { 96 | "index": 15, 97 | "type": "BOOL", 98 | "offset": 597, 99 | "value": false 100 | }, 101 | "bert.pooling_type": { 102 | "index": 16, 103 | "type": "UINT32", 104 | "offset": 631, 105 | "value": 2 106 | }, 107 | "tokenizer.ggml.model": { 108 | "index": 17, 109 | "type": "STRING", 110 | "offset": 664, 111 | "value": "t5" 112 | }, 113 | "tokenizer.ggml.pre": { 114 | "index": 18, 115 | "type": "STRING", 116 | "offset": 706, 117 | "value": "default" 118 | }, 119 | "tokenizer.ggml.tokens": { 120 | "index": 19, 121 | "type": "ARRAY", 122 | "offset": 751, 123 | "array_types": [ 124 | "STRING" 125 | ], 126 | "value": [] 127 | }, 128 | "tokenizer.ggml.scores": { 129 | "index": 20, 130 | "type": "ARRAY", 131 | "offset": 4582162, 132 | "array_types": [ 133 | "FLOAT32" 134 | ], 135 | "value": [] 136 | }, 137 | "tokenizer.ggml.token_type": { 138 | "index": 21, 139 | "type": "ARRAY", 140 | "offset": 5582215, 141 | "array_types": [ 142 | "INT32" 143 | ], 144 | "value": [] 145 | }, 146 | "tokenizer.ggml.add_space_prefix": { 147 | "index": 22, 148 | "type": "BOOL", 149 | "offset": 6582272, 150 | "value": true 151 | }, 152 | "tokenizer.ggml.token_type_count": { 153 | "index": 23, 154 | "type": "UINT32", 155 | "offset": 6582316, 156 | "value": 1 157 | }, 158 | "tokenizer.ggml.remove_extra_whitespaces": { 159 | "index": 24, 160 | "type": "BOOL", 161 | "offset": 6582363, 162 | "value": true 163 | }, 164 | "tokenizer.ggml.precompiled_charsmap": { 165 | "index": 25, 166 | "type": "ARRAY", 167 | "offset": 6582415, 168 | "array_types": [ 169 | "UINT8" 170 | ], 171 | "value": [] 172 | }, 173 | "tokenizer.ggml.bos_token_id": { 174 | "index": 26, 175 | "type": "UINT32", 176 | "offset": 6820013, 177 | "value": 0 178 | }, 179 | "tokenizer.ggml.eos_token_id": { 180 | "index": 27, 181 | "type": "UINT32", 182 | "offset": 6820056, 183 | "value": 2 184 | }, 185 | "tokenizer.ggml.unknown_token_id": { 186 | "index": 28, 187 | "type": "UINT32", 188 | "offset": 6820099, 189 | "value": 3 190 | }, 191 | "tokenizer.ggml.seperator_token_id": { 192 | "index": 29, 193 | "type": "UINT32", 194 | "offset": 6820146, 195 | "value": 2 196 | }, 197 | "tokenizer.ggml.padding_token_id": { 198 | "index": 30, 199 | "type": "UINT32", 200 | "offset": 6820195, 201 | "value": 1 202 | }, 203 | "tokenizer.ggml.cls_token_id": { 204 | "index": 31, 205 | "type": "UINT32", 206 | "offset": 6820242, 207 | "value": 0 208 | }, 209 | "tokenizer.ggml.mask_token_id": { 210 | "index": 32, 211 | "type": "UINT32", 212 | "offset": 6820285, 213 | "value": 250001 214 | }, 215 | "tokenizer.ggml.add_bos_token": { 216 | "index": 33, 217 | "type": "BOOL", 218 | "offset": 6820329, 219 | "value": true 220 | }, 221 | "tokenizer.ggml.add_eos_token": { 222 | "index": 34, 223 | "type": "BOOL", 224 | "offset": 6820370, 225 | "value": true 226 | }, 227 | "general.quantization_version": { 228 | "index": 35, 229 | "type": "UINT32", 230 | "offset": 6820411, 231 | "value": 2 232 | } 233 | } 234 | -------------------------------------------------------------------------------- /.github/workflows/build-wheel.yaml: -------------------------------------------------------------------------------- 1 | name: Build and upload to PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | workflow_dispatch: 8 | 9 | concurrency: 10 | group: ${{ github.workflow }}-${{ github.ref }} 11 | cancel-in-progress: true 12 | 13 | jobs: 14 | build_wheels: 15 | 
name: Build wheels on ${{ matrix.os }} for Python ${{ matrix.python }} ${{ matrix.arch }} 16 | runs-on: ${{ matrix.os }} 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | include: 21 | # Linux x86_64 22 | - os: ubuntu-latest 23 | arch: auto 24 | platform-id: manylinux_x86_64 25 | python: 310 26 | requires-python: ">=3.10,<3.11" 27 | - os: ubuntu-latest 28 | arch: auto 29 | platform-id: manylinux_x86_64 30 | python: 311 31 | requires-python: ">=3.11,<3.12" 32 | - os: ubuntu-latest 33 | arch: auto 34 | platform-id: manylinux_x86_64 35 | python: 312 36 | requires-python: ">=3.12,<3.13" 37 | - os: ubuntu-latest 38 | arch: auto 39 | platform-id: manylinux_x86_64 40 | python: 313 41 | requires-python: ">=3.13,<3.14" 42 | 43 | # Linux aarch64 44 | - os: ubuntu-22.04-arm 45 | arch: aarch64 46 | platform-id: manylinux_aarch64 47 | python: 310 48 | requires-python: ">=3.10,<3.11" 49 | - os: ubuntu-22.04-arm 50 | arch: aarch64 51 | platform-id: manylinux_aarch64 52 | python: 311 53 | requires-python: ">=3.11,<3.12" 54 | - os: ubuntu-22.04-arm 55 | arch: aarch64 56 | platform-id: manylinux_aarch64 57 | python: 312 58 | requires-python: ">=3.12,<3.13" 59 | - os: ubuntu-22.04-arm 60 | arch: aarch64 61 | platform-id: manylinux_aarch64 62 | python: 313 63 | requires-python: ">=3.13,<3.14" 64 | 65 | # macOS x86_64 66 | - os: macos-15-intel 67 | arch: x86_64 68 | platform-id: macosx_x86_64 69 | python: 310 70 | requires-python: ">=3.10,<3.11" 71 | - os: macos-15-intel 72 | arch: x86_64 73 | platform-id: macosx_x86_64 74 | python: 311 75 | requires-python: ">=3.11,<3.12" 76 | - os: macos-15-intel 77 | arch: x86_64 78 | platform-id: macosx_x86_64 79 | python: 312 80 | requires-python: ">=3.12,<3.13" 81 | - os: macos-15-intel 82 | arch: x86_64 83 | platform-id: macosx_x86_64 84 | python: 313 85 | requires-python: ">=3.13,<3.14" 86 | 87 | # macOS arm64 88 | - os: macos-14 89 | arch: arm64 90 | platform-id: macosx_arm64 91 | python: 310 92 | requires-python: ">=3.10,<3.11" 93 | - os: macos-14 94 | arch: arm64 95 | platform-id: macosx_arm64 96 | python: 311 97 | requires-python: ">=3.11,<3.12" 98 | - os: macos-14 99 | arch: arm64 100 | platform-id: macosx_arm64 101 | python: 312 102 | requires-python: ">=3.12,<3.13" 103 | - os: macos-14 104 | arch: arm64 105 | platform-id: macosx_arm64 106 | python: 313 107 | requires-python: ">=3.13,<3.14" 108 | 109 | # Windows AMD64 110 | - os: windows-2022 111 | arch: AMD64 112 | platform-id: win_amd64 113 | python: 310 114 | requires-python: ">=3.10,<3.11" 115 | - os: windows-2022 116 | arch: AMD64 117 | platform-id: win_amd64 118 | python: 311 119 | requires-python: ">=3.11,<3.12" 120 | - os: windows-2022 121 | arch: AMD64 122 | platform-id: win_amd64 123 | python: 312 124 | requires-python: ">=3.12,<3.13" 125 | - os: windows-2022 126 | arch: AMD64 127 | platform-id: win_amd64 128 | python: 313 129 | requires-python: ">=3.13,<3.14" 130 | 131 | steps: 132 | - uses: actions/checkout@v4 133 | with: 134 | fetch-depth: 0 135 | submodules: recursive 136 | 137 | - name: Add msbuild to PATH 138 | if: ${{ matrix.os == 'windows-latest'}} 139 | uses: microsoft/setup-msbuild@v2 140 | with: 141 | vs-version: '[17.13,17.14)' 142 | 143 | - name: Build wheels 144 | uses: pypa/cibuildwheel@v2.22.0 145 | env: 146 | VERSIONEER_CLOSEST_TAG_ONLY: 1 147 | CIBW_SKIP: pp* *i686 148 | CIBW_ARCHS: ${{ matrix.arch }} 149 | CIBW_PROJECT_REQUIRES_PYTHON: ${{ matrix.requires-python }} 150 | CIBW_TEST_REQUIRES: pytest requests pytest-asyncio pytest-timeout 151 | CIBW_BEFORE_ALL_LINUX: curl --proto 
'=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && source $HOME/.cargo/env 152 | CIBW_BEFORE_BUILD: pip install -r requirements.txt && make 153 | CIBW_BUILD_VERBOSITY: 1 154 | CIBW_ENVIRONMENT_LINUX: "PATH=$HOME/.cargo/bin:$PATH XLLAMACPP_BUILD_AARCH64=${{ matrix.arch == 'aarch64' && '1' || '' }}" 155 | CIBW_ENVIRONMENT_MACOS: "XLLAMACPP_BUILD_AARCH64=${{ matrix.arch == 'aarch64' && '1' || '' }}" 156 | CIBW_ENVIRONMENT_WINDOWS: "XLLAMACPP_BUILD_AARCH64=${{ matrix.arch == 'aarch64' && '1' || '' }}" 157 | with: 158 | package-dir: ./ 159 | 160 | - uses: actions/upload-artifact@v4 161 | with: 162 | name: wheel-${{ matrix.python }}-${{ matrix.platform-id }} 163 | path: wheelhouse/*.whl 164 | 165 | build_sdist: 166 | name: Build source distribution 167 | runs-on: ubuntu-latest 168 | steps: 169 | - uses: actions/checkout@v4 170 | with: 171 | fetch-depth: 0 172 | submodules: recursive 173 | 174 | - name: Build sdist 175 | run: pipx run build --sdist 176 | 177 | - uses: actions/upload-artifact@v4 178 | with: 179 | name: artifacts 180 | path: ./dist/*.tar.gz 181 | 182 | upload_pypi: 183 | needs: [build_wheels, build_sdist] 184 | runs-on: ubuntu-latest 185 | # upload to PyPI on every tag starting with 'v' 186 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') 187 | steps: 188 | - uses: actions/download-artifact@v4 189 | with: 190 | path: dist 191 | pattern: 'wheel-*' 192 | merge-multiple: true 193 | 194 | - uses: actions/download-artifact@v4 195 | with: 196 | path: dist 197 | name: artifacts 198 | 199 | - name: Publish to PyPI 200 | if: github.repository == 'xorbitsai/xllamacpp' 201 | uses: pypa/gh-action-pypi-publish@v1.12.4 202 | with: 203 | user: __token__ 204 | password: ${{ secrets.PYPI_PASSWORD }} 205 | 206 | - name: Publish to Test PyPI 207 | if: github.repository != 'xorbitsai/xllamacpp' 208 | uses: pypa/gh-action-pypi-publish@v1.12.4 209 | with: 210 | user: __token__ 211 | password: ${{ secrets.TEST_PYPI_PASSWORD }} 212 | verbose: true 213 | repository_url: https://test.pypi.org/legacy/ 214 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import os 4 | import sys 5 | import platform 6 | import subprocess 7 | from setuptools import Extension, setup 8 | 9 | from Cython.Build import cythonize 10 | 11 | # ----------------------------------------------------------------------------- 12 | # constants 13 | 14 | BUILD_CUDA = os.getenv("XLLAMACPP_BUILD_CUDA") 15 | BUILD_HIP = os.getenv("XLLAMACPP_BUILD_HIP") 16 | BUILD_VULKAN = os.getenv("XLLAMACPP_BUILD_VULKAN") 17 | NAME = "xllamacpp" 18 | # NAME = "xllamacpp-cuda12x" if BUILD_CUDA else "xllamacpp" 19 | CWD = os.path.dirname(os.path.abspath(__file__)) 20 | 21 | sys.path.insert(0, CWD) 22 | import versioneer 23 | 24 | VERSION = versioneer.get_version() 25 | 26 | PLATFORM = platform.system() 27 | 28 | LLAMACPP_LIBS_DIR = os.path.join(CWD, "src/llama.cpp/lib") 29 | 30 | DEFINE_MACROS = [] 31 | if PLATFORM == "Windows": 32 | EXTRA_COMPILE_ARGS = ["/std:c++17"] 33 | else: 34 | EXTRA_COMPILE_ARGS = ["-std=c++17"] 35 | if PLATFORM == "Darwin": 36 | EXTRA_COMPILE_ARGS.append("-mmacosx-version-min=12.0") 37 | EXTRA_LINK_ARGS = [] 38 | EXTRA_OBJECTS = [] 39 | INCLUDE_DIRS = [ 40 | "src/xllamacpp", 41 | os.path.join(CWD, "thirdparty/llama.cpp/include"), 42 | os.path.join(CWD, "thirdparty/llama.cpp/common"), 43 | os.path.join(CWD, "thirdparty/llama.cpp/ggml/include"), 44 
| os.path.join( 45 | CWD, "thirdparty/llama.cpp" 46 | ), # For including 'common/base64.hpp' in server/utils.hpp 47 | os.path.join( 48 | CWD, "thirdparty/llama.cpp/build/tools/server" 49 | ), # For including index.html.gz.hpp and loading.html.hpp 50 | os.path.join(CWD, "thirdparty/llama.cpp/tools/server"), 51 | os.path.join(CWD, "thirdparty/llama.cpp/tools/mtmd"), 52 | os.path.join(CWD, "thirdparty/llama.cpp/vendor"), 53 | ] 54 | LIBRARY_DIRS = [ 55 | LLAMACPP_LIBS_DIR, 56 | ] 57 | LIBRARIES = [] 58 | 59 | if PLATFORM == "Windows": 60 | LIBRARIES.extend( 61 | [ 62 | "common", 63 | "llama", 64 | "ggml", 65 | "ggml-base", 66 | "ggml-cpu", 67 | "mtmd", 68 | "cpp-httplib", 69 | "server-context", 70 | "llguidance", 71 | "Advapi32", 72 | "userenv", 73 | "ntdll", 74 | ] 75 | ) 76 | if BUILD_CUDA: 77 | LIBRARY_DIRS.extend([os.getenv("CUDA_PATH", "") + "\\Lib\\x64"]) 78 | LIBRARIES.extend(["ggml-cuda", "cudart", "cublas", "cublasLt", "cuda"]) 79 | if BUILD_VULKAN: 80 | LIBRARY_DIRS.extend([os.getenv("VULKAN_SDK", "") + "\\Lib"]) 81 | LIBRARIES.extend(["ggml-vulkan", "vulkan-1"]) 82 | else: 83 | LIBRARIES.extend(["pthread"]) 84 | EXTRA_OBJECTS.extend( 85 | [ 86 | f"{LLAMACPP_LIBS_DIR}/libserver-context.a", 87 | f"{LLAMACPP_LIBS_DIR}/libcpp-httplib.a", 88 | f"{LLAMACPP_LIBS_DIR}/libmtmd.a", 89 | f"{LLAMACPP_LIBS_DIR}/libcommon.a", 90 | f"{LLAMACPP_LIBS_DIR}/libllguidance.a", 91 | f"{LLAMACPP_LIBS_DIR}/libllama.a", 92 | f"{LLAMACPP_LIBS_DIR}/libggml.a", 93 | f"{LLAMACPP_LIBS_DIR}/libggml-cpu.a", 94 | f"{LLAMACPP_LIBS_DIR}/libggml-base.a", 95 | ] 96 | ) 97 | if BUILD_CUDA: 98 | EXTRA_OBJECTS.extend( 99 | [ 100 | f"{LLAMACPP_LIBS_DIR}/libggml-cuda.a", 101 | ] 102 | ) 103 | LIBRARY_DIRS.extend( 104 | [ 105 | os.getenv("CUDA_PATH", "") + "/lib/stubs", 106 | os.getenv("CUDA_PATH", "") + "/lib", 107 | ], 108 | ) 109 | LIBRARIES.extend(["cudart", "cublas", "cublasLt", "cuda"]) 110 | if BUILD_HIP: 111 | EXTRA_OBJECTS.extend( 112 | [ 113 | f"{LLAMACPP_LIBS_DIR}/libggml-hip.a", 114 | ] 115 | ) 116 | LIBRARY_DIRS.extend(["/opt/rocm/lib"]) 117 | LIBRARIES.extend(["amdhip64", "hipblas", "rocblas"]) 118 | if BUILD_VULKAN: 119 | EXTRA_OBJECTS.extend( 120 | [ 121 | f"{LLAMACPP_LIBS_DIR}/libggml-vulkan.a", 122 | ] 123 | ) 124 | LIBRARIES.extend(["vulkan"]) 125 | 126 | if PLATFORM == "Darwin": 127 | EXTRA_LINK_ARGS.append("-Wl,-rpath," + LLAMACPP_LIBS_DIR) 128 | os.environ["LDFLAGS"] = " ".join( 129 | [ 130 | "-framework Accelerate", 131 | "-framework Foundation", 132 | "-framework Metal", 133 | "-framework MetalKit", 134 | ] 135 | ) 136 | # Both the Intel and ARM platforms need to be linked with BLAS. 
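# The ggml libraries are produced as static archives by scripts/setup.sh
# (BUILD_SHARED_LIBS=OFF), so on macOS they are handed to the linker explicitly via
# extra_objects rather than through `libraries`; the BLAS archive is added for both
# Intel and Apple Silicon, and the Metal backend archive is appended just below for
# Apple Silicon only.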
137 | EXTRA_OBJECTS.extend( 138 | [ 139 | f"{LLAMACPP_LIBS_DIR}/libggml-blas.a", 140 | ] 141 | ) 142 | if platform.processor() == "arm": 143 | EXTRA_OBJECTS.extend( 144 | [ 145 | f"{LLAMACPP_LIBS_DIR}/libggml-metal.a", 146 | ] 147 | ) 148 | elif PLATFORM == "Linux": 149 | EXTRA_LINK_ARGS.extend(["-fopenmp", "-static-libgcc"]) 150 | # Check if BLAS is enabled in environment 151 | if os.path.exists(f"{LLAMACPP_LIBS_DIR}/libggml-blas.a"): 152 | print("BLAS is enabled, adding ggml-blas to link targets") 153 | EXTRA_OBJECTS.extend([f"{LLAMACPP_LIBS_DIR}/libggml-blas.a"]) 154 | EXTRA_LINK_ARGS.extend(["-lopenblas"]) 155 | 156 | INCLUDE_DIRS.append(os.path.join(CWD, "src/xllamacpp")) 157 | 158 | 159 | def mk_extension(name, sources, define_macros=None): 160 | return Extension( 161 | name=name, 162 | sources=sources, 163 | define_macros=define_macros if define_macros else [], 164 | include_dirs=INCLUDE_DIRS, 165 | libraries=LIBRARIES, 166 | library_dirs=LIBRARY_DIRS, 167 | extra_objects=EXTRA_OBJECTS, 168 | extra_compile_args=EXTRA_COMPILE_ARGS, 169 | extra_link_args=EXTRA_LINK_ARGS, 170 | language="c++", 171 | ) 172 | 173 | 174 | # ---------------------------------------------------------------------------- 175 | # COMMON SETUP CONFIG 176 | 177 | common = { 178 | "name": NAME, 179 | "version": VERSION, 180 | "description": "A cython wrapper of the llama.cpp inference engine.", 181 | "python_requires": ">=3.9", 182 | "cmdclass": versioneer.get_cmdclass(), 183 | "license": "MIT", 184 | # "include_package_data": True, 185 | } 186 | 187 | 188 | # forces cythonize in this case 189 | subprocess.call("cythonize *.pyx", cwd="src/xllamacpp", shell=True) 190 | 191 | if not os.path.exists("MANIFEST.in"): 192 | with open("MANIFEST.in", "w") as f: 193 | f.write("exclude src/xllamacpp/*.pxd\n") 194 | f.write("exclude src/xllamacpp/*.pyx\n") 195 | f.write("exclude src/xllamacpp/*.cpp\n") 196 | f.write("exclude src/xllamacpp/*.h\n") 197 | f.write("exclude src/xllamacpp/py.typed\n") 198 | 199 | extensions = [ 200 | mk_extension( 201 | "xllamacpp.xllamacpp", 202 | sources=[ 203 | "src/xllamacpp/xllamacpp.pyx", 204 | "src/xllamacpp/server.cpp", 205 | "thirdparty/llama.cpp/tools/server/server-models.cpp", 206 | "thirdparty/llama.cpp/tools/server/server-http.cpp", 207 | ], 208 | ), 209 | ] 210 | 211 | setup( 212 | **common, 213 | ext_modules=cythonize( 214 | extensions, 215 | compiler_directives={ 216 | "language_level": "3", 217 | "embedsignature": False, # default: False 218 | "emit_code_comments": False, # default: True 219 | "warn.unused": True, # default: False 220 | }, 221 | ), 222 | package_dir={"": "src"}, 223 | ) 224 | -------------------------------------------------------------------------------- /scripts/.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | AccessModifierOffset: -4 4 | AlignAfterOpenBracket: DontAlign 5 | AlignArrayOfStructures: None 6 | AlignConsecutiveAssignments: 7 | Enabled: false 8 | AcrossEmptyLines: false 9 | AcrossComments: false 10 | AlignCompound: false 11 | AlignFunctionPointers: false 12 | PadOperators: true 13 | AlignConsecutiveBitFields: 14 | Enabled: false 15 | AcrossEmptyLines: false 16 | AcrossComments: false 17 | AlignCompound: false 18 | AlignFunctionPointers: false 19 | PadOperators: false 20 | AlignConsecutiveDeclarations: 21 | Enabled: false 22 | AcrossEmptyLines: false 23 | AcrossComments: false 24 | AlignCompound: false 25 | AlignFunctionPointers: false 26 | PadOperators: false 27 | 
AlignConsecutiveMacros: 28 | Enabled: false 29 | AcrossEmptyLines: false 30 | AcrossComments: false 31 | AlignCompound: false 32 | AlignFunctionPointers: false 33 | PadOperators: false 34 | AlignConsecutiveShortCaseStatements: 35 | Enabled: false 36 | AcrossEmptyLines: false 37 | AcrossComments: false 38 | AlignCaseArrows: false 39 | AlignCaseColons: false 40 | AlignConsecutiveTableGenBreakingDAGArgColons: 41 | Enabled: false 42 | AcrossEmptyLines: false 43 | AcrossComments: false 44 | AlignCompound: false 45 | AlignFunctionPointers: false 46 | PadOperators: false 47 | AlignConsecutiveTableGenCondOperatorColons: 48 | Enabled: false 49 | AcrossEmptyLines: false 50 | AcrossComments: false 51 | AlignCompound: false 52 | AlignFunctionPointers: false 53 | PadOperators: false 54 | AlignConsecutiveTableGenDefinitionColons: 55 | Enabled: false 56 | AcrossEmptyLines: false 57 | AcrossComments: false 58 | AlignCompound: false 59 | AlignFunctionPointers: false 60 | PadOperators: false 61 | AlignEscapedNewlines: Left 62 | AlignOperands: DontAlign 63 | AlignTrailingComments: 64 | Kind: Never 65 | OverEmptyLines: 0 66 | # AllowAllArgumentsOnNextLine: false 67 | # AllowAllParametersOfDeclarationOnNextLine: false 68 | AllowAllArgumentsOnNextLine: true 69 | AllowAllParametersOfDeclarationOnNextLine: true 70 | AllowBreakBeforeNoexceptSpecifier: Never 71 | AllowShortBlocksOnASingleLine: Empty 72 | AllowShortCaseExpressionOnASingleLine: true 73 | AllowShortCaseLabelsOnASingleLine: false 74 | AllowShortCompoundRequirementOnASingleLine: true 75 | AllowShortEnumsOnASingleLine: true 76 | AllowShortFunctionsOnASingleLine: None 77 | AllowShortIfStatementsOnASingleLine: Never 78 | AllowShortLambdasOnASingleLine: All 79 | AllowShortLoopsOnASingleLine: false 80 | AlwaysBreakAfterDefinitionReturnType: None 81 | AlwaysBreakBeforeMultilineStrings: false 82 | AttributeMacros: 83 | - __capability 84 | BinPackArguments: true 85 | BinPackParameters: true 86 | BitFieldColonSpacing: Both 87 | BraceWrapping: 88 | AfterCaseLabel: false 89 | AfterClass: false 90 | AfterControlStatement: Never 91 | AfterEnum: false 92 | AfterExternBlock: false 93 | AfterFunction: true 94 | AfterNamespace: false 95 | AfterObjCDeclaration: false 96 | AfterStruct: false 97 | AfterUnion: false 98 | BeforeCatch: false 99 | BeforeElse: false 100 | BeforeLambdaBody: false 101 | BeforeWhile: false 102 | IndentBraces: false 103 | SplitEmptyFunction: true 104 | SplitEmptyRecord: true 105 | SplitEmptyNamespace: true 106 | BreakAdjacentStringLiterals: true 107 | BreakAfterAttributes: Leave 108 | BreakAfterJavaFieldAnnotations: false 109 | BreakAfterReturnType: None 110 | BreakArrays: true 111 | BreakBeforeBinaryOperators: All 112 | BreakBeforeConceptDeclarations: Always 113 | BreakBeforeBraces: WebKit 114 | BreakBeforeInlineASMColon: OnlyMultiline 115 | BreakBeforeTernaryOperators: true 116 | BreakConstructorInitializers: BeforeComma 117 | BreakFunctionDefinitionParameters: false 118 | BreakInheritanceList: BeforeColon 119 | BreakStringLiterals: true 120 | BreakTemplateDeclarations: MultiLine 121 | ColumnLimit: 300 122 | CommentPragmas: '^ IWYU pragma:' 123 | CompactNamespaces: false 124 | ConstructorInitializerIndentWidth: 4 125 | ContinuationIndentWidth: 4 126 | Cpp11BracedListStyle: false 127 | DerivePointerAlignment: false 128 | DisableFormat: false 129 | EmptyLineAfterAccessModifier: Never 130 | EmptyLineBeforeAccessModifier: LogicalBlock 131 | ExperimentalAutoDetectBinPacking: false 132 | FixNamespaceComments: false 133 | ForEachMacros: 134 | - foreach 
135 | - Q_FOREACH 136 | - BOOST_FOREACH 137 | IfMacros: 138 | - KJ_IF_MAYBE 139 | IncludeBlocks: Preserve 140 | IncludeCategories: 141 | - Regex: '^"(llvm|llvm-c|clang|clang-c)/' 142 | Priority: 2 143 | SortPriority: 0 144 | CaseSensitive: false 145 | - Regex: '^(<|"(gtest|gmock|isl|json)/)' 146 | Priority: 3 147 | SortPriority: 0 148 | CaseSensitive: false 149 | - Regex: '.*' 150 | Priority: 1 151 | SortPriority: 0 152 | CaseSensitive: false 153 | IncludeIsMainRegex: '(Test)?$' 154 | IncludeIsMainSourceRegex: '' 155 | IndentAccessModifiers: false 156 | IndentCaseBlocks: false 157 | IndentCaseLabels: false 158 | IndentExternBlock: AfterExternBlock 159 | IndentGotoLabels: true 160 | IndentPPDirectives: None 161 | IndentRequiresClause: true 162 | IndentWidth: 4 163 | IndentWrappedFunctionNames: false 164 | InsertBraces: false 165 | InsertNewlineAtEOF: false 166 | InsertTrailingCommas: None 167 | IntegerLiteralSeparator: 168 | Binary: 0 169 | BinaryMinDigits: 0 170 | Decimal: 0 171 | DecimalMinDigits: 0 172 | Hex: 0 173 | HexMinDigits: 0 174 | JavaScriptQuotes: Leave 175 | JavaScriptWrapImports: true 176 | KeepEmptyLines: 177 | AtEndOfFile: false 178 | AtStartOfBlock: true 179 | AtStartOfFile: true 180 | LambdaBodyIndentation: Signature 181 | LineEnding: DeriveLF 182 | MacroBlockBegin: '' 183 | MacroBlockEnd: '' 184 | MainIncludeChar: Quote 185 | MaxEmptyLinesToKeep: 1 186 | NamespaceIndentation: Inner 187 | ObjCBinPackProtocolList: Auto 188 | ObjCBlockIndentWidth: 4 189 | ObjCBreakBeforeNestedBlockParam: true 190 | ObjCSpaceAfterProperty: true 191 | ObjCSpaceBeforeProtocolList: true 192 | PackConstructorInitializers: BinPack 193 | PenaltyBreakAssignment: 2 194 | PenaltyBreakBeforeFirstCallParameter: 19 195 | PenaltyBreakComment: 300 196 | PenaltyBreakFirstLessLess: 120 197 | PenaltyBreakOpenParenthesis: 0 198 | PenaltyBreakScopeResolution: 500 199 | PenaltyBreakString: 1000 200 | PenaltyBreakTemplateDeclaration: 10 201 | PenaltyExcessCharacter: 1000000 202 | PenaltyIndentedWhitespace: 0 203 | PenaltyReturnTypeOnItsOwnLine: 60 204 | PointerAlignment: Left 205 | PPIndentWidth: -1 206 | QualifierAlignment: Leave 207 | ReferenceAlignment: Pointer 208 | ReflowComments: true 209 | RemoveBracesLLVM: false 210 | RemoveParentheses: Leave 211 | RemoveSemicolon: false 212 | RequiresClausePosition: OwnLine 213 | RequiresExpressionIndentation: OuterScope 214 | SeparateDefinitionBlocks: Leave 215 | ShortNamespaceLines: 1 216 | SkipMacroDefinitionBody: false 217 | SortIncludes: CaseSensitive 218 | SortJavaStaticImport: Before 219 | SortUsingDeclarations: LexicographicNumeric 220 | SpaceAfterCStyleCast: false 221 | SpaceAfterLogicalNot: false 222 | SpaceAfterTemplateKeyword: true 223 | SpaceAroundPointerQualifiers: Default 224 | SpaceBeforeAssignmentOperators: true 225 | SpaceBeforeCaseColon: false 226 | SpaceBeforeCpp11BracedList: true 227 | SpaceBeforeCtorInitializerColon: true 228 | SpaceBeforeInheritanceColon: true 229 | SpaceBeforeJsonColon: false 230 | SpaceBeforeParens: ControlStatements 231 | SpaceBeforeParensOptions: 232 | AfterControlStatements: true 233 | AfterForeachMacros: true 234 | AfterFunctionDefinitionName: false 235 | AfterFunctionDeclarationName: false 236 | AfterIfMacros: true 237 | AfterOverloadedOperator: false 238 | AfterPlacementOperator: true 239 | AfterRequiresInClause: false 240 | AfterRequiresInExpression: false 241 | BeforeNonEmptyParentheses: false 242 | SpaceBeforeRangeBasedForLoopColon: true 243 | SpaceBeforeSquareBrackets: false 244 | SpaceInEmptyBlock: true 245 | 
SpacesBeforeTrailingComments: 1 246 | SpacesInAngles: Never 247 | SpacesInContainerLiterals: true 248 | SpacesInLineCommentPrefix: 249 | Minimum: 1 250 | Maximum: -1 251 | SpacesInParens: Never 252 | SpacesInParensOptions: 253 | ExceptDoubleParentheses: false 254 | InCStyleCasts: false 255 | InConditionalStatements: false 256 | InEmptyParentheses: false 257 | Other: false 258 | SpacesInSquareBrackets: false 259 | Standard: Latest 260 | StatementAttributeLikeMacros: 261 | - Q_EMIT 262 | StatementMacros: 263 | - Q_UNUSED 264 | - QT_REQUIRE_VERSION 265 | TableGenBreakInsideDAGArg: DontBreak 266 | TabWidth: 4 267 | UseTab: Never 268 | VerilogBreakBetweenInstancePorts: true 269 | WhitespaceSensitiveMacros: 270 | - BOOST_PP_STRINGIZE 271 | - CF_SWIFT_NAME 272 | - NS_SWIFT_NAME 273 | - PP_STRINGIZE 274 | - STRINGIZE 275 | ... 276 | 277 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | xorbits 3 | 4 | # xllamacpp - a Python wrapper of llama.cpp 5 | 6 | [![PyPI Latest Release](https://img.shields.io/pypi/v/xllamacpp.svg?style=for-the-badge)](https://pypi.org/project/xllamacpp/) 7 | [![License](https://img.shields.io/pypi/l/xllamacpp.svg?style=for-the-badge)](https://github.com/xorbitsai/inference/blob/main/LICENSE) 8 | [![Discord](https://img.shields.io/badge/join_Discord-5462eb.svg?logo=discord&style=for-the-badge&logoColor=%23f5f5f5)](https://discord.gg/Xw9tszSkr5) 9 | [![Twitter](https://img.shields.io/twitter/follow/xorbitsio?logo=x&style=for-the-badge)](https://twitter.com/xorbitsio) 10 | 11 |
12 |
13 | 14 | This project forks from [cyllama](https://github.com/shakfu/cyllama) and provides a Python wrapper for @ggerganov's [llama.cpp](https://github.com/ggerganov/llama.cpp), which is likely the most active open-source compiled LLM inference engine. 15 | 16 | ## Comparison with llama-cpp-python 17 | 18 | The following table provides an overview of the current implementations / features: 19 | 20 | | implementations / features | xllamacpp | llama-cpp-python | 21 | |:---------------------------|:-------------------:|:--------------------------------:| 22 | | Wrapper-type | cython | ctypes | 23 | | API | Server & Params API | Llama API | 24 | | Server implementation | C++ | Python through wrapped Llama API | 25 | | Continuous batching | yes | no | 26 | | Thread safe | yes | no | 27 | | Release package | prebuilt | build during installation | 28 | 29 | It goes without saying that any help / collaboration / contributions to accelerate the above would be welcome! 30 | 31 | ## Wrapping Guidelines 32 | 33 | As the intent is to provide a very thin wrapping layer and play to the strengths of the original C++ library as well as Python, the approach to wrapping intentionally adopts the following guidelines: 34 | 35 | - In general, key structs are implemented as cython extension classes with related functions implemented as methods of said classes. 36 | 37 | - Be as consistent as possible with llama.cpp's naming of its API elements, except when it makes sense to shorten function names which are used as methods. 38 | 39 | - Minimize non-wrapper Python code. 40 | 41 | ## Usage 42 | 43 | Here is a simple example of how to use `xllamacpp` to get embeddings for a list of texts. For this example, you'll need an embedding model like [Qwen3-Embedding-0.6B-Q8_0.gguf](https://huggingface.co/Qwen/Qwen3-Embedding-0.6B-GGUF/resolve/main/Qwen3-Embedding-0.6B-Q8_0.gguf). 44 | 45 | ```python 46 | import xllamacpp as xlc 47 | 48 | params = xlc.CommonParams() 49 | 50 | params.model.path = "Qwen3-Embedding-0.6B-Q8_0.gguf" 51 | params.embedding = True 52 | params.pooling_type = xlc.llama_pooling_type.LLAMA_POOLING_TYPE_LAST 53 | 54 | server = xlc.Server(params) 55 | 56 | embedding_input = { 57 | "input": [ 58 | "I believe the meaning of life is", 59 | "This is a test", 60 | ], 61 | "model": "My Qwen3 Model", 62 | } 63 | 64 | result = server.handle_embeddings(embedding_input) 65 | 66 | print(result) 67 | 68 | ``` 69 | 70 | Output: 71 | 72 | ```python 73 | {'data': [{'embedding': [-0.006413215305656195, 74 | -0.05906733125448227, 75 | ... 76 | -0.05887744203209877], 77 | 'index': 0, 78 | 'object': 'embedding'}, 79 | {'embedding': [0.041170503944158554, 80 | -0.004472420550882816, 81 | ... 82 | 0.008314250037074089], 83 | 'index': 1, 84 | 'object': 'embedding'}], 85 | 'model': 'My Qwen3 Model', 86 | 'object': 'list', 87 | 'usage': {'prompt_tokens': 11, 'total_tokens': 11}} 88 | ``` 89 | 90 | ## OpenAI API Compatible HTTP Server 91 | 92 | The server provides OpenAI API compatible endpoints. For a complete list of available API endpoints, see the [llama.cpp server documentation](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md#api-endpoints).
You can use the OpenAI Python client: 93 | 94 | ```python 95 | import xllamacpp as xlc 96 | from openai import OpenAI 97 | 98 | # Start server 99 | params = xlc.CommonParams() 100 | params.model.path = "Llama-3.2-1B-Instruct-Q8_0.gguf" 101 | server = xlc.Server(params) 102 | 103 | # Connect using OpenAI client 104 | client = OpenAI( 105 | base_url=server.listening_address + "/v1", 106 | api_key="not-required" # No API key needed for local server 107 | ) 108 | 109 | # Make chat completion request 110 | response = client.chat.completions.create( 111 | model="local-model", 112 | messages=[{"role": "user", "content": "What is the capital of France?"}], 113 | max_tokens=10 114 | ) 115 | 116 | print(response.choices[0].message.content) 117 | ``` 118 | 119 | ## Prerequisites for Prebuilt Wheels 120 | 121 | Before pip installing `xllamacpp`, please ensure your system meets the following requirements based on your build type: 122 | 123 | - **CPU (aarch64)**: 124 | - Requires ARMv8-A or later architecture 125 | - For best performance, build from source if your CPU supports advanced instruction sets 126 | 127 | - **CUDA (Linux)**: 128 | - Requires glibc 2.35 or later 129 | - Compatible NVIDIA GPU with appropriate drivers (CUDA 12.4 or 12.8) 130 | 131 | - **ROCm (Linux)**: 132 | - Requires glibc 2.35 or later 133 | - Requires gcc 10 or later (ROCm libraries have this dependency) 134 | - Compatible AMD GPU with ROCm support (ROCm 6.3.4 or 6.4.1) 135 | 136 | - **Vulkan (Linux/Windows, Intel/AMD/NVIDIA where supported)**: 137 | - Install the Vulkan SDK and GPU drivers with Vulkan support 138 | - Linux users may need distro packages and the LunarG SDK 139 | - macOS Intel is supported via Vulkan; Apple Silicon Vulkan is not supported in this project 140 | 141 | ## Install 142 | 143 | **Note on Performance and Compatibility** 144 | 145 | For maximum performance, you can build `xllamacpp` from source to optimize for your specific native CPU architecture. The pre-built wheels are designed for broad compatibility. 146 | 147 | Specifically, the `aarch64` wheels are built for the `armv8-a` architecture. This ensures they run on a wide range of ARM64 devices, but it means that more advanced CPU instruction sets (like SVE) are not enabled. If your CPU supports these advanced features, building from source will provide better performance. 
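Once you have installed a wheel (see the install commands below), you can verify which backends and devices the build actually detects. This is a minimal sketch based on the `get_system_info` and `get_device_info` helpers exercised in this repo's test suite; apart from the `name` field, the exact keys in each device entry may vary by backend and version.

```python
import xllamacpp as xlc

# Compiled-in capabilities (CPU features, BLAS, CUDA/HIP/Vulkan/Metal, ...)
print(xlc.get_system_info())

# Devices visible to this build; each entry is a dict with at least a "name"
for device in xlc.get_device_info():
    print(device["name"], device)
```

If a GPU wheel was installed correctly, the corresponding device should appear in this list; otherwise you are running the CPU-only build.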
148 | 149 | - From PyPI for `CPU` or `Mac`: 150 | 151 | ```sh 152 | pip install -U xllamacpp 153 | ``` 154 | 155 | - From the GitHub-hosted PyPI index for `CUDA` (use `--force-reinstall` to replace the installed CPU version): 156 | 157 | - CUDA 12.4 158 | ```sh 159 | pip install xllamacpp --force-reinstall --index-url https://xorbitsai.github.io/xllamacpp/whl/cu124 160 | ``` 161 | 162 | - CUDA 12.8 163 | ```sh 164 | pip install xllamacpp --force-reinstall --index-url https://xorbitsai.github.io/xllamacpp/whl/cu128 165 | ``` 166 | 167 | - From the GitHub-hosted PyPI index for `HIP` on AMD GPUs (use `--force-reinstall` to replace the installed CPU version): 168 | 169 | - ROCm 6.3.4 170 | ```sh 171 | pip install xllamacpp --force-reinstall --index-url https://xorbitsai.github.io/xllamacpp/whl/rocm-6.3.4 172 | ``` 173 | 174 | - ROCm 6.4.1 175 | ```sh 176 | pip install xllamacpp --force-reinstall --index-url https://xorbitsai.github.io/xllamacpp/whl/rocm-6.4.1 177 | ``` 178 | 179 | - From the GitHub-hosted PyPI index for `Vulkan` (use `--force-reinstall` to replace the installed CPU version): 180 | 181 | ```sh 182 | pip install xllamacpp --force-reinstall --index-url https://xorbitsai.github.io/xllamacpp/whl/vulkan 183 | ``` 184 | 185 | ## Build from Source 186 | 187 | ### (Optional) Preparation 188 | 189 | - CUDA 190 | 191 | This provides GPU acceleration using an NVIDIA GPU. Make sure to have the [CUDA toolkit](https://developer.nvidia.com/cuda-toolkit) installed. 192 | 193 | #### Download directly from NVIDIA 194 | You may find the official downloads here: [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads). 195 | 196 | 197 | #### Compile and run inside a Fedora Toolbox Container 198 | We also have a [guide](./backend/CUDA-FEDORA.md) for setting up the CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/). 199 | 200 | **Recommended for:** 201 | - ***Necessary*** for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/), such as [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/). 202 | - (there are no supported CUDA packages for these systems) 203 | - ***Necessary*** for users whose host is not a [Supported Nvidia CUDA Release Platform](https://developer.nvidia.com/cuda-downloads). 204 | - (for example, you may have [Fedora 42 Beta](https://fedoramagazine.org/announcing-fedora-linux-42-beta/) as your host operating system) 205 | - ***Convenient*** for those running [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde) who want to keep their host system clean. 206 | - *Optionally*, toolbox packages are available for: [Arch Linux](https://archlinux.org/), [Red Hat Enterprise Linux >= 8.5](https://www.redhat.com/en/technologies/linux-platforms/enterprise-linux), or [Ubuntu](https://ubuntu.com/download) 207 | 208 | - HIP 209 | 210 | This provides GPU acceleration on HIP-supported AMD GPUs. 211 | Make sure to have ROCm installed. 212 | You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick). 213 | 214 | 215 | Or you can try to build inside the [ROCm docker container](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html). 216 | 217 | - Vulkan 218 | 219 | Install the Vulkan SDK and drivers for your platform.
220 | - Linux: use your distro packages and/or the [LunarG Vulkan SDK](https://vulkan.lunarg.com/sdk/home). 221 | - Windows: install [LunarG Vulkan SDK](https://vulkan.lunarg.com/sdk/home) and vendor GPU drivers. 222 | - macOS: Intel only; Apple Silicon is not supported for Vulkan in this project. 223 | 224 | ### Build `xllamacpp` 225 | 226 | 1. Make sure you have a recent version of `python3` (tested on Python 3.12). 227 | 228 | 2. Install the Rust toolchain (required for building): 229 | 230 | ```sh 231 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh 232 | ``` 233 | 234 | For more installation options, see the [rustup installation guide](https://rustup.rs/). 235 | 236 | 3. Clone the latest version of `xllamacpp` and initialize its submodules: 237 | 238 | ```sh 239 | git clone git@github.com:xorbitsai/xllamacpp.git 240 | cd xllamacpp 241 | git submodule init 242 | git submodule update 243 | ``` 244 | 245 | 4. Install the dependencies `cython` and `setuptools`, plus `pytest` for testing: 246 | 247 | ```sh 248 | pip install -r requirements.txt 249 | ``` 250 | 251 | 5. Select a backend via environment variables and build. Examples: 252 | 253 | - CPU (default): 254 | ```sh 255 | make 256 | ``` 257 | 258 | - CUDA: 259 | ```sh 260 | export XLLAMACPP_BUILD_CUDA=1 261 | make 262 | ``` 263 | 264 | - HIP (AMD): 265 | ```sh 266 | export XLLAMACPP_BUILD_HIP=1 267 | make 268 | ``` 269 | 270 | - Vulkan: 271 | ```sh 272 | export XLLAMACPP_BUILD_VULKAN=1 273 | make 274 | ``` 275 | 276 | - Enable BLAS (optional): 277 | ```sh 278 | export CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" 279 | make 280 | ``` 281 | 282 | ## Testing 283 | 284 | The `tests` directory in this repo provides extensive examples of using `xllamacpp`. 285 | 286 | However, as a first step, you should download a smallish LLM in the `.gguf` format from [huggingface](https://huggingface.co/models?search=gguf). A good model to start with, and the one assumed by the tests, is [Llama-3.2-1B-Instruct-Q8_0.gguf](https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q8_0.gguf). `xllamacpp` expects models to be stored in a `models` folder in the cloned `xllamacpp` directory. To create the `models` directory if it doesn't exist and download this model, you can just type: 287 | 288 | ```sh 289 | make download 290 | ``` 291 | 292 | This basically just does: 293 | 294 | ```sh 295 | cd xllamacpp 296 | mkdir models && cd models 297 | wget https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q8_0.gguf 298 | ``` 299 | 300 | Now you can test it using `llama-cli` or `llama-simple`: 301 | 302 | ```sh 303 | bin/llama-cli -c 512 -n 32 -m models/Llama-3.2-1B-Instruct-Q8_0.gguf \ 304 | -p "Is mathematics discovered or invented?"
305 | ``` 306 | 307 | You can also run the test suite with `pytest` by typing `pytest` or: 308 | 309 | ```sh 310 | make test 311 | ``` 312 | -------------------------------------------------------------------------------- /tests/test_params.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pytest import approx 3 | 4 | import xllamacpp as xlc 5 | 6 | 7 | def test_common_params_sampling(): 8 | with pytest.raises(Exception, match="construct"): 9 | xlc.CommonParamsSampling() 10 | params = xlc.CommonParams() 11 | assert params.sampling.timing_per_token is False 12 | assert params.sampling.user_sampling_config == 0 13 | # assert params.seed == xlc.LLAMA_DEFAULT_SEED 14 | # assert params.n_prev == 64 15 | # assert params.n_probs == 0 16 | # assert params.min_keep == 0 17 | # assert params.top_k == 40 18 | # assert params.top_p == approx(0.95) 19 | # assert params.min_p == approx(0.05) 20 | # assert params.xtc_probability == 0.00 21 | # assert params.xtc_threshold == approx(0.10) 22 | # assert params.typ_p == approx(1.00) 23 | # assert params.temp == approx(0.80) 24 | # assert params.dynatemp_range == 0.00 25 | # assert params.dynatemp_exponent == approx(1.00) 26 | # assert params.penalty_last_n == 64 27 | # assert params.penalty_repeat == approx(1.00) 28 | # assert params.penalty_freq == 0.00 29 | # assert params.penalty_present == 0.00 30 | # assert params.dry_multiplier == 0.0 31 | # assert params.dry_base == approx(1.75) 32 | # assert params.dry_allowed_length == 2 33 | # assert params.dry_penalty_last_n == -1 34 | # assert params.mirostat == 0 35 | # assert params.mirostat_tau == approx(5.00) 36 | # assert params.mirostat_eta == approx(0.10) 37 | # assert params.ignore_eos is False 38 | # assert params.no_perf is False 39 | 40 | 41 | def test_enum_values(): 42 | assert xlc.GGML_MAX_N_THREADS == 512 43 | assert xlc.GGML_ROPE_TYPE_VISION == 24 44 | assert xlc.ggml_sched_priority.GGML_SCHED_PRIO_REALTIME == 3 45 | assert xlc.ggml_numa_strategy.GGML_NUMA_STRATEGY_COUNT == 5 46 | assert xlc.ggml_type.GGML_TYPE_COUNT == 40 47 | assert xlc.ggml_backend_dev_type.GGML_BACKEND_DEVICE_TYPE_ACCEL == 3 48 | assert xlc.llama_rope_scaling_type.LLAMA_ROPE_SCALING_TYPE_MAX_VALUE == 3 49 | assert xlc.llama_pooling_type.LLAMA_POOLING_TYPE_RANK == 4 50 | assert xlc.llama_attention_type.LLAMA_ATTENTION_TYPE_NON_CAUSAL == 1 51 | assert xlc.llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_ENABLED == 1 52 | assert xlc.llama_split_mode.LLAMA_SPLIT_MODE_ROW == 2 53 | assert xlc.llama_model_kv_override_type.LLAMA_KV_OVERRIDE_TYPE_STR == 3 54 | assert xlc.dimre_method.DIMRE_METHOD_MEAN == 1 55 | assert xlc.common_conversation_mode.COMMON_CONVERSATION_MODE_AUTO == 2 56 | assert xlc.common_grammar_trigger_type.COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL == 3 57 | assert xlc.common_reasoning_format.COMMON_REASONING_FORMAT_DEEPSEEK == 3 58 | assert xlc.common_params_sampling_config.COMMON_PARAMS_SAMPLING_CONFIG_TEMP == 64 59 | 60 | 61 | def test_common_params(): 62 | params = xlc.CommonParams() 63 | assert params.n_predict == -1 64 | assert params.n_ctx == 4096 65 | assert params.n_batch == 2048 66 | assert params.n_ubatch == 512 67 | assert params.n_keep == 0 68 | assert params.n_chunks == -1 69 | assert params.n_parallel == 1 70 | assert params.n_sequences == 1 71 | # assert params.p_split == approx(0.1) 72 | assert params.n_gpu_layers == -1 73 | # assert params.n_gpu_layers_draft == -1 74 | assert params.main_gpu == 0 75 | assert params.tensor_split == [0] * 128 76 | 
assert params.grp_attn_n == 1 77 | assert params.grp_attn_w == 512 78 | assert params.n_print == -1 79 | assert params.rope_freq_base == 0.0 80 | assert params.rope_freq_scale == 0.0 81 | assert params.yarn_ext_factor == approx(-1.0) 82 | assert params.yarn_attn_factor == approx(-1.0) 83 | assert params.yarn_beta_fast == approx(-1.0) 84 | assert params.yarn_beta_slow == approx(-1.0) 85 | assert params.yarn_orig_ctx == 0 86 | 87 | assert params.cpuparams.n_threads == -1 88 | assert params.cpuparams.cpumask == [False] * xlc.GGML_MAX_N_THREADS 89 | assert params.cpuparams.mask_valid is False 90 | assert params.cpuparams.priority == xlc.ggml_sched_priority.GGML_SCHED_PRIO_NORMAL 91 | assert params.cpuparams.strict_cpu is False 92 | assert params.cpuparams.poll == 50 93 | 94 | # assert params.cpuparams_batch == 95 | # assert params.draft_cpuparams == 96 | # assert params.draft_cpuparams_batch === 97 | 98 | # assert params.cb_eval == nullptr; 99 | # assert params.cb_eval_user_data == nullptr; 100 | 101 | assert params.numa == xlc.ggml_numa_strategy.GGML_NUMA_STRATEGY_DISABLED 102 | assert params.split_mode == xlc.llama_split_mode.LLAMA_SPLIT_MODE_LAYER 103 | assert ( 104 | params.rope_scaling_type 105 | == xlc.llama_rope_scaling_type.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED 106 | ) 107 | assert params.pooling_type == xlc.llama_pooling_type.LLAMA_POOLING_TYPE_UNSPECIFIED 108 | assert ( 109 | params.attention_type 110 | == xlc.llama_attention_type.LLAMA_ATTENTION_TYPE_UNSPECIFIED 111 | ) 112 | assert ( 113 | params.flash_attn_type == xlc.llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_AUTO 114 | ) 115 | 116 | # common_sampler_params sparams 117 | 118 | assert params.model.path == "" 119 | assert params.model.url == "" 120 | assert params.model.hf_repo == "" 121 | assert params.model.hf_file == "" 122 | assert params.model.docker_repo == "" 123 | assert params.model.name == "" 124 | assert params.model_alias == "" 125 | assert params.hf_token == "" 126 | assert params.prompt == "" 127 | assert params.prompt_file == "" 128 | assert params.path_prompt_cache == "" 129 | assert params.input_prefix == "" 130 | assert params.input_suffix == "" 131 | assert params.lookup_cache_static == "" 132 | assert params.lookup_cache_dynamic == "" 133 | assert params.logits_file == "" 134 | 135 | assert params.verbosity == 3 136 | assert params.control_vector_layer_start == -1 137 | assert params.control_vector_layer_end == -1 138 | assert params.ppl_stride == 0 139 | assert params.ppl_output_type == 0 140 | 141 | assert params.hellaswag is False 142 | assert params.hellaswag_tasks == 400 143 | assert params.winogrande is False 144 | assert params.winogrande_tasks == 0 145 | assert params.multiple_choice is False 146 | assert params.multiple_choice_tasks == 0 147 | assert params.kl_divergence is False 148 | assert params.usage is False 149 | assert params.use_color is False 150 | assert params.special is False 151 | assert params.interactive is False 152 | assert params.prompt_cache_all is False 153 | assert params.prompt_cache_ro is False 154 | assert params.escape is True 155 | assert params.multiline_input is False 156 | assert params.simple_io is False 157 | assert params.cont_batching is True 158 | assert params.no_perf is False 159 | assert params.show_timings is True 160 | assert params.ctx_shift is False 161 | assert params.swa_full is False 162 | assert params.kv_unified is False 163 | assert params.input_prefix_bos is False 164 | assert params.use_mmap is True 165 | assert params.use_mlock is False 166 | assert 
params.verbose_prompt is False 167 | assert params.display_prompt is True 168 | assert params.no_kv_offload is False 169 | assert params.warmup is True 170 | assert params.check_tensors is False 171 | assert params.no_op_offload is False 172 | assert params.no_extra_bufts is False 173 | assert params.no_host is False 174 | 175 | assert params.cache_type_k == xlc.ggml_type.GGML_TYPE_F16 176 | assert params.cache_type_v == xlc.ggml_type.GGML_TYPE_F16 177 | 178 | assert params.mmproj.path == "" 179 | assert params.mmproj_use_gpu is True 180 | assert params.no_mmproj is False 181 | assert params.image == [] 182 | assert params.image_min_tokens == -1 183 | assert params.image_max_tokens == -1 184 | 185 | assert params.embedding is False 186 | assert params.embd_normalize == 2 187 | assert params.embd_out == "" 188 | assert params.embd_sep == "\n" 189 | 190 | assert params.port == 0 191 | assert params.timeout_read == 600 192 | assert params.timeout_write == 600 193 | assert params.n_threads_http == -1 194 | assert params.n_cache_reuse == 0 195 | assert params.n_ctx_checkpoints == 8 196 | assert params.cache_ram_mib == 8192 197 | 198 | assert params.hostname == "127.0.0.1" 199 | assert params.public_path == "" 200 | assert params.api_prefix == "" 201 | assert params.chat_template == "" 202 | assert params.use_jinja is True 203 | params.use_jinja = False 204 | assert params.use_jinja is False 205 | assert params.enable_chat_template is True 206 | assert ( 207 | params.reasoning_format 208 | == xlc.common_reasoning_format.COMMON_REASONING_FORMAT_DEEPSEEK 209 | ) 210 | assert params.prefill_assistant is True 211 | 212 | assert params.api_keys == [] 213 | assert params.ssl_file_key == "" 214 | assert params.ssl_file_cert == "" 215 | 216 | params.default_template_kwargs = {"abc": "def"} 217 | assert params.default_template_kwargs == {"abc": "def"} 218 | 219 | assert params.webui is True 220 | assert params.endpoint_slots is True 221 | assert params.endpoint_props is False 222 | assert params.endpoint_metrics is False 223 | 224 | assert params.log_json is False 225 | 226 | assert params.slot_save_path == "" 227 | assert params.media_path == "" 228 | 229 | assert params.slot_prompt_similarity == approx(0.1) 230 | 231 | assert params.is_pp_shared is False 232 | assert params.is_tg_separate is False 233 | 234 | assert params.n_pp == [] 235 | assert params.n_tg == [] 236 | assert params.n_pl == [] 237 | 238 | assert params.context_files == [] 239 | assert params.chunk_size == 64 240 | assert params.chunk_separator == "\n" 241 | 242 | assert params.n_junk == 250 243 | assert params.i_pos == -1 244 | assert params.out_file == "" 245 | 246 | assert params.n_out_freq == 10 247 | assert params.n_save_freq == 0 248 | assert params.i_chunk == 0 249 | assert params.imat_dat == 0 250 | 251 | assert params.process_output is False 252 | assert params.compute_ppl is True 253 | assert params.parse_special is False 254 | 255 | assert params.n_pca_batch == 100 256 | assert params.n_pca_iterations == 1000 257 | 258 | sp = params.sampling.samplers 259 | assert sp 260 | params.sampling.samplers = sp 261 | assert params.sampling.samplers == sp 262 | params.sampling.samplers = "top_k;top_p;min_p;temperature;dry;typ_p;xtc" 263 | assert params.sampling.samplers == "top_k;top_p;min_p;temperature;dry;typ_p;xtc" 264 | assert params.speculative.cache_type_k == xlc.ggml_type.GGML_TYPE_F16 265 | assert params.speculative.cache_type_v == xlc.ggml_type.GGML_TYPE_F16 266 | assert params.speculative.replacements == [] 267 | 
params.speculative.replacements = [("a", "b")] 268 | assert params.speculative.replacements == [("a", "b")] 269 | 270 | assert params.cls_sep == "\t" 271 | assert params.offline is False 272 | assert params.reasoning_budget == -1 273 | 274 | assert params.diffusion.steps == 128 275 | params.diffusion.steps = 13 276 | assert params.diffusion.steps == 13 277 | assert params.diffusion.visual_mode is False 278 | params.diffusion.visual_mode = True 279 | assert params.diffusion.visual_mode is True 280 | assert params.diffusion.eps < 0.01 281 | params.diffusion.eps = 1.2 282 | assert 1.19 < params.diffusion.eps < 1.21 283 | assert params.diffusion.block_length == 0 284 | params.diffusion.block_length = 13 285 | assert params.diffusion.block_length == 13 286 | assert params.diffusion.algorithm == 4 287 | params.diffusion.algorithm = 1 288 | assert params.diffusion.algorithm == 1 289 | assert params.diffusion.alg_temp == 0.0 290 | params.diffusion.alg_temp = 1.1 291 | assert 1.09 < params.diffusion.alg_temp < 1.11 292 | assert params.diffusion.cfg_scale == 0.0 293 | params.diffusion.cfg_scale = 1.1 294 | assert 1.09 < params.diffusion.cfg_scale < 1.11 295 | assert params.diffusion.add_gumbel_noise is False 296 | params.diffusion.add_gumbel_noise = True 297 | assert params.diffusion.add_gumbel_noise is True 298 | 299 | assert params.tensor_buft_overrides == "" 300 | with pytest.raises(ValueError, match="unknown buffer type"): 301 | params.tensor_buft_overrides = ( 302 | "blk\\.([0-3])\\.ffn_.*=GPU0,blk\\.4\\.ffn_(down|up)_exps\\..*=GPU0" 303 | ) 304 | params.tensor_buft_overrides = ( 305 | "blk\\.([0-3])\\.ffn_.*=CPU,blk\\.4\\.ffn_(down|up)_exps\\..*=CPU" 306 | ) 307 | assert ( 308 | params.tensor_buft_overrides 309 | == "blk\\.([0-3])\\.ffn_.*=CPU,blk\\.4\\.ffn_(down|up)_exps\\..*=CPU" 310 | ) 311 | 312 | # assert params.cvector_dimre_method == cy.DIMRE_METHOD_PCA 313 | # assert params.cvector_outfile == "control_vector.gguf" 314 | # assert params.cvector_positive_file == "examples/cvector-generator/positive.txt" 315 | # assert params.cvector_negative_file == "examples/cvector-generator/negative.txt" 316 | 317 | # assert params.spm_infill is False 318 | 319 | # assert params.lora_outfile == "ggml-lora-merged-f16.gguf" 320 | 321 | # assert params.batched_bench_output_jsonl is False 322 | 323 | # ... 
rest not yet implemented 324 | 325 | 326 | def test_json_schema_to_grammar(): 327 | schema = { 328 | "type": "object", 329 | "properties": { 330 | "answer": {"type": "string"}, 331 | "score": {"type": "number"}, 332 | }, 333 | "required": ["answer"], 334 | } 335 | grammar = xlc.json_schema_to_grammar(schema) 336 | assert isinstance(grammar, str) 337 | assert grammar.strip() 338 | 339 | with pytest.raises(ValueError): 340 | xlc.json_schema_to_grammar("{not json}") 341 | -------------------------------------------------------------------------------- /tests/test_server.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | import os 3 | import sys 4 | import base64 5 | import pytest 6 | import json 7 | import orjson 8 | 9 | import xllamacpp as xlc 10 | 11 | 12 | def test_get_system_info(): 13 | assert "CPU :" in xlc.get_system_info() 14 | 15 | 16 | def test_get_device_info(): 17 | xlc.get_device_info() 18 | info = xlc.get_device_info() 19 | assert len(info) > 0 20 | assert "CPU" in [i["name"] for i in info] 21 | print(info) 22 | 23 | 24 | def test_llama_server(model_path): 25 | params = xlc.CommonParams() 26 | 27 | params.model.path = os.path.join(model_path, "Llama-3.2-1B-Instruct-Q8_0.gguf") 28 | params.prompt = "When did the universe begin?" 29 | params.warmup = False 30 | params.n_predict = 32 31 | params.n_ctx = 256 32 | params.n_parallel = 1 33 | params.cpuparams.n_threads = 2 34 | params.cpuparams_batch.n_threads = 2 35 | params.endpoint_metrics = True 36 | params.cache_ram_mib = 0 37 | 38 | server = xlc.Server(params) 39 | 40 | complete_prompt = { 41 | "max_tokens": 128, 42 | "prompt": "Write the fibonacci function in c++.", 43 | } 44 | 45 | server.handle_completions( 46 | complete_prompt, 47 | lambda v: pprint.pprint(v), 48 | ) 49 | v = server.handle_completions(complete_prompt) 50 | assert isinstance(v, dict) 51 | assert "code" not in v 52 | pprint.pprint(v) 53 | 54 | # If the prompt is a str or bytes, a callback is required. 55 | with pytest.raises(ValueError, match="non dict prompt"): 56 | server.handle_chat_completions(orjson.dumps(complete_prompt)) 57 | 58 | complete_prompt["stream"] = True 59 | 60 | # If the prompt is streaming, a callback is required. 
61 | with pytest.raises(ValueError, match="requires a callback for streaming"): 62 | server.handle_completions(complete_prompt) 63 | 64 | server.handle_completions( 65 | complete_prompt, 66 | lambda v: pprint.pprint(v), 67 | ) 68 | 69 | # Test handle_completions with a str or bytes prompt 70 | ok = False 71 | 72 | def _cb_str(v): 73 | nonlocal ok 74 | assert type(v) is str 75 | json.loads(v) 76 | ok = True 77 | 78 | complete_prompt_str = json.dumps(complete_prompt) 79 | server.handle_completions( 80 | complete_prompt_str, 81 | _cb_str, 82 | ) 83 | assert ok 84 | 85 | ok = False 86 | 87 | def _cb_bytes(v): 88 | nonlocal ok 89 | assert type(v) is bytes 90 | orjson.loads(v) 91 | ok = True 92 | 93 | complete_prompt_bytes = orjson.dumps(complete_prompt) 94 | server.handle_completions( 95 | complete_prompt_bytes, 96 | _cb_bytes, 97 | ) 98 | assert ok 99 | 100 | chat_complete_prompt = { 101 | "max_tokens": 128, 102 | "messages": [ 103 | {"role": "system", "content": "You are a coding assistant."}, 104 | {"role": "user", "content": "Write the fibonacci function in c++."}, 105 | ], 106 | } 107 | 108 | server.handle_chat_completions( 109 | chat_complete_prompt, 110 | lambda v: pprint.pprint(v), 111 | ) 112 | v = server.handle_chat_completions(chat_complete_prompt) 113 | assert isinstance(v, dict) 114 | assert "code" not in v 115 | pprint.pprint(v) 116 | 117 | # If the prompt is a str or bytes, a callback is required. 118 | with pytest.raises(ValueError, match="non dict prompt"): 119 | server.handle_chat_completions(json.dumps(chat_complete_prompt)) 120 | 121 | chat_complete_prompt["stream"] = True 122 | 123 | # If the prompt is streaming, a callback is required. 124 | with pytest.raises(ValueError, match="requires a callback for streaming"): 125 | server.handle_chat_completions(chat_complete_prompt) 126 | 127 | server.handle_chat_completions( 128 | chat_complete_prompt, 129 | lambda v: pprint.pprint(v), 130 | ) 131 | 132 | # Test handle_chat_completions with a str or bytes prompt 133 | ok = False 134 | 135 | def _cb_str(v): 136 | nonlocal ok 137 | assert type(v) is str 138 | json.loads(v) 139 | ok = True 140 | 141 | chat_complete_prompt_str = json.dumps(chat_complete_prompt) 142 | server.handle_chat_completions( 143 | chat_complete_prompt_str, 144 | _cb_str, 145 | ) 146 | assert ok 147 | 148 | ok = False 149 | 150 | def _cb_bytes(v): 151 | nonlocal ok 152 | assert type(v) is bytes 153 | orjson.loads(v) 154 | ok = True 155 | 156 | chat_complete_prompt_bytes = orjson.dumps(chat_complete_prompt) 157 | server.handle_chat_completions( 158 | chat_complete_prompt_bytes, 159 | _cb_bytes, 160 | ) 161 | assert ok 162 | 163 | # Test handle_metrics() 164 | result = server.handle_metrics() 165 | assert type(result) is str 166 | assert "llamacpp:prompt_seconds_total" in result 167 | 168 | 169 | def test_llama_server_stream_callback_stop(model_path): 170 | params = xlc.CommonParams() 171 | 172 | params.model.path = os.path.join(model_path, "Llama-3.2-1B-Instruct-Q8_0.gguf") 173 | params.prompt = "When did the universe begin?" 
174 | params.warmup = False 175 | params.n_predict = 64 176 | params.n_ctx = 256 177 | params.n_parallel = 1 178 | params.cpuparams.n_threads = 2 179 | params.cpuparams_batch.n_threads = 2 180 | 181 | server = xlc.Server(params) 182 | 183 | # Test handle_completions streaming stop via callback return value 184 | complete_prompt = { 185 | "max_tokens": 128, 186 | "prompt": "Write a long story about the history of the universe.", 187 | "stream": True, 188 | } 189 | 190 | all_chunks = 0 191 | 192 | def _cb_all(v): 193 | nonlocal all_chunks 194 | all_chunks += 1 195 | 196 | stop_chunks = 0 197 | 198 | def _cb_stop(v): 199 | nonlocal stop_chunks 200 | stop_chunks += 1 201 | return True 202 | 203 | server.handle_completions(complete_prompt, _cb_all) 204 | assert all_chunks >= 1 205 | 206 | server.handle_completions(complete_prompt, _cb_stop) 207 | assert stop_chunks == 1 208 | assert all_chunks > stop_chunks 209 | 210 | # Test handle_chat_completions streaming stop via callback return value 211 | chat_complete_prompt = { 212 | "max_tokens": 128, 213 | "messages": [ 214 | {"role": "system", "content": "You are a coding assistant."}, 215 | { 216 | "role": "user", 217 | "content": "Tell me in detail about the history of programming languages.", 218 | }, 219 | ], 220 | "stream": True, 221 | } 222 | 223 | chat_all_chunks = 0 224 | 225 | def _chat_cb_all(v): 226 | nonlocal chat_all_chunks 227 | chat_all_chunks += 1 228 | 229 | chat_stop_chunks = 0 230 | 231 | def _chat_cb_stop(v): 232 | nonlocal chat_stop_chunks 233 | chat_stop_chunks += 1 234 | return True 235 | 236 | server.handle_chat_completions(chat_complete_prompt, _chat_cb_all) 237 | assert chat_all_chunks >= 1 238 | 239 | server.handle_chat_completions(chat_complete_prompt, _chat_cb_stop) 240 | assert chat_stop_chunks == 1 241 | assert chat_all_chunks > chat_stop_chunks 242 | 243 | 244 | def test_llama_server_chat_with_grammar(model_path): 245 | schema = { 246 | "type": "object", 247 | "properties": { 248 | "answer": {"type": "string"}, 249 | "score": {"type": "number"}, 250 | }, 251 | "required": ["answer"], 252 | } 253 | grammar = xlc.json_schema_to_grammar(schema) 254 | 255 | params = xlc.CommonParams() 256 | 257 | params.model.path = os.path.join(model_path, "Llama-3.2-1B-Instruct-Q8_0.gguf") 258 | params.warmup = False 259 | params.n_predict = 64 260 | params.n_ctx = 256 261 | params.cpuparams.n_threads = 2 262 | params.cpuparams_batch.n_threads = 2 263 | params.sampling.temp = 0 264 | params.sampling.top_k = 1 265 | params.sampling.grammar = grammar 266 | 267 | server = xlc.Server(params) 268 | 269 | chat_complete_prompt = { 270 | "max_tokens": 64, 271 | "messages": [ 272 | { 273 | "role": "system", 274 | "content": "Respond with a JSON object matching the provided schema.", 275 | }, 276 | { 277 | "role": "user", 278 | "content": "Provide an answer string and an optional numeric score.", 279 | }, 280 | ], 281 | } 282 | 283 | result = server.handle_chat_completions(chat_complete_prompt) 284 | 285 | assert isinstance(result, dict) 286 | content = result["choices"][0]["message"]["content"] 287 | parsed = json.loads(content) 288 | 289 | assert parsed["answer"] 290 | assert isinstance(parsed["answer"], str) 291 | if "score" in parsed: 292 | assert isinstance(parsed["score"], (int, float)) 293 | 294 | 295 | def test_llama_server_multimodal(model_path): 296 | with open(os.path.join(os.path.dirname(__file__), "data/11_truck.png"), "rb") as f: 297 | content = f.read() 298 | IMG_BASE64_0 = "data:image/png;base64," + 
base64.b64encode(content).decode("utf-8") 299 | 300 | params = xlc.CommonParams() 301 | 302 | params.model.path = os.path.join(model_path, "tinygemma3-Q8_0.gguf") 303 | params.mmproj.path = os.path.join(model_path, "mmproj-tinygemma3.gguf") 304 | params.sampling.seed = 42 305 | params.sampling.top_k = 1 306 | params.sampling.temp = 0 307 | params.n_predict = 4 308 | params.n_ctx = 1024 309 | params.cpuparams.n_threads = 4 310 | params.cpuparams_batch.n_threads = 2 311 | 312 | server = xlc.Server(params) 313 | 314 | chat_complete_prompt = { 315 | "max_tokens": 128, 316 | "messages": [ 317 | { 318 | "role": "user", 319 | "content": [ 320 | {"type": "text", "text": "What is this:\n"}, 321 | { 322 | "type": "image_url", 323 | "image_url": { 324 | "url": IMG_BASE64_0, 325 | }, 326 | }, 327 | ], 328 | }, 329 | ], 330 | } 331 | 332 | server.handle_chat_completions( 333 | chat_complete_prompt, 334 | lambda v: pprint.pprint(v), 335 | ) 336 | 337 | 338 | def test_llama_server_embedding(model_path): 339 | params = xlc.CommonParams() 340 | 341 | params.model.path = os.path.join(model_path, "Qwen3-Embedding-0.6B-Q8_0.gguf") 342 | params.embedding = True 343 | params.n_predict = -1 344 | params.n_ctx = 512 345 | params.n_batch = 128 346 | params.n_ubatch = 128 347 | params.sampling.seed = 42 348 | params.cpuparams.n_threads = 2 349 | params.cpuparams_batch.n_threads = 2 350 | params.pooling_type = xlc.llama_pooling_type.LLAMA_POOLING_TYPE_LAST 351 | 352 | server = xlc.Server(params) 353 | 354 | embedding_input = { 355 | "input": [ 356 | "I believe the meaning of life is", 357 | "Write a joke about AI from a very long prompt which will not be truncated", 358 | "This is a test", 359 | "This is another test", 360 | ], 361 | } 362 | 363 | result = server.handle_embeddings(embedding_input) 364 | 365 | assert type(result) is dict 366 | assert len(result["data"]) == 4 367 | for d in result["data"]: 368 | assert len(d["embedding"]) == 1024 369 | 370 | embedding_input_str = json.dumps(embedding_input) 371 | assert type(embedding_input_str) is str 372 | result_str = server.handle_embeddings(embedding_input_str) 373 | assert type(result_str) is str 374 | result = json.loads(result_str) 375 | 376 | assert type(result) is dict 377 | assert len(result["data"]) == 4 378 | for d in result["data"]: 379 | assert len(d["embedding"]) == 1024 380 | 381 | embedding_input_bytes = orjson.dumps(embedding_input) 382 | assert type(embedding_input_bytes) is bytes 383 | result_bytes = server.handle_embeddings(embedding_input_bytes) 384 | assert type(result_bytes) is bytes 385 | result = orjson.loads(result_bytes) 386 | 387 | assert type(result) is dict 388 | assert len(result["data"]) == 4 389 | for d in result["data"]: 390 | assert len(d["embedding"]) == 1024 391 | 392 | 393 | @pytest.mark.skipif(sys.platform == "darwin", reason="Rerank test crashes on macOS CI") 394 | def test_llama_server_rerank(model_path): 395 | params = xlc.CommonParams() 396 | 397 | params.model.path = os.path.join(model_path, "bge-reranker-v2-m3-Q2_K.gguf") 398 | params.embedding = True 399 | params.n_predict = -1 400 | params.n_ctx = 512 401 | params.n_batch = 128 402 | params.n_ubatch = 128 403 | params.sampling.seed = 42 404 | params.cpuparams.n_threads = 2 405 | params.cpuparams_batch.n_threads = 2 406 | params.pooling_type = xlc.llama_pooling_type.LLAMA_POOLING_TYPE_RANK 407 | 408 | server = xlc.Server(params) 409 | 410 | rerank_input = { 411 | "query": "What is the capital of France?", 412 | "documents": [ 413 | "Paris is the capital of France.", 414 | 
"The Eiffel Tower is in Paris.", 415 | "Germany is located in Europe.", 416 | ], 417 | } 418 | 419 | result = server.handle_rerank(rerank_input) 420 | 421 | assert type(result) is dict 422 | assert len(result["results"]) == 3 423 | 424 | rerank_input_str = json.dumps(rerank_input) 425 | result_str = server.handle_rerank(rerank_input_str) 426 | assert type(result_str) is str 427 | result = json.loads(result_str) 428 | 429 | assert type(result) is dict 430 | assert len(result["results"]) == 3 431 | 432 | rerank_input_bytes = orjson.dumps(rerank_input) 433 | result_bytes = server.handle_rerank(rerank_input_bytes) 434 | assert type(result_bytes) is bytes 435 | result = orjson.loads(result_bytes) 436 | 437 | assert type(result) is dict 438 | assert len(result["results"]) == 3 439 | -------------------------------------------------------------------------------- /src/llama.cpp/src/server.cpp: -------------------------------------------------------------------------------- 1 | #include "server-context.h" 2 | #include "server-http.h" 3 | #include "server-models.h" 4 | 5 | #include "arg.h" 6 | #include "common.h" 7 | #include "llama.h" 8 | #include "log.h" 9 | 10 | #include 11 | #include 12 | #include // for std::thread::hardware_concurrency 13 | 14 | #if defined(_WIN32) 15 | #include 16 | #endif 17 | 18 | static std::function shutdown_handler; 19 | static std::atomic_flag is_terminating = ATOMIC_FLAG_INIT; 20 | 21 | static inline void signal_handler(int signal) { 22 | if (is_terminating.test_and_set()) { 23 | // in case it hangs, we can force terminate the server by hitting Ctrl+C twice 24 | // this is for better developer experience, we can remove when the server is stable enough 25 | fprintf(stderr, "Received second interrupt, terminating immediately.\n"); 26 | exit(1); 27 | } 28 | 29 | shutdown_handler(signal); 30 | } 31 | 32 | // wrapper function that handles exceptions and logs errors 33 | // this is to make sure handler_t never throws exceptions; instead, it returns an error response 34 | static server_http_context::handler_t ex_wrapper(server_http_context::handler_t func) { 35 | return [func = std::move(func)](const server_http_req & req) -> server_http_res_ptr { 36 | std::string message; 37 | error_type error; 38 | try { 39 | return func(req); 40 | } catch (const std::invalid_argument & e) { 41 | // treat invalid_argument as invalid request (400) 42 | error = ERROR_TYPE_INVALID_REQUEST; 43 | message = e.what(); 44 | } catch (const std::exception & e) { 45 | // treat other exceptions as server error (500) 46 | error = ERROR_TYPE_SERVER; 47 | message = e.what(); 48 | } catch (...) { 49 | error = ERROR_TYPE_SERVER; 50 | message = "unknown error"; 51 | } 52 | 53 | auto res = std::make_unique(); 54 | res->status = 500; 55 | try { 56 | json error_data = format_error_response(message, error); 57 | res->status = json_value(error_data, "code", 500); 58 | res->data = safe_json_to_str({{ "error", error_data }}); 59 | SRV_WRN("got exception: %s\n", res->data.c_str()); 60 | } catch (const std::exception & e) { 61 | SRV_ERR("got another exception: %s | while handling exception: %s\n", e.what(), message.c_str()); 62 | res->data = "Internal Server Error"; 63 | } 64 | return res; 65 | }; 66 | } 67 | 68 | int main(int argc, char ** argv, char ** envp) { 69 | // own arguments required by this example 70 | common_params params; 71 | 72 | if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { 73 | return 1; 74 | } 75 | 76 | // TODO: should we have a separate n_parallel parameter for the server? 
77 | // https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177 78 | // TODO: this is a common configuration that is suitable for most local use cases 79 | // however, overriding the parameters is a bit confusing - figure out something more intuitive 80 | if (params.n_parallel == 1 && params.kv_unified == false && !params.has_speculative()) { 81 | LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true (add -kvu to disable this)\n", __func__); 82 | 83 | params.n_parallel = 4; 84 | params.kv_unified = true; 85 | } 86 | 87 | // for consistency between server router mode and single-model mode, we set the same model name as alias 88 | if (params.model_alias.empty() && !params.model.name.empty()) { 89 | params.model_alias = params.model.name; 90 | } 91 | 92 | common_init(); 93 | 94 | // struct that contains llama context and inference 95 | server_context ctx_server; 96 | 97 | llama_backend_init(); 98 | llama_numa_init(params.numa); 99 | 100 | LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency()); 101 | LOG_INF("\n"); 102 | LOG_INF("%s\n", common_params_get_system_info(params).c_str()); 103 | LOG_INF("\n"); 104 | 105 | server_http_context ctx_http; 106 | if (!ctx_http.init(params)) { 107 | LOG_ERR("%s: failed to initialize HTTP server\n", __func__); 108 | return 1; 109 | } 110 | 111 | // 112 | // Router 113 | // 114 | 115 | // register API routes 116 | server_routes routes(params, ctx_server, [&ctx_http]() { return ctx_http.is_ready.load(); }); 117 | 118 | bool is_router_server = params.model.path.empty(); 119 | std::optional models_routes{}; 120 | if (is_router_server) { 121 | // setup server instances manager 122 | models_routes.emplace(params, argc, argv, envp); 123 | 124 | // proxy handlers 125 | // note: routes.get_health stays the same 126 | routes.get_metrics = models_routes->proxy_get; 127 | routes.post_props = models_routes->proxy_post; 128 | routes.get_api_show = models_routes->proxy_get; 129 | routes.post_completions = models_routes->proxy_post; 130 | routes.post_completions_oai = models_routes->proxy_post; 131 | routes.post_chat_completions = models_routes->proxy_post; 132 | routes.post_anthropic_messages = models_routes->proxy_post; 133 | routes.post_anthropic_count_tokens = models_routes->proxy_post; 134 | routes.post_infill = models_routes->proxy_post; 135 | routes.post_embeddings = models_routes->proxy_post; 136 | routes.post_embeddings_oai = models_routes->proxy_post; 137 | routes.post_rerank = models_routes->proxy_post; 138 | routes.post_tokenize = models_routes->proxy_post; 139 | routes.post_detokenize = models_routes->proxy_post; 140 | routes.post_apply_template = models_routes->proxy_post; 141 | routes.get_lora_adapters = models_routes->proxy_get; 142 | routes.post_lora_adapters = models_routes->proxy_post; 143 | routes.get_slots = models_routes->proxy_get; 144 | routes.post_slots = models_routes->proxy_post; 145 | 146 | // custom routes for router 147 | routes.get_props = models_routes->get_router_props; 148 | routes.get_models = models_routes->get_router_models; 149 | ctx_http.post("/models/load", ex_wrapper(models_routes->post_router_models_load)); 150 | ctx_http.post("/models/unload", ex_wrapper(models_routes->post_router_models_unload)); 151 | ctx_http.post("/models/status", ex_wrapper(models_routes->post_router_models_status)); 152 | } 153 | 154 | ctx_http.get ("/health", ex_wrapper(routes.get_health)); // public endpoint (no 
API key check) 155 | ctx_http.get ("/v1/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check) 156 | ctx_http.get ("/metrics", ex_wrapper(routes.get_metrics)); 157 | ctx_http.get ("/props", ex_wrapper(routes.get_props)); 158 | ctx_http.post("/props", ex_wrapper(routes.post_props)); 159 | ctx_http.post("/api/show", ex_wrapper(routes.get_api_show)); 160 | ctx_http.get ("/models", ex_wrapper(routes.get_models)); // public endpoint (no API key check) 161 | ctx_http.get ("/v1/models", ex_wrapper(routes.get_models)); // public endpoint (no API key check) 162 | ctx_http.get ("/api/tags", ex_wrapper(routes.get_models)); // ollama specific endpoint. public endpoint (no API key check) 163 | ctx_http.post("/completion", ex_wrapper(routes.post_completions)); // legacy 164 | ctx_http.post("/completions", ex_wrapper(routes.post_completions)); 165 | ctx_http.post("/v1/completions", ex_wrapper(routes.post_completions_oai)); 166 | ctx_http.post("/chat/completions", ex_wrapper(routes.post_chat_completions)); 167 | ctx_http.post("/v1/chat/completions", ex_wrapper(routes.post_chat_completions)); 168 | ctx_http.post("/api/chat", ex_wrapper(routes.post_chat_completions)); // ollama specific endpoint 169 | ctx_http.post("/v1/messages", ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API 170 | ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting 171 | ctx_http.post("/infill", ex_wrapper(routes.post_infill)); 172 | ctx_http.post("/embedding", ex_wrapper(routes.post_embeddings)); // legacy 173 | ctx_http.post("/embeddings", ex_wrapper(routes.post_embeddings)); 174 | ctx_http.post("/v1/embeddings", ex_wrapper(routes.post_embeddings_oai)); 175 | ctx_http.post("/rerank", ex_wrapper(routes.post_rerank)); 176 | ctx_http.post("/reranking", ex_wrapper(routes.post_rerank)); 177 | ctx_http.post("/v1/rerank", ex_wrapper(routes.post_rerank)); 178 | ctx_http.post("/v1/reranking", ex_wrapper(routes.post_rerank)); 179 | ctx_http.post("/tokenize", ex_wrapper(routes.post_tokenize)); 180 | ctx_http.post("/detokenize", ex_wrapper(routes.post_detokenize)); 181 | ctx_http.post("/apply-template", ex_wrapper(routes.post_apply_template)); 182 | // LoRA adapters hotswap 183 | ctx_http.get ("/lora-adapters", ex_wrapper(routes.get_lora_adapters)); 184 | ctx_http.post("/lora-adapters", ex_wrapper(routes.post_lora_adapters)); 185 | // Save & load slots 186 | ctx_http.get ("/slots", ex_wrapper(routes.get_slots)); 187 | ctx_http.post("/slots/:id_slot", ex_wrapper(routes.post_slots)); 188 | 189 | // 190 | // Start the server 191 | // 192 | 193 | std::function clean_up; 194 | 195 | if (is_router_server) { 196 | LOG_INF("%s: starting router server, no model will be loaded in this process\n", __func__); 197 | 198 | clean_up = [&models_routes]() { 199 | SRV_INF("%s: cleaning up before exit...\n", __func__); 200 | if (models_routes.has_value()) { 201 | models_routes->models.unload_all(); 202 | } 203 | llama_backend_free(); 204 | }; 205 | 206 | if (!ctx_http.start()) { 207 | clean_up(); 208 | LOG_ERR("%s: exiting due to HTTP server error\n", __func__); 209 | return 1; 210 | } 211 | ctx_http.is_ready.store(true); 212 | 213 | shutdown_handler = [&](int) { 214 | ctx_http.stop(); 215 | }; 216 | 217 | } else { 218 | // setup clean up function, to be called before exit 219 | clean_up = [&ctx_http, &ctx_server]() { 220 | SRV_INF("%s: cleaning up before exit...\n", __func__); 221 | ctx_http.stop(); 222 | ctx_server.terminate(); 223 | 
llama_backend_free(); 224 | }; 225 | 226 | // start the HTTP server before loading the model to be able to serve /health requests 227 | if (!ctx_http.start()) { 228 | clean_up(); 229 | LOG_ERR("%s: exiting due to HTTP server error\n", __func__); 230 | return 1; 231 | } 232 | 233 | // load the model 234 | LOG_INF("%s: loading model\n", __func__); 235 | 236 | if (!ctx_server.load_model(params)) { 237 | clean_up(); 238 | if (ctx_http.thread.joinable()) { 239 | ctx_http.thread.join(); 240 | } 241 | LOG_ERR("%s: exiting due to model loading error\n", __func__); 242 | return 1; 243 | } 244 | 245 | ctx_server.init(); 246 | ctx_http.is_ready.store(true); 247 | 248 | LOG_INF("%s: model loaded\n", __func__); 249 | 250 | shutdown_handler = [&](int) { 251 | // this will unblock start_loop() 252 | ctx_server.terminate(); 253 | }; 254 | } 255 | 256 | // TODO: refactor in common/console 257 | #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) 258 | struct sigaction sigint_action; 259 | sigint_action.sa_handler = signal_handler; 260 | sigemptyset (&sigint_action.sa_mask); 261 | sigint_action.sa_flags = 0; 262 | sigaction(SIGINT, &sigint_action, NULL); 263 | sigaction(SIGTERM, &sigint_action, NULL); 264 | #elif defined (_WIN32) 265 | auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { 266 | return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false; 267 | }; 268 | SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); 269 | #endif 270 | 271 | if (is_router_server) { 272 | LOG_INF("%s: router server is listening on %s\n", __func__, ctx_http.listening_address.c_str()); 273 | LOG_INF("%s: NOTE: router mode is experimental\n", __func__); 274 | LOG_INF("%s: it is not recommended to use this mode in untrusted environments\n", __func__); 275 | if (ctx_http.thread.joinable()) { 276 | ctx_http.thread.join(); // keep the main thread alive 277 | } 278 | 279 | // when the HTTP server stops, clean up and exit 280 | clean_up(); 281 | } else { 282 | LOG_INF("%s: server is listening on %s\n", __func__, ctx_http.listening_address.c_str()); 283 | LOG_INF("%s: starting the main loop...\n", __func__); 284 | 285 | // optionally, notify router server that this instance is ready 286 | const char * router_port = std::getenv("LLAMA_SERVER_ROUTER_PORT"); 287 | std::thread monitor_thread; 288 | if (router_port != nullptr) { 289 | monitor_thread = server_models::setup_child_server(params, std::atoi(router_port), params.model_alias, shutdown_handler); 290 | } 291 | 292 | // this call blocks the main thread until queue_tasks.terminate() is called 293 | ctx_server.start_loop(); 294 | 295 | clean_up(); 296 | if (ctx_http.thread.joinable()) { 297 | ctx_http.thread.join(); 298 | } 299 | if (monitor_thread.joinable()) { 300 | monitor_thread.join(); 301 | } 302 | llama_memory_breakdown_print(ctx_server.get_llama_context()); 303 | } 304 | 305 | return 0; 306 | } 307 | -------------------------------------------------------------------------------- /tests/test_server_http.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import time 5 | import base64 6 | import pytest 7 | import requests 8 | import threading 9 | from typing import Dict, Any 10 | 11 | import xllamacpp as xlc 12 | 13 | 14 | class TestServerHTTP: 15 | """Test suite for xllamacpp HTTP server endpoints""" 16 | 17 | @pytest.fixture(scope="class") 18 | def server_url(self): 19 | """Start HTTP server using xllamacpp.Server and return base URL""" 
20 | # Configure server parameters 21 | params = xlc.CommonParams() 22 | params.model.path = os.path.join( 23 | os.path.dirname(__file__), "../models/Llama-3.2-1B-Instruct-Q8_0.gguf" 24 | ) 25 | params.n_parallel = 1 26 | params.n_ctx = 256 27 | params.cpuparams.n_threads = 2 28 | params.cpuparams_batch.n_threads = 2 29 | params.endpoint_metrics = True 30 | 31 | # Create server instance - this automatically starts the HTTP server 32 | server = xlc.Server(params) 33 | 34 | # Wait for server to be ready - default port is likely 8080 35 | base_url = server.listening_address 36 | max_wait = 5 # seconds 37 | wait_interval = 0.5 38 | 39 | for _ in range(int(max_wait / wait_interval)): 40 | try: 41 | response = requests.get(f"{base_url}/health", timeout=1) 42 | if response.status_code == 200: 43 | yield base_url 44 | break 45 | except requests.exceptions.RequestException: 46 | time.sleep(wait_interval) 47 | else: 48 | pytest.fail("Server failed to start within timeout period") 49 | 50 | # Server will be automatically cleaned up when the object goes out of scope 51 | 52 | def test_health_endpoints(self, server_url): 53 | """Test health check endpoints""" 54 | # Test /health 55 | response = requests.get(f"{server_url}/health") 56 | assert response.status_code == 200 57 | data = response.json() 58 | assert "status" in data 59 | assert data["status"] == "ok" 60 | 61 | # Test /v1/health 62 | response = requests.get(f"{server_url}/v1/health") 63 | assert response.status_code == 200 64 | data = response.json() 65 | assert "status" in data 66 | assert data["status"] == "ok" 67 | 68 | def test_models_endpoints(self, server_url): 69 | """Test model listing endpoints""" 70 | # Test /models 71 | response = requests.get(f"{server_url}/models") 72 | assert response.status_code == 200 73 | data = response.json() 74 | assert "data" in data 75 | assert len(data["data"]) > 0 76 | model = data["data"][0] 77 | assert "id" in model 78 | assert "object" in model 79 | assert model["object"] == "model" 80 | 81 | # Test /v1/models 82 | response = requests.get(f"{server_url}/v1/models") 83 | assert response.status_code == 200 84 | data = response.json() 85 | assert "data" in data 86 | 87 | # Test /api/tags (ollama compatible) 88 | response = requests.get(f"{server_url}/api/tags") 89 | assert response.status_code == 200 90 | data = response.json() 91 | assert "models" in data 92 | 93 | def test_props_endpoints(self, server_url): 94 | """Test server properties endpoints""" 95 | # Test GET /props 96 | response = requests.get(f"{server_url}/props") 97 | assert response.status_code == 200 98 | data = response.json() 99 | assert "build_info" in data 100 | 101 | def test_metrics_endpoint(self, server_url): 102 | """Test metrics endpoint""" 103 | response = requests.get(f"{server_url}/metrics") 104 | assert response.status_code == 200 105 | # Metrics should be in Prometheus format 106 | assert "llamacpp:" in response.text 107 | 108 | def test_completion_endpoints(self, server_url): 109 | """Test text completion endpoints""" 110 | completion_data = { 111 | "prompt": "The capital of France is", 112 | "max_tokens": 10, 113 | "temperature": 0.1, 114 | } 115 | 116 | # Test /v1/completions (OpenAI compatible) 117 | response = requests.post(f"{server_url}/v1/completions", json=completion_data) 118 | assert response.status_code == 200 119 | data = response.json() 120 | assert "choices" in data 121 | 122 | def test_chat_completion_endpoints(self, server_url): 123 | """Test chat completion endpoints""" 124 | chat_data = { 125 | "messages": 
[ 126 | {"role": "system", "content": "You are a helpful assistant."}, 127 | {"role": "user", "content": "What is the capital of France?"}, 128 | ], 129 | "max_tokens": 10, 130 | "temperature": 0.1, 131 | } 132 | 133 | # Test /chat/completions 134 | response = requests.post(f"{server_url}/chat/completions", json=chat_data) 135 | assert response.status_code == 200 136 | data = response.json() 137 | assert "choices" in data 138 | assert len(data["choices"]) > 0 139 | assert "message" in data["choices"][0] 140 | assert "content" in data["choices"][0]["message"] 141 | 142 | # Test /v1/chat/completions (OpenAI compatible) 143 | response = requests.post(f"{server_url}/v1/chat/completions", json=chat_data) 144 | assert response.status_code == 200 145 | data = response.json() 146 | assert "choices" in data 147 | 148 | # Test /api/chat (ollama compatible) 149 | response = requests.post(f"{server_url}/api/chat", json=chat_data) 150 | assert response.status_code == 200 151 | 152 | def test_tokenize_endpoints(self, server_url): 153 | """Test tokenization endpoints""" 154 | tokenize_data = {"content": "Hello world, how are you?"} 155 | 156 | # Test /tokenize 157 | response = requests.post(f"{server_url}/tokenize", json=tokenize_data) 158 | assert response.status_code == 200 159 | data = response.json() 160 | assert "tokens" in data 161 | 162 | # Test /detokenize 163 | detokenize_data = {"tokens": [1, 2, 3, 4, 5]} 164 | response = requests.post(f"{server_url}/detokenize", json=detokenize_data) 165 | assert response.status_code == 200 166 | data = response.json() 167 | assert "content" in data 168 | 169 | def test_apply_template_endpoint(self, server_url): 170 | """Test template application endpoint""" 171 | template_data = { 172 | "messages": [ 173 | {"role": "system", "content": "You are a test."}, 174 | {"role": "user", "content": "Hi there"}, 175 | ] 176 | } 177 | 178 | response = requests.post(f"{server_url}/apply-template", json=template_data) 179 | assert response.status_code == 200 180 | body = response.json() 181 | assert "prompt" in body 182 | assert "You are a test." 
in body["prompt"] 183 | 184 | def test_slots_endpoints(self, server_url): 185 | """Test slots management endpoints""" 186 | # Test GET /slots 187 | response = requests.get(f"{server_url}/slots") 188 | assert response.status_code == 200 189 | data = response.json() 190 | assert isinstance(data, list) 191 | 192 | def test_streaming_completions(self, server_url): 193 | """Test streaming completion endpoints""" 194 | completion_data = { 195 | "prompt": "The capital of France is", 196 | "max_tokens": 10, 197 | "stream": True, 198 | } 199 | 200 | response = requests.post( 201 | f"{server_url}/completions", json=completion_data, stream=True 202 | ) 203 | assert response.status_code == 200 204 | 205 | # Read streaming response 206 | lines = response.iter_lines() 207 | first_line = next(lines, None) 208 | assert first_line is not None 209 | assert first_line.startswith(b"data: ") 210 | 211 | def test_streaming_chat_completions(self, server_url): 212 | """Test streaming chat completion endpoints""" 213 | chat_data = { 214 | "messages": [{"role": "user", "content": "What is the capital of France?"}], 215 | "max_tokens": 10, 216 | "stream": True, 217 | } 218 | 219 | response = requests.post( 220 | f"{server_url}/chat/completions", json=chat_data, stream=True 221 | ) 222 | assert response.status_code == 200 223 | 224 | # Read streaming response 225 | lines = response.iter_lines() 226 | first_line = next(lines, None) 227 | assert first_line is not None 228 | assert first_line.startswith(b"data: ") 229 | 230 | def test_error_handling(self, server_url): 231 | """Test error handling for invalid requests""" 232 | # Test invalid JSON 233 | response = requests.post(f"{server_url}/completions", data="invalid json") 234 | assert response.status_code == 500 235 | 236 | # Test missing required fields 237 | response = requests.post(f"{server_url}/completions", json={}) 238 | assert response.status_code in [400, 422] 239 | 240 | # Test invalid endpoint 241 | response = requests.get(f"{server_url}/invalid_endpoint") 242 | assert response.status_code == 404 243 | 244 | def test_concurrent_requests(self, server_url): 245 | """Test handling of concurrent requests""" 246 | import concurrent.futures 247 | 248 | def make_request(): 249 | response = requests.get(f"{server_url}/health") 250 | return response.status_code 251 | 252 | with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: 253 | futures = [executor.submit(make_request) for _ in range(10)] 254 | results = [future.result() for future in futures] 255 | 256 | # All requests should succeed 257 | assert all(status == 200 for status in results) 258 | 259 | 260 | # Test with embedding model 261 | class TestServerHTTPEmbedding: 262 | """Test suite for embedding-specific HTTP endpoints""" 263 | 264 | @pytest.fixture(scope="class") 265 | def embedding_server_url(self): 266 | """Start HTTP server using xllamacpp.Server with embedding model""" 267 | # Configure server parameters for embedding model 268 | params = xlc.CommonParams() 269 | params.model.path = os.path.join( 270 | os.path.dirname(__file__), "../models/Qwen3-Embedding-0.6B-Q8_0.gguf" 271 | ) 272 | params.embedding = True 273 | params.n_predict = -1 274 | params.n_ctx = 512 275 | params.n_batch = 128 276 | params.n_ubatch = 128 277 | params.cpuparams.n_threads = 2 278 | params.cpuparams_batch.n_threads = 2 279 | params.pooling_type = xlc.llama_pooling_type.LLAMA_POOLING_TYPE_LAST 280 | 281 | # Create server instance - this automatically starts the HTTP server 282 | server = xlc.Server(params) 283 
| 284 | # Wait for server to be ready - use different port to avoid conflicts 285 | base_url = server.listening_address 286 | max_wait = 5 287 | wait_interval = 0.5 288 | 289 | for _ in range(int(max_wait / wait_interval)): 290 | try: 291 | response = requests.get(f"{base_url}/health", timeout=1) 292 | if response.status_code == 200: 293 | yield base_url 294 | break 295 | except requests.exceptions.RequestException: 296 | time.sleep(wait_interval) 297 | else: 298 | pytest.fail("Embedding server failed to start within timeout period") 299 | 300 | # Server will be automatically cleaned up when the object goes out of scope 301 | 302 | def test_embedding_model_specific(self, embedding_server_url): 303 | """Test embedding-specific functionality""" 304 | embedding_data = { 305 | "input": [ 306 | "I believe the meaning of life is", 307 | "This is a test", 308 | "This is another test", 309 | ] 310 | } 311 | 312 | response = requests.post( 313 | f"{embedding_server_url}/v1/embeddings", json=embedding_data 314 | ) 315 | assert response.status_code == 200 316 | data = response.json() 317 | assert len(data["data"]) == 3 318 | 319 | # Check embedding dimensions (should be consistent) 320 | first_embedding = data["data"][0]["embedding"] 321 | assert len(first_embedding) > 0 322 | 323 | for item in data["data"]: 324 | assert len(item["embedding"]) == len(first_embedding) 325 | 326 | 327 | # Test with rerank model 328 | class TestServerHTTPRerank: 329 | """Test suite for rerank-specific HTTP endpoints""" 330 | 331 | @pytest.fixture(scope="class") 332 | def rerank_server_url(self): 333 | """Start HTTP server using xllamacpp.Server with rerank model""" 334 | # Configure server parameters for rerank model 335 | params = xlc.CommonParams() 336 | params.model.path = os.path.join( 337 | os.path.dirname(__file__), "../models/bge-reranker-v2-m3-Q2_K.gguf" 338 | ) 339 | params.embedding = True 340 | params.n_predict = -1 341 | params.n_ctx = 512 342 | params.n_batch = 128 343 | params.n_ubatch = 128 344 | params.cpuparams.n_threads = 2 345 | params.cpuparams_batch.n_threads = 2 346 | params.pooling_type = xlc.llama_pooling_type.LLAMA_POOLING_TYPE_RANK 347 | 348 | # Create server instance - this automatically starts the HTTP server 349 | server = xlc.Server(params) 350 | 351 | # Wait for server to be ready - use different port to avoid conflicts 352 | base_url = server.listening_address 353 | max_wait = 5 354 | wait_interval = 0.5 355 | 356 | for _ in range(int(max_wait / wait_interval)): 357 | try: 358 | response = requests.get(f"{base_url}/health", timeout=1) 359 | if response.status_code == 200: 360 | yield base_url 361 | break 362 | except requests.exceptions.RequestException: 363 | time.sleep(wait_interval) 364 | else: 365 | pytest.fail("Rerank server failed to start within timeout period") 366 | 367 | # Server will be automatically cleaned up when the object goes out of scope 368 | 369 | def test_rerank_model_specific(self, rerank_server_url): 370 | """Test rerank-specific functionality""" 371 | TEST_DOCUMENTS = [ 372 | "A machine is a physical system that uses power to apply forces and control movement to perform an action. The term is commonly applied to artificial devices, such as those employing engines or motors, but also to natural biological macromolecules, such as molecular machines.", 373 | "Learning is the process of acquiring new understanding, knowledge, behaviors, skills, values, attitudes, and preferences. 
The ability to learn is possessed by humans, non-human animals, and some machines; there is also evidence for some kind of learning in certain plants.", 374 | "Machine learning is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.", 375 | "Paris, capitale de la France, est une grande ville européenne et un centre mondial de l'art, de la mode, de la gastronomie et de la culture. Son paysage urbain du XIXe siècle est traversé par de larges boulevards et la Seine.", 376 | ] 377 | 378 | response = requests.post( 379 | f"{rerank_server_url}/rerank", 380 | json={ 381 | "query": "Machine learning is", 382 | "documents": TEST_DOCUMENTS, 383 | }, 384 | ) 385 | assert response.status_code == 200 386 | body = response.json() 387 | assert len(body["results"]) == 4 388 | 389 | most_relevant = body["results"][0] 390 | least_relevant = body["results"][0] 391 | for doc in body["results"]: 392 | if doc["relevance_score"] > most_relevant["relevance_score"]: 393 | most_relevant = doc 394 | if doc["relevance_score"] < least_relevant["relevance_score"]: 395 | least_relevant = doc 396 | 397 | assert most_relevant["relevance_score"] > least_relevant["relevance_score"] 398 | assert most_relevant["index"] == 2 399 | assert least_relevant["index"] == 3 400 | -------------------------------------------------------------------------------- /src/xllamacpp/memory.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2023 XProbe Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import logging 16 | from collections.abc import Sequence 17 | from dataclasses import dataclass 18 | from typing import Any, TYPE_CHECKING 19 | 20 | if TYPE_CHECKING: 21 | from gguf import GGUFReader, GGUFValueType # type: ignore 22 | 23 | 24 | def _import_gguf(): 25 | try: 26 | from gguf import GGUFReader, GGUFValueType # type: ignore 27 | 28 | return GGUFReader, GGUFValueType 29 | except Exception as e: 30 | raise RuntimeError( 31 | "Optional dependency 'gguf' is required for xllamacpp.memory. Install it with `pip install gguf`." 
32 | ) from e 33 | 34 | 35 | logger = logging.getLogger(__name__) 36 | 37 | 38 | def get_file_host_endian(reader: "GGUFReader") -> tuple[str, str]: 39 | file_endian = reader.endianess.name # codespell:ignore 40 | if reader.byte_order == "S": 41 | host_endian = "BIG" if file_endian == "LITTLE" else "LITTLE" 42 | else: 43 | host_endian = file_endian 44 | return host_endian, file_endian 45 | 46 | 47 | def dump_metadata_json(reader: "GGUFReader", model_path: str) -> dict: 48 | _, GGUFValueType = _import_gguf() 49 | host_endian, file_endian = get_file_host_endian(reader) 50 | metadata: dict[str, Any] = {} 51 | tensors: dict[str, Any] = {} 52 | result = { 53 | "filename": model_path, 54 | "endian": file_endian, 55 | "metadata": metadata, 56 | "tensors": tensors, 57 | } 58 | for idx, field in enumerate(reader.fields.values()): 59 | curr: dict[str, Any] = { 60 | "index": idx, 61 | "type": field.types[0].name if field.types else "UNKNOWN", 62 | "offset": field.offset, 63 | } 64 | metadata[field.name] = curr 65 | if field.types[:1] == [GGUFValueType.ARRAY]: 66 | curr["array_types"] = [t.name for t in field.types][1:] 67 | curr["value"] = field.contents() 68 | else: 69 | curr["value"] = field.contents() 70 | for i, tensor in enumerate(reader.tensors): 71 | tensors[tensor.name] = { 72 | "index": i, 73 | "shape": tensor.shape.tolist(), 74 | "type": tensor.tensor_type.name, 75 | "offset": tensor.field.offset, 76 | "n_bytes": tensor.n_bytes, 77 | } 78 | return result 79 | 80 | 81 | @dataclass 82 | class MemoryEstimate: 83 | # How many layers we predict we can load 84 | layers: int 85 | # The size of the graph which occupies the main GPU 86 | graph: int 87 | # How much VRAM will be allocated given the number of layers we predict 88 | vram_size: int 89 | # The total size of the model if loaded into VRAM. 
If all layers are loaded, vram_size == total_size 90 | total_size: int 91 | # For multi-GPU scenarios, this provides the tensor split parameter 92 | tensor_split: list[float] 93 | # For multi-GPU scenarios, this is the size in bytes per GPU 94 | gpu_sizes: list[int] 95 | 96 | 97 | def _get_max_min(value): 98 | if isinstance(value, Sequence): 99 | return max(value), min(value) 100 | else: 101 | return value, value 102 | 103 | 104 | def graph_size( 105 | metadata: dict, 106 | context_length: int, 107 | batch_size: int, 108 | num_parallel: int, 109 | kv_cache_type: str, 110 | ): 111 | """ 112 | Most of the logic comes from `GraphSize` in https://github.com/ollama/ollama/blob/main/fs/ggml/ggml.go 113 | """ 114 | if context_length < batch_size: 115 | batch_size = context_length 116 | 117 | architecture = metadata["general.architecture"]["value"] 118 | embedding_length = metadata[f"{architecture}.embedding_length"]["value"] 119 | block_count = metadata[f"{architecture}.block_count"]["value"] 120 | head_count_max, head_count_min = _get_max_min( 121 | metadata.get(f"{architecture}.attention.head_count", {}).get("value", 1) 122 | ) 123 | head_count_kv_max, head_count_kv_min = _get_max_min( 124 | metadata.get(f"{architecture}.attention.head_count_kv", {}).get("value", 1) 125 | ) 126 | vocab = len(metadata["tokenizer.ggml.tokens"]["value"]) 127 | embedding_head_count_max = ( 128 | (embedding_length // head_count_min) if head_count_min > 0 else 0 129 | ) 130 | embedding_head_count_k = metadata.get( 131 | f"{architecture}.attention.key_length", {} 132 | ).get("value", embedding_head_count_max) 133 | embedding_head_count_v = metadata.get( 134 | f"{architecture}.attention.value_length", {} 135 | ).get("value", embedding_head_count_max) 136 | 137 | # f16(default) 138 | bytes_per_kv_element = { 139 | "q8_0": 1, # 1/2 of fp16 140 | "q4_0": 0.5, # 1/4 of fp16 141 | }.get(kv_cache_type, 2) 142 | 143 | kv = [0] * block_count 144 | for i in range(block_count): 145 | kv[i] = ( 146 | context_length 147 | * (embedding_head_count_k + embedding_head_count_v) 148 | * head_count_kv_max 149 | * bytes_per_kv_element 150 | ) 151 | 152 | full_offload = 0 153 | partial_offload = 0 154 | if architecture in ["llama", "llama4"]: 155 | full_offload = max( 156 | 4 157 | * batch_size 158 | * (1 + 4 * embedding_length + context_length * (1 + head_count_max)), 159 | 4 * batch_size * (embedding_length + vocab), 160 | ) 161 | partial_offload = 4 * batch_size * embedding_length 162 | partial_offload += max( 163 | 4 164 | * batch_size 165 | * (1 + embedding_length + max(context_length, embedding_length)) 166 | + embedding_length * embedding_length * 9 / 16 167 | + 4 168 | * context_length 169 | * ( 170 | batch_size * head_count_max 171 | + embedding_head_count_max * head_count_kv_max 172 | ), 173 | 4 * batch_size * (embedding_length + vocab) 174 | + embedding_length * vocab * 105 / 128, 175 | ) 176 | elif architecture in ["gemma", "gemma2", "gemma3"]: 177 | full_offload = max( 178 | 4 * batch_size * (embedding_length + vocab), 179 | 4 180 | * batch_size 181 | * ( 182 | 2 183 | + context_length 184 | + context_length * head_count_max 185 | + 2 * embedding_length 186 | + 2 * embedding_head_count_k * head_count_max 187 | ), 188 | ) 189 | partial_offload = max( 190 | 4 * embedding_length * batch_size 191 | + embedding_length * vocab * 105 / 128 192 | + 4 * vocab * batch_size, 193 | 4 194 | * batch_size 195 | * ( 196 | 2 * embedding_length 197 | + 1 198 | + 2 * embedding_head_count_k * head_count_max 199 | + context_length 200 | + 
context_length * head_count_max 201 | ) 202 | + 4 * embedding_head_count_k * context_length * 8 203 | + embedding_length * embedding_head_count_k * head_count_max * 9 / 16, 204 | ) 205 | if architecture == "gemma3": 206 | gemma3_global_cache_count = 6 207 | sliding_window = ( 208 | num_parallel 209 | * metadata[f"{architecture}.attention.sliding_window"]["value"] 210 | + batch_size 211 | ) 212 | for i in range(block_count): 213 | if (i + 1) % gemma3_global_cache_count != 0: 214 | kv[i] = ( 215 | sliding_window 216 | * (embedding_head_count_k + embedding_head_count_v) 217 | * head_count_kv_max 218 | * bytes_per_kv_element 219 | ) 220 | elif architecture == "qwen2": 221 | full_offload = max( 222 | 4 * batch_size * (embedding_length + vocab), 223 | 4 224 | * batch_size 225 | * ( 226 | 1 227 | + 2 * embedding_length 228 | + context_length 229 | + context_length * head_count_max 230 | ), 231 | ) 232 | 233 | partial_offload = max( 234 | 4 * batch_size * (embedding_length + vocab) 235 | + embedding_length * vocab * 105 / 128, 236 | 4 237 | * ( 238 | batch_size 239 | * (1 + 2 * embedding_length + context_length * (1 + head_count_max)) 240 | + embedding_length * (1 + context_length) 241 | ), 242 | ) 243 | elif architecture == "stablelm": 244 | full_offload = ( 245 | 4 246 | * batch_size 247 | * (context_length * (1 + head_count_max) + 3 * embedding_length + 2) 248 | ) 249 | partial_offload = max( 250 | 4 * batch_size * (vocab + 2 * embedding_length), full_offload 251 | ) 252 | elif architecture == "deepseek2": 253 | full_offload = max( 254 | 4 * batch_size * (3 * embedding_length + vocab), 255 | 4 256 | * batch_size 257 | * ( 258 | 3 * embedding_length 259 | + 2 260 | + context_length * (1 + head_count_kv_max) 261 | + 2 * embedding_head_count_k * head_count_kv_max 262 | ), 263 | ) 264 | 265 | partial_offload = max( 266 | 4 * batch_size * (3 * embedding_length + vocab) 267 | + embedding_length * vocab * 105 / 128, 268 | 4 269 | * batch_size 270 | * ( 271 | 2 * embedding_length 272 | + 1 273 | + 2 * embedding_head_count_k * head_count_kv_max 274 | + context_length 275 | + context_length * head_count_kv_max 276 | ) 277 | + 4 * embedding_head_count_k * context_length * head_count_kv_max 278 | + embedding_length * embedding_head_count_k * head_count_kv_max * 9 / 16, 279 | ) 280 | 281 | kv_total = sum(kv) 282 | if partial_offload == 0: 283 | partial_offload = ( 284 | head_count_max 285 | / (1 if head_count_kv_min <= 0 else head_count_kv_min) 286 | * kv_total 287 | / 6 288 | ) 289 | if full_offload == 0: 290 | full_offload = partial_offload 291 | 292 | return kv, partial_offload, full_offload 293 | 294 | 295 | def projector_memory_requirements(projector: str): 296 | GGUFReader, _ = _import_gguf() 297 | reader = GGUFReader(projector, "r") 298 | data = dump_metadata_json(reader, projector) 299 | return sum(t["n_bytes"] for t in data["tensors"].values()) 300 | 301 | 302 | def estimate_gpu_layers( 303 | gpus: list[dict], 304 | model_path: str, 305 | projectors: list[str], 306 | context_length: int, 307 | batch_size: int, 308 | num_parallel: int, 309 | kv_cache_type: str, 310 | ): 311 | """ 312 | Most of the logic comes from `EstimateGPULayers` in https://github.com/ollama/ollama/blob/main/llm/memory.go 313 | """ 314 | # Projectors loaded into GPU0 only 315 | projector_weights = sum(map(projector_memory_requirements, projectors)) 316 | if projector_weights > 0: 317 | # Multimodal models require at least 2048 context 318 | context_length = max(context_length, 2048) 319 | GGUFReader, _ = _import_gguf() 320 | 
reader = GGUFReader(model_path, "r") 321 | data = dump_metadata_json(reader, model_path) 322 | metadata = data["metadata"] 323 | kv, graph_partial_offload, graph_full_offload = graph_size( 324 | metadata, 325 | context_length=context_length, 326 | batch_size=batch_size, 327 | num_parallel=num_parallel, 328 | kv_cache_type=kv_cache_type, 329 | ) 330 | # Get all layer sizes 331 | architecture = metadata["general.architecture"]["value"] 332 | block_count = metadata[f"{architecture}.block_count"]["value"] 333 | layer_sizes = [0] * block_count 334 | for name, layer in data["tensors"].items(): 335 | if name.startswith("blk."): 336 | index = int(name[len("blk.") :].split(".")[0]) 337 | layer_sizes[index] += layer["n_bytes"] 338 | layer_size = layer_sizes[0] if layer_sizes else 0 339 | 340 | if len(kv) > 0: 341 | layer_size += kv[0] 342 | # On metal there's no partial offload overhead 343 | if gpus[0]["name"] == "Metal": 344 | graph_partial_offload = graph_full_offload 345 | elif len(gpus) > 1: 346 | # Multi gpu should always use the partial graph size 347 | graph_full_offload = graph_partial_offload 348 | 349 | # Get output layer size 350 | memory_layer_output = 0 351 | # Output layer handled at the end if we have space 352 | for name, layer in data["tensors"].items(): 353 | if any( 354 | name.startswith(prefix) 355 | for prefix in ["output_norm", "output", "token_embd"] 356 | ): 357 | memory_layer_output += layer["n_bytes"] 358 | 359 | # Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer 360 | default_memory_min = 512 * 1024**2 361 | gpu_allocations = [0] * len(gpus) 362 | gpus_with_space: list[int] = [] 363 | for i in range(len(gpus)): 364 | gpu0_overhead = projector_weights if len(gpus_with_space) == 0 else 0 365 | minimum_memory = gpus[i].get("memory_min", default_memory_min) 366 | if ( 367 | gpus[i]["memory_free"] 368 | < gpu0_overhead 369 | + max(graph_partial_offload, graph_full_offload) 370 | + minimum_memory 371 | + 2 * layer_size 372 | ): 373 | continue 374 | gpus_with_space.append(i) 375 | gpu_allocations[i] += gpu0_overhead + minimum_memory + layer_size 376 | 377 | overflow = 0 378 | if len(gpus_with_space) == 0: 379 | overflow = projector_weights 380 | 381 | # For all the layers, find where they can fit on the GPU(s) 382 | layer_count = 0 383 | layer_counts = [0] * len(gpus) 384 | for i in range(block_count - 1, -1, -1): 385 | layer_size = layer_sizes[i] 386 | layer_size += kv[i] 387 | 388 | # Distribute the layers across the GPU(s) that have space 389 | for j in range(len(gpus_with_space), 0, -1): 390 | g = gpus_with_space[i % j] 391 | used = gpu_allocations[g] + max(graph_partial_offload, graph_full_offload) 392 | if gpus[g]["memory_free"] > used + layer_size: 393 | gpu_allocations[g] += layer_size 394 | layer_counts[g] += 1 395 | layer_count += 1 396 | break 397 | else: 398 | gpus_with_space = ( 399 | gpus_with_space[: i % j] + gpus_with_space[i % j + 1 :] 400 | ) 401 | 402 | if len(gpus_with_space) == 0: 403 | overflow += layer_size 404 | 405 | fully_loaded = False 406 | if layer_count >= block_count: 407 | fully_loaded = True 408 | 409 | # Determine if we need to consider output then find where it fits 410 | if memory_layer_output > 0: 411 | for j in range(len(gpus_with_space), 0, -1): 412 | g = gpus_with_space[layer_count % j] 413 | used = gpu_allocations[g] + max(graph_partial_offload, graph_full_offload) 414 | if gpus[g]["memory_free"] > used + memory_layer_output: 415 | gpu_allocations[g] += memory_layer_output 416 | 
layer_counts[g] += 1 417 | layer_count += 1 418 | break 419 | else: 420 | gpus_with_space = ( 421 | gpus_with_space[: layer_count % j] 422 | + gpus_with_space[layer_count % j + 1 :] 423 | ) 424 | 425 | if layer_count < block_count + 1: 426 | fully_loaded = False 427 | overflow += memory_layer_output 428 | 429 | # Add the applicable (full or partial) graph allocations 430 | for i in range(len(gpus)): 431 | if layer_counts[i] <= 0: 432 | continue 433 | if fully_loaded: 434 | gpu_allocations[i] += graph_full_offload 435 | else: 436 | gpu_allocations[i] += graph_partial_offload 437 | 438 | if fully_loaded: 439 | graph_offload = graph_full_offload 440 | else: 441 | graph_offload = graph_partial_offload 442 | 443 | # Normalize splits 444 | tensor_split = layer_counts 445 | if layer_count != 0: 446 | for i in range(len(tensor_split)): 447 | tensor_split[i] /= layer_count 448 | 449 | # Summaries 450 | memory_required_partial = sum(gpu_allocations) 451 | memory_required_total = memory_required_partial + overflow 452 | 453 | estimate = MemoryEstimate( 454 | layers=0, 455 | graph=0, 456 | vram_size=0, 457 | total_size=int(memory_required_total), 458 | tensor_split=tensor_split, 459 | gpu_sizes=[], 460 | ) 461 | if gpus[0]["name"] == "CPU": 462 | return estimate 463 | if layer_count == 0: 464 | return estimate 465 | 466 | estimate.layers = layer_count 467 | estimate.graph = int(graph_offload) 468 | estimate.vram_size = int(memory_required_partial) 469 | estimate.total_size = int(memory_required_total) 470 | estimate.tensor_split = tensor_split 471 | estimate.gpu_sizes = [int(i) for i in gpu_allocations] 472 | return estimate 473 | -------------------------------------------------------------------------------- /.github/workflows/build-wheel-cuda-hip.yaml: -------------------------------------------------------------------------------- 1 | name: Build Wheels (CUDA & HIP) 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | workflow_dispatch: 8 | 9 | # on: 10 | # push: 11 | # branches: 12 | # - '*' 13 | # pull_request: 14 | # types: ['opened', 'reopened', 'synchronize'] 15 | 16 | concurrency: 17 | group: ${{ github.workflow }}-${{ github.ref }} 18 | cancel-in-progress: true 19 | 20 | permissions: 21 | contents: write 22 | 23 | jobs: 24 | build_wheels_hip_linux: 25 | name: Build Wheel HIP Linux ${{ matrix.pyver }} ${{matrix.hip}} 26 | runs-on: ubuntu-22.04 27 | strategy: 28 | matrix: 29 | pyver: ["3.10", "3.11", "3.12", "3.13"] 30 | hip: ["6.3.4", "6.4.1"] 31 | steps: 32 | - name: Free Disk Space (Ubuntu) 33 | uses: jlumbroso/free-disk-space@main 34 | with: 35 | # this might remove tools that are actually needed, 36 | # if set to "true" but frees about 6 GB 37 | tool-cache: false 38 | 39 | # all of these default to true, but feel free to set to 40 | # "false" if necessary for your workflow 41 | android: true 42 | dotnet: true 43 | haskell: true 44 | large-packages: false 45 | docker-images: true 46 | swap-storage: true 47 | 48 | - name: Clone 49 | id: checkout 50 | uses: actions/checkout@v4 51 | with: 52 | submodules: "recursive" 53 | 54 | - name: Set up Python ${{ matrix.pyver }} 55 | id: setup-python 56 | uses: actions/setup-python@v5 57 | with: 58 | python-version: ${{ matrix.pyver }} 59 | 60 | - name: Start ROCm container 61 | run: | 62 | # Get Python location from setup-python 63 | PYTHON_PATH=$(which python) 64 | PYTHON_HOME=$(dirname $(dirname $PYTHON_PATH)) 65 | 66 | # Start the container with Python from host mounted 67 | docker run -d \ 68 | --name rocm-container \ 69 | -v ${{ github.workspace 
}}:/workspace \ 70 | -v $PYTHON_HOME:$PYTHON_HOME \ 71 | -e PATH=$PYTHON_HOME/bin:$PATH \ 72 | -w /workspace \ 73 | rocm/dev-ubuntu-22.04:${{ matrix.hip }} \ 74 | sleep infinity 75 | 76 | # Verify Python installation in container 77 | docker exec rocm-container python --version 78 | docker exec rocm-container pip --version 79 | 80 | - name: System Dependencies 81 | run: | 82 | docker exec rocm-container bash -c ' 83 | df -ha 84 | apt-get update 85 | apt-get install -y build-essential git cmake libcurl4-openssl-dev patchelf rocblas-dev hipblas-dev rocwmma-dev curl 86 | apt-get clean 87 | df -ha 88 | hipconfig --full 89 | ls -alh /opt/rocm/lib 90 | ' 91 | 92 | - name: Install Rust 93 | run: | 94 | docker exec rocm-container bash -c ' 95 | curl --proto "=https" --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y 96 | source $HOME/.cargo/env 97 | rustc --version 98 | cargo --version 99 | # Add cargo to PATH permanently for this container 100 | echo "export PATH=\$HOME/.cargo/bin:\$PATH" >> $HOME/.bashrc 101 | ' 102 | 103 | - name: Setup Python environment 104 | run: | 105 | docker exec rocm-container bash -c ' 106 | python -m pip install --upgrade pip 107 | python -m pip install build wheel auditwheel 108 | python -m pip install -r requirements.txt 109 | ' 110 | 111 | - name: Build with native CMake HIP support 112 | env: 113 | XLLAMACPP_BUILD_HIP: "1" 114 | VERSIONEER_CLOSEST_TAG_ONLY: "1" 115 | VERBOSE: "1" 116 | run: | 117 | docker exec -e XLLAMACPP_BUILD_HIP=$XLLAMACPP_BUILD_HIP \ 118 | -e VERSIONEER_CLOSEST_TAG_ONLY=$VERSIONEER_CLOSEST_TAG_ONLY \ 119 | -e VERBOSE=$VERBOSE \ 120 | rocm-container bash -l -c ' 121 | python --version 122 | gcc -v 123 | cargo --version 124 | printenv 125 | git config --global --add safe.directory "*" 126 | make 127 | python -m build --wheel 128 | df -ha 129 | echo "Clean up" 130 | rm -rf build 131 | rm -rf thirdparty 132 | df -ha 133 | auditwheel show dist/*.whl 134 | auditwheel repair --plat manylinux_2_35_x86_64 dist/*.whl -w dist 135 | rm dist/*-linux_x86_64.whl 136 | ls -alh dist 137 | ' 138 | 139 | - name: Stop ROCm container and verify wheel files 140 | if: always() 141 | run: | 142 | # Always stop and remove the container 143 | docker stop rocm-container || true 144 | docker rm rocm-container || true 145 | 146 | # Check if any wheel files exist in the dist directory 147 | if [ -z "$(ls -A dist/*.whl 2>/dev/null)" ]; then 148 | echo "❌ No wheel files found in dist directory!" 
149 | echo "Current directory contents:" 150 | ls -la dist/ 2>/dev/null || echo "No dist directory found" 151 | exit 1 152 | fi 153 | 154 | echo "✅ Wheel files found in dist directory" 155 | 156 | # - uses: actions/upload-artifact@v4 157 | # with: 158 | # name: artifacts 159 | # path: ./dist/*.whl 160 | # overwrite: true 161 | 162 | - uses: softprops/action-gh-release@v2 163 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') 164 | with: 165 | files: dist/* 166 | tag_name: ${{ github.ref_name }}-rocm-${{matrix.hip}} 167 | env: 168 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 169 | 170 | 171 | build_wheels_vulkan: 172 | name: Build Wheel Vulkan ${{ matrix.os }} ${{ matrix.pyver }} 173 | runs-on: ${{ matrix.os }} 174 | strategy: 175 | matrix: 176 | os: ["ubuntu-22.04", "macos-15-intel", "windows-2022"] 177 | pyver: ["3.10", "3.11", "3.12", "3.13"] 178 | vulkan: ["1.4.313.2"] 179 | env: 180 | VULKAN_VERSION: ${{ matrix.vulkan }} 181 | steps: 182 | - name: Clone 183 | id: checkout 184 | uses: actions/checkout@v4 185 | with: 186 | submodules: "recursive" 187 | 188 | - name: Set up Python ${{ matrix.pyver }} 189 | id: setup-python 190 | uses: actions/setup-python@v5 191 | with: 192 | python-version: ${{ matrix.pyver }} 193 | 194 | - name: System Dependencies (Linux) 195 | if: runner.os == 'Linux' 196 | run: | 197 | sudo wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add - 198 | sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list 199 | sudo apt-get update -y 200 | sudo apt-get install -y build-essential git cmake libcurl4-openssl-dev patchelf vulkan-sdk mesa-vulkan-drivers 201 | 202 | - name: System Dependencies (Windows) 203 | if: runner.os == 'Windows' 204 | run: | 205 | curl -L https://github.com/skeeto/w64devkit/releases/download/v1.22.0/w64devkit-1.22.0.zip --output w64devkit.zip 206 | unzip -q w64devkit.zip -d . 
207 | echo "$(pwd)/w64devkit/bin" >> $GITHUB_PATH 208 | curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe" 209 | & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install 210 | Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" 211 | Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" 212 | 213 | - name: System Dependencies (macOS) 214 | if: runner.os == 'macOS' 215 | run: | 216 | # Download and install Vulkan SDK 217 | brew install cmake libomp vulkan-headers glslang molten-vk shaderc vulkan-loader vulkan-tools 218 | vulkaninfo 219 | 220 | - name: Setup Python environment 221 | run: | 222 | python -m pip install --upgrade pip 223 | python -m pip install build wheel 224 | python -m pip install -r requirements.txt 225 | 226 | - name: Install wheel repair tools (Linux) 227 | if: runner.os == 'Linux' 228 | run: python -m pip install auditwheel 229 | 230 | - name: Install wheel repair tools (macOS) 231 | if: runner.os == 'macOS' 232 | run: python -m pip install delocate 233 | 234 | - name: Install wheel repair tools (Windows) 235 | if: runner.os == 'Windows' 236 | run: python -m pip install delvewheel 237 | 238 | - name: Build Wheel 239 | env: 240 | XLLAMACPP_BUILD_VULKAN: "1" 241 | VERSIONEER_CLOSEST_TAG_ONLY: "1" 242 | VERBOSE: "1" 243 | run: | 244 | python --version 245 | printenv 246 | git config --global --add safe.directory "*" 247 | make 248 | python -m build --wheel 249 | 250 | - name: Repair Wheel (Linux) 251 | if: runner.os == 'Linux' 252 | run: | 253 | auditwheel show dist/*.whl 254 | auditwheel repair --plat manylinux_2_35_x86_64 dist/*.whl -w dist 255 | rm dist/*-linux_x86_64.whl 256 | ls -alh dist 257 | 258 | - name: Repair Wheel (macOS) 259 | if: runner.os == 'macOS' 260 | run: | 261 | delocate-wheel -v dist/*.whl 262 | ls -alh dist 263 | 264 | - name: Repair Wheel (Windows) 265 | if: runner.os == 'Windows' 266 | run: | 267 | python -m delvewheel repair dist/*.whl -w dist 268 | dir dist 269 | 270 | # - uses: actions/upload-artifact@v4 271 | # with: 272 | # name: artifacts-${{ runner.os }} 273 | # path: ./dist/*.whl 274 | # overwrite: true 275 | 276 | - uses: softprops/action-gh-release@v2 277 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') 278 | with: 279 | files: dist/* 280 | tag_name: ${{ github.ref_name }}-vulkan-${{ matrix.os }} 281 | env: 282 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 283 | 284 | build_wheels_cuda_linux: 285 | name: Build Wheel CUDA Linux ${{ matrix.platform }} ${{ matrix.pyver }} ${{ matrix.cuda }} 286 | runs-on: ${{ matrix.platform }} 287 | strategy: 288 | matrix: 289 | platform: ["ubuntu-22.04", "ubuntu-22.04-arm"] 290 | pyver: ["3.10", "3.11", "3.12", "3.13"] 291 | cuda: ["12.4.1", "12.8.1"] 292 | env: 293 | CUDAVER: ${{ matrix.cuda }} 294 | 295 | steps: 296 | - name: Free Disk Space (Ubuntu) 297 | uses: jlumbroso/free-disk-space@main 298 | with: 299 | tool-cache: false 300 | android: true 301 | dotnet: true 302 | haskell: true 303 | large-packages: false 304 | docker-images: true 305 | swap-storage: true 306 | 307 | - name: Clone 308 | id: checkout 309 | uses: actions/checkout@v4 310 | with: 311 | submodules: "recursive" 312 | fetch-depth: 0 313 | 314 | - name: Setup Python 315 | uses: actions/setup-python@v5 316 | with: 317 | python-version: ${{ matrix.pyver }} 318 | cache: 'pip' 319 | 320 | - name: Setup Mamba 321 
| uses: conda-incubator/setup-miniconda@v3.1.1 322 | with: 323 | activate-environment: "llamacpp" 324 | python-version: ${{ matrix.pyver }} 325 | miniforge-version: latest 326 | add-pip-as-python-dependency: true 327 | auto-activate-base: false 328 | 329 | - name: Install Dependencies 330 | env: 331 | MAMBA_DOWNLOAD_FAILFAST: "0" 332 | MAMBA_NO_LOW_SPEED_LIMIT: "1" 333 | run: | 334 | # Echo glibc version 335 | ldd --version 336 | 337 | # First install basic build tools 338 | sudo apt-get update 339 | sudo apt-get install -y build-essential 340 | 341 | # Initialize conda for shell 342 | source $CONDA/etc/profile.d/conda.sh 343 | 344 | # Activate the conda environment 345 | conda activate llamacpp 346 | 347 | echo "CONDA_PREFIX after activation: $CONDA_PREFIX" 348 | 349 | # Try different CUDA package names and channels 350 | echo "Attempting to install CUDA ${{ matrix.cuda }}..." 351 | 352 | # Install using the cuda meta-package from the official channel 353 | mamba install -y -c conda-forge 'cuda==${{ matrix.cuda }}' 354 | echo "Successfully installed CUDA ${{ matrix.cuda }}" 355 | 356 | # Verify CONDA_PREFIX is set 357 | if [ -z "$CONDA_PREFIX" ]; then 358 | echo "ERROR: CONDA_PREFIX is not set after conda activation" 359 | exit 1 360 | fi 361 | 362 | # Install build dependencies 363 | python -m pip install build wheel 364 | python -m pip install -r requirements.txt 365 | 366 | # Verify CUDA installation 367 | echo "=== CUDA Installation Check ===" 368 | echo "CONDA_PREFIX: $CONDA_PREFIX" 369 | echo "=== CUDA Files in CONDA_PREFIX ===" 370 | find $CONDA_PREFIX -name "*cuda*" -o -name "*nvcc*" | sort 371 | echo "=== NVCC Version ===" 372 | which nvcc || echo "nvcc not found in PATH" 373 | nvcc --version || echo "nvcc version check failed" 374 | 375 | # Set CUDA environment variables 376 | echo "=== Setting CUDA Environment Variables ===" 377 | export CUDA_HOME=$CONDA_PREFIX 378 | export CUDA_PATH=$CONDA_PREFIX 379 | export CUDA_TOOLKIT_ROOT_DIR=$CONDA_PREFIX 380 | export LD_LIBRARY_PATH="$CONDA_PREFIX/lib:$LD_LIBRARY_PATH" 381 | export PATH="$CONDA_PREFIX/bin:$PATH" 382 | 383 | # Save to GITHUB_ENV for subsequent steps 384 | echo "CUDA_HOME=$CUDA_HOME" >> $GITHUB_ENV 385 | echo "CUDA_PATH=$CUDA_PATH" >> $GITHUB_ENV 386 | echo "CUDA_TOOLKIT_ROOT_DIR=$CUDA_TOOLKIT_ROOT_DIR" >> $GITHUB_ENV 387 | echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> $GITHUB_ENV 388 | echo "PATH=$PATH" >> $GITHUB_ENV 389 | 390 | - name: Verify CUDA Version 391 | run: | 392 | source $CONDA/etc/profile.d/conda.sh 393 | conda activate llamacpp 394 | nvcc --version | grep "release $(echo ${{ matrix.cuda }} | cut -d. 
-f1,2)" 395 | 396 | - name: Build Wheel 397 | run: | 398 | # Echo glibc version 399 | ldd --version 400 | 401 | # Initialize conda for shell 402 | source $CONDA/etc/profile.d/conda.sh 403 | conda activate llamacpp 404 | 405 | echo "=== Build Environment ===" 406 | echo "System: ${{ matrix.platform }}" 407 | echo "Python: $(which python)" 408 | echo "Pip: $(which pip)" 409 | pip list 410 | echo "CUDA_HOME: $CUDA_HOME" 411 | echo "PATH: $PATH" 412 | echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH" 413 | echo "CONDA_PREFIX: $CONDA_PREFIX" 414 | which nvcc 415 | nvcc --version 416 | 417 | # Set build environment 418 | cuda_version=${CUDAVER//./} 419 | cuda_version=${cuda_version:0:${#cuda_version}-1} 420 | 421 | # Export build variables 422 | export XLLAMACPP_BUILD_CUDA=1 423 | export VERSIONEER_CLOSEST_TAG_ONLY=1 424 | export VERBOSE=1S 425 | 426 | make 427 | python -m build --wheel 428 | 429 | echo "CUDA_VERSION=$cuda_version" >> $GITHUB_ENV 430 | 431 | - uses: softprops/action-gh-release@v2 432 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') 433 | with: 434 | files: dist/* 435 | tag_name: ${{ github.ref_name }}-cu${{ env.CUDA_VERSION }} 436 | env: 437 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 438 | 439 | build_wheels_cuda_windows: 440 | name: Build Wheel CUDA Windows ${{ matrix.pyver }} ${{ matrix.cuda }} 441 | runs-on: windows-2022 442 | strategy: 443 | matrix: 444 | pyver: ["3.10", "3.11", "3.12", "3.13"] 445 | cuda: ["12.4.1", "12.8.1"] 446 | defaults: 447 | run: 448 | shell: bash 449 | env: 450 | CUDAVER: ${{ matrix.cuda }} 451 | 452 | steps: 453 | - name: Clone 454 | id: checkout 455 | uses: actions/checkout@v4 456 | with: 457 | submodules: "recursive" 458 | fetch-depth: 0 459 | 460 | - name: Setup Python 461 | uses: actions/setup-python@v5 462 | with: 463 | python-version: ${{ matrix.pyver }} 464 | cache: 'pip' 465 | 466 | - name: Install python dependencies 467 | run: | 468 | python -m pip install --upgrade pip 469 | python -m pip install build wheel delvewheel 470 | python -m pip install -r requirements.txt 471 | 472 | - name: Download and install win64devkit 473 | run: | 474 | curl -L https://github.com/skeeto/w64devkit/releases/download/v1.22.0/w64devkit-1.22.0.zip --output w64devkit.zip 475 | unzip -q w64devkit.zip -d . 
476 | 477 | - name: Add w64devkit to PATH 478 | run: | 479 | echo "$(pwd)/w64devkit/bin" >> $GITHUB_PATH 480 | 481 | - name: Setup CUDA 482 | uses: Jimver/cuda-toolkit@v0.2.24 483 | id: cuda-toolkit 484 | with: 485 | use-github-cache: false 486 | cuda: ${{ matrix.cuda }} 487 | 488 | - name: Build Wheel 489 | run: | 490 | cuda_version=${CUDAVER//./} 491 | cuda_version=${cuda_version:0:${#cuda_version}-1} 492 | 493 | export XLLAMACPP_BUILD_CUDA=1 494 | export VERSIONEER_CLOSEST_TAG_ONLY=1 495 | export VERBOSE=1 496 | 497 | make 498 | python -m build --wheel 499 | 500 | # On Windows, we use delvewheel for wheel repair 501 | python -m delvewheel repair --exclude nvcuda.dll dist/*.whl -w dist 502 | 503 | echo "CUDA_VERSION=$cuda_version" >> $GITHUB_ENV 504 | 505 | - uses: softprops/action-gh-release@v2 506 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') 507 | with: 508 | files: dist/* 509 | tag_name: ${{ github.ref_name }}-cu${{ env.CUDA_VERSION }} 510 | env: 511 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 512 | -------------------------------------------------------------------------------- /src/llama.cpp/include/ggml-backend.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "ggml.h" 4 | #include "ggml-alloc.h" 5 | 6 | #ifdef GGML_BACKEND_SHARED 7 | # if defined(_WIN32) && !defined(__MINGW32__) 8 | # ifdef GGML_BACKEND_BUILD 9 | # define GGML_BACKEND_API __declspec(dllexport) extern 10 | # else 11 | # define GGML_BACKEND_API __declspec(dllimport) extern 12 | # endif 13 | # else 14 | # define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern 15 | # endif 16 | #else 17 | # define GGML_BACKEND_API extern 18 | #endif 19 | 20 | #ifdef __cplusplus 21 | extern "C" { 22 | #endif 23 | 24 | typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; 25 | typedef struct ggml_backend_buffer * ggml_backend_buffer_t; 26 | typedef struct ggml_backend_event * ggml_backend_event_t; 27 | typedef struct ggml_backend * ggml_backend_t; 28 | typedef void * ggml_backend_graph_plan_t; 29 | typedef struct ggml_backend_reg * ggml_backend_reg_t; 30 | typedef struct ggml_backend_device * ggml_backend_dev_t; 31 | 32 | 33 | // 34 | // Backend buffer type 35 | // 36 | 37 | GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft); 38 | GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); 39 | GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); 40 | GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft); 41 | GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); 42 | GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); 43 | GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft); 44 | 45 | // 46 | // Backend buffer 47 | // 48 | 49 | enum ggml_backend_buffer_usage { 50 | GGML_BACKEND_BUFFER_USAGE_ANY = 0, 51 | GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1, 52 | GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2, 53 | }; 54 | 55 | GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer); 56 | GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); 57 | GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); 58 | GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); 59 | GGML_API enum ggml_status 
ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); 60 | GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); 61 | GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer); 62 | GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor); 63 | GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); 64 | GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); 65 | GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); 66 | GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer); 67 | GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer); 68 | GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer); 69 | 70 | // tensor copy between different backends 71 | GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst); 72 | 73 | // 74 | // Backend (stream) 75 | // 76 | 77 | GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend); 78 | GGML_API const char * ggml_backend_name(ggml_backend_t backend); 79 | GGML_API void ggml_backend_free(ggml_backend_t backend); 80 | 81 | GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend); 82 | GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size); 83 | GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend); 84 | GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend); 85 | 86 | GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); 87 | GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); 88 | 89 | // "offset" refers to the offset in tensor->data for setting/getting data 90 | GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); 91 | GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); 92 | GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size); 93 | 94 | GGML_API void ggml_backend_synchronize(ggml_backend_t backend); 95 | 96 | GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph); 97 | GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); 98 | 99 | GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan); 100 | GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); 101 | GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); 102 | 103 | // NOTE: will be removed, use device version instead 104 | GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); 105 | GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft); 106 | GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op); 107 | 108 | // asynchronous copy 109 | // the 
copy is performed after all the currently queued operations in backend_src 110 | // backend_dst will wait for the copy to complete before performing other operations 111 | // automatic fallback to sync copy if async is not supported 112 | GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst); 113 | 114 | GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend); 115 | 116 | // 117 | // Events 118 | // 119 | 120 | GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device); 121 | GGML_API void ggml_backend_event_free(ggml_backend_event_t event); 122 | GGML_API void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend); 123 | GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event); 124 | GGML_API void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event); 125 | 126 | // 127 | // Backend device 128 | // 129 | 130 | enum ggml_backend_dev_type { 131 | // CPU device using system memory 132 | GGML_BACKEND_DEVICE_TYPE_CPU, 133 | // GPU device using dedicated memory 134 | GGML_BACKEND_DEVICE_TYPE_GPU, 135 | // integrated GPU device using host memory 136 | GGML_BACKEND_DEVICE_TYPE_IGPU, 137 | // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX) 138 | GGML_BACKEND_DEVICE_TYPE_ACCEL 139 | }; 140 | 141 | // functionality supported by the device 142 | struct ggml_backend_dev_caps { 143 | // asynchronous operations 144 | bool async; 145 | // pinned host buffer 146 | bool host_buffer; 147 | // creating buffers from host ptr 148 | bool buffer_from_host_ptr; 149 | // event synchronization 150 | bool events; 151 | }; 152 | 153 | // all the device properties 154 | struct ggml_backend_dev_props { 155 | // device name 156 | const char * name; 157 | // device description 158 | const char * description; 159 | // device free memory in bytes 160 | size_t memory_free; 161 | // device total memory in bytes 162 | size_t memory_total; 163 | // device type 164 | enum ggml_backend_dev_type type; 165 | // device id 166 | // for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. 
"0000:01:00.0") 167 | // if the id is unknown, this should be NULL 168 | const char * device_id; 169 | // device capabilities 170 | struct ggml_backend_dev_caps caps; 171 | }; 172 | 173 | GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device); 174 | GGML_API const char * ggml_backend_dev_description(ggml_backend_dev_t device); 175 | GGML_API void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total); 176 | GGML_API enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device); 177 | GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props); 178 | GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device); 179 | GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params); 180 | GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device); 181 | GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device); 182 | GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size); 183 | 184 | GGML_API bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op); 185 | GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft); 186 | GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op); 187 | 188 | // 189 | // Backend (reg) 190 | // 191 | 192 | GGML_API const char * ggml_backend_reg_name(ggml_backend_reg_t reg); 193 | GGML_API size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg); 194 | GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index); 195 | GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name); 196 | 197 | // Common functions that may be obtained using ggml_backend_reg_get_proc_address 198 | 199 | // Split buffer type for tensor parallelism 200 | typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split); 201 | // Set the number of threads for the backend 202 | typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads); 203 | // Get additional buffer types provided by the device (returns a NULL-terminated array) 204 | typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device); 205 | // Set the abort callback for the backend 206 | typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data); 207 | // Get a list of feature flags supported by the backend (returns a NULL-terminated array) 208 | struct ggml_backend_feature { 209 | const char * name; 210 | const char * value; 211 | }; 212 | typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg); 213 | 214 | // 215 | // Backend registry 216 | // 217 | 218 | GGML_API void ggml_backend_register(ggml_backend_reg_t reg); 219 | 220 | GGML_API void ggml_backend_device_register(ggml_backend_dev_t device); 221 | 222 | // Backend (reg) enumeration 223 | GGML_API size_t ggml_backend_reg_count(void); 224 | GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index); 225 | GGML_API ggml_backend_reg_t ggml_backend_reg_by_name(const char * name); 226 | 227 | // Device enumeration 228 | GGML_API size_t 
ggml_backend_dev_count(void);
229 | GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index);
230 | GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
231 | GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);
232 | 
233 | // Direct backend (stream) initialization
234 | // = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
235 | GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
236 | // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
237 | GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
238 | // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
239 | GGML_API ggml_backend_t ggml_backend_init_best(void);
240 | 
241 | // Load a backend from a dynamic library and register it
242 | GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
243 | // Unload a backend if loaded dynamically and unregister it
244 | GGML_API void ggml_backend_unload(ggml_backend_reg_t reg);
245 | // Load all known backends from dynamic libraries
246 | GGML_API void ggml_backend_load_all(void);
247 | GGML_API void ggml_backend_load_all_from_path(const char * dir_path);
248 | 
249 | //
250 | // Backend scheduler
251 | //
252 | 
253 | // The backend scheduler allows for multiple backend devices to be used together
254 | // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
255 | // The backends are selected based on:
256 | // - the backend that supports the operation
257 | // - the location of the pre-allocated tensors (e.g. the weights)
258 | /*
259 | Example usage:
260 | 
261 | // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
262 | // preferably to run on the same backend as the buffer
263 | ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
264 | 
265 | sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true);
266 | 
267 | // initialize buffers from a max size graph (optional)
268 | reserve_graph = build_graph(sched, max_batch_size);
269 | 
270 | // manually assign nodes to a backend (optional, should not be needed in most cases)
271 | struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
272 | ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);
273 | 
274 | ggml_backend_sched_reserve(sched, reserve_graph);
275 | 
276 | // compute
277 | graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
278 | for (int i = 0; i < 10; ++i) {
279 | ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
280 | }
281 | 
282 | // if there are graph inputs:
283 | graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called)
284 | ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
285 | ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
286 | ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
287 | ggml_backend_sched_graph_compute(sched, graph); // execute the graph
288 | 
289 | // as an alternative to the above it is also possible to assign the inputs to a dedicated context and
290 | // allocate them statically via ggml_backend_alloc_ctx_tensors
291 | }
292 | */
293 | 
294 | typedef struct ggml_backend_sched * ggml_backend_sched_t;
295 | 
296 | // Evaluation callback for each node in the graph (set with ggml_backend_sched_set_eval_callback)
297 | // when ask == true, the scheduler wants to know if the user wants to observe this node
298 | // this allows the scheduler to batch nodes together in order to evaluate them in a single call
299 | //
300 | // when ask == false, the scheduler is passing the node tensor to the user for observation
301 | // if the user returns false, the scheduler will cancel the graph compute
302 | //
303 | typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
304 | 
305 | // Initialize a backend scheduler, backends with low index are given priority over backends with high index
306 | GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
307 | GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
308 | 
309 | // Initialize backend buffers from a measure graph
310 | GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
311 | 
312 | GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
313 | GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
314 | 
315 | // Get the number of splits of the last graph
316 | GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
317 | GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
318 | 
319 | GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
320 | GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
321 | 
322 | GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
323 | GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
324 | 
325 | // Split graph without allocating it
326 | GGML_API void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
327 | 
328 | // Allocate and compute graph on the backend scheduler
329 | GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
330 | GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
331 | GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
332 | GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
333 | 
334 | // Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
335 | // This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
336 | // The correct way to use this API is to discard the deallocated tensors and create new ones.
337 | GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
338 | 
339 | // Set a callback to be called for each resulting node during graph compute
340 | GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
341 | 
342 | //
343 | // Utils
344 | //
345 | 
346 | struct ggml_backend_graph_copy {
347 |   ggml_backend_buffer_t buffer;
348 |   struct ggml_context * ctx_allocated;
349 |   struct ggml_context * ctx_unallocated;
350 |   struct ggml_cgraph * graph;
351 | };
352 | 
353 | // Copy a graph to a different backend
354 | GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
355 | GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
356 | 
357 | typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
358 | 
359 | // Compare the output of two backends
360 | GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node);
361 | 
362 | // Tensor initialization
363 | GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
364 | GGML_API enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor);
365 | 
366 | // CPU buffer types are always available
367 | GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
368 | GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
369 | 
370 | #ifdef __cplusplus
371 | }
372 | #endif
373 | 
--------------------------------------------------------------------------------
/src/xllamacpp/server.cpp:
--------------------------------------------------------------------------------
1 | #include "json-schema-to-grammar.h"
2 | #include "server-context.h"
3 | #include "server-http.h"
4 | #include "server-models.h"
5 | 
6 | #include "arg.h"
7 | #include "common.h"
8 | #include "llama.h"
9 | #include "log.h"
10 | 
11 | #include
12 | #include
13 | #include
14 | #include <thread> // for std::thread::hardware_concurrency
15 | 
16 | #if defined(_WIN32)
17 | #include <windows.h>
18 | #endif
19 | 
20 | static std::function<void(int)> shutdown_handler;
21 | static std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;
22 | 
23 | static inline void signal_handler(int signal) {
24 |   if (is_terminating.test_and_set()) {
25 |     // in case it hangs, we can force-terminate the server by hitting Ctrl+C
26 |     // twice. This is for better developer experience; we can remove it when the
27 |     // server is stable enough.
28 |     fprintf(stderr, "Received second interrupt, terminating immediately.\n");
29 |     exit(1);
30 |   }
31 | 
32 |   shutdown_handler(signal);
33 | }
34 | 
35 | // wrapper function that handles exceptions and logs errors
36 | // this is to make sure handler_t never throws exceptions; instead, it returns
37 | // an error response
38 | static server_http_context::handler_t
39 | ex_wrapper(server_http_context::handler_t func) {
40 |   return [func = std::move(func)](
41 |              const server_http_req &req) -> server_http_res_ptr {
42 |     std::string message;
43 |     error_type error;
44 |     try {
45 |       return func(req);
46 |     } catch (const std::invalid_argument &e) {
47 |       // treat invalid_argument as invalid request (400)
48 |       error = ERROR_TYPE_INVALID_REQUEST;
49 |       message = e.what();
50 |     } catch (const std::exception &e) {
51 | // treat other exceptions as server error (500) 52 | error = ERROR_TYPE_SERVER; 53 | message = e.what(); 54 | } catch (...) { 55 | error = ERROR_TYPE_SERVER; 56 | message = "unknown error"; 57 | } 58 | 59 | auto res = std::make_unique(); 60 | res->status = 500; 61 | try { 62 | json error_data = format_error_response(message, error); 63 | res->status = json_value(error_data, "code", 500); 64 | res->data = safe_json_to_str({{"error", error_data}}); 65 | SRV_WRN("got exception: %s\n", res->data.c_str()); 66 | } catch (const std::exception &e) { 67 | SRV_ERR("got another exception: %s | while handling exception: %s\n", 68 | e.what(), message.c_str()); 69 | res->data = "Internal Server Error"; 70 | } 71 | return res; 72 | }; 73 | } 74 | 75 | static void init(common_params ¶ms, server_context &ctx_server, 76 | std::string &listening_address, std::promise out) { 77 | common_log_set_verbosity_thold(params.verbosity); 78 | 79 | // TODO: should we have a separate n_parallel parameter for the server? 80 | // https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177 81 | // TODO: this is a common configuration that is suitable for most local use 82 | // cases 83 | // however, overriding the parameters is a bit confusing - figure out 84 | // something more intuitive 85 | if (params.n_parallel == 1 && params.kv_unified == false && 86 | !params.has_speculative()) { 87 | LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true (add -kvu to " 88 | "disable this)\n", 89 | __func__); 90 | 91 | params.n_parallel = 4; 92 | params.kv_unified = true; 93 | } 94 | 95 | // for consistency between server router mode and single-model mode, we set 96 | // the same model name as alias 97 | if (params.model_alias.empty() && !params.model.name.empty()) { 98 | params.model_alias = params.model.name; 99 | } 100 | 101 | common_init(); 102 | llama_backend_init(); 103 | llama_numa_init(params.numa); 104 | 105 | LOG_INF( 106 | "system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", 107 | params.cpuparams.n_threads, params.cpuparams_batch.n_threads, 108 | std::thread::hardware_concurrency()); 109 | LOG_INF("\n"); 110 | LOG_INF("%s\n", common_params_get_system_info(params).c_str()); 111 | LOG_INF("\n"); 112 | 113 | server_http_context ctx_http; 114 | if (!ctx_http.init(params)) { 115 | LOG_ERR("%s: failed to initialize HTTP server\n", __func__); 116 | out.set_value(1); 117 | return; 118 | } 119 | 120 | // 121 | // Router 122 | // 123 | 124 | // register API routes 125 | server_routes routes(params, ctx_server, 126 | [&ctx_http]() { return ctx_http.is_ready.load(); }); 127 | 128 | constexpr bool is_router_server = false; 129 | std::optional models_routes{}; 130 | if (is_router_server) { 131 | // setup server instances manager 132 | models_routes.emplace(params, 0, nullptr, nullptr); 133 | 134 | // proxy handlers 135 | // note: routes.get_health stays the same 136 | routes.get_metrics = models_routes->proxy_get; 137 | routes.post_props = models_routes->proxy_post; 138 | routes.get_api_show = models_routes->proxy_get; 139 | routes.post_completions = models_routes->proxy_post; 140 | routes.post_completions_oai = models_routes->proxy_post; 141 | routes.post_chat_completions = models_routes->proxy_post; 142 | routes.post_anthropic_messages = models_routes->proxy_post; 143 | routes.post_anthropic_count_tokens = models_routes->proxy_post; 144 | routes.post_infill = models_routes->proxy_post; 145 | routes.post_embeddings = models_routes->proxy_post; 146 | routes.post_embeddings_oai = 
models_routes->proxy_post; 147 | routes.post_rerank = models_routes->proxy_post; 148 | routes.post_tokenize = models_routes->proxy_post; 149 | routes.post_detokenize = models_routes->proxy_post; 150 | routes.post_apply_template = models_routes->proxy_post; 151 | routes.get_lora_adapters = models_routes->proxy_get; 152 | routes.post_lora_adapters = models_routes->proxy_post; 153 | routes.get_slots = models_routes->proxy_get; 154 | routes.post_slots = models_routes->proxy_post; 155 | 156 | // custom routes for router 157 | routes.get_props = models_routes->get_router_props; 158 | routes.get_models = models_routes->get_router_models; 159 | ctx_http.post("/models/load", 160 | ex_wrapper(models_routes->post_router_models_load)); 161 | ctx_http.post("/models/unload", 162 | ex_wrapper(models_routes->post_router_models_unload)); 163 | ctx_http.post("/models/status", 164 | ex_wrapper(models_routes->post_router_models_status)); 165 | } 166 | 167 | ctx_http.get( 168 | "/health", 169 | ex_wrapper(routes.get_health)); // public endpoint (no API key check) 170 | ctx_http.get( 171 | "/v1/health", 172 | ex_wrapper(routes.get_health)); // public endpoint (no API key check) 173 | ctx_http.get("/metrics", ex_wrapper(routes.get_metrics)); 174 | ctx_http.get("/props", ex_wrapper(routes.get_props)); 175 | ctx_http.post("/props", ex_wrapper(routes.post_props)); 176 | ctx_http.post("/api/show", ex_wrapper(routes.get_api_show)); 177 | ctx_http.get( 178 | "/models", 179 | ex_wrapper(routes.get_models)); // public endpoint (no API key check) 180 | ctx_http.get( 181 | "/v1/models", 182 | ex_wrapper(routes.get_models)); // public endpoint (no API key check) 183 | ctx_http.get( 184 | "/api/tags", 185 | ex_wrapper(routes.get_models)); // ollama specific endpoint. public 186 | // endpoint (no API key check) 187 | ctx_http.post("/completion", ex_wrapper(routes.post_completions)); // legacy 188 | ctx_http.post("/completions", ex_wrapper(routes.post_completions)); 189 | ctx_http.post("/v1/completions", ex_wrapper(routes.post_completions_oai)); 190 | ctx_http.post("/chat/completions", ex_wrapper(routes.post_chat_completions)); 191 | ctx_http.post("/v1/chat/completions", 192 | ex_wrapper(routes.post_chat_completions)); 193 | ctx_http.post( 194 | "/api/chat", 195 | ex_wrapper(routes.post_chat_completions)); // ollama specific endpoint 196 | ctx_http.post( 197 | "/v1/messages", 198 | ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API 199 | ctx_http.post( 200 | "/v1/messages/count_tokens", 201 | ex_wrapper( 202 | routes.post_anthropic_count_tokens)); // anthropic token counting 203 | ctx_http.post("/infill", ex_wrapper(routes.post_infill)); 204 | ctx_http.post("/embedding", ex_wrapper(routes.post_embeddings)); // legacy 205 | ctx_http.post("/embeddings", ex_wrapper(routes.post_embeddings)); 206 | ctx_http.post("/v1/embeddings", ex_wrapper(routes.post_embeddings_oai)); 207 | ctx_http.post("/rerank", ex_wrapper(routes.post_rerank)); 208 | ctx_http.post("/reranking", ex_wrapper(routes.post_rerank)); 209 | ctx_http.post("/v1/rerank", ex_wrapper(routes.post_rerank)); 210 | ctx_http.post("/v1/reranking", ex_wrapper(routes.post_rerank)); 211 | ctx_http.post("/tokenize", ex_wrapper(routes.post_tokenize)); 212 | ctx_http.post("/detokenize", ex_wrapper(routes.post_detokenize)); 213 | ctx_http.post("/apply-template", ex_wrapper(routes.post_apply_template)); 214 | // LoRA adapters hotswap 215 | ctx_http.get("/lora-adapters", ex_wrapper(routes.get_lora_adapters)); 216 | ctx_http.post("/lora-adapters", 
ex_wrapper(routes.post_lora_adapters)); 217 | // Save & load slots 218 | ctx_http.get("/slots", ex_wrapper(routes.get_slots)); 219 | ctx_http.post("/slots/:id_slot", ex_wrapper(routes.post_slots)); 220 | 221 | // 222 | // Start the server 223 | // 224 | 225 | std::function clean_up; 226 | 227 | if (is_router_server) { 228 | LOG_INF( 229 | "%s: starting router server, no model will be loaded in this process\n", 230 | __func__); 231 | 232 | clean_up = [&models_routes]() { 233 | SRV_INF("%s: cleaning up before exit...\n", __func__); 234 | if (models_routes.has_value()) { 235 | models_routes->models.unload_all(); 236 | } 237 | llama_backend_free(); 238 | }; 239 | 240 | if (!ctx_http.start()) { 241 | clean_up(); 242 | LOG_ERR("%s: exiting due to HTTP server error\n", __func__); 243 | out.set_value(1); 244 | return; 245 | } 246 | ctx_http.is_ready.store(true); 247 | 248 | shutdown_handler = [&](int) { ctx_http.stop(); }; 249 | 250 | } else { 251 | // setup clean up function, to be called before exit 252 | clean_up = [&ctx_http, &ctx_server]() { 253 | SRV_INF("%s: cleaning up before exit...\n", __func__); 254 | ctx_http.stop(); 255 | ctx_server.terminate(); 256 | llama_backend_free(); 257 | }; 258 | 259 | // start the HTTP server before loading the model to be able to serve 260 | // /health requests 261 | if (!ctx_http.start()) { 262 | clean_up(); 263 | LOG_ERR("%s: exiting due to HTTP server error\n", __func__); 264 | out.set_value(1); 265 | return; 266 | } 267 | 268 | // load the model 269 | LOG_INF("%s: loading model\n", __func__); 270 | 271 | if (!ctx_server.load_model(params)) { 272 | clean_up(); 273 | if (ctx_http.thread.joinable()) { 274 | ctx_http.thread.join(); 275 | } 276 | LOG_ERR("%s: exiting due to model loading error\n", __func__); 277 | out.set_value(1); 278 | return; 279 | } 280 | 281 | ctx_server.init(); 282 | ctx_http.is_ready.store(true); 283 | 284 | LOG_INF("%s: model loaded\n", __func__); 285 | 286 | shutdown_handler = [&](int) { 287 | // this will unblock start_loop() 288 | ctx_server.terminate(); 289 | }; 290 | } 291 | 292 | // TODO: refactor in common/console 293 | #if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) 294 | struct sigaction sigint_action; 295 | sigint_action.sa_handler = signal_handler; 296 | sigemptyset(&sigint_action.sa_mask); 297 | sigint_action.sa_flags = 0; 298 | sigaction(SIGINT, &sigint_action, NULL); 299 | sigaction(SIGTERM, &sigint_action, NULL); 300 | #elif defined(_WIN32) 301 | auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { 302 | return (ctrl_type == CTRL_C_EVENT) ? 
(signal_handler(SIGINT), true) : false; 303 | }; 304 | SetConsoleCtrlHandler( 305 | reinterpret_cast(console_ctrl_handler), true); 306 | #endif 307 | 308 | if (is_router_server) { 309 | LOG_INF("%s: router server is listening on %s\n", __func__, 310 | ctx_http.listening_address.c_str()); 311 | LOG_INF("%s: NOTE: router mode is experimental\n", __func__); 312 | LOG_INF("%s: it is not recommended to use this mode in untrusted " 313 | "environments\n", 314 | __func__); 315 | if (ctx_http.thread.joinable()) { 316 | ctx_http.thread.join(); // keep the main thread alive 317 | } 318 | 319 | // when the HTTP server stops, clean up and exit 320 | clean_up(); 321 | } else { 322 | LOG_INF("%s: server is listening on %s\n", __func__, 323 | ctx_http.listening_address.c_str()); 324 | LOG_INF("%s: starting the main loop...\n", __func__); 325 | 326 | // optionally, notify router server that this instance is ready 327 | const char *router_port = std::getenv("LLAMA_SERVER_ROUTER_PORT"); 328 | std::thread monitor_thread; 329 | if (router_port != nullptr) { 330 | monitor_thread = server_models::setup_child_server( 331 | params, std::atoi(router_port), params.model_alias, shutdown_handler); 332 | } 333 | 334 | // write the listening_address 335 | listening_address = ctx_http.listening_address; 336 | 337 | out.set_value(0); 338 | 339 | // this call blocks the main thread until queue_tasks.terminate() is called 340 | ctx_server.start_loop(); 341 | 342 | clean_up(); 343 | if (ctx_http.thread.joinable()) { 344 | ctx_http.thread.join(); 345 | } 346 | if (monitor_thread.joinable()) { 347 | monitor_thread.join(); 348 | } 349 | // crash during llama_memory_breakdown_print if the model is rerank. 350 | if (params.pooling_type != LLAMA_POOLING_TYPE_RANK) { 351 | llama_memory_breakdown_print(ctx_server.get_llama_context()); 352 | } 353 | } 354 | } 355 | 356 | static void ggml_log_callback_default(enum ggml_log_level level, 357 | const char *text, void *user_data) { 358 | (void)level; 359 | (void)text; 360 | (void)user_data; 361 | // if (level == GGML_LOG_LEVEL_INFO || level == GGML_LOG_LEVEL_ERROR) { 362 | // fputs(text, stderr); 363 | // fflush(stderr); 364 | // } 365 | } 366 | 367 | std::function not_stop = [] { return false; }; 368 | 369 | static std::vector parse_oai_sse(const std::string &sse) { 370 | std::vector out; 371 | 372 | std::size_t start = 0; 373 | while (start < sse.size()) { 374 | std::size_t end = sse.find('\n', start); 375 | if (end == std::string::npos) { 376 | break; 377 | } 378 | 379 | // Empty line = event separator, skip 380 | if (end > start) { 381 | // Guaranteed format: "data: " 382 | out.emplace_back(sse.substr(start + 6, end - start - 6)); 383 | } 384 | 385 | start = end + 1; 386 | } 387 | 388 | return out; 389 | } 390 | 391 | static void 392 | process_handler_response(server_http_res_ptr &response, 393 | std::function res_err, 394 | std::function res_ok) { 395 | static const std::string sse_prefix("data: "); 396 | auto res = response->status == 200 ? 
res_ok : res_err; 397 | if (response->is_stream()) { 398 | std::string chunk; 399 | 400 | while (true) { 401 | const bool has_next = response->next(chunk); 402 | if (!chunk.empty() && chunk.size() >= sse_prefix.size()) { 403 | if (!has_next && chunk == "data: [DONE]\n\n") { 404 | return; 405 | } 406 | auto parsed = parse_oai_sse(chunk); 407 | for (auto &&json_str : parsed) { 408 | if (res(std::move(json_str))) { 409 | return; 410 | } 411 | } 412 | } 413 | if (!has_next) { 414 | return; 415 | } 416 | } 417 | } else { 418 | res(std::move(response->data)); 419 | } 420 | } 421 | 422 | #include "server.h" 423 | 424 | namespace xllamacpp { 425 | 426 | std::string get_system_info() { return llama_print_system_info(); } 427 | 428 | std::vector get_device_info() { 429 | ggml_log_set(ggml_log_callback_default, nullptr); 430 | 431 | const size_t dev_count = ggml_backend_dev_count(); 432 | 433 | std::vector result; 434 | std::vector devs; 435 | std::vector backends; 436 | 437 | for (size_t i = 0; i < dev_count; ++i) { 438 | devs.push_back(ggml_backend_dev_get(i)); 439 | 440 | ggml_backend_t backend = ggml_backend_dev_init(devs[i], NULL); 441 | GGML_ASSERT(backend != NULL); 442 | 443 | auto *reg = ggml_backend_dev_backend_reg(devs[i]); 444 | auto ggml_backend_set_n_threads_fn = 445 | (ggml_backend_set_n_threads_t)ggml_backend_reg_get_proc_address( 446 | reg, "ggml_backend_set_n_threads"); 447 | if (ggml_backend_set_n_threads_fn) { 448 | ggml_backend_set_n_threads_fn(backend, 449 | std::thread::hardware_concurrency() / 2); 450 | } 451 | 452 | backends.push_back(backend); 453 | } 454 | 455 | for (size_t i = 0; i < dev_count; ++i) { 456 | // Put the backend to be tested in front so that it's prioritized: 457 | std::vector backends_modded = {backends[i]}; 458 | backends_modded.insert(backends_modded.end(), backends.begin(), 459 | backends.end()); 460 | 461 | ggml_backend_dev_props prop; 462 | ggml_backend_dev_get_props(devs[i], &prop); 463 | // Avoid crash when converting the prop struct to Python dict by Cython. 
464 | if (prop.device_id == nullptr) { 465 | prop.device_id = ""; 466 | } 467 | 468 | result.push_back(prop); 469 | } 470 | 471 | for (ggml_backend_t backend : backends) { 472 | ggml_backend_free(backend); 473 | } 474 | 475 | return result; 476 | } 477 | 478 | Server::Server(const common_params ¶ms) 479 | : _params(params), _ctx_server(new server_context()) { 480 | std::promise out; 481 | std::future fut = out.get_future(); 482 | _loop_thread = std::thread(init, std::ref(_params), std::ref(*_ctx_server), 483 | std::ref(_listening_address), std::move(out)); 484 | if (fut.get() != 0) { 485 | if (_loop_thread.joinable()) { 486 | _loop_thread.join(); 487 | } 488 | throw std::runtime_error( 489 | "Failed to init server, please check the input params."); 490 | } 491 | _routes = std::make_shared(_params, *_ctx_server, 492 | []() { return true; }); 493 | } 494 | 495 | Server::~Server() { 496 | _ctx_server->terminate(); 497 | LOG_INF("%s: waiting for main loop exit\n", __func__); 498 | if (_loop_thread.joinable()) { 499 | _loop_thread.join(); 500 | } 501 | LOG_INF("%s: main loop exited\n", __func__); 502 | } 503 | 504 | std::string Server::listening_address() const { return _listening_address; } 505 | 506 | std::string Server::handle_metrics() { 507 | server_http_req req{{}, {}, "", "", not_stop}; 508 | auto res = _routes->get_metrics(req); 509 | return res->data; 510 | } 511 | 512 | std::string Server::handle_embeddings(const std::string &input_json_str) { 513 | server_http_req req{{}, {}, "", input_json_str, not_stop}; 514 | auto res = _routes->post_embeddings_oai(req); 515 | return res->data; 516 | } 517 | 518 | std::string Server::handle_rerank(const std::string &input_json_str) { 519 | server_http_req req{{}, {}, "", input_json_str, not_stop}; 520 | auto res = _routes->post_rerank(req); 521 | return res->data; 522 | } 523 | 524 | void Server::handle_completions(const std::string &prompt_json_str, 525 | Callback res_err, void *py_cb_err, 526 | Callback res_ok, void *py_cb_ok) { 527 | server_http_req req{{}, {}, "", prompt_json_str, not_stop}; 528 | auto res = _routes->post_completions_oai(req); 529 | process_handler_response( 530 | res, 531 | [res_err, py_cb_err](std::string &&err) { 532 | return res_err(std::move(err), py_cb_err); 533 | }, 534 | [res_ok, py_cb_ok](std::string &&ok) { 535 | return res_ok(std::move(ok), py_cb_ok); 536 | }); 537 | } 538 | 539 | void Server::handle_chat_completions(const std::string &prompt_json_str, 540 | Callback res_err, void *py_cb_err, 541 | Callback res_ok, void *py_cb_ok) { 542 | server_http_req req{{}, {}, "", prompt_json_str, not_stop}; 543 | auto res = _routes->post_chat_completions(req); 544 | process_handler_response( 545 | res, 546 | [res_err, py_cb_err](std::string &&err) { 547 | return res_err(std::move(err), py_cb_err); 548 | }, 549 | [res_ok, py_cb_ok](std::string &&ok) { 550 | return res_ok(std::move(ok), py_cb_ok); 551 | }); 552 | } 553 | 554 | std::string json_schema_to_grammar_str(const std::string &schema_json_str) { 555 | try { 556 | auto schema = json::parse(schema_json_str); 557 | return json_schema_to_grammar(schema); 558 | } catch (const std::exception &e) { 559 | throw std::runtime_error(std::string("json_schema_to_grammar: ") + 560 | e.what()); 561 | } 562 | } 563 | 564 | // Helper function to parse tensor buffer override strings 565 | void parse_tensor_buffer_overrides( 566 | const std::string &value, 567 | std::vector &overrides) { 568 | std::map buft_list; 569 | for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { 570 | auto *dev 
= ggml_backend_dev_get(i); 571 | auto *buft = ggml_backend_dev_buffer_type(dev); 572 | if (buft) { 573 | buft_list[ggml_backend_buft_name(buft)] = buft; 574 | } 575 | } 576 | 577 | for (const auto &override : string_split(value, ',')) { 578 | std::string::size_type pos = override.find('='); 579 | if (pos == std::string::npos) { 580 | throw std::invalid_argument("invalid value"); 581 | } 582 | std::string tensor_name = override.substr(0, pos); 583 | std::string buffer_type = override.substr(pos + 1); 584 | 585 | if (buft_list.find(buffer_type) == buft_list.end()) { 586 | printf("Available buffer types:\n"); 587 | for (const auto &it : buft_list) { 588 | printf(" %s\n", ggml_backend_buft_name(it.second)); 589 | } 590 | throw std::invalid_argument("unknown buffer type"); 591 | } 592 | // keep strings alive and avoid leaking memory by storing them in a static 593 | // vector 594 | static std::list buft_overrides; 595 | buft_overrides.push_back(tensor_name); 596 | overrides.push_back( 597 | {buft_overrides.back().c_str(), buft_list.at(buffer_type)}); 598 | } 599 | } 600 | 601 | // Helper function to build tensor buffer override strings 602 | void build_tensor_buffer_overrides( 603 | const std::vector &overrides, 604 | std::string &value) { 605 | std::map buft_list; 606 | for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { 607 | auto *dev = ggml_backend_dev_get(i); 608 | auto *buft = ggml_backend_dev_buffer_type(dev); 609 | if (buft) { 610 | buft_list[buft] = ggml_backend_buft_name(buft); 611 | } 612 | } 613 | 614 | std::vector parts; 615 | for (auto &override : overrides) { 616 | std::string ov_str = 617 | std::string(override.pattern) + "=" + buft_list[override.buft]; 618 | parts.emplace_back(ov_str); 619 | } 620 | 621 | value = string_join(parts, ","); 622 | } 623 | 624 | } // namespace xllamacpp 625 | --------------------------------------------------------------------------------
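
Two short C++ sketches follow; they are written for illustration and are not files from the repository. The first mirrors the device-enumeration loop that get_device_info() in src/xllamacpp/server.cpp performs, using only declarations from the bundled ggml-backend.h above; it assumes ggml-backend.h is on the include path and that the program links against ggml:

#include "ggml-backend.h"

#include <cstdio>

int main() {
    // load any dynamically available backends (CPU, CUDA, Metal, ...) before enumerating devices
    ggml_backend_load_all();

    const size_t n_dev = ggml_backend_dev_count();
    for (size_t i = 0; i < n_dev; ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);

        struct ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);

        // name, description, memory_* and caps are the fields declared in ggml_backend_dev_props
        printf("%zu: %s (%s) free=%zu total=%zu async=%d events=%d\n",
               i, props.name, props.description,
               props.memory_free, props.memory_total,
               (int) props.caps.async, (int) props.caps.events);
    }
    return 0;
}

The second sketches the ask/observe protocol of the scheduler evaluation callback documented in ggml-backend.h. The tensor fields t->op and t->name and the GGML_OP_MUL_MAT constant come from ggml.h; sched stands for a scheduler created earlier with ggml_backend_sched_new:

#include "ggml.h"
#include "ggml-backend.h"

#include <cstdio>

// only ask to observe matmul nodes; all other nodes stay batched by the scheduler
static bool observe_mul_mat(struct ggml_tensor * t, bool ask, void * user_data) {
    (void) user_data;
    if (ask) {
        return t->op == GGML_OP_MUL_MAT;
    }
    // ask == false: the node has been computed and its tensor is handed to us
    fprintf(stderr, "computed %s\n", t->name);
    return true; // returning false would cancel the graph compute
}

// registered on an existing scheduler with:
//   ggml_backend_sched_set_eval_callback(sched, observe_mul_mat, NULL);

Answering the ask phase with true only for the ops of interest keeps the scheduler free to batch every other node, which is the behavior the header's comment describes.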