├── .github └── workflows │ └── build-and-push.yml ├── .gitignore ├── LICENSE ├── README.en.md ├── README.md ├── README.zh.md ├── assets ├── inf.png ├── schema-example.flowchart.txt ├── schema-example.png ├── schema.flowchart.txt └── schema.png ├── docker-compose.dist.yml ├── examples ├── completion.sh └── server.sh └── llama.cpp ├── Dockerfile ├── Dockerfile.cuda └── entrypoint.sh /.github/workflows/build-and-push.yml: -------------------------------------------------------------------------------- 1 | name: Check for new releases, build and push containers 2 | on: 3 | workflow_dispatch: 4 | schedule: 5 | - cron: '0 0 * * *' # every day at midnight 6 | 7 | jobs: 8 | compare_tags: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Checkout code 12 | uses: actions/checkout@v2 13 | 14 | - name: Install curl and jq 15 | run: sudo apt-get install curl jq 16 | 17 | - name: Get Docker Hub tags 18 | id: docker_hub_tags 19 | run: | 20 | url="https://registry.hub.docker.com/v2/repositories/${owner}/${repo}/tags/" 21 | result=$(curl -s "$url" | jq '."results"[]["name"]' | grep -v "latest" | grep -v "cuda") 22 | sorted_result=$(echo "$result" | sed 's/\"//g' | sort -V | tr '\n' ',') 23 | echo "::set-output name=result::$sorted_result" 24 | env: 25 | owner: evilfreelancer 26 | repo: llama.cpp-rpc 27 | 28 | - name: Get GitHub Releases 29 | id: github_releases 30 | run: | 31 | url="https://api.github.com/repos/${owner}/${repo}/releases" 32 | result=$(curl -L -s "$url" | jq '.[]["name"]' | grep '^b*') 33 | sorted_result=$(echo "$result" | sed 's/\"//g' | sort -V | tail -n 2 | tr '\n' ',') 34 | echo "::set-output name=result::$sorted_result" 35 | env: 36 | owner: ggerganov 37 | repo: llama.cpp 38 | 39 | - name: Login to Docker Hub 40 | uses: docker/login-action@v3 41 | with: 42 | username: ${{ secrets.DOCKERHUB_USERNAME }} 43 | password: ${{ secrets.DOCKERHUB_TOKEN }} 44 | 45 | - name: Compare tags 46 | run: | 47 | docker_hub_tags="${{ steps.docker_hub_tags.outputs.result }}" 48 | github_releases="${{ steps.github_releases.outputs.result }}" 49 | 50 | IFS=',' read -r -a docker_hub_tags_array <<< "$docker_hub_tags" 51 | IFS=',' read -r -a github_releases_array <<< "$github_releases" 52 | 53 | docker buildx create --name my_builder --driver=docker-container 54 | 55 | # Initialize a variable to track the latest tag 56 | latest_tag="" 57 | 58 | # Build and push images with missed tag 59 | for tag in "${github_releases_array[@]}"; do 60 | if [[ ! " ${docker_hub_tags_array[*]} " =~ " $tag " ]]; then 61 | echo "Missing tag: $tag" 62 | docker buildx build --builder=my_builder --push --platform=linux/amd64 --build-arg LLAMACPP_VERSION=${tag} --tag=${owner}/${repo}:${tag} ./llama.cpp/ 63 | docker buildx build --builder=my_builder --push --platform=linux/amd64 --file ./llama.cpp/Dockerfile.cuda --build-arg LLAMACPP_VERSION=${tag} --tag=${owner}/${repo}:${tag}-cuda ./llama.cpp/ 64 | latest_tag=$tag 65 | fi 66 | done 67 | 68 | # Tagging the latest version as 'latest' 69 | if [ ! 
-z "$latest_tag" ]; then 70 | echo "Tagging the latest version as 'latest': $latest_tag" 71 | docker buildx build --builder=my_builder --push --platform=linux/amd64 --build-arg LLAMACPP_VERSION=${latest_tag} --tag=${owner}/${repo}:latest ./llama.cpp/ 72 | docker buildx build --builder=my_builder --push --platform=linux/amd64 --file ./llama.cpp/Dockerfile.cuda --build-arg LLAMACPP_VERSION=${latest_tag} --tag=${owner}/${repo}:cuda --tag=${owner}/${repo}:latest-cuda ./llama.cpp/ 73 | fi 74 | env: 75 | owner: evilfreelancer 76 | repo: llama.cpp-rpc 77 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea/ 2 | /models/ 3 | /docker-compose.yml 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Pavel Rykov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.en.md: -------------------------------------------------------------------------------- 1 | # llama.cpp RPC-server in Docker 2 | 3 | [Русский](./README.md) | [中文](./README.zh.md) | **English** 4 | 5 | This project is based on [llama.cpp](https://github.com/ggerganov/llama.cpp) and compiles only 6 | the [RPC](https://github.com/ggerganov/llama.cpp/tree/master/examples/rpc) server, along with auxiliary utilities 7 | operating in RPC client mode, which are necessary for implementing distributed inference of Large Language Models (LLMs) 8 | and Embedding Models converted into the GGUF format. 9 | 10 | ## Overview 11 | 12 | The general architecture of an application using the RPC server looks as follows: 13 | 14 | ![schema](./assets/schema.png) 15 | 16 | Instead of `llama-server`, you can use `llama-cli` or `llama-embedding`, which are included in the standard container 17 | package. 18 | 19 | Docker images are built with support for the following architectures: 20 | 21 | * **CPU-only** - amd64, arm64, arm/v7 22 | * **CUDA** - amd64 23 | 24 | Unfortunately, CUDA builds for arm64 fail due to an error, so they are temporarily disabled. 
25 | 26 | ## Environment Variables 27 | 28 | | Name | Default | Description | 29 | |--------------------|--------------------------------------------|--------------------------------------------------------------------------------------------------| 30 | | APP_MODE | backend | Container operation mode, available options: server, backend, and none | 31 | | APP_BIND | 0.0.0.0 | Interface to bind to | 32 | | APP_PORT | `8080` for `server`, `50052` for `backend` | Port number on which the server is running | 33 | | APP_MEM | 1024 | Amount of MiB of RAM available to the client; in CUDA mode, this is the amount of GPU memory | 34 | | APP_RPC_BACKENDS | backend-cuda:50052,backend-cpu:50052 | Comma-separated addresses of backends that the container will try to connect to in `server` mode | 35 | | APP_MODEL | /app/models/TinyLlama-1.1B-q4_0.gguf | Path to the model weights inside the container | 36 | | APP_REPEAT_PENALTY | 1.0 | Repeat penalty | 37 | | APP_GPU_LAYERS | 99 | Number of layers offloaded to the backend | 38 | 39 | ## Example of docker-compose.yml 40 | 41 | In this example, `llama-server` (container `main`) is launched and the 42 | model [TinyLlama-1.1B-q4_0.gguf](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/tree/main), which was 43 | previously downloaded to the `./models` directory located at the same level as `docker-compose.yml`, is initialized. The 44 | `./models` directory is then mounted inside the `main` container and is available at the path `/app/models`. 45 | 46 | ```yaml 47 | version: "3.9" 48 | 49 | services: 50 | 51 | main: 52 | image: evilfreelancer/llama.cpp-rpc:latest 53 | restart: unless-stopped 54 | volumes: 55 | - ./models:/app/models 56 | environment: 57 | # Operation mode (RPC client in API server format) 58 | APP_MODE: server 59 | # Path to the model weights, preloaded inside the container 60 | APP_MODEL: /app/models/TinyLlama-1.1B-q4_0.gguf 61 | # Addresses of the RPC servers the client will interact with 62 | APP_RPC_BACKENDS: backend-cuda:50052,backend-cpu:50052 63 | ports: 64 | - "127.0.0.1:8080:8080" 65 | 66 | backend-cpu: 67 | image: evilfreelancer/llama.cpp-rpc:latest 68 | restart: unless-stopped 69 | environment: 70 | # Operation mode (RPC server) 71 | APP_MODE: backend 72 | # Amount of system RAM available to the RPC server (in Megabytes) 73 | APP_MEM: 2048 74 | 75 | backend-cuda: 76 | image: evilfreelancer/llama.cpp-rpc:latest-cuda 77 | restart: "unless-stopped" 78 | environment: 79 | # Operation mode (RPC server) 80 | APP_MODE: backend 81 | # Amount of GPU memory available to the RPC server (in Megabytes) 82 | APP_MEM: 1024 83 | deploy: 84 | resources: 85 | reservations: 86 | devices: 87 | - driver: nvidia 88 | count: 1 89 | capabilities: [ gpu ] 90 | ``` 91 | 92 | A complete example is available in [docker-compose.dist.yml](./docker-compose.dist.yml). 
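
Before starting the stack, the model weights have to be placed in `./models`. A minimal launch sequence might look like the following; the exact file name on Hugging Face is an assumption and may differ from the local name expected by `APP_MODEL`, hence the rename on download.

```shell
# Fetch the quantized TinyLlama weights (Hugging Face file name may differ)
mkdir -p ./models
wget -O ./models/TinyLlama-1.1B-q4_0.gguf \
  "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_0.gguf"

# Start the API server and both RPC backends
docker compose up -d
```
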
93 | 94 | As a result, we obtain the following diagram: 95 | 96 | ![schema-example](./assets/schema-example.png) 97 | 98 | Once launched, you can make HTTP requests like this: 99 | 100 | ```shell 101 | curl \ 102 | --request POST \ 103 | --url http://localhost:8080/completion \ 104 | --header "Content-Type: application/json" \ 105 | --data '{"prompt": "Building a website can be done in 10 simple steps:"}' 106 | ``` 107 | 108 | ## Manual Docker Build 109 | 110 | Building containers in CPU-only mode: 111 | 112 | ```shell 113 | docker build ./llama.cpp/ 114 | ``` 115 | 116 | Building the container for CUDA: 117 | 118 | ```shell 119 | docker build ./llama.cpp/ --file ./llama.cpp/Dockerfile.cuda 120 | ``` 121 | 122 | Using the build argument `LLAMACPP_VERSION`, you can specify the tag version, branch name, or commit hash to build the 123 | container from. By default, the `master` branch is specified in the container. 124 | 125 | ```shell 126 | # Build the container from the tag https://github.com/ggerganov/llama.cpp/releases/tag/b3700 127 | docker build ./llama.cpp/ --build-arg LLAMACPP_VERSION=b3700 128 | ``` 129 | 130 | ```shell 131 | # Build the container from the master branch 132 | docker build ./llama.cpp/ --build-arg LLAMACPP_VERSION=master 133 | # or simply 134 | docker build ./llama.cpp/ 135 | ``` 136 | 137 | ## Manual Build Using Docker Compose 138 | 139 | An example of docker-compose.yml that performs image building with an explicit tag specification: 140 | 141 | ```yaml 142 | version: "3.9" 143 | 144 | services: 145 | 146 | main: 147 | restart: "unless-stopped" 148 | build: 149 | context: ./llama.cpp 150 | args: 151 | - LLAMACPP_VERSION=b3700 152 | volumes: 153 | - ./models:/app/models 154 | environment: 155 | APP_MODE: none 156 | ports: 157 | - "8080:8080" 158 | 159 | backend: 160 | restart: "unless-stopped" 161 | build: 162 | context: ./llama.cpp 163 | args: 164 | - LLAMACPP_VERSION=b3700 165 | environment: 166 | APP_MODE: backend 167 | ports: 168 | - "50052:50052" 169 | ``` 170 | 171 | ## Links 172 | 173 | - https://github.com/ggerganov/ggml/pull/761 174 | - https://github.com/ggerganov/llama.cpp/issues/7293 175 | - https://github.com/ggerganov/llama.cpp/pull/6829 176 | - https://github.com/ggerganov/llama.cpp/tree/master/examples/rpc 177 | - https://github.com/mudler/LocalAI/commit/fdb45153fed10d8a2c775633e952fdf02de60461 178 | - https://github.com/mudler/LocalAI/pull/2324 179 | - https://github.com/ollama/ollama/issues/4643 180 | 181 | ## Citing 182 | 183 | ```text 184 | [Pavel Rykov]. (2024). llama.cpp RPC-server in Docker. GitHub. 
https://github.com/EvilFreelancer/docker-llama.cpp-rpc 185 | ``` 186 | 187 | ```text 188 | @misc{pavelrykov2024llamacpprpc, 189 | author = {Pavel Rykov}, 190 | title = {llama.cpp RPC-server in Docker}, 191 | year = {2024}, 192 | url = {https://github.com/EvilFreelancer/docker-llama.cpp-rpc} 193 | } 194 | ``` 195 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # llama.cpp RPC-server in Docker 2 | 3 | **Русский** | [中文](./README.zh.md) | [English](./README.en.md) 4 | 5 | Данный проект основан на [llama.cpp](https://github.com/ggerganov/llama.cpp) и компилирует 6 | только [RPC](https://github.com/ggerganov/llama.cpp/tree/master/examples/rpc)-сервер, а так же 7 | вспомогательные утилиты, работающие в режиме RPC-клиента, необходимые для реализации распределённого инференса 8 | конвертированных в GGUF формат Больших Языковых Моделей (БЯМ) и Эмбеддинговых Моделей. 9 | 10 | Образы [evilfreelancer/llama.cpp-rpc](https://hub.docker.com/r/evilfreelancer/llama.cpp-rpc) на Docker Hub. 11 | 12 | ## Обзор 13 | 14 | В общем виде схема приложения с использованием RPC-сервера имеет следующий вид: 15 | 16 | ![schema](./assets/schema.png) 17 | 18 | Вместо `llama-server` можно использовать `llama-cli` или `llama-embedding`, они идут в стандартной поставке контейнера. 19 | 20 | Docker-образы собираются с поддержкой следующих архитектур: 21 | 22 | - **CPU-only** - amd64 23 | - **CUDA** - amd64 24 | 25 | К сожалению сборка под архитектуры arm64 и arm/v7 падает с ошибкой, поэтому они временно отключены. 26 | 27 | ## Переменные окружения 28 | 29 | | Имя | Дефолт | Описание | 30 | |--------------------|------------------------------------------------|----------------------------------------------------------------------------------------------------------------| 31 | | APP_MODE | backend | Режим работы контейнера, доступные варианты: `server`, `backend` и `none` | 32 | | APP_BIND | 0.0.0.0 | Интерфейс на который происходит биндинг | 33 | | APP_PORT | у `server` это `8080`, у `backend` это `50052` | Номер порта на котором запускается сервер | 34 | | APP_MEM | 1024 | Количество Мб оперативной памяти доступной клиенту, в режиме CUDA это количество оперативной памяти видеокарты | 35 | | APP_RPC_BACKENDS | backend-cuda:50052,backend-cpu:50052 | Разделённые запятой адреса бэкендов к которым будет пытаться подключиться контейнер в режиме `server` | 36 | | APP_MODEL | /app/models/TinyLlama-1.1B-q4_0.gguf | Путь к весам модели внутри контейнера | 37 | | APP_REPEAT_PENALTY | 1.0 | Пенальти повторов | 38 | | APP_GPU_LAYERS | 99 | Количество слоёв выгружаемых на бэкенд | 39 | 40 | ## Пример docker-compose.yml 41 | 42 | В данном примере происходит запуск `llama-server` (контейнер `main`) и инициализация 43 | модели [TinyLlama-1.1B-q4_0.gguf](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/tree/main), 44 | которая была заранее загружена в директорию `./models`, расположенную на одном уровне с `docker-compose.yml`. Директория 45 | `./models` в свою очередь монтируется внутрь контейнера `main` и доступна по пути `/app/models`. 
46 | 47 | ```yaml 48 | version: "3.9" 49 | 50 | services: 51 | 52 | main: 53 | image: evilfreelancer/llama.cpp-rpc:latest 54 | restart: unless-stopped 55 | volumes: 56 | - ./models:/app/models 57 | environment: 58 | # Режим работы (RPC-клиент в формате API-сервера) 59 | APP_MODE: server 60 | # Путь до весов, предварительно загруженной модели, внутри контейнера 61 | APP_MODEL: /app/models/TinyLlama-1.1B-q4_0.gguf 62 | # Адреса RPC-серверов с которыми будет взаимодействовать клиент 63 | APP_RPC_BACKENDS: backend-cuda:50052,backend-cpu:50052 64 | ports: 65 | - "127.0.0.1:8080:8080" 66 | 67 | backend-cpu: 68 | image: evilfreelancer/llama.cpp-rpc:latest 69 | restart: unless-stopped 70 | environment: 71 | # Режим работы (RPC-сервер) 72 | APP_MODE: backend 73 | # Количество доступной RPC-серверу системной оперативной памяти (в Мегабайтах) 74 | APP_MEM: 2048 75 | 76 | backend-cuda: 77 | image: evilfreelancer/llama.cpp-rpc:latest-cuda 78 | restart: "unless-stopped" 79 | environment: 80 | # Режим работы (RPC-сервер) 81 | APP_MODE: backend 82 | # Количество доступной RPC-серверу оперативной памяти видеокарты (в Мегабайтах) 83 | APP_MEM: 1024 84 | deploy: 85 | resources: 86 | reservations: 87 | devices: 88 | - driver: nvidia 89 | count: 1 90 | capabilities: [ gpu ] 91 | ``` 92 | 93 | Полный пример в [docker-compose.dist.yml](./docker-compose.dist.yml). 94 | 95 | В результате чего у нас получается следующего вида схема: 96 | 97 | ![schema-example](./assets/schema-example.png) 98 | 99 | После её запуска можно будет делать такого вида HTTP запросы: 100 | 101 | ```shell 102 | curl \ 103 | --request POST \ 104 | --url http://localhost:8080/completion \ 105 | --header "Content-Type: application/json" \ 106 | --data '{"prompt": "Building a website can be done in 10 simple steps:"}' 107 | ``` 108 | 109 | ## Ручная сборка через Docker 110 | 111 | Сборка контейнеров в режиме CPU-only: 112 | 113 | ```shell 114 | docker build ./llama.cpp/ 115 | ``` 116 | 117 | Сборка контейнера под CUDA: 118 | 119 | ```shell 120 | docker build ./llama.cpp/ --file ./llama.cpp/Dockerfile.cuda 121 | ``` 122 | 123 | При помощи аргумента сборки `LLAMACPP_VERSION` можно указать версию тега, название ветки или хеш коммита из которого 124 | требуется выполнить сборку контейнера, по умолчанию в контейнере указана ветка `master`. 125 | 126 | ```shell 127 | # Собрать контейнер из тега https://github.com/ggerganov/llama.cpp/releases/tag/b3700 128 | docker build ./llama.cpp/ --build-arg LLAMACPP_VERSION=b5480 129 | ``` 130 | 131 | ```shell 132 | # Собрать контейнер из ветки master 133 | docker build ./llama.cpp/ --build-arg LLAMACPP_VERSION=master 134 | # или просто 135 | docker build ./llama.cpp/ 136 | ``` 137 | 138 | ## Ручная сборка через Docker Compose 139 | 140 | Пример `docker-compose.yml` который выполняет сборку образа с явным указанием тега. 
141 | 142 | ```yaml 143 | version: "3.9" 144 | 145 | services: 146 | 147 | main: 148 | restart: "unless-stopped" 149 | build: 150 | context: ./llama.cpp 151 | args: 152 | - LLAMACPP_VERSION=b5480 153 | volumes: 154 | - ./models:/app/models 155 | environment: 156 | APP_MODE: none 157 | ports: 158 | - "8080:8080" 159 | 160 | backend: 161 | restart: "unless-stopped" 162 | build: 163 | context: ./llama.cpp 164 | args: 165 | - LLAMACPP_VERSION=b5480 166 | environment: 167 | APP_MODE: backend 168 | ports: 169 | - "50052:50052" 170 | ``` 171 | 172 | ## Ссылки 173 | 174 | - https://github.com/ggerganov/ggml/pull/761 175 | - https://github.com/ggerganov/llama.cpp/issues/7293 176 | - https://github.com/ggerganov/llama.cpp/pull/6829 177 | - https://github.com/ggerganov/llama.cpp/tree/master/examples/rpc 178 | - https://github.com/mudler/LocalAI/commit/fdb45153fed10d8a2c775633e952fdf02de60461 179 | - https://github.com/mudler/LocalAI/pull/2324 180 | - https://github.com/ollama/ollama/issues/4643 181 | 182 | ## Лицензия 183 | 184 | Этот проект лицензирован на условиях лицензии MIT. Подробности в файле [LICENSE](./LICENSE). 185 | 186 | ## Цитирование 187 | 188 | Если вы используете этот проект в своих исследованиях или работе, пожалуйста, укажите ссылку на него следующим образом: 189 | 190 | ```text 191 | [Pavel Rykov]. (2024). llama.cpp RPC-server in Docker. GitHub. https://github.com/EvilFreelancer/docker-llama.cpp-rpc 192 | ``` 193 | 194 | Альтернатива в формате BibTeX: 195 | 196 | ```text 197 | @misc{pavelrykov2024llamacpprpc, 198 | author = {Pavel Rykov}, 199 | title = {llama.cpp RPC-server in Docker}, 200 | year = {2024}, 201 | url = {https://github.com/EvilFreelancer/docker-llama.cpp-rpc} 202 | } 203 | ``` 204 | -------------------------------------------------------------------------------- /README.zh.md: -------------------------------------------------------------------------------- 1 | # llama.cpp RPC服务器在Docker中 2 | 3 | [Русский](./README.md) | **中文** | [English](./README.en.md) 4 | 5 | 该项目基于[llama.cpp](https://github.com/ggerganov/llama.cpp) 6 | ,仅编译RPC服务器以及以[RPC](https://github.com/ggerganov/llama.cpp/tree/master/examples/rpc) 7 | 客户端模式运行的辅助工具,这些工具对于分布式推理转化为GGUF格式的大型语言模型(LLMs)和嵌入模型是必需的。 8 | 9 | ## 概述 10 | 11 | 使用RPC服务器的应用程序的通用架构如下所示: 12 | 13 | ![schema](./assets/schema.png) 14 | 15 | 除了`llama-server`,您还可以使用`llama-cli`或`llama-embedding`,它们都包含在标准的容器包中。 16 | 17 | Docker镜像支持以下架构: 18 | 19 | * 仅CPU - amd64, arm64, arm/v7 20 | * CUDA - amd64 21 | 22 | 不幸的是,CUDA在arm64上的构建由于错误而失败,因此它们暂时被禁用。 23 | 24 | ## 环境变量 25 | 26 | | 名称 | 默认值 | 描述 | 27 | |--------------------|---------------------------------------|----------------------------------------| 28 | | APP_MODE | backend | 容器的操作模式,可用选项:`server`,`backend`和`none` | 29 | | APP_BIND | 0.0.0.0 | 绑定到的接口 | 30 | | APP_PORT | 对于`server`是`8080`,对于`backend`是`50052` | 服务器运行的端口号 | 31 | | APP_MEM | 1024 | 客户端可用的内存量;在CUDA模式下,这是显存量 | 32 | | APP_RPC_BACKENDS | backend-cuda:50052,backend-cpu:50052 | 以逗号分隔的后端地址列表,容器将在server模式下尝试连接这些地址 | 33 | | APP_MODEL | /app/models/TinyLlama-1.1B-q4_0.gguf | 容器内的模型权重路径 | 34 | | APP_REPEAT_PENALTY | 1.0 | 重复惩罚 | 35 | | APP_GPU_LAYERS | 99 | 卸载到后端的层数 | 36 | 37 | ## docker-compose.yml示例 38 | 39 | 在此示例中,`llama-server`(容器`main`)启动并初始化[TinyLlama-1.1B-q4_0.gguf]模型,该模型预先下载到与`docker-compose.yml` 40 | 位于同一级的`./models`目录中。然后将`./models`目录挂载到`main`容器内部,并在路径`/app/models`下可用。 41 | 42 | ```yaml 43 | version: "3.9" 44 | 45 | services: 46 | 47 | main: 48 | image: evilfreelancer/llama.cpp-rpc:latest 49 | restart: unless-stopped 50 | volumes: 51 | - 
./models:/app/models 52 | environment: 53 | # 操作模式(API服务器格式的RPC客户端) 54 | APP_MODE: server 55 | # 容器内部预先加载的模型权重路径 56 | APP_MODEL: /app/models/TinyLlama-1.1B-q4_0.gguf 57 | # 客户端将与之交互的RPC服务器地址 58 | APP_RPC_BACKENDS: backend-cuda:50052,backend-cpu:50052 59 | ports: 60 | - "127.0.0.1:8080:8080" 61 | 62 | backend-cpu: 63 | image: evilfreelancer/llama.cpp-rpc:latest 64 | restart: unless-stopped 65 | environment: 66 | # 操作模式(RPC服务器) 67 | APP_MODE: backend 68 | # RPC服务器可用的系统RAM大小(以MB为单位) 69 | APP_MEM: 2048 70 | 71 | backend-cuda: 72 | image: evilfreelancer/llama.cpp-rpc:latest-cuda 73 | restart: "unless-stopped" 74 | environment: 75 | # 操作模式(RPC服务器) 76 | APP_MODE: backend 77 | # RPC服务器可用的显存大小(以MB为单位) 78 | APP_MEM: 1024 79 | deploy: 80 | resources: 81 | reservations: 82 | devices: 83 | - driver: nvidia 84 | count: 1 85 | capabilities: [ gpu ] 86 | ``` 87 | 88 | 完整示例见[docker-compose.dist.yml](./docker-compose.dist.yml)。 89 | 90 | 结果我们得到如下图所示的架构: 91 | 92 | ![schema-example](./assets/schema-example.png) 93 | 94 | 启动后,可以发送如下的HTTP请求: 95 | 96 | ```shell 97 | curl \ 98 | --request POST \ 99 | --url http://localhost:8080/completion \ 100 | --header "Content-Type: application/json" \ 101 | --data '{"prompt": "Building a website can be done in 10 simple steps:"}' 102 | ``` 103 | 104 | ## 手动通过Docker构建 105 | 106 | 仅CPU模式下的容器构建: 107 | 108 | ```shell 109 | docker build ./llama.cpp/ 110 | ``` 111 | 112 | 针对CUDA的容器构建: 113 | 114 | ```shell 115 | docker build ./llama.cpp/ --file ./llama.cpp/Dockerfile.cuda 116 | ``` 117 | 118 | 通过构建参数LLAMACPP_VERSION,可以指定标记版本、分支名称或提交哈希值以从中构建容器。默认情况下,容器中指定的是master分支。 119 | 120 | ```shell 121 | # 从标记构建容器 https://github.com/ggerganov/llama.cpp/releases/tag/b3700 122 | docker build ./llama.cpp/ --build-arg LLAMACPP_VERSION=b3700 123 | ``` 124 | 125 | ```shell 126 | # 从master分支构建容器 127 | docker build ./llama.cpp/ --build-arg LLAMACPP_VERSION=master 128 | # 或者简单地 129 | docker build ./llama.cpp/ 130 | ``` 131 | 132 | ## 使用Docker Compose手动构建 133 | 134 | 一个执行显式标记指定的镜像构建的docker-compose.yml示例。 135 | 136 | ```yaml 137 | version: "3.9" 138 | 139 | services: 140 | 141 | main: 142 | restart: "unless-stopped" 143 | build: 144 | context: ./llama.cpp 145 | args: 146 | - LLAMACPP_VERSION=b3700 147 | volumes: 148 | - ./models:/app/models 149 | environment: 150 | APP_MODE: none 151 | ports: 152 | - "8080:8080" 153 | 154 | backend: 155 | restart: "unless-stopped" 156 | build: 157 | context: ./llama.cpp 158 | args: 159 | - LLAMACPP_VERSION=b3700 160 | environment: 161 | APP_MODE: backend 162 | ports: 163 | - "50052:50052" 164 | ``` 165 | 166 | ## 链接 167 | 168 | - https://github.com/ggerganov/ggml/pull/761 169 | - https://github.com/ggerganov/llama.cpp/issues/7293 170 | - https://github.com/ggerganov/llama.cpp/pull/6829 171 | - https://github.com/ggerganov/llama.cpp/tree/master/examples/rpc 172 | - https://github.com/mudler/LocalAI/commit/fdb45153fed10d8a2c775633e952fdf02de60461 173 | - https://github.com/mudler/LocalAI/pull/2324 174 | - https://github.com/ollama/ollama/issues/4643 175 | 176 | ## 引用 177 | 178 | ```text 179 | [Pavel Rykov]. (2024). llama.cpp RPC-server in Docker. GitHub. 
https://github.com/EvilFreelancer/docker-llama.cpp-rpc 180 | ``` 181 | 182 | ```text 183 | @misc{pavelrykov2024llamacpprpc, 184 | author = {Pavel Rykov}, 185 | title = {llama.cpp RPC-server in Docker}, 186 | year = {2024}, 187 | url = {https://github.com/EvilFreelancer/docker-llama.cpp-rpc} 188 | } 189 | ``` 190 | -------------------------------------------------------------------------------- /assets/inf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvilFreelancer/docker-llama.cpp-rpc/22e8e5ef54a624cbed0dd1171059e1cc6686811d/assets/inf.png -------------------------------------------------------------------------------- /assets/schema-example.flowchart.txt: -------------------------------------------------------------------------------- 1 | main { 2 | llama-server 3 | TCP: (.connection) 4 | } 5 | 6 | backend-cpu:50052 { 7 | rpc-server .connection 8 | backend \(CPU\) 9 | } 10 | 11 | backend-cuda:50052 { 12 | rpc-server .connection 13 | backend \(CUDA\) 14 | } 15 | -------------------------------------------------------------------------------- /assets/schema-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvilFreelancer/docker-llama.cpp-rpc/22e8e5ef54a624cbed0dd1171059e1cc6686811d/assets/schema-example.png -------------------------------------------------------------------------------- /assets/schema.flowchart.txt: -------------------------------------------------------------------------------- 1 | Main Host { 2 | llama-server 3 | TCP: (.connection) 4 | } 5 | 6 | Host A { 7 | rpc-server .connection 8 | backend\n\(CUDA, CPU, etc.\) 9 | } 10 | 11 | Host B { 12 | rpc-server .connection 13 | backend\n\(CUDA, CPU, etc.\) 14 | } 15 | 16 | Host C { 17 | rpc-server .connection 18 | backend\n\(CUDA, CPU, etc.\) 19 | } 20 | -------------------------------------------------------------------------------- /assets/schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvilFreelancer/docker-llama.cpp-rpc/22e8e5ef54a624cbed0dd1171059e1cc6686811d/assets/schema.png -------------------------------------------------------------------------------- /docker-compose.dist.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | 3 | x-shared-logs: &shared-logs 4 | logging: 5 | driver: "json-file" 6 | options: 7 | max-size: "10k" 8 | 9 | x-shared-deploy: &shared-deploy 10 | deploy: 11 | resources: 12 | reservations: 13 | devices: 14 | - driver: nvidia 15 | count: 1 16 | capabilities: [ gpu ] 17 | <<: *shared-logs 18 | 19 | services: 20 | 21 | main: 22 | image: evilfreelancer/llama.cpp-rpc:latest 23 | restart: "unless-stopped" 24 | #build: 25 | # context: ./llama.cpp 26 | # args: 27 | # - LLAMACPP_VERSION=b5480 28 | volumes: 29 | - ./models:/app/models 30 | environment: 31 | # Режим работы (RPC-клиент в формате API-сервера) 32 | APP_MODE: server 33 | # Путь до весов, предварительно загруженной модели, внутри контейнера 34 | APP_MODEL: /app/models/TinyLlama-1.1B-q4_0.gguf 35 | # Адреса RPC-серверов с которыми будет взаимодействовать клиент 36 | APP_RPC_BACKENDS: backend-cuda:50052,backend-cpu:50052 37 | ports: 38 | - "127.0.0.1:8080:8080" 39 | <<: *shared-logs 40 | 41 | backend-cpu: 42 | image: evilfreelancer/llama.cpp-rpc:latest 43 | restart: "unless-stopped" 44 | #build: 45 | # context: ./llama.cpp 46 | # args: 47 | # - LLAMACPP_VERSION=b5480 
48 | environment: 49 | # Режим работы (RPC-сервер) 50 | APP_MODE: backend 51 | # Количество доступной RPC-серверу системной оперативной памяти (в Мегабайтах) 52 | APP_MEM: 2048 53 | ports: 54 | - "127.0.0.1:50152:50052" 55 | <<: *shared-logs 56 | 57 | backend-cuda: 58 | image: evilfreelancer/llama.cpp-rpc:latest-cuda 59 | restart: "unless-stopped" 60 | #build: 61 | # context: ./llama.cpp 62 | # dockerfile: Dockerfile.cuda 63 | # args: 64 | # - LLAMACPP_VERSION=b5480 65 | environment: 66 | # Режим работы (RPC-сервер) 67 | APP_MODE: backend 68 | # Количество доступной RPC-серверу оперативной памяти видеокарты (в Мегабайтах) 69 | APP_MEM: 1024 70 | ports: 71 | - "127.0.0.1:50252:50052" 72 | <<: *shared-deploy 73 | -------------------------------------------------------------------------------- /examples/completion.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -xe 4 | 5 | curl \ 6 | --request POST \ 7 | --url http://localhost:8080/completion \ 8 | --header "Content-Type: application/json" \ 9 | --data '{"prompt": "Building a website can be done in 10 simple steps:"}' 10 | -------------------------------------------------------------------------------- /examples/server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -xe 4 | 5 | curl \ 6 | --request POST \ 7 | --url http://localhost:8080/embeddings \ 8 | --header "Content-Type: application/json" \ 9 | --data '{"content": "Building a website can be done in 10 simple steps:"}' 10 | -------------------------------------------------------------------------------- /llama.cpp/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:24.04 AS builder 2 | WORKDIR /app 3 | 4 | ARG LLAMACPP_REPO="https://github.com/ggerganov/llama.cpp.git" 5 | # It may be name of branch, tag or commit hash 6 | ARG LLAMACPP_VERSION="master" 7 | 8 | # To get latest tag use this: 9 | # git -c 'versionsort.suffix=-' ls-remote --tags --sort='v:refname' \ 10 | # "https://github.com/ggerganov/llama.cpp.git" 'b*' | \ 11 | # tail --lines=1 | cut --delimiter='/' --fields=3 12 | # For details see here: https://stackoverflow.com/questions/8932389/git-shallow-clone-to-specific-tag) 13 | 14 | # Install dependencies 15 | RUN set -xe \ 16 | && apt update -q \ 17 | && apt install -fyq bash wget git cmake make g++ curl libcurl4-openssl-dev \ 18 | && apt clean 19 | 20 | # Clone repo 21 | RUN set -xe \ 22 | && git clone --branch "$LLAMACPP_VERSION" --depth 1 "$LLAMACPP_REPO" 23 | 24 | # Build binaries 25 | WORKDIR /app/llama.cpp 26 | RUN set -xe \ 27 | && cmake -B build -DGGML_RPC=ON -DCMAKE_BUILD_RPATH_USE_ORIGIN=ON \ 28 | && cmake --build build --config Release -j$(nproc) 29 | 30 | 31 | FROM ubuntu:24.04 32 | WORKDIR /app 33 | 34 | # Install basic dependencies 35 | RUN set -xe \ 36 | && apt update -q \ 37 | && apt install -fyq libgomp1 curl \ 38 | && apt clean 39 | 40 | # Create folders 41 | RUN set -xe \ 42 | && mkdir -pv /app/models 43 | 44 | # Copy compiled tools 45 | COPY --from=builder /app/llama.cpp/build/bin/*.so /usr/lib/x86_64-linux-gnu 46 | COPY --from=builder /app/llama.cpp/build/bin/rpc-server . 47 | COPY --from=builder /app/llama.cpp/build/bin/llama-cli . 48 | COPY --from=builder /app/llama.cpp/build/bin/llama-embedding . 49 | COPY --from=builder /app/llama.cpp/build/bin/llama-server . 50 | 51 | # Init entrypoint 52 | ADD entrypoint.sh . 
53 | ENTRYPOINT ["/app/entrypoint.sh"] 54 | -------------------------------------------------------------------------------- /llama.cpp/Dockerfile.cuda: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_VERSION=24.04 2 | ARG CUDA_VERSION=12.6.3 3 | ARG BASE_CUDA_DEVEL_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} 4 | ARG BASE_CUDA_RUNTIME_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} 5 | 6 | FROM ${BASE_CUDA_DEVEL_CONTAINER} AS builder 7 | WORKDIR /app 8 | 9 | ARG LLAMACPP_REPO="https://github.com/ggerganov/llama.cpp.git" 10 | # It may be name of branch, tag or commit hash 11 | ARG LLAMACPP_VERSION="master" 12 | 13 | # To get latest tag use this: 14 | # git -c 'versionsort.suffix=-' ls-remote --tags --sort='v:refname' \ 15 | # "https://github.com/ggerganov/llama.cpp.git" 'b*' | \ 16 | # tail --lines=1 | cut --delimiter='/' --fields=3 17 | # For details see here: https://stackoverflow.com/questions/8932389/git-shallow-clone-to-specific-tag) 18 | 19 | # Anti-"sanction" fix 20 | #RUN set -xe \ 21 | # && sed -r 's#developer.download.nvidia.com#mirror.yandex.ru/mirrors/developer.download.nvidia.com#g' -i /etc/apt/sources.list.d/cuda-ubuntu2204-x86_64.list 22 | 23 | # Install dependencies 24 | RUN set -xe \ 25 | && apt update -q \ 26 | && apt install -fyq bash wget git cmake make g++ curl libcurl4-openssl-dev \ 27 | && apt clean 28 | 29 | # Clone repo 30 | RUN set -xe \ 31 | && git clone --branch "$LLAMACPP_VERSION" --depth 1 "$LLAMACPP_REPO" 32 | 33 | # Build binaries 34 | WORKDIR /app/llama.cpp 35 | RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 36 | RUN set -xe \ 37 | && export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs/:$LD_LIBRARY_PATH \ 38 | && cmake -B build -DGGML_CUDA=ON -DGGML_RPC=ON -DCMAKE_BUILD_RPATH_USE_ORIGIN=ON \ 39 | && cmake --build build --config Release -j$(nproc) 40 | 41 | 42 | FROM ${BASE_CUDA_RUNTIME_CONTAINER} 43 | WORKDIR /app 44 | 45 | # Install basic dependencies 46 | RUN set -xe \ 47 | && apt update -q \ 48 | && apt install -fyq libgomp1 curl \ 49 | && apt clean 50 | 51 | # Create folders 52 | RUN set -xe \ 53 | && mkdir -pv /app/models 54 | 55 | # Copy compiled tools 56 | COPY --from=builder /app/llama.cpp/build/bin/*.so /usr/lib/x86_64-linux-gnu 57 | COPY --from=builder /app/llama.cpp/build/bin/rpc-server . 58 | COPY --from=builder /app/llama.cpp/build/bin/llama-cli . 59 | COPY --from=builder /app/llama.cpp/build/bin/llama-embedding . 60 | COPY --from=builder /app/llama.cpp/build/bin/llama-server . 61 | 62 | # Init entrypoint 63 | ADD entrypoint.sh . 
64 | ENTRYPOINT ["/app/entrypoint.sh"] 65 | -------------------------------------------------------------------------------- /llama.cpp/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd "$(dirname "$0")" 4 | 5 | # If arguments passed to the script — treat them as custom command 6 | if [ "$#" -gt 0 ]; then 7 | echo && echo "Custom CMD detected, executing: $*" && echo 8 | exec "$@" 9 | fi 10 | 11 | # Default env setup 12 | [ "x$APP_MODE" = "x" ] && export APP_MODE="backend" 13 | [ "x$APP_BIND" = "x" ] && export APP_BIND="0.0.0.0" 14 | [ "x$APP_MEM" = "x" ] && export APP_MEM="1024" 15 | [ "x$APP_MODEL" = "x" ] && export APP_MODEL="/app/models/TinyLlama-1.1B-q4_0.gguf" 16 | [ "x$APP_REPEAT_PENALTY" = "x" ] && export APP_REPEAT_PENALTY="1.0" 17 | [ "x$APP_GPU_LAYERS" = "x" ] && export APP_GPU_LAYERS="99" 18 | [ "x$APP_THREADS" = "x" ] && export APP_THREADS="16" 19 | [ "x$APP_DEVICE" = "x" ] && unset APP_DEVICE 20 | [ "x$APP_CACHE" = "x" ] && export APP_CACHE="false" 21 | [ "x$APP_EMBEDDING" = "x" ] && export APP_EMBEDDING="false" 22 | 23 | # Construct the command with the options 24 | if [ "$APP_MODE" = "backend" ]; then 25 | [ "x$APP_PORT" = "x" ] && export APP_PORT="50052" 26 | # RPC backend 27 | CMD="/app/rpc-server" 28 | CMD+=" --host $APP_BIND" 29 | CMD+=" --port $APP_PORT" 30 | CMD+=" --mem $APP_MEM" 31 | CMD+=" --threads $APP_THREADS" 32 | [ -n "$APP_DEVICE" ] && CMD+=" --device $APP_DEVICE" 33 | [ "$APP_CACHE" = "true" ] && CMD+=" --cache" 34 | elif [ "$APP_MODE" = "server" ]; then 35 | [ "x$APP_PORT" = "x" ] && export APP_PORT="8080" 36 | # API server connected to multipla backends 37 | CMD="/app/llama-server" 38 | CMD+=" --host $APP_BIND" 39 | CMD+=" --port $APP_PORT" 40 | CMD+=" --model $APP_MODEL" 41 | CMD+=" --repeat-penalty $APP_REPEAT_PENALTY" 42 | CMD+=" --gpu-layers $APP_GPU_LAYERS" 43 | [ -n "$APP_RPC_BACKENDS" ] && CMD+=" --rpc $APP_RPC_BACKENDS" 44 | [ "$APP_EMBEDDING" = "true" ] && CMD+=" --embedding" 45 | elif [ "$APP_MODE" = "none" ]; then 46 | # For cases when you want to use /app/llama-cli 47 | echo "APP_MODE is set to none. Sleeping indefinitely." 48 | CMD="sleep inf" 49 | else 50 | echo "Invalid APP_MODE specified: $APP_MODE" 51 | exit 1 52 | fi 53 | 54 | # Execute the command 55 | echo && echo "Executing command: $CMD" && echo 56 | exec $CMD 57 | exit 0 58 | --------------------------------------------------------------------------------