├── Dockerfile
├── README.md
├── docker-compose.yml
├── rkllm-toolkit
│   ├── docker-compose.yml
│   └── model
│       └── test.py
├── rkllm_api_demo
│   └── llm_demo
└── rkllm_server
    ├── README.md
    ├── fix_freq_rk3588.sh
    ├── flask_server.py
    ├── gradio_server.py
    └── lib
        ├── .gitkeep
        └── librkllmrt.so

/Dockerfile:
--------------------------------------------------------------------------------
# Stage 1: build stage
FROM python:3.8-slim as builder

# Avoid interactive prompts during package installation
ENV DEBIAN_FRONTEND=noninteractive

# Update the package index and install the required build tools
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Install Flask, Werkzeug and Gradio (the Gradio constraint is quoted so ">=" is not parsed as a shell redirection)
RUN pip install --no-cache-dir --target=/install flask==2.2.2 Werkzeug==2.2.2 "gradio>=4.24.0" -i https://pypi.tuna.tsinghua.edu.cn/simple

# Stage 2: runtime stage
FROM python:3.8-slim

# Avoid interactive prompts during package installation
ENV DEBIAN_FRONTEND=noninteractive

# Copy the packages installed in the build stage into the final image
COPY --from=builder /install /usr/local/lib/python3.8/site-packages

# Copy the local rkllm_server directory to / inside the container
COPY ./rkllm_server /rkllm_server

# Declare the model directory as a volume
VOLUME /rkllm_server/model

# Set the working directory
WORKDIR /rkllm_server

# Expose the service port
EXPOSE 8080

# Clean up caches and temporary files left over from installation
RUN apt-get purge -y --auto-remove \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Run large language models on the RK3588 NPU: server side for rknn-llm
https://github.com/airockchip/rknn-llm


# 1. RKLLM overview
RKLLM helps users deploy LLMs to Rockchip SoCs quickly. The chip currently covered by this repository is the rk3588; the overall framework looks like this:

![Framework](https://github.com/airockchip/rknn-llm/raw/main/res/framework.jpg)

To use the RKNPU, you first run the RKLLM-Toolkit on a PC to convert a trained model into RKLLM format, and then run inference on the board through the RKLLM C API.

- RKLLM-Toolkit is an SDK for model conversion and quantization on a PC.
- RKLLM Runtime provides C/C++ APIs for the Rockchip NPU platform, helping users deploy RKLLM models and speed up LLM applications.
- The RKNPU kernel driver talks to the NPU hardware; it is open source and can be found in the Rockchip kernel tree.

## Supported platforms
- RK3588 series

## Currently supported models
- [X] [TinyLLAMA 1.1B](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0/tree/fe8a4ea1ffedaf415f4da2f062534de366a451e6)
- [X] [Qwen 1.8B](https://huggingface.co/Qwen/Qwen-1_8B-Chat/tree/1d0f68de57b88cfde81f3c3e537f24464d889081)
- [X] [Qwen2 0.5B](https://huggingface.co/Qwen/Qwen1.5-0.5B/tree/8f445e3628f3500ee69f24e1303c9f10f5342a39)
- [X] [Phi-2 2.7B](https://hf-mirror.com/microsoft/phi-2/tree/834565c23f9b28b96ccbeabe614dd906b6db551a)
- [X] [Phi-3 3.8B](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/tree/291e9e30e38030c23497afa30f3af1f104837aa6)
- [X] [ChatGLM3 6B](https://huggingface.co/THUDM/chatglm3-6b/tree/103caa40027ebfd8450289ca2f278eac4ff26405)
- [X] [Gemma 2B](https://huggingface.co/google/gemma-2b-it/tree/de144fb2268dee1066f515465df532c05e699d48)
- [X] [InternLM2 1.8B](https://huggingface.co/internlm/internlm2-chat-1_8b/tree/ecccbb5c87079ad84e5788baa55dd6e21a9c614d)
- [X] [MiniCPM 2B](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16/tree/79fbb1db171e6d8bf77cdb0a94076a43003abd9e)

# 2. Model conversion (RKLLM-Toolkit conversion container)
To use the RKNPU, first run the RKLLM-Toolkit conversion container on an x86 workstation to convert the trained model into RKLLM format, then run inference on the board through the RKLLM C API.

`★ The workstation needs more than 32 GB of RAM, otherwise conversion will fail. Close other applications before converting so the process does not run out of resources.`

`★ If you do not want to convert a model yourself, pre-converted models are available for download in Section 4, "Download a pre-converted .rkllm model".`

## 1. docker-compose.yml
~~~ docker
version: '3.8'

services:
  rk3588_llm:
    image: kaylor/rk3588_llm
    platform: linux/amd64
    container_name: rk3588_llm
    restart: unless-stopped
    privileged: true
    volumes:
      - ./model:/root/ws
    stdin_open: true # -i
    tty: true # -t
    command: /bin/bash
~~~
## 2. Start the container
~~~ linux
docker-compose up -d
~~~
## 3. Download a model from [ModelScope](https://www.modelscope.cn/) or [Hugging Face](https://huggingface.co)
Put the downloaded model in the ./model directory.

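For reference, one way to fetch a model is with plain git. The sketch below is illustrative only and assumes git-lfs is installed; it uses the Qwen2.5-3B-Instruct repository referenced later in this README, and the exact repository URLs are assumptions that depend on the model you pick:
~~~ linux
# Illustrative only: repository URLs are assumptions, adjust to the model you actually want
git lfs install
git clone https://huggingface.co/Qwen/Qwen2.5-3B-Instruct ./model/Qwen2.5-3B-Instruct
# or, from ModelScope:
# git clone https://www.modelscope.cn/Qwen/Qwen2.5-3B-Instruct.git ./model/Qwen2.5-3B-Instruct
~~~
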
## 4. Download the conversion Python script into ./model
~~~ linux
wget https://raw.githubusercontent.com/airockchip/rknn-llm/main/rkllm-toolkit/examples/huggingface/test.py
~~~

## 5. Edit the model path in test.py

`modelpath = '/root/ws/Qwen2.5-3B-Instruct'`

Here "/root/ws/" is the path inside the container and "Qwen2.5-3B-Instruct" is the folder of the model you downloaded.

## 6. Edit the name and path of the converted model in test.py

`ret = llm.export_rkllm("./Qwen2.5-3B.rkllm")`

Qwen2.5-3B.rkllm will be generated in the current directory (./model).

## 7. Convert the model
### Enter the container:
~~~ linux
docker exec -it rk3588_llm /bin/bash
~~~
### Go to the model folder
~~~ linux
cd /root/ws
~~~
### Run the conversion
~~~ linux
python3 test.py
~~~


# 3. RKNPU driver for the RK3588

The provided RKLLM runtime requires a recent NPU kernel driver. Before running model inference with the RKLLM Runtime on the board, first confirm that the board's NPU kernel driver is v0.9.6 (https://github.com/airockchip/rknn-llm/tree/main/rknpu-driver).
The query command is:
~~~ linux
# Run on the board to query the NPU kernel driver version
cat /sys/kernel/debug/rknpu/version
# Confirm that the output is:
# RKNPU driver: v0.9.6
~~~
If the reported NPU driver version is lower than v0.9.6, download the latest firmware from the official firmware site and update the board; see the technical manual: https://github.com/airockchip/rknn-llm/blob/main/doc/Rockchip_RKLLM_SDK_CN.pdf

For the H88K_V1, you can flash this firmware directly (it ships with RKNPU driver v0.9.7): https://github.com/wudingjian/armbian-h88k-images/releases/tag/20240917-2001


# 4. Download a pre-converted .rkllm model

If you skip Section 2, "Model conversion", you can download a ready-made rkllm model directly.

For example: `Qwen2.5-3B.rkllm`

`rkllm model download link: https://pan.baidu.com/s/1kIxv488-0IiQdZgDpKO-cw?pwd=up1b
extraction code: up1b`

https://huggingface.co/jsntwdj/rkllm

Put the downloaded model file in the ./model directory.

# 5. Deploy the LLM server on the rk3588 board (docker-compose.yml)

~~~ docker
version: '3.8'

services:
  rkllm_server:
    image: jsntwdj/rkllm_chat:1.0.1
    container_name: rkllm_chat
    restart: unless-stopped
    privileged: true
    devices:
      - /dev:/dev
    volumes:
      - ./model:/rkllm_server/model # directory holding the rkllm model file
    ports:
      - "8080:8080" # change the port as needed
    command: >
      sh -c "python3 gradio_server.py --target_platform rk3588 --rkllm_model_path /rkllm_server/model/Qwen2.5-3B.rkllm" # change the rkllm model file name as needed

~~~

Start it:
~~~ linux
docker-compose up -d
~~~
# 6. Chat web UI

Open `http://ip:8080` in a browser.

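The HTTP service can also be called from a script rather than the browser. The snippet below is a minimal sketch that is not part of the original repo: it assumes the container was started in Flask mode, i.e. running flask_server.py instead of gradio_server.py (see rkllm_server/README.md further down), it follows the request/response format of the /rkllm_chat route in flask_server.py, and the address is a placeholder.
~~~ python
import requests

# Placeholder address: replace with the IP of your rk3588 board
server_url = "http://192.168.1.100:8080/rkllm_chat"

payload = {
    "messages": [{"role": "user", "content": "Hello, please introduce the RK3588."}],
    "stream": False,
}

resp = requests.post(server_url, json=payload, timeout=300)
resp.raise_for_status()
# The server answers with an OpenAI-style structure; the reply text is in choices[0].message.content
print(resp.json()["choices"][0]["message"]["content"])
~~~
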
# 7. rkllm_api_demo (an optional alternative way to interact; the demo runs in an SSH session)
## 1. Build rkllm_api_demo (compile on an x86 PC)
### Fetch rkllm_api_demo
~~~ linux
git clone https://github.com/airockchip/rknn-llm.git
cd rknn-llm/rkllm-runtime/examples/rkllm_api_demo
~~~

### Download the gcc cross-toolchain
When using the RKLLM Runtime, pay attention to the gcc toolchain version. The recommended cross-compiler is
gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu; download it here:
https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu-a/10.2-2020.11/binrel/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu.tar.xz

Unpack it in the root home directory (the build script expects it under ~/).

### Edit the configuration
Change two parameters in ./rkllm_api_demo/src/main.cpp:

`param.num_npu_core = 3; // the rk3588 has 3 NPU cores, so change 1 or 2 to 3`

`param.use_gpu = false; // disable GPU acceleration`

### Build
Make sure the `GCC_COMPILER_PATH` option in the `build-linux.sh` script is configured correctly:
```sh
GCC_COMPILER_PATH=~/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu
```
Then run:
```bash
bash build-linux.sh
```
The compiled rk3588 executable llm_demo ends up in ./rkllm_api_demo/build/build_linux_aarch64_Release/


## 2. Install rkllm_api_demo on the RK3588 board
Push the compiled `llm_demo` binary and the `librkllmrt.so` library to the RK3588 device.

The compiled binary is at ./rkllm_api_demo/build/build_linux_aarch64_Release/llm_demo

★ A prebuilt /rkllm_api_demo/llm_demo is already included in this repository.

Copy the file to ~/llm (pick whatever directory you like), then fetch librkllmrt.so:
~~~ linux
# Download librkllmrt.so with wget
wget https://raw.githubusercontent.com/airockchip/rknn-llm/main/rkllm-runtime/runtime/Linux/librkllm_api/aarch64/librkllmrt.so
# Copy it into /usr/lib/
cp librkllmrt.so /usr/lib/librkllmrt.so
~~~

## 3. Run the demo
~~~ ssh
# Program directory
cd ~/llm
# Raise the file-descriptor limit of the current shell and its children to 102400
ulimit -n 102400
# "./model/Qwen2.5-3B.rkllm" is the path of the converted model
taskset f0 ./llm_demo ./model/Qwen2.5-3B.rkllm
~~~
The chat dialogue interface is displayed.

# 8. FAQ

FAQ: https://github.com/wudingjian/rkllm_chat/issues
hub.docker: https://hub.docker.com/r/jsntwdj/rkllm_chat

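A final note on the HTTP API: flask_server.py (reproduced further down in this listing) also supports streaming. When a request sets `"stream": true`, the /rkllm_chat route returns newline-delimited JSON chunks whose latest `delta.content` field carries the incremental output. The sketch below is not part of the original repo; it shows one way to consume that stream, assuming Flask mode and a placeholder address.
~~~ python
import json
import requests

# Placeholder address: replace with the IP of your rk3588 board
server_url = "http://192.168.1.100:8080/rkllm_chat"
payload = {"messages": [{"role": "user", "content": "Hello"}], "stream": True}

with requests.post(server_url, json=payload, stream=True, timeout=300) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        if not line:
            continue  # skip the blank separator lines between chunks
        chunk = json.loads(line)
        # Each chunk repeats the whole choices list; the newest delta is the last entry
        print(chunk["choices"][-1]["delta"]["content"], end="", flush=True)
print()
~~~
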
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
version: '3.8'

services:
  rkllm_server:
    image: jsntwdj/rkllm_chat:1.0.1
    container_name: rkllm_chat
    restart: unless-stopped
    privileged: true
    devices:
      - /dev:/dev
    volumes:
      - ./model:/rkllm_server/model # directory holding the rkllm model file
    ports:
      - "8080:8080" # change the port as needed
    command: >
      sh -c "python3 gradio_server.py --target_platform rk3588 --rkllm_model_path /rkllm_server/model/Qwen2.5-3B.rkllm" # change the rkllm model file name as needed
--------------------------------------------------------------------------------
/rkllm-toolkit/docker-compose.yml:
--------------------------------------------------------------------------------
version: '3.8'

services:
  rk3588_llm:
    image: kaylor/rk3588_llm
    platform: linux/amd64
    container_name: rk3588_llm
    restart: unless-stopped
    privileged: true
    volumes:
      - ./model:/root/ws
    stdin_open: true # -i
    tty: true # -t
    command: /bin/bash
--------------------------------------------------------------------------------
/rkllm-toolkit/model/test.py:
--------------------------------------------------------------------------------
from rkllm.api import RKLLM

'''
https://huggingface.co/Qwen/Qwen-1_8B-Chat
Download the Qwen model from the above website.
'''

modelpath = '/root/ws/Qwen2.5-3B-Instruct'
llm = RKLLM()

# Load model
ret = llm.load_huggingface(model = modelpath)
if ret != 0:
    print('Load model failed!')
    exit(ret)

# Build model
ret = llm.build(do_quantization=True, optimization_level=1, quantized_dtype='w8a8', target_platform='rk3588')
if ret != 0:
    print('Build model failed!')
    exit(ret)

# Export rkllm model
ret = llm.export_rkllm("./Qwen2.5-3B.rkllm")
if ret != 0:
    print('Export model failed!')
    exit(ret)

--------------------------------------------------------------------------------
/rkllm_api_demo/llm_demo:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wudingjian/rkllm_chat/ac815ab4363bbdb48d9297c7e7ca2cf7055f8c46/rkllm_api_demo/llm_demo
--------------------------------------------------------------------------------
/rkllm_server/README.md:
--------------------------------------------------------------------------------
# 1. Gradio mode
## Server side
### 1. Edit gradio_server.py to disable GPU prefill acceleration
`rknnllm_param.use_gpu = False`

### 2. Startup command set in the Dockerfile
~~~docker
CMD ["sh", "-c", "python3 gradio_server.py --target_platform rk3588 --rkllm_model_path /rkllm_server/model/Qwen2.5-3B.rkllm"]
~~~
### 3. docker-compose.yml
~~~docker
command: >
  sh -c "python3 gradio_server.py --target_platform rk3588 --rkllm_model_path /rkllm_server/model/Qwen2.5-3B.rkllm" # change the rkllm model file name as needed
~~~
## Client side
### Download
~~~linux
wget https://github.com/airockchip/rknn-llm/raw/main/rkllm-runtime/examples/rkllm_server_demo/chat_api_gradio.py
~~~
### Change the server address
When instantiating the Gradio Client, change the address to wherever you deployed the server:
`client = Client("http://172.16.10.102:8080/")`
### Run chat_api_gradio.py
~~~linux
python3 chat_api_gradio.py
~~~

# 2. Flask mode
## Server side
### 1. Edit flask_server.py to disable GPU prefill acceleration
`rknnllm_param.use_gpu = False`
### 2. Startup command set in the Dockerfile
~~~docker
CMD ["sh", "-c", "python3 flask_server.py --target_platform rk3588 --rkllm_model_path /rkllm_server/model/Qwen2.5-3B.rkllm"]
~~~
### 3. docker-compose.yml
~~~docker
command: >
  sh -c "python3 flask_server.py --target_platform rk3588 --rkllm_model_path /rkllm_server/model/Qwen2.5-3B.rkllm" # change the rkllm model file name as needed
~~~
## Client side
~~~linux
wget https://github.com/airockchip/rknn-llm/raw/main/rkllm-runtime/examples/rkllm_server_demo/chat_api_flask.py
~~~
### Set the server address
Change the address:
`server_url = 'http://172.16.10.102:8080/rkllm_chat'`

### Run chat_api_flask.py
~~~linux
python3 chat_api_flask.py
~~~
--------------------------------------------------------------------------------
/rkllm_server/fix_freq_rk3588.sh:
--------------------------------------------------------------------------------
1 | echo userspace > /sys/class/devfreq/fdab0000.npu/governor
2 | echo 1000000000 > /sys/class/devfreq/fdab0000.npu/userspace/set_freq
3 | 
4 | echo userspace > /sys/devices/system/cpu/cpufreq/policy0/scaling_governor
5 | echo 1800000 > /sys/devices/system/cpu/cpufreq/policy0/scaling_setspeed
6 | echo userspace > /sys/devices/system/cpu/cpufreq/policy4/scaling_governor
7 | echo 2400000 > /sys/devices/system/cpu/cpufreq/policy4/scaling_setspeed
8 | echo userspace > /sys/devices/system/cpu/cpufreq/policy6/scaling_governor
9 | echo 2400000 > 
/sys/devices/system/cpu/cpufreq/policy6/scaling_setspeed 10 | 11 | echo userspace > /sys/class/devfreq/dmc/governor 12 | echo 2112000000 > /sys/class/devfreq/dmc/userspace/set_freq 13 | 14 | echo userspace > /sys/class/devfreq/fb000000.gpu/governor 15 | echo 1000000000 > /sys/class/devfreq/fb000000.gpu/userspace/set_freq 16 | -------------------------------------------------------------------------------- /rkllm_server/flask_server.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import sys 3 | import os 4 | import subprocess 5 | import resource 6 | import threading 7 | import time 8 | import argparse 9 | import json 10 | from flask import Flask, request, jsonify, Response 11 | 12 | app = Flask(__name__) 13 | 14 | # 创建一个锁,用于控制多人访问Server 15 | lock = threading.Lock() 16 | 17 | # 创建一个全局变量,用于标识服务器当前是否处于阻塞状态 18 | is_blocking = False 19 | 20 | # 设置动态库路径 21 | rkllm_lib = ctypes.CDLL('lib/librkllmrt.so') 22 | 23 | # 定义全局变量,用于保存回调函数的输出,便于在gradio界面中输出 24 | global_text = [] 25 | global_state = -1 26 | split_byte_data = bytes(b"") # 用于保存分割的字节数据 27 | 28 | # 定义动态库中的结构体 29 | class Token(ctypes.Structure): 30 | _fields_ = [ 31 | ("logprob", ctypes.c_float), 32 | ("id", ctypes.c_int32) 33 | ] 34 | 35 | class RKLLMResult(ctypes.Structure): 36 | _fields_ = [ 37 | ("text", ctypes.c_char_p), 38 | ("tokens", ctypes.POINTER(Token)), 39 | ("num", ctypes.c_int32) 40 | ] 41 | 42 | 43 | # 定义回调函数 44 | def callback(result, userdata, state): 45 | global global_text, global_state, split_byte_data 46 | if state == 0: 47 | # 保存输出的token文本及RKLLM运行状态 48 | global_state = state 49 | # 需要监控当前的字节数据是否完整,不完整则进行记录,后续进行解析 50 | try: 51 | global_text.append((split_byte_data + result.contents.text).decode('utf-8')) 52 | print((split_byte_data + result.contents.text).decode('utf-8'), end='') 53 | split_byte_data = bytes(b"") 54 | except: 55 | split_byte_data += result.contents.text 56 | sys.stdout.flush() 57 | elif state == 1: 58 | # 保存RKLLM运行状态 59 | global_state = state 60 | print("\n") 61 | sys.stdout.flush() 62 | else: 63 | print("run error") 64 | 65 | # Python端与C++端的回调函数连接 66 | callback_type = ctypes.CFUNCTYPE(None, ctypes.POINTER(RKLLMResult), ctypes.c_void_p, ctypes.c_int) 67 | c_callback = callback_type(callback) 68 | 69 | # 定义动态库中的结构体 70 | class RKNNllmParam(ctypes.Structure): 71 | _fields_ = [ 72 | ("model_path", ctypes.c_char_p), 73 | ("num_npu_core", ctypes.c_int32), 74 | ("max_context_len", ctypes.c_int32), 75 | ("max_new_tokens", ctypes.c_int32), 76 | ("top_k", ctypes.c_int32), 77 | ("top_p", ctypes.c_float), 78 | ("temperature", ctypes.c_float), 79 | ("repeat_penalty", ctypes.c_float), 80 | ("frequency_penalty", ctypes.c_float), 81 | ("presence_penalty", ctypes.c_float), 82 | ("mirostat", ctypes.c_int32), 83 | ("mirostat_tau", ctypes.c_float), 84 | ("mirostat_eta", ctypes.c_float), 85 | ("logprobs", ctypes.c_bool), 86 | ("top_logprobs", ctypes.c_int32), 87 | ("use_gpu", ctypes.c_bool) 88 | ] 89 | 90 | # 定义RKLLM_Handle_t和userdata 91 | RKLLM_Handle_t = ctypes.c_void_p 92 | userdata = ctypes.c_void_p(None) 93 | 94 | # 设置提示文本 95 | PROMPT_TEXT_PREFIX = "<|im_start|>system You are a helpful assistant. 
<|im_end|> <|im_start|>user" 96 | PROMPT_TEXT_POSTFIX = "<|im_end|><|im_start|>assistant" 97 | 98 | # 定义Python端的RKLLM类,其中包括了对动态库中RKLLM模型的初始化、推理及释放操作 99 | class RKLLM(object): 100 | def __init__(self, model_path, target_platform): 101 | rknnllm_param = RKNNllmParam() 102 | rknnllm_param.model_path = bytes(model_path, 'utf-8') 103 | if target_platform == "rk3588": 104 | rknnllm_param.num_npu_core = 3 105 | elif target_platform == "rk3576": 106 | rknnllm_param.num_npu_core = 1 107 | rknnllm_param.max_context_len = 320 108 | rknnllm_param.max_new_tokens = 512 109 | rknnllm_param.top_k = 1 110 | rknnllm_param.top_p = 0.9 111 | rknnllm_param.temperature = 0.8 112 | rknnllm_param.repeat_penalty = 1.1 113 | rknnllm_param.frequency_penalty = 0.0 114 | rknnllm_param.presence_penalty = 0.0 115 | rknnllm_param.mirostat = 0 116 | rknnllm_param.mirostat_tau = 5.0 117 | rknnllm_param.mirostat_eta = 0.1 118 | rknnllm_param.logprobs = False 119 | rknnllm_param.top_logprobs = 5 120 | rknnllm_param.use_gpu = False 121 | self.handle = RKLLM_Handle_t() 122 | 123 | self.rkllm_init = rkllm_lib.rkllm_init 124 | self.rkllm_init.argtypes = [ctypes.POINTER(RKLLM_Handle_t), ctypes.POINTER(RKNNllmParam), callback_type] 125 | self.rkllm_init.restype = ctypes.c_int 126 | self.rkllm_init(ctypes.byref(self.handle), rknnllm_param, c_callback) 127 | 128 | self.rkllm_run = rkllm_lib.rkllm_run 129 | self.rkllm_run.argtypes = [RKLLM_Handle_t, ctypes.POINTER(ctypes.c_char), ctypes.c_void_p] 130 | self.rkllm_run.restype = ctypes.c_int 131 | 132 | self.rkllm_destroy = rkllm_lib.rkllm_destroy 133 | self.rkllm_destroy.argtypes = [RKLLM_Handle_t] 134 | self.rkllm_destroy.restype = ctypes.c_int 135 | 136 | def run(self, prompt): 137 | prompt = bytes(PROMPT_TEXT_PREFIX + prompt + PROMPT_TEXT_POSTFIX, 'utf-8') 138 | self.rkllm_run(self.handle, prompt, ctypes.byref(userdata)) 139 | return 140 | 141 | def release(self): 142 | self.rkllm_destroy(self.handle) 143 | 144 | if __name__ == "__main__": 145 | parser = argparse.ArgumentParser() 146 | parser.add_argument('--target_platform', help='目标平台: 如rk3588/rk3576;') 147 | parser.add_argument('--rkllm_model_path', help='Linux板端上已转换好的rkllm模型的绝对路径') 148 | args = parser.parse_args() 149 | 150 | if not (args.target_platform in ["rk3588", "rk3576"]): 151 | print("====== Error: 请指定正确的目标平台: rk3588/rk3576 ======") 152 | sys.stdout.flush() 153 | exit() 154 | 155 | if not os.path.exists(args.rkllm_model_path): 156 | print("====== Error: 请给出准确的rkllm模型路径,需注意是板端的绝对路径 ======") 157 | sys.stdout.flush() 158 | exit() 159 | 160 | # 定频设置 161 | command = "sudo bash fix_freq_{}.sh".format(args.target_platform) 162 | subprocess.run(command, shell=True) 163 | 164 | # 设置文件描述符限制 165 | resource.setrlimit(resource.RLIMIT_NOFILE, (102400, 102400)) 166 | 167 | # 初始化RKLLM模型 168 | print("=========init....===========") 169 | sys.stdout.flush() 170 | target_platform = args.target_platform 171 | model_path = args.rkllm_model_path 172 | rkllm_model = RKLLM(model_path, target_platform) 173 | print("RKLLM初始化成功!") 174 | print("==============================") 175 | sys.stdout.flush() 176 | 177 | # 创建一个函数用于接受用户使用 request 发送的数据 178 | @app.route('/rkllm_chat', methods=['POST']) 179 | def receive_message(): 180 | # 链接全局变量,获取回调函数的输出信息 181 | global global_text, global_state 182 | global is_blocking 183 | 184 | # 如果服务器正在阻塞状态,则返回特定响应 185 | if is_blocking or global_state==0: 186 | return jsonify({'status': 'error', 'message': 'RKLLM_Server is busy! 
Maybe you can try again later.'}), 503 187 | 188 | # 加锁 189 | lock.acquire() 190 | try: 191 | # 设置服务器为阻塞状态 192 | is_blocking = True 193 | 194 | # 获取 POST 请求中的 JSON 数据 195 | data = request.json 196 | if data and 'messages' in data: 197 | # 重置全局变量 198 | global_text = [] 199 | global_state = -1 200 | 201 | # 定义返回的结构体 202 | rkllm_responses = { 203 | "id": "rkllm_chat", 204 | "object": "rkllm_chat", 205 | "created": None, 206 | "choices": [], 207 | "usage": { 208 | "prompt_tokens": None, 209 | "completion_tokens": None, 210 | "total_tokens": None 211 | } 212 | } 213 | 214 | if not "stream" in data.keys() or data["stream"] == False: 215 | # 在这里处理收到的数据 216 | messages = data['messages'] 217 | print("Received messages:", messages) 218 | for index, message in enumerate(messages): 219 | input_prompt = message['content'] 220 | rkllm_output = "" 221 | 222 | # 创建模型推理的线程 223 | model_thread = threading.Thread(target=rkllm_model.run, args=(input_prompt,)) 224 | model_thread.start() 225 | 226 | # 等待模型运行完成,定时检查模型的推理线程 227 | model_thread_finished = False 228 | while not model_thread_finished: 229 | while len(global_text) > 0: 230 | rkllm_output += global_text.pop(0) 231 | time.sleep(0.005) 232 | 233 | model_thread.join(timeout=0.005) 234 | model_thread_finished = not model_thread.is_alive() 235 | 236 | rkllm_responses["choices"].append( 237 | {"index": index, 238 | "message": { 239 | "role": "assistant", 240 | "content": rkllm_output, 241 | }, 242 | "logprobs": None, 243 | "finish_reason": "stop" 244 | } 245 | ) 246 | return jsonify(rkllm_responses), 200 247 | else: 248 | # 在这里处理收到的数据 249 | messages = data['messages'] 250 | print("Received messages:", messages) 251 | for index, message in enumerate(messages): 252 | input_prompt = message['content'] 253 | rkllm_output = "" 254 | 255 | def generate(): 256 | # 创建模型推理的线程 257 | model_thread = threading.Thread(target=rkllm_model.run, args=(input_prompt,)) 258 | model_thread.start() 259 | 260 | # 等待模型运行完成,定时检查模型的推理线程 261 | model_thread_finished = False 262 | while not model_thread_finished: 263 | while len(global_text) > 0: 264 | rkllm_output = global_text.pop(0) 265 | 266 | rkllm_responses["choices"].append( 267 | {"index": index, 268 | "delta": { 269 | "role": "assistant", 270 | "content": rkllm_output, 271 | }, 272 | "logprobs": None, 273 | "finish_reason": "stop" if global_state == 1 else None, 274 | } 275 | ) 276 | yield f"{json.dumps(rkllm_responses)}\n\n" 277 | 278 | model_thread.join(timeout=0.005) 279 | model_thread_finished = not model_thread.is_alive() 280 | 281 | return Response(generate(), content_type='text/plain') 282 | else: 283 | return jsonify({'status': 'error', 'message': 'Invalid JSON data!'}), 400 284 | finally: 285 | # 释放锁 286 | lock.release() 287 | # 将服务器状态设置为非阻塞 288 | is_blocking = False 289 | 290 | # 启动 Flask 应用程序 291 | # app.run(host='0.0.0.0', port=8080) 292 | app.run(host='0.0.0.0', port=8080, threaded=True, debug=False) 293 | 294 | print("====================") 295 | print("RKLLM模型推理结束, 释放RKLLM模型资源...") 296 | rkllm_model.release() 297 | print("====================") 298 | -------------------------------------------------------------------------------- /rkllm_server/gradio_server.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import sys 3 | import os 4 | import subprocess 5 | import resource 6 | import threading 7 | import time 8 | import gradio as gr 9 | import argparse 10 | 11 | # 设定环境变量 12 | os.environ["GRADIO_SERVER_NAME"] = "0.0.0.0" 13 | os.environ["GRADIO_SERVER_PORT"] = 
"8080" 14 | 15 | # 设置动态库路径 16 | rkllm_lib = ctypes.CDLL('lib/librkllmrt.so') 17 | 18 | # 定义全局变量,用于保存回调函数的输出,便于在gradio界面中输出 19 | global_text = [] 20 | global_state = -1 21 | split_byte_data = bytes(b"") # 用于保存分割的字节数据 22 | 23 | # 定义动态库中的结构体 24 | class Token(ctypes.Structure): 25 | _fields_ = [ 26 | ("logprob", ctypes.c_float), 27 | ("id", ctypes.c_int32) 28 | ] 29 | 30 | class RKLLMResult(ctypes.Structure): 31 | _fields_ = [ 32 | ("text", ctypes.c_char_p), 33 | ("tokens", ctypes.POINTER(Token)), 34 | ("num", ctypes.c_int32) 35 | ] 36 | 37 | # 定义回调函数 38 | def callback(result, userdata, state): 39 | global global_text, global_state, split_byte_data 40 | if state == 0: 41 | # 保存输出的token文本及RKLLM运行状态 42 | global_state = state 43 | # 需要监控当前的字节数据是否完整,不完整则进行记录,后续进行解析 44 | try: 45 | global_text.append((split_byte_data + result.contents.text).decode('utf-8')) 46 | print((split_byte_data + result.contents.text).decode('utf-8'), end='') 47 | split_byte_data = bytes(b"") 48 | except: 49 | split_byte_data += result.contents.text 50 | sys.stdout.flush() 51 | elif state == 1: 52 | # 保存RKLLM运行状态 53 | global_state = state 54 | print("\n") 55 | sys.stdout.flush() 56 | else: 57 | print("run error") 58 | 59 | # Python端与C++端的回调函数连接 60 | callback_type = ctypes.CFUNCTYPE(None, ctypes.POINTER(RKLLMResult), ctypes.c_void_p, ctypes.c_int) 61 | c_callback = callback_type(callback) 62 | 63 | # 定义动态库中的结构体 64 | class RKNNllmParam(ctypes.Structure): 65 | _fields_ = [ 66 | ("model_path", ctypes.c_char_p), 67 | ("num_npu_core", ctypes.c_int32), 68 | ("max_context_len", ctypes.c_int32), 69 | ("max_new_tokens", ctypes.c_int32), 70 | ("top_k", ctypes.c_int32), 71 | ("top_p", ctypes.c_float), 72 | ("temperature", ctypes.c_float), 73 | ("repeat_penalty", ctypes.c_float), 74 | ("frequency_penalty", ctypes.c_float), 75 | ("presence_penalty", ctypes.c_float), 76 | ("mirostat", ctypes.c_int32), 77 | ("mirostat_tau", ctypes.c_float), 78 | ("mirostat_eta", ctypes.c_float), 79 | ("logprobs", ctypes.c_bool), 80 | ("top_logprobs", ctypes.c_int32), 81 | ("use_gpu", ctypes.c_bool) 82 | ] 83 | 84 | # 定义RKLLM_Handle_t和userdata 85 | RKLLM_Handle_t = ctypes.c_void_p 86 | userdata = ctypes.c_void_p(None) 87 | 88 | # 设置提示文本 89 | PROMPT_TEXT_PREFIX = "<|im_start|>system You are a helpful assistant. 
<|im_end|> <|im_start|>user" 90 | PROMPT_TEXT_POSTFIX = "<|im_end|><|im_start|>assistant" 91 | 92 | # 定义Python端的RKLLM类,其中包括了对动态库中RKLLM模型的初始化、推理及释放操作 93 | class RKLLM(object): 94 | def __init__(self, model_path, target_platform): 95 | rknnllm_param = RKNNllmParam() 96 | rknnllm_param.model_path = bytes(model_path, 'utf-8') 97 | if target_platform == "rk3588": 98 | rknnllm_param.num_npu_core = 3 99 | elif target_platform == "rk3576": 100 | rknnllm_param.num_npu_core = 2 101 | rknnllm_param.max_context_len = 320 102 | rknnllm_param.max_new_tokens = 512 103 | rknnllm_param.top_k = 1 104 | rknnllm_param.top_p = 0.9 105 | rknnllm_param.temperature = 0.8 106 | rknnllm_param.repeat_penalty = 1.1 107 | rknnllm_param.frequency_penalty = 0.0 108 | rknnllm_param.presence_penalty = 0.0 109 | rknnllm_param.mirostat = 0 110 | rknnllm_param.mirostat_tau = 5.0 111 | rknnllm_param.mirostat_eta = 0.1 112 | rknnllm_param.logprobs = False 113 | rknnllm_param.top_logprobs = 5 114 | rknnllm_param.use_gpu = False 115 | self.handle = RKLLM_Handle_t() 116 | 117 | self.rkllm_init = rkllm_lib.rkllm_init 118 | self.rkllm_init.argtypes = [ctypes.POINTER(RKLLM_Handle_t), ctypes.POINTER(RKNNllmParam), callback_type] 119 | self.rkllm_init.restype = ctypes.c_int 120 | self.rkllm_init(ctypes.byref(self.handle), rknnllm_param, c_callback) 121 | 122 | self.rkllm_run = rkllm_lib.rkllm_run 123 | self.rkllm_run.argtypes = [RKLLM_Handle_t, ctypes.POINTER(ctypes.c_char), ctypes.c_void_p] 124 | self.rkllm_run.restype = ctypes.c_int 125 | 126 | self.rkllm_destroy = rkllm_lib.rkllm_destroy 127 | self.rkllm_destroy.argtypes = [RKLLM_Handle_t] 128 | self.rkllm_destroy.restype = ctypes.c_int 129 | 130 | def run(self, prompt): 131 | prompt = bytes(PROMPT_TEXT_PREFIX + prompt + PROMPT_TEXT_POSTFIX, 'utf-8') 132 | self.rkllm_run(self.handle, prompt, ctypes.byref(userdata)) 133 | return 134 | 135 | def release(self): 136 | self.rkllm_destroy(self.handle) 137 | 138 | if __name__ == "__main__": 139 | parser = argparse.ArgumentParser() 140 | parser.add_argument('--target_platform', help='目标平台: 如rk3588/rk3576;') 141 | parser.add_argument('--rkllm_model_path', help='Linux板端上已转换好的rkllm模型的绝对路径') 142 | args = parser.parse_args() 143 | 144 | if not (args.target_platform in ["rk3588", "rk3576"]): 145 | print("====== Error: 请指定正确的目标平台: rk3588/rk3576 ======") 146 | sys.stdout.flush() 147 | exit() 148 | 149 | if not os.path.exists(args.rkllm_model_path): 150 | print("====== Error: 请给出准确的rkllm模型路径,需注意是板端的绝对路径 ======") 151 | sys.stdout.flush() 152 | exit() 153 | 154 | # 定频设置 155 | command = "sudo bash fix_freq_{}.sh".format(args.target_platform) 156 | subprocess.run(command, shell=True) 157 | 158 | # 设置文件描述符限制 159 | resource.setrlimit(resource.RLIMIT_NOFILE, (102400, 102400)) 160 | 161 | # 初始化RKLLM模型 162 | print("=========init....===========") 163 | sys.stdout.flush() 164 | target_platform = args.target_platform 165 | model_path = args.rkllm_model_path 166 | rkllm_model = RKLLM(model_path, target_platform) 167 | print("RKLLM初始化成功!") 168 | print("==============================") 169 | sys.stdout.flush() 170 | 171 | # 记录用户输入的prompt 172 | def get_user_input(user_message, history): 173 | history = history + [[user_message, None]] 174 | return "", history 175 | 176 | # 获取RKLLM模型的输出并进行流式打印 177 | def get_RKLLM_output(history): 178 | # 链接全局变量,获取回调函数的输出信息 179 | global global_text, global_state 180 | global_text = [] 181 | global_state = -1 182 | 183 | # 创建模型推理的线程 184 | model_thread = threading.Thread(target=rkllm_model.run, args=(history[-1][0],)) 185 | 
        model_thread.start()
186 | 
187 |         # history[-1][1]表示当前的输出对话
188 |         history[-1][1] = ""
189 | 
190 |         # 等待模型运行完成,定时检查模型的推理线程
191 |         model_thread_finished = False
192 |         while not model_thread_finished:
193 |             while len(global_text) > 0:
194 |                 history[-1][1] += global_text.pop(0)
195 |                 time.sleep(0.005)
196 |                 # gradio在调用then方法时自动将yield返回的结果推送输出
197 |                 yield history
198 | 
199 |             model_thread.join(timeout=0.005)
200 |             model_thread_finished = not model_thread.is_alive()
201 | 
202 |     # 创建gradio界面
203 |     with gr.Blocks(title="Chat with RKLLM") as chatRKLLM:
204 |         gr.Markdown("Chat with RKLLM")
205 |         gr.Markdown("### 在 inputTextBox 输入您的问题,按下 Enter 键,即可与 RKLLM 模型进行对话。")
206 |         # 创建一个Chatbot组件,用于显示对话历史
207 |         rkllmServer = gr.Chatbot(height=600)
208 |         # 创建一个Textbox组件,让用户输入消息
209 |         msg = gr.Textbox(placeholder="Please input your question here...", label="inputTextBox")
210 |         # 创建一个Button组件,用于清除聊天历史
211 |         clear = gr.Button("清除")
212 | 
213 |         # 将用户输入的消息提交给get_user_input函数,并且立即更新聊天历史
214 |         # 然后调用get_RKLLM_output函数,进一步更新聊天历史
215 |         # queue=False参数确保这些更新不会被放入队列,而是立即执行
216 |         msg.submit(get_user_input, [msg, rkllmServer], [msg, rkllmServer], queue=False).then(get_RKLLM_output, rkllmServer, rkllmServer)
217 |         # 当点击清除按钮时,执行一个空操作(lambda: None),并且立即清除聊天历史
218 |         clear.click(lambda: None, None, rkllmServer, queue=False)
219 | 
220 |     # 启用事件队列系统
221 |     chatRKLLM.queue()
222 |     # 启动Gradio应用程序
223 |     chatRKLLM.launch()
224 | 
225 |     print("====================")
226 |     print("RKLLM模型推理结束, 释放RKLLM模型资源...")
227 |     rkllm_model.release()
228 |     print("====================")
--------------------------------------------------------------------------------
/rkllm_server/lib/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wudingjian/rkllm_chat/ac815ab4363bbdb48d9297c7e7ca2cf7055f8c46/rkllm_server/lib/.gitkeep
--------------------------------------------------------------------------------
/rkllm_server/lib/librkllmrt.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wudingjian/rkllm_chat/ac815ab4363bbdb48d9297c7e7ca2cf7055f8c46/rkllm_server/lib/librkllmrt.so
--------------------------------------------------------------------------------