├── .clang-format ├── .github └── workflows │ └── check_format.yml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── CMakeLists.txt ├── CONTRIBUTING.md ├── CONTRIBUTING_zh.md ├── LICENSE ├── NOTICE_Third_Party.md ├── README.md ├── README_zh.md ├── RELEASE.md ├── cmake ├── CMakeDetermineRustCompiler.cmake ├── CMakeRustCompiler.cmake.in ├── CMakeRustInformation.cmake ├── CMakeTestRustCompiler.cmake ├── FindRust.cmake ├── FindSentencePiece.cmake ├── cargo_library.cmake ├── cargo_shared_library.cmake ├── cc_binary.cmake ├── cc_library.cmake ├── cc_test.cmake ├── grpc_proto_library.cmake ├── proto_library.cmake └── static_analyzers.cmake ├── docs ├── assets │ ├── service_arch.png │ ├── wechat_qrcode1.png │ ├── wechat_qrcode2.png │ └── xllm_service_title.png ├── en │ ├── getting_started.md │ └── overview.md └── zh │ ├── getting_started.md │ └── overview.md ├── prepare.sh ├── third_party ├── CMakeLists.txt └── custom_cache │ └── cpprestsdk.patch ├── vcpkg.json └── xllm_service ├── CMakeLists.txt ├── chat_template ├── CMakeLists.txt ├── jinja_chat_template.cpp ├── jinja_chat_template.h └── jinja_chat_template_test.cpp ├── common ├── CMakeLists.txt ├── call_data.h ├── closure_guard.h ├── concurrent_queue.h ├── global_gflags.cpp ├── global_gflags.h ├── hash_util.cpp ├── hash_util.h ├── json_reader.cpp ├── json_reader.h ├── macros.h ├── options.h ├── slice.h ├── threadpool.cpp ├── threadpool.h ├── ttft_predictor.cpp ├── ttft_predictor.h ├── types.h ├── utils.cpp ├── utils.h └── xllm │ ├── output.h │ ├── status.h │ ├── uuid.cpp │ └── uuid.h ├── examples ├── CMakeLists.txt ├── curl_http_client.sh ├── http_client_test.cpp ├── rpc_client_test.cpp └── rpc_hello_client.cpp ├── http_service ├── CMakeLists.txt ├── main.cpp ├── request_tracer.cpp ├── request_tracer.h ├── service.cpp └── service.h ├── master.cpp ├── master.h ├── proto ├── CMakeLists.txt ├── xllm │ ├── chat.proto │ ├── common.proto │ └── completion.proto ├── xllm_http_service.proto └── 
xllm_rpc_service.proto ├── request ├── CMakeLists.txt └── request.h ├── rpc_service ├── CMakeLists.txt ├── client.cpp ├── client.h ├── main.cpp ├── rpc_service_test.cpp ├── service.cpp └── service.h ├── scheduler ├── CMakeLists.txt ├── etcd_client │ ├── CMakeLists.txt │ ├── etcd_client.cpp │ └── etcd_client.h ├── loadbalance_policy │ ├── CMakeLists.txt │ ├── cache_aware_routing.cpp │ ├── cache_aware_routing.h │ ├── loadbalance_policy.h │ ├── round_robin.cpp │ └── round_robin.h ├── managers │ ├── CMakeLists.txt │ ├── global_kvcache_mgr.cpp │ ├── global_kvcache_mgr.h │ ├── instance_mgr.cpp │ └── instance_mgr.h ├── response_handler.cpp ├── response_handler.h ├── scheduler.cpp └── scheduler.h └── tokenizer ├── CMakeLists.txt ├── fast_tokenizer.cpp ├── fast_tokenizer.h ├── sentencepiece_tokenizer.cpp ├── sentencepiece_tokenizer.h ├── tiktoken_tokenizer.cpp ├── tiktoken_tokenizer.h ├── tokenizer.h ├── tokenizer_args.cpp ├── tokenizer_args.h ├── tokenizer_factory.cpp ├── tokenizer_factory.h └── tokenizers ├── CMakeLists.txt ├── Cargo.toml ├── src └── lib.rs └── tokenizers.h /.clang-format: -------------------------------------------------------------------------------- 1 | Language: Cpp 2 | BasedOnStyle: Google 3 | UseTab: Never 4 | IndentWidth: 2 5 | ColumnLimit: 80 6 | 7 | BinPackParameters: false 8 | BinPackArguments: false 9 | ExperimentalAutoDetectBinPacking: false 10 | AllowAllParametersOfDeclarationOnNextLine: false 11 | DerivePointerAlignment: false 12 | PointerAlignment: Left 13 | ... 
14 | -------------------------------------------------------------------------------- /.github/workflows/check_format.yml: -------------------------------------------------------------------------------- 1 | name: CheckFormat 2 | on: 3 | workflow_dispatch: 4 | push: 5 | branches: [main] 6 | paths: ['xllm_service/**'] 7 | pull_request: 8 | types: [opened, synchronize, reopened] 9 | branches: [main] 10 | paths: ['xllm_service/**'] 11 | 12 | jobs: 13 | format-check: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Install clang-format 17 | run: | 18 | pip install clang-format==20.1.6 19 | clang-format --version 20 | 21 | - name: Checkout code 22 | uses: actions/checkout@v4 23 | with: 24 | fetch-depth: 0 25 | 26 | - name: Determine base commit for comparison 27 | id: get_base_commit 28 | run: | 29 | # pull_request action 30 | if [ "${{ github.event_name }}" = "pull_request" ]; then 31 | echo "base_commit=${{ github.event.pull_request.base.sha }}" >> $GITHUB_OUTPUT 32 | else 33 | # push action 34 | echo "base_commit=${{ github.sha }}~1" >> $GITHUB_OUTPUT 35 | fi 36 | 37 | - name: Verify clang-format configuration 38 | run: | 39 | if [ ! 
-f ".clang-format" ]; then 40 | echo "❌ .clang-format file not found in repository root" 41 | exit 1 42 | fi 43 | clang-format --style=file --dump-config > /dev/null || { 44 | echo "❌ .clang-format file has invalid format" 45 | exit 1 46 | } 47 | 48 | - name: Check code format 49 | shell: /usr/bin/bash {0} 50 | run: | 51 | BASE_COMMIT="${{ steps.get_base_commit.outputs.base_commit }}" 52 | CLANG_FORMAT_FILE="$(pwd)/.clang-format" 53 | 54 | # do clang-format 55 | diff=$(git-clang-format \ 56 | --style=file:"$CLANG_FORMAT_FILE" \ 57 | --extensions="c,h,cc,cp,cpp,c++,cxx,hh,hpp,hxx,inc,cu,cuh" \ 58 | --commit "$BASE_COMMIT" \ 59 | --diff) 60 | 61 | # check diff 62 | if [ "$diff" = "no modified files to format" ] || [ "$diff" = "clang-format did not modify any files" ]; then 63 | echo "✅ Code format is correct" 64 | exit 0 65 | fi 66 | 67 | printf "\n❌ You have introduced coding style breakages.\n" 68 | 69 | printf "\n\033[1mSuggested changes:\n\n" 70 | echo "$diff" 71 | exit 1 72 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Visual Studio Code 2 | /.vscode* 3 | 4 | # Idea 5 | /.idea 6 | /cmake-build-debug/ 7 | /cmake-build-release/ 8 | 9 | # CMake 10 | /build* 11 | 12 | # cache 13 | /.*cache 14 | 15 | # deps 16 | /.deps 17 | 18 | # gtest 19 | /Testing 20 | 21 | # rust 22 | Cargo.lock 23 | 24 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/brpc"] 2 | path = third_party/brpc 3 | url = https://gitcode.com/xLLM-AI/brpc.git 4 | branch = 1.12.1_cmake 5 | [submodule "third_party/etcd_cpp_apiv3"] 6 | path = third_party/etcd_cpp_apiv3 7 | url = https://gitcode.com/xLLM-AI/etcd-cpp-apiv3.git 8 | branch = v0.15.4 9 | [submodule "third_party/cpprestsdk"] 10 | path = third_party/cpprestsdk 
11 | url = https://gitcode.com/xLLM-AI/cpprestsdk.git 12 | branch = v2.10.19 13 | [submodule "third_party/sentencepiece"] 14 | path = third_party/sentencepiece 15 | url = https://gitcode.com/xLLM-AI/sentencepiece.git 16 | [submodule "third_party/minja"] 17 | path = third_party/minja 18 | url = https://gitcode.com/xLLM-AI/minja.git 19 | [submodule "third_party/smhasher"] 20 | path = third_party/smhasher 21 | url = https://gitcode.com/xLLM-AI/smhasher.git 22 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # pre-commit install 2 | # pre-commit run --all-files 3 | 4 | repos: 5 | - repo: https://github.com/pre-commit/mirrors-clang-format 6 | rev: v20.1.6 7 | hooks: 8 | - id: clang-format 9 | types_or: [c++, c, cuda] 10 | exclude: ^(cibuild/|tools/|third_party/|cmake/|build) 11 | 12 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | [English](./CONTRIBUTING.md) | [中文](./CONTRIBUTING_zh.md) 16 | 17 | # Contribute to xLLM-Service 18 | 19 | + Write / translate / fix our documentation 20 | + Raise questions / Answer questions 21 | + Provide demos, examples or test cases 22 | + Give suggestions or other comments 23 | + Paticipate in [issues](https://github.com/xxx/xLLM/issues) or [discussions](https://github.com/xxx/xLLM/discussions) 24 | + Pull requests 25 | + Sharing related research / application 26 | + Any other ways to improve xLLM 27 | 28 | For developers who want to contribute to our code, here is the guidance: 29 | 30 | ## 1. Choose an issue to contribute 31 | + Issues with label `PR welcome`, which means: 32 | + A reproducible bug 33 | + A function in plan 34 | 35 | ## 2. 
Install environment for development 36 | + We strongly suggest you to read our **[Document](http://xxx/docs/)** before developing 37 | + For setting environment, please check our **[Readme file](/README.md)** 38 | 39 | ## 3. Build our project 40 | + You could run our demo to check whether the requirements are successfully installed: 41 | 42 | ## 4. Test 43 | 44 | After the PR is submitted, we will format and test the code. 45 | Our tests are still far from perfect, so you are welcomed to add tests to our project! -------------------------------------------------------------------------------- /CONTRIBUTING_zh.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | [English](./CONTRIBUTING.md) | [中文](./CONTRIBUTING_zh.md) 16 | 17 | # xLLM-Service 贡献指南 18 | 19 | xLLM-Service致力于为每一位用户和开发者提供开放的XX,因此无论您是XX开发者还是专注于XX用户,我们都欢迎您参与我们的项目。 20 | 您可以通过以下方法为项目作出贡献: 21 | 22 | + 撰写/翻译/修改文档 23 | + 提出或回答问题 24 | + 提供使用或测试样例 25 | + 提供建议或其他评论 26 | + 参与[issues](https://github.com/xxx/xLLM/issues) 或[discussions](https://github.com/xxx/xLLM/discussions) 27 | + 提交Pull request 28 | + 分享相关研究或应用场景 29 | + 其他任何对xLLM-Service的帮助 30 | 31 | 如果您希望参与xLLM的开发,请参考以下提示: 32 | 33 | ## 1. 选择参与贡献的issue 34 | + 您可以选择带有`PR welcome`标签的issue,包括: 35 | + 可复现的bug 36 | + 计划实现的功能 37 | 38 | ## 2. 配置开发环境 39 | + 在开发之前,可以参考我们的 **[文档](http://xxx/docs/)** 40 | + 关于环境配置,参见 **[Readme file](/README.md)** 41 | 42 | ## 3. 项目构建和运行 43 | + 您可以运行如下样例: 44 | 45 | ## 4. 测试 46 | 47 | 在pr提交之后,我们会对代码进行格式化及进一步测试。 48 | 我们的测试目前还很不完善,因此欢迎开发者为测试作出贡献! -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | [English](./README.md) | [中文](./README_zh.md) 16 | 17 |

18 | xLLM 19 |

20 | 21 | 22 | ## 1. Project Overview 23 | **xLLM-service** is a service-layer framework developed based on the **xLLM** inference engine, providing efficient, fault-tolerant, and flexible LLM inference services for clustered deployment. 24 | 25 | xLLM-service targets to address key challenges in enterprise-level service scenarios: 26 | 27 | - How to ensure the SLA of online services and improve resource utilization of offline tasks in a hybrid online-offline deployment environment. 28 | 29 | - How to react to changing request loads in actual businesses, such as fluctuations in input/output lengths. 30 | 31 | - Resolving performance bottlenecks of multimodal model requests. 32 | 33 | - Ensuring high reliability of computing instances. 34 | 35 | --- 36 | 37 | ## 2. Key Features 38 | With management of computing resource pools, intelligent scheduling and preemption of hybrid requests, and real-time monitoring of computing instances, xLLM-service achieves the following key features: 39 | 40 | - Unified scheduling of online and offline requests, with preemptive execution for online requests and best-effort execution for offline requests. 41 | 42 | - Adaptive dynamic allocation of PD ratios, supporting efficient switching of instance PD roles. 43 | 44 | - EPD three-stage disaggregation for multimodal requests, with intelligent resource allocation for different stages. 45 | 46 | - Fault-tolerant architecture, fast detection of instance error and automatic rescheduling for interrupted requests. 47 | 48 | --- 49 | 50 | ## 3. Core Architecture 51 | 52 | ``` 53 | ├── xllm-service/ 54 | | : main source folder 55 | │ ├── chat_template/ # 56 | │ ├── common/ # 57 | │ ├── examples/ # 58 | │ ├── http_service/ # 59 | │ ├── rpc_service/ # 60 | | ├── tokenizers/ # 61 | | └── master.cpp # 62 | ``` 63 | 64 | --- 65 | 66 | 67 | ## 4. 
Quick Start 68 | #### Installation 69 | ```bash 70 | git clone git@coding.jd.com:xllm-ai/xllm_service.git 71 | cd xllm_service 72 | git submodule init 73 | git submodule update 74 | ``` 75 | #### Compilation 76 | compile xllm-service: 77 | ```bash 78 | sh prepare.sh # apply patch 79 | mkdir -p build && cd build 80 | cmake .. && make -j 8 81 | ``` 82 | 83 | --- 84 | 85 | ## 5. Contributing 86 | 87 | There are several ways you can contribute to xLLM: 88 | 89 | 1. Reporting Issues (Bugs & Errors) 90 | 2. Suggesting Enhancements 91 | 3. Improving Documentation 92 | + Fork the repository 93 | + Add your view in document 94 | + Send your pull request 95 | 4. Writing Code 96 | + Fork the repository 97 | + Create a new branch 98 | + Add your feature or improvement 99 | + Send your pull request 100 | 101 | We appreciate all kinds of contributions! 🎉🎉🎉 102 | If you have problems about development, please check our document: * **[Document](./docs/docs/readme.md)** 103 | 104 | --- 105 | 106 | ## 6. Community & Support 107 | 108 | If you encounter any issues along the way, you are welcomed to submit reproducible steps and log snippets in the project's Issues area, or contact the xLLM Core team directly via your internal Slack. 109 | 110 | Welcome to contact us: 111 | 112 |
113 | qrcode1 114 | qrcode2 115 |
116 | 117 | --- 118 | ## 7. About the Contributors 119 | 120 | Thanks to all the following [developers](https://github.com/jd-opensource/xllm-service/graphs/contributors) who have contributed to xLLM. 121 | 122 | 123 | 124 | 125 | --- 126 | 127 | ## 8. License 128 | 129 | [Apache License](LICENSE) 130 | 131 | #### xLLM is provided by JD.com 132 | #### Thanks for your Contributions! 133 | -------------------------------------------------------------------------------- /README_zh.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | [English](./README.md) | [中文](./README_zh.md) 16 | 17 | 18 |

19 | xLLM 20 |

21 | 22 | ## 1. 简介 23 | **xLLM-service** 是一个基于 xLLM 推理引擎开发的服务层框架,为集群化部署提供高效率、高容错、高灵活性的大模型推理服务。 24 | 25 | xLLM-service 旨在解决企业级服务场景中的关键挑战: 26 | - 如何于在离线混合部署环境中,保障在线服务的SLA,提升离线任务的资源利用率。 27 | - 如何适应实际业务中动态变化的请求负载,如输入/输出长度出现剧烈波动。 28 | - 解决多模态模型请求的性能瓶颈。 29 | - 保障集群计算实例的高可靠性。 30 | 31 | --- 32 | 33 | ## 2. 核心特性 34 | 35 | xLLM-service 通过对计算资源池的动态管理、请求的智能调度与抢占,以及计算实例的实时监控,实现了以下核心能力: 36 | - 在线与离线任务的统一调度,在线请求的抢占式执行,离线请求best-effort执行; 37 | - PD比例的自适应动态调配,支持实例PD角色的高效切换; 38 | - 多模态请求的EPD三阶段分离,不同阶段的资源智能分配; 39 | - 多节点容错架构,快速感知实例错误信息,自动决策最优的被中断请求再调度方案。 40 | 41 | --- 42 | 43 | ## 3. 代码结构 44 | 45 | ``` 46 | ├── xllm-service/ 47 | | : 主代码目录 48 | │ ├── chat_template/ # 49 | │ ├── common/ # 50 | │ ├── examples/ # 51 | │ ├── http_service/ # 52 | │ ├── rpc_service/ # 53 | | ├── tokenizers/ # 54 | | └── master.cpp # 55 | ``` 56 | --- 57 | 58 | 59 | ## 4. 快速开始 60 | #### 安装 61 | ```bash 62 | git clone git@coding.jd.com:xllm-ai/xllm_service.git 63 | cd xllm_service 64 | git submodule init 65 | git submodule update 66 | ``` 67 | #### 编译 68 | 编译执行 69 | ```bash 70 | sh prepare.sh # 应用patch 71 | mkdir -p build && cd build 72 | cmake .. && make -j 8 73 | ``` 74 | 75 | --- 76 | ## 5. 成为贡献者 77 | 您可以通过以下方法为 xLLM-Service 作出贡献: 78 | 79 | 1. 在Issue中报告问题 80 | 2. 提供改进建议 81 | 3. 补充文档 82 | + Fork仓库 83 | + 修改文档 84 | + 提出pull request 85 | 4. 修改代码 86 | + Fork仓库 87 | + 创建新分支 88 | + 加入您的修改 89 | + 提出pull request 90 | 91 | 感谢您的贡献! 🎉🎉🎉 92 | 如果您在开发中遇到问题,请参阅**[xLLM-Service中文指南](./docs/docs_zh/readme.md)** 93 | 94 | --- 95 | 96 | ## 6. 社区支持 97 | 98 | 如果你在xLLM的开发或使用过程中遇到任何问题,欢迎在项目的Issue区域提交可复现的步骤或日志片段。 99 | 如果您有企业内部Slack,请直接联系xLLM Core团队。 100 | 101 | 欢迎沟通和联系我们: 102 | 103 |
104 | qrcode1 105 | qrcode2 106 |
107 | 108 | ## 7. 致谢 109 | 110 | 感谢以下为xLLM-Servic作出贡献的[开发者](https://github.com/jd-opensource/xllm-service/graphs/contributors) 111 | 112 | 113 | 114 | 115 | --- 116 | 117 | ## 8. 许可证 118 | [Apache License](LICENSE) 119 | 120 | #### xLLM-Service 由 JD.com 提供 121 | #### 感谢您对xLLM的关心与贡献! 122 | -------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- 1 | # Release xllm-service 0.1.0 2 | 3 | ## **Major Features and Improvements** 4 | 5 | - Support disaggregated prefill and decoding. 6 | - Support KV Cache aware routing. 7 | - Support KV Cache Pool. 8 | -------------------------------------------------------------------------------- /cmake/CMakeDetermineRustCompiler.cmake: -------------------------------------------------------------------------------- 1 | # ported from https://github.com/Devolutions/CMakeRust 2 | if(NOT CMAKE_Rust_COMPILER) 3 | find_package(Rust) 4 | if(RUST_FOUND) 5 | set(CMAKE_Rust_COMPILER "${RUSTC_EXECUTABLE}") 6 | set(CMAKE_Rust_COMPILER_ID "Rust") 7 | set(CMAKE_Rust_COMPILER_VERSION "${RUST_VERSION}") 8 | set(CMAKE_Rust_PLATFORM_ID "Rust") 9 | endif() 10 | endif() 11 | 12 | message(STATUS "Cargo Home: ${CARGO_HOME}") 13 | message(STATUS "Rust Compiler Version: ${RUSTC_VERSION}") 14 | 15 | mark_as_advanced(CMAKE_Rust_COMPILER) 16 | 17 | if(CMAKE_Rust_COMPILER) 18 | set(CMAKE_Rust_COMPILER_LOADED 1) 19 | endif(CMAKE_Rust_COMPILER) 20 | 21 | configure_file(${CMAKE_CURRENT_LIST_DIR}/CMakeRustCompiler.cmake.in 22 | ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/${CMAKE_VERSION}/CMakeRustCompiler.cmake IMMEDIATE @ONLY) 23 | 24 | set(CMAKE_Rust_COMPILER_ENV_VAR "RUSTC") 25 | 26 | -------------------------------------------------------------------------------- /cmake/CMakeRustCompiler.cmake.in: -------------------------------------------------------------------------------- 1 | 2 | # ported from https://github.com/Devolutions/CMakeRust 3 | 
set(CMAKE_Rust_COMPILER "@CMAKE_Rust_COMPILER@") 4 | set(CMAKE_Rust_COMPILER_ID "@CMAKE_Rust_COMPILER_ID@") 5 | set(CMAKE_Rust_COMPILER_VERSION "@CMAKE_Rust_COMPILER_VERSION@") 6 | set(CMAKE_Rust_COMPILER_LOADED @CMAKE_Rust_COMPILER_LOADED@) 7 | set(CMAKE_Rust_PLATFORM_ID "@CMAKE_Rust_PLATFORM_ID@") 8 | 9 | SET(CMAKE_Rust_SOURCE_FILE_EXTENSIONS rs) 10 | SET(CMAKE_Rust_LINKER_PREFERENCE 40) 11 | set(CMAKE_Rust_COMPILER_ENV_VAR "RUSTC") 12 | 13 | -------------------------------------------------------------------------------- /cmake/CMakeRustInformation.cmake: -------------------------------------------------------------------------------- 1 | # ported from https://github.com/Devolutions/CMakeRust 2 | # 3 | # Usage: rustc [OPTIONS] INPUT 4 | # 5 | # Options: 6 | # -h --help Display this message 7 | # --cfg SPEC Configure the compilation environment 8 | # -L [KIND=]PATH Add a directory to the library search path. The 9 | # optional KIND can be one of dependency, crate, native, 10 | # framework or all (the default). 11 | # -l [KIND=]NAME Link the generated crate(s) to the specified native 12 | # library NAME. The optional KIND can be one of static, 13 | # dylib, or framework. If omitted, dylib is assumed. 
14 | # --crate-type [bin|lib|rlib|dylib|cdylib|staticlib|metadata] 15 | # Comma separated list of types of crates for the 16 | # compiler to emit 17 | # --crate-name NAME Specify the name of the crate being built 18 | # --emit [asm|llvm-bc|llvm-ir|obj|link|dep-info] 19 | # Comma separated list of types of output for the 20 | # compiler to emit 21 | # --print [crate-name|file-names|sysroot|cfg|target-list|target-cpus|target-features|relocation-models|code-models] 22 | # Comma separated list of compiler information to print 23 | # on stdout 24 | # -g Equivalent to -C debuginfo=2 25 | # -O Equivalent to -C opt-level=2 26 | # -o FILENAME Write output to 27 | # --out-dir DIR Write output to compiler-chosen filename in 28 | # --explain OPT Provide a detailed explanation of an error message 29 | # --test Build a test harness 30 | # --target TARGET Target triple for which the code is compiled 31 | # -W --warn OPT Set lint warnings 32 | # -A --allow OPT Set lint allowed 33 | # -D --deny OPT Set lint denied 34 | # -F --forbid OPT Set lint forbidden 35 | # --cap-lints LEVEL Set the most restrictive lint level. 
More restrictive 36 | # lints are capped at this level 37 | # -C --codegen OPT[=VALUE] 38 | # Set a codegen option 39 | # -V --version Print version info and exit 40 | # -v --verbose Use verbose output 41 | # 42 | # Additional help: 43 | # -C help Print codegen options 44 | # -W help Print 'lint' options and default settings 45 | # -Z help Print internal options for debugging rustc 46 | # --help -v Print the full set of options rustc accepts 47 | # 48 | 49 | # 50 | 51 | include(CMakeLanguageInformation) 52 | 53 | if(UNIX) 54 | set(CMAKE_Rust_OUTPUT_EXTENSION .o) 55 | else() 56 | set(CMAKE_Rust_OUTPUT_EXTENSION .obj) 57 | endif() 58 | 59 | set(CMAKE_Rust_ECHO_ALL "echo \"TARGET: TARGET_BASE: ") 60 | set(CMAKE_Rust_ECHO_ALL "${CMAKE_Rust_ECHO_ALL} OBJECT: OBJECTS: OBJECT_DIR: SOURCE: SOURCES: ") 61 | set(CMAKE_Rust_ECHO_ALL "${CMAKE_Rust_ECHO_ALL} LINK_LIBRARIES: FLAGS: LINK_FLAGS: \"") 62 | 63 | if(NOT CMAKE_Rust_CREATE_SHARED_LIBRARY) 64 | set(CMAKE_Rust_CREATE_SHARED_LIBRARY 65 | "echo \"CMAKE_Rust_CREATE_SHARED_LIBRARY\"" 66 | "${CMAKE_Rust_ECHO_ALL}" 67 | ) 68 | endif() 69 | 70 | if(NOT CMAKE_Rust_CREATE_SHARED_MODULE) 71 | set(CMAKE_Rust_CREATE_SHARED_MODULE 72 | "echo \"CMAKE_Rust_CREATE_SHARED_MODULE\"" 73 | "${CMAKE_Rust_ECHO_ALL}" 74 | ) 75 | endif() 76 | 77 | if(NOT CMAKE_Rust_CREATE_STATIC_LIBRARY) 78 | set(CMAKE_Rust_CREATE_STATIC_LIBRARY 79 | "echo \"CMAKE_Rust_CREATE_STATIC_LIBRARY\"" 80 | "${CMAKE_Rust_ECHO_ALL}" 81 | ) 82 | endif() 83 | 84 | if(NOT CMAKE_Rust_COMPILE_OBJECT) 85 | set(CMAKE_Rust_COMPILE_OBJECT 86 | "echo \"CMAKE_Rust_COMPILE_OBJECT\"" 87 | "${CMAKE_Rust_ECHO_ALL}" 88 | "${CMAKE_Rust_COMPILER} --emit obj -o ") 89 | endif() 90 | 91 | if(NOT CMAKE_Rust_LINK_EXECUTABLE) 92 | set(CMAKE_Rust_LINK_EXECUTABLE 93 | "echo \"CMAKE_Rust_LINK_EXECUTABLE\"" 94 | "${CMAKE_Rust_ECHO_ALL}" 95 | ) 96 | endif() 97 | 98 | mark_as_advanced( 99 | CMAKE_Rust_FLAGS 100 | CMAKE_Rust_FLAGS_DEBUG 101 | CMAKE_Rust_FLAGS_MINSIZEREL 102 | CMAKE_Rust_FLAGS_RELEASE 
103 | CMAKE_Rust_FLAGS_RELWITHDEBINFO) 104 | 105 | set(CMAKE_Rust_INFORMATION_LOADED 1) 106 | 107 | -------------------------------------------------------------------------------- /cmake/CMakeTestRustCompiler.cmake: -------------------------------------------------------------------------------- 1 | set(CMAKE_Rust_COMPILER_WORKS 1 CACHE INTERNAL "") 2 | -------------------------------------------------------------------------------- /cmake/FindRust.cmake: -------------------------------------------------------------------------------- 1 | # ported from https://github.com/Devolutions/CMakeRust 2 | set(_CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ${CMAKE_FIND_ROOT_PATH_MODE_PROGRAM}) 3 | set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM BOTH) 4 | set(_CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ${CMAKE_FIND_ROOT_PATH_MODE_INCLUDE}) 5 | set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH) 6 | 7 | if(CMAKE_HOST_WIN32) 8 | set(USER_HOME "$ENV{USERPROFILE}") 9 | else() 10 | set(USER_HOME "$ENV{HOME}") 11 | endif() 12 | 13 | if(NOT DEFINED CARGO_HOME) 14 | if("$ENV{CARGO_HOME}" STREQUAL "") 15 | set(CARGO_HOME "${USER_HOME}/.cargo") 16 | else() 17 | set(CARGO_HOME "$ENV{CARGO_HOME}") 18 | endif() 19 | endif() 20 | 21 | # Find cargo executable 22 | find_program(CARGO_EXECUTABLE cargo 23 | HINTS "${CARGO_HOME}" 24 | PATH_SUFFIXES "bin") 25 | mark_as_advanced(CARGO_EXECUTABLE) 26 | 27 | # Find rustc executable 28 | find_program(RUSTC_EXECUTABLE rustc 29 | HINTS "${CARGO_HOME}" 30 | PATH_SUFFIXES "bin") 31 | mark_as_advanced(RUSTC_EXECUTABLE) 32 | 33 | # Find rustdoc executable 34 | find_program(RUSTDOC_EXECUTABLE rustdoc 35 | HINTS "${CARGO_HOME}" 36 | PATH_SUFFIXES "bin") 37 | mark_as_advanced(RUSTDOC_EXECUTABLE) 38 | 39 | # Find rust-gdb executable 40 | find_program(RUST_GDB_EXECUTABLE rust-gdb 41 | HINTS "${CARGO_HOME}" 42 | PATH_SUFFIXES "bin") 43 | mark_as_advanced(RUST_GDB_EXECUTABLE) 44 | 45 | # Find rust-lldb executable 46 | find_program(RUST_LLDB_EXECUTABLE rust-lldb 47 | HINTS "${CARGO_HOME}" 48 | 
PATH_SUFFIXES "bin") 49 | mark_as_advanced(RUST_LLDB_EXECUTABLE) 50 | 51 | # Find rustup executable 52 | find_program(RUSTUP_EXECUTABLE rustup 53 | HINTS "${CARGO_HOME}" 54 | PATH_SUFFIXES "bin") 55 | mark_as_advanced(RUSTUP_EXECUTABLE) 56 | 57 | set(RUST_FOUND FALSE CACHE INTERNAL "") 58 | 59 | if(CARGO_EXECUTABLE AND RUSTC_EXECUTABLE AND RUSTDOC_EXECUTABLE) 60 | set(RUST_FOUND TRUE CACHE INTERNAL "") 61 | 62 | set(CARGO_HOME "${CARGO_HOME}" CACHE PATH "Rust Cargo Home") 63 | 64 | execute_process(COMMAND ${RUSTC_EXECUTABLE} --version OUTPUT_VARIABLE RUSTC_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) 65 | string(REGEX REPLACE "rustc ([^ ]+) .*" "\\1" RUSTC_VERSION "${RUSTC_VERSION}") 66 | endif() 67 | 68 | if(NOT RUST_FOUND) 69 | message(FATAL_ERROR "Could not find Rust!") 70 | endif() 71 | 72 | set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ${_CMAKE_FIND_ROOT_PATH_MODE_PROGRAM}) 73 | set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ${_CMAKE_FIND_ROOT_PATH_MODE_INCLUDE}) 74 | -------------------------------------------------------------------------------- /cmake/FindSentencePiece.cmake: -------------------------------------------------------------------------------- 1 | # FindSentencePiece.cmake 2 | # 3 | # Use this module as: 4 | # 5 | # find_package(SentencePiece) 6 | # find_package(SentencePiece REQUIRED) 7 | # 8 | # This module provides the following imported targets, if found: 9 | # 10 | # SentencePiece::sentencepiece 11 | # The Google SentencePiece library 12 | # 13 | 14 | find_package(PkgConfig QUIET) 15 | if(PKG_CONFIG_FOUND) 16 | pkg_check_modules(SENTENCEPIECE QUIET sentencepiece) 17 | endif() 18 | 19 | find_path(SentencePiece_INCLUDE_DIR 20 | NAMES sentencepiece_processor.h 21 | PATH_SUFFIXES include 22 | HINTS ${SENTENCEPIECE_INCLUDE_DIRS} 23 | ) 24 | mark_as_advanced(SentencePiece_INCLUDE_DIR) 25 | 26 | find_library(SentencePiece_LIBRARY 27 | NAMES sentencepiece 28 | PATH_SUFFIXES lib 29 | HINTS ${SENTENCEPIECE_LIBRARY_DIRS} 30 | ) 31 | 
mark_as_advanced(SentencePiece_LIBRARY) 32 | 33 | 34 | include(FindPackageHandleStandardArgs) 35 | find_package_handle_standard_args( 36 | SentencePiece 37 | DEFAULT_MSG 38 | SentencePiece_LIBRARY 39 | SentencePiece_INCLUDE_DIR 40 | ) 41 | 42 | if(NOT SentencePiece_FOUND) 43 | if(SentencePiece_FIND_REQUIRED) 44 | message(FATAL_ERROR "Cannot find SentencePiece library") 45 | else() 46 | message(WARNING "SentencePiece library is not found!") 47 | endif() 48 | else() 49 | if(SentencePiece_FOUND AND NOT TARGET SentencePiece::sentencepiece) 50 | add_library(SentencePiece::sentencepiece UNKNOWN IMPORTED) 51 | set_target_properties(SentencePiece::sentencepiece PROPERTIES 52 | IMPORTED_LOCATION "${SentencePiece_LIBRARY}" 53 | INTERFACE_INCLUDE_DIRECTORIES "${SentencePiece_INCLUDE_DIR}" 54 | ) 55 | endif() 56 | endif() 57 | -------------------------------------------------------------------------------- /cmake/cargo_library.cmake: -------------------------------------------------------------------------------- 1 | include(CMakeParseArguments) 2 | 3 | # inspired by https://github.com/abseil/abseil-cpp 4 | # cc_library() 5 | # CMake function to imitate Bazel's cc_library rule. 
6 | function(cargo_library) 7 | cmake_parse_arguments( 8 | CARGO # prefix 9 | "" # options 10 | "NAME" # one value args 11 | "HDRS" # multi value args 12 | ${ARGN} 13 | ) 14 | 15 | string(REPLACE "-" "_" LIB_NAME ${CARGO_NAME}) 16 | # set(CARGO_TARGET_DIR ${CMAKE_CURRENT_BINARY_DIR}) 17 | 18 | # figure out the target triple 19 | if(WIN32) 20 | if(CMAKE_SIZEOF_VOID_P EQUAL 8) 21 | set(LIB_TARGET "x86_64-pc-windows-msvc") 22 | else() 23 | set(LIB_TARGET "i686-pc-windows-msvc") 24 | endif() 25 | elseif(ANDROID) 26 | if(ANDROID_SYSROOT_ABI STREQUAL "x86") 27 | set(LIB_TARGET "i686-linux-android") 28 | elseif(ANDROID_SYSROOT_ABI STREQUAL "x86_64") 29 | set(LIB_TARGET "x86_64-linux-android") 30 | elseif(ANDROID_SYSROOT_ABI STREQUAL "arm") 31 | set(LIB_TARGET "arm-linux-androideabi") 32 | elseif(ANDROID_SYSROOT_ABI STREQUAL "arm64") 33 | set(LIB_TARGET "aarch64-linux-android") 34 | endif() 35 | elseif(IOS) 36 | set(LIB_TARGET "universal") 37 | elseif(CMAKE_SYSTEM_NAME STREQUAL Darwin) 38 | set(LIB_TARGET "x86_64-apple-darwin") 39 | else() 40 | if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") 41 | set(LIB_TARGET "aarch64-unknown-linux-gnu") 42 | elseif(CMAKE_SIZEOF_VOID_P EQUAL 8) 43 | set(LIB_TARGET "x86_64-unknown-linux-gnu") 44 | else() 45 | set(LIB_TARGET "i686-unknown-linux-gnu") 46 | endif() 47 | endif() 48 | 49 | if(CMAKE_BUILD_TYPE STREQUAL "Debug") 50 | set(LIB_BUILD_TYPE "debug") 51 | else() 52 | set(LIB_BUILD_TYPE "release") 53 | endif() 54 | 55 | if(IOS) 56 | set(CARGO_ARGS "lipo") 57 | else() 58 | set(CARGO_ARGS "build") 59 | list(APPEND CARGO_ARGS "--target" ${LIB_TARGET}) 60 | endif() 61 | 62 | if(${LIB_BUILD_TYPE} STREQUAL "release") 63 | list(APPEND CARGO_ARGS "--release") 64 | endif() 65 | 66 | file(GLOB_RECURSE LIB_SOURCES "*.rs") 67 | 68 | set(CARGO_ENV_COMMAND ${CMAKE_COMMAND} -E env "CARGO_TARGET_DIR=${CMAKE_CURRENT_BINARY_DIR}") 69 | 70 | # build the library target with cargo 71 | set(STATIC_LIB_NAME 72 | 
"${CMAKE_STATIC_LIBRARY_PREFIX}${LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}") 73 | set(LIB_FILE 74 | "${CMAKE_CURRENT_BINARY_DIR}/${LIB_TARGET}/${LIB_BUILD_TYPE}/${STATIC_LIB_NAME}") 75 | 76 | message(STATUS "running: ${CARGO_ENV_COMMAND} ${CARGO_EXECUTABLE} ARGS ${CARGO_ARGS}") 77 | 78 | add_custom_command( 79 | OUTPUT ${LIB_FILE} 80 | COMMAND ${CARGO_ENV_COMMAND} ${CARGO_EXECUTABLE} ARGS ${CARGO_ARGS} 81 | WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} 82 | DEPENDS ${LIB_SOURCES} 83 | COMMENT "Building cargo library ${LIB_FILE}" 84 | ) 85 | add_custom_target(${CARGO_NAME}_target ALL DEPENDS ${LIB_FILE}) 86 | 87 | # add the library target 88 | add_library(${CARGO_NAME} STATIC IMPORTED GLOBAL) 89 | add_dependencies(${CARGO_NAME} ${CARGO_NAME}_target) 90 | set_target_properties(${CARGO_NAME} PROPERTIES 91 | IMPORTED_LOCATION ${LIB_FILE} 92 | ) 93 | target_sources(${CARGO_NAME} INTERFACE ${CARGO_HDRS}) 94 | endfunction() 95 | -------------------------------------------------------------------------------- /cmake/cargo_shared_library.cmake: -------------------------------------------------------------------------------- 1 | include(CMakeParseArguments) 2 | 3 | # inspired by https://github.com/abseil/abseil-cpp 4 | # cc_library() 5 | # CMake function to imitate Bazel's cc_library rule. 
function(cargo_shared_library)
  # cargo_shared_library()
  # Builds a Rust cargo crate as a shared library and exposes it to CMake as
  # an IMPORTED target named ${NAME}.
  #
  # Parameters:
  #   NAME: name of the target (also the cargo package name; '-' maps to '_')
  #   HDRS: list of public header files describing the crate's C interface
  cmake_parse_arguments(
    CARGO  # prefix
    ""     # options
    "NAME" # one value args
    "HDRS" # multi value args
    ${ARGN}
  )

  # cargo emits lib<name> with '-' replaced by '_' in the crate name.
  string(REPLACE "-" "_" LIB_NAME ${CARGO_NAME})

  # Figure out the Rust target triple for the current platform.
  if(WIN32)
    if(CMAKE_SIZEOF_VOID_P EQUAL 8)
      set(LIB_TARGET "x86_64-pc-windows-msvc")
    else()
      set(LIB_TARGET "i686-pc-windows-msvc")
    endif()
  elseif(ANDROID)
    if(ANDROID_SYSROOT_ABI STREQUAL "x86")
      set(LIB_TARGET "i686-linux-android")
    elseif(ANDROID_SYSROOT_ABI STREQUAL "x86_64")
      set(LIB_TARGET "x86_64-linux-android")
    elseif(ANDROID_SYSROOT_ABI STREQUAL "arm")
      set(LIB_TARGET "arm-linux-androideabi")
    elseif(ANDROID_SYSROOT_ABI STREQUAL "arm64")
      set(LIB_TARGET "aarch64-linux-android")
    endif()
  elseif(IOS)
    set(LIB_TARGET "universal")
  elseif(CMAKE_SYSTEM_NAME STREQUAL Darwin)
    # Fix: this used to be hard-coded to x86_64-apple-darwin, which selects
    # the wrong cargo target (and output path) on Apple Silicon hosts.
    # Mirror the CMAKE_SYSTEM_PROCESSOR check done for Linux below.
    if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm64|aarch64)$")
      set(LIB_TARGET "aarch64-apple-darwin")
    else()
      set(LIB_TARGET "x86_64-apple-darwin")
    endif()
  else()
    if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
      set(LIB_TARGET "aarch64-unknown-linux-gnu")
    elseif(CMAKE_SIZEOF_VOID_P EQUAL 8)
      set(LIB_TARGET "x86_64-unknown-linux-gnu")
    else()
      set(LIB_TARGET "i686-unknown-linux-gnu")
    endif()
  endif()

  # Map the CMake build type onto cargo's debug/release profiles.
  if(CMAKE_BUILD_TYPE STREQUAL "Debug")
    set(LIB_BUILD_TYPE "debug")
  else()
    set(LIB_BUILD_TYPE "release")
  endif()

  # 'cargo lipo' builds a universal library on iOS; plain 'cargo build'
  # everywhere else.
  if(IOS)
    set(CARGO_ARGS "lipo")
  else()
    set(CARGO_ARGS "build")
    list(APPEND CARGO_ARGS "--target" ${LIB_TARGET})
  endif()

  if(${LIB_BUILD_TYPE} STREQUAL "release")
    list(APPEND CARGO_ARGS "--release")
  endif()

  # Rebuild whenever any Rust source in the crate changes.
  file(GLOB_RECURSE LIB_SOURCES "*.rs")

  # Redirect cargo's output into the CMake binary dir so the location of the
  # produced library is predictable.
  set(CARGO_ENV_COMMAND ${CMAKE_COMMAND} -E env "CARGO_TARGET_DIR=${CMAKE_CURRENT_BINARY_DIR}")

  # Compute the path of the shared library cargo will produce.
  set(SHARED_LIB_NAME
      "${CMAKE_SHARED_LIBRARY_PREFIX}${LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}")
  set(LIB_FILE
      "${CMAKE_CURRENT_BINARY_DIR}/${LIB_TARGET}/${LIB_BUILD_TYPE}/${SHARED_LIB_NAME}")

  # Log the exact cargo invocation, consistent with cargo_library().
  message(STATUS "running: ${CARGO_ENV_COMMAND} ${CARGO_EXECUTABLE} ARGS ${CARGO_ARGS}")

  add_custom_command(
    OUTPUT ${LIB_FILE}
    COMMAND ${CARGO_ENV_COMMAND} ${CARGO_EXECUTABLE} ARGS ${CARGO_ARGS}
    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
    DEPENDS ${LIB_SOURCES}
    COMMENT "Building cargo library ${LIB_FILE}"
  )
  add_custom_target(${CARGO_NAME}_target ALL DEPENDS ${LIB_FILE})

  # Wrap the produced file in an IMPORTED shared library target.
  add_library(${CARGO_NAME} SHARED IMPORTED GLOBAL)
  add_dependencies(${CARGO_NAME} ${CARGO_NAME}_target)
  set_target_properties(${CARGO_NAME} PROPERTIES
    IMPORTED_LOCATION ${LIB_FILE}
    INTERFACE_INCLUDE_DIRECTORIES ${CMAKE_CURRENT_SOURCE_DIR}
    # The library is loaded by its full path rather than by soname.
    IMPORTED_NO_SONAME TRUE
  )
endfunction()
-------------------------------------------------------------------------------- /cmake/cc_binary.cmake: -------------------------------------------------------------------------------- 1 | include(CMakeParseArguments) 2 | 3 | # inspired by https://github.com/abseil/abseil-cpp 4 | # cc_binary() 5 | # CMake function to imitate Bazel's cc_binary rule.
#
# Parameters:
# NAME: name of target
# HDRS: List of public header files for the library
# SRCS: List of source files for the library
# COPTS: List of private compile options
# DEFINES: List of public defines
# LINKOPTS: List of link options
# DEPS: List of other libraries to be linked in to the binary targets
#
# cc_library(
#   NAME
#     awesome
#   HDRS
#     "a.h"
#   SRCS
#     "a.cc"
# )
# cc_binary(
#   NAME
#     fantastic
#   SRCS
#     "b.cc"
#   DEPS
#     :awesome
# )
#
function(cc_binary)
  # Imitates Bazel's cc_binary(): declare an executable from SRCS/HDRS,
  # link it against DEPS, and register a Bazel-style ":name" alias.
  cmake_parse_arguments(
    CC_BINARY                                # prefix
    ""                                       # options
    "NAME"                                   # one value args
    "HDRS;SRCS;COPTS;DEFINES;LINKOPTS;DEPS"  # multi value args
    ${ARGN}
  )

  add_executable(${CC_BINARY_NAME} "")
  target_sources(${CC_BINARY_NAME}
    PRIVATE ${CC_BINARY_SRCS} ${CC_BINARY_HDRS}
  )
  target_link_libraries(${CC_BINARY_NAME}
    PUBLIC
      ${CC_BINARY_DEPS}
    PRIVATE
      ${CC_BINARY_LINKOPTS}
  )
  # NOTE(review): the include-dir argument below reads "$" in this dump --
  # a generator expression (likely $<BUILD_INTERFACE:...>) appears to have
  # been truncated during extraction; confirm against the upstream file.
  target_include_directories(${CC_BINARY_NAME}
    PUBLIC
      "$"
  )
  target_compile_options(${CC_BINARY_NAME} PRIVATE ${CC_BINARY_COPTS})
  target_compile_definitions(${CC_BINARY_NAME} PUBLIC ${CC_BINARY_DEFINES})

  # Bazel-style alias so dependents can refer to :fantastic.
  add_executable(:${CC_BINARY_NAME} ALIAS ${CC_BINARY_NAME})
endfunction()
-------------------------------------------------------------------------------- /cmake/cc_library.cmake: -------------------------------------------------------------------------------- 1 | include(CMakeParseArguments) 2 | 3 | # inspired by https://github.com/abseil/abseil-cpp 4 | # cc_library() 5 | # CMake function to imitate Bazel's cc_library rule.
6 | # 7 | # Parameters: 8 | # NAME: name of target 9 | # HDRS: List of public header files for the library 10 | # SRCS: List of source files for the library 11 | # DEPS: List of other libraries to be linked in to the binary targets 12 | # COPTS: List of private compile options 13 | # DEFINES: List of public defines 14 | # LINKOPTS: List of link options 15 | # 16 | # cc_library( 17 | # NAME 18 | # awesome 19 | # HDRS 20 | # "a.h" 21 | # SRCS 22 | # "a.cc" 23 | # ) 24 | # cc_library( 25 | # NAME 26 | # fantastic_lib 27 | # SRCS 28 | # "b.cc" 29 | # DEPS 30 | # :awesome 31 | # ) 32 | # 33 | function(cc_library) 34 | cmake_parse_arguments( 35 | CC_LIB # prefix 36 | "TESTONLY" # options 37 | "NAME" # one value args 38 | "HDRS;SRCS;COPTS;DEFINES;LINKOPTS;DEPS;INCLUDES" # multi value args 39 | ${ARGN} 40 | ) 41 | 42 | if(CC_LIB_TESTONLY AND (NOT BUILD_TESTING)) 43 | return() 44 | endif() 45 | 46 | # Check if this is a header only library 47 | set(_CC_SRCS "${CC_LIB_SRCS}") 48 | foreach(src_file IN LISTS _CC_SRCS) 49 | if(${src_file} MATCHES ".*\\.(h|inc)") 50 | list(REMOVE_ITEM _CC_SRCS "${src_file}") 51 | endif() 52 | endforeach() 53 | 54 | if(_CC_SRCS STREQUAL "") 55 | set(CC_LIB_IS_INTERFACE 1) 56 | else() 57 | set(CC_LIB_IS_INTERFACE 0) 58 | endif() 59 | 60 | if(NOT CC_LIB_IS_INTERFACE) 61 | add_library(${CC_LIB_NAME} STATIC) 62 | target_sources(${CC_LIB_NAME} 63 | PRIVATE ${CC_LIB_SRCS} ${CC_LIB_HDRS}) 64 | target_link_libraries(${CC_LIB_NAME} 65 | PUBLIC ${CC_LIB_DEPS} 66 | PRIVATE ${CC_LIB_LINKOPTS} 67 | ) 68 | target_include_directories(${CC_LIB_NAME} 69 | PUBLIC 70 | "$" 71 | ${CC_LIB_INCLUDES} 72 | ) 73 | target_compile_options(${CC_LIB_NAME} PRIVATE ${CC_LIB_COPTS}) 74 | target_compile_definitions(${CC_LIB_NAME} PUBLIC ${CC_LIB_DEFINES}) 75 | else() 76 | # Generating header only library 77 | add_library(${CC_LIB_NAME} INTERFACE) 78 | target_include_directories(${CC_LIB_NAME} 79 | INTERFACE 80 | "$" 81 | ${CC_LIB_INCLUDES} 82 | ) 83 | 84 | 
target_link_libraries(${CC_LIB_NAME} 85 | INTERFACE ${CC_LIB_DEPS} ${CC_LIB_LINKOPTS} 86 | ) 87 | target_compile_definitions(${CC_LIB_NAME} INTERFACE ${CC_LIB_DEFINES}) 88 | endif() 89 | 90 | # add alias for the library target 91 | add_library(:${CC_LIB_NAME} ALIAS ${CC_LIB_NAME}) 92 | endfunction() 93 | -------------------------------------------------------------------------------- /cmake/cc_test.cmake: -------------------------------------------------------------------------------- 1 | include(CMakeParseArguments) 2 | 3 | # inspired by https://github.com/abseil/abseil-cpp 4 | # cc_test() 5 | # CMake function to imitate Bazel's cc_test rule. 6 | # 7 | # Parameters: 8 | # NAME: name of target (see Usage below) 9 | # SRCS: List of source files for the binary 10 | # DEPS: List of other libraries to be linked in to the binary targets 11 | # COPTS: List of private compile options 12 | # LINKOPTS: List of link options 13 | # ARGS: Command line arguments to test case 14 | # 15 | # Usage: 16 | # cc_library( 17 | # NAME 18 | # awesome 19 | # HDRS 20 | # "a.h" 21 | # SRCS 22 | # "a.cc" 23 | # ) 24 | # 25 | # cc_test( 26 | # NAME 27 | # awesome_test 28 | # SRCS 29 | # "awesome_test.cc" 30 | # DEPS 31 | # :awesome 32 | # GTest::gmock 33 | # ) 34 | # 35 | function(cc_test) 36 | if(NOT BUILD_TESTING) 37 | return() 38 | endif() 39 | 40 | cmake_parse_arguments( 41 | CC_TEST # prefix 42 | "" # options 43 | "NAME" # one value args 44 | "SRCS;COPTS;LINKOPTS;DEPS;INCLUDES;ARGS;DATA" # multi value args 45 | ${ARGN} 46 | ) 47 | 48 | # place test data in build directory 49 | if(CC_TEST_DATA) 50 | foreach(data ${CC_TEST_DATA}) 51 | configure_file(${data} ${CMAKE_CURRENT_BINARY_DIR}/${data} COPYONLY) 52 | endforeach() 53 | endif() 54 | 55 | add_executable(${CC_TEST_NAME}) 56 | target_sources(${CC_TEST_NAME} PRIVATE ${CC_TEST_SRCS}) 57 | target_include_directories(${CC_TEST_NAME} 58 | PUBLIC 59 | "$" 60 | ${CC_TEST_INCLUDES} 61 | ) 62 | 63 | target_compile_options(${CC_TEST_NAME} 64 | 
PRIVATE ${CC_TEST_COPTS} 65 | ) 66 | 67 | target_link_libraries(${CC_TEST_NAME} 68 | PUBLIC ${CC_TEST_DEPS} 69 | PRIVATE ${CC_TEST_LINKOPTS} 70 | ) 71 | 72 | gtest_add_tests( 73 | TARGET ${CC_TEST_NAME} 74 | EXTRA_ARGS ${CC_TEST_ARGS} 75 | WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} 76 | ) 77 | endfunction() 78 | -------------------------------------------------------------------------------- /cmake/grpc_proto_library.cmake: -------------------------------------------------------------------------------- 1 | include(CMakeParseArguments) 2 | include(CMakePrintHelpers) 3 | 4 | # inspired by https://github.com/abseil/abseil-cpp 5 | # grpc_proto_library() 6 | # CMake function to imitate Bazel's grpc_proto_library rule. 7 | # 8 | # Parameters: 9 | # NAME: name of target 10 | # SRCS: List of proto source files for the library 11 | # DEPS: List of other libraries to be linked in to the binary targets 12 | # COPTS: List of private compile options 13 | # DEFINES: List of public defines 14 | # LINKOPTS: List of link options 15 | # 16 | # grpc_proto_library( 17 | # NAME 18 | # proto_lib 19 | # SRCS 20 | # "b.proto" 21 | # ) 22 | # 23 | function(grpc_proto_library) 24 | cmake_parse_arguments( 25 | PROTO_LIB # prefix 26 | "" # options 27 | "NAME" # one value args 28 | "SRCS;COPTS;DEFINES;LINKOPTS;DEPS" # multi value args 29 | ${ARGN} 30 | ) 31 | 32 | # Add Library target with protobuf sources 33 | add_library(${PROTO_LIB_NAME} ${PROTO_LIB_SRCS}) 34 | 35 | # Link dependencies 36 | target_link_libraries(${PROTO_LIB_NAME} 37 | PUBLIC 38 | protobuf::libprotobuf 39 | gRPC::grpc 40 | gRPC::grpc++ 41 | gRPC::grpc++_reflection 42 | PRIVATE 43 | ${PROTO_LIB_DEPS} 44 | ) 45 | 46 | # Set include directories 47 | target_include_directories(${PROTO_LIB_NAME} 48 | PUBLIC 49 | ${Protobuf_INCLUDE_DIRS} 50 | ${CMAKE_CURRENT_BINARY_DIR} 51 | ) 52 | 53 | # Set compile options 54 | target_compile_options(${PROTO_LIB_NAME} 55 | PRIVATE 56 | ${PROTO_LIB_COPTS} 57 | -Wno-unused-parameter 58 | ) 59 
| 60 | # Set compile definitions 61 | target_compile_definitions(${PROTO_LIB_NAME} 62 | PUBLIC ${PROTO_LIB_DEFINES} 63 | ) 64 | 65 | # Compile protobuf and grpc files 66 | protobuf_generate( 67 | TARGET ${PROTO_LIB_NAME} 68 | IMPORT_DIRS . 69 | LANGUAGE cpp) 70 | 71 | # Get grpc_cpp_plugin location 72 | get_target_property(grpc_cpp_plugin_location gRPC::grpc_cpp_plugin LOCATION) 73 | 74 | # Generate grpc files from protobuf files using grpc_cpp_plugin 75 | protobuf_generate( 76 | TARGET ${PROTO_LIB_NAME} 77 | LANGUAGE grpc 78 | IMPORT_DIRS . 79 | GENERATE_EXTENSIONS .grpc.pb.h .grpc.pb.cc 80 | PLUGIN "protoc-gen-grpc=${grpc_cpp_plugin_location}" 81 | ) 82 | 83 | # Set alias for library 84 | add_library(grpc_proto::${PROTO_LIB_NAME} ALIAS ${PROTO_LIB_NAME}) 85 | endfunction() 86 | -------------------------------------------------------------------------------- /cmake/proto_library.cmake: -------------------------------------------------------------------------------- 1 | include(CMakeParseArguments) 2 | include(CMakePrintHelpers) 3 | 4 | # inspired by https://github.com/abseil/abseil-cpp 5 | # proto_library() 6 | # CMake function to imitate Bazel's proto_library rule. 
7 | # 8 | # Parameters: 9 | # NAME: name of target 10 | # SRCS: List of proto source files for the library 11 | # DEPS: List of other libraries to be linked in to the binary targets 12 | # COPTS: List of private compile options 13 | # DEFINES: List of public defines 14 | # LINKOPTS: List of link options 15 | # 16 | # cc_library( 17 | # NAME 18 | # awesome 19 | # HDRS 20 | # "a.h" 21 | # SRCS 22 | # "a.cc" 23 | # ) 24 | # proto_library( 25 | # NAME 26 | # proto_lib 27 | # SRCS 28 | # "b.proto" 29 | # DEPS 30 | # :awesome 31 | # ) 32 | # 33 | function(proto_library) 34 | # parse arguments and set variables 35 | cmake_parse_arguments( 36 | PROTO_LIB # prefix 37 | "" # options 38 | "NAME" # one value args 39 | "SRCS;COPTS;DEFINES;LINKOPTS;DEPS" # multi value args 40 | ${ARGN} 41 | ) 42 | # generate cpp and hpp files from proto files using protoc compiler 43 | protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS ${PROTO_LIB_SRCS}) 44 | 45 | add_library(${PROTO_LIB_NAME} STATIC) 46 | target_sources(${PROTO_LIB_NAME} 47 | PRIVATE ${PROTO_SRCS} ${PROTO_HDRS} 48 | ) 49 | 50 | target_link_libraries(${PROTO_LIB_NAME} 51 | PUBLIC protobuf::libprotobuf 52 | ) 53 | target_include_directories(${PROTO_LIB_NAME} 54 | PUBLIC 55 | ${Protobuf_INCLUDE_DIRS} 56 | ${CMAKE_CURRENT_BINARY_DIR} 57 | ) 58 | target_compile_options(${PROTO_LIB_NAME} 59 | PRIVATE 60 | ${PROTO_LIB_COPTS} 61 | -Wno-unused-parameter 62 | ) 63 | target_compile_definitions(${PROTO_LIB_NAME} 64 | PUBLIC 65 | ${PROTO_LIB_DEFINES} 66 | ) 67 | 68 | add_library(proto::${PROTO_LIB_NAME} ALIAS ${PROTO_LIB_NAME}) 69 | endfunction() 70 | 71 | -------------------------------------------------------------------------------- /cmake/static_analyzers.cmake: -------------------------------------------------------------------------------- 1 | option(ENABLE_CPPCHECK "Enable static analysis with cppcheck" OFF) 2 | option(ENABLE_CLANG_TIDY "Enable static analysis with clang-tidy" OFF) 3 | option(ENABLE_INCLUDE_WHAT_YOU_USE "Enable static 
analysis with include-what-you-use" OFF) 4 | 5 | if(ENABLE_CPPCHECK) 6 | find_program(CPPCHECK cppcheck) 7 | 8 | if(CPPCHECK) 9 | set(CMAKE_CXX_CPPCHECK ${CPPCHECK} 10 | --suppressions-list=${CMAKE_CURRENT_SOURCE_DIR}/.cppcheck-suppress 11 | --enable=all 12 | --inconclusive 13 | --inline-suppr) 14 | message(STATUS "Using cppcheck: " ${CPPCHECK}) 15 | else() 16 | message(SEND_ERROR "cppcheck requested but executable not found") 17 | endif() 18 | endif() 19 | 20 | if(ENABLE_CLANG_TIDY) 21 | find_program(CLANGTIDY clang-tidy) 22 | 23 | if(CLANGTIDY) 24 | set(CMAKE_CXX_CLANG_TIDY ${CLANGTIDY} -extra-arg=-Wno-unknown-warning-option) 25 | message(STATUS "Using clang-tidy: " ${CLANGTIDY}) 26 | else() 27 | message(SEND_ERROR "clang-tidy requested but executable not found") 28 | endif() 29 | endif() 30 | 31 | if(ENABLE_INCLUDE_WHAT_YOU_USE) 32 | find_program(INCLUDE_WHAT_YOU_USE include-what-you-use) 33 | 34 | if(INCLUDE_WHAT_YOU_USE) 35 | set(CMAKE_CXX_INCLUDE_WHAT_YOU_USE ${INCLUDE_WHAT_YOU_USE}) 36 | message(STATUS "Using include-what-you-use: " ${INCLUDE_WHAT_YOU_USE}) 37 | else() 38 | message(SEND_ERROR "include-what-you-use requested but executable not found") 39 | endif() 40 | endif() 41 | 42 | -------------------------------------------------------------------------------- /docs/assets/service_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm-service/e98847eb35a809b32bd6756c3f4b49a4facc425c/docs/assets/service_arch.png -------------------------------------------------------------------------------- /docs/assets/wechat_qrcode1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm-service/e98847eb35a809b32bd6756c3f4b49a4facc425c/docs/assets/wechat_qrcode1.png -------------------------------------------------------------------------------- /docs/assets/wechat_qrcode2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm-service/e98847eb35a809b32bd6756c3f4b49a4facc425c/docs/assets/wechat_qrcode2.png -------------------------------------------------------------------------------- /docs/assets/xllm_service_title.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm-service/e98847eb35a809b32bd6756c3f4b49a4facc425c/docs/assets/xllm_service_title.png -------------------------------------------------------------------------------- /docs/en/getting_started.md: -------------------------------------------------------------------------------- 1 | # Compilation and Execution 2 | 3 | ## Container 4 | First, download the image we provide: 5 | ```bash 6 | docker pull xllm-ai/xllm-0.6.0-dev-800I-A3-py3.11-openeuler24.03-lts-aarch64 7 | ``` 8 | Then create the corresponding container: 9 | ```bash 10 | sudo docker run -it --ipc=host -u 0 --privileged --name mydocker --network=host --device=/dev/davinci0 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -v /var/queue_schedule:/var/queue_schedule -v /mnt/cfs/9n-das-admin/llm_models:/mnt/cfs/9n-das-admin/llm_models -v /usr/local/Ascend/driver:/usr/local/Ascend/driver -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ -v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi -v /usr/local/sbin/:/usr/local/sbin/ -v /var/log/npu/conf/slog/slog.conf:/var/log/npu/conf/slog/slog.conf -v /var/log/npu/slog/:/var/log/npu/slog -v /export/home:/export/home -w /export/home -v ~/.ssh:/root/.ssh -v /var/log/npu/profiling/:/var/log/npu/profiling -v /var/log/npu/dump/:/var/log/npu/dump -v /home/:/home/ -v /runtime/:/runtime/ xllm-ai:xllm-0.6.0-dev-800I-A3-py3.11-openeuler24.03-lts-aarch64 11 | ``` 12 | 13 | ## Compilation 14 | ```bash 15 | git clone https://github.com/jd-opensource/xllm-service 16 | cd xllm_service 17 | git 
submodule init 18 | git submodule update 19 | ``` 20 | 21 | ### etcd Installation 22 | Use the official [installation script](https://github.com/etcd-io/etcd/releases) provided by etcd. Its default install path is `/tmp/etcd-download-test/etcd`; you can either edit the path in the script or move the binary afterwards: 23 | ```bash 24 | mv /tmp/etcd-download-test/etcd /path/to/your/etcd 25 | ``` 26 | 27 | ### Adding a Patch 28 | `etcd_cpp_apiv3` depends on the cpprest static library, but cpprest is built as a dynamic library by default. Therefore, you need to add a patch to the CMakeLists.txt of cpprest: 29 | ```bash 30 | bash prepare.sh 31 | ``` 32 | 33 | ### xLLM Service Compilation 34 | ```bash 35 | mkdir -p build 36 | cd build 37 | cmake .. 38 | make -j 8 39 | cd .. 40 | ``` 41 | !!! warning "Possible Errors" 42 | You may encounter installation errors about `boost-locale` and `boost-interprocess`: `vcpkg-src/packages/boost-locale_x64-linux/include: No such file or directory`,`/vcpkg-src/packages/boost-interprocess_x64-linux/include: No such file or directory` 43 | We use `vcpkg` to reinstall these packages: 44 | ```bash 45 | /path/to/vcpkg remove boost-locale boost-interprocess 46 | /path/to/vcpkg install boost-locale:x64-linux 47 | /path/to/vcpkg install boost-interprocess:x64-linux 48 | ``` 49 | 50 | ## Execution 51 | 1. First, start the etcd service: 52 | ```bash 53 | ./etcd-download-test/etcd --listen-peer-urls 'http://localhost:2390' --listen-client-urls 'http://localhost:2389' --advertise-client-urls 'http://localhost:2391' 54 | ``` 55 | 56 | 2. Then start the xllm-service service: 57 | ```bash 58 | ENABLE_DECODE_RESPONSE_TO_SERVICE=0 \ 59 | ENABLE_XLLM_DEBUG_LOG=1 \ 60 | ./build/xllm_service/xllm_master_serving \ 61 | --etcd_addr="127.0.0.1:2389" \ 62 | --http_server_port=9888 \ 63 | --rpc_server_port=9889 \ 64 | --tokenizer_path /path/to/tokenizer_config/ 65 | ``` 66 | 67 | xllm-service needs to start an http service and an rpc service. The http service is used to receive and process user requests, and the rpc service is used to interact with xllm instances.
68 | 69 | The complete usage process needs to be used with xllm, please refer to the link: [xLLM PD Disaggregated Deployment](https://xllm.readthedocs.io/zh-cn/latest/zh/getting_started/PD_disagg/) 70 | 71 | ### service Parameters 72 | http service:It is used to receive and process user requests. 73 | | Parameter | Description | Default Value | 74 | | --- | --- | --- | 75 | | http_server_host | http service address | "" | 76 | | http_server_port | http service port | 8888 | 77 | | http_server_idle_timeout_s | http service timeout | -1 | 78 | | http_server_num_threads | http service thread number | 32 | 79 | | http_server_max_concurrency | http service max concurrency | 128 | 80 | 81 | rpc service:It is used to interact with xllm, manage the status of xllm instance clusters, etc. 82 | | Parameter | Description | Default Value | 83 | | --- | --- | --- | 84 | | rpc_server_host | rpc service address | "" | 85 | | rpc_server_port | rpc service port | 8889 | 86 | | rpc_server_idle_timeout_s | rpc service timeout | -1 | 87 | | rpc_server_num_threads | rpc service thread number | 32 | 88 | | rpc_server_max_concurrency | rpc service max concurrency | 128 | 89 | 90 | Environment Variables: 91 | ENABLE_DECODE_RESPONSE_TO_SERVICE: In the PD disaggregated scenario, whether to return the decoding result to the service directly(without forwarding through the P instance), 0 means "no", 1 means "yes". 92 | ENABLE_XLLM_DEBUG_LOG: Whether to enable xllm debug log, 0 means "no", 1 means "yes". 93 | -------------------------------------------------------------------------------- /docs/en/overview.md: -------------------------------------------------------------------------------- 1 | --- 2 | hide: 3 | - navigation 4 | --- 5 | 6 |

7 | xLLM-service 8 |

9 | 10 | ## 1. Project Overview 11 | 12 | **xLLM-service** is a service-layer framework developed based on the **xLLM** inference engine, providing efficient, fault-tolerant, and flexible LLM inference services for clustered deployment. 13 | 14 | xLLM-service targets to address key challenges in enterprise-level service scenarios: 15 | 16 | - How to ensure the SLA of online services and improve resource utilization of offline tasks in a hybrid online-offline deployment environment. 17 | 18 | - How to react to changing request loads in actual businesses, such as fluctuations in input/output lengths. 19 | 20 | - Resolving performance bottlenecks of multimodal model requests. 21 | 22 | - Ensuring high reliability of computing instances. 23 | 24 | #### Background 25 | 26 | LLM with parameter scales ranging from tens of billions to trillions are being rapidly deployed in core business scenarios such as intelligent customer service, real-time recommendation, and content generation. Efficient support for domestic computing hardware has become a core requirement for low-cost inference deployment. Existing inference engines struggle to effectively adapt to the architectural characteristics of dedicated accelerators like domestic chips. Performance issues such as low utilization of computing units, load imbalance and communication overhead bottlenecks under the MoE architecture, and difficulties in kv cache management have restricted the efficient inference of requests and the scalability of the system. The xLLM-service + xLLM inference engine improves the efficiency of the entire performance link and currently supports JD\.com's online services across multiple scenarios and with multiple models. 27 | 28 | --- 29 | 30 | ## 2. Overall Architecture 31 | The overall architecture of xLLM-service is shown in the figure below: 32 | 33 | ![1](../assets/service_arch.png) 34 | 35 | ## 3. 
Core Components 36 | 37 | ### ETCD Cluster 38 | It is used for metadata management, including the storage and management of metadata such as models, xllm instances, and requests. It also provides xllm node registration and discovery services. 39 | 40 | ### Fault Tolerance 41 | xLLM-service provides fault tolerance management to ensure service quality and stability. 42 | 43 | ### Global Scheduler 44 | It implements globally aware scheduling. Based on the current system status, it accurately dispatches requests to the optimal instances for execution, effectively improving the overall service response efficiency and resource utilization. 45 | 46 | ### Global KV Cache Manager 47 | It is responsible for global KV Cache management. Its core capabilities include distributed KV cache awareness, Prefix matching, and dynamic migration of KV Cache, which optimize the efficiency of cache resource usage. 48 | 49 | ### Instance Manager 50 | It focuses on the full-lifecycle management of instances. All xllm instances must register to service after startup. Based on preset policies, the module provides support for instances such as scheduling adaptation and fault tolerance handling. 51 | 52 | ### Event Plane 53 | As the metrics and event hub, it receives Metrics data reported by various instances, uniformly collects and organizes statistical indicators, and provides data support for decisions such as service scheduling, fault tolerance, and scaling. 54 | 55 | ### Planner 56 | It undertakes the functions of strategy analysis and decision-making. Based on the Metrics data reported by the Event Plane (including instance runtime indicators, machine load indicators, etc.), it analyzes the service scaling needs and the necessity of expanding hot instances, and outputs resource adjustment and instance optimization strategies. 
57 | -------------------------------------------------------------------------------- /docs/zh/getting_started.md: -------------------------------------------------------------------------------- 1 | # 编译与运行 2 | 3 | ## 容器 4 | 首先下载我们提供的镜像: 5 | ```bash 6 | docker pull xllm-ai/xllm-0.6.0-dev-800I-A3-py3.11-openeuler24.03-lts-aarch64 7 | ``` 8 | 然后创建对应的容器 9 | ```bash 10 | sudo docker run -it --ipc=host -u 0 --privileged --name mydocker --network=host --device=/dev/davinci0 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -v /var/queue_schedule:/var/queue_schedule -v /mnt/cfs/9n-das-admin/llm_models:/mnt/cfs/9n-das-admin/llm_models -v /usr/local/Ascend/driver:/usr/local/Ascend/driver -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ -v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi -v /usr/local/sbin/:/usr/local/sbin/ -v /var/log/npu/conf/slog/slog.conf:/var/log/npu/conf/slog/slog.conf -v /var/log/npu/slog/:/var/log/npu/slog -v /export/home:/export/home -w /export/home -v ~/.ssh:/root/.ssh -v /var/log/npu/profiling/:/var/log/npu/profiling -v /var/log/npu/dump/:/var/log/npu/dump -v /home/:/home/ -v /runtime/:/runtime/ xllm-ai:xllm-0.6.0-dev-800I-A3-py3.11-openeuler24.03-lts-aarch64 11 | ``` 12 | 13 | ## 编译 14 | ```bash 15 | git clone https://github.com/jd-opensource/xllm-service 16 | cd xllm_service 17 | git submodule init 18 | git submodule update 19 | ``` 20 | 21 | ### etcd安装 22 | 使用etcd官方提供的[安装脚本](https://github.com/etcd-io/etcd/releases)进行安装,其脚本提供的默认安装路径是`/tmp/etcd-download-test/etcd`,我们可以手动修改其脚本中的安装路径,也可以运行完脚本之后手动迁移: 23 | ```bash 24 | mv /tmp/etcd-download-test/etcd /path/to/your/etcd 25 | ``` 26 | 27 | ### 添加补丁 28 | etcd_cpp_apiv3 依赖 cpprest 静态库,但 cpprest 编译产生的是动态库,因此需要给 cpprest 的 CMakeLists.txt 加一个补丁: 29 | ```bash 30 | bash prepare.sh 31 | ``` 32 | 33 | ### xLLM Service编译 34 | 再执行编译: 35 | ```bash 36 | mkdir -p build 37 | cd build 38 | cmake .. 39 | make -j 8 40 | cd .. 41 | ``` 42 | !!! 
warning "可能的错误" 43 | 这里能会遇到关于`boost-locale`和`boost-interprocess`的安装错误:`vcpkg-src/packages/boost-locale_x64-linux/include: No such file or directory`,`/vcpkg-src/packages/boost-interprocess_x64-linux/include: No such file or directory` 44 | 我们使用`vcpkg`重新安装这些包: 45 | ```bash 46 | /path/to/vcpkg remove boost-locale boost-interprocess 47 | /path/to/vcpkg install boost-locale:x64-linux 48 | /path/to/vcpkg install boost-interprocess:x64-linux 49 | ``` 50 | 51 | ## 运行 52 | 1. 首先需要启动etcd服务: 53 | ```bash 54 | ./etcd-download-test/etcd --listen-peer-urls 'http://localhost:2390' --listen-client-urls 'http://localhost:2389' --advertise-client-urls 'http://localhost:2391' 55 | ``` 56 | 57 | 2. 然后启动service服务: 58 | ```bash 59 | ENABLE_DECODE_RESPONSE_TO_SERVICE=0 \ 60 | ENABLE_XLLM_DEBUG_LOG=1 \ 61 | ./build/xllm_service/xllm_master_serving \ 62 | --etcd_addr="127.0.0.1:2389" \ 63 | --http_server_port=9888 \ 64 | --rpc_server_port=9889 \ 65 | --tokenizer_path /path/to/tokenizer_config/ 66 | ``` 67 | 68 | xllm-service需要启动一个http服务和一个rpc服务,http服务用于对外接收与处理用户请求,rpc服务用于和xllm实例进行交互。 69 | 70 | 完整的使用流程需要结合xllm一起使用,请查看链接: [xLLM PD分离部署](https://xllm.readthedocs.io/zh-cn/latest/zh/getting_started/PD_disagg/) 71 | 72 | ### service参数 73 | http服务:用于对外接收以及处理用户请求。 74 | | 参数 | 说明 | 默认值 | 75 | | --- | --- | --- | 76 | | http_server_host | http 服务地址 | "" | 77 | | http_server_port | http 服务端口 | 8888 | 78 | | http_server_idle_timeout_s | http 服务超时时间 | -1 | 79 | | http_server_num_threads | http 服务线程数 | 32 | 80 | | http_server_max_concurrency | http 服务最大请求并发数 | 128 | 81 | 82 | rpc服务:用于与xllm之间交互,管理xllm实例集群状态等。 83 | | 参数 | 说明 | 默认值 | 84 | | --- | --- | --- | 85 | | rpc_server_host | rpc 服务地址 | "" | 86 | | rpc_server_port | rpc 服务端口 | 8889 | 87 | | rpc_server_idle_timeout_s | rpc 服务超时时间 | -1 | 88 | | rpc_server_num_threads | rpc 服务线程数 | 32 | 89 | | rpc_server_max_concurrency | rpc 服务最大请求并发数 | 128 | 90 | 91 | 环境参数: 92 | ENABLE_DECODE_RESPONSE_TO_SERVICE: 
在PD分离场景下,是否将解码结果直接返回给service(不需要经过P实例转发),0表示“否”,1表示“是”。 93 | ENABLE_XLLM_DEBUG_LOG: 是否开启xllm debug log,0表示不开启,1表示开启。 94 | -------------------------------------------------------------------------------- /docs/zh/overview.md: -------------------------------------------------------------------------------- 1 | --- 2 | hide: 3 | - navigation 4 | --- 5 | 6 |

7 | xLLM-service 8 |

9 | 10 | ## 1. 简介 11 | 12 | **xLLM-service** 是一个基于 xLLM 推理引擎开发的服务层框架,为集群化部署提供高效率、高容错、高灵活性的大模型推理服务。 13 | 14 | xLLM-service 旨在解决企业级服务场景中的关键挑战: 15 | - 如何于在离线混合部署环境中,保障在线服务的SLA,提升离线任务的资源利用率。 16 | - 如何适应实际业务中动态变化的请求负载,如输入/输出长度出现剧烈波动。 17 | - 解决多模态模型请求的性能瓶颈。 18 | - 保障集群计算实例的高可靠性。 19 | 20 | #### 背景 21 | 当前,百亿至万亿参数规模的大语言模型正快速部署于智能客服、实时推荐、内容生成等核心业务场景,对国产计算硬件的高效支持已成为低成本推理部署的核心需求。现有推理引擎难以有效适配国产芯片等专用加速器的架构特性,硬件计算单元利用率低、MoE 架构下的负载不均衡与通信开销瓶颈、kv 缓存管理困难等问题,制约了请求的高效推理与系统的可扩展性。xLLM-service + xLLM推理引擎提升了全链路效率,目前已支撑京东多场景、多模型的线上服务。 22 | 23 | --- 24 | 25 | ## 2. 整体架构 26 | xLLM-service 整体架构如图所示: 27 | 28 | ![1](../assets/service_arch.png) 29 | 30 | ## 3. 核心组件 31 | 32 | ### ETCD Cluster 33 | 用于元信息管理,包括模型,xllm实例,请求等元信息的存储与管理。同时提供xllm节点注册与发现服务。 34 | 35 | ### Fault Tolerance 36 | xLLM-service 提供容错管理,保障服务质量以及稳定性。 37 | 38 | ### Global Scheduler 39 | 实现全局感知调度,根据当前系统状态,将请求精准调度至最优实例执行,有效提升整体服务响应效率与资源利用率。 40 | 41 | ### Global KV Cache Manager 42 | 负责全局 KV Cache 管理,核心能力包括分布式 KV 缓存感知、Prefix 前缀匹配、KV Cache 动态迁移等,优化缓存资源使用效率。 43 | 44 | ### Instance Manager 45 | 聚焦实例全生命周期管理,所有 xllm 实例启动后需向本模块注册,模块基于预设策略,为实例提供调度适配、容错处理等支持。 46 | 47 | ### Event Plane 48 | 作为指标与事件中枢,接收各实例上报的 Metrics 数据,对统计指标进行统一收集与整理,为服务调度、容错、扩缩容等决策提供数据支撑。 49 | 50 | ### Planner 51 | 承担策略分析与决策职能,基于 Event Plane 上报的 Metrics 数据(含实例运行时指标、机器负载指标等),分析服务扩缩容需求、热点实例扩展必要性,输出资源调整与实例优化策略。 -------------------------------------------------------------------------------- /prepare.sh: -------------------------------------------------------------------------------- 1 | cd ./third_party/cpprestsdk 2 | git apply ../custom_cache/cpprestsdk.patch -------------------------------------------------------------------------------- /third_party/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | add_subdirectory(cpprestsdk) 4 | set(CPPREST_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/cpprestsdk/Release/include) 5 | set(CPPREST_LIB 
${CMAKE_BINARY_DIR}/third_party/cpprestsdk/Release/Binaries/libcpprest.a) 6 | add_subdirectory(etcd_cpp_apiv3) 7 | add_subdirectory(brpc) 8 | add_subdirectory(minja) 9 | add_subdirectory(sentencepiece) 10 | add_subdirectory(smhasher/src) -------------------------------------------------------------------------------- /third_party/custom_cache/cpprestsdk.patch: -------------------------------------------------------------------------------- 1 | diff --git a/Release/CMakeLists.txt b/Release/CMakeLists.txt 2 | index 14e43ced..428e0038 100644 3 | --- a/Release/CMakeLists.txt 4 | +++ b/Release/CMakeLists.txt 5 | @@ -26,7 +26,7 @@ set(CPPREST_INSTALL ON CACHE BOOL "Add install commands.") 6 | if(IOS OR ANDROID) 7 | set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries") 8 | else() 9 | - set(BUILD_SHARED_LIBS ON CACHE BOOL "Build shared libraries") 10 | + set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries") 11 | endif() 12 | 13 | if(IOS OR ANDROID OR WINDOWS_STORE OR WINDOWS_PHONE) 14 | -------------------------------------------------------------------------------- /vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "xllm", 3 | "version": "0.0.1", 4 | "dependencies": [ 5 | { 6 | "name": "abseil", 7 | "version>=": "20230125.3" 8 | }, 9 | { 10 | "name": "boost-asio", 11 | "version>=": "1.84.0" 12 | }, 13 | { 14 | "name": "boost-algorithm", 15 | "version>=": "1.84.0" 16 | }, 17 | { 18 | "name": "boost-beast", 19 | "version>=": "1.84.0" 20 | }, 21 | { 22 | "name": "boost-thread", 23 | "version>=": "1.84.0" 24 | }, 25 | { 26 | "name": "boost-filesystem", 27 | "version>=": "1.84.0" 28 | }, 29 | { 30 | "name": "boost-chrono", 31 | "version>=": "1.84.0" 32 | }, 33 | { 34 | "name": "boost-atomic", 35 | "version>=": "1.84.0" 36 | }, 37 | { 38 | "name": "boost-random", 39 | "version>=": "1.84.0" 40 | }, 41 | { 42 | "name": "boost-serialization", 43 | "version>=": "1.84.0" 44 | }, 45 | { 46 | "name": 
"boost-locale", 47 | "version>=": "1.84.0" 48 | }, 49 | { 50 | "name": "boost-interprocess", 51 | "version>=": "1.84.0" 52 | }, 53 | { 54 | "name": "eigen3", 55 | "version>=": "3.4.0" 56 | }, 57 | { 58 | "name": "protobuf", 59 | "version>=": "3.21.12", 60 | "features": ["zlib"] 61 | }, 62 | { 63 | "name": "gflags", 64 | "version>=": "2.2.2#7" 65 | }, 66 | { 67 | "name": "gtest", 68 | "version>=": "1.13.0" 69 | }, 70 | { 71 | "name": "glog", 72 | "version>=": "0.6.0#2" 73 | }, 74 | { 75 | "name": "grpc", 76 | "version>=": "1.51.1", 77 | "default-features": false 78 | }, 79 | { 80 | "name": "leveldb", 81 | "version>=": "1.23", 82 | "default-features": false 83 | }, 84 | { 85 | "name": "openssl", 86 | "version>=": "3.2.1" 87 | }, 88 | { 89 | "name": "snappy", 90 | "version>=": "1.1.10" 91 | }, 92 | { 93 | "name": "nlohmann-json", 94 | "version>=": "3.11.2", 95 | "default-features": false 96 | }, 97 | { 98 | "name": "zlib" , 99 | "version>=": "1.3.1" 100 | }, 101 | { 102 | "name": "re2", 103 | "version>=": "2023-07-01", 104 | "default-features": false 105 | } 106 | ], 107 | "builtin-baseline": "fba75d09065fcc76a25dcf386b1d00d33f5175af" 108 | } 109 | 110 | -------------------------------------------------------------------------------- /xllm_service/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_binary) 2 | include(cc_library) 3 | include(cc_test) 4 | 5 | add_subdirectory(proto) 6 | add_subdirectory(common) 7 | add_subdirectory(request) 8 | add_subdirectory(rpc_service) 9 | add_subdirectory(tokenizer) 10 | add_subdirectory(chat_template) 11 | add_subdirectory(http_service) 12 | add_subdirectory(scheduler) 13 | 14 | cc_binary( 15 | NAME 16 | xllm_master_serving 17 | HDRS 18 | master.h 19 | SRCS 20 | master.cpp 21 | DEPS 22 | :xllm_http_service 23 | :xllm_rpc_service 24 | ) 25 | 26 | add_subdirectory(examples) 27 | -------------------------------------------------------------------------------- 
/xllm_service/chat_template/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | cc_library ( 5 | NAME 6 | chat_template 7 | HDRS 8 | jinja_chat_template.h 9 | SRCS 10 | jinja_chat_template.cpp 11 | DEPS 12 | :minja 13 | :tokenizer 14 | nlohmann_json::nlohmann_json 15 | glog::glog 16 | ) 17 | 18 | cc_test ( 19 | NAME 20 | chat_template_test 21 | SRCS 22 | jinja_chat_template_test.cpp 23 | DEPS 24 | :chat_template 25 | GTest::gtest_main 26 | ) 27 | -------------------------------------------------------------------------------- /xllm_service/chat_template/jinja_chat_template.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "jinja_chat_template.h" 17 | 18 | #include 19 | #include 20 | 21 | #include 22 | #include 23 | 24 | namespace xllm_service { 25 | 26 | JinjaChatTemplate::JinjaChatTemplate(const TokenizerArgs& args) : args_(args) { 27 | try { 28 | template_ = std::make_unique( 29 | args_.chat_template(), args_.bos_token(), args_.eos_token()); 30 | LOG(INFO) << "Jinja chat template init succeed."; 31 | 32 | } catch (const std::exception& e) { 33 | LOG(FATAL) << "Failed to parse jinja chat template, TokenizerArgs: " 34 | << args_ << std::endl 35 | << "Error message: " << e.what(); 36 | } 37 | } 38 | 39 | std::optional JinjaChatTemplate::apply( 40 | const ChatMessages& messages) const { 41 | const std::vector empty_tools; 42 | return apply(messages, empty_tools); 43 | } 44 | 45 | std::optional JinjaChatTemplate::apply( 46 | nlohmann::ordered_json& messages) const { 47 | // Call the overloaded method with empty tools 48 | nlohmann::ordered_json empty_tools = nlohmann::json::array(); 49 | return apply(messages, empty_tools); 50 | } 51 | 52 | std::optional JinjaChatTemplate::apply( 53 | const ChatMessages& messages, 54 | const std::vector& json_tools) const { 55 | // convert the messages to json object 56 | nlohmann::ordered_json messages_json = nlohmann::json::array(); 57 | for (const auto& message : messages) { 58 | nlohmann::ordered_json message_json; 59 | message_json["role"] = message.role; 60 | 61 | if (std::holds_alternative(message.content)) { 62 | message_json["content"] = std::get(message.content); 63 | } else if (std::holds_alternative(message.content)) { 64 | message_json["content"] = 65 | get_mm_content(std::get(message.content)); 66 | } 67 | 68 | messages_json.push_back(message_json); 69 | } 70 | 71 | nlohmann::ordered_json tools_json = nlohmann::json::array(); 72 | for (const auto& json_tool : json_tools) { 73 | nlohmann::ordered_json tool_json; 74 | tool_json["type"] = 
json_tool.type; 75 | 76 | nlohmann::ordered_json function_json; 77 | function_json["name"] = json_tool.function.name; 78 | function_json["description"] = json_tool.function.description; 79 | function_json["parameters"] = json_tool.function.parameters; 80 | 81 | tool_json["function"] = function_json; 82 | tools_json.push_back(tool_json); 83 | } 84 | // apply the template 85 | return apply(messages_json, tools_json); 86 | } 87 | 88 | std::optional JinjaChatTemplate::apply( 89 | nlohmann::ordered_json& messages, 90 | const nlohmann::ordered_json& tools) const { 91 | minja::chat_template_inputs input; 92 | input.messages = messages; 93 | input.tools = tools; 94 | input.add_generation_prompt = true; 95 | minja::chat_template_options options; 96 | 97 | return template_->apply(input, options); 98 | } 99 | 100 | nlohmann::ordered_json JinjaChatTemplate::get_mm_content( 101 | const Message::MMContentVec& vec) const { 102 | nlohmann::ordered_json content_json = nlohmann::json::array(); 103 | 104 | for (const auto& item : vec) { 105 | nlohmann::ordered_json item_json; 106 | item_json["type"] = item.type; 107 | 108 | if (item.type == "text") { 109 | item_json["text"] = item.text; 110 | } else { 111 | item_json[item.type] = "mm place holder"; 112 | } 113 | 114 | content_json.emplace_back(item_json); 115 | } 116 | 117 | return std::move(content_json); 118 | } 119 | 120 | } // namespace xllm_service 121 | -------------------------------------------------------------------------------- /xllm_service/chat_template/jinja_chat_template.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #include "common/types.h" 26 | #include "tokenizer/tokenizer_args.h" 27 | 28 | namespace xllm_service { 29 | 30 | struct Message { 31 | struct MMUrl { 32 | std::string url; 33 | }; 34 | 35 | struct MMContent { 36 | MMContent(const std::string& type) : type(type) {} 37 | MMContent(const std::string& type, const std::string& text) 38 | : type(type), text(text) {} 39 | 40 | std::string type; 41 | 42 | std::string text; 43 | MMUrl image_url; // image place holder 44 | 45 | MMUrl video_url; // video place holder 46 | MMUrl audio_url; // audio place holder 47 | }; 48 | 49 | using MMContentVec = std::vector; 50 | using Content = std::variant; 51 | 52 | Message() = default; 53 | Message(const std::string& role, const std::string& content) 54 | : role(role), content(content) {} 55 | 56 | Message(const std::string& role, const MMContentVec& content) 57 | : role(role), content(content) {} 58 | 59 | std::string role; 60 | Content content; 61 | }; 62 | using ChatMessages = std::vector; 63 | 64 | // A chat template implementation that uses jinja2 as the template engine. 
65 | class JinjaChatTemplate { 66 | public: 67 | JinjaChatTemplate(const TokenizerArgs& args); 68 | 69 | std::optional apply(const ChatMessages& messages) const; 70 | 71 | std::optional apply( 72 | const ChatMessages& messages, 73 | const std::vector& json_tools) const; 74 | 75 | // expose this function for testing 76 | // apply the template to the values in the json object 77 | std::optional apply(nlohmann::ordered_json& messages) const; 78 | 79 | std::optional apply(nlohmann::ordered_json& messages, 80 | const nlohmann::ordered_json& tools) const; 81 | 82 | private: 83 | nlohmann::ordered_json get_mm_content(const Message::MMContentVec& vec) const; 84 | 85 | private: 86 | TokenizerArgs args_; 87 | std::unique_ptr template_; 88 | }; 89 | 90 | } // namespace xllm_service 91 | -------------------------------------------------------------------------------- /xllm_service/chat_template/jinja_chat_template_test.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "jinja_chat_template.h" 17 | 18 | #include 19 | 20 | namespace xllm_service { 21 | 22 | TEST(JinjaChatTemplate, OpenChatModel) { 23 | // clang-format off 24 | const std::string template_str = 25 | "" 26 | "{% for message in messages %}" 27 | "{{ 'GPT4 Correct ' + message['role'] + ': ' + message['content'] + '<|end_of_turn|>'}}" 28 | "{% endfor %}" 29 | "{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}"; 30 | 31 | nlohmann::ordered_json messages = { 32 | {{"role", "system"}, {"content", "you are a helpful assistant."}}, 33 | {{"role", "user"}, {"content", "hi"}}, 34 | {{"role", "assistant"}, {"content", "what i can do for you?"}}, 35 | {{"role", "user"}, {"content", "how are you?"}}}; 36 | const std::string expected = 37 | "" 38 | "GPT4 Correct system: you are a helpful assistant.<|end_of_turn|>" 39 | "GPT4 Correct user: hi<|end_of_turn|>" 40 | "GPT4 Correct assistant: what i can do for you?<|end_of_turn|>" 41 | "GPT4 Correct user: how are you?<|end_of_turn|>" 42 | "GPT4 Correct Assistant:"; 43 | // clang-format on 44 | 45 | TokenizerArgs args; 46 | args.chat_template(template_str); 47 | args.bos_token(""); 48 | args.eos_token("<|end_of_turn|>"); 49 | JinjaChatTemplate template_(args); 50 | auto result = template_.apply(messages); 51 | ASSERT_TRUE(result.has_value()); 52 | 53 | EXPECT_EQ(result.value(), expected); 54 | } 55 | 56 | } // namespace xllm_service 57 | -------------------------------------------------------------------------------- /xllm_service/common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | cc_library( 4 | NAME 5 | common 6 | HDRS 7 | call_data.h 8 | closure_guard.h 9 | concurrent_queue.h 10 | global_gflags.h 11 | json_reader.h 12 | macros.h 13 | slice.h 14 | threadpool.h 15 | ttft_predictor.h 16 | types.h 17 | utils.h 18 | hash_util.h 19 
| xllm/output.h 20 | xllm/status.h 21 | xllm/uuid.h 22 | SRCS 23 | global_gflags.cpp 24 | json_reader.cpp 25 | threadpool.cpp 26 | ttft_predictor.cpp 27 | utils.cpp 28 | hash_util.cpp 29 | xllm/uuid.cpp 30 | DEPS 31 | absl::random_random 32 | absl::strings 33 | glog::glog 34 | gflags::gflags 35 | nlohmann_json::nlohmann_json 36 | SMHasherSupport 37 | proto_xllm 38 | ) 39 | add_dependencies(common brpc-static) 40 | -------------------------------------------------------------------------------- /xllm_service/common/closure_guard.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | 19 | #include 20 | 21 | #include "butil/macros.h" 22 | 23 | namespace xllm_service { 24 | 25 | // RAII: Call Run() of the closure on destruction. 26 | class ClosureGuard { 27 | public: 28 | ClosureGuard() : done_(nullptr) {} 29 | 30 | // Constructed with a closure which will be Run() inside dtor. 31 | explicit ClosureGuard(google::protobuf::Closure* done) : done_(done) {} 32 | 33 | // Run internal closure if it's not NULL. 
34 | ~ClosureGuard() { 35 | if (done_) { 36 | done_->Run(); 37 | } 38 | } 39 | 40 | // Run internal closure if it's not NULL and set it to `done'. 41 | void reset(google::protobuf::Closure* done) { 42 | if (done_) { 43 | done_->Run(); 44 | } 45 | done_ = done; 46 | } 47 | 48 | // Return and set internal closure to NULL. 49 | google::protobuf::Closure* release() { 50 | google::protobuf::Closure* const prev_done = done_; 51 | done_ = nullptr; 52 | return prev_done; 53 | } 54 | 55 | // True if no closure inside. 56 | bool empty() const { return done_ == nullptr; } 57 | 58 | // Exchange closure with another guard. 59 | void swap(ClosureGuard& other) { std::swap(done_, other.done_); } 60 | 61 | private: 62 | // Copying this object makes no sense. 63 | DISALLOW_COPY_AND_ASSIGN(ClosureGuard); 64 | 65 | google::protobuf::Closure* done_ = nullptr; 66 | }; 67 | 68 | } // namespace xllm_service 69 | -------------------------------------------------------------------------------- /xllm_service/common/concurrent_queue.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | 19 | #include 20 | 21 | #include 22 | 23 | #if __has_attribute(guarded_by) 24 | #define GUARDED_BY(x) __attribute__((guarded_by(x))) 25 | #else 26 | #define GUARDED_BY(x) 27 | #endif 28 | 29 | namespace xllm_service { 30 | 31 | // a simple thread-safe queue that supports multiple producers and multiple 32 | // consumers concurrently the queue is implemented as a queue with condition 33 | // variable and mutex lock 34 | template 35 | class ConcurrentQueue { 36 | public: 37 | // constructor 38 | ConcurrentQueue() = default; 39 | 40 | explicit ConcurrentQueue(size_t capacity) : capacity_(capacity) {} 41 | 42 | // destructor 43 | ~ConcurrentQueue() = default; 44 | 45 | // push an element to the queue 46 | void push(T value) { 47 | absl::MutexLock lock(&mutex_); 48 | if (capacity_ > 0) { 49 | auto not_full = [this]() { return queue_.size() < capacity_; }; 50 | mutex_.Await(absl::Condition(¬_full)); 51 | } 52 | queue_.push(std::move(value)); 53 | } 54 | 55 | template 56 | void emplace(Args&&... 
args) { 57 | absl::MutexLock lock(&mutex_); 58 | if (capacity_ > 0) { 59 | auto not_full = [this]() { return queue_.size() < capacity_; }; 60 | mutex_.Await(absl::Condition(¬_full)); 61 | } 62 | queue_.emplace(std::forward(args)...); 63 | } 64 | 65 | // pop an element from the queue, block if the queue is empty 66 | T pop() { 67 | absl::MutexLock lock(&mutex_); 68 | 69 | auto not_empty = [this]() { return !queue_.empty(); }; 70 | mutex_.Await(absl::Condition(¬_empty)); 71 | 72 | T value = std::move(queue_.front()); 73 | queue_.pop(); 74 | return value; 75 | } 76 | 77 | // return the size of the queue 78 | size_t size() { 79 | absl::MutexLock lock(&mutex_); 80 | return queue_.size(); 81 | } 82 | 83 | // return true if the queue is empty 84 | bool empty() { 85 | absl::MutexLock lock(&mutex_); 86 | return queue_.empty(); 87 | } 88 | 89 | private: 90 | // the underlying queue 91 | std::queue queue_ GUARDED_BY(mutex_); 92 | // mutex lock for the queue 93 | absl::Mutex mutex_; 94 | 95 | // maximum capacity of the queue, 0 means no limit. 96 | // when the queue is full, push will block 97 | size_t capacity_ = 0; 98 | }; 99 | 100 | } // namespace xllm_service 101 | -------------------------------------------------------------------------------- /xllm_service/common/global_gflags.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include "common/global_gflags.h" 17 | 18 | DEFINE_string(server_host, 19 | "", 20 | "Server listen address, may be IPV4/IPV6/UDS." 21 | " If this is set, the flag port will be ignored"); 22 | 23 | DEFINE_int32(http_server_port, 8888, "Port for xllm http service to listen on"); 24 | 25 | DEFINE_int32(http_server_idle_timeout_s, 26 | -1, 27 | "Connection will be closed if there is no " 28 | "read/write operations during the last `idle_timeout_s'"); 29 | 30 | DEFINE_int32(http_server_num_threads, 32, "Maximum number of threads to use"); 31 | 32 | DEFINE_int32(http_server_max_concurrency, 33 | 128, 34 | "Limit number of requests processed in parallel"); 35 | 36 | DEFINE_int32(rpc_server_port, 8889, "Port for xllm rpc service to listen on"); 37 | 38 | DEFINE_int32(rpc_server_idle_timeout_s, 39 | -1, 40 | "Connection will be closed if there is no " 41 | "read/write operations during the last `idle_timeout_s'"); 42 | 43 | DEFINE_int32(rpc_server_num_threads, 32, "Maximum number of threads to use"); 44 | 45 | DEFINE_int32(rpc_server_max_concurrency, 46 | 128, 47 | "Limit number of requests processed in parallel"); 48 | 49 | DEFINE_string(etcd_addr, 50 | "0.0.0.0:2379", 51 | "etcd adderss for save instance meta info"); 52 | 53 | DEFINE_uint32(murmur_hash3_seed, 1024, "default Murmur Hash seed"); 54 | 55 | DEFINE_int32(port, 8888, "Port for xllm service to listen on"); 56 | 57 | DEFINE_int32(num_threads, 32, "Number of threads to process requests"); 58 | 59 | DEFINE_int32(max_concurrency, 60 | 128, 61 | "Limit number of requests processed in parallel"); 62 | 63 | DEFINE_int32(timeout_ms, 64 | -1, 65 | "Max duration of bRPC Channel. -1 means wait indefinitely."); 66 | 67 | DEFINE_string(listen_addr, 68 | "", 69 | "Server listen address, may be IPV4/IPV6/UDS." 
70 | " If this is set, the flag port will be ignored"); 71 | 72 | DEFINE_int32(idle_timeout_s, 73 | -1, 74 | "Connection will be closed if there is no " 75 | "read/write operations during the last `idle_timeout_s'"); 76 | 77 | DEFINE_string(load_balance_policy, 78 | "RR", 79 | "Disaggregated prefill-decode policy."); 80 | 81 | DEFINE_int32(detect_disconnected_instance_interval, 82 | 15, 83 | "The interval that server detect the disconnected instance."); 84 | 85 | DEFINE_int32(block_size, 86 | 16, 87 | "Number of slots per kv cache block. Default is 16."); 88 | 89 | DEFINE_string(tokenizer_path, "", "tokenizer config path."); 90 | 91 | DEFINE_bool(enable_request_trace, false, "Whether to enable request trace"); 92 | -------------------------------------------------------------------------------- /xllm_service/common/global_gflags.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | 20 | DECLARE_string(server_host); 21 | 22 | DECLARE_int32(http_server_port); 23 | 24 | DECLARE_int32(http_server_idle_timeout_s); 25 | 26 | DECLARE_int32(http_server_num_threads); 27 | 28 | DECLARE_int32(http_server_max_concurrency); 29 | 30 | DECLARE_int32(rpc_server_port); 31 | 32 | DECLARE_int32(rpc_server_idle_timeout_s); 33 | 34 | DECLARE_int32(rpc_server_num_threads); 35 | 36 | DECLARE_int32(rpc_server_max_concurrency); 37 | 38 | DECLARE_uint32(murmur_hash3_seed); 39 | 40 | DECLARE_int32(timeout_ms); 41 | 42 | DECLARE_string(listen_addr); 43 | 44 | DECLARE_int32(port); 45 | 46 | DECLARE_int32(idle_timeout_s); 47 | 48 | DECLARE_int32(num_threads); 49 | 50 | DECLARE_int32(max_concurrency); 51 | 52 | DECLARE_string(etcd_addr); 53 | 54 | DECLARE_string(load_balance_policy); 55 | 56 | DECLARE_int32(detect_disconnected_instance_interval); 57 | 58 | DECLARE_int32(block_size); 59 | 60 | DECLARE_string(tokenizer_path); 61 | 62 | DECLARE_bool(enable_request_trace); 63 | -------------------------------------------------------------------------------- /xllm_service/common/hash_util.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "common/hash_util.h" 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "common/global_gflags.h" 14 | 15 | namespace xllm_service { 16 | 17 | void murmur_hash3(const uint8_t* pre_hash_value, 18 | const Slice& token_ids, 19 | uint8_t* hash_value) { 20 | if (pre_hash_value == nullptr) { 21 | MurmurHash3_x64_128(reinterpret_cast(token_ids.data()), 22 | sizeof(int32_t) * token_ids.size(), 23 | FLAGS_murmur_hash3_seed, 24 | hash_value); 25 | } else { 26 | uint8_t key[1024]; 27 | 28 | int32_t data_len = 29 | sizeof(int32_t) * token_ids.size() + MURMUR_HASH3_VALUE_LEN; 30 | assert(sizeof(key) > 
data_len); 31 | 32 | memcpy(key, pre_hash_value, MURMUR_HASH3_VALUE_LEN); 33 | memcpy(key + MURMUR_HASH3_VALUE_LEN, 34 | reinterpret_cast(token_ids.data()), 35 | sizeof(int32_t) * token_ids.size()); 36 | 37 | // print_hex_array(key, data_len); 38 | MurmurHash3_x64_128(reinterpret_cast(key), 39 | data_len, 40 | FLAGS_murmur_hash3_seed, 41 | hash_value); 42 | } 43 | } 44 | 45 | void print_hex_array(uint8_t* array) { 46 | for (size_t i = 0; i < MURMUR_HASH3_VALUE_LEN; ++i) { 47 | unsigned char uc = static_cast(array[i]); 48 | std::cout << std::hex << std::setw(2) << std::setfill('0') 49 | << static_cast(uc); 50 | 51 | if (i % MURMUR_HASH3_VALUE_LEN == MURMUR_HASH3_VALUE_LEN - 1) { 52 | std::cout << std::endl; 53 | } 54 | 55 | else { 56 | std::cout << " "; 57 | } 58 | } 59 | std::cout << std::dec << std::endl; 60 | } 61 | 62 | } // namespace xllm_service -------------------------------------------------------------------------------- /xllm_service/common/hash_util.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "common/slice.h" 11 | 12 | namespace xllm_service { 13 | constexpr uint32_t MURMUR_HASH3_VALUE_LEN = 16; 14 | 15 | struct Murmur3Key { 16 | uint8_t data[MURMUR_HASH3_VALUE_LEN]; 17 | 18 | Murmur3Key() {} 19 | Murmur3Key(const uint8_t* const input_data) { 20 | memcpy(data, input_data, MURMUR_HASH3_VALUE_LEN); 21 | } 22 | Murmur3Key(const char* const input_data) { 23 | memcpy(data, input_data, MURMUR_HASH3_VALUE_LEN); 24 | } 25 | 26 | std::string to_string() const { 27 | return std::string(reinterpret_cast(data), 28 | MURMUR_HASH3_VALUE_LEN); 29 | } 30 | 31 | // BUGFIX: strncmp returns 0 when the buffers are EQUAL, so returning its 32 | // raw value inverted the predicate (equal keys compared false). Compare 33 | // against 0, matching FixedStringKeyEqual below; also const-qualified. 34 | bool operator==(const Murmur3Key& other) const { 35 | return strncmp(reinterpret_cast(data), 36 | reinterpret_cast(other.data), 37 | MURMUR_HASH3_VALUE_LEN) == 0; 38 | } 39 | }; 40 | 41 | struct FixedStringKeyHash { 42 | size_t operator()(const Murmur3Key& key) const { 
return std::hash()(std::string_view( 41 | reinterpret_cast(key.data), sizeof(key.data))); 42 | } 43 | }; 44 | 45 | struct FixedStringKeyEqual { 46 | bool operator()(const Murmur3Key& left, const Murmur3Key& right) const { 47 | return strncmp(reinterpret_cast(left.data), 48 | reinterpret_cast(right.data), 49 | sizeof(left.data)) == 0; 50 | } 51 | }; 52 | 53 | void print_hex_array(uint8_t* array); 54 | 55 | void murmur_hash3(const uint8_t* pre_hash_value, 56 | const Slice& token_ids, 57 | uint8_t* hash_value); 58 | 59 | } // namespace xllm_service 60 | -------------------------------------------------------------------------------- /xllm_service/common/json_reader.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | ==============================================================================*/ 16 | 17 | #include "common/json_reader.h" 18 | 19 | #include 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | namespace xllm_service { 26 | 27 | bool JsonReader::parse(const std::string& json_file_path) { 28 | if (!std::filesystem::exists(json_file_path)) { 29 | return false; 30 | } 31 | 32 | std::ifstream ifs(json_file_path); 33 | if (!ifs.is_open()) { 34 | return false; 35 | } 36 | 37 | data_ = nlohmann::json::parse(ifs); 38 | return true; 39 | } 40 | 41 | bool JsonReader::contains(const std::string& key) const { 42 | // slipt the key by '.' then traverse the json object 43 | std::vector keys = absl::StrSplit(key, '.'); 44 | nlohmann::json data = data_; 45 | for (const auto& k : keys) { 46 | if (!data.contains(k)) { 47 | return false; 48 | } 49 | data = data[k]; 50 | } 51 | return true; 52 | } 53 | 54 | } // namespace xllm_service -------------------------------------------------------------------------------- /xllm_service/common/json_reader.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | #include 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | namespace xllm_service { 26 | 27 | // an thin wrapper around nlohmann/json to read json files. 28 | // it supports read keys with dot notation from json. 29 | // for exmaple: value_or("a.b.c", 0) will return 100 for following json: 30 | // { 31 | // "a": { 32 | // "b": { 33 | // "c": 100 34 | // } 35 | // } 36 | // } 37 | // 38 | class JsonReader { 39 | public: 40 | // parse the json file, return true if success 41 | bool parse(const std::string& json_file_path); 42 | 43 | // check if the json contains the key, key can be nested with dot notation 44 | bool contains(const std::string& key) const; 45 | 46 | template 47 | T value_or(const std::vector& keys, T2 default_value) const { 48 | for (const auto& key : keys) { 49 | if (auto data = value(key)) { 50 | return data.value(); 51 | } 52 | } 53 | // may introduce implicit conversion from T2 to T 54 | return default_value; 55 | } 56 | 57 | template 58 | T value_or(const std::string& key, T2 default_value) const { 59 | if (auto data = value(key)) { 60 | return data.value(); 61 | } 62 | // may introduce implicit conversion from T2 to T 63 | return default_value; 64 | } 65 | 66 | template 67 | std::optional value(const std::string& key) const { 68 | // slipt the key by '.' 
then traverse the json object 69 | const std::vector keys = absl::StrSplit(key, '.'); 70 | nlohmann::json data = data_; 71 | for (const auto& k : keys) { 72 | if (data.contains(k)) { 73 | data = data[k]; 74 | } else { 75 | return std::nullopt; 76 | } 77 | } 78 | 79 | if (data.is_null() || data.is_structured()) { 80 | // cannot convert null or structured data to T 81 | return std::nullopt; 82 | } 83 | return data.get(); 84 | } 85 | 86 | nlohmann::json data() const { return data_; } 87 | 88 | private: 89 | nlohmann::json data_; 90 | }; 91 | 92 | } // namespace xllm_service -------------------------------------------------------------------------------- /xllm_service/common/macros.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
#pragma once

namespace xllm_service {

// Generates the boilerplate accessors/mutators plus the backing field for a
// named "property".  Expands to:
//  - const-lvalue / lvalue / rvalue getters (the rvalue getter moves the
//    field out of a temporary),
//  - chainable setters (copy and move) callable on lvalues only,
//  - deleted setters on rvalues (prevents mutating a temporary),
//  - the trailing `T property##_` field declaration — which is why call sites
//    end with `;` and may append ` = <default>` to initialize the field.
#define PROPERTY(T, property)                                                 \
 public:                                                                      \
  [[nodiscard]] const T& property() const& noexcept { return property##_; }   \
  [[nodiscard]] T& property() & noexcept { return property##_; }              \
  [[nodiscard]] T&& property() && noexcept { return std::move(property##_); } \
                                                                              \
  auto property(const T& value) & -> decltype(*this) {                        \
    property##_ = value;                                                      \
    return *this;                                                             \
  }                                                                           \
                                                                              \
  auto property(T&& value) & -> decltype(*this) {                             \
    property##_ = std::move(value);                                           \
    return *this;                                                             \
  }                                                                           \
                                                                              \
  void property(const T& value) && = delete;                                  \
  void property(T&& value) && = delete;                                       \
                                                                              \
  T property##_

// Silences unused-parameter warnings without deleting the parameter name.
#ifndef UNUSED_PARAMETER
#define UNUSED_PARAMETER(x) ((void)(x))
#endif

// Clang thread-safety annotation; expands to nothing when the compiler lacks
// the `guarded_by` attribute.
// NOTE(review): `__has_attribute` is itself a GCC/Clang extension — if a
// compiler without it must be supported, wrap this in `#ifdef __has_attribute`.
#if __has_attribute(guarded_by)
#define GUARDED_BY(x) __attribute__((guarded_by(x)))
#else
#define GUARDED_BY(x)
#endif

// concatenate two strings (token pasting)
#define LLM_STR_CAT(s1, s2) s1##s2

// create an anonymous variable, unique per source line
#define LLM_ANON_VAR(str) LLM_STR_CAT(str, __LINE__)

// SFINAE helper: use as a defaulted template parameter to constrain overloads.
#define REQUIRES(...) std::enable_if_t<(__VA_ARGS__)>* = nullptr

// Deletes copy construction and copy assignment for TypeName.
#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
  TypeName(const TypeName&) = delete;      \
  void operator=(const TypeName&) = delete

// Define a macro to simplify adding elements from a vector to a repeated field
#define ADD_VECTOR_TO_PROTO(proto_field, vec) \
  do {                                        \
    proto_field->Reserve(vec.size());         \
    for (const auto& value : vec) {           \
      *proto_field->Add() = value;            \
    }                                         \
  } while (0)

// Invokes the surrounding scope's `callback` with an error Status.
// NOTE(review): the expansion carries its own trailing `;`, so writing
// `CALLBACK_WITH_ERROR(...);` produces a double semicolon — harmless at
// statement scope, but worth confirming no call site uses it in an
// expression context.
#define CALLBACK_WITH_ERROR(CODE, MSG) callback(Status{CODE, MSG});

}  // namespace xllm_service
// Aggregated configuration for the xLLM service, populated once at startup.
// Built with the PROPERTY macro (common/macros.h): each entry generates
// getters, chainable setters, and the backing `<name>_` field.
class Options {
 public:
  Options() = default;
  ~Options() = default;

  // http server options
  // Host/interface the HTTP server binds to.
  PROPERTY(std::string, server_host);

  PROPERTY(int32_t, http_port) = 9998;

  // Idle-connection timeout in seconds; -1 presumably means "never time
  // out" — TODO(review): confirm against the server options this feeds.
  PROPERTY(int32_t, http_idle_timeout_s) = -1;

  PROPERTY(int32_t, http_num_threads) = 32;

  // 0 presumably means "unlimited" — TODO(review): confirm brpc semantics.
  PROPERTY(int32_t, http_max_concurrency) = 0;

  // rpc server options
  PROPERTY(int32_t, rpc_port) = 9999;

  PROPERTY(int32_t, rpc_idle_timeout_s) = -1;

  PROPERTY(int32_t, rpc_num_threads) = 32;

  PROPERTY(int32_t, rpc_max_concurrency) = 0;

  PROPERTY(int32_t, num_threads) = 32;

  PROPERTY(int32_t, max_concurrency) = 32;

  // NOTE(review): 32 ms is a suspiciously small timeout and matches the
  // thread-count defaults above — confirm this was not a copy-paste slip.
  PROPERTY(int32_t, timeout_ms) = 32;

  // instance manager options
  // etcd endpoint used for instance registration/discovery.
  PROPERTY(std::string, etcd_addr);

  // Polling interval for detecting disconnected instances (presumably
  // seconds — confirm at the call site).
  PROPERTY(int32_t, detect_disconnected_instance_interval) = 15;

  // scheduler options
  PROPERTY(std::string, load_balance_policy);

  PROPERTY(int32_t, block_size) = 128;

  // Seed for MurmurHash3-based prefix hashing.
  PROPERTY(uint32_t, murmur_hash3_seed) = 1024;

  PROPERTY(std::string, service_name);

  // tokenizer options
  PROPERTY(std::string, tokenizer_path);

  // trace options
  PROPERTY(bool, enable_request_trace) = false;
};
// A non-owning, read-only view over a contiguous array (akin to a pre-C++20
// std::span<const T>).  The viewed data must outlive the Slice.
template <typename T>
class Slice final {
 public:
  Slice() = default;

  Slice(const T* data, size_t size) : data_(data), size_(size) {}

  // it is on purpose to allow implicit conversion from vector to slice
  Slice(const std::vector<T>& data) : data_(data.data()), size_(data.size()) {}

  // View over the first `size` elements of `data`; `size` must not exceed
  // data.size() (CHECK-aborts otherwise).
  Slice(const std::vector<T>& data, size_t size)
      : data_(data.data()), size_(size) {
    CHECK_LE(size, data.size());
  }

  // iterator for the slice
  const T* begin() const { return data_; }
  const T* end() const { return data_ + size_; }

  // get the size of the slice
  size_t size() const { return size_; }

  // check if the slice is empty
  bool empty() const { return size_ == 0; }

  // get the data pointer
  const T* data() const { return data_; }

  // index operator (unchecked, like raw array indexing)
  const T& operator[](size_t i) const { return data_[i]; }

  // NOTE(review): front()/back() are undefined behavior on an empty slice,
  // matching std::vector's contract.
  const T& front() const { return data_[0]; }

  const T& back() const { return data_[size_ - 1]; }

  // get a sub slice from `start` (inclusive) to the end
  Slice slice(size_t start) const {
    CHECK_LE(start, size_);
    return {data_ + start, size_ - start};
  }

  // get a sub slice over the half-open range [start, end)
  Slice slice(size_t start, size_t end) const {
    CHECK(start <= end && end <= size_);
    return {data_ + start, end - start};
  }

  // it is safe to allow implicit conversion to vector (copies the elements)
  operator std::vector<T>() const { return {data_, data_ + size_}; }

 private:
  const T* data_ = nullptr;
  size_t size_ = 0;
};

// Helper comparison operators between slices and std::vector.  The
// `lhs.data() == rhs.data()` test short-circuits the element-wise compare
// when both views alias the same memory (sizes already proven equal).
template <typename T>
inline bool operator==(const Slice<T>& lhs, const std::vector<T>& rhs) {
  return lhs.size() == rhs.size() &&
         (lhs.data() == rhs.data() ||
          std::equal(lhs.begin(), lhs.end(), rhs.begin()));
}

template <typename T>
inline bool operator==(const std::vector<T>& lhs, const Slice<T>& rhs) {
  return lhs.size() == rhs.size() &&
         (lhs.data() == rhs.data() ||
          std::equal(lhs.begin(), lhs.end(), rhs.begin()));
}

template <typename T>
inline bool operator==(const Slice<T>& lhs, const Slice<T>& rhs) {
  return lhs.size() == rhs.size() &&
         (lhs.data() == rhs.data() ||
          std::equal(lhs.begin(), lhs.end(), rhs.begin()));
}
15 | ==============================================================================*/ 16 | 17 | #include "common/threadpool.h" 18 | 19 | #include 20 | 21 | #include "common/concurrent_queue.h" 22 | 23 | namespace xllm_service { 24 | 25 | ThreadPool::ThreadPool(size_t num_threads) { 26 | for (size_t i = 0; i < num_threads; ++i) { 27 | threads_.emplace_back([this]() { internal_loop(); }); 28 | } 29 | } 30 | 31 | ThreadPool::~ThreadPool() { 32 | // push nullptr to the queue to signal threads to exit 33 | for (size_t i = 0; i < threads_.size(); ++i) { 34 | queue_.push(nullptr); 35 | } 36 | // wait for all threads to finish 37 | for (auto& thread : threads_) { 38 | thread.join(); 39 | } 40 | } 41 | 42 | // schedule a task to be executed 43 | void ThreadPool::schedule(Task task) { 44 | if (task == nullptr) { 45 | return; 46 | } 47 | queue_.push(std::move(task)); 48 | } 49 | 50 | void ThreadPool::internal_loop() { 51 | while (true) { 52 | Task task = queue_.pop(); 53 | if (task == nullptr) { 54 | // nullptr is a signal to exit 55 | break; 56 | } 57 | task(); 58 | } 59 | } 60 | 61 | } // namespace xllm_service 62 | -------------------------------------------------------------------------------- /xllm_service/common/threadpool.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// Fixed-size pool of worker threads consuming tasks from a shared
// ConcurrentQueue.  The destructor signals and joins all workers.
class ThreadPool final {
 public:
  // A unit of work.  nullptr is reserved internally as the shutdown signal,
  // so schedule() drops null tasks.
  using Task = std::function<void()>;

  // constructors: default pool has a single worker
  ThreadPool() : ThreadPool(1) {}

  // disable copy/move constructor and assignment
  ThreadPool(const ThreadPool&) = delete;
  ThreadPool& operator=(const ThreadPool&) = delete;
  ThreadPool(ThreadPool&&) = delete;
  ThreadPool& operator=(ThreadPool&&) = delete;

  // Spawns `num_threads` workers immediately.
  explicit ThreadPool(size_t num_threads);

  // destructor: pushes one exit sentinel per worker and joins them all
  ~ThreadPool();

  // schedule a task to be executed asynchronously (FIFO order via queue_)
  void schedule(Task task);

 private:
  // Worker loop: pops tasks until it sees a nullptr sentinel.
  void internal_loop();

  std::vector<std::thread> threads_;
  ConcurrentQueue<Task> queue_;
};
14 | ==============================================================================*/ 15 | 16 | #include "ttft_predictor.h" 17 | 18 | static constexpr int32_t kDegree = 2; 19 | 20 | namespace xllm_service { 21 | 22 | TtftPredictor::TtftPredictor( 23 | const std::vector>& ttft_profiling_data) { 24 | if (!ttft_profiling_data.empty()) { 25 | // construct Vandermonde matrix 26 | int32_t m = ttft_profiling_data.size(); 27 | int32_t n = kDegree + 1; 28 | Eigen::MatrixXd matrix(m, n); 29 | for (int32_t i = 0; i < m; ++i) { 30 | for (int32_t j = 0; j < n; ++j) { 31 | matrix(i, j) = std::pow(ttft_profiling_data[i].first, j); 32 | } 33 | } 34 | 35 | // construct target vector 36 | Eigen::VectorXd target(m); 37 | for (int32_t i = 0; i < m; ++i) { 38 | target(i) = ttft_profiling_data[i].second; 39 | } 40 | 41 | // get coefficients 42 | coefficients_ = matrix.colPivHouseholderQr().solve(target); 43 | } else { 44 | coefficients_ = Eigen::VectorXd::Zero(1); 45 | } 46 | } 47 | 48 | int64_t TtftPredictor::predict_ttft(int32_t length) { 49 | double result = 0.0; 50 | double power = 1.0; 51 | for (int32_t i = 0; i < coefficients_.size(); ++i) { 52 | result += coefficients_(i) * power; 53 | power *= length; 54 | } 55 | 56 | return static_cast(result); 57 | } 58 | 59 | } // namespace xllm_service -------------------------------------------------------------------------------- /xllm_service/common/ttft_predictor.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
// Predictor for predicting TTFT (time-to-first-token) based on input length.
// Fits a fixed-degree polynomial via least squares to offline profiling
// samples of (input length, observed TTFT).
class TtftPredictor final {
 public:
  // ttft_profiling_data: (input length, observed TTFT) samples.  An empty
  // vector yields a predictor that always returns 0.
  TtftPredictor(
      const std::vector<std::pair<int32_t, int64_t>>& ttft_profiling_data);
  ~TtftPredictor() = default;

  // Evaluates the fitted polynomial at `length`.
  // NOTE(review): the returned value is in whatever unit the profiling
  // samples used (presumably milliseconds) — confirm at the call site.
  int64_t predict_ttft(int32_t length);

 private:
  // Polynomial coefficients, lowest degree first.
  Eigen::VectorXd coefficients_;
};
14 | ==============================================================================*/ 15 | 16 | #include "common/utils.h" 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include 24 | #include 25 | 26 | namespace xllm_service { 27 | namespace utils { 28 | 29 | bool enable_debug_log() { 30 | static bool debug_log_enabled = false; 31 | static std::once_flag debug_flag; 32 | std::call_once(debug_flag, []() { 33 | const char* enable_debug_env = std::getenv("ENABLE_XLLM_DEBUG_LOG"); 34 | if (enable_debug_env != nullptr && std::string(enable_debug_env) == "1") { 35 | debug_log_enabled = true; 36 | } 37 | }); 38 | 39 | return debug_log_enabled; 40 | } 41 | 42 | bool is_port_available(int port) { 43 | int fd = socket(AF_INET, SOCK_STREAM, 0); 44 | if (fd < 0) { 45 | LOG(ERROR) << "create socket failed."; 46 | return false; 47 | } 48 | 49 | int opt = 1; 50 | if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0) { 51 | LOG(WARNING) << "set socket options failed."; 52 | } 53 | 54 | struct sockaddr_in addr; 55 | memset(&addr, 0, sizeof(addr)); 56 | addr.sin_family = AF_INET; 57 | addr.sin_addr.s_addr = INADDR_ANY; 58 | addr.sin_port = htons(port); 59 | if (bind(fd, (struct sockaddr*)&addr, sizeof(addr)) != 0) { 60 | return false; 61 | } 62 | close(fd); 63 | 64 | return true; 65 | } 66 | 67 | bool get_bool_env(const std::string& key, bool defaultValue) { 68 | const char* val = std::getenv(key.c_str()); 69 | if (val == nullptr) { 70 | return defaultValue; 71 | } 72 | std::string strVal(val); 73 | return (strVal == "1" || strVal == "true" || strVal == "TRUE" || 74 | strVal == "True"); 75 | } 76 | 77 | std::string get_local_ip() { 78 | using namespace boost::asio; 79 | io_service io; 80 | ip::tcp::resolver resolver(io); 81 | ip::tcp::resolver::query query(ip::host_name(), ""); 82 | ip::tcp::resolver::iterator iter = resolver.resolve(query); 83 | ip::tcp::resolver::iterator end; 84 | 85 | while (iter != end) { 86 | ip::address addr = 
iter->endpoint().address(); 87 | if (!addr.is_loopback() && addr.is_v4()) { 88 | return addr.to_string(); 89 | } 90 | ++iter; 91 | } 92 | 93 | LOG(FATAL) << "Get local ip faill!"; 94 | return ""; 95 | } 96 | 97 | } // namespace utils 98 | } // namespace xllm_service 99 | -------------------------------------------------------------------------------- /xllm_service/common/utils.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | 20 | namespace xllm_service { 21 | namespace utils { 22 | 23 | bool enable_debug_log(); 24 | bool is_port_available(int port); 25 | bool get_bool_env(const std::string& key, bool defaultValue); 26 | std::string get_local_ip(); 27 | 28 | } // namespace utils 29 | } // namespace xllm_service 30 | -------------------------------------------------------------------------------- /xllm_service/common/xllm/output.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 
namespace xllm_service {
namespace llm {

// "stop" - the model hit a natural stop point or a provided stop sequence.
// "length" - the maximum number of tokens specified in the request was reached.
// "function_call" - the model called a function.
enum class FinishReason {
  NONE = 0,
  STOP = 1,
  LENGTH,
  FUNCTION_CALL,
};

// Token accounting for one request.
struct Usage {
  // the number of tokens in the prompt.
  size_t num_prompt_tokens = 0;

  // the number of tokens in the generated completion.
  size_t num_generated_tokens = 0;

  // the total number of tokens used in the request (prompt + completion).
  size_t num_total_tokens = 0;
};

// Log-probability record for a single generated token.
struct LogProbData {
  // the text of the token.
  std::string token;
  // the token id.
  int32_t token_id;
  // the log probability of the token (-9999.0f is the "unset" sentinel).
  float logprob = -9999.0f;
  // whether the token is finished.
  // NOTE(review): exact semantics not visible here — confirm with the
  // producer of this field.
  bool finished_token = true;
};

struct LogProb : public LogProbData {
  // the top log probabilities.
  std::optional<std::vector<LogProbData>> top_logprobs;
};

// TODO: support embeddings later
struct SequenceOutput {
  // the index of the sequence in the request.
  size_t index;

  // the generated/delta text.
  // delta text is the text generated since the last response for streaming.
  std::string text;

  // the token ids of the generated text.
  std::vector<int32_t> token_ids;

  // the reason the sequence finished.
  std::optional<FinishReason> finish_reason;

  // log probabilities of the generated tokens.
  std::optional<std::vector<LogProb>> logprobs;
};

struct RequestOutput {
  RequestOutput() = default;

  // Convenience constructor for error results: only the status is set.
  RequestOutput(Status&& _status) : status(std::move(_status)) {}

  // the id of the request.
  std::string request_id;

  // id assigned at the service layer (distinct from request_id).
  std::string service_request_id;

  // the prompt text for the request.
  std::optional<std::string> prompt;

  // the status of the request.
  std::optional<Status> status;

  // the output for each sequence in the request.
  std::vector<SequenceOutput> outputs;

  // the statistics for the request.
  std::optional<Usage> usage;

  // whether the request is finished.
  bool finished = false;
};

// Maps a finish reason to its wire string ("stop"/"length"/"function_call");
// NONE — and any unknown value, after a warning — maps to nullopt.
inline std::optional<std::string> to_string(FinishReason reason) {
  switch (reason) {
    case FinishReason::NONE:
      return std::nullopt;
    case FinishReason::STOP:
      return "stop";
    case FinishReason::LENGTH:
      return "length";
    case FinishReason::FUNCTION_CALL:
      return "function_call";
    default:
      LOG(WARNING) << "Unknown finish reason: " << static_cast<int>(reason);
  }
  return std::nullopt;
}

}  // namespace llm

// Streaming callback invoked once per produced RequestOutput.
// NOTE(review): the template argument was lost in extraction and is
// reconstructed here; the bool return presumably means "continue
// streaming" — confirm against the callers.
using OutputCallback = std::function<bool(llm::RequestOutput output)>;

}  // namespace xllm_service
// Lightweight status value: a StatusCode plus an optional human-readable
// message.  A default-constructed Status is OK.
class Status final {
 public:
  Status() = default;

  // Implicit conversion from a bare code (no message).
  Status(StatusCode code) : code_(code) {}

  Status(StatusCode code, std::string msg)
      : code_(code), msg_(std::move(msg)) {}

  // the status code.
  StatusCode code() const { return code_; }

  // the attached message; empty when none was provided.
  const std::string& message() const { return msg_; }

  // true iff the code is StatusCode::OK.
  bool ok() const { return code_ == StatusCode::OK; }

 private:
  StatusCode code_ = StatusCode::OK;
  std::string msg_;
};

// Streams "Status, code: <n>, message: <msg>" for logging.
// NOTE(review): the static_cast target was lost in extraction; cast to int
// so the uint8_t code prints as a number rather than a character.
inline std::ostream& operator<<(std::ostream& os, const Status& status) {
  os << "Status, code: " << static_cast<int>(status.code())
     << ", message: " << status.message();
  return os;
}
6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | ==============================================================================*/ 16 | 17 | #include "uuid.h" 18 | 19 | #include 20 | 21 | namespace xllm_service { 22 | namespace llm { 23 | 24 | std::string ShortUUID::random(size_t len) { 25 | if (len == 0) { 26 | len = 22; 27 | } 28 | 29 | std::string uuid(len, ' '); 30 | for (size_t i = 0; i < len; i++) { 31 | const size_t rand = absl::Uniform( 32 | absl::IntervalClosedOpen, gen_, 0, alphabet_.size()); 33 | uuid[i] = alphabet_[rand]; 34 | } 35 | return uuid; 36 | } 37 | 38 | } // namespace llm 39 | } // namespace xllm_service 40 | -------------------------------------------------------------------------------- /xllm_service/common/xllm/uuid.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
// Generator of short, URL-safe random identifiers (not RFC-4122 UUIDs).
class ShortUUID {
 public:
  ShortUUID() = default;

  // Returns a random id of `len` characters; len == 0 selects the
  // implementation's default length (22).
  std::string random(size_t len = 0);

 private:
  // 57-character alphabet omitting easily-confused glyphs (0/O, 1/I/l).
  std::string alphabet_ =
      "23456789ABCDEFGHJKLMNPQRSTUVWXYZ"
      "abcdefghijkmnopqrstuvwxyz";
  // Abseil PRNG state; not deterministically seeded.
  absl::BitGen gen_;
};
DEFINE_string(server_address, "localhost:9999", "Grpc server address.");
DEFINE_string(client_name, "127.0.0.1@9999", "client name.");
DEFINE_string(protocol,
              "baidu_std",
              "Protocol type. Defined in src/brpc/options.proto");
DEFINE_string(connection_type,
              "",
              "Connection type. Available values: single, pooled, short");
DEFINE_string(server, "0.0.0.0:8000", "IP Address of server");
DEFINE_string(load_balancer, "", "The algorithm for load balancing");
DEFINE_int32(timeout_ms, 100, "RPC timeout in milliseconds");
DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)");
DEFINE_int32(interval_ms, 1000, "Milliseconds between consecutive requests");

// Example client: registers one instance with the xLLM rpc service and then
// stays alive so the service can keep observing the connection.
// NOTE(review): only --server_address and --client_name are consumed below;
// the remaining flags are defined but not wired into ChannelOptions here.
int main(int argc, char* argv[]) {
  // initialize glog and gflags
  google::InitGoogleLogging(argv[0]);
  gflags::ParseCommandLineFlags(&argc, &argv, true);

  // Define the server address and port
  std::string server_address(FLAGS_server_address);

  // Default-constructed channel options.
  xllm_service::ChannelOptions options;

  // Create a client instance
  xllm_service::XllmRpcClient client(
      FLAGS_client_name, server_address, options);

  // Register the instance
  auto ret = client.register_instance();
  if (ret != xllm_service::ErrorCode::OK) {
    LOG(ERROR) << "Register instance failed.";
    return -1;
  }

  // Keep the client running (blocks forever; stop with Ctrl-C).
  while (true) {
    sleep(1);
  }

  return 0;  // unreachable
}
Available values: single, pooled, short"); 34 | DEFINE_string(load_balancer, "", "The algorithm for load balancing"); 35 | DEFINE_int32(timeout_ms, 100, "RPC timeout in milliseconds"); 36 | DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)"); 37 | DEFINE_int32(interval_ms, 1000, "Milliseconds between consecutive requests"); 38 | 39 | namespace xllm_service { 40 | namespace test { 41 | 42 | struct ChannelOptions { 43 | std::string protocol = "baidu_std"; 44 | std::string connection_type = ""; 45 | std::string load_balancer = ""; 46 | int timeout_ms = 100; 47 | int max_retry = 3; 48 | int interval_ms = 1000; 49 | }; 50 | 51 | class HelloClient final { 52 | public: 53 | HelloClient(const std::string& addr, ChannelOptions options) { 54 | brpc::ChannelOptions chan_options; 55 | chan_options.protocol = options.protocol; 56 | chan_options.connection_type = options.connection_type; 57 | chan_options.timeout_ms = options.timeout_ms /*milliseconds*/; 58 | chan_options.max_retry = options.max_retry; 59 | if (master_channel_.Init( 60 | addr.c_str(), options.load_balancer.c_str(), &chan_options) != 0) { 61 | LOG(ERROR) << "Fail to initialize brpc channel to server " << addr; 62 | return; 63 | } 64 | master_stub_ = 65 | std::make_unique(&master_channel_); 66 | } 67 | 68 | void hello() { 69 | // Create a message to send to the server 70 | brpc::Controller cntl; 71 | proto::Empty request; 72 | proto::Status response; 73 | master_stub_->Hello(&cntl, &request, &response, nullptr); 74 | if (cntl.Failed()) { 75 | LOG(ERROR) << "Send to server faild, err msg:" << cntl.ErrorText(); 76 | return; 77 | } 78 | 79 | std::cout << "Get server response: " << response.ok() << "\n"; 80 | } 81 | 82 | private: 83 | brpc::Channel master_channel_; 84 | std::unique_ptr master_stub_; 85 | }; 86 | 87 | } // namespace test 88 | } // namespace xllm_service 89 | 90 | int main(int argc, char* argv[]) { 91 | // initialize glog and gflags 92 | google::InitGoogleLogging(argv[0]); 93 | 
gflags::ParseCommandLineFlags(&argc, &argv, true); 94 | 95 | // Define the server address and port 96 | std::string server_address(FLAGS_server_address); 97 | 98 | xllm_service::test::ChannelOptions opt; 99 | opt.protocol = FLAGS_protocol; 100 | opt.connection_type = FLAGS_connection_type; 101 | opt.load_balancer = FLAGS_load_balancer; 102 | opt.timeout_ms = FLAGS_timeout_ms; 103 | opt.max_retry = FLAGS_max_retry; 104 | opt.interval_ms = FLAGS_interval_ms; 105 | 106 | // Create a chat client 107 | xllm_service::test::HelloClient client(server_address, opt); 108 | 109 | client.hello(); 110 | 111 | return 0; 112 | } 113 | -------------------------------------------------------------------------------- /xllm_service/http_service/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_binary) 2 | include(cc_library) 3 | 4 | cc_library( 5 | NAME 6 | xllm_http_service 7 | HDRS 8 | service.h 9 | request_tracer.h 10 | SRCS 11 | service.cpp 12 | request_tracer.cpp 13 | DEPS 14 | :common 15 | :request 16 | :scheduler 17 | absl::random_random 18 | absl::synchronization 19 | glog::glog 20 | nlohmann_json::nlohmann_json 21 | proto::proto_http_service 22 | proto_xllm 23 | ) 24 | target_link_libraries(xllm_http_service PRIVATE brpc-static leveldb::leveldb ZLIB::ZLIB protobuf::libprotobuf OpenSSL::SSL OpenSSL::Crypto) 25 | 26 | cc_binary( 27 | NAME 28 | xllm_http_serving 29 | SRCS 30 | main.cpp 31 | DEPS 32 | :xllm_http_service 33 | gflags::gflags 34 | ) 35 | -------------------------------------------------------------------------------- /xllm_service/http_service/main.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "common/global_gflags.h" 22 | #include "common/options.h" 23 | #include "http_service/service.h" 24 | 25 | int main(int argc, char** argv) { 26 | // Initialize gflags 27 | gflags::ParseCommandLineFlags(&argc, &argv, true); 28 | 29 | // Initialize glog 30 | google::InitGoogleLogging(argv[0]); 31 | FLAGS_logtostderr = true; 32 | 33 | LOG(INFO) << "Starting xllm http service, port: " << FLAGS_port; 34 | 35 | xllm_service::Options service_options; 36 | xllm_service::XllmHttpServiceImpl service_impl(service_options, nullptr); 37 | 38 | // register http methods here 39 | brpc::Server server; 40 | if (server.AddService(&service_impl, 41 | brpc::SERVER_DOESNT_OWN_SERVICE, 42 | "/hello => Hello," 43 | "/v1/completions => Completions,") != 0) { 44 | LOG(ERROR) << "Fail to add brpc http service"; 45 | return false; 46 | } 47 | 48 | brpc::ServerOptions options; 49 | options.idle_timeout_sec = FLAGS_idle_timeout_s; 50 | options.num_threads = FLAGS_num_threads; 51 | options.max_concurrency = FLAGS_max_concurrency; 52 | if (server.Start(FLAGS_port, &options) != 0) { 53 | LOG(ERROR) << "Failed to start brpc http server on port " << FLAGS_port; 54 | return false; 55 | } 56 | 57 | LOG(INFO) << "Xllm http server started on port " << FLAGS_port 58 | << ", idle_timeout_sec: " << FLAGS_idle_timeout_s 59 | << ", num_threads: " << FLAGS_num_threads 60 | << ", max_concurrency: " << 
FLAGS_max_concurrency; 61 | 62 | // Wait until Ctrl-C is pressed, then Stop() and Join() the server. 63 | server.RunUntilAskedToQuit(); 64 | 65 | return 0; 66 | } 67 | -------------------------------------------------------------------------------- /xllm_service/http_service/request_tracer.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "http_service/request_tracer.h" 17 | 18 | #include 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | namespace xllm_service { 26 | 27 | static std::string get_current_timestamp() { 28 | auto now = std::chrono::system_clock::now(); 29 | auto in_time_t = std::chrono::system_clock::to_time_t(now); 30 | 31 | std::stringstream ss; 32 | ss << std::put_time(std::localtime(&in_time_t), "%Y-%m-%d %H:%M:%S"); 33 | return ss.str(); 34 | } 35 | 36 | RequestTracer::RequestTracer(bool enable_request_trace) 37 | : enable_request_trace_(enable_request_trace) { 38 | if (!enable_request_trace_) return; 39 | std::filesystem::create_directories("trace"); 40 | log_stream_.open("trace/trace.json", std::ios::app); 41 | if (!log_stream_.is_open()) { 42 | LOG(ERROR) << "Failed to open log file: trace/trace.json"; 43 | } 44 | } 45 | 46 | void RequestTracer::log(const std::string& service_request_id, 47 | const std::string& input_or_output) { 48 | if (!enable_request_trace_) return; 49 | 50 | std::lock_guard lock(mutex_); 51 | std::string timestamp = get_current_timestamp(); 52 | 53 | nlohmann::json j; 54 | j["timestamp"] = timestamp; 55 | j["service_request_id"] = service_request_id; 56 | j["data"] = input_or_output; 57 | 58 | log_stream_ << j.dump() << "\n"; 59 | } 60 | } // namespace xllm_service -------------------------------------------------------------------------------- /xllm_service/http_service/request_tracer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | #include 18 | #include 19 | #include 20 | 21 | namespace xllm_service { 22 | 23 | class RequestTracer { 24 | public: 25 | RequestTracer(bool enable_request_trace); 26 | ~RequestTracer() = default; 27 | RequestTracer(const RequestTracer&) = delete; 28 | RequestTracer& operator=(const RequestTracer&) = delete; 29 | RequestTracer(RequestTracer&&) = delete; 30 | RequestTracer& operator=(RequestTracer&&) = delete; 31 | void log(const std::string& service_request_id, 32 | const std::string& input_or_output); 33 | 34 | private: 35 | std::ofstream log_stream_; 36 | std::mutex mutex_; 37 | bool enable_request_trace_ = false; 38 | }; 39 | } // namespace xllm_service -------------------------------------------------------------------------------- /xllm_service/http_service/service.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | 20 | #include 21 | #include 22 | 23 | #include "chat.pb.h" 24 | #include "common/call_data.h" 25 | #include "common/options.h" 26 | #include "common/threadpool.h" 27 | #include "common/types.h" 28 | #include "completion.pb.h" 29 | #include "request/request.h" 30 | #include "request_tracer.h" 31 | #include "xllm_http_service.pb.h" 32 | 33 | namespace xllm_service { 34 | 35 | class Scheduler; 36 | class InstanceMgr; 37 | class ClosureGuard; 38 | 39 | class XllmHttpServiceImpl : public proto::XllmHttpService { 40 | public: 41 | XllmHttpServiceImpl(const Options& options, Scheduler* scheduler); 42 | ~XllmHttpServiceImpl(); 43 | 44 | void Hello(::google::protobuf::RpcController* controller, 45 | const proto::HttpHelloRequest* request, 46 | proto::HttpHelloResponse* response, 47 | ::google::protobuf::Closure* done) override; 48 | 49 | void Completions(::google::protobuf::RpcController* controller, 50 | const proto::HttpRequest* request, 51 | proto::HttpResponse* response, 52 | ::google::protobuf::Closure* done) override; 53 | 54 | void ChatCompletions(::google::protobuf::RpcController* controller, 55 | const proto::HttpRequest* request, 56 | proto::HttpResponse* response, 57 | ::google::protobuf::Closure* done) override; 58 | 59 | void Embeddings(::google::protobuf::RpcController* controller, 60 | const proto::HttpRequest* request, 61 | proto::HttpResponse* response, 62 | ::google::protobuf::Closure* done) override; 63 | 64 | void Models(::google::protobuf::RpcController* controller, 65 | const proto::HttpRequest* request, 66 | proto::HttpResponse* response, 67 | ::google::protobuf::Closure* done) override; 68 | 69 | void Metrics(::google::protobuf::RpcController* controller, 70 | const proto::HttpRequest* request, 71 | 
proto::HttpResponse* response, 72 | ::google::protobuf::Closure* done) override; 73 | 74 | private: 75 | template 76 | std::shared_ptr generate_request(T* req_pb, 77 | const std::string& method); 78 | 79 | template 80 | void handle(std::shared_ptr call_data, 81 | const std::string& req_attachment, 82 | std::shared_ptr request, 83 | const std::string& method); 84 | 85 | void get_serving(const std::string& serving_method, 86 | ::google::protobuf::RpcController* controller, 87 | const proto::HttpRequest* request, 88 | proto::HttpResponse* response, 89 | ::google::protobuf::Closure* done); 90 | 91 | private: 92 | Options options_; 93 | 94 | // not own 95 | Scheduler* scheduler_; 96 | 97 | bool initialized_ = false; 98 | 99 | std::unique_ptr request_tracer_; 100 | 101 | std::unique_ptr thread_pool_; 102 | 103 | // In disagg pd mode, we support receive generated token from 104 | // prefill or from decode directly. 105 | // 1. 106 | // [service] ---req---> [prefill] ---req---> [decode] 107 | // [service] <---first resp--- [prefill] ---first resp---> [decode] 108 | // [service] <---resp--- [prefill] <---resp--- [decode] 109 | // 110 | // 2. 111 | // [service] ---req---> [prefill] ---req---> [decode] 112 | // [service] <---first resp-- [prefill] --first resp---> [decode] 113 | // [service] <---resp-- [decode] 114 | // 115 | bool enable_decode_response_to_service_ = false; 116 | }; 117 | 118 | } // namespace xllm_service 119 | -------------------------------------------------------------------------------- /xllm_service/master.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | 20 | #include 21 | 22 | #include "common/options.h" 23 | #include "http_service/service.h" 24 | #include "rpc_service/service.h" 25 | #include "scheduler/scheduler.h" 26 | 27 | namespace xllm_service { 28 | 29 | class Master { 30 | public: 31 | explicit Master(const Options& options); 32 | ~Master(); 33 | 34 | bool start(); 35 | void stop(); 36 | 37 | private: 38 | bool start_http_server(); 39 | bool start_rpc_server(); 40 | 41 | private: 42 | Options options_; 43 | 44 | // Scheduler for scheduling requests and instances 45 | std::unique_ptr scheduler_; 46 | 47 | // 1.For http service 48 | std::string http_server_address_; 49 | std::unique_ptr http_service_; 50 | brpc::Server http_server_; 51 | std::unique_ptr http_server_thread_; 52 | 53 | // 2.For rpc service 54 | std::string rpc_server_address_; 55 | std::unique_ptr rpc_service_; 56 | brpc::Server rpc_server_; 57 | std::unique_ptr rpc_server_thread_; 58 | }; 59 | 60 | } // namespace xllm_service 61 | -------------------------------------------------------------------------------- /xllm_service/proto/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(grpc_proto_library) 2 | include(proto_library) 3 | 4 | proto_library( 5 | NAME 6 | proto_rpc_service 7 | SRCS 8 | xllm_rpc_service.proto 9 | ) 10 | 11 | proto_library( 12 | NAME 13 | proto_http_service 14 | SRCS 15 | 
    xllm_http_service.proto
)

proto_library(
  NAME
    proto_xllm
  SRCS
    xllm/chat.proto
    xllm/common.proto
    xllm/completion.proto
)
--------------------------------------------------------------------------------
/xllm_service/proto/xllm/common.proto:
--------------------------------------------------------------------------------
syntax = "proto3";

package xllm_service.llm.proto;

// Token accounting block shared by the OpenAI-compatible responses.
message Usage {
  // the number of tokens in the prompt.
  optional int32 prompt_tokens = 1 [json_name="prompt_tokens"];

  // the number of tokens in the generated completion.
  optional int32 completion_tokens = 2 [json_name="completion_tokens"];

  // the total number of tokens used in the request (prompt + completion).
  optional int32 total_tokens = 3 [json_name="total_tokens"];
}

// Options for streaming response.
message StreamOptions{
  // if set, an additional chunk with usage will be streamed before the data: [DONE] message.
  optional bool include_usage = 1;
}

// Generic boolean status reply.
message Status {
  bool ok = 1;
}

// Names of the prefill/decode instances chosen to serve a request.
message Routing {
  string prefill_name = 1;

  string decode_name = 2;
}
--------------------------------------------------------------------------------
/xllm_service/proto/xllm_http_service.proto:
--------------------------------------------------------------------------------
syntax = "proto3";

package xllm_service.proto;
// Arena allocation plus brpc-style (non-grpc) generated service stubs.
option cc_enable_arenas = true;
option cc_generic_services = true;

// Ping/pong payloads for the health-check endpoint.
message HttpHelloRequest {
  string ping = 1;
}

message HttpHelloResponse {
  string pong = 1;
}

// Empty shells: the HTTP bodies presumably travel via the brpc attachment
// (the service handlers read the controller, not these messages).
message HttpRequest {
}

message HttpResponse {
}

service XllmHttpService {
  rpc Hello(HttpHelloRequest) returns (HttpHelloResponse) {}
  rpc Completions (HttpRequest) returns (HttpResponse) {}
  rpc ChatCompletions (HttpRequest) returns (HttpResponse) {}
  rpc Embeddings (HttpRequest) returns (HttpResponse) {}
  rpc Models (HttpRequest) returns (HttpResponse) {}
  rpc Metrics (HttpRequest) returns (HttpResponse) {}
}
--------------------------------------------------------------------------------
/xllm_service/proto/xllm_rpc_service.proto:
--------------------------------------------------------------------------------
syntax = "proto3";

package xllm_service.proto;
option cc_generic_services = true;

message Empty {}

// Generic boolean status reply.
message Status {
  bool ok = 1;
}

// One Status per generation forwarded in a batched Generations call.
message StatusSet {
  repeated Status all_status = 1;
}

// Integer status; values map to the C++ ErrorCode enum (see ConvertErrorCode
// usage in rpc_service/client.cpp).
message StatusCode {
  int32 status_code = 1;
}

// Role of an instance in disaggregated prefill/decode serving.
enum InstanceType {
  DEFAULT = 0;
  PREFILL = 1;
  DECODE = 2;
}

// Per-layer KV-cache addresses of one worker.
message WorkerKVAddr {
  repeated uint64 layer_addrs = 1;
}

message InstanceMetaInfo {
  // http server address currently
  string name = 1;
  // rpc server address
  string rpc_address = 2;
  optional InstanceType type = 3;
  repeated uint64 cluster_ids = 4;
  // NOTE(review): field number 8 sits between 4 and 5 — presumably appended
  // later; valid proto, but confirm ordering against the instance side.
  repeated string addrs = 8;
  repeated int64 k_cache_ids = 5;
  repeated int64 v_cache_ids = 6;
  int32 dp_size = 7;
}

// KV-cache block changes reported by an instance since its last heartbeat:
// hashes of blocks stored, removed, or offloaded.
message KvCacheEvent {
  repeated bytes stored_cache = 1;
  repeated bytes removed_cache = 2;
  repeated bytes offload_cache = 3;
}

// Instantaneous load signals used for load balancing.
message LoadMetrics {
  uint64 waiting_requests_num = 1;
  float gpu_cache_usage_perc = 2;
}

// Recent latency maxima; ttft = time to first token, tbt presumably
// time between tokens — confirm units against the instance side.
message LatencyMetrics {
  int64 recent_max_ttft = 1;
  int64 recent_max_tbt = 2;
}

// Periodic liveness + metrics report sent by an instance to the master.
message HeartbeatRequest {
  string name = 1;
  KvCacheEvent cache_event = 2;
  LoadMetrics load_metrics = 3;
  LatencyMetrics latency_metrics = 4;
}

message InstanceID {
  string name = 1;
}

message InstanceIDs {
  repeated string names = 1;
}


message OutputUsage {
  // the number of tokens in the prompt.
  int32 num_prompt_tokens = 1;
  // the number of tokens in the generated completion.
  int32 num_generated_tokens = 2;
  // the total number of tokens used in the request (prompt + completion).
  int32 num_total_tokens = 3;
}

message LogProbData {
  // the text of the token
  string token = 1;
  // the token id
  int32 token_id = 2;
  // the log probability of the token
  float logprob = 3;
  // whether the token is finished
  bool finished_token = 4;
}

message LogProb {
  LogProbData log_prob_data = 1;
  repeated LogProbData top_logprobs = 2;
}

message SequenceOutput {
  // the index of the sequence in the request.
  int32 index = 1;
  // the generated/delta text.
  // delta text is the text generated since the last response for streaming.
  string text = 2;
  // the token ids of the generated text.
  repeated int32 token_ids = 3;
  // the reason the sequence finished.
  string finish_reason = 4;
  // log probabilities of the generated tokens.
  repeated LogProb logprobs = 5;
}

// Request-level generation status (code plus human-readable message).
message GenerationStatus {
  int32 status_code = 1;
  string status_msg = 2;
}

// Stream response token to prefill instance from decode.
message DisaggStreamGeneration {
  // req id
  string req_id = 1;
  // req id which is generated in xllm service.
  string service_req_id = 2;
  // the status of the request
  GenerationStatus gen_status = 3;
  // maybe multi sequences in the request
  repeated SequenceOutput outputs = 4;
  OutputUsage usage = 5;
  bool finished = 6;
}

// Batch wrapper so multiple generations can share one RPC.
message DisaggStreamGenerations {
  repeated DisaggStreamGeneration gens = 1;
}

// Service-side configuration handed to instances via GetConfig.
message ServiceConfig {
  bool enable_decode_response_to_service = 1;
}

service XllmRpcService {
  rpc Hello(Empty) returns (Status) {}
  rpc RegisterInstance(InstanceMetaInfo) returns (StatusCode) {}
  rpc GetInstanceInfo(InstanceID) returns (InstanceMetaInfo) {}
  rpc Heartbeat(HeartbeatRequest) returns (Status) {}
  rpc GetStaticDecodeList(InstanceID) returns (InstanceIDs) {}
  rpc GetConfig(Empty) returns (ServiceConfig) {}

  // xllm service receive response from decode instance directly in disagg pd mode.
  // This can eliminate the cost brought by forwarding through prefill.
  // Returns one Status per forwarded generation (see StatusSet).
  rpc Generations(DisaggStreamGenerations) returns (StatusSet) {}
}

--------------------------------------------------------------------------------
/xllm_service/request/CMakeLists.txt:
--------------------------------------------------------------------------------
include(cc_library)

cc_library(
  NAME
    request
  HDRS
    request.h
  DEPS
    :common
)
--------------------------------------------------------------------------------
/xllm_service/request/request.h:
--------------------------------------------------------------------------------
/* Copyright 2025 The xLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://github.com/jd-opensource/xllm-service/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "chat_template/jinja_chat_template.h" 19 | #include "common/types.h" 20 | #include "common/xllm/output.h" 21 | 22 | namespace xllm_service { 23 | 24 | // Store request-related data 25 | struct Request { 26 | // model name 27 | std::string model; 28 | 29 | // request id generated by service 30 | std::string service_request_id; 31 | 32 | // whether to stream the response 33 | bool stream = false; 34 | 35 | // whether to return usage 36 | bool include_usage = false; 37 | 38 | // input prompt 39 | std::string prompt; 40 | 41 | // input messages 42 | ChatMessages messages; 43 | 44 | // token ids of prompt 45 | std::vector token_ids; 46 | 47 | // instance routing 48 | Routing routing; 49 | 50 | // the estimated TTFT obtained from the TTFT predictor 51 | int64_t estimated_ttft = 0; 52 | 53 | // output callback 54 | OutputCallback output_callback; 55 | 56 | // trace callback 57 | std::function trace_callback = nullptr; 58 | }; 59 | 60 | } // namespace xllm_service -------------------------------------------------------------------------------- /xllm_service/rpc_service/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_binary) 2 | include(cc_library) 3 | include(cc_test) 4 | 5 | cc_library( 6 | NAME 7 | xllm_rpc_service 8 | HDRS 9 | service.h 10 | SRCS 11 | service.cpp 12 | DEPS 13 | :common 14 | :scheduler 15 | absl::random_random 16 | absl::strings 17 | glog::glog 18 | proto::proto_rpc_service 19 | proto_xllm 20 | tokenizer 21 | chat_template 22 | ) 23 | target_link_libraries(xllm_rpc_service PRIVATE brpc-static) 24 | 25 | cc_binary( 26 | NAME 27 | xllm_rpc_service_test 28 | SRCS 29 | rpc_service_test.cpp 30 | DEPS 31 | :xllm_rpc_service 32 | gflags::gflags 33 | glog::glog 34 | GTest::gtest_main 35 | ) 36 | add_test(NAME XllmRpcServiceTest COMMAND xllm_rpc_service_test) 37 | 38 | 
cc_binary( 39 | NAME 40 | xllm_rpc_serving 41 | SRCS 42 | main.cpp 43 | DEPS 44 | :xllm_rpc_service 45 | gflags::gflags 46 | ) 47 | 48 | cc_library( 49 | NAME 50 | xllm_rpc_client 51 | HDRS 52 | client.h 53 | SRCS 54 | client.cpp 55 | DEPS 56 | :common 57 | glog::glog 58 | proto::proto_rpc_service 59 | ) 60 | target_link_libraries(xllm_rpc_client PRIVATE brpc-static) 61 | -------------------------------------------------------------------------------- /xllm_service/rpc_service/client.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "rpc_service/client.h" 17 | 18 | #include 19 | 20 | namespace xllm_service { 21 | 22 | // magic number, TODO: move to config file or env var 23 | static constexpr int kHeartbeatInterval = 3; // in seconds 24 | 25 | XllmRpcClient::XllmRpcClient(const std::string& instace_name, 26 | const std::string& master_addr, 27 | const ChannelOptions& options) 28 | : instance_name_(instace_name), master_addr_(master_addr) { 29 | brpc::ChannelOptions chan_options; 30 | chan_options.protocol = options.protocol; 31 | chan_options.connection_type = options.connection_type; 32 | chan_options.timeout_ms = options.timeout_ms /*milliseconds*/; 33 | chan_options.max_retry = options.max_retry; 34 | if (master_channel_.Init(master_addr_.c_str(), 35 | options.load_balancer.c_str(), 36 | &chan_options) != 0) { 37 | LOG(ERROR) << "Fail to initialize brpc channel to server " << master_addr_; 38 | return; 39 | } 40 | master_stub_ = std::make_unique(&master_channel_); 41 | 42 | // heartbeat thread 43 | heartbeat_thread_ = 44 | std::make_unique(&XllmRpcClient::heartbeat, this); 45 | } 46 | 47 | XllmRpcClient::~XllmRpcClient() { 48 | exited_ = true; 49 | if (heartbeat_thread_) { 50 | heartbeat_thread_->join(); 51 | } 52 | } 53 | 54 | // TODO: send metainfo/metrics to master ? 
55 | void XllmRpcClient::heartbeat() { 56 | while (!exited_) { 57 | std::this_thread::sleep_for(std::chrono::seconds(kHeartbeatInterval)); 58 | if (!register_inst_done_) continue; 59 | 60 | brpc::Controller cntl; 61 | proto::HeartbeatRequest req; 62 | req.set_name(instance_name_); 63 | // TODO: set req.cache_event and req.load_metrics 64 | proto::Status res; 65 | master_stub_->Heartbeat(&cntl, &req, &res, nullptr); 66 | if (cntl.Failed()) { 67 | LOG(ERROR) << instance_name_ 68 | << " failed to send heartbeat to master: " << cntl.ErrorText(); 69 | ; 70 | } else if (!res.ok()) { 71 | LOG(ERROR) << instance_name_ 72 | << " failed to send heartbeat to master, status: " << res.ok(); 73 | } 74 | } 75 | } 76 | 77 | ErrorCode XllmRpcClient::register_instance() { 78 | InstanceMetaInfo metainfo; 79 | metainfo.name = instance_name_; 80 | return register_instance(metainfo); 81 | } 82 | 83 | ErrorCode XllmRpcClient::register_instance(const InstanceMetaInfo& metainfo) { 84 | brpc::Controller cntl; 85 | proto::InstanceMetaInfo req; 86 | req.set_name(metainfo.name); 87 | if (metainfo.type == InstanceType::PREFILL) { 88 | req.set_type(proto::InstanceType::PREFILL); 89 | } else if (metainfo.type == InstanceType::DECODE) { 90 | req.set_type(proto::InstanceType::DECODE); 91 | } else { 92 | req.set_type(proto::InstanceType::DEFAULT); 93 | } 94 | proto::StatusCode res; 95 | master_stub_->RegisterInstance(&cntl, &req, &res, nullptr); 96 | if (cntl.Failed()) { 97 | LOG(ERROR) << instance_name_ 98 | << " failed to send register_instance to master: " 99 | << cntl.ErrorText(); 100 | ; 101 | } else if (res.status_code() != ConvertErrorCode::to_int(ErrorCode::OK)) { 102 | LOG(ERROR) << instance_name_ 103 | << " failed to send register_instance to master: " 104 | << "res = " << res.status_code(); 105 | } else { 106 | // register instance success 107 | register_inst_done_ = true; 108 | } 109 | return ConvertErrorCode::from_int(res.status_code()); 110 | } 111 | 112 | } // namespace xllm_service 
113 | -------------------------------------------------------------------------------- /xllm_service/rpc_service/client.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | #include 22 | #include 23 | 24 | #include "common/types.h" 25 | #include "xllm_rpc_service.pb.h" 26 | 27 | namespace xllm_service { 28 | 29 | struct ChannelOptions { 30 | std::string protocol = "baidu_std"; 31 | std::string connection_type = ""; 32 | std::string load_balancer = ""; 33 | int timeout_ms = 100; 34 | int max_retry = 3; 35 | int interval_ms = 1000; 36 | }; 37 | 38 | class XllmRpcClient { 39 | public: 40 | XllmRpcClient(const std::string& instace_name, 41 | const std::string& master_addr, 42 | const ChannelOptions& options); 43 | ~XllmRpcClient(); 44 | 45 | ErrorCode register_instance(); 46 | ErrorCode register_instance(const InstanceMetaInfo& metainfo); 47 | 48 | private: 49 | void heartbeat(); 50 | 51 | private: 52 | bool exited_ = false; 53 | bool register_inst_done_ = false; 54 | // instance rdma address or other info: ip port 55 | std::string instance_name_; 56 | std::string master_addr_; 57 | brpc::Channel master_channel_; 58 | std::unique_ptr master_stub_; 59 
| std::unique_ptr heartbeat_thread_; 60 | }; 61 | 62 | } // namespace xllm_service 63 | -------------------------------------------------------------------------------- /xllm_service/rpc_service/main.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | #include "common/global_gflags.h" 21 | #include "common/types.h" 22 | #include "common/utils.h" 23 | #include "rpc_service/service.h" 24 | 25 | int main(int argc, char* argv[]) { 26 | // Initialize gflags 27 | gflags::ParseCommandLineFlags(&argc, &argv, true); 28 | 29 | // Initialize glog 30 | google::InitGoogleLogging(argv[0]); 31 | 32 | LOG(INFO) << "Dump all gflags: " << std::endl 33 | << google::CommandlineFlagsIntoString(); 34 | google::FlushLogFiles(google::INFO); 35 | 36 | LOG(INFO) << "Starting xllm rpc service, port: " << FLAGS_port; 37 | 38 | if (!xllm_service::utils::is_port_available(FLAGS_port)) { 39 | LOG(ERROR) << "Port " << FLAGS_port << " is already in use. 
" 40 | << "Please specify a different port using --port flag."; 41 | return 1; 42 | } 43 | 44 | // create xllm service 45 | xllm_service::Options service_options; 46 | xllm_service::XllmRpcService service(service_options, nullptr); 47 | 48 | // Initialize brpc server 49 | std::string server_address = "0.0.0.0:" + std::to_string(FLAGS_port); 50 | brpc::Server server; 51 | if (server.AddService(&service, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) { 52 | LOG(ERROR) << "Failed to add service to server"; 53 | return -1; 54 | } 55 | 56 | butil::EndPoint endpoint; 57 | if (!FLAGS_listen_addr.empty()) { 58 | if (butil::str2endpoint(FLAGS_listen_addr.c_str(), &endpoint) < 0) { 59 | LOG(ERROR) << "Invalid listen address:" << FLAGS_listen_addr; 60 | return -1; 61 | } 62 | } else { 63 | endpoint = butil::EndPoint(butil::IP_ANY, FLAGS_port); 64 | } 65 | 66 | // Start the server. 67 | brpc::ServerOptions options; 68 | options.idle_timeout_sec = FLAGS_idle_timeout_s; 69 | options.num_threads = FLAGS_num_threads; 70 | options.max_concurrency = FLAGS_max_concurrency; 71 | options.idle_timeout_sec = FLAGS_idle_timeout_s; 72 | if (server.Start(endpoint, &options) != 0) { 73 | LOG(ERROR) << "Fail to start Brpc rpc server"; 74 | return -1; 75 | } 76 | 77 | LOG(INFO) << "Xllm rpc service listening on " << server_address; 78 | 79 | // Wait until Ctrl-C is pressed, then Stop() and Join() the server. 80 | server.RunUntilAskedToQuit(); 81 | 82 | return 0; 83 | } 84 | -------------------------------------------------------------------------------- /xllm_service/rpc_service/rpc_service_test.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include 17 | #include 18 | 19 | #include "rpc_service/service.h" 20 | 21 | namespace xllm_service::test { 22 | 23 | class XllmRpcServiceTest : public ::testing::Test { 24 | protected: 25 | void SetUp() override { google::InitGoogleLogging("XllmRpcServiceTest"); } 26 | 27 | void TearDown() override { google::ShutdownGoogleLogging(); } 28 | }; 29 | // TODO 30 | // TEST_F(XllmRpcServiceTest, RegisterInstance) { 31 | // RpcServiceConfig config; 32 | // HttpServiceConfig http_config; 33 | // ModelConfig model_config; 34 | // auto xllm_service = 35 | // std::make_shared(config, model_config, 36 | // http_config); 37 | // std::string inst_name = "127.0.0.1@nic0"; 38 | // InstanceMetaInfo metainfo(inst_name, "127.0.0.1:7777", 39 | // InstanceType::PREFILL); EXPECT_EQ(ErrorCode::OK, 40 | // xllm_service->register_instance(inst_name, metainfo)); 41 | 42 | // metainfo.type = InstanceType::DECODE; 43 | // EXPECT_EQ(ErrorCode::INSTANCE_EXISTED, 44 | // xllm_service->register_instance(inst_name, metainfo)); 45 | // } 46 | 47 | // TEST_F(XllmRpcServiceTest, UpdateInstanceMetainfo) { 48 | // RpcServiceConfig config; 49 | // HttpServiceConfig http_config; 50 | // ModelConfig model_config; 51 | // auto xllm_service = 52 | // std::make_shared(config, model_config, 53 | // http_config); 54 | // std::string inst_name = "127.0.0.1@nic0"; 55 | // InstanceMetaInfo metainfo(inst_name, "127.0.0.1:7777", 56 | // InstanceType::PREFILL); 
EXPECT_EQ(ErrorCode::OK, 57 | // xllm_service->register_instance(inst_name, metainfo)); 58 | // metainfo.type = InstanceType::DECODE; 59 | // EXPECT_EQ(ErrorCode::OK, 60 | // xllm_service->update_instance_metainfo(inst_name, metainfo)); 61 | 62 | // std::string inst_name2 = "127.0.0.1@nic2"; 63 | // InstanceMetaInfo metainfo2( 64 | // inst_name2, "127.0.0.1:7778", InstanceType::PREFILL); 65 | // EXPECT_EQ(ErrorCode::INSTANCE_NOT_EXISTED, 66 | // xllm_service->update_instance_metainfo(inst_name2, metainfo)); 67 | // } 68 | 69 | } // namespace xllm_service::test 70 | -------------------------------------------------------------------------------- /xllm_service/rpc_service/service.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | #include "chat.pb.h" 22 | #include "common/options.h" 23 | #include "common/types.h" 24 | #include "common/xllm/output.h" 25 | #include "common/xllm/status.h" 26 | #include "completion.pb.h" 27 | #include "xllm_rpc_service.pb.h" 28 | 29 | namespace xllm_service { 30 | 31 | struct ServiceConfig { 32 | ServiceConfig(bool decode_to_service) 33 | : enable_decode_response_to_service(decode_to_service) {} 34 | 35 | bool enable_decode_response_to_service = false; 36 | }; 37 | 38 | class Scheduler; 39 | class InstanceMgr; 40 | 41 | class XllmRpcServiceImpl final { 42 | public: 43 | XllmRpcServiceImpl(const Options& options, Scheduler* scheduler); 44 | ~XllmRpcServiceImpl(); 45 | 46 | void heartbeat(const proto::HeartbeatRequest* req); 47 | 48 | InstanceMetaInfo get_instance_info(const std::string& instance_name); 49 | 50 | ServiceConfig get_config(); 51 | 52 | std::vector get_static_decode_list( 53 | const std::string& prefill_name); 54 | 55 | public: 56 | // handle generations from prefill/decode instance 57 | bool handle_generation(const llm::RequestOutput& request_output); 58 | 59 | private: 60 | Options options_; 61 | 62 | // not own 63 | Scheduler* scheduler_; 64 | 65 | // In disagg pd mode, we support receive generated token from 66 | // prefill or from decode directly. 67 | // 1. 68 | // [service] ---req---> [prefill] ---req---> [decode] 69 | // [service] <---first resp--- [prefill] ---first resp---> [decode] 70 | // [service] <---resp--- [prefill] <---resp--- [decode] 71 | // 72 | // 2. 
73 | // [service] ---req---> [prefill] ---req---> [decode] 74 | // [service] <---first resp-- [prefill] --first resp---> [decode] 75 | // [service] <---resp-- [decode] 76 | // 77 | bool enable_decode_response_to_service_ = false; 78 | }; 79 | 80 | // parse proto data and call XllmRpcService 81 | class XllmRpcService : public proto::XllmRpcService { 82 | public: 83 | explicit XllmRpcService(const Options& options, Scheduler* scheduler); 84 | virtual ~XllmRpcService(); 85 | 86 | virtual void Hello(google::protobuf::RpcController* cntl_base, 87 | const proto::Empty* req, 88 | proto::Status* resp, 89 | google::protobuf::Closure* done) override; 90 | 91 | virtual void Heartbeat(google::protobuf::RpcController* cntl_base, 92 | const proto::HeartbeatRequest* req, 93 | proto::Status* resp, 94 | google::protobuf::Closure* done) override; 95 | 96 | virtual void GetInstanceInfo(google::protobuf::RpcController* cntl_base, 97 | const proto::InstanceID* req, 98 | proto::InstanceMetaInfo* resp, 99 | google::protobuf::Closure* done) override; 100 | 101 | virtual void GetStaticDecodeList(google::protobuf::RpcController* cntl_base, 102 | const proto::InstanceID* req, 103 | proto::InstanceIDs* resp, 104 | google::protobuf::Closure* done) override; 105 | 106 | // xllm service receive response from decode instance directly in disagg pd 107 | // mode. This can eliminate the cost brought by forwarding through prefill. 
108 | virtual void Generations(google::protobuf::RpcController* cntl_base, 109 | const proto::DisaggStreamGenerations* req, 110 | proto::StatusSet* resp, 111 | google::protobuf::Closure* done) override; 112 | 113 | virtual void GetConfig(google::protobuf::RpcController* cntl_base, 114 | const proto::Empty* req, 115 | proto::ServiceConfig* resp, 116 | google::protobuf::Closure* done) override; 117 | 118 | private: 119 | std::unique_ptr xllm_rpc_service_impl_; 120 | }; 121 | 122 | } // namespace xllm_service 123 | -------------------------------------------------------------------------------- /xllm_service/scheduler/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | add_subdirectory(etcd_client) 4 | add_subdirectory(managers) 5 | add_subdirectory(loadbalance_policy) 6 | 7 | cc_library( 8 | NAME 9 | scheduler 10 | HDRS 11 | response_handler.h 12 | scheduler.h 13 | SRCS 14 | response_handler.cpp 15 | scheduler.cpp 16 | DEPS 17 | :chat_template 18 | :common 19 | :etcd_client 20 | :loadbalance_policy 21 | :managers 22 | :request 23 | cpprest 24 | etcd-cpp-api 25 | glog::glog 26 | nlohmann_json::nlohmann_json 27 | ) -------------------------------------------------------------------------------- /xllm_service/scheduler/etcd_client/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | cc_library( 5 | NAME 6 | etcd_client 7 | HDRS 8 | etcd_client.h 9 | SRCS 10 | etcd_client.cpp 11 | DEPS 12 | :common 13 | cpprest 14 | etcd-cpp-api 15 | glog::glog 16 | nlohmann_json::nlohmann_json 17 | ) -------------------------------------------------------------------------------- /xllm_service/scheduler/loadbalance_policy/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_binary) 2 | include(cc_library) 3 | include(cc_test) 4 | 5 | cc_library( 6 | NAME 7 | 
loadbalance_policy 8 | HDRS 9 | loadbalance_policy.h 10 | round_robin.h 11 | cache_aware_routing.h 12 | SRCS 13 | round_robin.cpp 14 | cache_aware_routing.cpp 15 | DEPS 16 | :chat_template 17 | :common 18 | :managers 19 | ) 20 | -------------------------------------------------------------------------------- /xllm_service/scheduler/loadbalance_policy/cache_aware_routing.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "cache_aware_routing.h" 17 | 18 | namespace xllm_service { 19 | 20 | constexpr float MIN_SCORE = -2.0; 21 | 22 | bool CacheAwareRouting::select_instances_pair( 23 |     std::shared_ptr request) { 24 |   LoadBalanceInfos lb_infos; 25 |   if (!request->token_ids.empty()) { 26 |     Slice token_ids(request->token_ids.data(), 27 |                              request->token_ids.size()); 28 |     global_kvcache_mgr_->match(token_ids, &lb_infos.overlap_scores); 29 |     DLOG(INFO) << lb_infos.debug_string(); 30 |   } 31 | 32 |   instance_mgr_->get_load_metrics(&lb_infos); 33 |   DLOG(INFO) << lb_infos.debug_string(); 34 | 35 |   if (lb_infos.prefill_load_metrics.size() == 0) { 36 |     LOG(INFO) << "No node available!"; 37 |     return false; 38 |   } 39 | 40 |   // find prefill 41 |   cost_function(lb_infos.overlap_scores.hbm_instance_score, 42 |                 lb_infos.overlap_scores.max_block_num, 43 |                 lb_infos.prefill_load_metrics, 44 |                 lb_infos.prefill_max_waiting_requests_num, 45 |                 &request->routing.prefill_name); 46 | 47 |   // find decode 48 |   if (lb_infos.decode_load_metrics.size()) { 49 |     cost_function(lb_infos.overlap_scores.hbm_instance_score, 50 |                   lb_infos.overlap_scores.max_block_num, 51 |                   lb_infos.decode_load_metrics, 52 |                   lb_infos.decode_max_waiting_requests_num, 53 |                   &request->routing.decode_name); 54 |   } 55 | 56 |   return true; 57 | } 58 | 59 | void CacheAwareRouting::cost_function( 60 |     const std::unordered_map& overlap_scores, 61 |     const uint32_t& max_block_num, 62 |     const std::unordered_map& load_metrics, 63 |     const int64_t& max_waiting_requests_num, 64 |     std::string* best_choice) { 65 |   float best_score = MIN_SCORE; 66 |   for (const auto& it : load_metrics) { 67 |     const auto matched_blocks_it = overlap_scores.find(it.first); 68 |     uint32_t matched_blocks = 0; 69 |     if (matched_blocks_it != overlap_scores.end()) { 70 |       matched_blocks = matched_blocks_it->second; 71 |     } 72 | 73 |     auto score =  // float ratios: integer division would truncate every partial match to 0 74 |         (max_block_num == 0 ?
 0.0f : static_cast<float>(matched_blocks) / max_block_num) - 75 |         it.second.gpu_cache_usage_perc - 76 |         (max_waiting_requests_num == 0 77 |              ? 0.0f 78 |              : static_cast<float>(it.second.waiting_requests_num) / max_waiting_requests_num); 79 | 80 |     if (score > best_score) { 81 |       best_score = score; 82 |       *best_choice = it.first; 83 |     } 84 |   } 85 | } 86 | 87 | }  // namespace xllm_service 88 | -------------------------------------------------------------------------------- /xllm_service/scheduler/loadbalance_policy/cache_aware_routing.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 |     https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "common/macros.h" 19 | #include "loadbalance_policy.h" 20 | #include "scheduler/managers/global_kvcache_mgr.h" 21 | 22 | namespace xllm_service { 23 | 24 | class CacheAwareRouting final : public LoadBalancePolicy { 25 | public: 26 | CacheAwareRouting(std::shared_ptr instance_mgr, 27 | std::shared_ptr global_kvcache_mgr) 28 | : global_kvcache_mgr_(global_kvcache_mgr), 29 | LoadBalancePolicy(instance_mgr) {}; 30 | 31 | virtual ~CacheAwareRouting() = default; 32 | 33 | bool select_instances_pair(std::shared_ptr request) override; 34 | 35 | private: 36 | DISALLOW_COPY_AND_ASSIGN(CacheAwareRouting); 37 | 38 | void cost_function( 39 | const std::unordered_map& overlap_scores, 40 | const uint32_t& max_block_num, 41 | const std::unordered_map& load_metrics, 42 | const int64_t& max_waiting_requests_num, 43 | std::string* best_choice); 44 | 45 | std::shared_ptr global_kvcache_mgr_; 46 | }; 47 | 48 | } // namespace xllm_service 49 | -------------------------------------------------------------------------------- /xllm_service/scheduler/loadbalance_policy/loadbalance_policy.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "common/types.h" 19 | #include "request/request.h" 20 | #include "scheduler/managers/instance_mgr.h" 21 | 22 | namespace xllm_service { 23 | 24 | class LoadBalancePolicy { 25 | public: 26 | LoadBalancePolicy(std::shared_ptr instance_mgr) 27 | : instance_mgr_(instance_mgr) {} 28 | 29 | virtual ~LoadBalancePolicy() = default; 30 | 31 | virtual bool select_instances_pair(std::shared_ptr request) = 0; 32 | 33 | protected: 34 | std::shared_ptr instance_mgr_; 35 | }; 36 | 37 | } // namespace xllm_service 38 | -------------------------------------------------------------------------------- /xllm_service/scheduler/loadbalance_policy/round_robin.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "round_robin.h" 17 | 18 | namespace xllm_service { 19 | 20 | bool RoundRobin::select_instances_pair(std::shared_ptr request) { 21 | return instance_mgr_->get_next_instance_pair(&request->routing); 22 | } 23 | 24 | } // namespace xllm_service 25 | -------------------------------------------------------------------------------- /xllm_service/scheduler/loadbalance_policy/round_robin.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "common/macros.h" 19 | #include "loadbalance_policy.h" 20 | 21 | namespace xllm_service { 22 | 23 | class RoundRobin final : public LoadBalancePolicy { 24 | public: 25 | RoundRobin(std::shared_ptr instance_mgr) 26 | : LoadBalancePolicy(instance_mgr) {}; 27 | 28 | virtual ~RoundRobin() = default; 29 | 30 | bool select_instances_pair(std::shared_ptr request) override; 31 | 32 | private: 33 | DISALLOW_COPY_AND_ASSIGN(RoundRobin); 34 | }; 35 | 36 | } // namespace xllm_service 37 | -------------------------------------------------------------------------------- /xllm_service/scheduler/managers/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | cc_library( 5 | NAME 6 | managers 7 | HDRS 8 | instance_mgr.h 9 | global_kvcache_mgr.h 10 | SRCS 11 | instance_mgr.cpp 12 | global_kvcache_mgr.cpp 13 | DEPS 14 | :chat_template 15 | :common 16 | :etcd_client 17 | :request 18 | absl::random_random 19 | absl::strings 20 | glog::glog 21 | proto::proto_rpc_service 22 | proto_xllm 23 | ) 24 | target_link_libraries(managers PRIVATE brpc-static) 25 | -------------------------------------------------------------------------------- /xllm_service/scheduler/managers/global_kvcache_mgr.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | #include "../etcd_client/etcd_client.h" 22 | #include "common/hash_util.h" 23 | #include "common/macros.h" 24 | #include "common/options.h" 25 | #include "common/slice.h" 26 | #include "common/threadpool.h" 27 | #include "common/types.h" 28 | #include "xllm_rpc_service.pb.h" 29 | 30 | namespace xllm_service { 31 | 32 | class GlobalKVCacheMgr final { 33 | public: 34 | explicit GlobalKVCacheMgr(const Options& options, 35 | const std::shared_ptr& etcd_client, 36 | const bool is_master_service); 37 | ~GlobalKVCacheMgr(); 38 | 39 | void match(const Slice& token_ids, OverlapScores* overlap_scores); 40 | 41 | void record_updated_kvcaches(const std::string& instance_name, 42 | const proto::KvCacheEvent& kvcache_event); 43 | bool upload_kvcache(); 44 | 45 | void set_as_master(); 46 | 47 | private: 48 | DISALLOW_COPY_AND_ASSIGN(GlobalKVCacheMgr); 49 | 50 | void update_kvcache(const etcd::Response& response, 51 | const uint64_t prefix_len); 52 | 53 | private: 54 | Options options_; 55 | std::atomic_bool is_master_service_ = false; 56 | bool exited_ = false; 57 | std::shared_mutex kvcache_mutex_; 58 | Murmur3KeyCacheMap kvcache_infos_; 59 | std::shared_ptr etcd_client_; // not own 60 | 61 | std::mutex update_mutex_; 62 | Murmur3KeyCacheMap updated_kvcaches_; 63 | 64 | ThreadPool threadpool_; 65 | }; 66 | 67 | } // namespace xllm_service 68 | 
-------------------------------------------------------------------------------- /xllm_service/scheduler/managers/instance_mgr.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #include "common/macros.h" 26 | #include "common/options.h" 27 | #include "common/threadpool.h" 28 | #include "common/ttft_predictor.h" 29 | #include "common/types.h" 30 | #include "request/request.h" 31 | #include "scheduler/etcd_client/etcd_client.h" 32 | #include "xllm_rpc_service.pb.h" 33 | 34 | namespace xllm_service { 35 | 36 | class InstanceMgr final { 37 | public: 38 | explicit InstanceMgr(const Options& options, 39 | const std::shared_ptr& etcd_client, 40 | const bool is_master_service); 41 | 42 | ~InstanceMgr(); 43 | 44 | InstanceMetaInfo get_instance_info(const std::string& instance_name); 45 | 46 | bool get_next_instance_pair(Routing* routing); 47 | 48 | std::vector get_static_decode_list( 49 | const std::string& instance_name); 50 | 51 | void get_load_metrics(LoadBalanceInfos* infos); 52 | 53 | std::shared_ptr get_channel(const std::string& instance_name); 54 | 55 | void record_load_metrics_update(const 
std::string& instance_name, 56 | const proto::LoadMetrics& load_metrics); 57 | bool upload_load_metrics(); 58 | 59 | // update the recent token latency metrics for the corresponding instance 60 | void update_latency_metrics(const std::string& instance_name, 61 | const proto::LatencyMetrics& latency_metrics); 62 | 63 | // update request metrics under different actions 64 | void update_request_metrics(std::shared_ptr request, 65 | RequestAction action); 66 | 67 | void set_as_master(); 68 | 69 | private: 70 | DISALLOW_COPY_AND_ASSIGN(InstanceMgr); 71 | 72 | void init(); 73 | 74 | bool create_channel(const std::string& target_uri); 75 | // use etcd as ServiceDiscovery 76 | void update_instance_metainfo(const etcd::Response& response, 77 | const uint64_t& prefix_len); 78 | 79 | void update_load_metrics(const etcd::Response& response, 80 | const uint64_t& prefix_len); 81 | 82 | private: 83 | Options options_; 84 | 85 | bool exited_ = false; 86 | bool use_etcd_ = false; 87 | std::atomic_bool is_master_service_ = false; 88 | 89 | std::shared_ptr etcd_client_; 90 | 91 | std::shared_mutex inst_mutex_; 92 | std::unordered_map instances_; 93 | std::vector prefill_index_; 94 | std::vector decode_index_; 95 | uint64_t next_prefill_index_ = 0; 96 | uint64_t next_decode_index_ = 0; 97 | 98 | std::shared_mutex load_metric_mutex_; 99 | std::unordered_map load_metrics_; 100 | std::unordered_map> 101 | cached_channels_; 102 | 103 | std::mutex update_mutex_; 104 | std::unordered_map updated_metrics_; 105 | std::unordered_set removed_instance_; 106 | 107 | // "instance name" -> "TtftPredictor" map 108 | std::mutex ttft_predictor_mutex_; 109 | std::unordered_map ttft_predictors_; 110 | 111 | // Record the latest token latency metrics for each instance, including TTFT 112 | // and TBT. 
113 | std::mutex latency_metrics_mutex_; 114 | std::unordered_map latency_metrics_; 115 | 116 | // Record the request metrics for each instance, including prefill token 117 | // count, prefill request count, estimated prefill execution time, decode 118 | // token count, and decode request count. 119 | std::mutex request_metrics_mutex_; 120 | std::unordered_map request_metrics_; 121 | 122 | ThreadPool threadpool_; 123 | }; 124 | 125 | } // namespace xllm_service 126 | -------------------------------------------------------------------------------- /xllm_service/scheduler/response_handler.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | #include "common/call_data.h" 22 | #include "common/threadpool.h" 23 | #include "common/xllm/output.h" 24 | #include "common/xllm/status.h" 25 | 26 | namespace xllm_service { 27 | 28 | class ResponseHandler final { 29 | public: 30 | ResponseHandler() = default; 31 | ~ResponseHandler() = default; 32 | 33 | bool send_delta_to_client(std::shared_ptr call_data, 34 | std::unordered_set* first_message_sent, 35 | bool include_usage, 36 | const std::string& request_id, 37 | int64_t created_time, 38 | const std::string& model, 39 | const llm::RequestOutput& output); 40 | bool send_result_to_client(std::shared_ptr call_data, 41 | const std::string& request_id, 42 | int64_t created_time, 43 | const std::string& model, 44 | const llm::RequestOutput& req_output); 45 | 46 | bool send_delta_to_client(std::shared_ptr call_data, 47 | bool include_usage, 48 | const std::string& request_id, 49 | int64_t created_time, 50 | const std::string& model, 51 | const llm::RequestOutput& output); 52 | bool send_result_to_client(std::shared_ptr call_data, 53 | const std::string& request_id, 54 | int64_t created_time, 55 | const std::string& model, 56 | const llm::RequestOutput& req_output); 57 | }; 58 | 59 | } // namespace xllm_service 60 | -------------------------------------------------------------------------------- /xllm_service/scheduler/scheduler.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "chat_template/jinja_chat_template.h" 19 | #include "common/call_data.h" 20 | #include "common/options.h" 21 | #include "common/threadpool.h" 22 | #include "common/xllm/output.h" 23 | #include "etcd_client/etcd_client.h" 24 | #include "loadbalance_policy/loadbalance_policy.h" 25 | #include "managers/global_kvcache_mgr.h" 26 | #include "managers/instance_mgr.h" 27 | #include "request/request.h" 28 | #include "response_handler.h" 29 | #include "tokenizer/tokenizer.h" 30 | #include "tokenizer/tokenizer_args.h" 31 | 32 | namespace xllm_service { 33 | 34 | // A scheduler for scheduling requests and instances 35 | class Scheduler final { 36 | public: 37 | Scheduler(const Options& options); 38 | ~Scheduler(); 39 | 40 | bool schedule(std::shared_ptr request); 41 | 42 | std::shared_ptr get_channel(const std::string& target_name); 43 | 44 | InstanceMetaInfo get_instance_info(const std::string& instance_name); 45 | 46 | std::vector get_static_decode_list( 47 | const std::string& instance_name); 48 | 49 | void handle_instance_heartbeat(const proto::HeartbeatRequest* req); 50 | 51 | void exited() { exited_ = true; } 52 | 53 | // register new requests from http service 54 | // keep http callback util request finished. 55 | // `handle_generation` will handle response with these callbacks. 
56 | bool record_new_request(std::shared_ptr call_data, 57 | std::shared_ptr request); 58 | bool record_new_request(std::shared_ptr call_data, 59 | std::shared_ptr request); 60 | void finish_request(const std::string& service_request_id, 61 | bool error = false); 62 | 63 | // handle generations from prefill/decode instance 64 | bool handle_generation(const llm::RequestOutput& request_output); 65 | 66 | // update request metrics for prefill finished request 67 | void update_request_metrics_for_prefill( 68 | const std::string& service_request_id); 69 | 70 | private: 71 | DISALLOW_COPY_AND_ASSIGN(Scheduler); 72 | 73 | void update_master_service_heartbeat(); 74 | 75 | void handle_master_service_watch(const etcd::Response& response, 76 | const uint64_t& prefix_len); 77 | 78 | Tokenizer* get_tls_tokenizer(); 79 | 80 | private: 81 | Options options_; 82 | 83 | bool exited_ = false; 84 | 85 | bool is_master_service_ = false; 86 | 87 | TokenizerArgs tokenizer_args_; 88 | 89 | // chat template instance 90 | std::unique_ptr chat_template_; 91 | 92 | std::shared_ptr etcd_client_; 93 | 94 | std::unique_ptr tokenizer_; 95 | 96 | std::shared_ptr instance_mgr_; 97 | 98 | std::shared_ptr global_kvcache_mgr_; 99 | 100 | std::unique_ptr lb_policy_; 101 | 102 | std::unique_ptr heartbeat_thread_; 103 | 104 | // `service request id` -> `request` map 105 | std::unordered_map> requests_; 106 | std::mutex request_mutex_; 107 | 108 | // use threadpool to handle all RequestOuputs queue 109 | static constexpr size_t kOutputTheadNum_ = 128; // magic num 110 | ThreadPool output_threadpools_[kOutputTheadNum_]; 111 | // A request will be handled in the same thread to guarantee the token's 112 | // order. 113 | std::unordered_map remote_requests_output_thread_map_; 114 | size_t next_thread_idx = 0; 115 | std::mutex thread_map_mutex_; 116 | 117 | // used when receive token from decode instance. 
118 | ResponseHandler response_handler_; 119 | }; 120 | 121 | } // namespace xllm_service -------------------------------------------------------------------------------- /xllm_service/tokenizer/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | add_subdirectory(tokenizers) 5 | 6 | cc_library( 7 | NAME 8 | tokenizer 9 | HDRS 10 | tokenizer_args.h 11 | tokenizer.h 12 | tokenizer_factory.h 13 | tiktoken_tokenizer.h 14 | sentencepiece_tokenizer.h 15 | fast_tokenizer.h 16 | SRCS 17 | tokenizer_args.cpp 18 | tokenizer_factory.cpp 19 | tiktoken_tokenizer.cpp 20 | sentencepiece_tokenizer.cpp 21 | fast_tokenizer.cpp 22 | DEPS 23 | :common 24 | :sentencepiece 25 | absl::flat_hash_map 26 | absl::strings 27 | glog::glog 28 | rust_tokenizers 29 | re2::re2 30 | ) 31 | 32 | -------------------------------------------------------------------------------- /xllm_service/tokenizer/fast_tokenizer.cpp: -------------------------------------------------------------------------------- 1 | #include "fast_tokenizer.h" 2 | 3 | #include 4 | 5 | namespace xllm_service { 6 | 7 | FastTokenizer::FastTokenizer(const std::string& tokenizer_json_path) 8 | : tokenizer_json_path_(tokenizer_json_path) { 9 | handle_ = tokenizers_new_from_path(tokenizer_json_path.c_str()); 10 | CHECK(handle_ != nullptr) 11 | << "Failed to load tokenizer from file: " << tokenizer_json_path; 12 | } 13 | 14 | std::unique_ptr FastTokenizer::clone() const { 15 | return std::make_unique(tokenizer_json_path_); 16 | } 17 | 18 | FastTokenizer::~FastTokenizer() { tokenizers_free(handle_); } 19 | 20 | bool FastTokenizer::encode(const std::string_view& text, 21 | std::vector* ids) const { 22 | TokenizerEncodeResult result; 23 | tokenizers_encode( 24 | handle_, text.data(), text.size(), /*add_special_tokens=*/1, &result); 25 | 26 | std::vector ret(result.token_ids, result.token_ids + result.len); 27 | *ids = std::move(ret); 28 | 29 | 
return true; 30 | } 31 | 32 | std::string FastTokenizer::decode(const Slice& ids, 33 | bool skip_special_tokens) const { 34 | const char* data = nullptr; 35 | size_t len = 0; 36 | tokenizers_decode(handle_, 37 | reinterpret_cast(ids.data()), 38 | ids.size(), 39 | skip_special_tokens, 40 | &data, 41 | &len); 42 | return {data, len}; 43 | } 44 | 45 | std::optional FastTokenizer::token_to_id( 46 | const std::string_view& token) const { 47 | int32_t id = -1; 48 | tokenizers_token_to_id(handle_, token.data(), token.size(), &id); 49 | return id == -1 ? std::optional(std::nullopt) 50 | : std::optional(id); 51 | } 52 | 53 | std::string FastTokenizer::id_to_token(int32_t id) const { 54 | const char* data = nullptr; 55 | size_t len = 0; 56 | tokenizers_id_to_token(handle_, id, &data, &len); 57 | return {data, len}; 58 | } 59 | 60 | size_t FastTokenizer::vocab_size() const { 61 | size_t size; 62 | tokenizers_get_vocab_size(handle_, &size); 63 | CHECK(size > 0) << "vocab_size must be greater than 0."; 64 | return size; 65 | } 66 | 67 | } // namespace xllm_service 68 | -------------------------------------------------------------------------------- /xllm_service/tokenizer/fast_tokenizer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | 19 | #include "tokenizer.h" 20 | #include "tokenizers/tokenizers.h" 21 | 22 | namespace xllm_service { 23 | 24 | class FastTokenizer : public Tokenizer { 25 | public: 26 | FastTokenizer(const std::string& tokenizer_json_path); 27 | 28 | ~FastTokenizer() override; 29 | 30 | bool encode(const std::string_view& text, 31 | std::vector* ids) const override; 32 | 33 | std::string decode(const Slice& ids, 34 | bool skip_special_tokens) const override; 35 | 36 | std::optional token_to_id( 37 | const std::string_view& token) const override; 38 | 39 | std::string id_to_token(int32_t id) const override; 40 | 41 | size_t vocab_size() const override; 42 | 43 | std::unique_ptr clone() const override; 44 | 45 | private: 46 | std::string tokenizer_json_path_; 47 | 48 | TokenizerHandle handle_ = nullptr; 49 | }; 50 | 51 | } // namespace xllm_service 52 | -------------------------------------------------------------------------------- /xllm_service/tokenizer/sentencepiece_tokenizer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | #include 19 | #include 20 | 21 | #include 22 | 23 | #include "sentencepiece/sentencepiece_processor.h" 24 | #include "tokenizer.h" 25 | #include "tokenizer_args.h" 26 | 27 | namespace xllm_service { 28 | 29 | // a tokenizer that uses google/SentencePiece 30 | class SentencePieceTokenizer : public Tokenizer { 31 | public: 32 | SentencePieceTokenizer(const std::string_view& dir_path, 33 | const TokenizerArgs& args); 34 | 35 | bool encode(const std::string_view& text, 36 | std::vector* ids) const override; 37 | 38 | std::string decode(const Slice& ids, 39 | bool skip_special_tokens) const override; 40 | 41 | std::optional token_to_id( 42 | const std::string_view& token) const override; 43 | 44 | std::string id_to_token(int32_t id) const override; 45 | 46 | size_t vocab_size() const override; 47 | 48 | std::unique_ptr clone() const override; 49 | 50 | private: 51 | void load_special_tokens(const std::vector& special_tokens); 52 | 53 | bool encode_internal(const std::string_view& text, 54 | std::vector* ids) const; 55 | void decode_internal(const Slice& ids, 56 | size_t start, 57 | size_t end, 58 | std::stringstream* ss) const; 59 | 60 | std::string dir_path_; 61 | 62 | TokenizerArgs args_; 63 | 64 | sentencepiece::SentencePieceProcessor sp_processor_; 65 | 66 | // special tokens to ids 67 | absl::flat_hash_map special_token_encoder_; 68 | 69 | // special token ids to tokens 70 | absl::flat_hash_map special_token_decoder_; 71 | 72 | // special token regex (optional) 73 | std::unique_ptr special_token_regex_; 74 | 75 | // token ids to add to the beginning of the input sequence 76 | std::vector prefix_token_ids_; 77 | }; 78 | 79 | } // namespace xllm_service 80 | -------------------------------------------------------------------------------- /xllm_service/tokenizer/tiktoken_tokenizer.h: 
-------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | 22 | #include 23 | 24 | #include "tokenizer.h" 25 | #include "tokenizer_args.h" 26 | 27 | namespace xllm_service { 28 | 29 | // a simple c++ implementation of the openai/tiktoken 30 | // https://github.com/openai/tiktoken 31 | class TiktokenTokenizer : public Tokenizer { 32 | public: 33 | TiktokenTokenizer(const std::string_view& dir_path, 34 | const TokenizerArgs& args); 35 | 36 | bool encode(const std::string_view& text, 37 | std::vector* ids) const override; 38 | 39 | std::string decode(const Slice& ids, 40 | bool skip_special_tokens) const override; 41 | 42 | std::optional token_to_id( 43 | const std::string_view& token) const override; 44 | 45 | std::string id_to_token(int32_t id) const override; 46 | 47 | size_t vocab_size() const override; 48 | 49 | std::unique_ptr clone() const override; 50 | 51 | private: 52 | void load_special_tokens(const std::vector& special_tokens); 53 | 54 | void load_vocab(const std::string& vocab_file_path); 55 | 56 | void encode_internal(const std::string_view& text, 57 | std::vector* ids) const; 58 | 59 | 
void byte_pair_encode(const std::string_view& piece, 60 | std::vector* ids) const; 61 | 62 | std::string dir_path_; 63 | 64 | TokenizerArgs args_; 65 | 66 | // token to ids 67 | absl::flat_hash_map encoder_; 68 | // id to token 69 | absl::flat_hash_map decoder_; 70 | 71 | // a regex pattern to tokenize text 72 | // N.B. RE2 doesn't support look-around assertions. 73 | // https://github.com/google/re2/wiki/Syntax 74 | std::unique_ptr regex_; 75 | 76 | // special tokens to ids 77 | absl::flat_hash_map special_token_encoder_; 78 | 79 | // special token ids to tokens 80 | absl::flat_hash_map special_token_decoder_; 81 | 82 | // special token regex (optional) 83 | std::unique_ptr special_token_regex_; 84 | 85 | // token ids to add to the beginning of the input sequence 86 | std::vector prefix_token_ids_; 87 | }; 88 | 89 | } // namespace xllm_service 90 | -------------------------------------------------------------------------------- /xllm_service/tokenizer/tokenizer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "common/slice.h" 25 | 26 | namespace xllm_service { 27 | 28 | class Tokenizer { 29 | public: 30 | virtual ~Tokenizer() = default; 31 | 32 | virtual bool encode(const std::string_view& text, 33 | std::vector* ids) const = 0; 34 | 35 | virtual std::string decode(const Slice& ids, 36 | bool skip_special_tokens) const = 0; 37 | 38 | virtual std::optional token_to_id( 39 | const std::string_view& token) const = 0; 40 | 41 | virtual std::string id_to_token(int32_t id) const = 0; 42 | 43 | virtual size_t vocab_size() const = 0; 44 | 45 | virtual std::unique_ptr clone() const = 0; 46 | }; 47 | 48 | } // namespace xllm_service 49 | -------------------------------------------------------------------------------- /xllm_service/tokenizer/tokenizer_args.cpp: -------------------------------------------------------------------------------- 1 | #include "tokenizer_args.h" 2 | 3 | #include 4 | 5 | #include "common/json_reader.h" 6 | 7 | namespace xllm_service { 8 | namespace { 9 | std::optional load_chat_template_file(const std::string& dir) { 10 | // chat_template.json 11 | const std::string chat_template_path = dir + "/chat_template.json"; 12 | JsonReader reader; 13 | if (reader.parse(chat_template_path); 14 | auto v = reader.value("chat_template")) { 15 | return v; 16 | } 17 | // chat_template.jinja 18 | const std::string raw_chat_template_path = dir + "/chat_template.jinja"; 19 | std::ifstream file(raw_chat_template_path); 20 | if (file.is_open()) { 21 | std::ostringstream content; 22 | content << file.rdbuf(); 23 | file.close(); 24 | return content.str(); 25 | } 26 | return std::nullopt; 27 | } 28 | } // namespace 29 | 30 | void load_tokenizer_args(const std::string& model_weights_path, 31 | TokenizerArgs& tokenizer_args) { 32 | // tokenizer args from tokenizer_config.json 33 | 
JsonReader tokenizer_reader; 34 | const std::string tokenizer_args_file_path = 35 | model_weights_path + "/tokenizer_config.json"; 36 | if (tokenizer_reader.parse(tokenizer_args_file_path)) { 37 | // read chat template if exists 38 | if (auto v = load_chat_template_file(model_weights_path)) { 39 | tokenizer_args.chat_template() = v.value(); 40 | } else if (auto v = tokenizer_reader.value("chat_template")) { 41 | tokenizer_args.chat_template() = v.value(); 42 | } 43 | if (auto v = tokenizer_reader.value("add_bos_token")) { 44 | tokenizer_args.add_bos_token() = v.value(); 45 | } 46 | if (auto v = tokenizer_reader.value("add_eos_token")) { 47 | tokenizer_args.add_eos_token() = v.value(); 48 | } 49 | if (auto v = tokenizer_reader.value("tokenizer_class")) { 50 | tokenizer_args.tokenizer_class() = v.value(); 51 | } 52 | // read bos_token 53 | if (auto v = tokenizer_reader.value("bos_token.content")) { 54 | tokenizer_args.bos_token() = v.value(); 55 | } else if (auto v = tokenizer_reader.value("bos_token")) { 56 | tokenizer_args.bos_token() = v.value(); 57 | } 58 | // read eos_token 59 | if (auto v = tokenizer_reader.value("eos_token.content")) { 60 | tokenizer_args.eos_token() = v.value(); 61 | } else if (auto v = tokenizer_reader.value("eos_token")) { 62 | tokenizer_args.eos_token() = v.value(); 63 | } 64 | // read pad_token 65 | if (auto v = tokenizer_reader.value("pad_token.content")) { 66 | tokenizer_args.pad_token() = v.value(); 67 | } else if (auto v = tokenizer_reader.value("pad_token")) { 68 | tokenizer_args.pad_token() = v.value(); 69 | } 70 | } 71 | } 72 | 73 | } // namespace xllm_service -------------------------------------------------------------------------------- /xllm_service/tokenizer/tokenizer_args.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 
3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | #include 19 | #include 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include "common/macros.h" 27 | 28 | namespace xllm_service { 29 | 30 | using SpecialToken = std::pair; 31 | 32 | struct TokenizerArgs { 33 | // Type of tokenizer to use. valid values are "sentencepiece" and "tiktoken". 34 | PROPERTY(std::string, tokenizer_type) = "sentencepiece"; 35 | 36 | // Vocab file name. 37 | PROPERTY(std::string, vocab_file) = "tokenizer.model"; 38 | 39 | // Special tokens to add to the vocabulary. 40 | PROPERTY(std::vector, special_tokens); 41 | 42 | // Regex pattern used by tiktok tokenizer only. 43 | PROPERTY(std::string, pattern); 44 | 45 | // tokens to add to the beginning of the input sequence. 
46 | PROPERTY(std::vector, prefix_tokens); 47 | 48 | // chat template 49 | PROPERTY(std::string, chat_template); 50 | 51 | // add_bos_token 52 | PROPERTY(bool, add_bos_token) = false; 53 | 54 | // add_eos_token 55 | PROPERTY(bool, add_eos_token) = false; 56 | 57 | // bos_token 58 | PROPERTY(std::string, bos_token); 59 | 60 | // eos_token 61 | PROPERTY(std::string, eos_token); 62 | 63 | // pad_token 64 | PROPERTY(std::string, pad_token); 65 | 66 | // tokenizer_class 67 | PROPERTY(std::string, tokenizer_class); 68 | }; 69 | 70 | inline std::ostream& operator<<(std::ostream& os, const TokenizerArgs& args) { 71 | os << "TokenizerArgs: ["; 72 | os << "tokenizer_type: " << args.tokenizer_type(); 73 | // os << ", chat_template: " << args.chat_template(); 74 | os << ", add_bos_token: " << args.add_bos_token(); 75 | os << ", add_eos_token: " << args.add_eos_token(); 76 | os << ", bos_token: " << args.bos_token(); 77 | os << ", eos_token: " << args.eos_token(); 78 | os << ", pad_token: " << args.pad_token(); 79 | os << ", tokenizer_class: " << args.tokenizer_class(); 80 | if (!args.special_tokens().empty()) { 81 | os << ", special_tokens: ["; 82 | for (const auto& [token, id] : args.special_tokens()) { 83 | os << "(" << token << ", " << id << ") "; 84 | } 85 | os << "]"; 86 | } 87 | os << ", pattern: " << absl::CEscape(args.pattern()); 88 | if (!args.prefix_tokens().empty()) { 89 | os << ", prefix_tokens: [" << absl::StrJoin(args.prefix_tokens(), ", ") 90 | << "]"; 91 | } 92 | os << "]"; 93 | return os; 94 | } 95 | 96 | void load_tokenizer_args(const std::string& model_weights_path, 97 | TokenizerArgs& tokenizer_args); 98 | 99 | } // namespace xllm_service 100 | -------------------------------------------------------------------------------- /xllm_service/tokenizer/tokenizer_factory.cpp: -------------------------------------------------------------------------------- 1 | #include "tokenizer_factory.h" 2 | 3 | #include 4 | 5 | #include "tokenizer_args.h" 6 | 7 | namespace 
xllm_service { 8 | 9 | std::unique_ptr TokenizerFactory::create_tokenizer( 10 | const std::string& model_weights_path, 11 | TokenizerArgs* tokenizer_args) { 12 | load_tokenizer_args(model_weights_path, *tokenizer_args); 13 | 14 | const std::string tokenizer_json_path = 15 | model_weights_path + "/tokenizer.json"; 16 | if (std::filesystem::exists(tokenizer_json_path)) { 17 | // 1. fast tokenizer 18 | LOG(INFO) << "Create fast tokenizer."; 19 | return std::make_unique(tokenizer_json_path); 20 | } else if (tokenizer_args->tokenizer_type() == "tiktoken" || 21 | tokenizer_args->tokenizer_class() == "TikTokenTokenizer") { 22 | // 2. create tiktoken tokenizer 23 | LOG(INFO) << "Create Tiktoken tokenizer."; 24 | return std::make_unique(model_weights_path, 25 | *tokenizer_args); 26 | } else { 27 | // 3. create sentencepiece tokenizer 28 | LOG(INFO) << "Create SentencePiece tokenizer."; 29 | return std::make_unique(model_weights_path, 30 | *tokenizer_args); 31 | } 32 | } 33 | 34 | } // namespace xllm_service 35 | -------------------------------------------------------------------------------- /xllm_service/tokenizer/tokenizer_factory.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "fast_tokenizer.h" 4 | #include "sentencepiece_tokenizer.h" 5 | #include "tiktoken_tokenizer.h" 6 | #include "tokenizer_args.h" 7 | 8 | namespace xllm_service { 9 | 10 | class TokenizerFactory { 11 | public: 12 | static std::unique_ptr create_tokenizer( 13 | const std::string& model_weights_path, 14 | TokenizerArgs* tokenizer_args); 15 | }; 16 | 17 | } // namespace xllm_service 18 | -------------------------------------------------------------------------------- /xllm_service/tokenizer/tokenizers/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cargo_shared_library) 2 | 3 | cargo_shared_library( 4 | NAME 5 | rust_tokenizers 6 | HDRS 7 | tokenizers.h 8 | ) 9 | 10 | 
--------------------------------------------------------------------------------
/xllm_service/tokenizer/tokenizers/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "rust_tokenizers"
version = "0.21.0"
edition = "2018"

[lib]
name = "rust_tokenizers"
crate-type = ["cdylib"]

[dependencies]
tokenizers = { version = "0.21.0", default-features = false, features = ["onig"] }

--------------------------------------------------------------------------------
/xllm_service/tokenizer/tokenizers/tokenizers.h:
--------------------------------------------------------------------------------
/* Copyright 2025 The xLLM Authors. All Rights Reserved.
Copyright 2024 The ScaleLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://github.com/jd-opensource/xllm-service/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#pragma once

// The C API
#ifdef __cplusplus
extern "C" {
#endif

// The C interface to the hf-tokenizers library
// ported from https://github.com/mlc-ai/tokenizers-cpp
// NOTE(review): the two include targets were stripped from this dump;
// size_t/uint32_t/int32_t below require stddef.h and stdint.h.
#include <stddef.h>
#include <stdint.h>

// Opaque handle to a Rust-side tokenizer instance.
typedef void* TokenizerHandle;

typedef struct {
  int* token_ids;
  size_t len;
} TokenizerEncodeResult;

// Create a tokenizer from a tokenizer.json file; returns NULL on failure.
TokenizerHandle tokenizers_new_from_path(const char* path);

// Encode data[0..len) into *result.
// NOTE(review): result-buffer ownership is not stated here; presumably owned
// by `handle` as in upstream tokenizers-cpp -- confirm in src/lib.rs.
void tokenizers_encode(TokenizerHandle handle,
                       const char* data,
                       size_t len,
                       int add_special_token,
                       TokenizerEncodeResult* result);

// Decode data[0..len) token ids into *decode_data / *decode_len.
void tokenizers_decode(TokenizerHandle handle,
                       const uint32_t* data,
                       size_t len,
                       int skip_special_tokens,
                       const char** decode_data,
                       size_t* decode_len);

void tokenizers_id_to_token(TokenizerHandle handle,
                            uint32_t id,
                            const char** data,
                            size_t* len);

// tokenizers_token_to_id stores -1 to *id if the token is not in the vocab
void tokenizers_token_to_id(TokenizerHandle handle,
                            const char* token,
                            size_t len,
                            int32_t* id);

// Destroy the tokenizer and everything it owns.
void tokenizers_free(TokenizerHandle handle);

void tokenizers_get_vocab_size(TokenizerHandle handle, size_t* size);

#ifdef __cplusplus
}
#endif

--------------------------------------------------------------------------------