├── .clang-format ├── .github └── workflows │ └── check_format.yml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── CMakeLists.txt ├── CONTRIBUTING.md ├── CONTRIBUTING_zh.md ├── LICENSE ├── NOTICE_Third_Party.md ├── README.md ├── README_zh.md ├── RELEASE.md ├── cmake ├── CMakeDetermineRustCompiler.cmake ├── CMakeRustCompiler.cmake.in ├── CMakeRustInformation.cmake ├── CMakeTestRustCompiler.cmake ├── FindRust.cmake ├── FindSentencePiece.cmake ├── cargo_library.cmake ├── cargo_shared_library.cmake ├── cc_binary.cmake ├── cc_library.cmake ├── cc_test.cmake ├── grpc_proto_library.cmake ├── proto_library.cmake └── static_analyzers.cmake ├── docs ├── assets │ ├── service_arch.png │ ├── wechat_qrcode1.png │ ├── wechat_qrcode2.png │ └── xllm_service_title.png ├── en │ ├── getting_started.md │ └── overview.md └── zh │ ├── getting_started.md │ └── overview.md ├── prepare.sh ├── third_party ├── CMakeLists.txt └── custom_cache │ └── cpprestsdk.patch ├── vcpkg.json └── xllm_service ├── CMakeLists.txt ├── chat_template ├── CMakeLists.txt ├── jinja_chat_template.cpp ├── jinja_chat_template.h └── jinja_chat_template_test.cpp ├── common ├── CMakeLists.txt ├── call_data.h ├── closure_guard.h ├── concurrent_queue.h ├── global_gflags.cpp ├── global_gflags.h ├── hash_util.cpp ├── hash_util.h ├── json_reader.cpp ├── json_reader.h ├── macros.h ├── options.h ├── slice.h ├── threadpool.cpp ├── threadpool.h ├── ttft_predictor.cpp ├── ttft_predictor.h ├── types.h ├── utils.cpp ├── utils.h └── xllm │ ├── output.h │ ├── status.h │ ├── uuid.cpp │ └── uuid.h ├── examples ├── CMakeLists.txt ├── curl_http_client.sh ├── http_client_test.cpp ├── rpc_client_test.cpp └── rpc_hello_client.cpp ├── http_service ├── CMakeLists.txt ├── main.cpp ├── request_tracer.cpp ├── request_tracer.h ├── service.cpp └── service.h ├── master.cpp ├── master.h ├── proto ├── CMakeLists.txt ├── xllm │ ├── chat.proto │ ├── common.proto │ └── completion.proto ├── xllm_http_service.proto └── 
xllm_rpc_service.proto ├── request ├── CMakeLists.txt └── request.h ├── rpc_service ├── CMakeLists.txt ├── client.cpp ├── client.h ├── main.cpp ├── rpc_service_test.cpp ├── service.cpp └── service.h ├── scheduler ├── CMakeLists.txt ├── etcd_client │ ├── CMakeLists.txt │ ├── etcd_client.cpp │ └── etcd_client.h ├── loadbalance_policy │ ├── CMakeLists.txt │ ├── cache_aware_routing.cpp │ ├── cache_aware_routing.h │ ├── loadbalance_policy.h │ ├── round_robin.cpp │ └── round_robin.h ├── managers │ ├── CMakeLists.txt │ ├── global_kvcache_mgr.cpp │ ├── global_kvcache_mgr.h │ ├── instance_mgr.cpp │ └── instance_mgr.h ├── response_handler.cpp ├── response_handler.h ├── scheduler.cpp └── scheduler.h └── tokenizer ├── CMakeLists.txt ├── fast_tokenizer.cpp ├── fast_tokenizer.h ├── sentencepiece_tokenizer.cpp ├── sentencepiece_tokenizer.h ├── tiktoken_tokenizer.cpp ├── tiktoken_tokenizer.h ├── tokenizer.h ├── tokenizer_args.cpp ├── tokenizer_args.h ├── tokenizer_factory.cpp ├── tokenizer_factory.h └── tokenizers ├── CMakeLists.txt ├── Cargo.toml ├── src └── lib.rs └── tokenizers.h /.clang-format: -------------------------------------------------------------------------------- 1 | Language: Cpp 2 | BasedOnStyle: Google 3 | UseTab: Never 4 | IndentWidth: 2 5 | ColumnLimit: 80 6 | 7 | BinPackParameters: false 8 | BinPackArguments: false 9 | ExperimentalAutoDetectBinPacking: false 10 | AllowAllParametersOfDeclarationOnNextLine: false 11 | DerivePointerAlignment: false 12 | PointerAlignment: Left 13 | ... 
14 | -------------------------------------------------------------------------------- /.github/workflows/check_format.yml: -------------------------------------------------------------------------------- 1 | name: CheckFormat 2 | on: 3 | workflow_dispatch: 4 | push: 5 | branches: [main] 6 | paths: ['xllm_service/**'] 7 | pull_request: 8 | types: [opened, synchronize, reopened] 9 | branches: [main] 10 | paths: ['xllm_service/**'] 11 | 12 | jobs: 13 | format-check: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Install clang-format 17 | run: | 18 | pip install clang-format==20.1.6 19 | clang-format --version 20 | 21 | - name: Checkout code 22 | uses: actions/checkout@v4 23 | with: 24 | fetch-depth: 0 25 | 26 | - name: Determine base commit for comparison 27 | id: get_base_commit 28 | run: | 29 | # pull_request action 30 | if [ "${{ github.event_name }}" = "pull_request" ]; then 31 | echo "base_commit=${{ github.event.pull_request.base.sha }}" >> $GITHUB_OUTPUT 32 | else 33 | # push action 34 | echo "base_commit=${{ github.sha }}~1" >> $GITHUB_OUTPUT 35 | fi 36 | 37 | - name: Verify clang-format configuration 38 | run: | 39 | if [ ! 
-f ".clang-format" ]; then 40 | echo "❌ .clang-format file not found in repository root" 41 | exit 1 42 | fi 43 | clang-format --style=file --dump-config > /dev/null || { 44 | echo "❌ .clang-format file has invalid format" 45 | exit 1 46 | } 47 | 48 | - name: Check code format 49 | shell: /usr/bin/bash {0} 50 | run: | 51 | BASE_COMMIT="${{ steps.get_base_commit.outputs.base_commit }}" 52 | CLANG_FORMAT_FILE="$(pwd)/.clang-format" 53 | 54 | # do clang-format 55 | diff=$(git-clang-format \ 56 | --style=file:"$CLANG_FORMAT_FILE" \ 57 | --extensions="c,h,cc,cp,cpp,c++,cxx,hh,hpp,hxx,inc,cu,cuh" \ 58 | --commit "$BASE_COMMIT" \ 59 | --diff) 60 | 61 | # check diff 62 | if [ "$diff" = "no modified files to format" ] || [ "$diff" = "clang-format did not modify any files" ]; then 63 | echo "✅ Code format is correct" 64 | exit 0 65 | fi 66 | 67 | printf "\n❌ You have introduced coding style breakages.\n" 68 | 69 | printf "\n\033[1mSuggested changes:\n\n" 70 | echo "$diff" 71 | exit 1 72 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Visual Studio Code 2 | /.vscode* 3 | 4 | # Idea 5 | /.idea 6 | /cmake-build-debug/ 7 | /cmake-build-release/ 8 | 9 | # CMake 10 | /build* 11 | 12 | # cache 13 | /.*cache 14 | 15 | # deps 16 | /.deps 17 | 18 | # gtest 19 | /Testing 20 | 21 | # rust 22 | Cargo.lock 23 | 24 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/brpc"] 2 | path = third_party/brpc 3 | url = https://gitcode.com/xLLM-AI/brpc.git 4 | branch = 1.12.1_cmake 5 | [submodule "third_party/etcd_cpp_apiv3"] 6 | path = third_party/etcd_cpp_apiv3 7 | url = https://gitcode.com/xLLM-AI/etcd-cpp-apiv3.git 8 | branch = v0.15.4 9 | [submodule "third_party/cpprestsdk"] 10 | path = third_party/cpprestsdk 
11 | url = https://gitcode.com/xLLM-AI/cpprestsdk.git 12 | branch = v2.10.19 13 | [submodule "third_party/sentencepiece"] 14 | path = third_party/sentencepiece 15 | url = https://gitcode.com/xLLM-AI/sentencepiece.git 16 | [submodule "third_party/minja"] 17 | path = third_party/minja 18 | url = https://gitcode.com/xLLM-AI/minja.git 19 | [submodule "third_party/smhasher"] 20 | path = third_party/smhasher 21 | url = https://gitcode.com/xLLM-AI/smhasher.git 22 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # pre-commit install 2 | # pre-commit run --all-files 3 | 4 | repos: 5 | - repo: https://github.com/pre-commit/mirrors-clang-format 6 | rev: v20.1.6 7 | hooks: 8 | - id: clang-format 9 | types_or: [c++, c, cuda] 10 | exclude: ^(cibuild/|tools/|third_party/|cmake/|build) 11 | 12 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | [English](./CONTRIBUTING.md) | [中文](./CONTRIBUTING_zh.md) 16 | 17 | # Contribute to xLLM-Service 18 | 19 | + Write / translate / fix our documentation 20 | + Raise questions / Answer questions 21 | + Provide demos, examples or test cases 22 | + Give suggestions or other comments 23 | + Paticipate in [issues](https://github.com/xxx/xLLM/issues) or [discussions](https://github.com/xxx/xLLM/discussions) 24 | + Pull requests 25 | + Sharing related research / application 26 | + Any other ways to improve xLLM 27 | 28 | For developers who want to contribute to our code, here is the guidance: 29 | 30 | ## 1. Choose an issue to contribute 31 | + Issues with label `PR welcome`, which means: 32 | + A reproducible bug 33 | + A function in plan 34 | 35 | ## 2. 
Install environment for development 36 | + We strongly suggest you to read our **[Document](http://xxx/docs/)** before developing 37 | + For setting environment, please check our **[Readme file](/README.md)** 38 | 39 | ## 3. Build our project 40 | + You could run our demo to check whether the requirements are successfully installed: 41 | 42 | ## 4. Test 43 | 44 | After the PR is submitted, we will format and test the code. 45 | Our tests are still far from perfect, so you are welcomed to add tests to our project! -------------------------------------------------------------------------------- /CONTRIBUTING_zh.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | [English](./CONTRIBUTING.md) | [中文](./CONTRIBUTING_zh.md) 16 | 17 | # xLLM-Service 贡献指南 18 | 19 | xLLM-Service致力于为每一位用户和开发者提供开放的XX,因此无论您是XX开发者还是专注于XX用户,我们都欢迎您参与我们的项目。 20 | 您可以通过以下方法为项目作出贡献: 21 | 22 | + 撰写/翻译/修改文档 23 | + 提出或回答问题 24 | + 提供使用或测试样例 25 | + 提供建议或其他评论 26 | + 参与[issues](https://github.com/xxx/xLLM/issues) 或[discussions](https://github.com/xxx/xLLM/discussions) 27 | + 提交Pull request 28 | + 分享相关研究或应用场景 29 | + 其他任何对xLLM-Service的帮助 30 | 31 | 如果您希望参与xLLM的开发,请参考以下提示: 32 | 33 | ## 1. 选择参与贡献的issue 34 | + 您可以选择带有`PR welcome`标签的issue,包括: 35 | + 可复现的bug 36 | + 计划实现的功能 37 | 38 | ## 2. 配置开发环境 39 | + 在开发之前,可以参考我们的 **[文档](http://xxx/docs/)** 40 | + 关于环境配置,参见 **[Readme file](/README.md)** 41 | 42 | ## 3. 项目构建和运行 43 | + 您可以运行如下样例: 44 | 45 | ## 4. 测试 46 | 47 | 在pr提交之后,我们会对代码进行格式化及进一步测试。 48 | 我们的测试目前还很不完善,因此欢迎开发者为测试作出贡献! -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | [English](./README.md) | [中文](./README_zh.md) 16 | 17 |

18 | xLLM 19 |

20 | 21 | 22 | ## 1. Project Overview 23 | **xLLM-service** is a service-layer framework developed based on the **xLLM** inference engine, providing efficient, fault-tolerant, and flexible LLM inference services for clustered deployment. 24 | 25 | xLLM-service targets to address key challenges in enterprise-level service scenarios: 26 | 27 | - How to ensure the SLA of online services and improve resource utilization of offline tasks in a hybrid online-offline deployment environment. 28 | 29 | - How to react to changing request loads in actual businesses, such as fluctuations in input/output lengths. 30 | 31 | - Resolving performance bottlenecks of multimodal model requests. 32 | 33 | - Ensuring high reliability of computing instances. 34 | 35 | --- 36 | 37 | ## 2. Key Features 38 | With management of computing resource pools, intelligent scheduling and preemption of hybrid requests, and real-time monitoring of computing instances, xLLM-service achieves the following key features: 39 | 40 | - Unified scheduling of online and offline requests, with preemptive execution for online requests and best-effort execution for offline requests. 41 | 42 | - Adaptive dynamic allocation of PD ratios, supporting efficient switching of instance PD roles. 43 | 44 | - EPD three-stage disaggregation for multimodal requests, with intelligent resource allocation for different stages. 45 | 46 | - Fault-tolerant architecture, fast detection of instance error and automatic rescheduling for interrupted requests. 47 | 48 | --- 49 | 50 | ## 3. Core Architecture 51 | 52 | ``` 53 | ├── xllm-service/ 54 | | : main source folder 55 | │ ├── chat_template/ # 56 | │ ├── common/ # 57 | │ ├── examples/ # 58 | │ ├── http_service/ # 59 | │ ├── rpc_service/ # 60 | | ├── tokenizers/ # 61 | | └── master.cpp # 62 | ``` 63 | 64 | --- 65 | 66 | 67 | ## 4. 
Quick Start 68 | #### Installation 69 | ```bash 70 | git clone git@coding.jd.com:xllm-ai/xllm_service.git 71 | cd xllm_service 72 | git submodule init 73 | git submodule update 74 | ``` 75 | #### Compilation 76 | compile xllm-service: 77 | ```bash 78 | sh prepare.sh # apply patch 79 | mkdir -p build && cd build 80 | cmake .. && make -j 8 81 | ``` 82 | 83 | --- 84 | 85 | ## 5. Contributing 86 | 87 | There are several ways you can contribute to xLLM: 88 | 89 | 1. Reporting Issues (Bugs & Errors) 90 | 2. Suggesting Enhancements 91 | 3. Improving Documentation 92 | + Fork the repository 93 | + Add your view in document 94 | + Send your pull request 95 | 4. Writing Code 96 | + Fork the repository 97 | + Create a new branch 98 | + Add your feature or improvement 99 | + Send your pull request 100 | 101 | We appreciate all kinds of contributions! 🎉🎉🎉 102 | If you have problems about development, please check our document: * **[Document](./docs/docs/readme.md)** 103 | 104 | --- 105 | 106 | ## 6. Community & Support 107 | 108 | If you encounter any issues along the way, you are welcomed to submit reproducible steps and log snippets in the project's Issues area, or contact the xLLM Core team directly via your internal Slack. 109 | 110 | Welcome to contact us: 111 | 112 |
113 | qrcode1 114 | qrcode2 115 |
116 | 117 | --- 118 | ## 7. About the Contributors 119 | 120 | Thanks to all the following [developers](https://github.com/jd-opensource/xllm-service/graphs/contributors) who have contributed to xLLM. 121 | 122 | 123 | 124 | 125 | --- 126 | 127 | ## 8. License 128 | 129 | [Apache License](LICENSE) 130 | 131 | #### xLLM is provided by JD.com 132 | #### Thanks for your Contributions! 133 | -------------------------------------------------------------------------------- /README_zh.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | [English](./README.md) | [中文](./README_zh.md) 16 | 17 | 18 |

19 | xLLM 20 |

21 | 22 | ## 1. 简介 23 | **xLLM-service** 是一个基于 xLLM 推理引擎开发的服务层框架,为集群化部署提供高效率、高容错、高灵活性的大模型推理服务。 24 | 25 | xLLM-service 旨在解决企业级服务场景中的关键挑战: 26 | - 如何于在离线混合部署环境中,保障在线服务的SLA,提升离线任务的资源利用率。 27 | - 如何适应实际业务中动态变化的请求负载,如输入/输出长度出现剧烈波动。 28 | - 解决多模态模型请求的性能瓶颈。 29 | - 保障集群计算实例的高可靠性。 30 | 31 | --- 32 | 33 | ## 2. 核心特性 34 | 35 | xLLM-service 通过对计算资源池的动态管理、请求的智能调度与抢占,以及计算实例的实时监控,实现了以下核心能力: 36 | - 在线与离线任务的统一调度,在线请求的抢占式执行,离线请求best-effort执行; 37 | - PD比例的自适应动态调配,支持实例PD角色的高效切换; 38 | - 多模态请求的EPD三阶段分离,不同阶段的资源智能分配; 39 | - 多节点容错架构,快速感知实例错误信息,自动决策最优的被中断请求再调度方案。 40 | 41 | --- 42 | 43 | ## 3. 代码结构 44 | 45 | ``` 46 | ├── xllm-service/ 47 | | : 主代码目录 48 | │ ├── chat_template/ # 49 | │ ├── common/ # 50 | │ ├── examples/ # 51 | │ ├── http_service/ # 52 | │ ├── rpc_service/ # 53 | | ├── tokenizers/ # 54 | | └── master.cpp # 55 | ``` 56 | --- 57 | 58 | 59 | ## 4. 快速开始 60 | #### 安装 61 | ```bash 62 | git clone git@coding.jd.com:xllm-ai/xllm_service.git 63 | cd xllm_service 64 | git submodule init 65 | git submodule update 66 | ``` 67 | #### 编译 68 | 编译执行 69 | ```bash 70 | sh prepare.sh # 应用patch 71 | mkdir -p build && cd build 72 | cmake .. && make -j 8 73 | ``` 74 | 75 | --- 76 | ## 5. 成为贡献者 77 | 您可以通过以下方法为 xLLM-Service 作出贡献: 78 | 79 | 1. 在Issue中报告问题 80 | 2. 提供改进建议 81 | 3. 补充文档 82 | + Fork仓库 83 | + 修改文档 84 | + 提出pull request 85 | 4. 修改代码 86 | + Fork仓库 87 | + 创建新分支 88 | + 加入您的修改 89 | + 提出pull request 90 | 91 | 感谢您的贡献! 🎉🎉🎉 92 | 如果您在开发中遇到问题,请参阅**[xLLM-Service中文指南](./docs/docs_zh/readme.md)** 93 | 94 | --- 95 | 96 | ## 6. 社区支持 97 | 98 | 如果你在xLLM的开发或使用过程中遇到任何问题,欢迎在项目的Issue区域提交可复现的步骤或日志片段。 99 | 如果您有企业内部Slack,请直接联系xLLM Core团队。 100 | 101 | 欢迎沟通和联系我们: 102 | 103 |
104 | qrcode1 105 | qrcode2 106 |
107 | 108 | ## 7. 致谢 109 | 110 | 感谢以下为xLLM-Servic作出贡献的[开发者](https://github.com/jd-opensource/xllm-service/graphs/contributors) 111 | 112 | 113 | 114 | 115 | --- 116 | 117 | ## 8. 许可证 118 | [Apache License](LICENSE) 119 | 120 | #### xLLM-Service 由 JD.com 提供 121 | #### 感谢您对xLLM的关心与贡献! 122 | -------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- 1 | # Release xllm-service 0.1.0 2 | 3 | ## **Major Features and Improvements** 4 | 5 | - Support disaggregated prefill and decoding. 6 | - Support KV Cache aware routing. 7 | - Support KV Cache Pool. 8 | -------------------------------------------------------------------------------- /cmake/CMakeDetermineRustCompiler.cmake: -------------------------------------------------------------------------------- 1 | # ported from https://github.com/Devolutions/CMakeRust 2 | if(NOT CMAKE_Rust_COMPILER) 3 | find_package(Rust) 4 | if(RUST_FOUND) 5 | set(CMAKE_Rust_COMPILER "${RUSTC_EXECUTABLE}") 6 | set(CMAKE_Rust_COMPILER_ID "Rust") 7 | set(CMAKE_Rust_COMPILER_VERSION "${RUST_VERSION}") 8 | set(CMAKE_Rust_PLATFORM_ID "Rust") 9 | endif() 10 | endif() 11 | 12 | message(STATUS "Cargo Home: ${CARGO_HOME}") 13 | message(STATUS "Rust Compiler Version: ${RUSTC_VERSION}") 14 | 15 | mark_as_advanced(CMAKE_Rust_COMPILER) 16 | 17 | if(CMAKE_Rust_COMPILER) 18 | set(CMAKE_Rust_COMPILER_LOADED 1) 19 | endif(CMAKE_Rust_COMPILER) 20 | 21 | configure_file(${CMAKE_CURRENT_LIST_DIR}/CMakeRustCompiler.cmake.in 22 | ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/${CMAKE_VERSION}/CMakeRustCompiler.cmake IMMEDIATE @ONLY) 23 | 24 | set(CMAKE_Rust_COMPILER_ENV_VAR "RUSTC") 25 | 26 | -------------------------------------------------------------------------------- /cmake/CMakeRustCompiler.cmake.in: -------------------------------------------------------------------------------- 1 | 2 | # ported from https://github.com/Devolutions/CMakeRust 3 | 
set(CMAKE_Rust_COMPILER "@CMAKE_Rust_COMPILER@") 4 | set(CMAKE_Rust_COMPILER_ID "@CMAKE_Rust_COMPILER_ID@") 5 | set(CMAKE_Rust_COMPILER_VERSION "@CMAKE_Rust_COMPILER_VERSION@") 6 | set(CMAKE_Rust_COMPILER_LOADED @CMAKE_Rust_COMPILER_LOADED@) 7 | set(CMAKE_Rust_PLATFORM_ID "@CMAKE_Rust_PLATFORM_ID@") 8 | 9 | SET(CMAKE_Rust_SOURCE_FILE_EXTENSIONS rs) 10 | SET(CMAKE_Rust_LINKER_PREFERENCE 40) 11 | set(CMAKE_Rust_COMPILER_ENV_VAR "RUSTC") 12 | 13 | -------------------------------------------------------------------------------- /cmake/CMakeRustInformation.cmake: -------------------------------------------------------------------------------- 1 | # ported from https://github.com/Devolutions/CMakeRust 2 | # 3 | # Usage: rustc [OPTIONS] INPUT 4 | # 5 | # Options: 6 | # -h --help Display this message 7 | # --cfg SPEC Configure the compilation environment 8 | # -L [KIND=]PATH Add a directory to the library search path. The 9 | # optional KIND can be one of dependency, crate, native, 10 | # framework or all (the default). 11 | # -l [KIND=]NAME Link the generated crate(s) to the specified native 12 | # library NAME. The optional KIND can be one of static, 13 | # dylib, or framework. If omitted, dylib is assumed. 
14 | # --crate-type [bin|lib|rlib|dylib|cdylib|staticlib|metadata] 15 | # Comma separated list of types of crates for the 16 | # compiler to emit 17 | # --crate-name NAME Specify the name of the crate being built 18 | # --emit [asm|llvm-bc|llvm-ir|obj|link|dep-info] 19 | # Comma separated list of types of output for the 20 | # compiler to emit 21 | # --print [crate-name|file-names|sysroot|cfg|target-list|target-cpus|target-features|relocation-models|code-models] 22 | # Comma separated list of compiler information to print 23 | # on stdout 24 | # -g Equivalent to -C debuginfo=2 25 | # -O Equivalent to -C opt-level=2 26 | # -o FILENAME Write output to 27 | # --out-dir DIR Write output to compiler-chosen filename in 28 | # --explain OPT Provide a detailed explanation of an error message 29 | # --test Build a test harness 30 | # --target TARGET Target triple for which the code is compiled 31 | # -W --warn OPT Set lint warnings 32 | # -A --allow OPT Set lint allowed 33 | # -D --deny OPT Set lint denied 34 | # -F --forbid OPT Set lint forbidden 35 | # --cap-lints LEVEL Set the most restrictive lint level. 
More restrictive 36 | # lints are capped at this level 37 | # -C --codegen OPT[=VALUE] 38 | # Set a codegen option 39 | # -V --version Print version info and exit 40 | # -v --verbose Use verbose output 41 | # 42 | # Additional help: 43 | # -C help Print codegen options 44 | # -W help Print 'lint' options and default settings 45 | # -Z help Print internal options for debugging rustc 46 | # --help -v Print the full set of options rustc accepts 47 | # 48 | 49 | # 50 | 51 | include(CMakeLanguageInformation) 52 | 53 | if(UNIX) 54 | set(CMAKE_Rust_OUTPUT_EXTENSION .o) 55 | else() 56 | set(CMAKE_Rust_OUTPUT_EXTENSION .obj) 57 | endif() 58 | 59 | set(CMAKE_Rust_ECHO_ALL "echo \"TARGET: TARGET_BASE: ") 60 | set(CMAKE_Rust_ECHO_ALL "${CMAKE_Rust_ECHO_ALL} OBJECT: OBJECTS: OBJECT_DIR: SOURCE: SOURCES: ") 61 | set(CMAKE_Rust_ECHO_ALL "${CMAKE_Rust_ECHO_ALL} LINK_LIBRARIES: FLAGS: LINK_FLAGS: \"") 62 | 63 | if(NOT CMAKE_Rust_CREATE_SHARED_LIBRARY) 64 | set(CMAKE_Rust_CREATE_SHARED_LIBRARY 65 | "echo \"CMAKE_Rust_CREATE_SHARED_LIBRARY\"" 66 | "${CMAKE_Rust_ECHO_ALL}" 67 | ) 68 | endif() 69 | 70 | if(NOT CMAKE_Rust_CREATE_SHARED_MODULE) 71 | set(CMAKE_Rust_CREATE_SHARED_MODULE 72 | "echo \"CMAKE_Rust_CREATE_SHARED_MODULE\"" 73 | "${CMAKE_Rust_ECHO_ALL}" 74 | ) 75 | endif() 76 | 77 | if(NOT CMAKE_Rust_CREATE_STATIC_LIBRARY) 78 | set(CMAKE_Rust_CREATE_STATIC_LIBRARY 79 | "echo \"CMAKE_Rust_CREATE_STATIC_LIBRARY\"" 80 | "${CMAKE_Rust_ECHO_ALL}" 81 | ) 82 | endif() 83 | 84 | if(NOT CMAKE_Rust_COMPILE_OBJECT) 85 | set(CMAKE_Rust_COMPILE_OBJECT 86 | "echo \"CMAKE_Rust_COMPILE_OBJECT\"" 87 | "${CMAKE_Rust_ECHO_ALL}" 88 | "${CMAKE_Rust_COMPILER} --emit obj -o ") 89 | endif() 90 | 91 | if(NOT CMAKE_Rust_LINK_EXECUTABLE) 92 | set(CMAKE_Rust_LINK_EXECUTABLE 93 | "echo \"CMAKE_Rust_LINK_EXECUTABLE\"" 94 | "${CMAKE_Rust_ECHO_ALL}" 95 | ) 96 | endif() 97 | 98 | mark_as_advanced( 99 | CMAKE_Rust_FLAGS 100 | CMAKE_Rust_FLAGS_DEBUG 101 | CMAKE_Rust_FLAGS_MINSIZEREL 102 | CMAKE_Rust_FLAGS_RELEASE 
103 | CMAKE_Rust_FLAGS_RELWITHDEBINFO) 104 | 105 | set(CMAKE_Rust_INFORMATION_LOADED 1) 106 | 107 | -------------------------------------------------------------------------------- /cmake/CMakeTestRustCompiler.cmake: -------------------------------------------------------------------------------- 1 | set(CMAKE_Rust_COMPILER_WORKS 1 CACHE INTERNAL "") 2 | -------------------------------------------------------------------------------- /cmake/FindRust.cmake: -------------------------------------------------------------------------------- 1 | # ported from https://github.com/Devolutions/CMakeRust 2 | set(_CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ${CMAKE_FIND_ROOT_PATH_MODE_PROGRAM}) 3 | set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM BOTH) 4 | set(_CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ${CMAKE_FIND_ROOT_PATH_MODE_INCLUDE}) 5 | set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH) 6 | 7 | if(CMAKE_HOST_WIN32) 8 | set(USER_HOME "$ENV{USERPROFILE}") 9 | else() 10 | set(USER_HOME "$ENV{HOME}") 11 | endif() 12 | 13 | if(NOT DEFINED CARGO_HOME) 14 | if("$ENV{CARGO_HOME}" STREQUAL "") 15 | set(CARGO_HOME "${USER_HOME}/.cargo") 16 | else() 17 | set(CARGO_HOME "$ENV{CARGO_HOME}") 18 | endif() 19 | endif() 20 | 21 | # Find cargo executable 22 | find_program(CARGO_EXECUTABLE cargo 23 | HINTS "${CARGO_HOME}" 24 | PATH_SUFFIXES "bin") 25 | mark_as_advanced(CARGO_EXECUTABLE) 26 | 27 | # Find rustc executable 28 | find_program(RUSTC_EXECUTABLE rustc 29 | HINTS "${CARGO_HOME}" 30 | PATH_SUFFIXES "bin") 31 | mark_as_advanced(RUSTC_EXECUTABLE) 32 | 33 | # Find rustdoc executable 34 | find_program(RUSTDOC_EXECUTABLE rustdoc 35 | HINTS "${CARGO_HOME}" 36 | PATH_SUFFIXES "bin") 37 | mark_as_advanced(RUSTDOC_EXECUTABLE) 38 | 39 | # Find rust-gdb executable 40 | find_program(RUST_GDB_EXECUTABLE rust-gdb 41 | HINTS "${CARGO_HOME}" 42 | PATH_SUFFIXES "bin") 43 | mark_as_advanced(RUST_GDB_EXECUTABLE) 44 | 45 | # Find rust-lldb executable 46 | find_program(RUST_LLDB_EXECUTABLE rust-lldb 47 | HINTS "${CARGO_HOME}" 48 | 
PATH_SUFFIXES "bin") 49 | mark_as_advanced(RUST_LLDB_EXECUTABLE) 50 | 51 | # Find rustup executable 52 | find_program(RUSTUP_EXECUTABLE rustup 53 | HINTS "${CARGO_HOME}" 54 | PATH_SUFFIXES "bin") 55 | mark_as_advanced(RUSTUP_EXECUTABLE) 56 | 57 | set(RUST_FOUND FALSE CACHE INTERNAL "") 58 | 59 | if(CARGO_EXECUTABLE AND RUSTC_EXECUTABLE AND RUSTDOC_EXECUTABLE) 60 | set(RUST_FOUND TRUE CACHE INTERNAL "") 61 | 62 | set(CARGO_HOME "${CARGO_HOME}" CACHE PATH "Rust Cargo Home") 63 | 64 | execute_process(COMMAND ${RUSTC_EXECUTABLE} --version OUTPUT_VARIABLE RUSTC_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) 65 | string(REGEX REPLACE "rustc ([^ ]+) .*" "\\1" RUSTC_VERSION "${RUSTC_VERSION}") 66 | endif() 67 | 68 | if(NOT RUST_FOUND) 69 | message(FATAL_ERROR "Could not find Rust!") 70 | endif() 71 | 72 | set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ${_CMAKE_FIND_ROOT_PATH_MODE_PROGRAM}) 73 | set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ${_CMAKE_FIND_ROOT_PATH_MODE_INCLUDE}) 74 | -------------------------------------------------------------------------------- /cmake/FindSentencePiece.cmake: -------------------------------------------------------------------------------- 1 | # FindSentencePiece.cmake 2 | # 3 | # Use this module as: 4 | # 5 | # find_package(SentencePiece) 6 | # find_package(SentencePiece REQUIRED) 7 | # 8 | # This module provides the following imported targets, if found: 9 | # 10 | # SentencePiece::sentencepiece 11 | # The Google SentencePiece library 12 | # 13 | 14 | find_package(PkgConfig QUIET) 15 | if(PKG_CONFIG_FOUND) 16 | pkg_check_modules(SENTENCEPIECE QUIET sentencepiece) 17 | endif() 18 | 19 | find_path(SentencePiece_INCLUDE_DIR 20 | NAMES sentencepiece_processor.h 21 | PATH_SUFFIXES include 22 | HINTS ${SENTENCEPIECE_INCLUDE_DIRS} 23 | ) 24 | mark_as_advanced(SentencePiece_INCLUDE_DIR) 25 | 26 | find_library(SentencePiece_LIBRARY 27 | NAMES sentencepiece 28 | PATH_SUFFIXES lib 29 | HINTS ${SENTENCEPIECE_LIBRARY_DIRS} 30 | ) 31 | 
mark_as_advanced(SentencePiece_LIBRARY) 32 | 33 | 34 | include(FindPackageHandleStandardArgs) 35 | find_package_handle_standard_args( 36 | SentencePiece 37 | DEFAULT_MSG 38 | SentencePiece_LIBRARY 39 | SentencePiece_INCLUDE_DIR 40 | ) 41 | 42 | if(NOT SentencePiece_FOUND) 43 | if(SentencePiece_FIND_REQUIRED) 44 | message(FATAL_ERROR "Cannot find SentencePiece library") 45 | else() 46 | message(WARNING "SentencePiece library is not found!") 47 | endif() 48 | else() 49 | if(SentencePiece_FOUND AND NOT TARGET SentencePiece::sentencepiece) 50 | add_library(SentencePiece::sentencepiece UNKNOWN IMPORTED) 51 | set_target_properties(SentencePiece::sentencepiece PROPERTIES 52 | IMPORTED_LOCATION "${SentencePiece_LIBRARY}" 53 | INTERFACE_INCLUDE_DIRECTORIES "${SentencePiece_INCLUDE_DIR}" 54 | ) 55 | endif() 56 | endif() 57 | -------------------------------------------------------------------------------- /cmake/cargo_library.cmake: -------------------------------------------------------------------------------- 1 | include(CMakeParseArguments) 2 | 3 | # inspired by https://github.com/abseil/abseil-cpp 4 | # cc_library() 5 | # CMake function to imitate Bazel's cc_library rule. 
6 | function(cargo_library) 7 | cmake_parse_arguments( 8 | CARGO # prefix 9 | "" # options 10 | "NAME" # one value args 11 | "HDRS" # multi value args 12 | ${ARGN} 13 | ) 14 | 15 | string(REPLACE "-" "_" LIB_NAME ${CARGO_NAME}) 16 | # set(CARGO_TARGET_DIR ${CMAKE_CURRENT_BINARY_DIR}) 17 | 18 | # figure out the target triple 19 | if(WIN32) 20 | if(CMAKE_SIZEOF_VOID_P EQUAL 8) 21 | set(LIB_TARGET "x86_64-pc-windows-msvc") 22 | else() 23 | set(LIB_TARGET "i686-pc-windows-msvc") 24 | endif() 25 | elseif(ANDROID) 26 | if(ANDROID_SYSROOT_ABI STREQUAL "x86") 27 | set(LIB_TARGET "i686-linux-android") 28 | elseif(ANDROID_SYSROOT_ABI STREQUAL "x86_64") 29 | set(LIB_TARGET "x86_64-linux-android") 30 | elseif(ANDROID_SYSROOT_ABI STREQUAL "arm") 31 | set(LIB_TARGET "arm-linux-androideabi") 32 | elseif(ANDROID_SYSROOT_ABI STREQUAL "arm64") 33 | set(LIB_TARGET "aarch64-linux-android") 34 | endif() 35 | elseif(IOS) 36 | set(LIB_TARGET "universal") 37 | elseif(CMAKE_SYSTEM_NAME STREQUAL Darwin) 38 | set(LIB_TARGET "x86_64-apple-darwin") 39 | else() 40 | if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") 41 | set(LIB_TARGET "aarch64-unknown-linux-gnu") 42 | elseif(CMAKE_SIZEOF_VOID_P EQUAL 8) 43 | set(LIB_TARGET "x86_64-unknown-linux-gnu") 44 | else() 45 | set(LIB_TARGET "i686-unknown-linux-gnu") 46 | endif() 47 | endif() 48 | 49 | if(CMAKE_BUILD_TYPE STREQUAL "Debug") 50 | set(LIB_BUILD_TYPE "debug") 51 | else() 52 | set(LIB_BUILD_TYPE "release") 53 | endif() 54 | 55 | if(IOS) 56 | set(CARGO_ARGS "lipo") 57 | else() 58 | set(CARGO_ARGS "build") 59 | list(APPEND CARGO_ARGS "--target" ${LIB_TARGET}) 60 | endif() 61 | 62 | if(${LIB_BUILD_TYPE} STREQUAL "release") 63 | list(APPEND CARGO_ARGS "--release") 64 | endif() 65 | 66 | file(GLOB_RECURSE LIB_SOURCES "*.rs") 67 | 68 | set(CARGO_ENV_COMMAND ${CMAKE_COMMAND} -E env "CARGO_TARGET_DIR=${CMAKE_CURRENT_BINARY_DIR}") 69 | 70 | # build the library target with cargo 71 | set(STATIC_LIB_NAME 72 | 
"${CMAKE_STATIC_LIBRARY_PREFIX}${LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}") 73 | set(LIB_FILE 74 | "${CMAKE_CURRENT_BINARY_DIR}/${LIB_TARGET}/${LIB_BUILD_TYPE}/${STATIC_LIB_NAME}") 75 | 76 | message(STATUS "running: ${CARGO_ENV_COMMAND} ${CARGO_EXECUTABLE} ARGS ${CARGO_ARGS}") 77 | 78 | add_custom_command( 79 | OUTPUT ${LIB_FILE} 80 | COMMAND ${CARGO_ENV_COMMAND} ${CARGO_EXECUTABLE} ARGS ${CARGO_ARGS} 81 | WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} 82 | DEPENDS ${LIB_SOURCES} 83 | COMMENT "Building cargo library ${LIB_FILE}" 84 | ) 85 | add_custom_target(${CARGO_NAME}_target ALL DEPENDS ${LIB_FILE}) 86 | 87 | # add the library target 88 | add_library(${CARGO_NAME} STATIC IMPORTED GLOBAL) 89 | add_dependencies(${CARGO_NAME} ${CARGO_NAME}_target) 90 | set_target_properties(${CARGO_NAME} PROPERTIES 91 | IMPORTED_LOCATION ${LIB_FILE} 92 | ) 93 | target_sources(${CARGO_NAME} INTERFACE ${CARGO_HDRS}) 94 | endfunction() 95 | -------------------------------------------------------------------------------- /cmake/cargo_shared_library.cmake: -------------------------------------------------------------------------------- 1 | include(CMakeParseArguments) 2 | 3 | # inspired by https://github.com/abseil/abseil-cpp 4 | # cc_library() 5 | # CMake function to imitate Bazel's cc_library rule. 
function(cargo_shared_library)
  # cargo_shared_library()
  # Builds a Rust cargo crate as a shared library and exposes it to CMake as
  # an IMPORTED target named ${NAME}.
  #
  # Parameters:
  #   NAME: name of the target (also the cargo package name; '-' maps to '_')
  #   HDRS: list of public header files describing the crate's C interface
  cmake_parse_arguments(
    CARGO  # prefix
    ""     # options
    "NAME" # one value args
    "HDRS" # multi value args
    ${ARGN}
  )

  # cargo emits lib<name> with '-' replaced by '_' in the crate name.
  string(REPLACE "-" "_" LIB_NAME ${CARGO_NAME})

  # Figure out the Rust target triple for the current platform.
  if(WIN32)
    if(CMAKE_SIZEOF_VOID_P EQUAL 8)
      set(LIB_TARGET "x86_64-pc-windows-msvc")
    else()
      set(LIB_TARGET "i686-pc-windows-msvc")
    endif()
  elseif(ANDROID)
    if(ANDROID_SYSROOT_ABI STREQUAL "x86")
      set(LIB_TARGET "i686-linux-android")
    elseif(ANDROID_SYSROOT_ABI STREQUAL "x86_64")
      set(LIB_TARGET "x86_64-linux-android")
    elseif(ANDROID_SYSROOT_ABI STREQUAL "arm")
      set(LIB_TARGET "arm-linux-androideabi")
    elseif(ANDROID_SYSROOT_ABI STREQUAL "arm64")
      set(LIB_TARGET "aarch64-linux-android")
    endif()
  elseif(IOS)
    set(LIB_TARGET "universal")
  elseif(CMAKE_SYSTEM_NAME STREQUAL Darwin)
    # Fix: this used to be hard-coded to x86_64-apple-darwin, which selects
    # the wrong cargo target (and output path) on Apple Silicon hosts.
    # Mirror the CMAKE_SYSTEM_PROCESSOR check done for Linux below.
    if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm64|aarch64)$")
      set(LIB_TARGET "aarch64-apple-darwin")
    else()
      set(LIB_TARGET "x86_64-apple-darwin")
    endif()
  else()
    if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
      set(LIB_TARGET "aarch64-unknown-linux-gnu")
    elseif(CMAKE_SIZEOF_VOID_P EQUAL 8)
      set(LIB_TARGET "x86_64-unknown-linux-gnu")
    else()
      set(LIB_TARGET "i686-unknown-linux-gnu")
    endif()
  endif()

  # Map the CMake build type onto cargo's debug/release profiles.
  if(CMAKE_BUILD_TYPE STREQUAL "Debug")
    set(LIB_BUILD_TYPE "debug")
  else()
    set(LIB_BUILD_TYPE "release")
  endif()

  # 'cargo lipo' builds a universal library on iOS; plain 'cargo build'
  # everywhere else.
  if(IOS)
    set(CARGO_ARGS "lipo")
  else()
    set(CARGO_ARGS "build")
    list(APPEND CARGO_ARGS "--target" ${LIB_TARGET})
  endif()

  if(${LIB_BUILD_TYPE} STREQUAL "release")
    list(APPEND CARGO_ARGS "--release")
  endif()

  # Rebuild whenever any Rust source in the crate changes.
  file(GLOB_RECURSE LIB_SOURCES "*.rs")

  # Redirect cargo's output into the CMake binary dir so the location of the
  # produced library is predictable.
  set(CARGO_ENV_COMMAND ${CMAKE_COMMAND} -E env "CARGO_TARGET_DIR=${CMAKE_CURRENT_BINARY_DIR}")

  # Compute the path of the shared library cargo will produce.
  set(SHARED_LIB_NAME
      "${CMAKE_SHARED_LIBRARY_PREFIX}${LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}")
  set(LIB_FILE
      "${CMAKE_CURRENT_BINARY_DIR}/${LIB_TARGET}/${LIB_BUILD_TYPE}/${SHARED_LIB_NAME}")

  # Log the exact cargo invocation, consistent with cargo_library().
  message(STATUS "running: ${CARGO_ENV_COMMAND} ${CARGO_EXECUTABLE} ARGS ${CARGO_ARGS}")

  add_custom_command(
    OUTPUT ${LIB_FILE}
    COMMAND ${CARGO_ENV_COMMAND} ${CARGO_EXECUTABLE} ARGS ${CARGO_ARGS}
    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
    DEPENDS ${LIB_SOURCES}
    COMMENT "Building cargo library ${LIB_FILE}"
  )
  add_custom_target(${CARGO_NAME}_target ALL DEPENDS ${LIB_FILE})

  # Wrap the produced file in an IMPORTED shared library target.
  add_library(${CARGO_NAME} SHARED IMPORTED GLOBAL)
  add_dependencies(${CARGO_NAME} ${CARGO_NAME}_target)
  set_target_properties(${CARGO_NAME} PROPERTIES
    IMPORTED_LOCATION ${LIB_FILE}
    INTERFACE_INCLUDE_DIRECTORIES ${CMAKE_CURRENT_SOURCE_DIR}
    # The library is loaded by its full path rather than by soname.
    IMPORTED_NO_SONAME TRUE
  )
endfunction()
-------------------------------------------------------------------------------- /cmake/cc_binary.cmake: -------------------------------------------------------------------------------- 1 | include(CMakeParseArguments) 2 | 3 | # inspired by https://github.com/abseil/abseil-cpp 4 | # cc_binary() 5 | # CMake function to imitate Bazel's cc_binary rule.
#
# Parameters:
# NAME: name of target
# HDRS: List of public header files for the library
# SRCS: List of source files for the library
# COPTS: List of private compile options
# DEFINES: List of public defines
# LINKOPTS: List of link options
# DEPS: List of other libraries to be linked in to the binary targets
#
# cc_library(
#   NAME
#     awesome
#   HDRS
#     "a.h"
#   SRCS
#     "a.cc"
# )
# cc_binary(
#   NAME
#     fantastic
#   SRCS
#     "b.cc"
#   DEPS
#     :awesome
# )
#
function(cc_binary)
  # Imitates Bazel's cc_binary(): declare an executable from SRCS/HDRS,
  # link it against DEPS, and register a Bazel-style ":name" alias.
  cmake_parse_arguments(
    CC_BINARY                                # prefix
    ""                                       # options
    "NAME"                                   # one value args
    "HDRS;SRCS;COPTS;DEFINES;LINKOPTS;DEPS"  # multi value args
    ${ARGN}
  )

  add_executable(${CC_BINARY_NAME} "")
  target_sources(${CC_BINARY_NAME}
    PRIVATE ${CC_BINARY_SRCS} ${CC_BINARY_HDRS}
  )
  target_link_libraries(${CC_BINARY_NAME}
    PUBLIC
      ${CC_BINARY_DEPS}
    PRIVATE
      ${CC_BINARY_LINKOPTS}
  )
  # NOTE(review): the include-dir argument below reads "$" in this dump --
  # a generator expression (likely $<BUILD_INTERFACE:...>) appears to have
  # been truncated during extraction; confirm against the upstream file.
  target_include_directories(${CC_BINARY_NAME}
    PUBLIC
      "$"
  )
  target_compile_options(${CC_BINARY_NAME} PRIVATE ${CC_BINARY_COPTS})
  target_compile_definitions(${CC_BINARY_NAME} PUBLIC ${CC_BINARY_DEFINES})

  # Bazel-style alias so dependents can refer to :fantastic.
  add_executable(:${CC_BINARY_NAME} ALIAS ${CC_BINARY_NAME})
endfunction()
-------------------------------------------------------------------------------- /cmake/cc_library.cmake: -------------------------------------------------------------------------------- 1 | include(CMakeParseArguments) 2 | 3 | # inspired by https://github.com/abseil/abseil-cpp 4 | # cc_library() 5 | # CMake function to imitate Bazel's cc_library rule.
6 | # 7 | # Parameters: 8 | # NAME: name of target 9 | # HDRS: List of public header files for the library 10 | # SRCS: List of source files for the library 11 | # DEPS: List of other libraries to be linked in to the binary targets 12 | # COPTS: List of private compile options 13 | # DEFINES: List of public defines 14 | # LINKOPTS: List of link options 15 | # 16 | # cc_library( 17 | # NAME 18 | # awesome 19 | # HDRS 20 | # "a.h" 21 | # SRCS 22 | # "a.cc" 23 | # ) 24 | # cc_library( 25 | # NAME 26 | # fantastic_lib 27 | # SRCS 28 | # "b.cc" 29 | # DEPS 30 | # :awesome 31 | # ) 32 | # 33 | function(cc_library) 34 | cmake_parse_arguments( 35 | CC_LIB # prefix 36 | "TESTONLY" # options 37 | "NAME" # one value args 38 | "HDRS;SRCS;COPTS;DEFINES;LINKOPTS;DEPS;INCLUDES" # multi value args 39 | ${ARGN} 40 | ) 41 | 42 | if(CC_LIB_TESTONLY AND (NOT BUILD_TESTING)) 43 | return() 44 | endif() 45 | 46 | # Check if this is a header only library 47 | set(_CC_SRCS "${CC_LIB_SRCS}") 48 | foreach(src_file IN LISTS _CC_SRCS) 49 | if(${src_file} MATCHES ".*\\.(h|inc)") 50 | list(REMOVE_ITEM _CC_SRCS "${src_file}") 51 | endif() 52 | endforeach() 53 | 54 | if(_CC_SRCS STREQUAL "") 55 | set(CC_LIB_IS_INTERFACE 1) 56 | else() 57 | set(CC_LIB_IS_INTERFACE 0) 58 | endif() 59 | 60 | if(NOT CC_LIB_IS_INTERFACE) 61 | add_library(${CC_LIB_NAME} STATIC) 62 | target_sources(${CC_LIB_NAME} 63 | PRIVATE ${CC_LIB_SRCS} ${CC_LIB_HDRS}) 64 | target_link_libraries(${CC_LIB_NAME} 65 | PUBLIC ${CC_LIB_DEPS} 66 | PRIVATE ${CC_LIB_LINKOPTS} 67 | ) 68 | target_include_directories(${CC_LIB_NAME} 69 | PUBLIC 70 | "$" 71 | ${CC_LIB_INCLUDES} 72 | ) 73 | target_compile_options(${CC_LIB_NAME} PRIVATE ${CC_LIB_COPTS}) 74 | target_compile_definitions(${CC_LIB_NAME} PUBLIC ${CC_LIB_DEFINES}) 75 | else() 76 | # Generating header only library 77 | add_library(${CC_LIB_NAME} INTERFACE) 78 | target_include_directories(${CC_LIB_NAME} 79 | INTERFACE 80 | "$" 81 | ${CC_LIB_INCLUDES} 82 | ) 83 | 84 | 
target_link_libraries(${CC_LIB_NAME} 85 | INTERFACE ${CC_LIB_DEPS} ${CC_LIB_LINKOPTS} 86 | ) 87 | target_compile_definitions(${CC_LIB_NAME} INTERFACE ${CC_LIB_DEFINES}) 88 | endif() 89 | 90 | # add alias for the library target 91 | add_library(:${CC_LIB_NAME} ALIAS ${CC_LIB_NAME}) 92 | endfunction() 93 | -------------------------------------------------------------------------------- /cmake/cc_test.cmake: -------------------------------------------------------------------------------- 1 | include(CMakeParseArguments) 2 | 3 | # inspired by https://github.com/abseil/abseil-cpp 4 | # cc_test() 5 | # CMake function to imitate Bazel's cc_test rule. 6 | # 7 | # Parameters: 8 | # NAME: name of target (see Usage below) 9 | # SRCS: List of source files for the binary 10 | # DEPS: List of other libraries to be linked in to the binary targets 11 | # COPTS: List of private compile options 12 | # LINKOPTS: List of link options 13 | # ARGS: Command line arguments to test case 14 | # 15 | # Usage: 16 | # cc_library( 17 | # NAME 18 | # awesome 19 | # HDRS 20 | # "a.h" 21 | # SRCS 22 | # "a.cc" 23 | # ) 24 | # 25 | # cc_test( 26 | # NAME 27 | # awesome_test 28 | # SRCS 29 | # "awesome_test.cc" 30 | # DEPS 31 | # :awesome 32 | # GTest::gmock 33 | # ) 34 | # 35 | function(cc_test) 36 | if(NOT BUILD_TESTING) 37 | return() 38 | endif() 39 | 40 | cmake_parse_arguments( 41 | CC_TEST # prefix 42 | "" # options 43 | "NAME" # one value args 44 | "SRCS;COPTS;LINKOPTS;DEPS;INCLUDES;ARGS;DATA" # multi value args 45 | ${ARGN} 46 | ) 47 | 48 | # place test data in build directory 49 | if(CC_TEST_DATA) 50 | foreach(data ${CC_TEST_DATA}) 51 | configure_file(${data} ${CMAKE_CURRENT_BINARY_DIR}/${data} COPYONLY) 52 | endforeach() 53 | endif() 54 | 55 | add_executable(${CC_TEST_NAME}) 56 | target_sources(${CC_TEST_NAME} PRIVATE ${CC_TEST_SRCS}) 57 | target_include_directories(${CC_TEST_NAME} 58 | PUBLIC 59 | "$" 60 | ${CC_TEST_INCLUDES} 61 | ) 62 | 63 | target_compile_options(${CC_TEST_NAME} 64 | 
PRIVATE ${CC_TEST_COPTS} 65 | ) 66 | 67 | target_link_libraries(${CC_TEST_NAME} 68 | PUBLIC ${CC_TEST_DEPS} 69 | PRIVATE ${CC_TEST_LINKOPTS} 70 | ) 71 | 72 | gtest_add_tests( 73 | TARGET ${CC_TEST_NAME} 74 | EXTRA_ARGS ${CC_TEST_ARGS} 75 | WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} 76 | ) 77 | endfunction() 78 | -------------------------------------------------------------------------------- /cmake/grpc_proto_library.cmake: -------------------------------------------------------------------------------- 1 | include(CMakeParseArguments) 2 | include(CMakePrintHelpers) 3 | 4 | # inspired by https://github.com/abseil/abseil-cpp 5 | # grpc_proto_library() 6 | # CMake function to imitate Bazel's grpc_proto_library rule. 7 | # 8 | # Parameters: 9 | # NAME: name of target 10 | # SRCS: List of proto source files for the library 11 | # DEPS: List of other libraries to be linked in to the binary targets 12 | # COPTS: List of private compile options 13 | # DEFINES: List of public defines 14 | # LINKOPTS: List of link options 15 | # 16 | # grpc_proto_library( 17 | # NAME 18 | # proto_lib 19 | # SRCS 20 | # "b.proto" 21 | # ) 22 | # 23 | function(grpc_proto_library) 24 | cmake_parse_arguments( 25 | PROTO_LIB # prefix 26 | "" # options 27 | "NAME" # one value args 28 | "SRCS;COPTS;DEFINES;LINKOPTS;DEPS" # multi value args 29 | ${ARGN} 30 | ) 31 | 32 | # Add Library target with protobuf sources 33 | add_library(${PROTO_LIB_NAME} ${PROTO_LIB_SRCS}) 34 | 35 | # Link dependencies 36 | target_link_libraries(${PROTO_LIB_NAME} 37 | PUBLIC 38 | protobuf::libprotobuf 39 | gRPC::grpc 40 | gRPC::grpc++ 41 | gRPC::grpc++_reflection 42 | PRIVATE 43 | ${PROTO_LIB_DEPS} 44 | ) 45 | 46 | # Set include directories 47 | target_include_directories(${PROTO_LIB_NAME} 48 | PUBLIC 49 | ${Protobuf_INCLUDE_DIRS} 50 | ${CMAKE_CURRENT_BINARY_DIR} 51 | ) 52 | 53 | # Set compile options 54 | target_compile_options(${PROTO_LIB_NAME} 55 | PRIVATE 56 | ${PROTO_LIB_COPTS} 57 | -Wno-unused-parameter 58 | ) 59 
| 60 | # Set compile definitions 61 | target_compile_definitions(${PROTO_LIB_NAME} 62 | PUBLIC ${PROTO_LIB_DEFINES} 63 | ) 64 | 65 | # Compile protobuf and grpc files 66 | protobuf_generate( 67 | TARGET ${PROTO_LIB_NAME} 68 | IMPORT_DIRS . 69 | LANGUAGE cpp) 70 | 71 | # Get grpc_cpp_plugin location 72 | get_target_property(grpc_cpp_plugin_location gRPC::grpc_cpp_plugin LOCATION) 73 | 74 | # Generate grpc files from protobuf files using grpc_cpp_plugin 75 | protobuf_generate( 76 | TARGET ${PROTO_LIB_NAME} 77 | LANGUAGE grpc 78 | IMPORT_DIRS . 79 | GENERATE_EXTENSIONS .grpc.pb.h .grpc.pb.cc 80 | PLUGIN "protoc-gen-grpc=${grpc_cpp_plugin_location}" 81 | ) 82 | 83 | # Set alias for library 84 | add_library(grpc_proto::${PROTO_LIB_NAME} ALIAS ${PROTO_LIB_NAME}) 85 | endfunction() 86 | -------------------------------------------------------------------------------- /cmake/proto_library.cmake: -------------------------------------------------------------------------------- 1 | include(CMakeParseArguments) 2 | include(CMakePrintHelpers) 3 | 4 | # inspired by https://github.com/abseil/abseil-cpp 5 | # proto_library() 6 | # CMake function to imitate Bazel's proto_library rule. 
7 | # 8 | # Parameters: 9 | # NAME: name of target 10 | # SRCS: List of proto source files for the library 11 | # DEPS: List of other libraries to be linked in to the binary targets 12 | # COPTS: List of private compile options 13 | # DEFINES: List of public defines 14 | # LINKOPTS: List of link options 15 | # 16 | # cc_library( 17 | # NAME 18 | # awesome 19 | # HDRS 20 | # "a.h" 21 | # SRCS 22 | # "a.cc" 23 | # ) 24 | # proto_library( 25 | # NAME 26 | # proto_lib 27 | # SRCS 28 | # "b.proto" 29 | # DEPS 30 | # :awesome 31 | # ) 32 | # 33 | function(proto_library) 34 | # parse arguments and set variables 35 | cmake_parse_arguments( 36 | PROTO_LIB # prefix 37 | "" # options 38 | "NAME" # one value args 39 | "SRCS;COPTS;DEFINES;LINKOPTS;DEPS" # multi value args 40 | ${ARGN} 41 | ) 42 | # generate cpp and hpp files from proto files using protoc compiler 43 | protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS ${PROTO_LIB_SRCS}) 44 | 45 | add_library(${PROTO_LIB_NAME} STATIC) 46 | target_sources(${PROTO_LIB_NAME} 47 | PRIVATE ${PROTO_SRCS} ${PROTO_HDRS} 48 | ) 49 | 50 | target_link_libraries(${PROTO_LIB_NAME} 51 | PUBLIC protobuf::libprotobuf 52 | ) 53 | target_include_directories(${PROTO_LIB_NAME} 54 | PUBLIC 55 | ${Protobuf_INCLUDE_DIRS} 56 | ${CMAKE_CURRENT_BINARY_DIR} 57 | ) 58 | target_compile_options(${PROTO_LIB_NAME} 59 | PRIVATE 60 | ${PROTO_LIB_COPTS} 61 | -Wno-unused-parameter 62 | ) 63 | target_compile_definitions(${PROTO_LIB_NAME} 64 | PUBLIC 65 | ${PROTO_LIB_DEFINES} 66 | ) 67 | 68 | add_library(proto::${PROTO_LIB_NAME} ALIAS ${PROTO_LIB_NAME}) 69 | endfunction() 70 | 71 | -------------------------------------------------------------------------------- /cmake/static_analyzers.cmake: -------------------------------------------------------------------------------- 1 | option(ENABLE_CPPCHECK "Enable static analysis with cppcheck" OFF) 2 | option(ENABLE_CLANG_TIDY "Enable static analysis with clang-tidy" OFF) 3 | option(ENABLE_INCLUDE_WHAT_YOU_USE "Enable static 
analysis with include-what-you-use" OFF) 4 | 5 | if(ENABLE_CPPCHECK) 6 | find_program(CPPCHECK cppcheck) 7 | 8 | if(CPPCHECK) 9 | set(CMAKE_CXX_CPPCHECK ${CPPCHECK} 10 | --suppressions-list=${CMAKE_CURRENT_SOURCE_DIR}/.cppcheck-suppress 11 | --enable=all 12 | --inconclusive 13 | --inline-suppr) 14 | message(STATUS "Using cppcheck: " ${CPPCHECK}) 15 | else() 16 | message(SEND_ERROR "cppcheck requested but executable not found") 17 | endif() 18 | endif() 19 | 20 | if(ENABLE_CLANG_TIDY) 21 | find_program(CLANGTIDY clang-tidy) 22 | 23 | if(CLANGTIDY) 24 | set(CMAKE_CXX_CLANG_TIDY ${CLANGTIDY} -extra-arg=-Wno-unknown-warning-option) 25 | message(STATUS "Using clang-tidy: " ${CLANGTIDY}) 26 | else() 27 | message(SEND_ERROR "clang-tidy requested but executable not found") 28 | endif() 29 | endif() 30 | 31 | if(ENABLE_INCLUDE_WHAT_YOU_USE) 32 | find_program(INCLUDE_WHAT_YOU_USE include-what-you-use) 33 | 34 | if(INCLUDE_WHAT_YOU_USE) 35 | set(CMAKE_CXX_INCLUDE_WHAT_YOU_USE ${INCLUDE_WHAT_YOU_USE}) 36 | message(STATUS "Using include-what-you-use: " ${INCLUDE_WHAT_YOU_USE}) 37 | else() 38 | message(SEND_ERROR "include-what-you-use requested but executable not found") 39 | endif() 40 | endif() 41 | 42 | -------------------------------------------------------------------------------- /docs/assets/service_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm-service/e98847eb35a809b32bd6756c3f4b49a4facc425c/docs/assets/service_arch.png -------------------------------------------------------------------------------- /docs/assets/wechat_qrcode1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm-service/e98847eb35a809b32bd6756c3f4b49a4facc425c/docs/assets/wechat_qrcode1.png -------------------------------------------------------------------------------- /docs/assets/wechat_qrcode2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm-service/e98847eb35a809b32bd6756c3f4b49a4facc425c/docs/assets/wechat_qrcode2.png -------------------------------------------------------------------------------- /docs/assets/xllm_service_title.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm-service/e98847eb35a809b32bd6756c3f4b49a4facc425c/docs/assets/xllm_service_title.png -------------------------------------------------------------------------------- /docs/en/getting_started.md: -------------------------------------------------------------------------------- 1 | # Compilation and Execution 2 | 3 | ## Container 4 | First, download the image we provide: 5 | ```bash 6 | docker pull xllm-ai/xllm-0.6.0-dev-800I-A3-py3.11-openeuler24.03-lts-aarch64 7 | ``` 8 | Then create the corresponding container: 9 | ```bash 10 | sudo docker run -it --ipc=host -u 0 --privileged --name mydocker --network=host --device=/dev/davinci0 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -v /var/queue_schedule:/var/queue_schedule -v /mnt/cfs/9n-das-admin/llm_models:/mnt/cfs/9n-das-admin/llm_models -v /usr/local/Ascend/driver:/usr/local/Ascend/driver -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ -v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi -v /usr/local/sbin/:/usr/local/sbin/ -v /var/log/npu/conf/slog/slog.conf:/var/log/npu/conf/slog/slog.conf -v /var/log/npu/slog/:/var/log/npu/slog -v /export/home:/export/home -w /export/home -v ~/.ssh:/root/.ssh -v /var/log/npu/profiling/:/var/log/npu/profiling -v /var/log/npu/dump/:/var/log/npu/dump -v /home/:/home/ -v /runtime/:/runtime/ xllm-ai:xllm-0.6.0-dev-800I-A3-py3.11-openeuler24.03-lts-aarch64 11 | ``` 12 | 13 | ## Compilation 14 | ```bash 15 | git clone https://github.com/jd-opensource/xllm-service 16 | cd xllm_service 17 | git 
submodule init 18 | git submodule update 19 | ``` 20 | 21 | ### etcd Installation 22 | Use the official [installation script](https://github.com/etcd-io/etcd/releases) provided by etcd. Its default install path is `/tmp/etcd-download-test/etcd`; you can either edit the path in the script or move the binary afterwards: 23 | ```bash 24 | mv /tmp/etcd-download-test/etcd /path/to/your/etcd 25 | ``` 26 | 27 | ### Adding a Patch 28 | `etcd_cpp_apiv3` depends on the cpprest static library, but cpprest is built as a dynamic library by default. Therefore, you need to add a patch to the CMakeLists.txt of cpprest: 29 | ```bash 30 | bash prepare.sh 31 | ``` 32 | 33 | ### xLLM Service Compilation 34 | ```bash 35 | mkdir -p build 36 | cd build 37 | cmake .. 38 | make -j 8 39 | cd .. 40 | ``` 41 | !!! warning "Possible Errors" 42 | You may encounter installation errors about `boost-locale` and `boost-interprocess`: `vcpkg-src/packages/boost-locale_x64-linux/include: No such file or directory`,`/vcpkg-src/packages/boost-interprocess_x64-linux/include: No such file or directory` 43 | We use `vcpkg` to reinstall these packages: 44 | ```bash 45 | /path/to/vcpkg remove boost-locale boost-interprocess 46 | /path/to/vcpkg install boost-locale:x64-linux 47 | /path/to/vcpkg install boost-interprocess:x64-linux 48 | ``` 49 | 50 | ## Execution 51 | 1. First, start the etcd service: 52 | ```bash 53 | ./etcd-download-test/etcd --listen-peer-urls 'http://localhost:2390' --listen-client-urls 'http://localhost:2389' --advertise-client-urls 'http://localhost:2391' 54 | ``` 55 | 56 | 2. Then start the xllm-service service: 57 | ```bash 58 | ENABLE_DECODE_RESPONSE_TO_SERVICE=0 \ 59 | ENABLE_XLLM_DEBUG_LOG=1 \ 60 | ./build/xllm_service/xllm_master_serving \ 61 | --etcd_addr="127.0.0.1:2389" \ 62 | --http_server_port=9888 \ 63 | --rpc_server_port=9889 \ 64 | --tokenizer_path /path/to/tokenizer_config/ 65 | ``` 66 | 67 | xllm-service needs to start an http service and an rpc service. The http service is used to receive and process user requests, and the rpc service is used to interact with xllm instances.
68 | 69 | The complete usage process needs to be used with xllm, please refer to the link: [xLLM PD Disaggregated Deployment](https://xllm.readthedocs.io/zh-cn/latest/zh/getting_started/PD_disagg/) 70 | 71 | ### service Parameters 72 | http service:It is used to receive and process user requests. 73 | | Parameter | Description | Default Value | 74 | | --- | --- | --- | 75 | | http_server_host | http service address | "" | 76 | | http_server_port | http service port | 8888 | 77 | | http_server_idle_timeout_s | http service timeout | -1 | 78 | | http_server_num_threads | http service thread number | 32 | 79 | | http_server_max_concurrency | http service max concurrency | 128 | 80 | 81 | rpc service:It is used to interact with xllm, manage the status of xllm instance clusters, etc. 82 | | Parameter | Description | Default Value | 83 | | --- | --- | --- | 84 | | rpc_server_host | rpc service address | "" | 85 | | rpc_server_port | rpc service port | 8889 | 86 | | rpc_server_idle_timeout_s | rpc service timeout | -1 | 87 | | rpc_server_num_threads | rpc service thread number | 32 | 88 | | rpc_server_max_concurrency | rpc service max concurrency | 128 | 89 | 90 | Environment Variables: 91 | ENABLE_DECODE_RESPONSE_TO_SERVICE: In the PD disaggregated scenario, whether to return the decoding result to the service directly(without forwarding through the P instance), 0 means "no", 1 means "yes". 92 | ENABLE_XLLM_DEBUG_LOG: Whether to enable xllm debug log, 0 means "no", 1 means "yes". 93 | -------------------------------------------------------------------------------- /docs/en/overview.md: -------------------------------------------------------------------------------- 1 | --- 2 | hide: 3 | - navigation 4 | --- 5 | 6 |

7 | xLLM-service 8 |

9 | 10 | ## 1. Project Overview 11 | 12 | **xLLM-service** is a service-layer framework developed based on the **xLLM** inference engine, providing efficient, fault-tolerant, and flexible LLM inference services for clustered deployment. 13 | 14 | xLLM-service targets to address key challenges in enterprise-level service scenarios: 15 | 16 | - How to ensure the SLA of online services and improve resource utilization of offline tasks in a hybrid online-offline deployment environment. 17 | 18 | - How to react to changing request loads in actual businesses, such as fluctuations in input/output lengths. 19 | 20 | - Resolving performance bottlenecks of multimodal model requests. 21 | 22 | - Ensuring high reliability of computing instances. 23 | 24 | #### Background 25 | 26 | LLM with parameter scales ranging from tens of billions to trillions are being rapidly deployed in core business scenarios such as intelligent customer service, real-time recommendation, and content generation. Efficient support for domestic computing hardware has become a core requirement for low-cost inference deployment. Existing inference engines struggle to effectively adapt to the architectural characteristics of dedicated accelerators like domestic chips. Performance issues such as low utilization of computing units, load imbalance and communication overhead bottlenecks under the MoE architecture, and difficulties in kv cache management have restricted the efficient inference of requests and the scalability of the system. The xLLM-service + xLLM inference engine improves the efficiency of the entire performance link and currently supports JD\.com's online services across multiple scenarios and with multiple models. 27 | 28 | --- 29 | 30 | ## 2. Overall Architecture 31 | The overall architecture of xLLM-service is shown in the figure below: 32 | 33 | ![1](../assets/service_arch.png) 34 | 35 | ## 3. 
Core Components 36 | 37 | ### ETCD Cluster 38 | It is used for metadata management, including the storage and management of metadata such as models, xllm instances, and requests. It also provides xllm node registration and discovery services. 39 | 40 | ### Fault Tolerance 41 | xLLM-service provides fault tolerance management to ensure service quality and stability. 42 | 43 | ### Global Scheduler 44 | It implements globally aware scheduling. Based on the current system status, it accurately dispatches requests to the optimal instances for execution, effectively improving the overall service response efficiency and resource utilization. 45 | 46 | ### Global KV Cache Manager 47 | It is responsible for global KV Cache management. Its core capabilities include distributed KV cache awareness, Prefix matching, and dynamic migration of KV Cache, which optimize the efficiency of cache resource usage. 48 | 49 | ### Instance Manager 50 | It focuses on the full-lifecycle management of instances. All xllm instances must register to service after startup. Based on preset policies, the module provides support for instances such as scheduling adaptation and fault tolerance handling. 51 | 52 | ### Event Plane 53 | As the metrics and event hub, it receives Metrics data reported by various instances, uniformly collects and organizes statistical indicators, and provides data support for decisions such as service scheduling, fault tolerance, and scaling. 54 | 55 | ### Planner 56 | It undertakes the functions of strategy analysis and decision-making. Based on the Metrics data reported by the Event Plane (including instance runtime indicators, machine load indicators, etc.), it analyzes the service scaling needs and the necessity of expanding hot instances, and outputs resource adjustment and instance optimization strategies. 
57 | -------------------------------------------------------------------------------- /docs/zh/getting_started.md: -------------------------------------------------------------------------------- 1 | # 编译与运行 2 | 3 | ## 容器 4 | 首先下载我们提供的镜像: 5 | ```bash 6 | docker pull xllm-ai/xllm-0.6.0-dev-800I-A3-py3.11-openeuler24.03-lts-aarch64 7 | ``` 8 | 然后创建对应的容器 9 | ```bash 10 | sudo docker run -it --ipc=host -u 0 --privileged --name mydocker --network=host --device=/dev/davinci0 --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -v /var/queue_schedule:/var/queue_schedule -v /mnt/cfs/9n-das-admin/llm_models:/mnt/cfs/9n-das-admin/llm_models -v /usr/local/Ascend/driver:/usr/local/Ascend/driver -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ -v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi -v /usr/local/sbin/:/usr/local/sbin/ -v /var/log/npu/conf/slog/slog.conf:/var/log/npu/conf/slog/slog.conf -v /var/log/npu/slog/:/var/log/npu/slog -v /export/home:/export/home -w /export/home -v ~/.ssh:/root/.ssh -v /var/log/npu/profiling/:/var/log/npu/profiling -v /var/log/npu/dump/:/var/log/npu/dump -v /home/:/home/ -v /runtime/:/runtime/ xllm-ai:xllm-0.6.0-dev-800I-A3-py3.11-openeuler24.03-lts-aarch64 11 | ``` 12 | 13 | ## 编译 14 | ```bash 15 | git clone https://github.com/jd-opensource/xllm-service 16 | cd xllm_service 17 | git submodule init 18 | git submodule update 19 | ``` 20 | 21 | ### etcd安装 22 | 使用etcd官方提供的[安装脚本](https://github.com/etcd-io/etcd/releases)进行安装,其脚本提供的默认安装路径是`/tmp/etcd-download-test/etcd`,我们可以手动修改其脚本中的安装路径,也可以运行完脚本之后手动迁移: 23 | ```bash 24 | mv /tmp/etcd-download-test/etcd /path/to/your/etcd 25 | ``` 26 | 27 | ### 添加补丁 28 | etcd_cpp_apiv3 依赖 cpprest 静态库,但 cpprest 编译产生的是动态库,因此需要给 cpprest 的 CMakeLists.txt 加一个补丁: 29 | ```bash 30 | bash prepare.sh 31 | ``` 32 | 33 | ### xLLM Service编译 34 | 再执行编译: 35 | ```bash 36 | mkdir -p build 37 | cd build 38 | cmake .. 39 | make -j 8 40 | cd .. 41 | ``` 42 | !!! 
warning "可能的错误" 43 | 这里能会遇到关于`boost-locale`和`boost-interprocess`的安装错误:`vcpkg-src/packages/boost-locale_x64-linux/include: No such file or directory`,`/vcpkg-src/packages/boost-interprocess_x64-linux/include: No such file or directory` 44 | 我们使用`vcpkg`重新安装这些包: 45 | ```bash 46 | /path/to/vcpkg remove boost-locale boost-interprocess 47 | /path/to/vcpkg install boost-locale:x64-linux 48 | /path/to/vcpkg install boost-interprocess:x64-linux 49 | ``` 50 | 51 | ## 运行 52 | 1. 首先需要启动etcd服务: 53 | ```bash 54 | ./etcd-download-test/etcd --listen-peer-urls 'http://localhost:2390' --listen-client-urls 'http://localhost:2389' --advertise-client-urls 'http://localhost:2391' 55 | ``` 56 | 57 | 2. 然后启动service服务: 58 | ```bash 59 | ENABLE_DECODE_RESPONSE_TO_SERVICE=0 \ 60 | ENABLE_XLLM_DEBUG_LOG=1 \ 61 | ./build/xllm_service/xllm_master_serving \ 62 | --etcd_addr="127.0.0.1:2389" \ 63 | --http_server_port=9888 \ 64 | --rpc_server_port=9889 \ 65 | --tokenizer_path /path/to/tokenizer_config/ 66 | ``` 67 | 68 | xllm-service需要启动一个http服务和一个rpc服务,http服务用于对外接收与处理用户请求,rpc服务用于和xllm实例进行交互。 69 | 70 | 完整的使用流程需要结合xllm一起使用,请查看链接: [xLLM PD分离部署](https://xllm.readthedocs.io/zh-cn/latest/zh/getting_started/PD_disagg/) 71 | 72 | ### service参数 73 | http服务:用于对外接收以及处理用户请求。 74 | | 参数 | 说明 | 默认值 | 75 | | --- | --- | --- | 76 | | http_server_host | http 服务地址 | "" | 77 | | http_server_port | http 服务端口 | 8888 | 78 | | http_server_idle_timeout_s | http 服务超时时间 | -1 | 79 | | http_server_num_threads | http 服务线程数 | 32 | 80 | | http_server_max_concurrency | http 服务最大请求并发数 | 128 | 81 | 82 | rpc服务:用于与xllm之间交互,管理xllm实例集群状态等。 83 | | 参数 | 说明 | 默认值 | 84 | | --- | --- | --- | 85 | | rpc_server_host | rpc 服务地址 | "" | 86 | | rpc_server_port | rpc 服务端口 | 8889 | 87 | | rpc_server_idle_timeout_s | rpc 服务超时时间 | -1 | 88 | | rpc_server_num_threads | rpc 服务线程数 | 32 | 89 | | rpc_server_max_concurrency | rpc 服务最大请求并发数 | 128 | 90 | 91 | 环境参数: 92 | ENABLE_DECODE_RESPONSE_TO_SERVICE: 
在PD分离场景下,是否将解码结果直接返回给service(不需要经过P实例转发),0表示“否”,1表示“是”。 93 | ENABLE_XLLM_DEBUG_LOG: 是否开启xllm debug log,0表示不开启,1表示开启。 94 | -------------------------------------------------------------------------------- /docs/zh/overview.md: -------------------------------------------------------------------------------- 1 | --- 2 | hide: 3 | - navigation 4 | --- 5 | 6 |

7 | xLLM-service 8 |

9 | 10 | ## 1. 简介 11 | 12 | **xLLM-service** 是一个基于 xLLM 推理引擎开发的服务层框架,为集群化部署提供高效率、高容错、高灵活性的大模型推理服务。 13 | 14 | xLLM-service 旨在解决企业级服务场景中的关键挑战: 15 | - 如何于在离线混合部署环境中,保障在线服务的SLA,提升离线任务的资源利用率。 16 | - 如何适应实际业务中动态变化的请求负载,如输入/输出长度出现剧烈波动。 17 | - 解决多模态模型请求的性能瓶颈。 18 | - 保障集群计算实例的高可靠性。 19 | 20 | #### 背景 21 | 当前,百亿至万亿参数规模的大语言模型正快速部署于智能客服、实时推荐、内容生成等核心业务场景,对国产计算硬件的高效支持已成为低成本推理部署的核心需求。现有推理引擎难以有效适配国产芯片等专用加速器的架构特性,硬件计算单元利用率低、MoE 架构下的负载不均衡与通信开销瓶颈、kv 缓存管理困难等问题,制约了请求的高效推理与系统的可扩展性。xLLM-service + xLLM推理引擎提升了全链路效率,目前已支撑京东多场景、多模型的线上服务。 22 | 23 | --- 24 | 25 | ## 2. 整体架构 26 | xLLM-service 整体架构如图所示: 27 | 28 | ![1](../assets/service_arch.png) 29 | 30 | ## 3. 核心组件 31 | 32 | ### ETCD Cluster 33 | 用于元信息管理,包括模型,xllm实例,请求等元信息的存储与管理。同时提供xllm节点注册与发现服务。 34 | 35 | ### Fault Tolerance 36 | xLLM-service 提供容错管理,保障服务质量以及稳定性。 37 | 38 | ### Global Scheduler 39 | 实现全局感知调度,根据当前系统状态,将请求精准调度至最优实例执行,有效提升整体服务响应效率与资源利用率。 40 | 41 | ### Global KV Cache Manager 42 | 负责全局 KV Cache 管理,核心能力包括分布式 KV 缓存感知、Prefix 前缀匹配、KV Cache 动态迁移等,优化缓存资源使用效率。 43 | 44 | ### Instance Manager 45 | 聚焦实例全生命周期管理,所有 xllm 实例启动后需向本模块注册,模块基于预设策略,为实例提供调度适配、容错处理等支持。 46 | 47 | ### Event Plane 48 | 作为指标与事件中枢,接收各实例上报的 Metrics 数据,对统计指标进行统一收集与整理,为服务调度、容错、扩缩容等决策提供数据支撑。 49 | 50 | ### Planner 51 | 承担策略分析与决策职能,基于 Event Plane 上报的 Metrics 数据(含实例运行时指标、机器负载指标等),分析服务扩缩容需求、热点实例扩展必要性,输出资源调整与实例优化策略。 -------------------------------------------------------------------------------- /prepare.sh: -------------------------------------------------------------------------------- 1 | cd ./third_party/cpprestsdk 2 | git apply ../custom_cache/cpprestsdk.patch -------------------------------------------------------------------------------- /third_party/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | add_subdirectory(cpprestsdk) 4 | set(CPPREST_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/cpprestsdk/Release/include) 5 | set(CPPREST_LIB 
${CMAKE_BINARY_DIR}/third_party/cpprestsdk/Release/Binaries/libcpprest.a) 6 | add_subdirectory(etcd_cpp_apiv3) 7 | add_subdirectory(brpc) 8 | add_subdirectory(minja) 9 | add_subdirectory(sentencepiece) 10 | add_subdirectory(smhasher/src) -------------------------------------------------------------------------------- /third_party/custom_cache/cpprestsdk.patch: -------------------------------------------------------------------------------- 1 | diff --git a/Release/CMakeLists.txt b/Release/CMakeLists.txt 2 | index 14e43ced..428e0038 100644 3 | --- a/Release/CMakeLists.txt 4 | +++ b/Release/CMakeLists.txt 5 | @@ -26,7 +26,7 @@ set(CPPREST_INSTALL ON CACHE BOOL "Add install commands.") 6 | if(IOS OR ANDROID) 7 | set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries") 8 | else() 9 | - set(BUILD_SHARED_LIBS ON CACHE BOOL "Build shared libraries") 10 | + set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries") 11 | endif() 12 | 13 | if(IOS OR ANDROID OR WINDOWS_STORE OR WINDOWS_PHONE) 14 | -------------------------------------------------------------------------------- /vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "xllm", 3 | "version": "0.0.1", 4 | "dependencies": [ 5 | { 6 | "name": "abseil", 7 | "version>=": "20230125.3" 8 | }, 9 | { 10 | "name": "boost-asio", 11 | "version>=": "1.84.0" 12 | }, 13 | { 14 | "name": "boost-algorithm", 15 | "version>=": "1.84.0" 16 | }, 17 | { 18 | "name": "boost-beast", 19 | "version>=": "1.84.0" 20 | }, 21 | { 22 | "name": "boost-thread", 23 | "version>=": "1.84.0" 24 | }, 25 | { 26 | "name": "boost-filesystem", 27 | "version>=": "1.84.0" 28 | }, 29 | { 30 | "name": "boost-chrono", 31 | "version>=": "1.84.0" 32 | }, 33 | { 34 | "name": "boost-atomic", 35 | "version>=": "1.84.0" 36 | }, 37 | { 38 | "name": "boost-random", 39 | "version>=": "1.84.0" 40 | }, 41 | { 42 | "name": "boost-serialization", 43 | "version>=": "1.84.0" 44 | }, 45 | { 46 | "name": 
"boost-locale", 47 | "version>=": "1.84.0" 48 | }, 49 | { 50 | "name": "boost-interprocess", 51 | "version>=": "1.84.0" 52 | }, 53 | { 54 | "name": "eigen3", 55 | "version>=": "3.4.0" 56 | }, 57 | { 58 | "name": "protobuf", 59 | "version>=": "3.21.12", 60 | "features": ["zlib"] 61 | }, 62 | { 63 | "name": "gflags", 64 | "version>=": "2.2.2#7" 65 | }, 66 | { 67 | "name": "gtest", 68 | "version>=": "1.13.0" 69 | }, 70 | { 71 | "name": "glog", 72 | "version>=": "0.6.0#2" 73 | }, 74 | { 75 | "name": "grpc", 76 | "version>=": "1.51.1", 77 | "default-features": false 78 | }, 79 | { 80 | "name": "leveldb", 81 | "version>=": "1.23", 82 | "default-features": false 83 | }, 84 | { 85 | "name": "openssl", 86 | "version>=": "3.2.1" 87 | }, 88 | { 89 | "name": "snappy", 90 | "version>=": "1.1.10" 91 | }, 92 | { 93 | "name": "nlohmann-json", 94 | "version>=": "3.11.2", 95 | "default-features": false 96 | }, 97 | { 98 | "name": "zlib" , 99 | "version>=": "1.3.1" 100 | }, 101 | { 102 | "name": "re2", 103 | "version>=": "2023-07-01", 104 | "default-features": false 105 | } 106 | ], 107 | "builtin-baseline": "fba75d09065fcc76a25dcf386b1d00d33f5175af" 108 | } 109 | 110 | -------------------------------------------------------------------------------- /xllm_service/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_binary) 2 | include(cc_library) 3 | include(cc_test) 4 | 5 | add_subdirectory(proto) 6 | add_subdirectory(common) 7 | add_subdirectory(request) 8 | add_subdirectory(rpc_service) 9 | add_subdirectory(tokenizer) 10 | add_subdirectory(chat_template) 11 | add_subdirectory(http_service) 12 | add_subdirectory(scheduler) 13 | 14 | cc_binary( 15 | NAME 16 | xllm_master_serving 17 | HDRS 18 | master.h 19 | SRCS 20 | master.cpp 21 | DEPS 22 | :xllm_http_service 23 | :xllm_rpc_service 24 | ) 25 | 26 | add_subdirectory(examples) 27 | -------------------------------------------------------------------------------- 
/xllm_service/chat_template/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | cc_library ( 5 | NAME 6 | chat_template 7 | HDRS 8 | jinja_chat_template.h 9 | SRCS 10 | jinja_chat_template.cpp 11 | DEPS 12 | :minja 13 | :tokenizer 14 | nlohmann_json::nlohmann_json 15 | glog::glog 16 | ) 17 | 18 | cc_test ( 19 | NAME 20 | chat_template_test 21 | SRCS 22 | jinja_chat_template_test.cpp 23 | DEPS 24 | :chat_template 25 | GTest::gtest_main 26 | ) 27 | -------------------------------------------------------------------------------- /xllm_service/chat_template/jinja_chat_template.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "jinja_chat_template.h" 17 | 18 | #include 19 | #include 20 | 21 | #include 22 | #include 23 | 24 | namespace xllm_service { 25 | 26 | JinjaChatTemplate::JinjaChatTemplate(const TokenizerArgs& args) : args_(args) { 27 | try { 28 | template_ = std::make_unique( 29 | args_.chat_template(), args_.bos_token(), args_.eos_token()); 30 | LOG(INFO) << "Jinja chat template init succeed."; 31 | 32 | } catch (const std::exception& e) { 33 | LOG(FATAL) << "Failed to parse jinja chat template, TokenizerArgs: " 34 | << args_ << std::endl 35 | << "Error message: " << e.what(); 36 | } 37 | } 38 | 39 | std::optional JinjaChatTemplate::apply( 40 | const ChatMessages& messages) const { 41 | const std::vector empty_tools; 42 | return apply(messages, empty_tools); 43 | } 44 | 45 | std::optional JinjaChatTemplate::apply( 46 | nlohmann::ordered_json& messages) const { 47 | // Call the overloaded method with empty tools 48 | nlohmann::ordered_json empty_tools = nlohmann::json::array(); 49 | return apply(messages, empty_tools); 50 | } 51 | 52 | std::optional JinjaChatTemplate::apply( 53 | const ChatMessages& messages, 54 | const std::vector& json_tools) const { 55 | // convert the messages to json object 56 | nlohmann::ordered_json messages_json = nlohmann::json::array(); 57 | for (const auto& message : messages) { 58 | nlohmann::ordered_json message_json; 59 | message_json["role"] = message.role; 60 | 61 | if (std::holds_alternative(message.content)) { 62 | message_json["content"] = std::get(message.content); 63 | } else if (std::holds_alternative(message.content)) { 64 | message_json["content"] = 65 | get_mm_content(std::get(message.content)); 66 | } 67 | 68 | messages_json.push_back(message_json); 69 | } 70 | 71 | nlohmann::ordered_json tools_json = nlohmann::json::array(); 72 | for (const auto& json_tool : json_tools) { 73 | nlohmann::ordered_json tool_json; 74 | tool_json["type"] = 
json_tool.type; 75 | 76 | nlohmann::ordered_json function_json; 77 | function_json["name"] = json_tool.function.name; 78 | function_json["description"] = json_tool.function.description; 79 | function_json["parameters"] = json_tool.function.parameters; 80 | 81 | tool_json["function"] = function_json; 82 | tools_json.push_back(tool_json); 83 | } 84 | // apply the template 85 | return apply(messages_json, tools_json); 86 | } 87 | 88 | std::optional JinjaChatTemplate::apply( 89 | nlohmann::ordered_json& messages, 90 | const nlohmann::ordered_json& tools) const { 91 | minja::chat_template_inputs input; 92 | input.messages = messages; 93 | input.tools = tools; 94 | input.add_generation_prompt = true; 95 | minja::chat_template_options options; 96 | 97 | return template_->apply(input, options); 98 | } 99 | 100 | nlohmann::ordered_json JinjaChatTemplate::get_mm_content( 101 | const Message::MMContentVec& vec) const { 102 | nlohmann::ordered_json content_json = nlohmann::json::array(); 103 | 104 | for (const auto& item : vec) { 105 | nlohmann::ordered_json item_json; 106 | item_json["type"] = item.type; 107 | 108 | if (item.type == "text") { 109 | item_json["text"] = item.text; 110 | } else { 111 | item_json[item.type] = "mm place holder"; 112 | } 113 | 114 | content_json.emplace_back(item_json); 115 | } 116 | 117 | return std::move(content_json); 118 | } 119 | 120 | } // namespace xllm_service 121 | -------------------------------------------------------------------------------- /xllm_service/chat_template/jinja_chat_template.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #include "common/types.h" 26 | #include "tokenizer/tokenizer_args.h" 27 | 28 | namespace xllm_service { 29 | 30 | struct Message { 31 | struct MMUrl { 32 | std::string url; 33 | }; 34 | 35 | struct MMContent { 36 | MMContent(const std::string& type) : type(type) {} 37 | MMContent(const std::string& type, const std::string& text) 38 | : type(type), text(text) {} 39 | 40 | std::string type; 41 | 42 | std::string text; 43 | MMUrl image_url; // image place holder 44 | 45 | MMUrl video_url; // video place holder 46 | MMUrl audio_url; // audio place holder 47 | }; 48 | 49 | using MMContentVec = std::vector; 50 | using Content = std::variant; 51 | 52 | Message() = default; 53 | Message(const std::string& role, const std::string& content) 54 | : role(role), content(content) {} 55 | 56 | Message(const std::string& role, const MMContentVec& content) 57 | : role(role), content(content) {} 58 | 59 | std::string role; 60 | Content content; 61 | }; 62 | using ChatMessages = std::vector; 63 | 64 | // A chat template implementation that uses jinja2 as the template engine. 
65 | class JinjaChatTemplate { 66 | public: 67 | JinjaChatTemplate(const TokenizerArgs& args); 68 | 69 | std::optional apply(const ChatMessages& messages) const; 70 | 71 | std::optional apply( 72 | const ChatMessages& messages, 73 | const std::vector& json_tools) const; 74 | 75 | // expose this function for testing 76 | // apply the template to the values in the json object 77 | std::optional apply(nlohmann::ordered_json& messages) const; 78 | 79 | std::optional apply(nlohmann::ordered_json& messages, 80 | const nlohmann::ordered_json& tools) const; 81 | 82 | private: 83 | nlohmann::ordered_json get_mm_content(const Message::MMContentVec& vec) const; 84 | 85 | private: 86 | TokenizerArgs args_; 87 | std::unique_ptr template_; 88 | }; 89 | 90 | } // namespace xllm_service 91 | -------------------------------------------------------------------------------- /xllm_service/chat_template/jinja_chat_template_test.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "jinja_chat_template.h" 17 | 18 | #include 19 | 20 | namespace xllm_service { 21 | 22 | TEST(JinjaChatTemplate, OpenChatModel) { 23 | // clang-format off 24 | const std::string template_str = 25 | "" 26 | "{% for message in messages %}" 27 | "{{ 'GPT4 Correct ' + message['role'] + ': ' + message['content'] + '<|end_of_turn|>'}}" 28 | "{% endfor %}" 29 | "{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}"; 30 | 31 | nlohmann::ordered_json messages = { 32 | {{"role", "system"}, {"content", "you are a helpful assistant."}}, 33 | {{"role", "user"}, {"content", "hi"}}, 34 | {{"role", "assistant"}, {"content", "what i can do for you?"}}, 35 | {{"role", "user"}, {"content", "how are you?"}}}; 36 | const std::string expected = 37 | "" 38 | "GPT4 Correct system: you are a helpful assistant.<|end_of_turn|>" 39 | "GPT4 Correct user: hi<|end_of_turn|>" 40 | "GPT4 Correct assistant: what i can do for you?<|end_of_turn|>" 41 | "GPT4 Correct user: how are you?<|end_of_turn|>" 42 | "GPT4 Correct Assistant:"; 43 | // clang-format on 44 | 45 | TokenizerArgs args; 46 | args.chat_template(template_str); 47 | args.bos_token(""); 48 | args.eos_token("<|end_of_turn|>"); 49 | JinjaChatTemplate template_(args); 50 | auto result = template_.apply(messages); 51 | ASSERT_TRUE(result.has_value()); 52 | 53 | EXPECT_EQ(result.value(), expected); 54 | } 55 | 56 | } // namespace xllm_service 57 | -------------------------------------------------------------------------------- /xllm_service/common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | cc_library( 4 | NAME 5 | common 6 | HDRS 7 | call_data.h 8 | closure_guard.h 9 | concurrent_queue.h 10 | global_gflags.h 11 | json_reader.h 12 | macros.h 13 | slice.h 14 | threadpool.h 15 | ttft_predictor.h 16 | types.h 17 | utils.h 18 | hash_util.h 19 
| xllm/output.h 20 | xllm/status.h 21 | xllm/uuid.h 22 | SRCS 23 | global_gflags.cpp 24 | json_reader.cpp 25 | threadpool.cpp 26 | ttft_predictor.cpp 27 | utils.cpp 28 | hash_util.cpp 29 | xllm/uuid.cpp 30 | DEPS 31 | absl::random_random 32 | absl::strings 33 | glog::glog 34 | gflags::gflags 35 | nlohmann_json::nlohmann_json 36 | SMHasherSupport 37 | proto_xllm 38 | ) 39 | add_dependencies(common brpc-static) 40 | -------------------------------------------------------------------------------- /xllm_service/common/closure_guard.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | 19 | #include 20 | 21 | #include "butil/macros.h" 22 | 23 | namespace xllm_service { 24 | 25 | // RAII: Call Run() of the closure on destruction. 26 | class ClosureGuard { 27 | public: 28 | ClosureGuard() : done_(nullptr) {} 29 | 30 | // Constructed with a closure which will be Run() inside dtor. 31 | explicit ClosureGuard(google::protobuf::Closure* done) : done_(done) {} 32 | 33 | // Run internal closure if it's not NULL. 
34 | ~ClosureGuard() { 35 | if (done_) { 36 | done_->Run(); 37 | } 38 | } 39 | 40 | // Run internal closure if it's not NULL and set it to `done'. 41 | void reset(google::protobuf::Closure* done) { 42 | if (done_) { 43 | done_->Run(); 44 | } 45 | done_ = done; 46 | } 47 | 48 | // Return and set internal closure to NULL. 49 | google::protobuf::Closure* release() { 50 | google::protobuf::Closure* const prev_done = done_; 51 | done_ = nullptr; 52 | return prev_done; 53 | } 54 | 55 | // True if no closure inside. 56 | bool empty() const { return done_ == nullptr; } 57 | 58 | // Exchange closure with another guard. 59 | void swap(ClosureGuard& other) { std::swap(done_, other.done_); } 60 | 61 | private: 62 | // Copying this object makes no sense. 63 | DISALLOW_COPY_AND_ASSIGN(ClosureGuard); 64 | 65 | google::protobuf::Closure* done_ = nullptr; 66 | }; 67 | 68 | } // namespace xllm_service 69 | -------------------------------------------------------------------------------- /xllm_service/common/concurrent_queue.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | 19 | #include 20 | 21 | #include 22 | 23 | #if __has_attribute(guarded_by) 24 | #define GUARDED_BY(x) __attribute__((guarded_by(x))) 25 | #else 26 | #define GUARDED_BY(x) 27 | #endif 28 | 29 | namespace xllm_service { 30 | 31 | // a simple thread-safe queue that supports multiple producers and multiple 32 | // consumers concurrently the queue is implemented as a queue with condition 33 | // variable and mutex lock 34 | template 35 | class ConcurrentQueue { 36 | public: 37 | // constructor 38 | ConcurrentQueue() = default; 39 | 40 | explicit ConcurrentQueue(size_t capacity) : capacity_(capacity) {} 41 | 42 | // destructor 43 | ~ConcurrentQueue() = default; 44 | 45 | // push an element to the queue 46 | void push(T value) { 47 | absl::MutexLock lock(&mutex_); 48 | if (capacity_ > 0) { 49 | auto not_full = [this]() { return queue_.size() < capacity_; }; 50 | mutex_.Await(absl::Condition(¬_full)); 51 | } 52 | queue_.push(std::move(value)); 53 | } 54 | 55 | template 56 | void emplace(Args&&... 
args) { 57 | absl::MutexLock lock(&mutex_); 58 | if (capacity_ > 0) { 59 | auto not_full = [this]() { return queue_.size() < capacity_; }; 60 | mutex_.Await(absl::Condition(¬_full)); 61 | } 62 | queue_.emplace(std::forward(args)...); 63 | } 64 | 65 | // pop an element from the queue, block if the queue is empty 66 | T pop() { 67 | absl::MutexLock lock(&mutex_); 68 | 69 | auto not_empty = [this]() { return !queue_.empty(); }; 70 | mutex_.Await(absl::Condition(¬_empty)); 71 | 72 | T value = std::move(queue_.front()); 73 | queue_.pop(); 74 | return value; 75 | } 76 | 77 | // return the size of the queue 78 | size_t size() { 79 | absl::MutexLock lock(&mutex_); 80 | return queue_.size(); 81 | } 82 | 83 | // return true if the queue is empty 84 | bool empty() { 85 | absl::MutexLock lock(&mutex_); 86 | return queue_.empty(); 87 | } 88 | 89 | private: 90 | // the underlying queue 91 | std::queue queue_ GUARDED_BY(mutex_); 92 | // mutex lock for the queue 93 | absl::Mutex mutex_; 94 | 95 | // maximum capacity of the queue, 0 means no limit. 96 | // when the queue is full, push will block 97 | size_t capacity_ = 0; 98 | }; 99 | 100 | } // namespace xllm_service 101 | -------------------------------------------------------------------------------- /xllm_service/common/global_gflags.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include "common/global_gflags.h" 17 | 18 | DEFINE_string(server_host, 19 | "", 20 | "Server listen address, may be IPV4/IPV6/UDS." 21 | " If this is set, the flag port will be ignored"); 22 | 23 | DEFINE_int32(http_server_port, 8888, "Port for xllm http service to listen on"); 24 | 25 | DEFINE_int32(http_server_idle_timeout_s, 26 | -1, 27 | "Connection will be closed if there is no " 28 | "read/write operations during the last `idle_timeout_s'"); 29 | 30 | DEFINE_int32(http_server_num_threads, 32, "Maximum number of threads to use"); 31 | 32 | DEFINE_int32(http_server_max_concurrency, 33 | 128, 34 | "Limit number of requests processed in parallel"); 35 | 36 | DEFINE_int32(rpc_server_port, 8889, "Port for xllm rpc service to listen on"); 37 | 38 | DEFINE_int32(rpc_server_idle_timeout_s, 39 | -1, 40 | "Connection will be closed if there is no " 41 | "read/write operations during the last `idle_timeout_s'"); 42 | 43 | DEFINE_int32(rpc_server_num_threads, 32, "Maximum number of threads to use"); 44 | 45 | DEFINE_int32(rpc_server_max_concurrency, 46 | 128, 47 | "Limit number of requests processed in parallel"); 48 | 49 | DEFINE_string(etcd_addr, 50 | "0.0.0.0:2379", 51 | "etcd adderss for save instance meta info"); 52 | 53 | DEFINE_uint32(murmur_hash3_seed, 1024, "default Murmur Hash seed"); 54 | 55 | DEFINE_int32(port, 8888, "Port for xllm service to listen on"); 56 | 57 | DEFINE_int32(num_threads, 32, "Number of threads to process requests"); 58 | 59 | DEFINE_int32(max_concurrency, 60 | 128, 61 | "Limit number of requests processed in parallel"); 62 | 63 | DEFINE_int32(timeout_ms, 64 | -1, 65 | "Max duration of bRPC Channel. -1 means wait indefinitely."); 66 | 67 | DEFINE_string(listen_addr, 68 | "", 69 | "Server listen address, may be IPV4/IPV6/UDS." 
70 | " If this is set, the flag port will be ignored"); 71 | 72 | DEFINE_int32(idle_timeout_s, 73 | -1, 74 | "Connection will be closed if there is no " 75 | "read/write operations during the last `idle_timeout_s'"); 76 | 77 | DEFINE_string(load_balance_policy, 78 | "RR", 79 | "Disaggregated prefill-decode policy."); 80 | 81 | DEFINE_int32(detect_disconnected_instance_interval, 82 | 15, 83 | "The interval that server detect the disconnected instance."); 84 | 85 | DEFINE_int32(block_size, 86 | 16, 87 | "Number of slots per kv cache block. Default is 16."); 88 | 89 | DEFINE_string(tokenizer_path, "", "tokenizer config path."); 90 | 91 | DEFINE_bool(enable_request_trace, false, "Whether to enable request trace"); 92 | -------------------------------------------------------------------------------- /xllm_service/common/global_gflags.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | 20 | DECLARE_string(server_host); 21 | 22 | DECLARE_int32(http_server_port); 23 | 24 | DECLARE_int32(http_server_idle_timeout_s); 25 | 26 | DECLARE_int32(http_server_num_threads); 27 | 28 | DECLARE_int32(http_server_max_concurrency); 29 | 30 | DECLARE_int32(rpc_server_port); 31 | 32 | DECLARE_int32(rpc_server_idle_timeout_s); 33 | 34 | DECLARE_int32(rpc_server_num_threads); 35 | 36 | DECLARE_int32(rpc_server_max_concurrency); 37 | 38 | DECLARE_uint32(murmur_hash3_seed); 39 | 40 | DECLARE_int32(timeout_ms); 41 | 42 | DECLARE_string(listen_addr); 43 | 44 | DECLARE_int32(port); 45 | 46 | DECLARE_int32(idle_timeout_s); 47 | 48 | DECLARE_int32(num_threads); 49 | 50 | DECLARE_int32(max_concurrency); 51 | 52 | DECLARE_string(etcd_addr); 53 | 54 | DECLARE_string(load_balance_policy); 55 | 56 | DECLARE_int32(detect_disconnected_instance_interval); 57 | 58 | DECLARE_int32(block_size); 59 | 60 | DECLARE_string(tokenizer_path); 61 | 62 | DECLARE_bool(enable_request_trace); 63 | -------------------------------------------------------------------------------- /xllm_service/common/hash_util.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "common/hash_util.h" 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "common/global_gflags.h" 14 | 15 | namespace xllm_service { 16 | 17 | void murmur_hash3(const uint8_t* pre_hash_value, 18 | const Slice& token_ids, 19 | uint8_t* hash_value) { 20 | if (pre_hash_value == nullptr) { 21 | MurmurHash3_x64_128(reinterpret_cast(token_ids.data()), 22 | sizeof(int32_t) * token_ids.size(), 23 | FLAGS_murmur_hash3_seed, 24 | hash_value); 25 | } else { 26 | uint8_t key[1024]; 27 | 28 | int32_t data_len = 29 | sizeof(int32_t) * token_ids.size() + MURMUR_HASH3_VALUE_LEN; 30 | assert(sizeof(key) > 
data_len); 31 | 32 | memcpy(key, pre_hash_value, MURMUR_HASH3_VALUE_LEN); 33 | memcpy(key + MURMUR_HASH3_VALUE_LEN, 34 | reinterpret_cast(token_ids.data()), 35 | sizeof(int32_t) * token_ids.size()); 36 | 37 | // print_hex_array(key, data_len); 38 | MurmurHash3_x64_128(reinterpret_cast(key), 39 | data_len, 40 | FLAGS_murmur_hash3_seed, 41 | hash_value); 42 | } 43 | } 44 | 45 | void print_hex_array(uint8_t* array) { 46 | for (size_t i = 0; i < MURMUR_HASH3_VALUE_LEN; ++i) { 47 | unsigned char uc = static_cast(array[i]); 48 | std::cout << std::hex << std::setw(2) << std::setfill('0') 49 | << static_cast(uc); 50 | 51 | if (i % MURMUR_HASH3_VALUE_LEN == MURMUR_HASH3_VALUE_LEN - 1) { 52 | std::cout << std::endl; 53 | } 54 | 55 | else { 56 | std::cout << " "; 57 | } 58 | } 59 | std::cout << std::dec << std::endl; 60 | } 61 | 62 | } // namespace xllm_service -------------------------------------------------------------------------------- /xllm_service/common/hash_util.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "common/slice.h" 11 | 12 | namespace xllm_service { 13 | constexpr uint32_t MURMUR_HASH3_VALUE_LEN = 16; 14 | 15 | struct Murmur3Key { 16 | uint8_t data[MURMUR_HASH3_VALUE_LEN]; 17 | 18 | Murmur3Key() {} 19 | Murmur3Key(const uint8_t* const input_data) { 20 | memcpy(data, input_data, MURMUR_HASH3_VALUE_LEN); 21 | } 22 | Murmur3Key(const char* const input_data) { 23 | memcpy(data, input_data, MURMUR_HASH3_VALUE_LEN); 24 | } 25 | 26 | std::string to_string() const { 27 | return std::string(reinterpret_cast(data), 28 | MURMUR_HASH3_VALUE_LEN); 29 | } 30 | 31 | // BUGFIX: strncmp returns 0 when the buffers are EQUAL, so returning its 32 | // raw value inverted the predicate (equal keys compared false). Compare 33 | // against 0, matching FixedStringKeyEqual below; also const-qualified. 34 | bool operator==(const Murmur3Key& other) const { 35 | return strncmp(reinterpret_cast(data), 36 | reinterpret_cast(other.data), 37 | MURMUR_HASH3_VALUE_LEN) == 0; 38 | } 39 | }; 40 | 41 | struct FixedStringKeyHash { 42 | size_t operator()(const Murmur3Key& key) const { 
return std::hash()(std::string_view( 41 | reinterpret_cast(key.data), sizeof(key.data))); 42 | } 43 | }; 44 | 45 | struct FixedStringKeyEqual { 46 | bool operator()(const Murmur3Key& left, const Murmur3Key& right) const { 47 | return strncmp(reinterpret_cast(left.data), 48 | reinterpret_cast(right.data), 49 | sizeof(left.data)) == 0; 50 | } 51 | }; 52 | 53 | void print_hex_array(uint8_t* array); 54 | 55 | void murmur_hash3(const uint8_t* pre_hash_value, 56 | const Slice& token_ids, 57 | uint8_t* hash_value); 58 | 59 | } // namespace xllm_service 60 | -------------------------------------------------------------------------------- /xllm_service/common/json_reader.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | ==============================================================================*/ 16 | 17 | #include "common/json_reader.h" 18 | 19 | #include 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | namespace xllm_service { 26 | 27 | bool JsonReader::parse(const std::string& json_file_path) { 28 | if (!std::filesystem::exists(json_file_path)) { 29 | return false; 30 | } 31 | 32 | std::ifstream ifs(json_file_path); 33 | if (!ifs.is_open()) { 34 | return false; 35 | } 36 | 37 | data_ = nlohmann::json::parse(ifs); 38 | return true; 39 | } 40 | 41 | bool JsonReader::contains(const std::string& key) const { 42 | // slipt the key by '.' then traverse the json object 43 | std::vector keys = absl::StrSplit(key, '.'); 44 | nlohmann::json data = data_; 45 | for (const auto& k : keys) { 46 | if (!data.contains(k)) { 47 | return false; 48 | } 49 | data = data[k]; 50 | } 51 | return true; 52 | } 53 | 54 | } // namespace xllm_service -------------------------------------------------------------------------------- /xllm_service/common/json_reader.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | #include 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | namespace xllm_service { 26 | 27 | // an thin wrapper around nlohmann/json to read json files. 28 | // it supports read keys with dot notation from json. 29 | // for exmaple: value_or("a.b.c", 0) will return 100 for following json: 30 | // { 31 | // "a": { 32 | // "b": { 33 | // "c": 100 34 | // } 35 | // } 36 | // } 37 | // 38 | class JsonReader { 39 | public: 40 | // parse the json file, return true if success 41 | bool parse(const std::string& json_file_path); 42 | 43 | // check if the json contains the key, key can be nested with dot notation 44 | bool contains(const std::string& key) const; 45 | 46 | template 47 | T value_or(const std::vector& keys, T2 default_value) const { 48 | for (const auto& key : keys) { 49 | if (auto data = value(key)) { 50 | return data.value(); 51 | } 52 | } 53 | // may introduce implicit conversion from T2 to T 54 | return default_value; 55 | } 56 | 57 | template 58 | T value_or(const std::string& key, T2 default_value) const { 59 | if (auto data = value(key)) { 60 | return data.value(); 61 | } 62 | // may introduce implicit conversion from T2 to T 63 | return default_value; 64 | } 65 | 66 | template 67 | std::optional value(const std::string& key) const { 68 | // slipt the key by '.' 
then traverse the json object 69 | const std::vector keys = absl::StrSplit(key, '.'); 70 | nlohmann::json data = data_; 71 | for (const auto& k : keys) { 72 | if (data.contains(k)) { 73 | data = data[k]; 74 | } else { 75 | return std::nullopt; 76 | } 77 | } 78 | 79 | if (data.is_null() || data.is_structured()) { 80 | // cannot convert null or structured data to T 81 | return std::nullopt; 82 | } 83 | return data.get(); 84 | } 85 | 86 | nlohmann::json data() const { return data_; } 87 | 88 | private: 89 | nlohmann::json data_; 90 | }; 91 | 92 | } // namespace xllm_service -------------------------------------------------------------------------------- /xllm_service/common/macros.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
#pragma once

namespace xllm_service {

// Generates the boilerplate accessors/mutators plus the backing field for a
// named "property".  Expands to:
//  - const-lvalue / lvalue / rvalue getters (the rvalue getter moves the
//    field out of a temporary),
//  - chainable setters (copy and move) callable on lvalues only,
//  - deleted setters on rvalues (prevents mutating a temporary),
//  - the trailing `T property##_` field declaration — which is why call sites
//    end with `;` and may append ` = <default>` to initialize the field.
#define PROPERTY(T, property)                                                 \
 public:                                                                      \
  [[nodiscard]] const T& property() const& noexcept { return property##_; }   \
  [[nodiscard]] T& property() & noexcept { return property##_; }              \
  [[nodiscard]] T&& property() && noexcept { return std::move(property##_); } \
                                                                              \
  auto property(const T& value) & -> decltype(*this) {                        \
    property##_ = value;                                                      \
    return *this;                                                             \
  }                                                                           \
                                                                              \
  auto property(T&& value) & -> decltype(*this) {                             \
    property##_ = std::move(value);                                           \
    return *this;                                                             \
  }                                                                           \
                                                                              \
  void property(const T& value) && = delete;                                  \
  void property(T&& value) && = delete;                                       \
                                                                              \
  T property##_

// Silences unused-parameter warnings without deleting the parameter name.
#ifndef UNUSED_PARAMETER
#define UNUSED_PARAMETER(x) ((void)(x))
#endif

// Clang thread-safety annotation; expands to nothing when the compiler lacks
// the `guarded_by` attribute.
// NOTE(review): `__has_attribute` is itself a GCC/Clang extension — if a
// compiler without it must be supported, wrap this in `#ifdef __has_attribute`.
#if __has_attribute(guarded_by)
#define GUARDED_BY(x) __attribute__((guarded_by(x)))
#else
#define GUARDED_BY(x)
#endif

// concatenate two strings (token pasting)
#define LLM_STR_CAT(s1, s2) s1##s2

// create an anonymous variable, unique per source line
#define LLM_ANON_VAR(str) LLM_STR_CAT(str, __LINE__)

// SFINAE helper: use as a defaulted template parameter to constrain overloads.
#define REQUIRES(...) std::enable_if_t<(__VA_ARGS__)>* = nullptr

// Deletes copy construction and copy assignment for TypeName.
#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
  TypeName(const TypeName&) = delete;      \
  void operator=(const TypeName&) = delete

// Define a macro to simplify adding elements from a vector to a repeated field
#define ADD_VECTOR_TO_PROTO(proto_field, vec) \
  do {                                        \
    proto_field->Reserve(vec.size());         \
    for (const auto& value : vec) {           \
      *proto_field->Add() = value;            \
    }                                         \
  } while (0)

// Invokes the surrounding scope's `callback` with an error Status.
// NOTE(review): the expansion carries its own trailing `;`, so writing
// `CALLBACK_WITH_ERROR(...);` produces a double semicolon — harmless at
// statement scope, but worth confirming no call site uses it in an
// expression context.
#define CALLBACK_WITH_ERROR(CODE, MSG) callback(Status{CODE, MSG});

}  // namespace xllm_service
// Aggregated configuration for the xLLM service, populated once at startup.
// Built with the PROPERTY macro (common/macros.h): each entry generates
// getters, chainable setters, and the backing `<name>_` field.
class Options {
 public:
  Options() = default;
  ~Options() = default;

  // http server options
  // Host/interface the HTTP server binds to.
  PROPERTY(std::string, server_host);

  PROPERTY(int32_t, http_port) = 9998;

  // Idle-connection timeout in seconds; -1 presumably means "never time
  // out" — TODO(review): confirm against the server options this feeds.
  PROPERTY(int32_t, http_idle_timeout_s) = -1;

  PROPERTY(int32_t, http_num_threads) = 32;

  // 0 presumably means "unlimited" — TODO(review): confirm brpc semantics.
  PROPERTY(int32_t, http_max_concurrency) = 0;

  // rpc server options
  PROPERTY(int32_t, rpc_port) = 9999;

  PROPERTY(int32_t, rpc_idle_timeout_s) = -1;

  PROPERTY(int32_t, rpc_num_threads) = 32;

  PROPERTY(int32_t, rpc_max_concurrency) = 0;

  PROPERTY(int32_t, num_threads) = 32;

  PROPERTY(int32_t, max_concurrency) = 32;

  // NOTE(review): 32 ms is a suspiciously small timeout and matches the
  // thread-count defaults above — confirm this was not a copy-paste slip.
  PROPERTY(int32_t, timeout_ms) = 32;

  // instance manager options
  // etcd endpoint used for instance registration/discovery.
  PROPERTY(std::string, etcd_addr);

  // Polling interval for detecting disconnected instances (presumably
  // seconds — confirm at the call site).
  PROPERTY(int32_t, detect_disconnected_instance_interval) = 15;

  // scheduler options
  PROPERTY(std::string, load_balance_policy);

  PROPERTY(int32_t, block_size) = 128;

  // Seed for MurmurHash3-based prefix hashing.
  PROPERTY(uint32_t, murmur_hash3_seed) = 1024;

  PROPERTY(std::string, service_name);

  // tokenizer options
  PROPERTY(std::string, tokenizer_path);

  // trace options
  PROPERTY(bool, enable_request_trace) = false;
};
// A non-owning, read-only view over a contiguous array (akin to a pre-C++20
// std::span<const T>).  The viewed data must outlive the Slice.
template <typename T>
class Slice final {
 public:
  Slice() = default;

  Slice(const T* data, size_t size) : data_(data), size_(size) {}

  // it is on purpose to allow implicit conversion from vector to slice
  Slice(const std::vector<T>& data) : data_(data.data()), size_(data.size()) {}

  // View over the first `size` elements of `data`; `size` must not exceed
  // data.size() (CHECK-aborts otherwise).
  Slice(const std::vector<T>& data, size_t size)
      : data_(data.data()), size_(size) {
    CHECK_LE(size, data.size());
  }

  // iterator for the slice
  const T* begin() const { return data_; }
  const T* end() const { return data_ + size_; }

  // get the size of the slice
  size_t size() const { return size_; }

  // check if the slice is empty
  bool empty() const { return size_ == 0; }

  // get the data pointer
  const T* data() const { return data_; }

  // index operator (unchecked, like raw array indexing)
  const T& operator[](size_t i) const { return data_[i]; }

  // NOTE(review): front()/back() are undefined behavior on an empty slice,
  // matching std::vector's contract.
  const T& front() const { return data_[0]; }

  const T& back() const { return data_[size_ - 1]; }

  // get a sub slice from `start` (inclusive) to the end
  Slice slice(size_t start) const {
    CHECK_LE(start, size_);
    return {data_ + start, size_ - start};
  }

  // get a sub slice over the half-open range [start, end)
  Slice slice(size_t start, size_t end) const {
    CHECK(start <= end && end <= size_);
    return {data_ + start, end - start};
  }

  // it is safe to allow implicit conversion to vector (copies the elements)
  operator std::vector<T>() const { return {data_, data_ + size_}; }

 private:
  const T* data_ = nullptr;
  size_t size_ = 0;
};

// Helper comparison operators between slices and std::vector.  The
// `lhs.data() == rhs.data()` test short-circuits the element-wise compare
// when both views alias the same memory (sizes already proven equal).
template <typename T>
inline bool operator==(const Slice<T>& lhs, const std::vector<T>& rhs) {
  return lhs.size() == rhs.size() &&
         (lhs.data() == rhs.data() ||
          std::equal(lhs.begin(), lhs.end(), rhs.begin()));
}

template <typename T>
inline bool operator==(const std::vector<T>& lhs, const Slice<T>& rhs) {
  return lhs.size() == rhs.size() &&
         (lhs.data() == rhs.data() ||
          std::equal(lhs.begin(), lhs.end(), rhs.begin()));
}

template <typename T>
inline bool operator==(const Slice<T>& lhs, const Slice<T>& rhs) {
  return lhs.size() == rhs.size() &&
         (lhs.data() == rhs.data() ||
          std::equal(lhs.begin(), lhs.end(), rhs.begin()));
}
15 | ==============================================================================*/ 16 | 17 | #include "common/threadpool.h" 18 | 19 | #include 20 | 21 | #include "common/concurrent_queue.h" 22 | 23 | namespace xllm_service { 24 | 25 | ThreadPool::ThreadPool(size_t num_threads) { 26 | for (size_t i = 0; i < num_threads; ++i) { 27 | threads_.emplace_back([this]() { internal_loop(); }); 28 | } 29 | } 30 | 31 | ThreadPool::~ThreadPool() { 32 | // push nullptr to the queue to signal threads to exit 33 | for (size_t i = 0; i < threads_.size(); ++i) { 34 | queue_.push(nullptr); 35 | } 36 | // wait for all threads to finish 37 | for (auto& thread : threads_) { 38 | thread.join(); 39 | } 40 | } 41 | 42 | // schedule a task to be executed 43 | void ThreadPool::schedule(Task task) { 44 | if (task == nullptr) { 45 | return; 46 | } 47 | queue_.push(std::move(task)); 48 | } 49 | 50 | void ThreadPool::internal_loop() { 51 | while (true) { 52 | Task task = queue_.pop(); 53 | if (task == nullptr) { 54 | // nullptr is a signal to exit 55 | break; 56 | } 57 | task(); 58 | } 59 | } 60 | 61 | } // namespace xllm_service 62 | -------------------------------------------------------------------------------- /xllm_service/common/threadpool.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// Fixed-size pool of worker threads consuming tasks from a shared
// ConcurrentQueue.  The destructor signals and joins all workers.
class ThreadPool final {
 public:
  // A unit of work.  nullptr is reserved internally as the shutdown signal,
  // so schedule() drops null tasks.
  using Task = std::function<void()>;

  // constructors: default pool has a single worker
  ThreadPool() : ThreadPool(1) {}

  // disable copy/move constructor and assignment
  ThreadPool(const ThreadPool&) = delete;
  ThreadPool& operator=(const ThreadPool&) = delete;
  ThreadPool(ThreadPool&&) = delete;
  ThreadPool& operator=(ThreadPool&&) = delete;

  // Spawns `num_threads` workers immediately.
  explicit ThreadPool(size_t num_threads);

  // destructor: pushes one exit sentinel per worker and joins them all
  ~ThreadPool();

  // schedule a task to be executed asynchronously (FIFO order via queue_)
  void schedule(Task task);

 private:
  // Worker loop: pops tasks until it sees a nullptr sentinel.
  void internal_loop();

  std::vector<std::thread> threads_;
  ConcurrentQueue<Task> queue_;
};
14 | ==============================================================================*/ 15 | 16 | #include "ttft_predictor.h" 17 | 18 | static constexpr int32_t kDegree = 2; 19 | 20 | namespace xllm_service { 21 | 22 | TtftPredictor::TtftPredictor( 23 | const std::vector>& ttft_profiling_data) { 24 | if (!ttft_profiling_data.empty()) { 25 | // construct Vandermonde matrix 26 | int32_t m = ttft_profiling_data.size(); 27 | int32_t n = kDegree + 1; 28 | Eigen::MatrixXd matrix(m, n); 29 | for (int32_t i = 0; i < m; ++i) { 30 | for (int32_t j = 0; j < n; ++j) { 31 | matrix(i, j) = std::pow(ttft_profiling_data[i].first, j); 32 | } 33 | } 34 | 35 | // construct target vector 36 | Eigen::VectorXd target(m); 37 | for (int32_t i = 0; i < m; ++i) { 38 | target(i) = ttft_profiling_data[i].second; 39 | } 40 | 41 | // get coefficients 42 | coefficients_ = matrix.colPivHouseholderQr().solve(target); 43 | } else { 44 | coefficients_ = Eigen::VectorXd::Zero(1); 45 | } 46 | } 47 | 48 | int64_t TtftPredictor::predict_ttft(int32_t length) { 49 | double result = 0.0; 50 | double power = 1.0; 51 | for (int32_t i = 0; i < coefficients_.size(); ++i) { 52 | result += coefficients_(i) * power; 53 | power *= length; 54 | } 55 | 56 | return static_cast(result); 57 | } 58 | 59 | } // namespace xllm_service -------------------------------------------------------------------------------- /xllm_service/common/ttft_predictor.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
// Predictor for predicting TTFT (time-to-first-token) based on input length.
// Fits a fixed-degree polynomial via least squares to offline profiling
// samples of (input length, observed TTFT).
class TtftPredictor final {
 public:
  // ttft_profiling_data: (input length, observed TTFT) samples.  An empty
  // vector yields a predictor that always returns 0.
  TtftPredictor(
      const std::vector<std::pair<int32_t, int64_t>>& ttft_profiling_data);
  ~TtftPredictor() = default;

  // Evaluates the fitted polynomial at `length`.
  // NOTE(review): the returned value is in whatever unit the profiling
  // samples used (presumably milliseconds) — confirm at the call site.
  int64_t predict_ttft(int32_t length);

 private:
  // Polynomial coefficients, lowest degree first.
  Eigen::VectorXd coefficients_;
};
14 | ==============================================================================*/ 15 | 16 | #include "common/utils.h" 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include 24 | #include 25 | 26 | namespace xllm_service { 27 | namespace utils { 28 | 29 | bool enable_debug_log() { 30 | static bool debug_log_enabled = false; 31 | static std::once_flag debug_flag; 32 | std::call_once(debug_flag, []() { 33 | const char* enable_debug_env = std::getenv("ENABLE_XLLM_DEBUG_LOG"); 34 | if (enable_debug_env != nullptr && std::string(enable_debug_env) == "1") { 35 | debug_log_enabled = true; 36 | } 37 | }); 38 | 39 | return debug_log_enabled; 40 | } 41 | 42 | bool is_port_available(int port) { 43 | int fd = socket(AF_INET, SOCK_STREAM, 0); 44 | if (fd < 0) { 45 | LOG(ERROR) << "create socket failed."; 46 | return false; 47 | } 48 | 49 | int opt = 1; 50 | if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0) { 51 | LOG(WARNING) << "set socket options failed."; 52 | } 53 | 54 | struct sockaddr_in addr; 55 | memset(&addr, 0, sizeof(addr)); 56 | addr.sin_family = AF_INET; 57 | addr.sin_addr.s_addr = INADDR_ANY; 58 | addr.sin_port = htons(port); 59 | if (bind(fd, (struct sockaddr*)&addr, sizeof(addr)) != 0) { 60 | return false; 61 | } 62 | close(fd); 63 | 64 | return true; 65 | } 66 | 67 | bool get_bool_env(const std::string& key, bool defaultValue) { 68 | const char* val = std::getenv(key.c_str()); 69 | if (val == nullptr) { 70 | return defaultValue; 71 | } 72 | std::string strVal(val); 73 | return (strVal == "1" || strVal == "true" || strVal == "TRUE" || 74 | strVal == "True"); 75 | } 76 | 77 | std::string get_local_ip() { 78 | using namespace boost::asio; 79 | io_service io; 80 | ip::tcp::resolver resolver(io); 81 | ip::tcp::resolver::query query(ip::host_name(), ""); 82 | ip::tcp::resolver::iterator iter = resolver.resolve(query); 83 | ip::tcp::resolver::iterator end; 84 | 85 | while (iter != end) { 86 | ip::address addr = 
iter->endpoint().address(); 87 | if (!addr.is_loopback() && addr.is_v4()) { 88 | return addr.to_string(); 89 | } 90 | ++iter; 91 | } 92 | 93 | LOG(FATAL) << "Get local ip faill!"; 94 | return ""; 95 | } 96 | 97 | } // namespace utils 98 | } // namespace xllm_service 99 | -------------------------------------------------------------------------------- /xllm_service/common/utils.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | 20 | namespace xllm_service { 21 | namespace utils { 22 | 23 | bool enable_debug_log(); 24 | bool is_port_available(int port); 25 | bool get_bool_env(const std::string& key, bool defaultValue); 26 | std::string get_local_ip(); 27 | 28 | } // namespace utils 29 | } // namespace xllm_service 30 | -------------------------------------------------------------------------------- /xllm_service/common/xllm/output.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 
namespace xllm_service {
namespace llm {

// "stop" - the model hit a natural stop point or a provided stop sequence.
// "length" - the maximum number of tokens specified in the request was reached.
// "function_call" - the model called a function.
enum class FinishReason {
  NONE = 0,
  STOP = 1,
  LENGTH,
  FUNCTION_CALL,
};

// Token accounting for one request.
struct Usage {
  // the number of tokens in the prompt.
  size_t num_prompt_tokens = 0;

  // the number of tokens in the generated completion.
  size_t num_generated_tokens = 0;

  // the total number of tokens used in the request (prompt + completion).
  size_t num_total_tokens = 0;
};

// Log-probability record for a single generated token.
struct LogProbData {
  // the text of the token.
  std::string token;
  // the token id.
  int32_t token_id;
  // the log probability of the token (-9999.0f is the "unset" sentinel).
  float logprob = -9999.0f;
  // whether the token is finished.
  // NOTE(review): exact semantics not visible here — confirm with the
  // producer of this field.
  bool finished_token = true;
};

struct LogProb : public LogProbData {
  // the top log probabilities.
  std::optional<std::vector<LogProbData>> top_logprobs;
};

// TODO: support embeddings later
struct SequenceOutput {
  // the index of the sequence in the request.
  size_t index;

  // the generated/delta text.
  // delta text is the text generated since the last response for streaming.
  std::string text;

  // the token ids of the generated text.
  std::vector<int32_t> token_ids;

  // the reason the sequence finished.
  std::optional<FinishReason> finish_reason;

  // log probabilities of the generated tokens.
  std::optional<std::vector<LogProb>> logprobs;
};

struct RequestOutput {
  RequestOutput() = default;

  // Convenience constructor for error results: only the status is set.
  RequestOutput(Status&& _status) : status(std::move(_status)) {}

  // the id of the request.
  std::string request_id;

  // id assigned at the service layer (distinct from request_id).
  std::string service_request_id;

  // the prompt text for the request.
  std::optional<std::string> prompt;

  // the status of the request.
  std::optional<Status> status;

  // the output for each sequence in the request.
  std::vector<SequenceOutput> outputs;

  // the statistics for the request.
  std::optional<Usage> usage;

  // whether the request is finished.
  bool finished = false;
};

// Maps a finish reason to its wire string ("stop"/"length"/"function_call");
// NONE — and any unknown value, after a warning — maps to nullopt.
inline std::optional<std::string> to_string(FinishReason reason) {
  switch (reason) {
    case FinishReason::NONE:
      return std::nullopt;
    case FinishReason::STOP:
      return "stop";
    case FinishReason::LENGTH:
      return "length";
    case FinishReason::FUNCTION_CALL:
      return "function_call";
    default:
      LOG(WARNING) << "Unknown finish reason: " << static_cast<int>(reason);
  }
  return std::nullopt;
}

}  // namespace llm

// Streaming callback invoked once per produced RequestOutput.
// NOTE(review): the template argument was lost in extraction and is
// reconstructed here; the bool return presumably means "continue
// streaming" — confirm against the callers.
using OutputCallback = std::function<bool(llm::RequestOutput output)>;

}  // namespace xllm_service
// Lightweight status value: a StatusCode plus an optional human-readable
// message.  A default-constructed Status is OK.
class Status final {
 public:
  Status() = default;

  // Implicit conversion from a bare code (no message).
  Status(StatusCode code) : code_(code) {}

  Status(StatusCode code, std::string msg)
      : code_(code), msg_(std::move(msg)) {}

  // the status code.
  StatusCode code() const { return code_; }

  // the attached message; empty when none was provided.
  const std::string& message() const { return msg_; }

  // true iff the code is StatusCode::OK.
  bool ok() const { return code_ == StatusCode::OK; }

 private:
  StatusCode code_ = StatusCode::OK;
  std::string msg_;
};

// Streams "Status, code: <n>, message: <msg>" for logging.
// NOTE(review): the static_cast target was lost in extraction; cast to int
// so the uint8_t code prints as a number rather than a character.
inline std::ostream& operator<<(std::ostream& os, const Status& status) {
  os << "Status, code: " << static_cast<int>(status.code())
     << ", message: " << status.message();
  return os;
}
6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | ==============================================================================*/ 16 | 17 | #include "uuid.h" 18 | 19 | #include 20 | 21 | namespace xllm_service { 22 | namespace llm { 23 | 24 | std::string ShortUUID::random(size_t len) { 25 | if (len == 0) { 26 | len = 22; 27 | } 28 | 29 | std::string uuid(len, ' '); 30 | for (size_t i = 0; i < len; i++) { 31 | const size_t rand = absl::Uniform( 32 | absl::IntervalClosedOpen, gen_, 0, alphabet_.size()); 33 | uuid[i] = alphabet_[rand]; 34 | } 35 | return uuid; 36 | } 37 | 38 | } // namespace llm 39 | } // namespace xllm_service 40 | -------------------------------------------------------------------------------- /xllm_service/common/xllm/uuid.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
// Generator of short, URL-safe random identifiers (not RFC-4122 UUIDs).
class ShortUUID {
 public:
  ShortUUID() = default;

  // Returns a random id of `len` characters; len == 0 selects the
  // implementation's default length (22).
  std::string random(size_t len = 0);

 private:
  // 57-character alphabet omitting easily-confused glyphs (0/O, 1/I/l).
  std::string alphabet_ =
      "23456789ABCDEFGHJKLMNPQRSTUVWXYZ"
      "abcdefghijkmnopqrstuvwxyz";
  // Abseil PRNG state; not deterministically seeded.
  absl::BitGen gen_;
};
DEFINE_string(server_address, "localhost:9999", "Grpc server address.");
DEFINE_string(client_name, "127.0.0.1@9999", "client name.");
DEFINE_string(protocol,
              "baidu_std",
              "Protocol type. Defined in src/brpc/options.proto");
DEFINE_string(connection_type,
              "",
              "Connection type. Available values: single, pooled, short");
DEFINE_string(server, "0.0.0.0:8000", "IP Address of server");
DEFINE_string(load_balancer, "", "The algorithm for load balancing");
DEFINE_int32(timeout_ms, 100, "RPC timeout in milliseconds");
DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)");
DEFINE_int32(interval_ms, 1000, "Milliseconds between consecutive requests");

// Example client: registers one instance with the xLLM rpc service and then
// stays alive so the service can keep observing the connection.
// NOTE(review): only --server_address and --client_name are consumed below;
// the remaining flags are defined but not wired into ChannelOptions here.
int main(int argc, char* argv[]) {
  // initialize glog and gflags
  google::InitGoogleLogging(argv[0]);
  gflags::ParseCommandLineFlags(&argc, &argv, true);

  // Define the server address and port
  std::string server_address(FLAGS_server_address);

  // Default-constructed channel options.
  xllm_service::ChannelOptions options;

  // Create a client instance
  xllm_service::XllmRpcClient client(
      FLAGS_client_name, server_address, options);

  // Register the instance
  auto ret = client.register_instance();
  if (ret != xllm_service::ErrorCode::OK) {
    LOG(ERROR) << "Register instance failed.";
    return -1;
  }

  // Keep the client running (blocks forever; stop with Ctrl-C).
  while (true) {
    sleep(1);
  }

  return 0;  // unreachable
}
Available values: single, pooled, short"); 34 | DEFINE_string(load_balancer, "", "The algorithm for load balancing"); 35 | DEFINE_int32(timeout_ms, 100, "RPC timeout in milliseconds"); 36 | DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)"); 37 | DEFINE_int32(interval_ms, 1000, "Milliseconds between consecutive requests"); 38 | 39 | namespace xllm_service { 40 | namespace test { 41 | 42 | struct ChannelOptions { 43 | std::string protocol = "baidu_std"; 44 | std::string connection_type = ""; 45 | std::string load_balancer = ""; 46 | int timeout_ms = 100; 47 | int max_retry = 3; 48 | int interval_ms = 1000; 49 | }; 50 | 51 | class HelloClient final { 52 | public: 53 | HelloClient(const std::string& addr, ChannelOptions options) { 54 | brpc::ChannelOptions chan_options; 55 | chan_options.protocol = options.protocol; 56 | chan_options.connection_type = options.connection_type; 57 | chan_options.timeout_ms = options.timeout_ms /*milliseconds*/; 58 | chan_options.max_retry = options.max_retry; 59 | if (master_channel_.Init( 60 | addr.c_str(), options.load_balancer.c_str(), &chan_options) != 0) { 61 | LOG(ERROR) << "Fail to initialize brpc channel to server " << addr; 62 | return; 63 | } 64 | master_stub_ = 65 | std::make_unique(&master_channel_); 66 | } 67 | 68 | void hello() { 69 | // Create a message to send to the server 70 | brpc::Controller cntl; 71 | proto::Empty request; 72 | proto::Status response; 73 | master_stub_->Hello(&cntl, &request, &response, nullptr); 74 | if (cntl.Failed()) { 75 | LOG(ERROR) << "Send to server faild, err msg:" << cntl.ErrorText(); 76 | return; 77 | } 78 | 79 | std::cout << "Get server response: " << response.ok() << "\n"; 80 | } 81 | 82 | private: 83 | brpc::Channel master_channel_; 84 | std::unique_ptr master_stub_; 85 | }; 86 | 87 | } // namespace test 88 | } // namespace xllm_service 89 | 90 | int main(int argc, char* argv[]) { 91 | // initialize glog and gflags 92 | google::InitGoogleLogging(argv[0]); 93 | 
gflags::ParseCommandLineFlags(&argc, &argv, true); 94 | 95 | // Define the server address and port 96 | std::string server_address(FLAGS_server_address); 97 | 98 | xllm_service::test::ChannelOptions opt; 99 | opt.protocol = FLAGS_protocol; 100 | opt.connection_type = FLAGS_connection_type; 101 | opt.load_balancer = FLAGS_load_balancer; 102 | opt.timeout_ms = FLAGS_timeout_ms; 103 | opt.max_retry = FLAGS_max_retry; 104 | opt.interval_ms = FLAGS_interval_ms; 105 | 106 | // Create a chat client 107 | xllm_service::test::HelloClient client(server_address, opt); 108 | 109 | client.hello(); 110 | 111 | return 0; 112 | } 113 | -------------------------------------------------------------------------------- /xllm_service/http_service/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_binary) 2 | include(cc_library) 3 | 4 | cc_library( 5 | NAME 6 | xllm_http_service 7 | HDRS 8 | service.h 9 | request_tracer.h 10 | SRCS 11 | service.cpp 12 | request_tracer.cpp 13 | DEPS 14 | :common 15 | :request 16 | :scheduler 17 | absl::random_random 18 | absl::synchronization 19 | glog::glog 20 | nlohmann_json::nlohmann_json 21 | proto::proto_http_service 22 | proto_xllm 23 | ) 24 | target_link_libraries(xllm_http_service PRIVATE brpc-static leveldb::leveldb ZLIB::ZLIB protobuf::libprotobuf OpenSSL::SSL OpenSSL::Crypto) 25 | 26 | cc_binary( 27 | NAME 28 | xllm_http_serving 29 | SRCS 30 | main.cpp 31 | DEPS 32 | :xllm_http_service 33 | gflags::gflags 34 | ) 35 | -------------------------------------------------------------------------------- /xllm_service/http_service/main.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "common/global_gflags.h" 22 | #include "common/options.h" 23 | #include "http_service/service.h" 24 | 25 | int main(int argc, char** argv) { 26 | // Initialize gflags 27 | gflags::ParseCommandLineFlags(&argc, &argv, true); 28 | 29 | // Initialize glog 30 | google::InitGoogleLogging(argv[0]); 31 | FLAGS_logtostderr = true; 32 | 33 | LOG(INFO) << "Starting xllm http service, port: " << FLAGS_port; 34 | 35 | xllm_service::Options service_options; 36 | xllm_service::XllmHttpServiceImpl service_impl(service_options, nullptr); 37 | 38 | // register http methods here 39 | brpc::Server server; 40 | if (server.AddService(&service_impl, 41 | brpc::SERVER_DOESNT_OWN_SERVICE, 42 | "/hello => Hello," 43 | "/v1/completions => Completions,") != 0) { 44 | LOG(ERROR) << "Fail to add brpc http service"; 45 | return false; 46 | } 47 | 48 | brpc::ServerOptions options; 49 | options.idle_timeout_sec = FLAGS_idle_timeout_s; 50 | options.num_threads = FLAGS_num_threads; 51 | options.max_concurrency = FLAGS_max_concurrency; 52 | if (server.Start(FLAGS_port, &options) != 0) { 53 | LOG(ERROR) << "Failed to start brpc http server on port " << FLAGS_port; 54 | return false; 55 | } 56 | 57 | LOG(INFO) << "Xllm http server started on port " << FLAGS_port 58 | << ", idle_timeout_sec: " << FLAGS_idle_timeout_s 59 | << ", num_threads: " << FLAGS_num_threads 60 | << ", max_concurrency: " << 
FLAGS_max_concurrency; 61 | 62 | // Wait until Ctrl-C is pressed, then Stop() and Join() the server. 63 | server.RunUntilAskedToQuit(); 64 | 65 | return 0; 66 | } 67 | -------------------------------------------------------------------------------- /xllm_service/http_service/request_tracer.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "http_service/request_tracer.h" 17 | 18 | #include 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | namespace xllm_service { 26 | 27 | static std::string get_current_timestamp() { 28 | auto now = std::chrono::system_clock::now(); 29 | auto in_time_t = std::chrono::system_clock::to_time_t(now); 30 | 31 | std::stringstream ss; 32 | ss << std::put_time(std::localtime(&in_time_t), "%Y-%m-%d %H:%M:%S"); 33 | return ss.str(); 34 | } 35 | 36 | RequestTracer::RequestTracer(bool enable_request_trace) 37 | : enable_request_trace_(enable_request_trace) { 38 | if (!enable_request_trace_) return; 39 | std::filesystem::create_directories("trace"); 40 | log_stream_.open("trace/trace.json", std::ios::app); 41 | if (!log_stream_.is_open()) { 42 | LOG(ERROR) << "Failed to open log file: trace/trace.json"; 43 | } 44 | } 45 | 46 | void RequestTracer::log(const std::string& service_request_id, 47 | const std::string& input_or_output) { 48 | if (!enable_request_trace_) return; 49 | 50 | std::lock_guard lock(mutex_); 51 | std::string timestamp = get_current_timestamp(); 52 | 53 | nlohmann::json j; 54 | j["timestamp"] = timestamp; 55 | j["service_request_id"] = service_request_id; 56 | j["data"] = input_or_output; 57 | 58 | log_stream_ << j.dump() << "\n"; 59 | } 60 | } // namespace xllm_service -------------------------------------------------------------------------------- /xllm_service/http_service/request_tracer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | #include 18 | #include 19 | #include 20 | 21 | namespace xllm_service { 22 | 23 | class RequestTracer { 24 | public: 25 | RequestTracer(bool enable_request_trace); 26 | ~RequestTracer() = default; 27 | RequestTracer(const RequestTracer&) = delete; 28 | RequestTracer& operator=(const RequestTracer&) = delete; 29 | RequestTracer(RequestTracer&&) = delete; 30 | RequestTracer& operator=(RequestTracer&&) = delete; 31 | void log(const std::string& service_request_id, 32 | const std::string& input_or_output); 33 | 34 | private: 35 | std::ofstream log_stream_; 36 | std::mutex mutex_; 37 | bool enable_request_trace_ = false; 38 | }; 39 | } // namespace xllm_service -------------------------------------------------------------------------------- /xllm_service/http_service/service.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | 20 | #include 21 | #include 22 | 23 | #include "chat.pb.h" 24 | #include "common/call_data.h" 25 | #include "common/options.h" 26 | #include "common/threadpool.h" 27 | #include "common/types.h" 28 | #include "completion.pb.h" 29 | #include "request/request.h" 30 | #include "request_tracer.h" 31 | #include "xllm_http_service.pb.h" 32 | 33 | namespace xllm_service { 34 | 35 | class Scheduler; 36 | class InstanceMgr; 37 | class ClosureGuard; 38 | 39 | class XllmHttpServiceImpl : public proto::XllmHttpService { 40 | public: 41 | XllmHttpServiceImpl(const Options& options, Scheduler* scheduler); 42 | ~XllmHttpServiceImpl(); 43 | 44 | void Hello(::google::protobuf::RpcController* controller, 45 | const proto::HttpHelloRequest* request, 46 | proto::HttpHelloResponse* response, 47 | ::google::protobuf::Closure* done) override; 48 | 49 | void Completions(::google::protobuf::RpcController* controller, 50 | const proto::HttpRequest* request, 51 | proto::HttpResponse* response, 52 | ::google::protobuf::Closure* done) override; 53 | 54 | void ChatCompletions(::google::protobuf::RpcController* controller, 55 | const proto::HttpRequest* request, 56 | proto::HttpResponse* response, 57 | ::google::protobuf::Closure* done) override; 58 | 59 | void Embeddings(::google::protobuf::RpcController* controller, 60 | const proto::HttpRequest* request, 61 | proto::HttpResponse* response, 62 | ::google::protobuf::Closure* done) override; 63 | 64 | void Models(::google::protobuf::RpcController* controller, 65 | const proto::HttpRequest* request, 66 | proto::HttpResponse* response, 67 | ::google::protobuf::Closure* done) override; 68 | 69 | void Metrics(::google::protobuf::RpcController* controller, 70 | const proto::HttpRequest* request, 71 | 
proto::HttpResponse* response, 72 | ::google::protobuf::Closure* done) override; 73 | 74 | private: 75 | template 76 | std::shared_ptr generate_request(T* req_pb, 77 | const std::string& method); 78 | 79 | template 80 | void handle(std::shared_ptr call_data, 81 | const std::string& req_attachment, 82 | std::shared_ptr request, 83 | const std::string& method); 84 | 85 | void get_serving(const std::string& serving_method, 86 | ::google::protobuf::RpcController* controller, 87 | const proto::HttpRequest* request, 88 | proto::HttpResponse* response, 89 | ::google::protobuf::Closure* done); 90 | 91 | private: 92 | Options options_; 93 | 94 | // not own 95 | Scheduler* scheduler_; 96 | 97 | bool initialized_ = false; 98 | 99 | std::unique_ptr request_tracer_; 100 | 101 | std::unique_ptr thread_pool_; 102 | 103 | // In disagg pd mode, we support receive generated token from 104 | // prefill or from decode directly. 105 | // 1. 106 | // [service] ---req---> [prefill] ---req---> [decode] 107 | // [service] <---first resp--- [prefill] ---first resp---> [decode] 108 | // [service] <---resp--- [prefill] <---resp--- [decode] 109 | // 110 | // 2. 111 | // [service] ---req---> [prefill] ---req---> [decode] 112 | // [service] <---first resp-- [prefill] --first resp---> [decode] 113 | // [service] <---resp-- [decode] 114 | // 115 | bool enable_decode_response_to_service_ = false; 116 | }; 117 | 118 | } // namespace xllm_service 119 | -------------------------------------------------------------------------------- /xllm_service/master.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | 20 | #include 21 | 22 | #include "common/options.h" 23 | #include "http_service/service.h" 24 | #include "rpc_service/service.h" 25 | #include "scheduler/scheduler.h" 26 | 27 | namespace xllm_service { 28 | 29 | class Master { 30 | public: 31 | explicit Master(const Options& options); 32 | ~Master(); 33 | 34 | bool start(); 35 | void stop(); 36 | 37 | private: 38 | bool start_http_server(); 39 | bool start_rpc_server(); 40 | 41 | private: 42 | Options options_; 43 | 44 | // Scheduler for scheduling requests and instances 45 | std::unique_ptr scheduler_; 46 | 47 | // 1.For http service 48 | std::string http_server_address_; 49 | std::unique_ptr http_service_; 50 | brpc::Server http_server_; 51 | std::unique_ptr http_server_thread_; 52 | 53 | // 2.For rpc service 54 | std::string rpc_server_address_; 55 | std::unique_ptr rpc_service_; 56 | brpc::Server rpc_server_; 57 | std::unique_ptr rpc_server_thread_; 58 | }; 59 | 60 | } // namespace xllm_service 61 | -------------------------------------------------------------------------------- /xllm_service/proto/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(grpc_proto_library) 2 | include(proto_library) 3 | 4 | proto_library( 5 | NAME 6 | proto_rpc_service 7 | SRCS 8 | xllm_rpc_service.proto 9 | ) 10 | 11 | proto_library( 12 | NAME 13 | proto_http_service 14 | SRCS 15 | 
    xllm_http_service.proto
)

proto_library(
  NAME
    proto_xllm
  SRCS
    xllm/chat.proto
    xllm/common.proto
    xllm/completion.proto
)
--------------------------------------------------------------------------------
/xllm_service/proto/xllm/common.proto:
--------------------------------------------------------------------------------
syntax = "proto3";

package xllm_service.llm.proto;

// Token accounting block shared by the OpenAI-compatible responses.
message Usage {
  // the number of tokens in the prompt.
  optional int32 prompt_tokens = 1 [json_name="prompt_tokens"];

  // the number of tokens in the generated completion.
  optional int32 completion_tokens = 2 [json_name="completion_tokens"];

  // the total number of tokens used in the request (prompt + completion).
  optional int32 total_tokens = 3 [json_name="total_tokens"];
}

// Options for streaming response.
message StreamOptions{
  // if set, an additional chunk with usage will be streamed before the data: [DONE] message.
  optional bool include_usage = 1;
}

// Generic boolean status reply.
message Status {
  bool ok = 1;
}

// Names of the prefill/decode instances chosen to serve a request.
message Routing {
  string prefill_name = 1;

  string decode_name = 2;
}
--------------------------------------------------------------------------------
/xllm_service/proto/xllm_http_service.proto:
--------------------------------------------------------------------------------
syntax = "proto3";

package xllm_service.proto;
// Arena allocation plus brpc-style (non-grpc) generated service stubs.
option cc_enable_arenas = true;
option cc_generic_services = true;

// Ping/pong payloads for the health-check endpoint.
message HttpHelloRequest {
  string ping = 1;
}

message HttpHelloResponse {
  string pong = 1;
}

// Empty shells: the HTTP bodies presumably travel via the brpc attachment
// (the service handlers read the controller, not these messages).
message HttpRequest {
}

message HttpResponse {
}

service XllmHttpService {
  rpc Hello(HttpHelloRequest) returns (HttpHelloResponse) {}
  rpc Completions (HttpRequest) returns (HttpResponse) {}
  rpc ChatCompletions (HttpRequest) returns (HttpResponse) {}
  rpc Embeddings (HttpRequest) returns (HttpResponse) {}
  rpc Models (HttpRequest) returns (HttpResponse) {}
  rpc Metrics (HttpRequest) returns (HttpResponse) {}
}
--------------------------------------------------------------------------------
/xllm_service/proto/xllm_rpc_service.proto:
--------------------------------------------------------------------------------
syntax = "proto3";

package xllm_service.proto;
option cc_generic_services = true;

message Empty {}

// Generic boolean status reply.
message Status {
  bool ok = 1;
}

// One Status per generation forwarded in a batched Generations call.
message StatusSet {
  repeated Status all_status = 1;
}

// Integer status; values map to the C++ ErrorCode enum (see ConvertErrorCode
// usage in rpc_service/client.cpp).
message StatusCode {
  int32 status_code = 1;
}

// Role of an instance in disaggregated prefill/decode serving.
enum InstanceType {
  DEFAULT = 0;
  PREFILL = 1;
  DECODE = 2;
}

// Per-layer KV-cache addresses of one worker.
message WorkerKVAddr {
  repeated uint64 layer_addrs = 1;
}

message InstanceMetaInfo {
  // http server address currently
  string name = 1;
  // rpc server address
  string rpc_address = 2;
  optional InstanceType type = 3;
  repeated uint64 cluster_ids = 4;
  // NOTE(review): field number 8 sits between 4 and 5 — presumably appended
  // later; valid proto, but confirm ordering against the instance side.
  repeated string addrs = 8;
  repeated int64 k_cache_ids = 5;
  repeated int64 v_cache_ids = 6;
  int32 dp_size = 7;
}

// KV-cache block changes reported by an instance since its last heartbeat:
// hashes of blocks stored, removed, or offloaded.
message KvCacheEvent {
  repeated bytes stored_cache = 1;
  repeated bytes removed_cache = 2;
  repeated bytes offload_cache = 3;
}

// Instantaneous load signals used for load balancing.
message LoadMetrics {
  uint64 waiting_requests_num = 1;
  float gpu_cache_usage_perc = 2;
}

// Recent latency maxima; ttft = time to first token, tbt presumably
// time between tokens — confirm units against the instance side.
message LatencyMetrics {
  int64 recent_max_ttft = 1;
  int64 recent_max_tbt = 2;
}

// Periodic liveness + metrics report sent by an instance to the master.
message HeartbeatRequest {
  string name = 1;
  KvCacheEvent cache_event = 2;
  LoadMetrics load_metrics = 3;
  LatencyMetrics latency_metrics = 4;
}

message InstanceID {
  string name = 1;
}

message InstanceIDs {
  repeated string names = 1;
}


message OutputUsage {
  // the number of tokens in the prompt.
  int32 num_prompt_tokens = 1;
  // the number of tokens in the generated completion.
  int32 num_generated_tokens = 2;
  // the total number of tokens used in the request (prompt + completion).
  int32 num_total_tokens = 3;
}

message LogProbData {
  // the text of the token
  string token = 1;
  // the token id
  int32 token_id = 2;
  // the log probability of the token
  float logprob = 3;
  // whether the token is finished
  bool finished_token = 4;
}

message LogProb {
  LogProbData log_prob_data = 1;
  repeated LogProbData top_logprobs = 2;
}

message SequenceOutput {
  // the index of the sequence in the request.
  int32 index = 1;
  // the generated/delta text.
  // delta text is the text generated since the last response for streaming.
  string text = 2;
  // the token ids of the generated text.
  repeated int32 token_ids = 3;
  // the reason the sequence finished.
  string finish_reason = 4;
  // log probabilities of the generated tokens.
  repeated LogProb logprobs = 5;
}

// Request-level generation status (code plus human-readable message).
message GenerationStatus {
  int32 status_code = 1;
  string status_msg = 2;
}

// Stream response token to prefill instance from decode.
message DisaggStreamGeneration {
  // req id
  string req_id = 1;
  // req id which is generated in xllm service.
  string service_req_id = 2;
  // the status of the request
  GenerationStatus gen_status = 3;
  // maybe multi sequences in the request
  repeated SequenceOutput outputs = 4;
  OutputUsage usage = 5;
  bool finished = 6;
}

// Batch wrapper so multiple generations can share one RPC.
message DisaggStreamGenerations {
  repeated DisaggStreamGeneration gens = 1;
}

// Service-side configuration handed to instances via GetConfig.
message ServiceConfig {
  bool enable_decode_response_to_service = 1;
}

service XllmRpcService {
  rpc Hello(Empty) returns (Status) {}
  rpc RegisterInstance(InstanceMetaInfo) returns (StatusCode) {}
  rpc GetInstanceInfo(InstanceID) returns (InstanceMetaInfo) {}
  rpc Heartbeat(HeartbeatRequest) returns (Status) {}
  rpc GetStaticDecodeList(InstanceID) returns (InstanceIDs) {}
  rpc GetConfig(Empty) returns (ServiceConfig) {}

  // xllm service receive response from decode instance directly in disagg pd mode.
  // This can eliminate the cost brought by forwarding through prefill.
  // Returns one Status per forwarded generation (see StatusSet).
  rpc Generations(DisaggStreamGenerations) returns (StatusSet) {}
}

--------------------------------------------------------------------------------
/xllm_service/request/CMakeLists.txt:
--------------------------------------------------------------------------------
include(cc_library)

cc_library(
  NAME
    request
  HDRS
    request.h
  DEPS
    :common
)
--------------------------------------------------------------------------------
/xllm_service/request/request.h:
--------------------------------------------------------------------------------
/* Copyright 2025 The xLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://github.com/jd-opensource/xllm-service/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "chat_template/jinja_chat_template.h" 19 | #include "common/types.h" 20 | #include "common/xllm/output.h" 21 | 22 | namespace xllm_service { 23 | 24 | // Store request-related data 25 | struct Request { 26 | // model name 27 | std::string model; 28 | 29 | // request id generated by service 30 | std::string service_request_id; 31 | 32 | // whether to stream the response 33 | bool stream = false; 34 | 35 | // whether to return usage 36 | bool include_usage = false; 37 | 38 | // input prompt 39 | std::string prompt; 40 | 41 | // input messages 42 | ChatMessages messages; 43 | 44 | // token ids of prompt 45 | std::vector token_ids; 46 | 47 | // instance routing 48 | Routing routing; 49 | 50 | // the estimated TTFT obtained from the TTFT predictor 51 | int64_t estimated_ttft = 0; 52 | 53 | // output callback 54 | OutputCallback output_callback; 55 | 56 | // trace callback 57 | std::function trace_callback = nullptr; 58 | }; 59 | 60 | } // namespace xllm_service -------------------------------------------------------------------------------- /xllm_service/rpc_service/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_binary) 2 | include(cc_library) 3 | include(cc_test) 4 | 5 | cc_library( 6 | NAME 7 | xllm_rpc_service 8 | HDRS 9 | service.h 10 | SRCS 11 | service.cpp 12 | DEPS 13 | :common 14 | :scheduler 15 | absl::random_random 16 | absl::strings 17 | glog::glog 18 | proto::proto_rpc_service 19 | proto_xllm 20 | tokenizer 21 | chat_template 22 | ) 23 | target_link_libraries(xllm_rpc_service PRIVATE brpc-static) 24 | 25 | cc_binary( 26 | NAME 27 | xllm_rpc_service_test 28 | SRCS 29 | rpc_service_test.cpp 30 | DEPS 31 | :xllm_rpc_service 32 | gflags::gflags 33 | glog::glog 34 | GTest::gtest_main 35 | ) 36 | add_test(NAME XllmRpcServiceTest COMMAND xllm_rpc_service_test) 37 | 38 | 
cc_binary( 39 | NAME 40 | xllm_rpc_serving 41 | SRCS 42 | main.cpp 43 | DEPS 44 | :xllm_rpc_service 45 | gflags::gflags 46 | ) 47 | 48 | cc_library( 49 | NAME 50 | xllm_rpc_client 51 | HDRS 52 | client.h 53 | SRCS 54 | client.cpp 55 | DEPS 56 | :common 57 | glog::glog 58 | proto::proto_rpc_service 59 | ) 60 | target_link_libraries(xllm_rpc_client PRIVATE brpc-static) 61 | -------------------------------------------------------------------------------- /xllm_service/rpc_service/client.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "rpc_service/client.h" 17 | 18 | #include 19 | 20 | namespace xllm_service { 21 | 22 | // magic number, TODO: move to config file or env var 23 | static constexpr int kHeartbeatInterval = 3; // in seconds 24 | 25 | XllmRpcClient::XllmRpcClient(const std::string& instace_name, 26 | const std::string& master_addr, 27 | const ChannelOptions& options) 28 | : instance_name_(instace_name), master_addr_(master_addr) { 29 | brpc::ChannelOptions chan_options; 30 | chan_options.protocol = options.protocol; 31 | chan_options.connection_type = options.connection_type; 32 | chan_options.timeout_ms = options.timeout_ms /*milliseconds*/; 33 | chan_options.max_retry = options.max_retry; 34 | if (master_channel_.Init(master_addr_.c_str(), 35 | options.load_balancer.c_str(), 36 | &chan_options) != 0) { 37 | LOG(ERROR) << "Fail to initialize brpc channel to server " << master_addr_; 38 | return; 39 | } 40 | master_stub_ = std::make_unique(&master_channel_); 41 | 42 | // heartbeat thread 43 | heartbeat_thread_ = 44 | std::make_unique(&XllmRpcClient::heartbeat, this); 45 | } 46 | 47 | XllmRpcClient::~XllmRpcClient() { 48 | exited_ = true; 49 | if (heartbeat_thread_) { 50 | heartbeat_thread_->join(); 51 | } 52 | } 53 | 54 | // TODO: send metainfo/metrics to master ? 
55 | void XllmRpcClient::heartbeat() { 56 | while (!exited_) { 57 | std::this_thread::sleep_for(std::chrono::seconds(kHeartbeatInterval)); 58 | if (!register_inst_done_) continue; 59 | 60 | brpc::Controller cntl; 61 | proto::HeartbeatRequest req; 62 | req.set_name(instance_name_); 63 | // TODO: set req.cache_event and req.load_metrics 64 | proto::Status res; 65 | master_stub_->Heartbeat(&cntl, &req, &res, nullptr); 66 | if (cntl.Failed()) { 67 | LOG(ERROR) << instance_name_ 68 | << " failed to send heartbeat to master: " << cntl.ErrorText(); 69 | ; 70 | } else if (!res.ok()) { 71 | LOG(ERROR) << instance_name_ 72 | << " failed to send heartbeat to master, status: " << res.ok(); 73 | } 74 | } 75 | } 76 | 77 | ErrorCode XllmRpcClient::register_instance() { 78 | InstanceMetaInfo metainfo; 79 | metainfo.name = instance_name_; 80 | return register_instance(metainfo); 81 | } 82 | 83 | ErrorCode XllmRpcClient::register_instance(const InstanceMetaInfo& metainfo) { 84 | brpc::Controller cntl; 85 | proto::InstanceMetaInfo req; 86 | req.set_name(metainfo.name); 87 | if (metainfo.type == InstanceType::PREFILL) { 88 | req.set_type(proto::InstanceType::PREFILL); 89 | } else if (metainfo.type == InstanceType::DECODE) { 90 | req.set_type(proto::InstanceType::DECODE); 91 | } else { 92 | req.set_type(proto::InstanceType::DEFAULT); 93 | } 94 | proto::StatusCode res; 95 | master_stub_->RegisterInstance(&cntl, &req, &res, nullptr); 96 | if (cntl.Failed()) { 97 | LOG(ERROR) << instance_name_ 98 | << " failed to send register_instance to master: " 99 | << cntl.ErrorText(); 100 | ; 101 | } else if (res.status_code() != ConvertErrorCode::to_int(ErrorCode::OK)) { 102 | LOG(ERROR) << instance_name_ 103 | << " failed to send register_instance to master: " 104 | << "res = " << res.status_code(); 105 | } else { 106 | // register instance success 107 | register_inst_done_ = true; 108 | } 109 | return ConvertErrorCode::from_int(res.status_code()); 110 | } 111 | 112 | } // namespace xllm_service 
113 | -------------------------------------------------------------------------------- /xllm_service/rpc_service/client.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | #include 22 | #include 23 | 24 | #include "common/types.h" 25 | #include "xllm_rpc_service.pb.h" 26 | 27 | namespace xllm_service { 28 | 29 | struct ChannelOptions { 30 | std::string protocol = "baidu_std"; 31 | std::string connection_type = ""; 32 | std::string load_balancer = ""; 33 | int timeout_ms = 100; 34 | int max_retry = 3; 35 | int interval_ms = 1000; 36 | }; 37 | 38 | class XllmRpcClient { 39 | public: 40 | XllmRpcClient(const std::string& instace_name, 41 | const std::string& master_addr, 42 | const ChannelOptions& options); 43 | ~XllmRpcClient(); 44 | 45 | ErrorCode register_instance(); 46 | ErrorCode register_instance(const InstanceMetaInfo& metainfo); 47 | 48 | private: 49 | void heartbeat(); 50 | 51 | private: 52 | bool exited_ = false; 53 | bool register_inst_done_ = false; 54 | // instance rdma address or other info: ip port 55 | std::string instance_name_; 56 | std::string master_addr_; 57 | brpc::Channel master_channel_; 58 | std::unique_ptr master_stub_; 59 
| std::unique_ptr heartbeat_thread_; 60 | }; 61 | 62 | } // namespace xllm_service 63 | -------------------------------------------------------------------------------- /xllm_service/rpc_service/main.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | #include "common/global_gflags.h" 21 | #include "common/types.h" 22 | #include "common/utils.h" 23 | #include "rpc_service/service.h" 24 | 25 | int main(int argc, char* argv[]) { 26 | // Initialize gflags 27 | gflags::ParseCommandLineFlags(&argc, &argv, true); 28 | 29 | // Initialize glog 30 | google::InitGoogleLogging(argv[0]); 31 | 32 | LOG(INFO) << "Dump all gflags: " << std::endl 33 | << google::CommandlineFlagsIntoString(); 34 | google::FlushLogFiles(google::INFO); 35 | 36 | LOG(INFO) << "Starting xllm rpc service, port: " << FLAGS_port; 37 | 38 | if (!xllm_service::utils::is_port_available(FLAGS_port)) { 39 | LOG(ERROR) << "Port " << FLAGS_port << " is already in use. 
" 40 | << "Please specify a different port using --port flag."; 41 | return 1; 42 | } 43 | 44 | // create xllm service 45 | xllm_service::Options service_options; 46 | xllm_service::XllmRpcService service(service_options, nullptr); 47 | 48 | // Initialize brpc server 49 | std::string server_address = "0.0.0.0:" + std::to_string(FLAGS_port); 50 | brpc::Server server; 51 | if (server.AddService(&service, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) { 52 | LOG(ERROR) << "Failed to add service to server"; 53 | return -1; 54 | } 55 | 56 | butil::EndPoint endpoint; 57 | if (!FLAGS_listen_addr.empty()) { 58 | if (butil::str2endpoint(FLAGS_listen_addr.c_str(), &endpoint) < 0) { 59 | LOG(ERROR) << "Invalid listen address:" << FLAGS_listen_addr; 60 | return -1; 61 | } 62 | } else { 63 | endpoint = butil::EndPoint(butil::IP_ANY, FLAGS_port); 64 | } 65 | 66 | // Start the server. 67 | brpc::ServerOptions options; 68 | options.idle_timeout_sec = FLAGS_idle_timeout_s; 69 | options.num_threads = FLAGS_num_threads; 70 | options.max_concurrency = FLAGS_max_concurrency; 71 | options.idle_timeout_sec = FLAGS_idle_timeout_s; 72 | if (server.Start(endpoint, &options) != 0) { 73 | LOG(ERROR) << "Fail to start Brpc rpc server"; 74 | return -1; 75 | } 76 | 77 | LOG(INFO) << "Xllm rpc service listening on " << server_address; 78 | 79 | // Wait until Ctrl-C is pressed, then Stop() and Join() the server. 80 | server.RunUntilAskedToQuit(); 81 | 82 | return 0; 83 | } 84 | -------------------------------------------------------------------------------- /xllm_service/rpc_service/rpc_service_test.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include 17 | #include 18 | 19 | #include "rpc_service/service.h" 20 | 21 | namespace xllm_service::test { 22 | 23 | class XllmRpcServiceTest : public ::testing::Test { 24 | protected: 25 | void SetUp() override { google::InitGoogleLogging("XllmRpcServiceTest"); } 26 | 27 | void TearDown() override { google::ShutdownGoogleLogging(); } 28 | }; 29 | // TODO 30 | // TEST_F(XllmRpcServiceTest, RegisterInstance) { 31 | // RpcServiceConfig config; 32 | // HttpServiceConfig http_config; 33 | // ModelConfig model_config; 34 | // auto xllm_service = 35 | // std::make_shared(config, model_config, 36 | // http_config); 37 | // std::string inst_name = "127.0.0.1@nic0"; 38 | // InstanceMetaInfo metainfo(inst_name, "127.0.0.1:7777", 39 | // InstanceType::PREFILL); EXPECT_EQ(ErrorCode::OK, 40 | // xllm_service->register_instance(inst_name, metainfo)); 41 | 42 | // metainfo.type = InstanceType::DECODE; 43 | // EXPECT_EQ(ErrorCode::INSTANCE_EXISTED, 44 | // xllm_service->register_instance(inst_name, metainfo)); 45 | // } 46 | 47 | // TEST_F(XllmRpcServiceTest, UpdateInstanceMetainfo) { 48 | // RpcServiceConfig config; 49 | // HttpServiceConfig http_config; 50 | // ModelConfig model_config; 51 | // auto xllm_service = 52 | // std::make_shared(config, model_config, 53 | // http_config); 54 | // std::string inst_name = "127.0.0.1@nic0"; 55 | // InstanceMetaInfo metainfo(inst_name, "127.0.0.1:7777", 56 | // InstanceType::PREFILL); 
EXPECT_EQ(ErrorCode::OK, 57 | // xllm_service->register_instance(inst_name, metainfo)); 58 | // metainfo.type = InstanceType::DECODE; 59 | // EXPECT_EQ(ErrorCode::OK, 60 | // xllm_service->update_instance_metainfo(inst_name, metainfo)); 61 | 62 | // std::string inst_name2 = "127.0.0.1@nic2"; 63 | // InstanceMetaInfo metainfo2( 64 | // inst_name2, "127.0.0.1:7778", InstanceType::PREFILL); 65 | // EXPECT_EQ(ErrorCode::INSTANCE_NOT_EXISTED, 66 | // xllm_service->update_instance_metainfo(inst_name2, metainfo)); 67 | // } 68 | 69 | } // namespace xllm_service::test 70 | -------------------------------------------------------------------------------- /xllm_service/rpc_service/service.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | #include "chat.pb.h" 22 | #include "common/options.h" 23 | #include "common/types.h" 24 | #include "common/xllm/output.h" 25 | #include "common/xllm/status.h" 26 | #include "completion.pb.h" 27 | #include "xllm_rpc_service.pb.h" 28 | 29 | namespace xllm_service { 30 | 31 | struct ServiceConfig { 32 | ServiceConfig(bool decode_to_service) 33 | : enable_decode_response_to_service(decode_to_service) {} 34 | 35 | bool enable_decode_response_to_service = false; 36 | }; 37 | 38 | class Scheduler; 39 | class InstanceMgr; 40 | 41 | class XllmRpcServiceImpl final { 42 | public: 43 | XllmRpcServiceImpl(const Options& options, Scheduler* scheduler); 44 | ~XllmRpcServiceImpl(); 45 | 46 | void heartbeat(const proto::HeartbeatRequest* req); 47 | 48 | InstanceMetaInfo get_instance_info(const std::string& instance_name); 49 | 50 | ServiceConfig get_config(); 51 | 52 | std::vector get_static_decode_list( 53 | const std::string& prefill_name); 54 | 55 | public: 56 | // handle generations from prefill/decode instance 57 | bool handle_generation(const llm::RequestOutput& request_output); 58 | 59 | private: 60 | Options options_; 61 | 62 | // not own 63 | Scheduler* scheduler_; 64 | 65 | // In disagg pd mode, we support receive generated token from 66 | // prefill or from decode directly. 67 | // 1. 68 | // [service] ---req---> [prefill] ---req---> [decode] 69 | // [service] <---first resp--- [prefill] ---first resp---> [decode] 70 | // [service] <---resp--- [prefill] <---resp--- [decode] 71 | // 72 | // 2. 
73 | // [service] ---req---> [prefill] ---req---> [decode] 74 | // [service] <---first resp-- [prefill] --first resp---> [decode] 75 | // [service] <---resp-- [decode] 76 | // 77 | bool enable_decode_response_to_service_ = false; 78 | }; 79 | 80 | // parse proto data and call XllmRpcService 81 | class XllmRpcService : public proto::XllmRpcService { 82 | public: 83 | explicit XllmRpcService(const Options& options, Scheduler* scheduler); 84 | virtual ~XllmRpcService(); 85 | 86 | virtual void Hello(google::protobuf::RpcController* cntl_base, 87 | const proto::Empty* req, 88 | proto::Status* resp, 89 | google::protobuf::Closure* done) override; 90 | 91 | virtual void Heartbeat(google::protobuf::RpcController* cntl_base, 92 | const proto::HeartbeatRequest* req, 93 | proto::Status* resp, 94 | google::protobuf::Closure* done) override; 95 | 96 | virtual void GetInstanceInfo(google::protobuf::RpcController* cntl_base, 97 | const proto::InstanceID* req, 98 | proto::InstanceMetaInfo* resp, 99 | google::protobuf::Closure* done) override; 100 | 101 | virtual void GetStaticDecodeList(google::protobuf::RpcController* cntl_base, 102 | const proto::InstanceID* req, 103 | proto::InstanceIDs* resp, 104 | google::protobuf::Closure* done) override; 105 | 106 | // xllm service receive response from decode instance directly in disagg pd 107 | // mode. This can eliminate the cost brought by forwarding through prefill. 
108 | virtual void Generations(google::protobuf::RpcController* cntl_base, 109 | const proto::DisaggStreamGenerations* req, 110 | proto::StatusSet* resp, 111 | google::protobuf::Closure* done) override; 112 | 113 | virtual void GetConfig(google::protobuf::RpcController* cntl_base, 114 | const proto::Empty* req, 115 | proto::ServiceConfig* resp, 116 | google::protobuf::Closure* done) override; 117 | 118 | private: 119 | std::unique_ptr xllm_rpc_service_impl_; 120 | }; 121 | 122 | } // namespace xllm_service 123 | -------------------------------------------------------------------------------- /xllm_service/scheduler/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | add_subdirectory(etcd_client) 4 | add_subdirectory(managers) 5 | add_subdirectory(loadbalance_policy) 6 | 7 | cc_library( 8 | NAME 9 | scheduler 10 | HDRS 11 | response_handler.h 12 | scheduler.h 13 | SRCS 14 | response_handler.cpp 15 | scheduler.cpp 16 | DEPS 17 | :chat_template 18 | :common 19 | :etcd_client 20 | :loadbalance_policy 21 | :managers 22 | :request 23 | cpprest 24 | etcd-cpp-api 25 | glog::glog 26 | nlohmann_json::nlohmann_json 27 | ) -------------------------------------------------------------------------------- /xllm_service/scheduler/etcd_client/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | cc_library( 5 | NAME 6 | etcd_client 7 | HDRS 8 | etcd_client.h 9 | SRCS 10 | etcd_client.cpp 11 | DEPS 12 | :common 13 | cpprest 14 | etcd-cpp-api 15 | glog::glog 16 | nlohmann_json::nlohmann_json 17 | ) -------------------------------------------------------------------------------- /xllm_service/scheduler/loadbalance_policy/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_binary) 2 | include(cc_library) 3 | include(cc_test) 4 | 5 | cc_library( 6 | NAME 7 | 
loadbalance_policy 8 | HDRS 9 | loadbalance_policy.h 10 | round_robin.h 11 | cache_aware_routing.h 12 | SRCS 13 | round_robin.cpp 14 | cache_aware_routing.cpp 15 | DEPS 16 | :chat_template 17 | :common 18 | :managers 19 | ) 20 | -------------------------------------------------------------------------------- /xllm_service/scheduler/loadbalance_policy/cache_aware_routing.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "cache_aware_routing.h" 17 | 18 | namespace xllm_service { 19 | 20 | constexpr float MIN_SCORE = -2.0; 21 | 22 | bool CacheAwareRouting::select_instances_pair( 23 |     std::shared_ptr request) { 24 |   LoadBalanceInfos lb_infos; 25 |   if (!request->token_ids.empty()) { 26 |     Slice token_ids(request->token_ids.data(), 27 |                              request->token_ids.size()); 28 |     global_kvcache_mgr_->match(token_ids, &lb_infos.overlap_scores); 29 |     DLOG(INFO) << lb_infos.debug_string(); 30 |   } 31 | 32 |   instance_mgr_->get_load_metrics(&lb_infos); 33 |   DLOG(INFO) << lb_infos.debug_string(); 34 | 35 |   if (lb_infos.prefill_load_metrics.size() == 0) { 36 |     LOG(INFO) << "No node available!"; 37 |     return false; 38 |   } 39 | 40 |   // find prefill 41 |   cost_function(lb_infos.overlap_scores.hbm_instance_score, 42 |                 lb_infos.overlap_scores.max_block_num, 43 |                 lb_infos.prefill_load_metrics, 44 |                 lb_infos.prefill_max_waiting_requests_num, 45 |                 &request->routing.prefill_name); 46 | 47 |   // find decode 48 |   if (lb_infos.decode_load_metrics.size()) { 49 |     cost_function(lb_infos.overlap_scores.hbm_instance_score, 50 |                   lb_infos.overlap_scores.max_block_num, 51 |                   lb_infos.decode_load_metrics, 52 |                   lb_infos.decode_max_waiting_requests_num, 53 |                   &request->routing.decode_name); 54 |   } 55 | 56 |   return true; 57 | } 58 | 59 | void CacheAwareRouting::cost_function( 60 |     const std::unordered_map& overlap_scores, 61 |     const uint32_t& max_block_num, 62 |     const std::unordered_map& load_metrics, 63 |     const int64_t& max_waiting_requests_num, 64 |     std::string* best_choice) { 65 |   float best_score = MIN_SCORE; 66 |   for (const auto& it : load_metrics) { 67 |     const auto matched_blocks_it = overlap_scores.find(it.first); 68 |     uint32_t matched_blocks = 0; 69 |     if (matched_blocks_it != overlap_scores.end()) { 70 |       matched_blocks = matched_blocks_it->second; 71 |     } 72 | 73 |     auto score =  // float ratios: integer division would truncate every partial match to 0 74 |         (max_block_num == 0 ?
 0.0f : static_cast<float>(matched_blocks) / max_block_num) - 75 |         it.second.gpu_cache_usage_perc - 76 |         (max_waiting_requests_num == 0 77 |              ? 0.0f 78 |              : static_cast<float>(it.second.waiting_requests_num) / max_waiting_requests_num); 79 | 80 |     if (score > best_score) { 81 |       best_score = score; 82 |       *best_choice = it.first; 83 |     } 84 |   } 85 | } 86 | 87 | }  // namespace xllm_service 88 | -------------------------------------------------------------------------------- /xllm_service/scheduler/loadbalance_policy/cache_aware_routing.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 |     https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "common/macros.h" 19 | #include "loadbalance_policy.h" 20 | #include "scheduler/managers/global_kvcache_mgr.h" 21 | 22 | namespace xllm_service { 23 | 24 | class CacheAwareRouting final : public LoadBalancePolicy { 25 | public: 26 | CacheAwareRouting(std::shared_ptr instance_mgr, 27 | std::shared_ptr global_kvcache_mgr) 28 | : global_kvcache_mgr_(global_kvcache_mgr), 29 | LoadBalancePolicy(instance_mgr) {}; 30 | 31 | virtual ~CacheAwareRouting() = default; 32 | 33 | bool select_instances_pair(std::shared_ptr request) override; 34 | 35 | private: 36 | DISALLOW_COPY_AND_ASSIGN(CacheAwareRouting); 37 | 38 | void cost_function( 39 | const std::unordered_map& overlap_scores, 40 | const uint32_t& max_block_num, 41 | const std::unordered_map& load_metrics, 42 | const int64_t& max_waiting_requests_num, 43 | std::string* best_choice); 44 | 45 | std::shared_ptr global_kvcache_mgr_; 46 | }; 47 | 48 | } // namespace xllm_service 49 | -------------------------------------------------------------------------------- /xllm_service/scheduler/loadbalance_policy/loadbalance_policy.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "common/types.h" 19 | #include "request/request.h" 20 | #include "scheduler/managers/instance_mgr.h" 21 | 22 | namespace xllm_service { 23 | 24 | class LoadBalancePolicy { 25 | public: 26 | LoadBalancePolicy(std::shared_ptr instance_mgr) 27 | : instance_mgr_(instance_mgr) {} 28 | 29 | virtual ~LoadBalancePolicy() = default; 30 | 31 | virtual bool select_instances_pair(std::shared_ptr request) = 0; 32 | 33 | protected: 34 | std::shared_ptr instance_mgr_; 35 | }; 36 | 37 | } // namespace xllm_service 38 | -------------------------------------------------------------------------------- /xllm_service/scheduler/loadbalance_policy/round_robin.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "round_robin.h" 17 | 18 | namespace xllm_service { 19 | 20 | bool RoundRobin::select_instances_pair(std::shared_ptr request) { 21 | return instance_mgr_->get_next_instance_pair(&request->routing); 22 | } 23 | 24 | } // namespace xllm_service 25 | -------------------------------------------------------------------------------- /xllm_service/scheduler/loadbalance_policy/round_robin.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "common/macros.h" 19 | #include "loadbalance_policy.h" 20 | 21 | namespace xllm_service { 22 | 23 | class RoundRobin final : public LoadBalancePolicy { 24 | public: 25 | RoundRobin(std::shared_ptr instance_mgr) 26 | : LoadBalancePolicy(instance_mgr) {}; 27 | 28 | virtual ~RoundRobin() = default; 29 | 30 | bool select_instances_pair(std::shared_ptr request) override; 31 | 32 | private: 33 | DISALLOW_COPY_AND_ASSIGN(RoundRobin); 34 | }; 35 | 36 | } // namespace xllm_service 37 | -------------------------------------------------------------------------------- /xllm_service/scheduler/managers/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | cc_library( 5 | NAME 6 | managers 7 | HDRS 8 | instance_mgr.h 9 | global_kvcache_mgr.h 10 | SRCS 11 | instance_mgr.cpp 12 | global_kvcache_mgr.cpp 13 | DEPS 14 | :chat_template 15 | :common 16 | :etcd_client 17 | :request 18 | absl::random_random 19 | absl::strings 20 | glog::glog 21 | proto::proto_rpc_service 22 | proto_xllm 23 | ) 24 | target_link_libraries(managers PRIVATE brpc-static) 25 | -------------------------------------------------------------------------------- /xllm_service/scheduler/managers/global_kvcache_mgr.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | #include "../etcd_client/etcd_client.h" 22 | #include "common/hash_util.h" 23 | #include "common/macros.h" 24 | #include "common/options.h" 25 | #include "common/slice.h" 26 | #include "common/threadpool.h" 27 | #include "common/types.h" 28 | #include "xllm_rpc_service.pb.h" 29 | 30 | namespace xllm_service { 31 | 32 | class GlobalKVCacheMgr final { 33 | public: 34 | explicit GlobalKVCacheMgr(const Options& options, 35 | const std::shared_ptr& etcd_client, 36 | const bool is_master_service); 37 | ~GlobalKVCacheMgr(); 38 | 39 | void match(const Slice& token_ids, OverlapScores* overlap_scores); 40 | 41 | void record_updated_kvcaches(const std::string& instance_name, 42 | const proto::KvCacheEvent& kvcache_event); 43 | bool upload_kvcache(); 44 | 45 | void set_as_master(); 46 | 47 | private: 48 | DISALLOW_COPY_AND_ASSIGN(GlobalKVCacheMgr); 49 | 50 | void update_kvcache(const etcd::Response& response, 51 | const uint64_t prefix_len); 52 | 53 | private: 54 | Options options_; 55 | std::atomic_bool is_master_service_ = false; 56 | bool exited_ = false; 57 | std::shared_mutex kvcache_mutex_; 58 | Murmur3KeyCacheMap kvcache_infos_; 59 | std::shared_ptr etcd_client_; // not own 60 | 61 | std::mutex update_mutex_; 62 | Murmur3KeyCacheMap updated_kvcaches_; 63 | 64 | ThreadPool threadpool_; 65 | }; 66 | 67 | } // namespace xllm_service 68 | 
-------------------------------------------------------------------------------- /xllm_service/scheduler/managers/instance_mgr.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #include "common/macros.h" 26 | #include "common/options.h" 27 | #include "common/threadpool.h" 28 | #include "common/ttft_predictor.h" 29 | #include "common/types.h" 30 | #include "request/request.h" 31 | #include "scheduler/etcd_client/etcd_client.h" 32 | #include "xllm_rpc_service.pb.h" 33 | 34 | namespace xllm_service { 35 | 36 | class InstanceMgr final { 37 | public: 38 | explicit InstanceMgr(const Options& options, 39 | const std::shared_ptr& etcd_client, 40 | const bool is_master_service); 41 | 42 | ~InstanceMgr(); 43 | 44 | InstanceMetaInfo get_instance_info(const std::string& instance_name); 45 | 46 | bool get_next_instance_pair(Routing* routing); 47 | 48 | std::vector get_static_decode_list( 49 | const std::string& instance_name); 50 | 51 | void get_load_metrics(LoadBalanceInfos* infos); 52 | 53 | std::shared_ptr get_channel(const std::string& instance_name); 54 | 55 | void record_load_metrics_update(const 
std::string& instance_name, 56 | const proto::LoadMetrics& load_metrics); 57 | bool upload_load_metrics(); 58 | 59 | // update the recent token latency metrics for the corresponding instance 60 | void update_latency_metrics(const std::string& instance_name, 61 | const proto::LatencyMetrics& latency_metrics); 62 | 63 | // update request metrics under different actions 64 | void update_request_metrics(std::shared_ptr request, 65 | RequestAction action); 66 | 67 | void set_as_master(); 68 | 69 | private: 70 | DISALLOW_COPY_AND_ASSIGN(InstanceMgr); 71 | 72 | void init(); 73 | 74 | bool create_channel(const std::string& target_uri); 75 | // use etcd as ServiceDiscovery 76 | void update_instance_metainfo(const etcd::Response& response, 77 | const uint64_t& prefix_len); 78 | 79 | void update_load_metrics(const etcd::Response& response, 80 | const uint64_t& prefix_len); 81 | 82 | private: 83 | Options options_; 84 | 85 | bool exited_ = false; 86 | bool use_etcd_ = false; 87 | std::atomic_bool is_master_service_ = false; 88 | 89 | std::shared_ptr etcd_client_; 90 | 91 | std::shared_mutex inst_mutex_; 92 | std::unordered_map instances_; 93 | std::vector prefill_index_; 94 | std::vector decode_index_; 95 | uint64_t next_prefill_index_ = 0; 96 | uint64_t next_decode_index_ = 0; 97 | 98 | std::shared_mutex load_metric_mutex_; 99 | std::unordered_map load_metrics_; 100 | std::unordered_map> 101 | cached_channels_; 102 | 103 | std::mutex update_mutex_; 104 | std::unordered_map updated_metrics_; 105 | std::unordered_set removed_instance_; 106 | 107 | // "instance name" -> "TtftPredictor" map 108 | std::mutex ttft_predictor_mutex_; 109 | std::unordered_map ttft_predictors_; 110 | 111 | // Record the latest token latency metrics for each instance, including TTFT 112 | // and TBT. 
113 | std::mutex latency_metrics_mutex_; 114 | std::unordered_map latency_metrics_; 115 | 116 | // Record the request metrics for each instance, including prefill token 117 | // count, prefill request count, estimated prefill execution time, decode 118 | // token count, and decode request count. 119 | std::mutex request_metrics_mutex_; 120 | std::unordered_map request_metrics_; 121 | 122 | ThreadPool threadpool_; 123 | }; 124 | 125 | } // namespace xllm_service 126 | -------------------------------------------------------------------------------- /xllm_service/scheduler/response_handler.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | #include "common/call_data.h" 22 | #include "common/threadpool.h" 23 | #include "common/xllm/output.h" 24 | #include "common/xllm/status.h" 25 | 26 | namespace xllm_service { 27 | 28 | class ResponseHandler final { 29 | public: 30 | ResponseHandler() = default; 31 | ~ResponseHandler() = default; 32 | 33 | bool send_delta_to_client(std::shared_ptr call_data, 34 | std::unordered_set* first_message_sent, 35 | bool include_usage, 36 | const std::string& request_id, 37 | int64_t created_time, 38 | const std::string& model, 39 | const llm::RequestOutput& output); 40 | bool send_result_to_client(std::shared_ptr call_data, 41 | const std::string& request_id, 42 | int64_t created_time, 43 | const std::string& model, 44 | const llm::RequestOutput& req_output); 45 | 46 | bool send_delta_to_client(std::shared_ptr call_data, 47 | bool include_usage, 48 | const std::string& request_id, 49 | int64_t created_time, 50 | const std::string& model, 51 | const llm::RequestOutput& output); 52 | bool send_result_to_client(std::shared_ptr call_data, 53 | const std::string& request_id, 54 | int64_t created_time, 55 | const std::string& model, 56 | const llm::RequestOutput& req_output); 57 | }; 58 | 59 | } // namespace xllm_service 60 | -------------------------------------------------------------------------------- /xllm_service/scheduler/scheduler.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "chat_template/jinja_chat_template.h" 19 | #include "common/call_data.h" 20 | #include "common/options.h" 21 | #include "common/threadpool.h" 22 | #include "common/xllm/output.h" 23 | #include "etcd_client/etcd_client.h" 24 | #include "loadbalance_policy/loadbalance_policy.h" 25 | #include "managers/global_kvcache_mgr.h" 26 | #include "managers/instance_mgr.h" 27 | #include "request/request.h" 28 | #include "response_handler.h" 29 | #include "tokenizer/tokenizer.h" 30 | #include "tokenizer/tokenizer_args.h" 31 | 32 | namespace xllm_service { 33 | 34 | // A scheduler for scheduling requests and instances 35 | class Scheduler final { 36 | public: 37 | Scheduler(const Options& options); 38 | ~Scheduler(); 39 | 40 | bool schedule(std::shared_ptr request); 41 | 42 | std::shared_ptr get_channel(const std::string& target_name); 43 | 44 | InstanceMetaInfo get_instance_info(const std::string& instance_name); 45 | 46 | std::vector get_static_decode_list( 47 | const std::string& instance_name); 48 | 49 | void handle_instance_heartbeat(const proto::HeartbeatRequest* req); 50 | 51 | void exited() { exited_ = true; } 52 | 53 | // register new requests from http service 54 | // keep http callback util request finished. 55 | // `handle_generation` will handle response with these callbacks. 
56 | bool record_new_request(std::shared_ptr call_data, 57 | std::shared_ptr request); 58 | bool record_new_request(std::shared_ptr call_data, 59 | std::shared_ptr request); 60 | void finish_request(const std::string& service_request_id, 61 | bool error = false); 62 | 63 | // handle generations from prefill/decode instance 64 | bool handle_generation(const llm::RequestOutput& request_output); 65 | 66 | // update request metrics for prefill finished request 67 | void update_request_metrics_for_prefill( 68 | const std::string& service_request_id); 69 | 70 | private: 71 | DISALLOW_COPY_AND_ASSIGN(Scheduler); 72 | 73 | void update_master_service_heartbeat(); 74 | 75 | void handle_master_service_watch(const etcd::Response& response, 76 | const uint64_t& prefix_len); 77 | 78 | Tokenizer* get_tls_tokenizer(); 79 | 80 | private: 81 | Options options_; 82 | 83 | bool exited_ = false; 84 | 85 | bool is_master_service_ = false; 86 | 87 | TokenizerArgs tokenizer_args_; 88 | 89 | // chat template instance 90 | std::unique_ptr chat_template_; 91 | 92 | std::shared_ptr etcd_client_; 93 | 94 | std::unique_ptr tokenizer_; 95 | 96 | std::shared_ptr instance_mgr_; 97 | 98 | std::shared_ptr global_kvcache_mgr_; 99 | 100 | std::unique_ptr lb_policy_; 101 | 102 | std::unique_ptr heartbeat_thread_; 103 | 104 | // `service request id` -> `request` map 105 | std::unordered_map> requests_; 106 | std::mutex request_mutex_; 107 | 108 | // use threadpool to handle all RequestOuputs queue 109 | static constexpr size_t kOutputTheadNum_ = 128; // magic num 110 | ThreadPool output_threadpools_[kOutputTheadNum_]; 111 | // A request will be handled in the same thread to guarantee the token's 112 | // order. 113 | std::unordered_map remote_requests_output_thread_map_; 114 | size_t next_thread_idx = 0; 115 | std::mutex thread_map_mutex_; 116 | 117 | // used when receive token from decode instance. 
118 | ResponseHandler response_handler_; 119 | }; 120 | 121 | } // namespace xllm_service -------------------------------------------------------------------------------- /xllm_service/tokenizer/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | add_subdirectory(tokenizers) 5 | 6 | cc_library( 7 | NAME 8 | tokenizer 9 | HDRS 10 | tokenizer_args.h 11 | tokenizer.h 12 | tokenizer_factory.h 13 | tiktoken_tokenizer.h 14 | sentencepiece_tokenizer.h 15 | fast_tokenizer.h 16 | SRCS 17 | tokenizer_args.cpp 18 | tokenizer_factory.cpp 19 | tiktoken_tokenizer.cpp 20 | sentencepiece_tokenizer.cpp 21 | fast_tokenizer.cpp 22 | DEPS 23 | :common 24 | :sentencepiece 25 | absl::flat_hash_map 26 | absl::strings 27 | glog::glog 28 | rust_tokenizers 29 | re2::re2 30 | ) 31 | 32 | -------------------------------------------------------------------------------- /xllm_service/tokenizer/fast_tokenizer.cpp: -------------------------------------------------------------------------------- 1 | #include "fast_tokenizer.h" 2 | 3 | #include 4 | 5 | namespace xllm_service { 6 | 7 | FastTokenizer::FastTokenizer(const std::string& tokenizer_json_path) 8 | : tokenizer_json_path_(tokenizer_json_path) { 9 | handle_ = tokenizers_new_from_path(tokenizer_json_path.c_str()); 10 | CHECK(handle_ != nullptr) 11 | << "Failed to load tokenizer from file: " << tokenizer_json_path; 12 | } 13 | 14 | std::unique_ptr FastTokenizer::clone() const { 15 | return std::make_unique(tokenizer_json_path_); 16 | } 17 | 18 | FastTokenizer::~FastTokenizer() { tokenizers_free(handle_); } 19 | 20 | bool FastTokenizer::encode(const std::string_view& text, 21 | std::vector* ids) const { 22 | TokenizerEncodeResult result; 23 | tokenizers_encode( 24 | handle_, text.data(), text.size(), /*add_special_tokens=*/1, &result); 25 | 26 | std::vector ret(result.token_ids, result.token_ids + result.len); 27 | *ids = std::move(ret); 28 | 29 | 
return true; 30 | } 31 | 32 | std::string FastTokenizer::decode(const Slice& ids, 33 | bool skip_special_tokens) const { 34 | const char* data = nullptr; 35 | size_t len = 0; 36 | tokenizers_decode(handle_, 37 | reinterpret_cast(ids.data()), 38 | ids.size(), 39 | skip_special_tokens, 40 | &data, 41 | &len); 42 | return {data, len}; 43 | } 44 | 45 | std::optional FastTokenizer::token_to_id( 46 | const std::string_view& token) const { 47 | int32_t id = -1; 48 | tokenizers_token_to_id(handle_, token.data(), token.size(), &id); 49 | return id == -1 ? std::optional(std::nullopt) 50 | : std::optional(id); 51 | } 52 | 53 | std::string FastTokenizer::id_to_token(int32_t id) const { 54 | const char* data = nullptr; 55 | size_t len = 0; 56 | tokenizers_id_to_token(handle_, id, &data, &len); 57 | return {data, len}; 58 | } 59 | 60 | size_t FastTokenizer::vocab_size() const { 61 | size_t size; 62 | tokenizers_get_vocab_size(handle_, &size); 63 | CHECK(size > 0) << "vocab_size must be greater than 0."; 64 | return size; 65 | } 66 | 67 | } // namespace xllm_service 68 | -------------------------------------------------------------------------------- /xllm_service/tokenizer/fast_tokenizer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | 19 | #include "tokenizer.h" 20 | #include "tokenizers/tokenizers.h" 21 | 22 | namespace xllm_service { 23 | 24 | class FastTokenizer : public Tokenizer { 25 | public: 26 | FastTokenizer(const std::string& tokenizer_json_path); 27 | 28 | ~FastTokenizer() override; 29 | 30 | bool encode(const std::string_view& text, 31 | std::vector* ids) const override; 32 | 33 | std::string decode(const Slice& ids, 34 | bool skip_special_tokens) const override; 35 | 36 | std::optional token_to_id( 37 | const std::string_view& token) const override; 38 | 39 | std::string id_to_token(int32_t id) const override; 40 | 41 | size_t vocab_size() const override; 42 | 43 | std::unique_ptr clone() const override; 44 | 45 | private: 46 | std::string tokenizer_json_path_; 47 | 48 | TokenizerHandle handle_ = nullptr; 49 | }; 50 | 51 | } // namespace xllm_service 52 | -------------------------------------------------------------------------------- /xllm_service/tokenizer/sentencepiece_tokenizer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | #include 19 | #include 20 | 21 | #include 22 | 23 | #include "sentencepiece/sentencepiece_processor.h" 24 | #include "tokenizer.h" 25 | #include "tokenizer_args.h" 26 | 27 | namespace xllm_service { 28 | 29 | // a tokenizer that uses google/SentencePiece 30 | class SentencePieceTokenizer : public Tokenizer { 31 | public: 32 | SentencePieceTokenizer(const std::string_view& dir_path, 33 | const TokenizerArgs& args); 34 | 35 | bool encode(const std::string_view& text, 36 | std::vector* ids) const override; 37 | 38 | std::string decode(const Slice& ids, 39 | bool skip_special_tokens) const override; 40 | 41 | std::optional token_to_id( 42 | const std::string_view& token) const override; 43 | 44 | std::string id_to_token(int32_t id) const override; 45 | 46 | size_t vocab_size() const override; 47 | 48 | std::unique_ptr clone() const override; 49 | 50 | private: 51 | void load_special_tokens(const std::vector& special_tokens); 52 | 53 | bool encode_internal(const std::string_view& text, 54 | std::vector* ids) const; 55 | void decode_internal(const Slice& ids, 56 | size_t start, 57 | size_t end, 58 | std::stringstream* ss) const; 59 | 60 | std::string dir_path_; 61 | 62 | TokenizerArgs args_; 63 | 64 | sentencepiece::SentencePieceProcessor sp_processor_; 65 | 66 | // special tokens to ids 67 | absl::flat_hash_map special_token_encoder_; 68 | 69 | // special token ids to tokens 70 | absl::flat_hash_map special_token_decoder_; 71 | 72 | // special token regex (optional) 73 | std::unique_ptr special_token_regex_; 74 | 75 | // token ids to add to the beginning of the input sequence 76 | std::vector prefix_token_ids_; 77 | }; 78 | 79 | } // namespace xllm_service 80 | -------------------------------------------------------------------------------- /xllm_service/tokenizer/tiktoken_tokenizer.h: 
-------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | 22 | #include 23 | 24 | #include "tokenizer.h" 25 | #include "tokenizer_args.h" 26 | 27 | namespace xllm_service { 28 | 29 | // a simple c++ implementation of the openai/tiktoken 30 | // https://github.com/openai/tiktoken 31 | class TiktokenTokenizer : public Tokenizer { 32 | public: 33 | TiktokenTokenizer(const std::string_view& dir_path, 34 | const TokenizerArgs& args); 35 | 36 | bool encode(const std::string_view& text, 37 | std::vector* ids) const override; 38 | 39 | std::string decode(const Slice& ids, 40 | bool skip_special_tokens) const override; 41 | 42 | std::optional token_to_id( 43 | const std::string_view& token) const override; 44 | 45 | std::string id_to_token(int32_t id) const override; 46 | 47 | size_t vocab_size() const override; 48 | 49 | std::unique_ptr clone() const override; 50 | 51 | private: 52 | void load_special_tokens(const std::vector& special_tokens); 53 | 54 | void load_vocab(const std::string& vocab_file_path); 55 | 56 | void encode_internal(const std::string_view& text, 57 | std::vector* ids) const; 58 | 59 | 
void byte_pair_encode(const std::string_view& piece, 60 | std::vector* ids) const; 61 | 62 | std::string dir_path_; 63 | 64 | TokenizerArgs args_; 65 | 66 | // token to ids 67 | absl::flat_hash_map encoder_; 68 | // id to token 69 | absl::flat_hash_map decoder_; 70 | 71 | // a regex pattern to tokenize text 72 | // N.B. RE2 doesn't support look-around assertions. 73 | // https://github.com/google/re2/wiki/Syntax 74 | std::unique_ptr regex_; 75 | 76 | // special tokens to ids 77 | absl::flat_hash_map special_token_encoder_; 78 | 79 | // special token ids to tokens 80 | absl::flat_hash_map special_token_decoder_; 81 | 82 | // special token regex (optional) 83 | std::unique_ptr special_token_regex_; 84 | 85 | // token ids to add to the beginning of the input sequence 86 | std::vector prefix_token_ids_; 87 | }; 88 | 89 | } // namespace xllm_service 90 | -------------------------------------------------------------------------------- /xllm_service/tokenizer/tokenizer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "common/slice.h" 25 | 26 | namespace xllm_service { 27 | 28 | class Tokenizer { 29 | public: 30 | virtual ~Tokenizer() = default; 31 | 32 | virtual bool encode(const std::string_view& text, 33 | std::vector* ids) const = 0; 34 | 35 | virtual std::string decode(const Slice& ids, 36 | bool skip_special_tokens) const = 0; 37 | 38 | virtual std::optional token_to_id( 39 | const std::string_view& token) const = 0; 40 | 41 | virtual std::string id_to_token(int32_t id) const = 0; 42 | 43 | virtual size_t vocab_size() const = 0; 44 | 45 | virtual std::unique_ptr clone() const = 0; 46 | }; 47 | 48 | } // namespace xllm_service 49 | -------------------------------------------------------------------------------- /xllm_service/tokenizer/tokenizer_args.cpp: -------------------------------------------------------------------------------- 1 | #include "tokenizer_args.h" 2 | 3 | #include 4 | 5 | #include "common/json_reader.h" 6 | 7 | namespace xllm_service { 8 | namespace { 9 | std::optional load_chat_template_file(const std::string& dir) { 10 | // chat_template.json 11 | const std::string chat_template_path = dir + "/chat_template.json"; 12 | JsonReader reader; 13 | if (reader.parse(chat_template_path); 14 | auto v = reader.value("chat_template")) { 15 | return v; 16 | } 17 | // chat_template.jinja 18 | const std::string raw_chat_template_path = dir + "/chat_template.jinja"; 19 | std::ifstream file(raw_chat_template_path); 20 | if (file.is_open()) { 21 | std::ostringstream content; 22 | content << file.rdbuf(); 23 | file.close(); 24 | return content.str(); 25 | } 26 | return std::nullopt; 27 | } 28 | } // namespace 29 | 30 | void load_tokenizer_args(const std::string& model_weights_path, 31 | TokenizerArgs& tokenizer_args) { 32 | // tokenizer args from tokenizer_config.json 33 | 
JsonReader tokenizer_reader; 34 | const std::string tokenizer_args_file_path = 35 | model_weights_path + "/tokenizer_config.json"; 36 | if (tokenizer_reader.parse(tokenizer_args_file_path)) { 37 | // read chat template if exists 38 | if (auto v = load_chat_template_file(model_weights_path)) { 39 | tokenizer_args.chat_template() = v.value(); 40 | } else if (auto v = tokenizer_reader.value("chat_template")) { 41 | tokenizer_args.chat_template() = v.value(); 42 | } 43 | if (auto v = tokenizer_reader.value("add_bos_token")) { 44 | tokenizer_args.add_bos_token() = v.value(); 45 | } 46 | if (auto v = tokenizer_reader.value("add_eos_token")) { 47 | tokenizer_args.add_eos_token() = v.value(); 48 | } 49 | if (auto v = tokenizer_reader.value("tokenizer_class")) { 50 | tokenizer_args.tokenizer_class() = v.value(); 51 | } 52 | // read bos_token 53 | if (auto v = tokenizer_reader.value("bos_token.content")) { 54 | tokenizer_args.bos_token() = v.value(); 55 | } else if (auto v = tokenizer_reader.value("bos_token")) { 56 | tokenizer_args.bos_token() = v.value(); 57 | } 58 | // read eos_token 59 | if (auto v = tokenizer_reader.value("eos_token.content")) { 60 | tokenizer_args.eos_token() = v.value(); 61 | } else if (auto v = tokenizer_reader.value("eos_token")) { 62 | tokenizer_args.eos_token() = v.value(); 63 | } 64 | // read pad_token 65 | if (auto v = tokenizer_reader.value("pad_token.content")) { 66 | tokenizer_args.pad_token() = v.value(); 67 | } else if (auto v = tokenizer_reader.value("pad_token")) { 68 | tokenizer_args.pad_token() = v.value(); 69 | } 70 | } 71 | } 72 | 73 | } // namespace xllm_service -------------------------------------------------------------------------------- /xllm_service/tokenizer/tokenizer_args.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 
3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm-service/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | #include 19 | #include 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include "common/macros.h" 27 | 28 | namespace xllm_service { 29 | 30 | using SpecialToken = std::pair; 31 | 32 | struct TokenizerArgs { 33 | // Type of tokenizer to use. valid values are "sentencepiece" and "tiktoken". 34 | PROPERTY(std::string, tokenizer_type) = "sentencepiece"; 35 | 36 | // Vocab file name. 37 | PROPERTY(std::string, vocab_file) = "tokenizer.model"; 38 | 39 | // Special tokens to add to the vocabulary. 40 | PROPERTY(std::vector, special_tokens); 41 | 42 | // Regex pattern used by tiktok tokenizer only. 43 | PROPERTY(std::string, pattern); 44 | 45 | // tokens to add to the beginning of the input sequence. 
46 | PROPERTY(std::vector, prefix_tokens); 47 | 48 | // chat template 49 | PROPERTY(std::string, chat_template); 50 | 51 | // add_bos_token 52 | PROPERTY(bool, add_bos_token) = false; 53 | 54 | // add_eos_token 55 | PROPERTY(bool, add_eos_token) = false; 56 | 57 | // bos_token 58 | PROPERTY(std::string, bos_token); 59 | 60 | // eos_token 61 | PROPERTY(std::string, eos_token); 62 | 63 | // pad_token 64 | PROPERTY(std::string, pad_token); 65 | 66 | // tokenizer_class 67 | PROPERTY(std::string, tokenizer_class); 68 | }; 69 | 70 | inline std::ostream& operator<<(std::ostream& os, const TokenizerArgs& args) { 71 | os << "TokenizerArgs: ["; 72 | os << "tokenizer_type: " << args.tokenizer_type(); 73 | // os << ", chat_template: " << args.chat_template(); 74 | os << ", add_bos_token: " << args.add_bos_token(); 75 | os << ", add_eos_token: " << args.add_eos_token(); 76 | os << ", bos_token: " << args.bos_token(); 77 | os << ", eos_token: " << args.eos_token(); 78 | os << ", pad_token: " << args.pad_token(); 79 | os << ", tokenizer_class: " << args.tokenizer_class(); 80 | if (!args.special_tokens().empty()) { 81 | os << ", special_tokens: ["; 82 | for (const auto& [token, id] : args.special_tokens()) { 83 | os << "(" << token << ", " << id << ") "; 84 | } 85 | os << "]"; 86 | } 87 | os << ", pattern: " << absl::CEscape(args.pattern()); 88 | if (!args.prefix_tokens().empty()) { 89 | os << ", prefix_tokens: [" << absl::StrJoin(args.prefix_tokens(), ", ") 90 | << "]"; 91 | } 92 | os << "]"; 93 | return os; 94 | } 95 | 96 | void load_tokenizer_args(const std::string& model_weights_path, 97 | TokenizerArgs& tokenizer_args); 98 | 99 | } // namespace xllm_service 100 | -------------------------------------------------------------------------------- /xllm_service/tokenizer/tokenizer_factory.cpp: -------------------------------------------------------------------------------- 1 | #include "tokenizer_factory.h" 2 | 3 | #include 4 | 5 | #include "tokenizer_args.h" 6 | 7 | namespace 
xllm_service { 8 | 9 | std::unique_ptr TokenizerFactory::create_tokenizer( 10 | const std::string& model_weights_path, 11 | TokenizerArgs* tokenizer_args) { 12 | load_tokenizer_args(model_weights_path, *tokenizer_args); 13 | 14 | const std::string tokenizer_json_path = 15 | model_weights_path + "/tokenizer.json"; 16 | if (std::filesystem::exists(tokenizer_json_path)) { 17 | // 1. fast tokenizer 18 | LOG(INFO) << "Create fast tokenizer."; 19 | return std::make_unique(tokenizer_json_path); 20 | } else if (tokenizer_args->tokenizer_type() == "tiktoken" || 21 | tokenizer_args->tokenizer_class() == "TikTokenTokenizer") { 22 | // 2. create tiktoken tokenizer 23 | LOG(INFO) << "Create Tiktoken tokenizer."; 24 | return std::make_unique(model_weights_path, 25 | *tokenizer_args); 26 | } else { 27 | // 3. create sentencepiece tokenizer 28 | LOG(INFO) << "Create SentencePiece tokenizer."; 29 | return std::make_unique(model_weights_path, 30 | *tokenizer_args); 31 | } 32 | } 33 | 34 | } // namespace xllm_service 35 | -------------------------------------------------------------------------------- /xllm_service/tokenizer/tokenizer_factory.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "fast_tokenizer.h" 4 | #include "sentencepiece_tokenizer.h" 5 | #include "tiktoken_tokenizer.h" 6 | #include "tokenizer_args.h" 7 | 8 | namespace xllm_service { 9 | 10 | class TokenizerFactory { 11 | public: 12 | static std::unique_ptr create_tokenizer( 13 | const std::string& model_weights_path, 14 | TokenizerArgs* tokenizer_args); 15 | }; 16 | 17 | } // namespace xllm_service 18 | -------------------------------------------------------------------------------- /xllm_service/tokenizer/tokenizers/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cargo_shared_library) 2 | 3 | cargo_shared_library( 4 | NAME 5 | rust_tokenizers 6 | HDRS 7 | tokenizers.h 8 | ) 9 | 10 | 
--------------------------------------------------------------------------------
/xllm_service/tokenizer/tokenizers/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "rust_tokenizers"
version = "0.21.0"
edition = "2018"

[lib]
name = "rust_tokenizers"
crate-type = ["cdylib"]

[dependencies]
tokenizers = { version = "0.21.0", default-features = false, features = ["onig"] }

--------------------------------------------------------------------------------
/xllm_service/tokenizer/tokenizers/tokenizers.h:
--------------------------------------------------------------------------------
/* Copyright 2025 The xLLM Authors. All Rights Reserved.
Copyright 2024 The ScaleLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://github.com/jd-opensource/xllm-service/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#pragma once

// The C API
#ifdef __cplusplus
extern "C" {
#endif

// The C interface to the hf-tokenizers library
// ported from https://github.com/mlc-ai/tokenizers-cpp
// NOTE(review): the two include targets were stripped from this dump;
// size_t/uint32_t/int32_t below require stddef.h and stdint.h.
#include <stddef.h>
#include <stdint.h>

// Opaque handle to a Rust-side tokenizer instance.
typedef void* TokenizerHandle;

typedef struct {
  int* token_ids;
  size_t len;
} TokenizerEncodeResult;

// Create a tokenizer from a tokenizer.json file; returns NULL on failure.
TokenizerHandle tokenizers_new_from_path(const char* path);

// Encode data[0..len) into *result.
// NOTE(review): result-buffer ownership is not stated here; presumably owned
// by `handle` as in upstream tokenizers-cpp -- confirm in src/lib.rs.
void tokenizers_encode(TokenizerHandle handle,
                       const char* data,
                       size_t len,
                       int add_special_token,
                       TokenizerEncodeResult* result);

// Decode data[0..len) token ids into *decode_data / *decode_len.
void tokenizers_decode(TokenizerHandle handle,
                       const uint32_t* data,
                       size_t len,
                       int skip_special_tokens,
                       const char** decode_data,
                       size_t* decode_len);

void tokenizers_id_to_token(TokenizerHandle handle,
                            uint32_t id,
                            const char** data,
                            size_t* len);

// tokenizers_token_to_id stores -1 to *id if the token is not in the vocab
void tokenizers_token_to_id(TokenizerHandle handle,
                            const char* token,
                            size_t len,
                            int32_t* id);

// Destroy the tokenizer and everything it owns.
void tokenizers_free(TokenizerHandle handle);

void tokenizers_get_vocab_size(TokenizerHandle handle, size_t* size);

#ifdef __cplusplus
}
#endif

--------------------------------------------------------------------------------