├── examples ├── __init__.py ├── generate_embedding.py └── generate.py ├── version.txt ├── xllm ├── pybind │ ├── __init__.py │ ├── CMakeLists.txt │ └── util.py ├── core │ ├── kernels │ │ ├── npu │ │ │ ├── CMakeLists.txt │ │ │ ├── xllm_ops │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── replace_token.h │ │ │ │ ├── top_k_top_p.h │ │ │ │ ├── beam_search.h │ │ │ │ └── acltensor_utils.h │ │ │ ├── matmul.cpp │ │ │ └── active.cpp │ │ ├── mlu │ │ │ ├── CMakeLists.txt │ │ │ ├── gather_split.cpp │ │ │ └── random_sample.cpp │ │ ├── cuda │ │ │ ├── CMakeLists.txt │ │ │ └── matmul.cpp │ │ ├── CMakeLists.txt │ │ └── ilu │ │ │ ├── CMakeLists.txt │ │ │ ├── matmul.cpp │ │ │ ├── activation.cpp │ │ │ └── rope.cpp │ ├── framework │ │ ├── tokenizer │ │ │ ├── tokenizers │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── Cargo.toml │ │ │ ├── CMakeLists.txt │ │ │ └── tokenizer_factory.h │ │ ├── state_dict │ │ │ ├── safetensors │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── Cargo.toml │ │ │ └── CMakeLists.txt │ │ ├── prefix_cache │ │ │ ├── prefix_cache_factory.h │ │ │ ├── prefix_cache_factory.cpp │ │ │ ├── prefix_cache_with_upload.h │ │ │ └── CMakeLists.txt │ │ ├── chat_template │ │ │ └── CMakeLists.txt │ │ ├── dit_cache │ │ │ ├── CMakeLists.txt │ │ │ ├── dit_non_cache.h │ │ │ └── dit_cache.cpp │ │ ├── eplb │ │ │ ├── CMakeLists.txt │ │ │ └── eplb_policy_test.cpp │ │ ├── xtensor │ │ │ ├── CMakeLists.txt │ │ │ ├── phy_page.cpp │ │ │ ├── phy_page.h │ │ │ └── options.h │ │ ├── batch │ │ │ ├── CMakeLists.txt │ │ │ └── mposition.h │ │ ├── request │ │ │ ├── dit_request_params.h │ │ │ ├── finish_reason.cpp │ │ │ ├── CMakeLists.txt │ │ │ └── finish_reason.h │ │ ├── kv_cache │ │ │ ├── CMakeLists.txt │ │ │ └── kv_cache_event.h │ │ ├── parallel_state │ │ │ └── CMakeLists.txt │ │ ├── CMakeLists.txt │ │ ├── block │ │ │ └── CMakeLists.txt │ │ ├── sampling │ │ │ └── CMakeLists.txt │ │ └── model │ │ │ └── CMakeLists.txt │ ├── layers │ │ ├── ilu │ │ │ └── CMakeLists.txt │ │ ├── cuda │ │ │ └── CMakeLists.txt │ │ ├── mlu │ │ │ └── CMakeLists.txt │ │ ├── common │ │ │ ├── tests │ │ │ │ └── CMakeLists.txt │ │ │ ├── layer_utils.h │ │ │ ├── activation.h │ │ │ ├── CMakeLists.txt │ │ │ ├── activation.cpp │ │ │ └── layer_utils.cpp │ │ ├── npu │ │ │ ├── loader │ │ │ │ ├── lm_head_loader.h │ │ │ │ ├── column_parallel_linear_loader.h │ │ │ │ ├── word_embedding_loader.h │ │ │ │ ├── rms_norm_loader.h │ │ │ │ ├── llama_decoder_loader.h │ │ │ │ ├── siglip_encoder_loader.h │ │ │ │ ├── qwen3_decoder_loader.h │ │ │ │ └── qwen3_decoder_manual_loader.h │ │ │ └── buffer │ │ │ │ ├── atb_buffer.h │ │ │ │ └── atb_workspace.h │ │ ├── lm_head.h │ │ ├── glm4_decoder_layer.h │ │ ├── word_embedding.h │ │ ├── pos_embedding.h │ │ ├── llama_decoder_layer.h │ │ ├── qwen2_decoder_layer.h │ │ ├── qwen3_decoder_layer.h │ │ ├── glm4_vision_encode_layer.h │ │ ├── qwen2_vision_encode_layer.h │ │ ├── qwen3_vision_encode_layer.h │ │ ├── qwen3_moe_decoder_layer.h │ │ ├── qwen2dot5_vision_encode_layer.h │ │ ├── siglip_encoder_layer.h │ │ └── deepseek_v2_decoder_layer.h │ ├── CMakeLists.txt │ ├── platform │ │ ├── npu │ │ │ ├── CMakeLists.txt │ │ │ └── npu_layer_synchronizer.h │ │ └── CMakeLists.txt │ ├── scheduler │ │ ├── profile │ │ │ └── CMakeLists.txt │ │ ├── scheduler_factory.h │ │ └── CMakeLists.txt │ ├── distributed_runtime │ │ ├── spawn_worker_server │ │ │ └── CMakeLists.txt │ │ └── pd_ooc_service_impl.h │ ├── common │ │ ├── rate_limiter_test.cpp │ │ ├── rate_limiter.h │ │ ├── CMakeLists.txt │ │ ├── interruption_bus.h │ │ └── instance_name.h │ ├── util │ │ ├── pretty_print.h │ │ ├── uuid.h │ │ ├── 
timer.h │ │ ├── uuid.cpp │ │ ├── net.h │ │ ├── type_traits.h │ │ ├── device_name_utils.h │ │ ├── pretty_print.cpp │ │ └── timer.cpp │ └── runtime │ │ └── dit_executor.cpp ├── models │ ├── CMakeLists.txt │ └── llm │ │ └── npu │ │ └── llama3.h ├── cc_api │ ├── examples │ │ ├── start-llm-instance.sh │ │ └── service_request.h │ ├── macros.h │ └── README.md ├── server │ ├── CMakeLists.txt │ └── xllm_server_registry.h ├── parser │ ├── CMakeLists.txt │ ├── reasoning_parser.h │ └── reasoning_parser.cpp ├── launch_xllm.py ├── proto │ ├── CMakeLists.txt │ ├── tensor.proto │ ├── rerank.proto │ ├── models.proto │ └── xtensor_manager.proto ├── function_call │ ├── partial_json_parser │ │ ├── CMakeLists.txt │ │ └── include │ │ │ └── partial_json_parser │ │ │ └── options.h │ └── CMakeLists.txt ├── processors │ ├── CMakeLists.txt │ ├── input_processor.h │ └── pywarpper_image_processor.h ├── api_service │ ├── CMakeLists.txt │ ├── call.h │ ├── qwen3_rerank_service_impl.h │ ├── models_service_impl.h │ └── api_service_impl.h └── __init__.py ├── .style.yapf ├── third_party └── .clang-format ├── cmake ├── CMakeTestRustCompiler.cmake ├── CMakeRustCompiler.cmake.in └── CMakeDetermineRustCompiler.cmake ├── docs ├── assets │ ├── logo.png │ ├── xllm_arch.png │ ├── logo_with_llm.png │ ├── moe_eplevel1.jpg │ ├── moe_eplevel2.jpg │ ├── service_arch.png │ ├── wechat_qrcode.jpg │ ├── pd_architecture.jpg │ ├── eplb_architecture.png │ ├── groupmatmul_performance.png │ ├── globalkvcache_architecture.png │ ├── multi_streams_architecture.jpg │ └── async_schedule_architecture.jpg ├── zh │ ├── xLLM_Technical_Report_zh.pdf │ ├── features │ │ ├── basics.md │ │ ├── continuous_scheduler.md │ │ ├── zero_evict_scheduler.md │ │ ├── multimodal.md │ │ ├── chunked_scheduler.md │ │ ├── prefix_cache.md │ │ ├── topk_topP.md │ │ ├── multi_streams.md │ │ ├── ppmatmul.md │ │ ├── moe_params.md │ │ ├── acl_graph.md │ │ ├── xtensor_memory.md │ │ ├── eplb.md │ │ ├── async_schedule.md │ │ ├── global_kvcache.md │ │ ├── groupgemm.md │ │ └── xllm_service_overview.md │ ├── .readthedocs.yaml │ └── index.md ├── mkdocs │ ├── javascripts │ │ └── mathjax.js │ ├── stylesheets │ │ └── extra.css │ └── overrides │ │ └── .icons │ │ ├── email-fill.svg │ │ └── gitcodeai.svg ├── en │ ├── features │ │ ├── continuous_scheduler.md │ │ ├── zero_evict_scheduler.md │ │ ├── chunked_scheduler.md │ │ ├── prefix_cache.md │ │ ├── topk_topp.md │ │ └── ppmatmul.md │ └── .readthedocs.yaml └── requirements.txt ├── .clang-format ├── .pre-commit-config.yaml ├── cibuild ├── install │ ├── install_ninja.sh │ ├── install_user.sh │ ├── install_ccache.sh │ ├── install_cmake.sh │ ├── install_gcc.sh │ ├── install_python.sh │ └── install_base.sh ├── build_mlu.sh └── build_npu.sh ├── MANIFEST.in ├── .github └── ISSUE_TEMPLATE │ ├── question.yaml │ ├── bug-report.yaml │ └── feature-request.yml ├── .gitignore ├── tools └── README.md ├── .gitmodules └── CONTRIBUTING_zh.md /examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /version.txt: -------------------------------------------------------------------------------- 1 | 0.7.0 2 | -------------------------------------------------------------------------------- /xllm/pybind/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.style.yapf: 
-------------------------------------------------------------------------------- 1 | [style] 2 | based_on_style = google 3 | -------------------------------------------------------------------------------- /third_party/.clang-format: -------------------------------------------------------------------------------- 1 | DisableFormat: true 2 | SortIncludes: Never -------------------------------------------------------------------------------- /cmake/CMakeTestRustCompiler.cmake: -------------------------------------------------------------------------------- 1 | set(CMAKE_Rust_COMPILER_WORKS 1 CACHE INTERNAL "") 2 | -------------------------------------------------------------------------------- /xllm/core/kernels/npu/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | add_subdirectory(xllm_ops) -------------------------------------------------------------------------------- /docs/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/assets/logo.png -------------------------------------------------------------------------------- /docs/assets/xllm_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/assets/xllm_arch.png -------------------------------------------------------------------------------- /docs/assets/logo_with_llm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/assets/logo_with_llm.png -------------------------------------------------------------------------------- /docs/assets/moe_eplevel1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/assets/moe_eplevel1.jpg -------------------------------------------------------------------------------- /docs/assets/moe_eplevel2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/assets/moe_eplevel2.jpg -------------------------------------------------------------------------------- /docs/assets/service_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/assets/service_arch.png -------------------------------------------------------------------------------- /docs/assets/wechat_qrcode.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/assets/wechat_qrcode.jpg -------------------------------------------------------------------------------- /docs/assets/pd_architecture.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/assets/pd_architecture.jpg -------------------------------------------------------------------------------- /docs/assets/eplb_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/assets/eplb_architecture.png -------------------------------------------------------------------------------- /docs/zh/xLLM_Technical_Report_zh.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/zh/xLLM_Technical_Report_zh.pdf -------------------------------------------------------------------------------- /docs/assets/groupmatmul_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/assets/groupmatmul_performance.png -------------------------------------------------------------------------------- /docs/assets/globalkvcache_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/assets/globalkvcache_architecture.png -------------------------------------------------------------------------------- /docs/assets/multi_streams_architecture.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/assets/multi_streams_architecture.jpg -------------------------------------------------------------------------------- /docs/assets/async_schedule_architecture.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/assets/async_schedule_architecture.jpg -------------------------------------------------------------------------------- /docs/zh/features/basics.md: -------------------------------------------------------------------------------- 1 | # 基础知识 2 | 3 | - xLLM使用一卡一进程模式,多卡之间使用rpc进行函数调用,模型计算过程中的数据通信使用device集合通信库。 4 | 5 | - HCCL/LCCL是高性能集合通信,提供单机多卡以及多机多卡间的数据并行、模型并行集合通信方案。 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /xllm/core/framework/tokenizer/tokenizers/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cargo_library) 2 | 3 | cargo_library( 4 | NAME 5 | rust_tokenizers 6 | HDRS 7 | tokenizers.h 8 | ) 9 | 10 | -------------------------------------------------------------------------------- /xllm/core/framework/state_dict/safetensors/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cargo_library) 2 | 3 | cargo_library( 4 | NAME 5 | rust_safetensors 6 | HDRS 7 | safetensors.h 8 | ) 9 | 10 | -------------------------------------------------------------------------------- /xllm/core/layers/ilu/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | cc_library( 4 | NAME 5 | ilu_layers 6 | HDRS 7 | attention.h 8 | SRCS 9 | attention.cpp 10 | DEPS 11 | :common_layers 12 | ) 13 | -------------------------------------------------------------------------------- /xllm/models/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | # Define the library 4 | cc_library( 5 | NAME 6 | models 7 | HDRS 8 | model_registry.h 9 | models.h 10 | SRCS 11 | model_registry.cpp 12 | DEPS 13 | :model 14 | ) 15 | -------------------------------------------------------------------------------- /xllm/core/layers/cuda/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | cc_library( 4 | NAME 5 | cuda_layers 6 | HDRS 7 | attention.h 8 | flashinfer_workspace.h 9 | SRCS 10 | attention.cpp 11 | 
flashinfer_workspace.cpp 12 | DEPS 13 | :common_layers 14 | ) 15 | -------------------------------------------------------------------------------- /xllm/core/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(common) 2 | add_subdirectory(distributed_runtime) 3 | add_subdirectory(framework) 4 | add_subdirectory(kernels) 5 | add_subdirectory(layers) 6 | add_subdirectory(platform) 7 | add_subdirectory(runtime) 8 | add_subdirectory(scheduler) 9 | add_subdirectory(util) -------------------------------------------------------------------------------- /xllm/core/platform/npu/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | cc_library( 4 | NAME 5 | platform_npu 6 | HDRS 7 | npu_layer_synchronizer.h 8 | SRCS 9 | npu_layer_synchronizer.cpp 10 | DEPS 11 | torch_npu 12 | glog::glog 13 | torch 14 | ascendcl 15 | ) -------------------------------------------------------------------------------- /docs/zh/features/continuous_scheduler.md: -------------------------------------------------------------------------------- 1 | # continuous调度器 2 | 3 | ## 功能介绍 4 | xLLM实现了支持continuous batching的调度策略。continuous batching是一种动态批处理策略,它不等待批次填满,而是在有请求时就开始处理,同时持续接收新请求并将其加入正在执行的批次中,从而在保持高吞吐量的同时显著降低延迟。 5 | 6 | ## 使用方式 7 | continuous batching调度策略已在xLLM中实现,如果不开启其它调度策略,则默认使用continuous batching。 8 | 9 | -------------------------------------------------------------------------------- /xllm/core/framework/state_dict/safetensors/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rust_safetensors" 3 | version = "0.6.0" 4 | edition = "2021" 5 | 6 | [lib] 7 | name = "rust_safetensors" 8 | crate-type = ["staticlib"] 9 | 10 | [dependencies] 11 | thiserror = "1.0" 12 | safetensors = "0.6.0" 13 | 14 | -------------------------------------------------------------------------------- /xllm/core/framework/tokenizer/tokenizers/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rust_tokenizers" 3 | version = "0.21.0" 4 | edition = "2018" 5 | 6 | [lib] 7 | name = "rust_tokenizers" 8 | crate-type = ["staticlib"] 9 | 10 | [dependencies] 11 | tokenizers = { version = "0.21.0", default-features = false, features = ["onig"] } 12 | -------------------------------------------------------------------------------- /xllm/core/framework/prefix_cache/prefix_cache_factory.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include <memory> 3 | 4 | #include "prefix_cache.h" 5 | 6 | namespace xllm { 7 | 8 | std::unique_ptr<PrefixCache> create_prefix_cache( 9 | const int32_t block_size, 10 | const bool& enable_cache_upload = false); 11 | 12 | } // namespace xllm 13 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | Language: Cpp 2 | BasedOnStyle: Google 3 | UseTab: Never 4 | IndentWidth: 2 5 | ColumnLimit: 80 6 | 7 | BinPackParameters: false 8 | BinPackArguments: false 9 | ExperimentalAutoDetectBinPacking: false 10 | AllowAllParametersOfDeclarationOnNextLine: false 11 | DerivePointerAlignment: false 12 | PointerAlignment: Left 13 | ... 
14 | -------------------------------------------------------------------------------- /xllm/cc_api/examples/start-llm-instance.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | clear 4 | 5 | # export ASDOPS_LOG_LEVEL=DEBUG 6 | # export ASDOPS_LOG_TO_STDOUT=1 7 | export ASCEND_RT_VISIBLE_DEVICES=12 8 | python3 -c "import torch; import torch_npu; torch_npu.npu.set_device('npu:0')" 9 | 10 | # build/single_llm_instance 11 | build/multiple_llm_instances 12 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # pre-commit install 2 | # pre-commit run --all-files 3 | 4 | repos: 5 | - repo: https://github.com/pre-commit/mirrors-clang-format 6 | rev: v20.1.6 7 | hooks: 8 | - id: clang-format 9 | types_or: [c++, c, cuda] 10 | exclude: ^(cibuild/|tools/|third_party/|cmake/|build/|.*\.ptx\.h$) 11 | 12 | -------------------------------------------------------------------------------- /cibuild/install/install_ninja.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | [ -n "$NINJA_VERSION" ] 6 | 7 | url="https://github.com/ninja-build/ninja/releases/download/v${NINJA_VERSION}/ninja-linux.zip" 8 | 9 | pushd /tmp 10 | wget --no-verbose --output-document=ninja-linux.zip "$url" 11 | unzip ninja-linux.zip -d /usr/local/bin 12 | rm -f ninja-linux.zip 13 | popd -------------------------------------------------------------------------------- /xllm/server/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | cc_library( 4 | NAME 5 | xllm_server 6 | HDRS 7 | xllm_server.h 8 | xllm_server_registry.h 9 | SRCS 10 | xllm_server.cpp 11 | xllm_server_registry.cpp 12 | DEPS 13 | :api_service 14 | :request 15 | absl::strings 16 | glog::glog 17 | proto::xllm_proto 18 | ) 19 | -------------------------------------------------------------------------------- /xllm/core/layers/mlu/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | cc_library( 4 | NAME 5 | mlu_layers 6 | HDRS 7 | attention.h 8 | deepseek_v2_attention.h 9 | deepseek_v2_decoder_layer_impl.h 10 | SRCS 11 | attention.cpp 12 | deepseek_v2_attention.cpp 13 | deepseek_v2_decoder_layer_impl.cpp 14 | DEPS 15 | :common_layers 16 | ) 17 | -------------------------------------------------------------------------------- /cibuild/install/install_user.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | # mirror jenkins user in container 6 | echo "jenkins:x:1000:1000::/var/lib/jenkins:" >> /etc/passwd 7 | echo "jenkins:x:1000:" >> /etc/group 8 | # needed on focal or newer 9 | echo "jenkins:*:19110:0:99999:7:::" >>/etc/shadow 10 | 11 | # allow sudo 12 | echo 'jenkins ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/jenkins -------------------------------------------------------------------------------- /xllm/core/scheduler/profile/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | include(cc_library) 3 | include(cc_test) 4 | 5 | cc_library( 6 | NAME 7 | profile 8 | HDRS 9 | profile_manager.h 10 | time_predictor.h 11 | SRCS 12 | profile_manager.cpp 13 | time_predictor.cpp 14 | DEPS 15 | :batch 16 | :request 17 | :runtime 18 | glog::glog 19 | 
absl::time 20 | ) 21 | -------------------------------------------------------------------------------- /xllm/parser/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | # Define the library 5 | cc_library( 6 | NAME 7 | reasoning 8 | HDRS 9 | reasoning_detector.h 10 | reasoning_parser.h 11 | detector_registry.h 12 | SRCS 13 | reasoning_detector.cpp 14 | reasoning_parser.cpp 15 | detector_registry.cpp 16 | DEPS 17 | absl::strings 18 | glog::glog 19 | ) -------------------------------------------------------------------------------- /cibuild/install/install_ccache.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | [ -n "$CCACHE_VERSION" ] 6 | 7 | ARCH=$(uname -m) 8 | url=https://github.com/ccache/ccache/releases/download/v${CCACHE_VERSION}/ccache-${CCACHE_VERSION}-linux-${ARCH}.tar.xz 9 | 10 | pushd /tmp 11 | curl -L "$url" | xz -d | tar -x 12 | cp ./ccache-${CCACHE_VERSION}-linux-${ARCH}/ccache /usr/bin/ccache 13 | popd 14 | 15 | # set max cache size to 25GiB 16 | /usr/bin/ccache -M 25Gi -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include MANIFEST.in 2 | include CMakeLists.txt 3 | include LICENSE 4 | include .gitmodules 5 | recursive-include src *.* 6 | recursive-include xllm *.py 7 | recursive-include examples *.py 8 | recursive-include third_party * 9 | recursive-include docs *.* 10 | recursive-include tools *.* 11 | recursive-include scripts *.* 12 | recursive-include proto *.* 13 | prune */__pycache__ 14 | global-exclude *.o *.so *.dylib *.a .git *.pyc *.swp 15 | -------------------------------------------------------------------------------- /xllm/core/kernels/mlu/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | file(GLOB_RECURSE MLU_HEADER_FILES 4 | "${CMAKE_CURRENT_LIST_DIR}/*.h" 5 | ) 6 | 7 | file(GLOB_RECURSE MLU_SOURCE_FILES 8 | "${CMAKE_CURRENT_LIST_DIR}/*.cpp" 9 | ) 10 | 11 | cc_library( 12 | NAME 13 | mlu_kernels 14 | HDRS 15 | ${MLU_HEADER_FILES} 16 | SRCS 17 | ${MLU_SOURCE_FILES} 18 | DEPS 19 | torch 20 | cnclep 21 | torch_mlu_ops 22 | python3.10 23 | ) 24 | -------------------------------------------------------------------------------- /docs/zh/features/zero_evict_scheduler.md: -------------------------------------------------------------------------------- 1 | # zero_evict调度器 2 | 3 | ## 功能介绍 4 | xLLM支持zero_evict调度策略。zero_evict调度策略是一种尽可能减少请求淘汰率的调度算法,可以减少淘汰请求的prefill计算,减少TPOT。 5 | 这种调度算法通过模拟轮次,检测请求是否可以被调度且不导致其它请求被淘汰。 6 | 7 | ## 使用方式 8 | 上述策略已在xLLM实现,并向外暴露gflag参数,控制功能的开关。 9 | 10 | - 开启zero_evict策略,并设置max_decode_token_per_sequence。 11 | ``` 12 | --use_zero_evict=true 13 | --max_decode_token_per_sequence=256 14 | ``` 15 | 16 | ## 性能效果 17 | 开启zero_evict之后,在Qwen3-8B模型上,限制E2E时延,TPOT时延 **下降27%**。 -------------------------------------------------------------------------------- /xllm/launch_xllm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import subprocess 4 | import sys 5 | import xllm 6 | 7 | 8 | def launch_xllm(): 9 | system = platform.system() 10 | binary_name = { 11 | "Linux": "xllm", 12 | # "Windows" 13 | # "Darwin" 14 | }.get(system, "xllm") 15 | 16 | bin_path = os.path.join(os.path.dirname(xllm.__file__), binary_name) 17 | 18 | 
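    # Run the bundled native binary, forwarding all CLI arguments unchanged
    # and surfacing its exit code to the caller.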
result = subprocess.run([str(bin_path)] + sys.argv[1:]) 19 | return result.returncode 20 | -------------------------------------------------------------------------------- /xllm/proto/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(proto_library) 2 | 3 | proto_library( 4 | NAME 5 | xllm_proto 6 | SRCS 7 | tensor.proto 8 | common.proto 9 | rec.proto 10 | completion.proto 11 | chat.proto 12 | multimodal.proto 13 | embedding.proto 14 | rerank.proto 15 | models.proto 16 | worker.proto 17 | disagg_pd.proto 18 | xllm_service.proto 19 | xservice.proto 20 | image_generation.proto 21 | xtensor_manager.proto 22 | ) 23 | -------------------------------------------------------------------------------- /xllm/core/framework/state_dict/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_test) 2 | include(cc_library) 3 | 4 | include_directories(..) 5 | 6 | add_subdirectory(safetensors) 7 | 8 | cc_library( 9 | NAME 10 | state_dict 11 | HDRS 12 | state_dict.h 13 | utils.h 14 | rec_vocab_dict.h 15 | SRCS 16 | state_dict.cpp 17 | utils.cpp 18 | rec_vocab_dict.cpp 19 | DEPS 20 | rust_safetensors 21 | torch 22 | glog::glog 23 | Folly::folly 24 | util 25 | ) 26 | 27 | -------------------------------------------------------------------------------- /xllm/core/kernels/npu/xllm_ops/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | cc_library( 4 | NAME 5 | xllm_ops 6 | HDRS 7 | replace_token.h 8 | top_k_top_p.h 9 | acltensor_utils.h 10 | beam_search.h 11 | SRCS 12 | replace_token.cpp 13 | top_k_top_p.cpp 14 | acltensor_utils.cpp 15 | beam_search.cpp 16 | DEPS 17 | atb 18 | torch_npu 19 | gflags::gflags 20 | nlohmann_json::nlohmann_json 21 | opapi 22 | spdlog::spdlog 23 | ) 24 | -------------------------------------------------------------------------------- /docs/mkdocs/javascripts/mathjax.js: -------------------------------------------------------------------------------- 1 | window.MathJax = { 2 | tex: { 3 | inlineMath: [["\\(", "\\)"]], 4 | displayMath: [["\\[", "\\]"]], 5 | processEscapes: true, 6 | processEnvironments: true 7 | }, 8 | options: { 9 | ignoreHtmlClass: ".*|", 10 | processHtmlClass: "arithmatex" 11 | } 12 | }; 13 | 14 | document$.subscribe(() => { 15 | MathJax.startup.output.clearCache() 16 | MathJax.typesetClear() 17 | MathJax.texReset() 18 | MathJax.typesetPromise() 19 | }) -------------------------------------------------------------------------------- /xllm/function_call/partial_json_parser/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | cc_library( 5 | NAME 6 | partial_json_parser 7 | SRCS 8 | src/parser.cpp 9 | INCLUDES 10 | include 11 | DEPS 12 | nlohmann_json::nlohmann_json 13 | ) 14 | 15 | cc_test( 16 | NAME 17 | partial_json_parser_test 18 | SRCS 19 | test/test_examples.cpp 20 | test/test_property_based.cpp 21 | DEPS 22 | :partial_json_parser 23 | GTest::gtest 24 | GTest::gtest_main 25 | ) -------------------------------------------------------------------------------- /xllm/core/framework/chat_template/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | cc_library ( 5 | NAME 6 | chat_template 7 | HDRS 8 | jinja_chat_template.h 9 | SRCS 10 | jinja_chat_template.cpp 11 | DEPS 12 | :minja 13 | 
:tokenizer 14 | nlohmann_json::nlohmann_json 15 | glog::glog 16 | ) 17 | 18 | cc_test ( 19 | NAME 20 | chat_template_test 21 | SRCS 22 | jinja_chat_template_test.cpp 23 | DEPS 24 | :chat_template 25 | GTest::gtest_main 26 | ) 27 | 28 | -------------------------------------------------------------------------------- /cibuild/install/install_cmake.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | [ -n "$CMAKE_VERSION" ] 6 | 7 | # Remove existing CMake installation 8 | rm -f /usr/local/bin/cmake 9 | 10 | path="v${CMAKE_VERSION}" 11 | file="cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz" 12 | 13 | # Download and install specific CMake version in /usr/local 14 | pushd /tmp 15 | wget -q "https://github.com/Kitware/CMake/releases/download/${path}/${file}" 16 | tar -C /usr/local --strip-components 1 --no-same-owner -zxf ${file} 17 | rm -f cmake-*.tar.gz 18 | popd -------------------------------------------------------------------------------- /xllm/core/framework/dit_cache/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | cc_library( 5 | NAME 6 | dit_cache 7 | HDRS 8 | dit_cache_type.h 9 | dit_cache_config.h 10 | dit_cache_impl.h 11 | dit_cache.h 12 | dit_non_cache.h 13 | fbcache.h 14 | fbcache_taylorseer.h 15 | taylorseer.h 16 | SRCS 17 | dit_cache_impl.cpp 18 | dit_cache.cpp 19 | dit_non_cache.cpp 20 | fbcache.cpp 21 | fbcache_taylorseer.cpp 22 | taylorseer.cpp 23 | DEPS 24 | torch 25 | glog::glog 26 | Folly::folly 27 | ) -------------------------------------------------------------------------------- /cmake/CMakeRustCompiler.cmake.in: -------------------------------------------------------------------------------- 1 | 2 | # ported from https://github.com/Devolutions/CMakeRust 3 | set(CMAKE_Rust_COMPILER "@CMAKE_Rust_COMPILER@") 4 | set(CMAKE_Rust_COMPILER_ID "@CMAKE_Rust_COMPILER_ID@") 5 | set(CMAKE_Rust_COMPILER_VERSION "@CMAKE_Rust_COMPILER_VERSION@") 6 | set(CMAKE_Rust_COMPILER_LOADED @CMAKE_Rust_COMPILER_LOADED@) 7 | set(CMAKE_Rust_PLATFORM_ID "@CMAKE_Rust_PLATFORM_ID@") 8 | 9 | SET(CMAKE_Rust_SOURCE_FILE_EXTENSIONS rs) 10 | SET(CMAKE_Rust_LINKER_PREFERENCE 40) 11 | set(CMAKE_Rust_COMPILER_ENV_VAR "RUSTC") 12 | 13 | -------------------------------------------------------------------------------- /xllm/core/kernels/cuda/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | file(GLOB_RECURSE CUDA_HEADER_FILES 4 | "${CMAKE_CURRENT_LIST_DIR}/*.h" 5 | "${CMAKE_CURRENT_LIST_DIR}/*.cuh" 6 | ) 7 | 8 | file(GLOB_RECURSE CUDA_SOURCE_FILES 9 | "${CMAKE_CURRENT_LIST_DIR}/*.cpp" 10 | "${CMAKE_CURRENT_LIST_DIR}/*.cu" 11 | ) 12 | 13 | cc_library( 14 | NAME 15 | cuda_kernels 16 | HDRS 17 | ${CUDA_HEADER_FILES} 18 | SRCS 19 | ${CUDA_SOURCE_FILES} 20 | DEPS 21 | tvm_ffi 22 | torch 23 | :util 24 | :platform 25 | ) 26 | -------------------------------------------------------------------------------- /docs/zh/features/multimodal.md: -------------------------------------------------------------------------------- 1 | # 多模态支持 2 | 本文档主要介绍xLLM推理引擎中多模态的支持进展,包括支持模型及模态类型,以及离在线接口等。 3 | 4 | ## 支持模型 5 | - Qwen2.5-VL: 包括7B/32B/72B。 6 | - Qwen3-VL: 包括2B/4B/8B/32B。 7 | - Qwen3-VL-MoE: 包括A3B/A22B。 8 | - MiniCPM-V-2_6: 7B。 9 | 10 | ## 模态类型 11 | - 图片: 支持单图、多图的输入,以及图片+Prompt组合、纯文本Prompt等输入方式。 12 | 13 | 14 | !!! 
warning "注意事项" 15 | - 目前多模态后端不支持prefix cache以及chunk prefill,正在支持中。 16 | - 目前,xLLM统一基于JinJa渲染ChatTemplate,部署MiniCPM-V-2_6,模型目录需提供ChatTemplate文件。 17 | - 图片支持Base64输入以及图片Url。 18 | - 目前多模态模型主要支持了图片模态,视频、音频等模态正在推进中。 19 | 20 | -------------------------------------------------------------------------------- /xllm/core/distributed_runtime/spawn_worker_server/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_binary) 2 | 3 | cc_binary( 4 | NAME 5 | spawn_worker 6 | HDRS 7 | spawn_worker_server.h 8 | SRCS 9 | spawn_worker_server.cpp 10 | spawn_worker_server_process.cpp 11 | DEPS 12 | :models 13 | :model 14 | :distributed_runtime 15 | absl::strings 16 | xllm_kernels 17 | ascendcl 18 | nnopbase 19 | atb 20 | atb_customize 21 | c_sec 22 | spdlog::spdlog 23 | ) 24 | 25 | add_dependencies(export_module spawn_worker) 26 | -------------------------------------------------------------------------------- /docs/zh/features/chunked_scheduler.md: -------------------------------------------------------------------------------- 1 | # chunked调度器 2 | 3 | ## 功能介绍 4 | xLLM支持chunked prefill调度策略。Chunked prefill是一种优化大语言模型推理的技术,将长prompt分割成多个较小的chunk进行分批处理,而不是一次性处理整个prompt。 5 | 这种方法可以有效降低显存峰值使用量,提高Device利用率,并且能够更好地与decode阶段的请求进行调度和混合处理。 6 | 7 | ## 使用方式 8 | 上述策略已在xLLM实现,并向外暴露gflag参数,控制功能的开关。 9 | 10 | - 开启chunked prefill,并设置chunked_size,如果不手动设置chunked size,则默认等于max_tokens_per_batch。 11 | ```bash 12 | --enable_chunked_prefill=true 13 | --max_tokens_per_chunk_for_prefill=20480 # optional 14 | ``` 15 | 16 | ## 性能效果 17 | 开启chunked_prefill之后,在Qwen3-8B模型上,限制TPOT 50ms,TTFT时延 **下降46%**。 18 | -------------------------------------------------------------------------------- /xllm/core/framework/prefix_cache/prefix_cache_factory.cpp: -------------------------------------------------------------------------------- 1 | #include "prefix_cache_factory.h" 2 | 3 | #include 4 | #include 5 | 6 | #include "prefix_cache_with_upload.h" 7 | 8 | namespace xllm { 9 | 10 | std::unique_ptr create_prefix_cache( 11 | int32_t block_size, 12 | const bool& enable_cache_upload) { 13 | if (enable_cache_upload) { 14 | return std::make_unique(block_size); 15 | } 16 | return std::make_unique(block_size); 17 | } 18 | 19 | } // namespace xllm 20 | -------------------------------------------------------------------------------- /xllm/core/kernels/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | if(USE_NPU) 4 | add_subdirectory(npu) 5 | endif() 6 | 7 | if(USE_MLU) 8 | add_subdirectory(mlu) 9 | endif() 10 | 11 | if(USE_CUDA) 12 | add_subdirectory(cuda) 13 | endif() 14 | 15 | if(USE_ILU) 16 | add_subdirectory(ilu) 17 | endif() 18 | 19 | cc_library( 20 | NAME 21 | kernels 22 | HDRS 23 | param.h 24 | ops_api.h 25 | SRCS 26 | ops_api.cpp 27 | DEPS 28 | torch 29 | $<$:torch_npu_kernels> 30 | $<$:mlu_kernels> 31 | $<$:cuda_kernels> 32 | $<$:ilu_kernels> 33 | ) -------------------------------------------------------------------------------- /docs/zh/features/prefix_cache.md: -------------------------------------------------------------------------------- 1 | # prefix cache 优化 2 | 3 | ## 功能介绍 4 | xLLM支持prefix_cache匹配。prefix_cache基于mermer_hash,使用lru淘汰策略,提供更极致的匹配效率,同时提高prefix_cache命中率。 5 | 同时对prefix_cache进行了优化,支持continuous_scheduler、chunked_scheduler和zero_evict_scheduler,在prefill之后即更新 6 | prefix_cache,提高匹配时效性,同时对于chunked_scheduler,支持多阶段chunked_prefill匹配,减少计算量并尽可能减少kv_cache占用。 7 | 8 | ## 使用方式 9 | prefix_cache已在xLLM实现,并向外暴露gflag参数,控制功能的开关。 
10 | 11 | - 开启prefix_cache功能。 12 | ``` 13 | --enable_prefix_cache=true 14 | ``` 15 | 16 | ## 性能效果 17 | 开启prefix_cache之后,在Qwen3-8B模型上,限制TPOT 50ms,E2E时延 **下降10%**。 18 | 19 | !!! warning "注意" 20 | 暂不支持PD分离调度器 -------------------------------------------------------------------------------- /xllm/core/platform/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | cc_library( 4 | NAME 5 | platform 6 | HDRS 7 | stream.h 8 | device.h 9 | vmm_api.h 10 | SRCS 11 | stream.cpp 12 | device.cpp 13 | vmm_api.cpp 14 | DEPS 15 | torch 16 | $<$<BOOL:${USE_NPU}>:torch_npu> 17 | $<$<BOOL:${USE_NPU}>:ascendcl> 18 | $<$<BOOL:${USE_MLU}>:torch_mlu> 19 | $<$<BOOL:${USE_MLU}>:cnrt> 20 | $<$<BOOL:${USE_MLU}>:cndrv> 21 | $<$,$>:cuda> 22 | $<$,$>:cudart> 23 | ) 24 | 25 | if(USE_NPU) 26 | add_subdirectory(npu) 27 | endif() 28 | -------------------------------------------------------------------------------- /xllm/proto/tensor.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | option go_package = "jd.com/jd-infer/xllm;xllm"; 4 | package xllm.proto; 5 | 6 | message TensorContents 7 | { 8 | repeated bool bool_contents = 1; 9 | repeated int32 int_contents = 2; 10 | repeated int64 int64_contents = 3; 11 | repeated uint32 uint_contents = 4; 12 | repeated uint64 uint64_contents = 5; 13 | repeated float fp32_contents = 6; 14 | repeated double fp64_contents = 7; 15 | repeated bytes bytes_contents = 8; 16 | } 17 | 18 | message Tensor { 19 | string name = 1; 20 | string datatype = 2; 21 | repeated int64 shape = 3; 22 | TensorContents contents = 4; 23 | } -------------------------------------------------------------------------------- /docs/en/features/continuous_scheduler.md: -------------------------------------------------------------------------------- 1 | # Continuous Scheduler 2 | 3 | ## Feature Introduction 4 | xLLM implements a scheduling strategy that supports continuous batching. Continuous batching is a dynamic batching strategy that does not wait for a batch to be filled. Instead, it starts processing as soon as requests are available, while continuously accepting new requests and adding them to the currently executing batch. This approach significantly reduces latency while maintaining high throughput. 5 | 6 | ## Usage 7 | The continuous batching scheduling strategy is implemented in xLLM. If no other scheduling strategies are enabled, continuous batching is used by default. 
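For intuition, the following is a minimal, self-contained C++ sketch of such a loop. All names here (`Request`, `step`, `kMaxBatchSize`) are illustrative only and do not correspond to xLLM's actual scheduler API:

```cpp
#include <cstddef>
#include <deque>
#include <vector>

struct Request {
  bool finished = false;  // set once the sequence emits its final token
};

constexpr std::size_t kMaxBatchSize = 256;

// One model forward pass: prefill for newly admitted requests and
// one decode step for requests already in flight (declaration only).
void step(std::vector<Request*>& batch);

void continuous_batching_loop(std::deque<Request*>& waiting,
                              std::vector<Request*>& running) {
  while (!waiting.empty() || !running.empty()) {
    // Admit new requests as soon as they arrive instead of waiting
    // for a fixed-size batch to fill up.
    while (!waiting.empty() && running.size() < kMaxBatchSize) {
      running.push_back(waiting.front());
      waiting.pop_front();
    }
    step(running);
    // Retire finished sequences immediately so their slots can be
    // reused by incoming requests; this keeps latency low while
    // sustaining high throughput.
    std::erase_if(running, [](Request* r) { return r->finished; });
  }
}
```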
-------------------------------------------------------------------------------- /xllm/core/kernels/ilu/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | set(CMAKE_CUDA_ARCHITECTURES ivcore11) 3 | file(GLOB_RECURSE ILU_HEADER_FILES 4 | "${CMAKE_CURRENT_LIST_DIR}/*.h" 5 | ) 6 | 7 | file(GLOB_RECURSE ILU_SOURCE_FILES 8 | "${CMAKE_CURRENT_LIST_DIR}/*.cpp" 9 | "${CMAKE_CURRENT_LIST_DIR}/*.cu" 10 | ) 11 | 12 | find_package(Python3 REQUIRED COMPONENTS Interpreter Development) 13 | 14 | cc_library( 15 | NAME 16 | ilu_kernels 17 | HDRS 18 | ${ILU_HEADER_FILES} 19 | SRCS 20 | ${ILU_SOURCE_FILES} 21 | DEPS 22 | torch 23 | :util 24 | ixformer_kernels 25 | ixformer 26 | ${Python3_LIBRARIES} 27 | cuinfer 28 | ) 29 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.yaml: -------------------------------------------------------------------------------- 1 | name: ❓ Question 2 | description: Submit a question 3 | title: "[Question]: " 4 | labels: ["question"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't already been addressed. Please search: [existing issues](https://github.com/jd-opensource/xllm/issues). 11 | - type: textarea 12 | attributes: 13 | label: ❓ Describe the question 14 | description: | 15 | Please provide a clear and concise description of your question. 16 | validations: 17 | required: true 18 | - type: markdown 19 | attributes: 20 | value: > 21 | Thanks for contributing 🎉! 22 | -------------------------------------------------------------------------------- /docs/zh/features/topk_topP.md: -------------------------------------------------------------------------------- 1 | # Topk&Topp算子优化 2 | 3 | ## 背景 4 | 在自然语言生成任务中,topK和topP采样策略被广泛应用于控制生成文本的多样性和质量。然而,在小模型中,这两种策略的计算耗时相对较长。这主要是由于小模型的参数较少,导致在处理概率分布时,排序和筛选的效率降低,从而影响了生成速度。因此,优化小模型中topK和topP的实现,可以提升其采样效率。 5 | 6 | 7 | ## 功能介绍 8 | 9 | topKtopP算子的实现将排序、topK、softmax和topP等多个小算子融合为一个大算子,从而提高了计算效率和性能。 10 | 11 | 12 | ## 用户接口 13 | ### 算子调用API 14 | ```c++ 15 | void top_k_top_p(torch::Tensor& logits, 16 | const torch::Tensor& topK, 17 | const torch::Tensor& topP); 18 | ``` 19 | 20 | - `logits`: 输入的logits张量,包含模型的输出分数。 21 | - `topK`: 用于选择的前K个概率的阈值张量。 22 | - `topP`: 用于选择的累积概率的阈值张量。 23 | 24 | 25 | ## 性能效果 26 | 27 | * 使用topKtopP融合算子后,在qwen2-0.5B模型中,TPOT **下降37%**,TTFT **提升10%**。 28 | -------------------------------------------------------------------------------- /xllm/pybind/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(pybind_extension) 2 | 3 | pybind_extension( 4 | NAME 5 | xllm_export 6 | COPTS 7 | -DPY_MODULE_NAME=xllm_export 8 | SRCS 9 | bind.cpp 10 | DEFINES 11 | PYBIND11_DETAILED_ERROR_MESSAGES=1 12 | LINKDIRS 13 | ${TORCH_INSTALL_PREFIX}/lib 14 | DEPS 15 | :master 16 | :request 17 | :util 18 | absl::strings 19 | brpc 20 | gflags::gflags 21 | glog::glog 22 | Python::Module 23 | torch_python 24 | torch 25 | c10 26 | ) 27 | target_link_options(xllm_export PRIVATE -Wl,-Bsymbolic) 28 | target_link_libraries(common PRIVATE leveldb::leveldb ZLIB::ZLIB OpenSSL::SSL OpenSSL::Crypto protobuf::libprotobuf) 29 | add_dependencies(common brpc-static) 30 | 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Visual Studio Code 2 | /.vscode* 3 | 4 | # 
Idea 5 | /.idea 6 | /cmake-build-debug/ 7 | /cmake-build-release/ 8 | 9 | # CMake 10 | /build* 11 | 12 | # vcpkg 13 | /.vcpkg* 14 | 15 | # cache 16 | /.*cache 17 | 18 | # deps 19 | /.deps 20 | 21 | # libtorch 22 | /libtorch 23 | 24 | # tests 25 | /Testing* 26 | 27 | # rust 28 | Cargo.lock 29 | 30 | 31 | # distribution / packaging 32 | .Python 33 | build/ 34 | dist/ 35 | eggs/ 36 | .eggs/ 37 | sdist/ 38 | wheels/ 39 | *.egg-info/ 40 | .installed.cfg 41 | *.egg 42 | MANIFEST 43 | 44 | # Python module builds 45 | *.egg-info/ 46 | xllm/*.pyd 47 | xllm/*.so 48 | xllm/version.py 49 | **/__pycache__/* 50 | 51 | # compile_commands.json from nvbench 52 | compile_commands.json 53 | 54 | # ascend kernel meta files 55 | /kernel_meta 56 | 57 | # local files 58 | /local -------------------------------------------------------------------------------- /docs/en/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | # Set the OS, Python version, and other tools you might need 8 | build: 9 | os: ubuntu-24.04 10 | tools: 11 | python: "3.13" 12 | jobs: 13 | pre_build: 14 | - cp -r docs/en/* docs/ 15 | - find docs/ -name "*.md" -exec sed -i 's#../assets/#assets/#g' {} \; 16 | 17 | # Build documentation with Mkdocs 18 | mkdocs: 19 | configuration: mkdocs_en.yml 20 | 21 | # Optionally, but recommended, 22 | # declare the Python requirements required to build your documentation 23 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 24 | python: 25 | install: 26 | - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /docs/zh/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | # Set the OS, Python version, and other tools you might need 8 | build: 9 | os: ubuntu-24.04 10 | tools: 11 | python: "3.13" 12 | jobs: 13 | pre_build: 14 | - cp -r docs/zh/* docs/ 15 | - find docs/ -name "*.md" -exec sed -i 's#../assets/#assets/#g' {} \; 16 | 17 | # Build documentation with Mkdocs 18 | mkdocs: 19 | configuration: mkdocs_zh.yml 20 | 21 | # Optionally, but recommended, 22 | # declare the Python requirements required to build your documentation 23 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 24 | python: 25 | install: 26 | - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /docs/zh/features/multi_streams.md: -------------------------------------------------------------------------------- 1 | # 多流并行 2 | 3 | ## 背景 4 | 大模型分布式推理场景中需要引入额外的通信操作,将不同设备上的计算结果聚合在一起。以Deepseek这类大规模的MoE模型为例,分布式规模通常较大,通信开销也会随之变大。若计算和通信都采用同一个stream,在通信进行的同时,device计算资源会被闲置浪费,必须等待通信完成才能开始后面的计算。 5 | 6 | 7 | ## 功能介绍 8 | xLLM在模型图层支持了多流并行功能,将输入的batch拆分成2个micro batches,一个流执行一个micro batch的计算操作,另一个流执行另一个micro batch的通信操作,计算和通信同时执行,从而掩盖通信开销。 9 | ![多流并行架构](../../assets/multi_streams_architecture.jpg) 10 | 11 | 12 | ## 使用方式 13 | 14 | xLLM中提供了gflags参数`enable_multi_stream_parallel`,默认false,如需开启在xLLM的服务启动脚本中设置为true即可,示例如下: 15 | ```shell 16 | --enable_multi_stream_parallel=true 17 | ``` 18 | 19 | 20 | ## 性能效果 21 | prefill双流并行开启后,基本可掩盖75%以上的通信开销,在DeepSeek-R1模型上,只输出1个token的情况下 22 | 23 | 
- TTFT下降 **7%** 24 | - 吞吐 **提升7%** 25 | 26 | 27 | !!! warning "注意" 28 | 双流并行目前只支持prefill阶段,请求输入越长,收益越大。 29 | 目前仅支持DeepSeek、Qwen3 dense(非MoE)模型。 -------------------------------------------------------------------------------- /cibuild/build_mlu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | function error() { 5 | echo "Require build command, e.g. python setup.py build" 6 | exit 1 7 | } 8 | 9 | IMAGE="cambricon-base/pytorch:v25.06.0-torch2.7.1-torchmlu1.27.2-ubuntu22.04-py310_xllm251104" 10 | 11 | RUN_OPTS=( 12 | --rm 13 | -t 14 | --privileged 15 | --ipc=host 16 | --network=host 17 | --pid=host 18 | --shm-size '128gb' 19 | -v /export/home:/export/home 20 | -v /usr/bin/cnmon:/usr/bin/cnmon 21 | -v /export/home/mlu_vcpkg_cache:/root/.cache/vcpkg # cached vcpkg installed dir 22 | -w /export/home 23 | ) 24 | 25 | CMD="$*" 26 | [[ -z "${CMD}" ]] && error 27 | 28 | [[ ! -x $(command -v docker) ]] && echo "ERROR: 'docker' command is missing." && exit 1 29 | 30 | docker run "${RUN_OPTS[@]}" "${IMAGE}" bash -c "set -euo pipefail; cd $(pwd); ${CMD}" 31 | -------------------------------------------------------------------------------- /xllm/proto/rerank.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | option go_package = "jd.com/jd-infer/xllm;xllm"; 4 | package xllm.proto; 5 | 6 | import "common.proto"; 7 | 8 | message RerankRequest { 9 | string model = 1; 10 | string query = 2; 11 | repeated string documents = 3; 12 | optional int32 top_n = 4; 13 | optional int32 truncate_prompt_tokens = 5; 14 | 15 | optional string user = 6; 16 | 17 | optional string service_request_id = 7; 18 | } 19 | 20 | message RerankDocument { 21 | string text = 1; 22 | } 23 | 24 | message RerankResult { 25 | int32 index = 1; 26 | 27 | RerankDocument document = 2; 28 | 29 | float relevance_score = 3; 30 | } 31 | 32 | message RerankResponse { 33 | string id = 1; 34 | 35 | string model = 2; 36 | 37 | Usage usage = 3; 38 | 39 | repeated RerankResult results = 4; 40 | } 41 | 42 | -------------------------------------------------------------------------------- /xllm/processors/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | # Define the base dependencies 5 | set(BASE_DEPS 6 | :common 7 | :layers 8 | :kv_cache 9 | :prefix_cache 10 | :block 11 | :chat_template 12 | glog::glog 13 | torch 14 | torch_python 15 | ) 16 | 17 | 18 | # Define the library 19 | cc_library( 20 | NAME 21 | processors 22 | HDRS 23 | image_processor.h 24 | clip_image_processor.h 25 | minicpmv_image_processor.h 26 | qwen2_vl_image_processor.h 27 | glm4v_image_processor.h 28 | pywarpper_image_processor.h 29 | input_processor.h 30 | SRCS 31 | image_processor.cpp 32 | clip_image_processor.cpp 33 | minicpmv_image_processor.cpp 34 | qwen2_vl_image_processor.cpp 35 | glm4v_image_processor.cpp 36 | pywarpper_image_processor.cpp 37 | DEPS 38 | ${BASE_DEPS} 39 | ) 40 | -------------------------------------------------------------------------------- /docs/zh/features/ppmatmul.md: -------------------------------------------------------------------------------- 1 | # PpMatmul 算子优化 2 | 3 | ## 背景 4 | 5 | 针对大模型推理中矩阵乘法占比高、耗时长的问题,优化了矩阵乘法算子的实现。 6 | 7 | ## 功能介绍 8 | 9 | PpMatmul 算子使用 Tiling 切分策略,将矩阵乘法分解为多个小的矩阵乘法任务。然而当 tile 数量较小时任务无法被均匀分配到所有 npu 核心上,导致 tail effect 问题,影响计算效率。我们通过预取内存或重新划分任务的方式,优化 PpMatmul 算子的性能。 10 | 11 | ## 用户接口 12 | 13 | ### 算子直调 
API 14 | 15 | ```cpp 16 | aclnnStatus aclnnPpMatmulOptGetWorkspaceSize( 17 | const aclTensor *a, 18 | const aclTensor *b, 19 | const aclTensor *out, 20 | uint64_t *workspaceSize, 21 | aclOpExecutor **executor); 22 | 23 | aclnnStatus aclnnPpMatmulOpt( 24 | void *workspace, 25 | uint64_t workspaceSize, 26 | aclOpExecutor *executor, 27 | aclrtStream stream); 28 | ``` 29 | 30 | - `a`: 输入矩阵 A。 31 | - `b`: 输入矩阵 B。 32 | - `out`: 输出矩阵,存储计算结果。 33 | 34 | ## 性能效果 35 | 36 | 对于 tile 数量较小的情况(例如 M 较小,对应于 batch size 较小的情况),在(TP=4)时,算子较优化前有 **18%** 的性能提升。 -------------------------------------------------------------------------------- /cmake/CMakeDetermineRustCompiler.cmake: -------------------------------------------------------------------------------- 1 | # ported from https://github.com/Devolutions/CMakeRust 2 | if(NOT CMAKE_Rust_COMPILER) 3 | find_package(Rust) 4 | if(RUST_FOUND) 5 | set(CMAKE_Rust_COMPILER "${RUSTC_EXECUTABLE}") 6 | set(CMAKE_Rust_COMPILER_ID "Rust") 7 | set(CMAKE_Rust_COMPILER_VERSION "${RUST_VERSION}") 8 | set(CMAKE_Rust_PLATFORM_ID "Rust") 9 | endif() 10 | endif() 11 | 12 | message(STATUS "Cargo Home: ${CARGO_HOME}") 13 | message(STATUS "Rust Compiler Version: ${RUSTC_VERSION}") 14 | 15 | mark_as_advanced(CMAKE_Rust_COMPILER) 16 | 17 | if(CMAKE_Rust_COMPILER) 18 | set(CMAKE_Rust_COMPILER_LOADED 1) 19 | endif(CMAKE_Rust_COMPILER) 20 | 21 | configure_file(${CMAKE_CURRENT_LIST_DIR}/CMakeRustCompiler.cmake.in 22 | ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/${CMAKE_VERSION}/CMakeRustCompiler.cmake IMMEDIATE @ONLY) 23 | 24 | set(CMAKE_Rust_COMPILER_ENV_VAR "RUSTC") 25 | 26 | -------------------------------------------------------------------------------- /xllm/proto/models.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | option go_package = "jd.com/jd-infer/xllm;xllm"; 4 | package xllm.proto; 5 | 6 | message ModelCard { 7 | // The model identifier, which can be referenced in the API endpoints. 8 | optional string id = 1; 9 | 10 | // The Unix timestamp (in seconds) when the model was created. 11 | optional uint32 created = 2; 12 | 13 | // the object type, which is always "model". 14 | optional string object = 3; 15 | 16 | // the organization that owns the model. 17 | optional string owned_by = 4 [json_name = "owned_by"]; 18 | } 19 | 20 | message ModelList { 21 | optional string object = 1; 22 | repeated ModelCard data = 2; 23 | } 24 | 25 | message ModelListRequest { 26 | // The model identifier. 27 | // string model = 1; 28 | } 29 | 30 | message ModelListResponse { 31 | // The list of models. 
32 | repeated ModelCard data = 1; 33 | } -------------------------------------------------------------------------------- /xllm/pybind/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import psutil 3 | import signal 4 | import socket 5 | import sys 6 | 7 | def terminate_process(pid, timeout=30): 8 | try: 9 | parent = psutil.Process(pid) 10 | except psutil.NoSuchProcess: 11 | return 12 | 13 | children = parent.children(recursive=True) 14 | procs = children + [parent] 15 | 16 | for p in procs: 17 | try: 18 | p.terminate() 19 | except psutil.NoSuchProcess: 20 | pass 21 | 22 | gone, alive = psutil.wait_procs(procs, timeout=timeout) 23 | for p in alive: 24 | try: 25 | p.kill() 26 | except psutil.NoSuchProcess: 27 | pass 28 | 29 | def get_free_port(): 30 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: 31 | s.bind(('0.0.0.0', 0)) 32 | _, port = s.getsockname() 33 | return port 34 | 35 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.yaml: -------------------------------------------------------------------------------- 1 | name: 🐛 Bug report 2 | description: Raise an issue here if you find a bug. 3 | title: "[Bug]: " 4 | labels: ["bug"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't already been addressed. Please search: [existing issues](https://github.com/jd-opensource/xllm/issues). 11 | - type: textarea 12 | attributes: 13 | label: Your environment 14 | description: | 15 | Please describe the environment you are running in. 16 | validations: 17 | required: true 18 | - type: textarea 19 | attributes: 20 | label: 🐛 Describe the bug 21 | description: | 22 | Please provide a clear and concise description of what the bug is. 23 | validations: 24 | required: true 25 | - type: markdown 26 | attributes: 27 | value: > 28 | Thanks for reporting the bug! -------------------------------------------------------------------------------- /cibuild/install/install_gcc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | [ -n "$GCC_VERSION" ] 6 | 7 | install_ubuntu() { 8 | # Need the official toolchain repo to get alternate packages 9 | add-apt-repository ppa:ubuntu-toolchain-r/test 10 | apt-get update 11 | apt-get install -y g++-$GCC_VERSION 12 | update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-"$GCC_VERSION" 50 13 | update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-"$GCC_VERSION" 50 14 | update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-"$GCC_VERSION" 50 15 | 16 | 17 | # Cleanup package manager 18 | apt-get autoclean && apt-get clean 19 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 20 | } 21 | 22 | 23 | ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') 24 | case "$ID" in 25 | ubuntu) 26 | install_ubuntu 27 | ;; 28 | *) 29 | echo "Unable to determine OS..." 
30 | exit 1 31 | ;; 32 | esac -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | # mkdocs-material 3 | # mkdocs-minify-plugin 4 | # python-markdown-math 5 | # regex 6 | # ruff 7 | 8 | # jinja2~=3.1 9 | # markdown~=3.2 10 | # mkdocs~=1.6 11 | # mkdocs-material-extensions~=1.3 12 | # pygments~=2.16 13 | # pymdown-extensions~=10.2 14 | 15 | # # Requirements for plugins 16 | # babel~=2.10 17 | # colorama~=0.4 18 | # paginate~=0.5 19 | # backrefs~=5.7.post1 20 | # requests~=2.26 21 | 22 | # Requirements for core 23 | jinja2 24 | markdown 25 | mkdocs 26 | mkdocs-material 27 | mkdocs-material-extensions 28 | pygments 29 | pymdown-extensions 30 | mkdocs-minify-plugin 31 | python-markdown-math 32 | 33 | mkdocs-git-revision-date-localized-plugin 34 | # Requirements for plugins 35 | babel 36 | colorama 37 | paginate 38 | backrefs 39 | requests 40 | 41 | # Temporarily pin click until this is resolved in MkDocs, see 42 | # https://github.com/mkdocs/mkdocs/issues/4014#issuecomment-3146508306 43 | click -------------------------------------------------------------------------------- /docs/zh/features/moe_params.md: -------------------------------------------------------------------------------- 1 | # EP并行 2 | ## 背景介绍 3 | 在部署DeepSeek-R1 671B参数规模模型时,传统分布式部署面临显存利用率低、通信开销大、硬件成本高昂等核心瓶颈,因此需要引入ep并行。 4 | + 在同等资源下,单张卡上的Expert越少,可用于KV Cache的显存越多,可Cache的token个数越多。 5 | + 因MLA的特性,同等资源下TP Size越小,冗余的KV Cache就越少,可Cache的token个数越多。 6 | + 采用大规模ep并行部署,可以将同一个expert的token计算集中到同一设备上,提高硬件利用率。 7 | ## 参数设置 8 | + dp_size:设置Attention部分的dp规模大小,默认值为1,可设置为2的指数倍,当dp_size不等于卡数时,dp组内为tp并行. 9 | + ep_size:设置MoE部分的ep规模大小,默认值为1,可设置为2的指数倍,当ep_size不等于卡数时,ep组内为tp并行. 10 | + enable_mla:默认为false,当模型使用mla时需要设置为true. 11 | + expert_parallel_degree:ep并行相关参数,不开启ep时默认设置为0,开启ep时默认为1,此时为ep level1,当ep_size等于卡数时可以设置为2开启ep level2.(完整启动参数示例见下方代码块) 
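下面给出一个与上文参数对应的启动参数示例(以64卡部署、attn部分dp32tp2、moe部分ep32tp2为例)。该示例仅作示意:flag名称沿用本文的参数名,具体名称与取值请以实际版本的gflags定义为准:
```bash
--dp_size=32                # 32个dp组,组内为tp并行(64卡/32组 = tp2)
--ep_size=32                # 32个ep组,组内为tp并行
--enable_mla=true           # DeepSeek-R1使用MLA,需要开启
--expert_parallel_degree=1  # ep_size(32)不等于卡数(64),使用ep level1
```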
12 | ## 方案设计 13 | + 当开启ep时,默认为ep level1,此时attn与moe部分计算完成后,通过All Gather全卡通讯将数据发送到下一阶段,以64卡attn部分dp32tp2 moe部分ep32tp2为例,执行流程如下: 14 | ![ep level1执行流程](../../assets/moe_eplevel1.jpg) 15 | + 当ep_size设置为卡数时,可以开启ep level2,此时attn部分与moe部分之间通讯变为ALL2ALL,只向需要的卡发送数据,降低通讯量与通讯开销,以64卡部署为例,执行流程如下: 16 | ![ep level2执行流程](../../assets/moe_eplevel2.jpg) 17 | -------------------------------------------------------------------------------- /docs/mkdocs/stylesheets/extra.css: -------------------------------------------------------------------------------- 1 | :root > * { 2 | --md-primary-fg-color: #F1002B; 3 | --md-primary-fg-color--light: #F1002B; 4 | --md-primary-fg-color--dark: #af0510; 5 | 6 | --md-accent-fg-color: #F1002B; 7 | --md-accent-fg-color--light: #F1002B; 8 | --md-accent-fg-color--dark: #af0510; 9 | } 10 | 11 | /* :root > * { 12 | --md-footer-bg-color: var(--md-primary-fg-color); 13 | --md-footer-fg-color: var(--md-primary-bg-color); 14 | --md-footer-fg-color--light: var(--md-primary-bg-color--light); 15 | --md-footer-fg-color--lighter: var(--md-primary-bg-color--lighter); 16 | } */ 17 | 18 | [data-md-color-scheme="jd"] { 19 | --md-primary-fg-color: #FB002B; 20 | --md-primary-fg-color--light: #FB002B; 21 | --md-primary-fg-color--dark: #af0510; 22 | 23 | --md-accent-fg-color: #FB002B; 24 | --md-accent-fg-color--light: #FB002B; 25 | --md-accent-fg-color--dark: #af0510; 26 | } 27 | 28 | -------------------------------------------------------------------------------- /xllm/models/llm/npu/llama3.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "llama.h" 19 | 20 | namespace xllm { 21 | // register the causal model 22 | REGISTER_CAUSAL_MODEL(llama3, LlamaForCausalLM); 23 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/common/rate_limiter_test.cpp: -------------------------------------------------------------------------------- 1 | #include "rate_limiter.h" 2 | 3 | #include <gtest/gtest.h> 4 | 5 | #include "global_flags.h" 6 | 7 | namespace xllm { 8 | 9 | TEST(RequestLimiterTest, Basic) { 10 | // Set the maximum number of concurrent requests to 1. 11 | FLAGS_max_concurrent_requests = 1; 12 | RateLimiter rate_limiter; 13 | // The current number of concurrent requests is 0, no rate limiting is 14 | // applied. 15 | EXPECT_EQ(rate_limiter.is_limited(), false); 16 | // The current number of concurrent requests is 1, rate limiting is applied. 17 | EXPECT_EQ(rate_limiter.is_limited(), true); 18 | // Decrease the number of concurrent requests by one, changing the concurrency 19 | // from 1 to 0. 20 | rate_limiter.decrease_one_request(); 21 | // The current number of concurrent requests is 0, no rate limiting is 22 | // applied. 
23 | EXPECT_EQ(rate_limiter.is_limited(), false); 24 | } 25 | 26 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/framework/eplb/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_binary) 2 | include(cc_library) 3 | include(cc_test) 4 | 5 | include_directories( 6 | ${CMAKE_SOURCE_DIR}/xllm/core/kernels/ascend 7 | ${CMAKE_SOURCE_DIR}/xllm/core/kernels/ascend/core/include 8 | ) 9 | 10 | cc_library( 11 | NAME 12 | eplb 13 | HDRS 14 | eplb_executor.h 15 | eplb_manager.h 16 | eplb_policy.h 17 | expert_weight_buffer_shm.h 18 | expert_buffer_manager.h 19 | SRCS 20 | eplb_executor.cpp 21 | eplb_manager.cpp 22 | eplb_policy.cpp 23 | expert_weight_buffer_shm.cpp 24 | expert_buffer_manager.cpp 25 | DEPS 26 | :request 27 | :common 28 | glog::glog 29 | torch 30 | :platform 31 | ) 32 | 33 | set(TEST_SRCS 34 | eplb_policy_test.cpp 35 | ) 36 | 37 | cc_test( 38 | NAME 39 | eplb_policy_test 40 | SRCS 41 | ${TEST_SRCS} 42 | DEPS 43 | torch 44 | :eplb 45 | fmt::fmt 46 | GTest::gtest_main 47 | ) 48 | 49 | -------------------------------------------------------------------------------- /xllm/core/framework/tokenizer/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | add_subdirectory(tokenizers) 5 | 6 | cc_library( 7 | NAME 8 | tokenizer 9 | HDRS 10 | tokenizer_args.h 11 | tokenizer.h 12 | tokenizer_factory.h 13 | tiktoken_tokenizer.h 14 | sentencepiece_tokenizer.h 15 | fast_tokenizer.h 16 | tokenizer_proxy.h 17 | rec_tokenizer.h 18 | SRCS 19 | tokenizer_factory.cpp 20 | tiktoken_tokenizer.cpp 21 | sentencepiece_tokenizer.cpp 22 | fast_tokenizer.cpp 23 | tokenizer_proxy.cpp 24 | rec_tokenizer.cpp 25 | DEPS 26 | :common 27 | :sentencepiece 28 | absl::flat_hash_map 29 | absl::strings 30 | glog::glog 31 | rust_tokenizers 32 | re2::re2 33 | ) 34 | 35 | cc_test( 36 | NAME 37 | fast_tokenizer_test 38 | SRCS 39 | tests/fast_tokenizer_tests.cpp 40 | DEPS 41 | :tokenizer 42 | glog::glog 43 | GTest::gtest_main 44 | ) 45 | 46 | -------------------------------------------------------------------------------- /xllm/core/util/pretty_print.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | #include <string> 19 | 20 | namespace xllm { 21 | 22 | std::string readable_size(size_t bytes); 23 | 24 | } // namespace xllm 25 | -------------------------------------------------------------------------------- /docs/en/features/zero_evict_scheduler.md: -------------------------------------------------------------------------------- 1 | # Zero Evict Scheduler 2 | 3 | ## Feature Introduction 4 | xLLM supports the zero evict scheduling strategy. The zero evict scheduling strategy is an algorithm designed to minimize request eviction rates, reducing the need for prefill recomputation on evicted requests and consequently improving TPOT (Time Per Output Token). 5 | This scheduling algorithm employs simulation rounds to detect whether a request can be scheduled without causing the eviction of other requests. 6 | 7 | ## Usage 8 | The aforementioned strategy has been implemented in xLLM and is exposed through gflags parameters to control the feature's on/off state. 9 | 10 | - Enable the zero evict strategy and set the maximum decode tokens per sequence. 11 | ``` 12 | --use_zero_evict=true 13 | --max_decode_token_per_sequence=256 14 | ``` 15 | 16 | ## Performance Impact 17 | After enabling zero evict, on the Qwen3-8B model with an E2E latency constraint, the TPOT latency **decreased by 27%**. -------------------------------------------------------------------------------- /xllm/core/framework/xtensor/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_binary) 2 | include(cc_library) 3 | 4 | cc_library( 5 | NAME 6 | xtensor 7 | HDRS 8 | options.h 9 | phy_page.h 10 | phy_page_pool.h 11 | xtensor.h 12 | xtensor_manager.h 13 | xtensor_manager_client.h 14 | remote_xtensor_manager.h 15 | xtensor_manager_service.h 16 | xtensor_manager_server.h 17 | xtensor_manager_pool.h 18 | multi_layer_xtensor.h 19 | multi_layer_xtensor_transfer.h 20 | SRCS 21 | phy_page.cpp 22 | phy_page_pool.cpp 23 | xtensor.cpp 24 | xtensor_manager.cpp 25 | xtensor_manager_client.cpp 26 | remote_xtensor_manager.cpp 27 | xtensor_manager_service.cpp 28 | xtensor_manager_server.cpp 29 | xtensor_manager_pool.cpp 30 | multi_layer_xtensor.cpp 31 | multi_layer_xtensor_transfer.cpp 32 | DEPS 33 | torch 34 | :request 35 | :common 36 | glog::glog 37 | proto::xllm_proto 38 | :collective_service 39 | :platform 40 | ) -------------------------------------------------------------------------------- /xllm/proto/xtensor_manager.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | option go_package = "jd.com/jd-infer/xllm;xllm"; 4 | package xllm.proto; 5 | option cc_enable_arenas = true; 6 | option cc_generic_services = true; 7 | 8 | import "common.proto"; 9 | 10 | message SeqId { 11 | int32 seq_id = 1; 12 | } 13 | 14 | message AllocatePagesRequest { 15 | int32 seq_id = 1; 16 | uint64 num_tokens = 2; 17 | } 18 | 19 | message NumPages { 20 | uint64 num_pages = 1; 21 | } 22 | 23 | message Utilization { 24 | double utilization = 1; 25 | } 26 | 27 | // PageManager receives actions from the master engine.
28 | service DistributeXTensorManager { 29 | rpc Hello (Status) returns (Status); 30 | rpc Allocate (AllocatePagesRequest) returns (Status); 31 | rpc Deallocate (SeqId) returns (Empty); 32 | rpc Cache (SeqId) returns (Empty); 33 | rpc NumFreePagesPerLayer (Empty) returns (NumPages); 34 | rpc NumUsedPagesPerLayer (Empty) returns (NumPages); 35 | rpc KvCacheUtilization (Empty) returns (Utilization); 36 | } -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | # NPU Timeline Generation Guide 2 | ## Prerequisites 3 | - Python environment 4 | - Chrome browser (for visualization) 5 | ## Implementation Steps 6 | ### 1. Code Modification 7 | #### Register the subscriber 8 | Add the following at the beginning of your program: 9 | ```cpp 10 | MsptiMetrics::register_subscriber(); 11 | ``` 12 | #### Add tracing to ACLNN functions (works for msprof as well) 13 | Insert the following macro in your ACLNN functions where you want to measure performance: 14 | ```cpp 15 | LLM_MSTX_RANGE(); 16 | ``` 17 | #### Release the subscriber 18 | Add this at the end of your program: 19 | ```cpp 20 | MsptiMetrics::release_subscriber(); 21 | ``` 22 | ### 2. Log Processing 23 | After running your program, process the generated log file using the timeline script: 24 | ```bash 25 | python npu_timeline.py -i custom_log.log -o custom_output.json 26 | ``` 27 | ### 3. Visualization 28 | Open Chrome browser 29 | Navigate to: chrome://tracing 30 | Load the generated JSON file: custom_output.json -------------------------------------------------------------------------------- /xllm/core/framework/prefix_cache/prefix_cache_with_upload.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <vector> 4 | 5 | #include "prefix_cache.h" 6 | #include "util/double_buffer.h" 7 | 8 | namespace xllm { 9 | class PrefixCacheWithUpload final : public PrefixCache { 10 | public: 11 | explicit PrefixCacheWithUpload(uint32_t block_size); 12 | 13 | ~PrefixCacheWithUpload(); 14 | 15 | // insert the token ids and blocks into the prefix tree 16 | // and set hash key to the corresponding block 17 | // return the length of new inserted tokens 18 | size_t insert(const Slice<int32_t>& token_ids, 19 | std::vector<Block>& blocks) override; 20 | 21 | // evict blocks held by the prefix cache 22 | // return the actual number of evicted blocks 23 | size_t evict(size_t n_blocks) override; 24 | 25 | virtual KvCacheEvent* get_upload_kvcache_events() override; 26 | 27 | private: 28 | ThreadPool threadpool_; 29 | 30 | DoubleBuffer<KvCacheEvent> db_kvcache_events_; 31 | }; 32 | 33 | } // namespace xllm 34 | -------------------------------------------------------------------------------- /xllm/core/layers/common/tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_test) 2 | 3 | if(USE_MLU) 4 | list(APPEND TEST_SRCS indexer_tests.cpp mla_tests.cpp deepseek_v2_decoder_layer_tests.cpp) 5 | endif() 6 | 7 | # Add test for common test 8 | cc_test( 9 | NAME 10 | layer_test 11 | SRCS 12 | dense_mlp_tests.cpp 13 | fused_moe_tests.cpp 14 | tests_utils.cpp 15 | ${TEST_SRCS} 16 | DEPS 17 | :common_layers 18 | :parallel_state 19 | :model 20 | :model_context 21 | :state_dict 22 | glog::glog 23 | torch 24 | GTest::gtest_main 25 | ) 26 | 27 | # Add test for DeepEP 28 | # This test must exist individually, because it contains forked processes 29 | # which do not allow
any device init on the main process 30 | cc_test( 31 | NAME 32 | deep_ep_test 33 | SRCS 34 | deep_ep_tests.cpp 35 | tests_utils.cpp 36 | DEPS 37 | :common_layers 38 | :parallel_state 39 | :model 40 | :model_context 41 | :state_dict 42 | GTest::gtest_main 43 | torch 44 | glog::glog 45 | ) 46 | -------------------------------------------------------------------------------- /xllm/cc_api/macros.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | namespace xllm { 19 | 20 | #ifdef XLLM_CAPI_WEAK 21 | #define XLLM_CAPI_EXPORT \ 22 | __attribute__((visibility("default"))) __attribute((weak)) 23 | #else 24 | #define XLLM_CAPI_EXPORT __attribute__((visibility("default"))) 25 | #endif // XLLM_CAPI_WEAK 26 | 27 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/framework/batch/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_binary) 2 | include(cc_library) 3 | include(cc_test) 4 | 5 | cc_library( 6 | NAME 7 | batch 8 | HDRS 9 | dit_batch.h 10 | batch.h 11 | batch_factory.h 12 | batch_input_builder.h 13 | mposition.h 14 | SRCS 15 | dit_batch.cpp 16 | batch.cpp 17 | batch_factory.cpp 18 | batch_input_builder.cpp 19 | mposition.cpp 20 | beam_search.h 21 | DEPS 22 | :request 23 | :runtime 24 | :common 25 | glog::glog 26 | ) 27 | 28 | cc_test( 29 | NAME 30 | batch_test 31 | SRCS 32 | batch_test.cpp 33 | DEPS 34 | :batch 35 | absl::time 36 | GTest::gtest_main 37 | $<$:torch_npu> 38 | ) 39 | target_link_libraries(batch_test 40 | PUBLIC 41 | Python::Python 42 | $<$:ascendcl> 43 | $<$:hccl> 44 | $<$:c_sec> 45 | $<$:nnopbase>) 46 | 47 | -------------------------------------------------------------------------------- /docs/en/features/chunked_scheduler.md: -------------------------------------------------------------------------------- 1 | # Chunked Scheduler 2 | 3 | ## Feature Introduction 4 | xLLM supports the chunked prefill scheduling strategy. Chunked prefill is a technique that optimizes large language model inference by splitting long prompts into smaller chunks for batch processing, rather than processing the entire prompt at once. 5 | This method can effectively reduce peak GPU memory usage, improve device utilization, and better schedule and mix processing with requests from the decode stage. 6 | 7 | ## Usage 8 | The aforementioned strategy has been implemented in xLLM and is exposed through gflags parameters to control the feature's on/off state. 9 | 10 | - Enable chunked prefill and set the chunk size; if the chunk size is not set, it defaults to max_tokens_per_batch.
11 | ```bash 12 | --enable_chunked_prefill=true 13 | --max_tokens_per_chunk_for_prefill=20480 # optional 14 | ``` 15 | 16 | 17 | 18 | ## Performance Impact 19 | After enabling chunked prefill, on the Qwen3-8B model with a TPOT constraint of 50ms, the TTFT latency **decreased by 46%**. -------------------------------------------------------------------------------- /examples/generate_embedding.py: -------------------------------------------------------------------------------- 1 | # python examples/generate_embedding.py --model='/path/models/Qwen3-8B' --devices='npu:0' 2 | # python generate_embedding.py --model='/path/models/Qwen3-8B' --devices='npu:0,npu:1' 3 | 4 | from xllm import ArgumentParser, Embedding, RequestParams 5 | 6 | # Create an EmbeddingLM. 7 | parser = ArgumentParser() 8 | emb = Embedding(**vars(parser.parse_args())) 9 | 10 | # Create request params, including sampling params 11 | request_params = RequestParams() 12 | request_params.is_embeddings = True 13 | request_params.max_tokens = 1 14 | 15 | inputs = [ 16 | "Hello, my name is", 17 | "The president of the United States is", 18 | "The capital of France is", 19 | "The future of AI is", 20 | ] 21 | 22 | outputs = emb.embedding(inputs, request_params, True) 23 | 24 | # Print the outputs. 25 | for i, output in enumerate(outputs): 26 | input_str = output.prompt 27 | generated_embedding = output.outputs[0].embeddings 28 | print(f"Input: {input_str!r}, Generated embedding: {generated_embedding!r}") 29 | 30 | emb.finish() 31 | 32 | -------------------------------------------------------------------------------- /cibuild/build_npu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | function error() { 5 | echo "Require build command, e.g. python setup.py build" 6 | exit 1 7 | } 8 | 9 | IMAGE="quay.io/jd_xllm/xllm-ai:xllm-dev-hb-rc2-x86" 10 | 11 | RUN_OPTS=( 12 | --rm 13 | -t 14 | --privileged 15 | --ipc=host 16 | --network=host 17 | --device=/dev/davinci0 18 | --device=/dev/davinci_manager 19 | --device=/dev/devmm_svm 20 | --device=/dev/hisi_hdc 21 | -v /var/queue_schedule:/var/queue_schedule 22 | -v /usr/local/Ascend/driver:/usr/local/Ascend/driver 23 | -v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi 24 | -v /usr/local/sbin/:/usr/local/sbin/ 25 | -v /export/home:/export/home 26 | -v /export/home/npu_vcpkg_cache:/root/.cache/vcpkg # cached vcpkg installed dir 27 | -v /etc/hccn.conf:/etc/hccn.conf 28 | -w /export/home 29 | ) 30 | 31 | CMD="$*" 32 | [[ -z "${CMD}" ]] && error 33 | 34 | [[ ! -x $(command -v docker) ]] && echo "ERROR: 'docker' command is missing."
&& exit 1 35 | 36 | docker run "${RUN_OPTS[@]}" "${IMAGE}" bash -c "set -euo pipefail; cd $(pwd); ${CMD}" 37 | -------------------------------------------------------------------------------- /xllm/core/framework/request/dit_request_params.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "dit_request_output.h" 11 | #include "dit_request_state.h" 12 | #include "image_generation.pb.h" 13 | #include "request.h" 14 | #include "tensor.pb.h" 15 | namespace xllm { 16 | 17 | struct DiTRequestParams { 18 | DiTRequestParams() = default; 19 | DiTRequestParams(const proto::ImageGenerationRequest& request, 20 | const std::string& x_rid, 21 | const std::string& x_rtime); 22 | 23 | bool verify_params(DiTOutputCallback callback) const; 24 | 25 | // request id 26 | std::string request_id; 27 | std::string x_request_id; 28 | std::string x_request_time; 29 | 30 | std::string model; 31 | 32 | DiTInputParams input_params; 33 | // Mandatory: Generation control parameters (encapsulates all fields related 34 | // to "image generation process") 35 | DiTGenerationParams generation_params; 36 | }; 37 | 38 | } // namespace xllm -------------------------------------------------------------------------------- /docs/zh/features/acl_graph.md: -------------------------------------------------------------------------------- 1 | # ACLGraph 2 | 3 | 4 | ## Feature Introduction 5 | 6 | To optimize host-side scheduling performance, NPU recently introduced ACLGraph, a graph-mode solution similar to CUDA Graph. Compared with the traditional mode, where the CPU submits many small tasks and the NPU frequently launches small kernels, ACLGraph lets the CPU submit one large task and the NPU then streams the small kernels internally, significantly reducing launch time and NPU bubbles. 7 | 8 | To use ACLGraph in the xLLM engine, we implemented the following features: 9 | ### Dynamic-dimension parameterization 10 | - Key dynamic dimensions (such as batch size and sequence length) are passed as inputs to the whole graph, improving flexibility. During graph memory allocation and kernel configuration, these dynamic parameters are used to compute the actual required values, e.g. the block table size via $block\_table\_size = batch\_size \times (max\_seq\_len / block\_size)$. At graph launch time, the actual batch size and maximum sequence length are passed in as parameters to ensure that kernels access data with the correct strides. 11 | 12 | ### Memory pool shared across shapes 13 | - To avoid the waste of giving each shape its own memory buffers (input, output, and intermediate tensors), we use an expandable memory pool. All shapes share the pool's base address; each shape uses a different offset from that base address. 14 | 15 | 16 | ## Usage 17 | 18 | The features above are implemented inside the xLLM engine and are transparent to users; no knowledge of the internals is needed, simply enable the feature where applicable via the gflags parameter `enable_aclgraph`. It defaults to false; to enable it, set it to true in the xLLM service launch script, for example: 19 | ```shell 20 | --enable_aclgraph=true 21 | ``` 22 | 23 | 24 | ## Performance Impact 25 | - With ACLGraph enabled, decode-stage throughput **improves by 8%-10%** on models such as Qwen3-0.6B and Qwen3-1.7B. 26 | 27 | !!! warning "Note" 28 | - When adding ACLGraph support for a new model, check whether the kernels used in the computation implement dynamic-dimension parameterization. If not, the kernels need to be reimplemented. 29 | 30 | !!!
tip "Future Plans" 31 | * Support adapting the communication operations between attention DP and FFN EP in MoE models to different shapes. 32 | -------------------------------------------------------------------------------- /xllm/function_call/partial_json_parser/include/partial_json_parser/options.h: -------------------------------------------------------------------------------- 1 | #ifndef PARTIAL_JSON_PARSER_OPTIONS_H 2 | #define PARTIAL_JSON_PARSER_OPTIONS_H 3 | 4 | namespace partial_json_parser { 5 | 6 | // TypeOptions enum that matches the Go implementation exactly 7 | enum TypeOptions { 8 | STR = 1 << 0, // 1 9 | NUM = 1 << 1, // 2 10 | ARR = 1 << 2, // 4 11 | OBJ = 1 << 3, // 8 12 | NULL_TYPE = 1 << 4, // 16 (using NULL_TYPE to avoid conflict with NULL macro) 13 | BOOL = 1 << 5, // 32 14 | NAN_TYPE = 1 << 6, // 64 (using NAN_TYPE to avoid conflict with NAN macro) 15 | INFINITY_TYPE = 16 | 1 17 | << 7, // 128 (using INFINITY_TYPE to avoid conflict with INFINITY macro) 18 | NEG_INFINITY = 1 << 8, // 256 19 | 20 | // Composite options - exactly matching Go implementation 21 | INF = INFINITY_TYPE | NEG_INFINITY, 22 | SPECIAL = NULL_TYPE | BOOL | INF | NAN_TYPE, 23 | ATOM = STR | NUM | SPECIAL, 24 | COLLECTION = ARR | OBJ, 25 | ALL = ATOM | COLLECTION 26 | }; 27 | 28 | } // namespace partial_json_parser 29 | 30 | #endif // PARTIAL_JSON_PARSER_OPTIONS_H -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.yml: -------------------------------------------------------------------------------- 1 | name: 🚀 Feature request 2 | description: Submit a request for a new feature 3 | title: "[Feature]: " 4 | labels: ["feature"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't already been addressed. Please search: [existing issues](https://github.com/jd-opensource/xllm/issues). 11 | - type: textarea 12 | attributes: 13 | label: 🚀 The motivation and feature 14 | description: > 15 | A clear and concise description of the feature proposal. Please outline the motivation for the proposal. 16 | validations: 17 | required: true 18 | - type: textarea 19 | attributes: 20 | label: Alternatives 21 | description: > 22 | A description of any alternative solutions or features you've considered, if any. 23 | - type: textarea 24 | attributes: 25 | label: Additional context 26 | description: > 27 | Add any other context or screenshots about the feature request. 28 | - type: markdown 29 | attributes: 30 | value: > 31 | Thanks for contributing 🎉! 32 | -------------------------------------------------------------------------------- /xllm/core/common/rate_limiter.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License.
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include <atomic> 19 | 20 | namespace xllm { 21 | 22 | class RateLimiter final { 23 | public: 24 | RateLimiter() = default; 25 | 26 | ~RateLimiter() = default; 27 | 28 | bool is_limited(); 29 | 30 | void decrease_one_request(); 31 | 32 | private: 33 | std::atomic<int32_t> num_concurrent_requests_{0}; 34 | }; 35 | 36 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/kernels/cuda/matmul.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include "cuda_ops_api.h" 17 | 18 | namespace xllm::kernel::cuda { 19 | 20 | torch::Tensor matmul(torch::Tensor a, 21 | torch::Tensor b, 22 | std::optional<torch::Tensor> bias) { 23 | namespace F = torch::nn::functional; 24 | return F::linear(a, b, bias.value_or(torch::Tensor())); 25 | } 26 | 27 | } // namespace xllm::kernel::cuda -------------------------------------------------------------------------------- /xllm/core/kernels/ilu/matmul.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include "ilu_ops_api.h" 17 | 18 | namespace xllm::kernel::ilu { 19 | 20 | torch::Tensor matmul(torch::Tensor a, 21 | torch::Tensor b, 22 | std::optional<torch::Tensor> bias) { 23 | namespace F = torch::nn::functional; 24 | return F::linear(a, b, bias.value_or(torch::Tensor())); 25 | } 26 | 27 | } // namespace xllm::kernel::ilu -------------------------------------------------------------------------------- /xllm/processors/input_processor.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | 20 | #include 21 | 22 | #include "core/framework/request/mm_data.h" 23 | 24 | namespace xllm { 25 | 26 | class InputProcessor { 27 | public: 28 | virtual ~InputProcessor() = default; 29 | 30 | virtual void process(std::string& prompt, const MMData& mm_data) = 0; 31 | }; 32 | 33 | } // namespace xllm 34 | -------------------------------------------------------------------------------- /cibuild/install/install_python.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | PYTHON_VERSION="$1" 6 | shift 7 | 8 | NO_RC_PYTHON_VERSION="${PYTHON_VERSION%rc*}" 9 | 10 | url="https://www.python.org/ftp/python/${NO_RC_PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz" 11 | 12 | pushd /tmp 13 | wget "$url" 14 | tar xvzf "Python-${PYTHON_VERSION}.tgz" 15 | cd "Python-${PYTHON_VERSION}" 16 | 17 | # Extract major and minor version number 18 | MAJOR=$(echo "${PYTHON_VERSION}" | cut -d . -f 1) 19 | MINOR=$(echo "${PYTHON_VERSION}" | cut -d . -f 2) 20 | 21 | INSTALL_FOLDER="/opt/python/cp${MAJOR}${MINOR}-cp${MAJOR}${MINOR}" 22 | 23 | ./configure \ 24 | --enable-shared \ 25 | --enable-ipv6 \ 26 | --prefix=${INSTALL_FOLDER} \ 27 | LDFLAGS=-Wl,-rpath=${INSTALL_FOLDER}/lib,--disable-new-dtags 28 | 29 | make -j$(nproc) install 30 | # upgrade pip, setuptools and wheel 31 | ${INSTALL_FOLDER}/bin/python3 -m pip install --upgrade pip setuptools wheel 32 | # create symlinks 33 | cp ${INSTALL_FOLDER}/bin/pip3 ${INSTALL_FOLDER}/bin/pip 34 | ln -s ${INSTALL_FOLDER}/bin/python3 ${INSTALL_FOLDER}/bin/python 35 | 36 | rm -rf "Python-${PYTHON_VERSION}" 37 | popd 38 | 39 | -------------------------------------------------------------------------------- /xllm/core/framework/xtensor/phy_page.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "phy_page.h" 17 | 18 | namespace xllm { 19 | PhyPage::PhyPage(torch::Device device) : device_(device) { 20 | int32_t device_id = device_.index(); 21 | 22 | // create a physical memory handle for the device 23 | vmm::create_phy_mem_handle(phy_handle_, device_id); 24 | } 25 | 26 | PhyPage::~PhyPage() { vmm::release_phy_mem_handle(phy_handle_); } 27 | } // namespace xllm -------------------------------------------------------------------------------- /docs/en/features/prefix_cache.md: -------------------------------------------------------------------------------- 1 | # Prefix Cache Optimization 2 | 3 | ## Feature Introduction 4 | xLLM supports prefix cache matching. The prefix cache is based on `murmur_hash` and uses an LRU eviction policy, delivering superior matching efficiency and increased prefix cache hit rates. 5 | Additionally, the prefix cache has been optimized to support the `continuous_scheduler`, `chunked_scheduler`, and `zero_evict_scheduler`. The cache is updated immediately after prefill operations, enhancing matching timeliness. For the `chunked_scheduler`, multi-stage chunked prefill matching is supported, reducing computational overhead and minimizing KV cache usage as much as possible. 6 | 7 | ## Usage 8 | The prefix cache is implemented in xLLM and exposed through gflags parameters to control its functionality. 9 | 10 | - Enable prefix cache with specific policy and settings: 11 | ``` 12 | --enable_prefix_cache=true 13 | ``` 14 | 15 | ## Performance Impact 16 | After enabling prefix cache, on the Qwen3-8B model with a TPOT constraint of 50ms, the E2E latency **decreased by 10%**. 17 | 18 | !!! warning "Note" 19 | PD separation scheduler is not currently supported. 
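To make the matching scheme above concrete, here is a minimal, self-contained C++ sketch of hash-chained block matching with LRU eviction. It is illustrative only, not xLLM's implementation: the names (`SimplePrefixCache`, `chain_hash`, `touch`) are invented for this example, and a simple FNV-style mix stands in for the `murmur_hash` the real cache uses.

```cpp
// Illustrative sketch only. Each fixed-size block of token ids is keyed by a
// hash that chains in the previous block's hash, so a key hit on block k
// implies the whole k-block prefix matches; blocks are evicted in LRU order.
#include <cstdint>
#include <iostream>
#include <list>
#include <unordered_map>
#include <vector>

class SimplePrefixCache {
 public:
  explicit SimplePrefixCache(size_t block_size) : block_size_(block_size) {}

  // Insert all full blocks of `tokens`; returns the number of new blocks.
  size_t insert(const std::vector<int32_t>& tokens) {
    size_t inserted = 0;
    uint64_t prev = 0;
    for (size_t i = 0; i + block_size_ <= tokens.size(); i += block_size_) {
      prev = chain_hash(prev, tokens, i);
      if (map_.find(prev) == map_.end()) {
        lru_.push_front(prev);
        map_[prev] = lru_.begin();
        ++inserted;
      } else {
        touch(prev);  // refresh LRU position on re-insert
      }
    }
    return inserted;
  }

  // Return how many leading tokens are already cached (block granularity).
  size_t match(const std::vector<int32_t>& tokens) {
    size_t matched = 0;
    uint64_t prev = 0;
    for (size_t i = 0; i + block_size_ <= tokens.size(); i += block_size_) {
      prev = chain_hash(prev, tokens, i);
      if (map_.find(prev) == map_.end()) break;
      touch(prev);
      matched += block_size_;
    }
    return matched;
  }

  // Evict up to n least-recently-used blocks; returns how many were evicted.
  size_t evict(size_t n) {
    size_t evicted = 0;
    while (evicted < n && !lru_.empty()) {
      map_.erase(lru_.back());
      lru_.pop_back();
      ++evicted;
    }
    return evicted;
  }

 private:
  // xLLM uses murmur_hash here; an FNV-style mix is a stand-in.
  uint64_t chain_hash(uint64_t prev, const std::vector<int32_t>& t, size_t b) {
    uint64_t h = prev ^ 1469598103934665603ULL;  // fold in the parent's hash
    for (size_t j = b; j < b + block_size_; ++j) {
      h = (h ^ static_cast<uint32_t>(t[j])) * 1099511628211ULL;
    }
    return h;
  }

  void touch(uint64_t key) {
    lru_.splice(lru_.begin(), lru_, map_[key]);  // move to MRU position
  }

  size_t block_size_;
  std::list<uint64_t> lru_;  // front = most recently used
  std::unordered_map<uint64_t, std::list<uint64_t>::iterator> map_;
};

int main() {
  SimplePrefixCache cache(/*block_size=*/4);
  std::vector<int32_t> a = {1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<int32_t> b = {1, 2, 3, 4, 9, 9, 9, 9};
  cache.insert(a);
  std::cout << cache.match(b) << "\n";  // prints 4: only block 0 matches
  return 0;
}
```

Because each block's key chains in its parent's key, equal keys imply equal prefixes, which is what lets the scheduler reuse KV cache at block granularity immediately after prefill.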
-------------------------------------------------------------------------------- /docs/mkdocs/overrides/.icons/email-fill.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /xllm/core/framework/kv_cache/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_binary) 2 | include(cc_library) 3 | include(cc_test) 4 | 5 | 6 | cc_library( 7 | NAME 8 | kv_cache 9 | HDRS 10 | embedding_allocator.h 11 | $<$:hccl_kv_cache_transfer.h> 12 | kv_cache.h 13 | kv_cache_event.h 14 | kv_cache_transfer.h 15 | $<$:llm_data_dist_transfer.h> 16 | $<$:spec_kv_cache_transfer.h> 17 | kv_cache_store.h 18 | hierarchy_kv_cache_transfer.h 19 | SRCS 20 | embedding_allocator.cpp 21 | $<$:hccl_kv_cache_transfer.cpp> 22 | kv_cache.cpp 23 | kv_cache_transfer.cpp 24 | $<$:llm_data_dist_transfer.cpp> 25 | $<$:spec_kv_cache_transfer.cpp> 26 | kv_cache_store.cpp 27 | hierarchy_kv_cache_transfer.cpp 28 | DEPS 29 | :common 30 | $<$:graph> 31 | glog::glog 32 | $<$:hccl_transfer> 33 | $<$:llm_datadist> 34 | torch 35 | $<$:torch_npu> 36 | mooncake_store 37 | :xtensor 38 | $<$:platform_npu> 39 | ) 40 | -------------------------------------------------------------------------------- /xllm/core/framework/parallel_state/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | cc_library( 5 | NAME 6 | parallel_state 7 | HDRS 8 | mapping_npu.h 9 | parallel_args.h 10 | parallel_state.h 11 | process_group.h 12 | $<$:npu_process_group.h> 13 | $<$:mlu_process_group.h> 14 | $<$:cuda_process_group.h> 15 | $<$:ilu_process_group.h> 16 | collective_communicator.h 17 | SRCS 18 | mapping_npu.cpp 19 | parallel_state.cpp 20 | process_group.cpp 21 | $<$:npu_process_group.cpp> 22 | collective_communicator.cpp 23 | DEPS 24 | :common 25 | torch 26 | $<$:torch_mlu> 27 | $<$:hccl> 28 | glog::glog 29 | ) 30 | 31 | if(USE_NPU) 32 | cc_test( 33 | NAME 34 | mapping_npu_test 35 | SRCS 36 | mapping_npu_test.cpp 37 | DEPS 38 | parallel_state 39 | absl::synchronization 40 | absl::time 41 | GTest::gtest_main 42 | xllm_kernels 43 | ascendcl 44 | atb 45 | c_sec 46 | spdlog::spdlog 47 | ) 48 | endif() 49 | -------------------------------------------------------------------------------- /xllm/core/kernels/npu/xllm_ops/replace_token.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | #include 18 | #include 19 | 20 | #include 21 | 22 | #include "acl/acl.h" 23 | #include "aclnn_replace_token.h" 24 | #include "acltensor_utils.h" 25 | #include "util/tensor_helper.h" 26 | 27 | namespace xllm_ops { 28 | void replace_token(torch::Tensor& forked, torch::Tensor& lastStepOutPut); 29 | } // namespace xllm_ops 30 | -------------------------------------------------------------------------------- /xllm/core/layers/common/layer_utils.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | #include "framework/model/model_input_params.h" 18 | #include "framework/parallel_state/parallel_args.h" 19 | 20 | namespace xllm { 21 | namespace layer { 22 | 23 | void update_dummy_run_input(int64_t dp_rank, 24 | torch::Tensor& positions, 25 | ModelInputParams& input_params); 26 | 27 | } // namespace layer 28 | } // namespace xllm 29 | -------------------------------------------------------------------------------- /xllm/core/layers/npu/loader/lm_head_loader.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "base_loader.h" 17 | 18 | namespace xllm { 19 | namespace layer { 20 | class LmHeadLoader : public BaseLoader { 21 | public: 22 | LmHeadLoader(uint64_t weight_count, const ModelContext& context); 23 | 24 | void load_state_dict(const StateDict& state_dict) override; 25 | void verify_loaded_weights(const std::string& weight_str) const override; 26 | }; 27 | } // namespace layer 28 | } // namespace xllm 29 | -------------------------------------------------------------------------------- /docs/zh/features/xtensor_memory.md: -------------------------------------------------------------------------------- 1 | # xTensor Memory Management 2 | 3 | ## Background 4 | 5 | Current LLM inference engines use a block-based approach that allocates one large chunk of contiguous device memory up front to store the KV cache. This scatters the KV cache across discrete blocks and prevents dynamic growing/shrinking. 6 | 7 | Both GPU and NPU provide virtual memory management (VMM) APIs. The VMM API decouples the allocation of virtual addresses from physical addresses and maps physical memory onto virtual memory on demand, enabling elastic physical-memory allocation while keeping virtual memory contiguous. 8 | 9 | On top of the VMM API, we implemented contiguous KV cache storage with on-demand physical memory allocation, as well as an attention operator for the decode stage that works on the contiguous KV cache. 10 | 11 | ## Main Interfaces 12 | * `PhyPage`: wraps a physical page. 13 | * `XTensor`: wraps virtual memory. 14 | * `PageAllocator`: manages allocation and reclamation of `PhyPage`s on one device. 15 | * `PageManager`: manages mapping and unmapping between virtual and physical memory on one device. 16 | * `PageManagerPool`: manages the `PageManager`s of all devices. 17 | 18 | ## Usage 19 | Simply add the following gflag parameter when launching xLLM: 20 | 21 | ```bash 22 | --enable_continuous_kvcache=true 23 | ``` 24 | 25 | !!! warning "Note" 26 | This approach does not yet support prefix caching, chunked prefill, disaggregated PD, or speculative decoding; disable them when using it: 27 | ```bash 28 | --enable_prefix_cache=false 29 | --enable_chunked_prefill=false 30 | --enable_disagg_pd=false 31 | --num_speculative_tokens=0 32 | ``` 33 | 34 | !!! tip "Future Plans" 35 | * Use the VMM API to manage the KV cache and activations together and dynamically balance the physical memory used by each. 36 | * Use the VMM API to dynamically resize the KV caches of multiple LLMs sharing the same GPUs for efficient load balancing. 37 | 38 | -------------------------------------------------------------------------------- /docs/zh/features/eplb.md: -------------------------------------------------------------------------------- 1 | # MoE Load Balancing 2 | 3 | ## Background 4 | 5 | MoE models rely on dynamic routing to assign tokens to experts, but in real deployments uneven data distribution leads to imbalanced expert load (some experts overloaded, others idle). Adjusting expert redundancy (e.g. adding/removing replicas) costs extra device memory and may hurt inference latency due to weight migration, so doing it efficiently and smoothly is a major challenge. To address this, we combine an expert-redundancy strategy (replicating hot experts) with hierarchical and global dynamic load balancing to achieve dynamic MoE load balancing. 6 | 7 | ## Feature Introduction 8 | The xLLM EPLB feature is implemented by three modules: 9 | - eplb manager: collects expert load statistics and manages expert distribution updates; uses a layer-by-layer update mechanism that decides whether to update a layer based on how its expert load changes. 10 | - eplb executor: executes the actual expert distribution updates. 11 | - eplb policy: the strategy that generates the new expert placement table. 12 | The overall architecture is shown below: 13 | ![xLLM eplb](../../assets/eplb_architecture.png) 14 | 15 | ## Usage 16 | Simply add the gflag parameters below when launching xLLM, 17 | replacing the values with your actual device count; ep_size must match the number of devices. 18 | 19 | - xLLM provides the gflags parameter `enable_eplb`, which defaults to false; to enable dynamic expert load balancing, set it to true in the xLLM service launch script. 20 | - `expert_parallel_degree` and `ep_size` are MoE-related parameters: `expert_parallel_degree` must be set to `2`, and `ep_size` must match the actual number of NPU/GPU devices. See [moe_params](./moe_params.md). 21 | - `eplb_update_interval` is the interval between expert distribution updates, in seconds; the default is 1000.
22 | - Expert distribution updates use a load-driven, layer-by-layer mechanism: a layer is updated when the similarity between its two most recent expert-load snapshots falls below `eplb_update_threshold`, which defaults to 1 and takes values in (0,1). 23 | 24 | ```bash 25 | --enable_eplb=true 26 | --expert_parallel_degree=2 27 | --ep_size=16 28 | --eplb_update_interval=2000 29 | --eplb_update_threshold=0.9 30 | ``` 31 | 32 | ## Future Work 33 | * Adopt a finer-grained expert update mechanism. 34 | 35 | * Integrate with the scheduling layer to achieve better load balancing by regrouping request batches. 
 -------------------------------------------------------------------------------- /examples/generate.py: -------------------------------------------------------------------------------- 1 | # python examples/generate.py --model='/path/models/Qwen2-7B-Instruct' --devices='npu:0' 2 | # python generate.py --model='/path/models/Qwen2-7B-Instruct' --devices='npu:0,npu:1' 3 | 4 | from xllm import ArgumentParser, LLM, RequestParams 5 | 6 | # Create an LLM. 7 | parser = ArgumentParser() 8 | llm = LLM(**vars(parser.parse_args())) 9 | 10 | # Create request params, including sampling params 11 | request_params = RequestParams() 12 | request_params.temperature = 0.8 13 | request_params.top_p = 0.95 14 | request_params.max_tokens = 10 15 | 16 | # Generate texts from the prompts. The output is a list of RequestOutput 17 | # objects that contain the prompt, generated text, and other information. 18 | prompts = [ 19 | "Hello, my name is", 20 | "The president of the United States is", 21 | "The capital of France is", 22 | "The future of AI is", 23 | ] 24 | 25 | outputs = llm.generate(prompts, request_params, True) 26 | 27 | # Print the outputs. 28 | for i, output in enumerate(outputs): 29 | prompt = output.prompt 30 | generated_text = output.outputs[0].text 31 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 32 | 33 | llm.finish() 34 | 35 | -------------------------------------------------------------------------------- /xllm/processors/pywarpper_image_processor.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License.
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | 20 | #include "image_processor.h" 21 | 22 | namespace xllm { 23 | 24 | struct MMData; 25 | 26 | class PyWarpperImageProcessor : public ImageProcessor { 27 | public: 28 | PyWarpperImageProcessor(const ModelArgs&); 29 | ~PyWarpperImageProcessor() override = default; 30 | 31 | bool process(const MMInput& mm_inputs, MMData& mm_datas) override; 32 | }; 33 | 34 | } // namespace xllm 35 | -------------------------------------------------------------------------------- /docs/zh/features/async_schedule.md: -------------------------------------------------------------------------------- 1 | # Asynchronous Scheduling 2 | 3 | ## Background 4 | LLM inference can be divided into 3 stages: the CPU runs scheduling and prepares model inputs, the device computes, and the CPU processes the outputs. 5 | Because decoding is sequential, the input of step i+1 depends on the output of step i, 6 | so the 3 stages must execute serially. While the CPU runs stages 1 and 3, the device sits idle, creating bubbles and underutilizing resources. 7 | 8 | 9 | 10 | ## Feature Introduction 11 | 12 | xLLM supports asynchronous scheduling at the framework layer: while the device computes step i, the CPU runs the scheduling work for step i+1 ahead of time, so the device can start step i+1 immediately after finishing step i, eliminating the bubble. 13 | Concretely, after launching the step-i computation the CPU does not wait for the device to finish; it constructs fake tokens for the step-i requests and uses them to run the step-i+1 scheduling, allocate KV cache, and so on. When the device starts the step-i+1 computation, the fake tokens are replaced with the true tokens produced by step i, ensuring correctness. A separate CPU thread processes the step-i results and returns them to the client. 14 | 15 | The overall architecture is shown in the figure. In the implementation, the CPU-side stage-1 and stage-3 work runs on separate thread pools, and RPC and other function calls use non-blocking C++ futures and promises, yielding a fully asynchronous runtime. ![Asynchronous scheduling](../../assets/async_schedule_architecture.jpg) 16 | 17 | 18 | ## Usage 19 | 20 | xLLM provides the gflags parameter `enable_schedule_overlap`, which defaults to true; to disable it, set it to false in the xLLM service launch script, for example: 21 | ```shell 22 | --enable_schedule_overlap=false 23 | ``` 24 | 25 | 26 | ## Performance Impact 27 | - With asynchronous scheduling enabled, device idle time between two steps is around 200us, roughly the time of one kernel launch. 28 | - On the DeepSeek-R1-Distill-Qwen-1.5B model with a 50ms TPOT constraint, throughput **improves by 17%**. 29 | 30 | 31 | !!! warning "Note" 32 | - Asynchronous scheduling computes one extra step on the server side. In scenarios with few output tokens, or one-shot outputs such as embedding models, this hurts server throughput, so asynchronous scheduling is forcibly disabled there. 33 | - VLM models are still being adapted; asynchronous scheduling is forcibly disabled for them for now. -------------------------------------------------------------------------------- /docs/zh/features/global_kvcache.md: -------------------------------------------------------------------------------- 1 | # Global Multi-Level KV Cache 2 | ## Background 3 | In the decode stage, autoregressive generation makes large language models (LLMs) access the historical KV cache frequently, so memory bandwidth becomes the bottleneck. As model sizes and context windows grow (e.g. 128K tokens consume over 40GB of device memory), single-device memory pressure rises sharply. Existing solutions (such as vLLM) have clear limitations with long contexts: prefill time soars and memory-bandwidth contention in the decode stage is severe; to meet SLOs (TTFT<2s, TBT<100ms) they often over-provision resources, leaving GPU utilization below 40% and making cross-server resources hard to use. We therefore propose a distributed global multi-level KV cache management system with a unified storage-compute architecture to break through single-machine resource limits. 4 | 5 | ## Feature Introduction 6 | The xLLM global KV cache feature is implemented by three modules: 7 | - etcd: cluster service registration, load information synchronization, and global cache state management 8 | - xLLM Service: schedules requests and manages all compute instances 9 | - xLLM: the compute instance serving requests 10 | 11 | The overall architecture is shown below: 12 | ![xLLM global multi-level KV cache](../../assets/globalkvcache_architecture.png) 13 | ## Usage Example 14 | ### Preparation 15 | #### Install dependencies 16 | - **xLLM**: see [Compilation](../getting_started/compile.md) 17 | - **xLLM Service**: see [PD-disaggregated deployment](../getting_started/PD_disagg.md) 18 | 19 | ### Usage 20 | 1. Launch etcd: 21 | ```bash 22 | ./etcd --listen-peer-urls=http://0.0.0.0:10999 --listen-client-urls=http://0.0.0.0:10998 23 | ``` 24 | 2. Launch xLLM Service: 25 | ```bash 26 | ./xllm_master_serving --etcd_addr="127.0.0.1:10998" --http_server_port 28888 --rpc_server_port 28889 --tokenizer_path=/path/to/tokenizer_config_dir/ 27 | ``` 28 | 3. 
Add the following gflag parameters when launching xLLM: 29 | ```bash 30 | --enable_service_routing=true 31 | --enable_cache_upload=true 32 | # PD disaggregation does not yet support global KV cache management 33 | --enable_disagg_pd=false 34 | ``` -------------------------------------------------------------------------------- /xllm/core/framework/xtensor/phy_page.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | #include <torch/torch.h> 18 | 19 | #include "platform/vmm_api.h" 20 | 21 | namespace xllm { 22 | class PhyPage { 23 | public: 24 | PhyPage(torch::Device device); 25 | 26 | ~PhyPage(); 27 | 28 | const torch::Device& device() const { return device_; } 29 | 30 | PhyMemHandle get_phy_handle() const { return phy_handle_; } 31 | 32 | private: 33 | torch::Device device_; 34 | PhyMemHandle phy_handle_; 35 | }; 36 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/layers/npu/loader/column_parallel_linear_loader.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include "base_loader.h" 17 | 18 | namespace xllm { 19 | namespace layer { 20 | class ColumParallelLinearLoader : public BaseLoader { 21 | public: 22 | ColumParallelLinearLoader(uint64_t weight_count, const ModelContext& context); 23 | 24 | void load_state_dict(const StateDict& state_dict) override; 25 | void verify_loaded_weights(const std::string& weight_str) const override; 26 | }; 27 | } // namespace layer 28 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/layers/lm_head.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "config.h" 19 | 20 | namespace xllm { 21 | namespace layer { 22 | 23 | class LmHead : public torch::nn::ModuleHolder<LmHeadImpl> { 24 | public: 25 | using torch::nn::ModuleHolder<LmHeadImpl>::ModuleHolder; 26 | using Impl __attribute__((__unused__)) = LmHeadImpl; 27 | 28 | LmHead(const ModelContext& context) 29 | : ModuleHolder(std::make_shared<LmHeadImpl>(context)) {} 30 | }; 31 | 32 | } // namespace layer 33 | } // namespace xllm 34 | -------------------------------------------------------------------------------- /xllm/core/util/uuid.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | #include <absl/random/random.h> 19 | 20 | #include <string> 21 | 22 | namespace xllm { 23 | 24 | class ShortUUID { 25 | public: 26 | ShortUUID() = default; 27 | 28 | std::string random(size_t len = 0); 29 | 30 | private: 31 | std::string alphabet_ = 32 | "23456789ABCDEFGHJKLMNPQRSTUVWXYZ" 33 | "abcdefghijkmnopqrstuvwxyz"; 34 | absl::BitGen gen_; 35 | }; 36 | 37 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/api_service/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | cc_library( 4 | NAME 5 | api_service 6 | HDRS 7 | api_service.h 8 | api_service_impl.h 9 | call.h 10 | completion_service_impl.h 11 | rec_completion_service_impl.h 12 | chat_service_impl.h 13 | embedding_service_impl.h 14 | image_generation_service_impl.h 15 | rerank_service_impl.h 16 | qwen3_rerank_service_impl.h 17 | non_stream_call.h 18 | service_impl_factory.h 19 | stream_call.h 20 | models_service_impl.h 21 | stream_output_parser.h 22 | mm_service_utils.h 23 | SRCS 24 | api_service.cpp 25 | call.cpp 26 | completion_service_impl.cpp 27 | rec_completion_service_impl.cpp 28 | chat_service_impl.cpp 29 | embedding_service_impl.cpp 30 | image_generation_service_impl.cpp 31 | models_service_impl.cpp 32 | rerank_service_impl.cpp 33 | stream_output_parser.cpp 34 | qwen3_rerank_service_impl.cpp 35 | DEPS 36 | :master 37 | :chat_template 38 | :util 39 | glog::glog 40 | proto::xllm_proto 41 | absl::flat_hash_set 42 | absl::random_random 43 | :function_call 44 | :reasoning 45 | torch 46 | $<$:torch_npu> 47 | ) 48 | 49 | -------------------------------------------------------------------------------- /xllm/core/kernels/npu/xllm_ops/top_k_top_p.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | #include 18 | #include 19 | 20 | #include 21 | 22 | #include "acl/acl.h" 23 | #include "aclnnop/aclnn_apply_top_k_top_p.h" 24 | #include "acltensor_utils.h" 25 | #include "util/tensor_helper.h" 26 | 27 | namespace xllm_ops { 28 | void top_k_top_p(torch::Tensor& logits, 29 | const torch::Tensor& topK, 30 | const torch::Tensor& topP); 31 | } // namespace xllm_ops -------------------------------------------------------------------------------- /xllm/core/scheduler/scheduler_factory.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "runtime/xservice_client.h" 19 | #include "scheduler/continuous_scheduler.h" 20 | #include "scheduler/dit_scheduler.h" 21 | 22 | namespace xllm { 23 | 24 | std::unique_ptr<ContinuousScheduler> create_continuous_scheduler( 25 | Engine* engine, 26 | ContinuousScheduler::Options options); 27 | 28 | std::unique_ptr<DiTScheduler> create_dit_scheduler( 29 | DiTEngine* engine, 30 | DiTScheduler::Options options); 31 | 32 | } // namespace xllm 33 | -------------------------------------------------------------------------------- /xllm/core/framework/kv_cache/kv_cache_event.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | #include <unordered_set> 18 | 19 | #include "util/hash_util.h" 20 | 21 | namespace xllm { 22 | 23 | struct KvCacheEvent { 24 | std::unordered_set 25 | stored_cache; 26 | std::unordered_set 27 | removed_cache; 28 | 29 | void clear() { 30 | stored_cache.clear(); 31 | removed_cache.clear(); 32 | } 33 | }; 34 | 35 | } // namespace xllm 36 | -------------------------------------------------------------------------------- /xllm/core/kernels/npu/matmul.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License.
14 | ==============================================================================*/ 15 | 16 | #include "npu_ops_api.h" 17 | #include "ops_npu/npu_ops.h" 18 | 19 | namespace xllm::kernel::npu { 20 | 21 | torch::Tensor matmul(const torch::Tensor& a, 22 | const torch::Tensor& b, 23 | const std::optional<torch::Tensor>& bias) { 24 | if (!bias.has_value()) { 25 | return torch::nn::functional::linear(a, b); 26 | } else { 27 | return torch::nn::functional::linear(a, b, bias.value()); 28 | } 29 | } 30 | 31 | } // namespace xllm::kernel::npu 32 | -------------------------------------------------------------------------------- /xllm/core/layers/common/activation.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include <torch/torch.h> 19 | 20 | #include <string> 21 | 22 | namespace xllm { 23 | namespace layer { 24 | 25 | class ActivationImpl : public torch::nn::Module { 26 | public: 27 | ActivationImpl(const std::string& act_mode, bool is_gated); 28 | 29 | void forward(torch::Tensor& input, torch::Tensor& output); 30 | 31 | private: 32 | std::string act_mode_; 33 | bool is_gated_; 34 | }; 35 | TORCH_MODULE(Activation); 36 | 37 | } // namespace layer 38 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/layers/glm4_decoder_layer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Licensed under the Apache License, Version 2.0 (the "License"); 3 | you may not use this file except in compliance with the License. 4 | You may obtain a copy of the License at 5 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 6 | Unless required by applicable law or agreed to in writing, software 7 | distributed under the License is distributed on an "AS IS" BASIS, 8 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | See the License for the specific language governing permissions and 10 | limitations under the License.
11 | ==============================================================================*/
12 | #pragma once
13 |
14 | #include "config.h"
15 |
16 | namespace xllm {
17 | namespace layer {
18 |
19 | class Glm4DecoderLayer : public torch::nn::ModuleHolder<Glm4DecoderLayerImpl> {
20 | public:
21 | using torch::nn::ModuleHolder<Glm4DecoderLayerImpl>::ModuleHolder;
22 | using Impl __attribute__((__unused__)) = Glm4DecoderLayerImpl;
23 | Glm4DecoderLayer(const ModelContext& context)
24 | : ModuleHolder(std::make_shared<Glm4DecoderLayerImpl>(context)) {}
25 | };
26 |
27 | } // namespace layer
28 | } // namespace xllm
-------------------------------------------------------------------------------- /xllm/core/kernels/ilu/activation.cpp: --------------------------------------------------------------------------------
1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved.
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #include "ilu_ops_api.h"
17 |
18 | using namespace ixformer;
19 |
20 | namespace xllm::kernel::ilu {
21 |
22 | void act_and_mul(torch::Tensor out,
23 | torch::Tensor input,
24 | const std::string& act_mode) {
25 | if (act_mode == "silu") {
26 | infer::silu_and_mul(input, out);
27 | } else {
28 | // Only the silu path is implemented here; keep the message in sync.
29 | LOG(FATAL) << "Unsupported act mode: " << act_mode
30 | << ", only silu is supported";
31 | }
32 | }
33 | } // namespace xllm::kernel::ilu
34 |
-------------------------------------------------------------------------------- /xllm/core/kernels/npu/active.cpp: --------------------------------------------------------------------------------
1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved.
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #include
17 | #include
18 |
19 | #include "npu_ops_api.h"
20 | #include "ops_npu/npu_ops.h"
21 |
22 | namespace xllm::kernel::npu {
23 |
24 | torch::Tensor active(const torch::Tensor& input, const std::string& act_mode) {
25 | if (act_mode != "silu" && act_mode != "swiglu") {
26 | LOG(FATAL) << "Only silu and swiglu activations are supported in NPU active";
27 | }
28 | return at_npu::native::custom_ops::npu_swiglu(input);
29 | }
30 | } // namespace xllm::kernel::npu
-------------------------------------------------------------------------------- /xllm/core/layers/word_embedding.h: --------------------------------------------------------------------------------
1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved.
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #pragma once
17 |
18 | #include "config.h"
19 |
20 | namespace xllm {
21 | namespace layer {
22 |
23 | class WordEmbedding : public torch::nn::ModuleHolder<WordEmbeddingImpl> {
24 | public:
25 | using torch::nn::ModuleHolder<WordEmbeddingImpl>::ModuleHolder;
26 | using Impl __attribute__((__unused__)) = WordEmbeddingImpl;
27 | WordEmbedding(const ModelContext& context)
28 | : ModuleHolder(std::make_shared<WordEmbeddingImpl>(context)) {}
29 | };
30 |
31 | } // namespace layer
32 | } // namespace xllm
33 |
-------------------------------------------------------------------------------- /xllm/cc_api/examples/service_request.h: --------------------------------------------------------------------------------
1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved.
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #include "llm.h"
17 |
18 | namespace xllm {
19 | namespace cc_api_test {
20 | // Send Completion request and print the inference result.
21 | void run_completion_request(const std::string& model_name,
22 | xllm::LLM* llm_instance);
23 |
24 | // Send ChatCompletion request and print the inference result.
25 | void run_chat_completion_request(const std::string& model_name, 26 | xllm::LLM* llm_instance); 27 | } // namespace cc_api_test 28 | } // namespace xllm 29 | -------------------------------------------------------------------------------- /xllm/core/kernels/mlu/gather_split.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include "mlu_ops_api.h" 17 | 18 | namespace xllm::kernel::mlu { 19 | 20 | void gather_split(const torch::Tensor& input, 21 | const torch::Tensor& gather_index, 22 | const torch::Tensor& valid_token_num, 23 | const torch::Tensor& output_head, 24 | const torch::Tensor& output_tail) { 25 | tmo::torch_api::gather_split( 26 | output_head, output_tail, input, gather_index, valid_token_num); 27 | } 28 | 29 | } // namespace xllm::kernel::mlu 30 | -------------------------------------------------------------------------------- /xllm/core/util/timer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | 19 | #include 20 | 21 | namespace xllm { 22 | 23 | class Timer final { 24 | public: 25 | Timer(); 26 | 27 | // reset the timer 28 | void reset(); 29 | 30 | // get the elapsed time. 
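// Measured from construction or the most recent reset(). These accessors
// are const and do not restart the timer, so successive readings can be
// taken from the same Timer.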
31 | double elapsed_seconds() const; 32 | double elapsed_milliseconds() const; 33 | double elapsed_microseconds() const; 34 | 35 | private: 36 | // the start time of the timer 37 | absl::Time start_; 38 | }; 39 | 40 | } // namespace xllm -------------------------------------------------------------------------------- /cibuild/install/install_base.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | install_ubuntu() { 6 | deploy_deps="libffi-dev libbz2-dev libreadline-dev libncurses5-dev libncursesw5-dev libgdbm-dev libsqlite3-dev uuid-dev tk-dev" 7 | # Install common dependencies 8 | apt-get update 9 | apt-get install -y --no-install-recommends \ 10 | ${deploy_deps} \ 11 | build-essential \ 12 | zip \ 13 | pkg-config \ 14 | libssl-dev \ 15 | software-properties-common \ 16 | curl \ 17 | git \ 18 | wget \ 19 | sudo \ 20 | vim \ 21 | jq \ 22 | libtool \ 23 | unzip \ 24 | gdb 25 | 26 | # Cleanup package manager 27 | apt-get autoclean && apt-get clean 28 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 29 | } 30 | 31 | install_almalinux() { 32 | yum -y update 33 | yum -y install \ 34 | zip \ 35 | wget \ 36 | curl \ 37 | perl \ 38 | sudo \ 39 | vim \ 40 | jq \ 41 | libtool \ 42 | unzip 43 | 44 | # Cleanup 45 | yum clean all 46 | } 47 | 48 | ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') 49 | case "$ID" in 50 | ubuntu) 51 | install_ubuntu 52 | ;; 53 | almalinux) 54 | install_almalinux 55 | ;; 56 | *) 57 | echo "Unable to determine OS..." 58 | exit 1 59 | ;; 60 | esac -------------------------------------------------------------------------------- /xllm/core/layers/pos_embedding.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "config.h" 19 | 20 | namespace xllm { 21 | namespace layer { 22 | 23 | class PosEmbedding : public torch::nn::ModuleHolder { 24 | public: 25 | using torch::nn::ModuleHolder::ModuleHolder; 26 | using Impl __attribute__((__unused__)) = RotaryEmbeddingImpl; 27 | 28 | PosEmbedding(const ModelContext& context) 29 | : ModuleHolder(std::make_shared(context)) {} 30 | }; 31 | 32 | } // namespace layer 33 | } // namespace xllm 34 | -------------------------------------------------------------------------------- /xllm/core/framework/tokenizer/tokenizer_factory.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at
6 |
7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #pragma once
17 |
18 | #include "fast_tokenizer.h"
19 | #include "rec_tokenizer.h"
20 | #include "sentencepiece_tokenizer.h"
21 | #include "tiktoken_tokenizer.h"
22 | #include "tokenizer_args.h"
23 | #include "tokenizer_proxy.h"
24 |
25 | namespace xllm {
26 |
27 | class TokenizerFactory {
28 | public:
29 | static std::unique_ptr<Tokenizer> create_tokenizer(
30 | const std::string& model_weights_path,
31 | TokenizerArgs tokenizer_args,
32 | bool proxy = true);
33 | };
34 |
35 | } // namespace xllm
36 |
-------------------------------------------------------------------------------- /xllm/core/layers/npu/loader/word_embedding_loader.h: --------------------------------------------------------------------------------
1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved.
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #pragma once
17 |
18 | #include
19 | #include
20 |
21 | #include "core/layers/npu/npu_base_layer.h"
22 |
23 | namespace xllm {
24 | namespace layer {
25 |
26 | class WordEmbeddingLoader : public BaseLoader {
27 | public:
28 | WordEmbeddingLoader(uint64_t weight_count, const ModelContext& context);
29 |
30 | void load_state_dict(const StateDict& state_dict) override;
31 | void verify_loaded_weights(const std::string& prefix) const override;
32 | };
33 |
34 | } // namespace layer
35 | } // namespace xllm
-------------------------------------------------------------------------------- /xllm/core/runtime/dit_executor.cpp: --------------------------------------------------------------------------------
1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved.
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #include "dit_executor.h"
17 |
18 | #include
19 |
20 | #include "common/metrics.h"
21 |
22 | namespace xllm {
23 |
24 | DiTExecutor::DiTExecutor(DiTModel* model, const runtime::Options& options)
25 | : model_(model), options_(options) {}
26 |
27 | DiTForwardInput DiTExecutor::prepare_inputs(DiTBatch& batch) {
28 | return batch.prepare_forward_input();
29 | }
30 |
31 | DiTForwardOutput DiTExecutor::forward(const DiTForwardInput& input) {
32 | return model_->forward(input);
33 | }
34 |
35 | } // namespace xllm
36 |
-------------------------------------------------------------------------------- /xllm/core/util/uuid.cpp: --------------------------------------------------------------------------------
1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved.
2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved.
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | https://github.com/jd-opensource/xllm/blob/main/LICENSE
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | ==============================================================================*/
16 |
17 | #include "uuid.h"
18 |
19 | #include
20 |
21 | namespace xllm {
22 |
23 | std::string ShortUUID::random(size_t len) {
24 | if (len == 0) {
25 | len = 22;
26 | }
27 |
28 | std::string uuid(len, ' ');
29 | for (size_t i = 0; i < len; i++) {
30 | const size_t rand = absl::Uniform<size_t>(
31 | absl::IntervalClosedOpen, gen_, 0, alphabet_.size());
32 | uuid[i] = alphabet_[rand];
33 | }
34 | return uuid;
35 | }
36 |
37 | } // namespace xllm
-------------------------------------------------------------------------------- /docs/zh/features/groupgemm.md: --------------------------------------------------------------------------------
1 | # GroupGEMM Operator Optimization
2 |
3 | ## Background
4 | The Mixture of Experts (MoE) architecture has become an important paradigm for scaling large language models; its core idea is to dynamically route input tokens to different expert sub-networks for processing. During inference, the GroupGEMM operator is the key compute unit of the MoE architecture: it efficiently executes the parallel matrix multiplications of multiple experts and dominates overall inference time.
5 |
6 | ## Feature Overview
7 | Since the current GroupGEMM performance bottleneck is I/O bound, an optimization is proposed that replaces data copies with index reordering: the repeated copying of token vectors is removed in favor of maintaining an index table of expert assignments. Through these row indices, tokens are mapped directly onto the corresponding expert compute units, and token dispatch is fused with the matrix multiplication into a single kernel.
8 |
9 |
10 | ## User Interface
11 |
12 | ### Direct Operator Call API
13 | ```c++
14 | aclnnStatus aclnnIndexGroupMatmulGetWorkspaceSize(
15 | const aclTensorList *x,
16 | const aclTensorList *weight,
17 | const aclTensorList *scale,
18 | const aclTensorList *perTokenScale,
19 | const aclTensor *groupList,
20 | const aclTensorList *out,
21 | uint64_t *workspaceSize,
22 | aclOpExecutor **executor);
23 |
24 | aclnnStatus aclnnIndexGroupMatmul(
25 | void *workspace,
26 | uint64_t workspaceSize,
27 | aclOpExecutor *executor,
28 | aclrtStream stream);
29 | ```
30 |
31 | - `x`: list of input tensors containing the data to process.
32 | - `weight`: weight tensors containing the model parameters.
33 | - `scale`: scaling factors used to adjust the values of the input tensors.
34 | - `perTokenScale`: per-token scaling factors for dynamic adjustment.
35 | - `groupList`: list of expert groups indicating which experts participate in the computation.
36 | - `out`: list of output tensors that stores the results.
37 |
38 | ## Performance
39 | ![groupmatmul](../../assets/groupmatmul_performance.png)
40 |
41 | * The optimized GroupMatmul operator shows a clear advantage in compute time; in particular with k = 128 and m = 64, as shown in the figure, the optimized operator **reduces latency by 50%**.
42 |
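For reference, below is a minimal host-side sketch of the standard two-phase aclnn calling convention this API follows (first query the workspace size, then launch the kernel). The wrapper function, its boolean return convention, and the generated op header name are illustrative assumptions; construction of the `aclTensorList`/`aclTensor` arguments is elided.

```c++
#include <acl/acl.h>

#include "aclnn_index_group_matmul.h"  // assumed name of the generated op header

// Hedged sketch: run the fused token-dispatch + grouped matmul on a stream.
bool run_index_group_matmul(const aclTensorList* x,
                            const aclTensorList* weight,
                            const aclTensorList* scale,
                            const aclTensorList* per_token_scale,
                            const aclTensor* group_list,
                            const aclTensorList* out,
                            aclrtStream stream) {
  uint64_t workspace_size = 0;
  aclOpExecutor* executor = nullptr;
  // Phase 1: query the required scratch-space size and build the executor.
  if (aclnnIndexGroupMatmulGetWorkspaceSize(x, weight, scale, per_token_scale,
                                            group_list, out, &workspace_size,
                                            &executor) != ACL_SUCCESS) {
    return false;
  }
  // Allocate device memory for the workspace if the op needs any.
  void* workspace = nullptr;
  if (workspace_size > 0 &&
      aclrtMalloc(&workspace, workspace_size, ACL_MEM_MALLOC_HUGE_FIRST) !=
          ACL_SUCCESS) {
    return false;
  }
  // Phase 2: launch the fused kernel built in phase 1.
  const bool ok =
      aclnnIndexGroupMatmul(workspace, workspace_size, executor, stream) ==
      ACL_SUCCESS;
  // Wait for completion before releasing the workspace buffer.
  aclrtSynchronizeStream(stream);
  if (workspace != nullptr) {
    aclrtFree(workspace);
  }
  return ok;
}
```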
-------------------------------------------------------------------------------- /xllm/core/framework/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | include_directories(.) 5 | if(USE_NPU) 6 | include_directories( 7 | ${CMAKE_SOURCE_DIR}/third_party/spdlog/include 8 | ) 9 | endif() 10 | add_subdirectory(batch) 11 | add_subdirectory(block) 12 | add_subdirectory(chat_template) 13 | add_subdirectory(kv_cache) 14 | add_subdirectory(model) 15 | add_subdirectory(parallel_state) 16 | add_subdirectory(prefix_cache) 17 | add_subdirectory(request) 18 | add_subdirectory(sampling) 19 | add_subdirectory(state_dict) 20 | add_subdirectory(tokenizer) 21 | add_subdirectory(eplb) 22 | add_subdirectory(xtensor) 23 | add_subdirectory(dit_cache) 24 | 25 | 26 | cc_library( 27 | NAME 28 | model_loader 29 | HDRS 30 | hf_model_loader.h 31 | dit_model_context.h 32 | dit_model_loader.h 33 | model_loader.h 34 | SRCS 35 | hf_model_loader.cpp 36 | dit_model_context.cpp 37 | dit_model_loader.cpp 38 | model_loader.cpp 39 | DEPS 40 | :common 41 | :model 42 | :models 43 | $<$:npu_layers> 44 | :tokenizer 45 | torch 46 | ) 47 | 48 | cc_library( 49 | NAME 50 | model_context 51 | HDRS 52 | model_context.h 53 | SRCS 54 | model_context.cpp 55 | DEPS 56 | torch 57 | $<$:torch_npu> 58 | ) 59 | -------------------------------------------------------------------------------- /xllm/core/framework/block/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_binary) 2 | include(cc_library) 3 | include(cc_test) 4 | 5 | cc_library( 6 | NAME 7 | block 8 | HDRS 9 | block.h 10 | block_manager.h 11 | block_manager_pool.h 12 | block_manager_impl.h 13 | concurrent_block_manager_impl.h 14 | hierarchy_block_manager_pool.h 15 | SRCS 16 | block.cpp 17 | block_manager_pool.cpp 18 | concurrent_block_manager_impl.cpp 19 | block_manager_impl.cpp 20 | hierarchy_block_manager_pool.cpp 21 | DEPS 22 | $<$:torch_npu> 23 | $<$:graph> 24 | :request 25 | :common 26 | glog::glog 27 | Boost::serialization 28 | SMHasherSupport 29 | torch 30 | ) 31 | target_link_libraries(block PRIVATE Folly::folly) 32 | 33 | if(USE_NPU) 34 | set(TEST_SRCS 35 | block_manager_test.cpp 36 | ) 37 | 38 | cc_test( 39 | NAME 40 | block_test 41 | SRCS 42 | ${TEST_SRCS} 43 | DEPS 44 | :block 45 | :flags 46 | :kv_cache 47 | :prefix_cache 48 | absl::random_random 49 | Boost::serialization 50 | GTest::gtest_main 51 | ) 52 | 53 | target_link_libraries(block_test PRIVATE brpc OpenSSL::SSL OpenSSL::Crypto ascendcl Folly::folly) 54 | add_dependencies(block_test brpc-static) 55 | endif() -------------------------------------------------------------------------------- /xllm/core/layers/llama_decoder_layer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "config.h" 19 | 20 | namespace xllm { 21 | namespace layer { 22 | 23 | class LlamaDecoderLayer 24 | : public torch::nn::ModuleHolder { 25 | public: 26 | using torch::nn::ModuleHolder::ModuleHolder; 27 | using Impl __attribute__((__unused__)) = LlamaDecoderLayerImpl; 28 | 29 | LlamaDecoderLayer(const ModelContext& context) 30 | : ModuleHolder(std::make_shared(context)) {} 31 | }; 32 | 33 | } // namespace layer 34 | } // namespace xllm 35 | -------------------------------------------------------------------------------- /xllm/core/layers/qwen2_decoder_layer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "config.h" 19 | 20 | namespace xllm { 21 | namespace layer { 22 | 23 | class Qwen2DecoderLayer 24 | : public torch::nn::ModuleHolder { 25 | public: 26 | using torch::nn::ModuleHolder::ModuleHolder; 27 | using Impl __attribute__((__unused__)) = Qwen2DecoderLayerImpl; 28 | 29 | Qwen2DecoderLayer(const ModelContext& context) 30 | : ModuleHolder(std::make_shared(context)) {} 31 | }; 32 | 33 | } // namespace layer 34 | } // namespace xllm 35 | -------------------------------------------------------------------------------- /xllm/core/layers/qwen3_decoder_layer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "config.h" 19 | 20 | namespace xllm { 21 | namespace layer { 22 | 23 | class Qwen3DecoderLayer 24 | : public torch::nn::ModuleHolder { 25 | public: 26 | using torch::nn::ModuleHolder::ModuleHolder; 27 | using Impl __attribute__((__unused__)) = Qwen3DecoderLayerImpl; 28 | 29 | Qwen3DecoderLayer(const ModelContext& context) 30 | : ModuleHolder(std::make_shared(context)) {} 31 | }; 32 | 33 | } // namespace layer 34 | } // namespace xllm 35 | -------------------------------------------------------------------------------- /xllm/core/layers/npu/buffer/atb_buffer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | 20 | #include "atb/atb_infer.h" 21 | 22 | namespace xllm { 23 | 24 | class AtbBuffer { 25 | public: 26 | explicit AtbBuffer(uint64_t bufferSize, at::Device device); 27 | ~AtbBuffer(); 28 | void* get_buffer(uint64_t bufferSize); 29 | 30 | private: 31 | torch::Tensor create_attensor(uint64_t bufferSize) const; 32 | 33 | private: 34 | uint64_t buffer_size_ = 0; 35 | torch::Tensor at_tensor_; 36 | at::Device device_; 37 | 38 | at::TensorOptions options_; 39 | }; 40 | 41 | } // namespace xllm 42 | -------------------------------------------------------------------------------- /xllm/function_call/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | add_subdirectory(partial_json_parser) 5 | 6 | cc_library ( 7 | NAME 8 | function_call 9 | HDRS 10 | core_types.h 11 | base_format_detector.h 12 | qwen25_detector.h 13 | kimik2_detector.h 14 | deepseekv3_detector.h 15 | glm45_detector.h 16 | function_call_parser.h 17 | function_call.h 18 | utils.h 19 | SRCS 20 | base_format_detector.cpp 21 | qwen25_detector.cpp 22 | kimik2_detector.cpp 23 | deepseekv3_detector.cpp 24 | glm45_detector.cpp 25 | function_call_parser.cpp 26 | utils.cpp 27 | DEPS 28 | nlohmann_json::nlohmann_json 29 | glog::glog 30 | proto::xllm_proto 31 | partial_json_parser 32 | common 33 | ) 34 | 35 | function(add_detector_test TEST_NAME) 36 | cc_test( 37 | NAME 38 | ${TEST_NAME} 39 | SRCS 40 | ${TEST_NAME}.cpp 41 | DEPS 42 | :function_call 43 | GTest::gtest 44 | GTest::gtest_main 45 | nlohmann_json::nlohmann_json 46 | ) 47 | endfunction() 48 | 49 | add_detector_test(qwen25_detector_test) 50 | add_detector_test(kimik2_detector_test) 51 | add_detector_test(deepseekv3_detector_test) 52 | add_detector_test(glm45_detector_test) 53 | 54 | -------------------------------------------------------------------------------- /xllm/core/util/net.h: -------------------------------------------------------------------------------- 1 | 
/* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | namespace xllm { 22 | namespace net { 23 | 24 | std::string get_local_ip_addr(); 25 | int get_local_free_port(); 26 | uint64_t convert_ip_port_to_uint64(const std::string& ip, uint16_t port); 27 | void parse_host_port_from_addr(const std::string& addr, 28 | std::string& host, 29 | int& port); 30 | 31 | std::string extract_ip(const std::string& input); 32 | std::string extract_port(const std::string& input); 33 | } // namespace net 34 | } // namespace xllm 35 | -------------------------------------------------------------------------------- /xllm/core/layers/npu/loader/rms_norm_loader.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | #include "base_loader.h" 22 | 23 | namespace xllm { 24 | namespace layer { 25 | 26 | class RMSNORMLoader : public BaseLoader { 27 | public: 28 | RMSNORMLoader(uint64_t weight_count, const ModelContext& context); 29 | 30 | void load_state_dict(const StateDict& state_dict) override; 31 | 32 | void verify_loaded_weights(const std::string& weight_str) const override; 33 | 34 | protected: 35 | int rank_id_; 36 | torch::ScalarType dtype_; 37 | }; 38 | 39 | } // namespace layer 40 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/scheduler/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | 5 | add_subdirectory(profile) 6 | 7 | cc_library( 8 | NAME 9 | scheduler 10 | HDRS 11 | chunked_prefill_scheduler.h 12 | zero_eviction_scheduler.h 13 | continuous_scheduler.h 14 | disagg_pd_scheduler.h 15 | pd_ooc_scheduler.h 16 | async_response_processor.h 17 | scheduler.h 18 | dit_scheduler.h 19 | prefill_only_scheduler.h 20 | scheduler_factory.h 21 | decode_priority_queue.h 22 | perf_model.h 23 | SRCS 24 | chunked_prefill_scheduler.cpp 25 | zero_eviction_scheduler.cpp 26 | continuous_scheduler.cpp 27 | disagg_pd_scheduler.cpp 28 | pd_ooc_scheduler.cpp 29 | async_response_processor.cpp 30 | dit_scheduler.cpp 31 | prefill_only_scheduler.cpp 32 | scheduler_factory.cpp 33 | perf_model.cpp 34 | DEPS 35 | :batch 36 | :request 37 | :runtime 38 | :profile 39 | glog::glog 40 | Folly::folly 41 | absl::time 42 | absl::synchronization 43 | ) 44 | 45 | cc_test( 46 | NAME 47 | chunked_prefill_scheduler_test 48 | continuous_scheduler_test 49 | SRCS 50 | chunked_prefill_scheduler_test.cpp 51 | continuous_scheduler_test.cpp 52 | DEPS 53 | :scheduler 54 | GTest::gtest_main 55 | $<$:nnopbase> 56 | ) 57 | 58 | -------------------------------------------------------------------------------- /xllm/core/util/type_traits.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | ==============================================================================*/
16 |
17 | #pragma once
18 | #include <optional>
19 |
20 | namespace xllm {
21 |
22 | template <typename value_type>
23 | struct remove_optional {
24 | using type = value_type;
25 | };
26 |
27 | // specialization for optional
28 | template <typename value_type>
29 | struct remove_optional<std::optional<value_type>> {
30 | using type = value_type;
31 | };
32 |
33 | /// alias template for remove_optional
34 | template <typename value_type>
35 | using remove_optional_t = typename remove_optional<value_type>::type;
36 |
37 | } // namespace xllm
-------------------------------------------------------------------------------- /xllm/core/layers/glm4_vision_encode_layer.h: --------------------------------------------------------------------------------
1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved.
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #pragma once
17 |
18 | #include "config.h"
19 |
20 | namespace xllm {
21 | namespace layer {
22 |
23 | class Glm4VisionEncoderLayer
24 | : public torch::nn::ModuleHolder<Glm4VisionEncoderLayerImpl> {
25 | public:
26 | using torch::nn::ModuleHolder<Glm4VisionEncoderLayerImpl>::ModuleHolder;
27 | using Impl __attribute__((__unused__)) = Glm4VisionEncoderLayerImpl;
28 |
29 | Glm4VisionEncoderLayer(const ModelContext& context)
30 | : ModuleHolder(std::make_shared<Glm4VisionEncoderLayerImpl>(context)) {}
31 | };
32 |
33 | } // namespace layer
34 | } // namespace xllm
35 |
-------------------------------------------------------------------------------- /xllm/core/layers/npu/loader/llama_decoder_loader.h: --------------------------------------------------------------------------------
1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved.
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | #include "base_loader.h" 22 | 23 | namespace xllm { 24 | namespace layer { 25 | 26 | class LlamaDecoderLoader : public BaseLoader { 27 | public: 28 | LlamaDecoderLoader(uint64_t weight_count, const ModelContext& context); 29 | 30 | void load_state_dict(const StateDict& state_dict) override; 31 | void verify_loaded_weights() const override; 32 | void merge_loaded_weights() override; 33 | 34 | bool enableAddNorm_; 35 | int rank_id_; 36 | }; 37 | 38 | } // namespace layer 39 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/layers/qwen2_vision_encode_layer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "config.h" 19 | 20 | namespace xllm { 21 | namespace layer { 22 | 23 | class Qwen2VisionEncoderLayer 24 | : public torch::nn::ModuleHolder { 25 | public: 26 | using torch::nn::ModuleHolder::ModuleHolder; 27 | using Impl __attribute__((__unused__)) = Qwen2VisionEncoderLayerImpl; 28 | 29 | Qwen2VisionEncoderLayer(const ModelContext& context) 30 | : ModuleHolder(std::make_shared(context)) {} 31 | }; 32 | 33 | } // namespace layer 34 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/framework/request/finish_reason.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "finish_reason.h" 17 | 18 | #include 19 | 20 | namespace xllm { 21 | 22 | std::optional FinishReason::to_string() { 23 | switch (value) { 24 | case Value::NONE: 25 | return std::nullopt; 26 | case Value::STOP: 27 | return "stop"; 28 | case Value::LENGTH: 29 | return "length"; 30 | case Value::FUNCTION_CALL: 31 | return "function_call"; 32 | default: 33 | LOG(WARNING) << "Unknown finish reason: " << static_cast(value); 34 | } 35 | return std::nullopt; 36 | } 37 | 38 | } // namespace xllm 39 | -------------------------------------------------------------------------------- /xllm/core/layers/common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | cc_library( 4 | NAME 5 | common_layers 6 | HDRS 7 | qwen2_attention.h 8 | qwen2_vision_attention.h 9 | rms_norm.h 10 | rotary_embedding.h 11 | rotary_embedding_util.h 12 | fused_moe.h 13 | dense_mlp.h 14 | qwen2_decoder_layer.h 15 | qwen2_5_vision_layer.h 16 | qwen3_moe_decoder_layer.h 17 | linear.h 18 | word_embedding_impl.h 19 | layer_utils.h 20 | indexer.h 21 | deep_ep.h 22 | activation.h 23 | attention_metadata.h 24 | SRCS 25 | qwen2_attention.cpp 26 | qwen2_vision_attention.cpp 27 | rms_norm.cpp 28 | rotary_embedding.cpp 29 | rotary_embedding_util.cpp 30 | fused_moe.cpp 31 | dense_mlp.cpp 32 | qwen2_decoder_layer.cpp 33 | qwen2_5_vision_layer.cpp 34 | qwen3_moe_decoder_layer.cpp 35 | linear.cpp 36 | word_embedding_impl.cpp 37 | layer_utils.cpp 38 | indexer.cpp 39 | deep_ep.cpp 40 | activation.cpp 41 | attention_metadata.cpp 42 | DEPS 43 | "-Wl,--whole-archive" 44 | "-Wl,--no-whole-archive" 45 | :kv_cache 46 | :prefix_cache 47 | :block 48 | :parallel_state 49 | :state_dict 50 | :model 51 | :kernels 52 | glog::glog 53 | gflags::gflags 54 | torch 55 | :platform 56 | ) 57 | 58 | add_subdirectory(tests) 59 | -------------------------------------------------------------------------------- /xllm/core/layers/qwen3_vision_encode_layer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "config.h" 19 | 20 | namespace xllm { 21 | namespace layer { 22 | 23 | class Qwen3VisionEncoderLayer 24 | : public torch::nn::ModuleHolder { 25 | public: 26 | using torch::nn::ModuleHolder::ModuleHolder; 27 | using Impl __attribute__((__unused__)) = Qwen3VisionEncoderLayerImpl; 28 | 29 | Qwen3VisionEncoderLayer(const ModelContext& context) 30 | : ModuleHolder(std::make_shared(context)) {} 31 | }; 32 | 33 | } // namespace layer 34 | } // namespace xllm 35 | -------------------------------------------------------------------------------- /xllm/api_service/call.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | 20 | #include 21 | 22 | namespace xllm { 23 | 24 | class Call { 25 | public: 26 | Call(brpc::Controller* controller); 27 | virtual ~Call() = default; 28 | 29 | std::string get_x_request_id() { return x_request_id_; } 30 | std::string get_x_request_time() { return x_request_time_; } 31 | 32 | virtual bool is_disconnected() const = 0; 33 | 34 | protected: 35 | void init(); 36 | 37 | protected: 38 | brpc::Controller* controller_; 39 | 40 | std::string x_request_id_; 41 | std::string x_request_time_; 42 | }; 43 | 44 | } // namespace xllm 45 | -------------------------------------------------------------------------------- /xllm/core/framework/sampling/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | cc_library( 5 | NAME 6 | sampler 7 | HDRS 8 | sampling_params.h 9 | logits_utils.h 10 | rejection_sampler.h 11 | sampler.h 12 | beam_searcher.h 13 | rec_constrained_decoding.h 14 | SRCS 15 | sampling_params.cpp 16 | logits_utils.cpp 17 | rejection_sampler.cpp 18 | sampler.cpp 19 | beam_searcher.cpp 20 | rec_constrained_decoding.cpp 21 | DEPS 22 | :common 23 | glog::glog 24 | torch 25 | :kernels 26 | $<$:xllm_ops> 27 | ) 28 | 29 | cc_test( 30 | NAME 31 | sampler_test 32 | SRCS 33 | rejection_sampler_test.cpp 34 | rejection_sampler.cpp 35 | sampling_params_test.cpp 36 | DEPS 37 | absl::strings 38 | GTest::gtest_main 39 | :flags 40 | :sampler 41 | glog::glog 42 | ) 43 | target_link_libraries(sampler_test PRIVATE brpc OpenSSL::SSL OpenSSL::Crypto leveldb::leveldb ZLIB::ZLIB protobuf::libprotobuf) 44 | target_link_libraries(sampler_test 45 | PUBLIC 46 | Python::Python 47 | $<$:ascendcl> 48 | $<$:hccl> 49 | $<$:c_sec> 50 | $<$:nnopbase>) 51 | add_dependencies(sampler_test brpc-static) -------------------------------------------------------------------------------- /xllm/core/layers/qwen3_moe_decoder_layer.h: 
-------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "config.h" 19 | 20 | namespace xllm { 21 | namespace layer { 22 | 23 | class Qwen3MoeDecoderLayer 24 | : public torch::nn::ModuleHolder { 25 | public: 26 | using torch::nn::ModuleHolder::ModuleHolder; 27 | using Impl __attribute__((__unused__)) = Qwen3MoeDecoderLayerImpl; 28 | 29 | Qwen3MoeDecoderLayer(const ModelContext& context, int32_t layer_id) 30 | : Qwen3MoeDecoderLayer( 31 | std::make_shared(context, layer_id)) {} 32 | }; 33 | 34 | } // namespace layer 35 | } // namespace xllm 36 | -------------------------------------------------------------------------------- /xllm/core/kernels/ilu/rope.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include "ilu_ops_api.h" 17 | #include "utils.h" 18 | 19 | namespace xllm::kernel::ilu { 20 | 21 | void apply_rope_pos_ids_cos_sin_cache(torch::Tensor& query, 22 | torch::Tensor& key, 23 | torch::Tensor& cos_sin_cache, 24 | torch::Tensor& positions, 25 | bool interleave) { 26 | const int64_t head_size = cos_sin_cache.size(-1) / 2; 27 | infer::vllm_rotary_embedding( 28 | positions, query, key, head_size, cos_sin_cache, !interleave); 29 | } 30 | 31 | } // namespace xllm::kernel::ilu 32 | -------------------------------------------------------------------------------- /xllm/cc_api/README.md: -------------------------------------------------------------------------------- 1 | ### How to compile xllm dynamic library 2 | Run the following command in root directory: 3 | ``` 4 | python setup.py build --device a3 --generate-so true 5 | ``` 6 | 7 | If you want to debug, it needs to set DEBUG environment variable. 8 | ``` 9 | export DEBUG=1 10 | ``` 11 | 12 | ### How to install dynamic library 13 | Run installation script xllm/cc_api/install.sh, headers and dynamic library will be installed in /usr/local/xllm directory. 
14 | ``` 15 | cd xllm/cc_api 16 | 17 | sh install.sh 18 | ``` 19 | 20 | You will see the following files in /usr/local/xllm directory: 21 | ``` 22 | [root@A03-R40-I189-101-4100046 cc_api]# tree /usr/local/xllm 23 | /usr/local/xllm 24 | |-- include 25 | | |-- llm.h 26 | | |-- macros.h 27 | | `-- types.h 28 | `-- lib 29 | |-- libcust_opapi.so 30 | `-- libxllm.so 31 | 32 | 3 directories, 5 files 33 | ``` 34 | 35 | ### How to run cc_api examples 36 | It provides two examples which use cc_api to create xllm instance and run inference. The single_llm_instance.cpp creates one instance which is used in most LLM scenes. The multiple_llm_instances.cpp creates two instances which is used in multiple-models scene or one model with multiple versions. 37 | 38 | You can follow the commands to compile and run these examples: 39 | ``` 40 | cd examples && mkdir build 41 | cd build && cmake .. && make && cd .. 42 | 43 | sh start-llm-instance.sh 44 | ``` -------------------------------------------------------------------------------- /xllm/core/kernels/npu/xllm_ops/beam_search.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | #include 18 | #include 19 | 20 | #include 21 | 22 | #include "acl/acl.h" 23 | #include "aclnn_beam_search.h" 24 | #include "acltensor_utils.h" 25 | #include "util/tensor_helper.h" 26 | 27 | namespace xllm_ops { 28 | void beam_search(const torch::Tensor& logprobs, 29 | const torch::Tensor& top_tokens, 30 | const torch::Tensor& top_logprobs, 31 | torch::Tensor& src_seq_idxes, 32 | torch::Tensor& out_logprobs, 33 | torch::Tensor& out_token_ids); 34 | } // namespace xllm_ops -------------------------------------------------------------------------------- /xllm/core/layers/qwen2dot5_vision_encode_layer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "config.h" 19 | 20 | namespace xllm { 21 | namespace layer { 22 | 23 | class Qwen2dot5VisionEncoderLayer 24 | : public torch::nn::ModuleHolder { 25 | public: 26 | using torch::nn::ModuleHolder::ModuleHolder; 27 | using Impl __attribute__((__unused__)) = Qwen2dot5VisionEncoderLayerImpl; 28 | 29 | Qwen2dot5VisionEncoderLayer(const ModelContext& context) 30 | : ModuleHolder( 31 | std::make_shared(context)) {} 32 | }; 33 | 34 | } // namespace layer 35 | } // namespace xllm 36 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/brpc"] 2 | path = third_party/brpc 3 | url = https://gitcode.com/xLLM-AI/brpc.git 4 | [submodule "third_party/cpprestsdk"] 5 | path = third_party/cpprestsdk 6 | url = https://gitcode.com/xLLM-AI/cpprestsdk.git 7 | [submodule "third_party/hccl_transfer"] 8 | path = third_party/hccl_transfer 9 | url = https://gitcode.com/xLLM-AI/hccl_transfer.git 10 | [submodule "third_party/minja"] 11 | path = third_party/minja 12 | url = https://gitcode.com/xLLM-AI/minja.git 13 | [submodule "third_party/sentencepiece"] 14 | path = third_party/sentencepiece 15 | url = https://gitcode.com/xLLM-AI/sentencepiece.git 16 | [submodule "third_party/smhasher"] 17 | path = third_party/smhasher 18 | url = https://gitcode.com/xLLM-AI/smhasher.git 19 | [submodule "third_party/xllm_ops"] 20 | path = third_party/xllm_ops 21 | url = https://gitcode.com/xLLM-AI/xllm_ops.git 22 | [submodule "third_party/etcd_cpp_apiv3"] 23 | path = third_party/etcd_cpp_apiv3 24 | url = https://gitcode.com/xLLM-AI/etcd-cpp-apiv3.git 25 | [submodule "third_party/spdlog"] 26 | path = third_party/spdlog 27 | url = https://gitcode.com/xLLM-AI/spdlog.git 28 | [submodule "third_party/Mooncake"] 29 | path = third_party/Mooncake 30 | url = https://gitcode.com/xLLM-AI/Mooncake.git 31 | [submodule "third_party/torch_npu_ops"] 32 | path = third_party/torch_npu_ops 33 | url = https://gitcode.com/xLLM-AI/torch_npu_ops.git 34 | -------------------------------------------------------------------------------- /xllm/core/layers/siglip_encoder_layer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/
15 |
16 | #pragma once
17 |
18 | #include "config.h"
19 |
20 | namespace xllm {
21 | namespace layer {
22 |
23 | class SiglipEncoderLayer
24 | : public torch::nn::ModuleHolder<SiglipEncoderLayerImpl> {
25 | public:
26 | using torch::nn::ModuleHolder<SiglipEncoderLayerImpl>::ModuleHolder;
27 | using Impl __attribute__((__unused__)) = SiglipEncoderLayerImpl;
28 |
29 | SiglipEncoderLayer(const ModelContext& context,
30 | const std::string& prefix = "")
31 | : ModuleHolder(
32 | std::make_shared<SiglipEncoderLayerImpl>(context, prefix)) {}
33 | };
34 |
35 | } // namespace layer
36 | } // namespace xllm
37 |
-------------------------------------------------------------------------------- /xllm/core/layers/npu/loader/siglip_encoder_loader.h: --------------------------------------------------------------------------------
1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved.
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | #pragma once
16 |
17 | #include
18 | #include
19 |
20 | #include "base_loader.h"
21 |
22 | namespace xllm {
23 | namespace layer {
24 |
25 | class SiglipEncoderUpLoader : public BaseLoader {
26 | public:
27 | explicit SiglipEncoderUpLoader(const ModelContext& context);
28 |
29 | void load_state_dict(const StateDict& state_dict) override;
30 | };
31 |
32 | class SiglipEncoderDownLoader : public BaseLoader {
33 | public:
34 | explicit SiglipEncoderDownLoader(const ModelContext& context);
35 |
36 | void load_state_dict(const StateDict& state_dict) override;
37 | };
38 |
39 | } // namespace layer
40 | } // namespace xllm
-------------------------------------------------------------------------------- /CONTRIBUTING_zh.md: --------------------------------------------------------------------------------
1 |
14 |
15 | [English](./CONTRIBUTING.md) | [中文](./CONTRIBUTING_zh.md)
16 |
17 | # xLLM Contribution Guide
18 |
19 | xLLM is committed to providing an open XX for every user and developer, so whether you are an XX developer or a user focused on XX, you are welcome to join our project.
20 | You can contribute to the project in the following ways:
21 |
22 | + Write/translate/revise documentation
23 | + Ask or answer questions
24 | + Provide usage or test examples
25 | + Offer suggestions or other comments
26 | + Take part in [issues](https://github.com/xxx/xLLM/issues) or [discussions](https://github.com/xxx/xLLM/discussions)
27 | + Submit pull requests
28 | + Share related research or application scenarios
29 | + Anything else that helps xLLM
30 |
31 | If you would like to take part in developing xLLM, please see the tips below:
32 |
33 | ## 1. Pick an issue to contribute to
34 | + You can pick issues with the `PR welcome` label, including:
35 | + Reproducible bugs
36 | + Features planned for implementation
37 |
38 | ## 2. Set up the development environment
39 | + Before developing, you can consult our **[documentation](http://xxx/docs/)**
40 | + For environment configuration, see the **[Readme file](/README.md)**
41 |
42 | ## 3. Building and running the project
43 | + You can run the following examples:
44 |
45 | ## 4. Testing
46 |
47 | After a PR is submitted, we format the code and run further tests.
48 | Our tests are still far from complete, so contributions to testing are very welcome!
-------------------------------------------------------------------------------- /docs/zh/index.md: --------------------------------------------------------------------------------
1 | ---
2 | hide:
3 | - navigation
4 | ---
5 |
10 |
11 |
12 | xLLM 13 |
14 | 15 | ## Introduction 16 | 17 | **xLLM** is an efficient and easy-to-use open-source intelligent inference framework that provides enterprise-grade service guarantees and high-performance engine compute for model inference on domestic chips. 18 | 19 | #### Background 20 | Large language models with tens of billions to trillions of parameters are being rapidly deployed in core business scenarios such as intelligent customer service, real-time recommendation, and content generation, making efficient support for domestic compute hardware a core requirement of low-cost inference deployment. Existing inference engines struggle to adapt to the architectural characteristics of specialized accelerators such as domestic chips: low utilization of hardware compute units, load imbalance and communication bottlenecks under MoE architectures, and difficult KV cache management all constrain efficient request inference and system scalability. The xLLM inference engine improves resource utilization across the whole "communication - compute - storage" pipeline, providing key technical support for deploying large language models at scale in real business. 21 | 22 | --- 23 | 24 | ## Core Features 25 | xLLM delivers powerful intelligent computing capability: compute optimization in the hardware system and algorithm-driven decision control jointly accelerate the inference process, enabling high-throughput, low-latency distributed inference services. 26 | 27 | ### Full-Graph / Multi-Level Pipelined Execution Orchestration 28 | 29 | - Asynchronous decoupled scheduling at the framework scheduling layer reduces compute bubbles; 30 | - Asynchronous parallelism of computation and communication at the model graph layer overlaps compute with communication; 31 | - Deep pipelining of heterogeneous compute units at the operator kernel layer overlaps compute with memory access. 32 | 33 | ### Graph Execution Optimization for Dynamic Shapes 34 | 35 | - Dynamic-size adaptation based on parameterization and multi-graph caching improves the flexibility of static graphs; 36 | - A managed device memory pool keeps addresses safe and reusable; 37 | - Performance-critical custom operators (such as *PageAttention*, *AllReduce*) are integrated and adapted. 38 | 39 | ### MoE Operator Optimization 40 | 41 | - *GroupMatmul* optimization improves compute efficiency; 42 | - *Chunked Prefill* optimization supports long-sequence inputs. 43 | 44 | ### Efficient Device Memory Optimization 45 | 46 | - Mapping management between discrete physical memory and contiguous virtual memory; 47 | - On-demand memory allocation reduces fragmentation and waste; 48 | - Intelligent memory scheduling increases page reuse and lowers allocation latency; 49 | - Corresponding operator adaptation for domestic chips. 50 | 51 | ### Global Multi-Level KV Cache Management 52 | 53 | - Intelligent KV offloading and prefetching across multi-level caches; 54 | - A KV-cache-centric distributed storage architecture; 55 | - Intelligent routing of KV transfers between nodes. 56 | 57 | ### Algorithm Optimization 58 | 59 | - Speculative decoding optimization, with multi-core parallelism improving efficiency; 60 | - Dynamic load balancing of MoE experts enables efficient adjustment of expert placement. 61 | 62 | 63 | -------------------------------------------------------------------------------- /xllm/api_service/qwen3_rerank_service_impl.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "api_service/rerank_service_impl.h" 19 | 20 | namespace xllm { 21 | using RerankCall = NonStreamCall; 22 | 23 | // a class to handle rerank requests 24 | class Qwen3RerankServiceImpl final : public RerankServiceImpl { 25 | public: 26 | Qwen3RerankServiceImpl(LLMMaster* master, 27 | const std::vector<std::string>& models); 28 | 29 | // brpc call_data needs to use shared_ptr 30 | void process_async_impl(std::shared_ptr<RerankCall> call) override; 31 | 32 | private: 33 | DISALLOW_COPY_AND_ASSIGN(Qwen3RerankServiceImpl); 34 | }; 35 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/layers/common/activation.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License.
14 | ==============================================================================*/ 15 | 16 | #include "activation.h" 17 | 18 | #include "kernels/ops_api.h" 19 | 20 | namespace xllm { 21 | namespace layer { 22 | 23 | ActivationImpl::ActivationImpl(const std::string& act_mode, bool is_gated) 24 | : act_mode_(act_mode), is_gated_(is_gated) {} 25 | 26 | void ActivationImpl::forward(torch::Tensor& input, torch::Tensor& output) { 27 | xllm::kernel::ActivationParams activation_params; 28 | activation_params.input = input; 29 | activation_params.output = output; 30 | activation_params.act_mode = act_mode_; 31 | activation_params.is_gated = is_gated_; 32 | xllm::kernel::active(activation_params); 33 | } 34 | 35 | } // namespace layer 36 | } // namespace xllm -------------------------------------------------------------------------------- /docs/zh/features/xllm_service_overview.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # xLLM Service 4 | [:simple-github: xLLM Service](https://github.com/jd-opensource/xllm-service) 5 | 6 | ## Introduction 7 | 8 | **xLLM-service** is a service-layer framework built on the xLLM inference engine, providing highly efficient, fault-tolerant, and flexible large-model inference services for clustered deployments. 9 | 10 | xLLM-service aims to solve key challenges in enterprise service scenarios: 11 | 12 | - How to guarantee the SLA of online services while improving the resource utilization of offline tasks in mixed online/offline deployments. 13 | - How to adapt to dynamically changing request loads in real business, such as drastic fluctuations in input/output lengths. 14 | - How to resolve performance bottlenecks for multimodal model requests. 15 | - How to ensure high reliability of the cluster's compute instances. 16 | 17 | #### Background 18 | Large language models with tens of billions to trillions of parameters are being rapidly deployed in core business scenarios such as intelligent customer service, real-time recommendation, and content generation, making efficient support for domestic compute hardware a core requirement of low-cost inference deployment. Existing inference engines struggle to adapt to the architectural characteristics of specialized accelerators such as domestic chips: low utilization of hardware compute units, load imbalance and communication bottlenecks under MoE architectures, and difficult KV cache management all constrain efficient request inference and system scalability. xLLM-service together with the xLLM inference engine improves end-to-end efficiency, providing key technical support for deploying large language models at scale in real business. 19 | 20 | --- 21 | 22 | ## Overall Architecture 23 | The overall architecture of xLLM-service is shown below: 24 | 25 | ![1](../../assets/service_arch.png) 26 | 27 | ## Core Components 28 | 29 | ### ETCD Cluster 30 | Manages metadata, including the storage and management of model, xllm instance, and request metadata. It also provides xllm node registration and discovery. 31 | 32 | ### Fault Tolerance 33 | xLLM-service provides fault-tolerance management to guarantee service quality and stability. 34 | 35 | ### Global Scheduler 36 | Implements globally aware scheduling: based on the current system state, requests are dispatched precisely to the optimal instance, effectively improving overall service responsiveness and resource utilization. 37 | 38 | ### Global KV Cache Manager 39 | Responsible for global KV cache management; its core capabilities include distributed KV cache awareness, prefix matching, and dynamic KV cache migration, optimizing the efficiency of cache resource usage. 40 | 41 | ### Instance Manager 42 | Focuses on full-lifecycle instance management. Every xllm instance must register with this module after startup; based on preset policies, the module provides scheduling adaptation, fault handling, and other support for instances. 43 | 44 | ### Event Plane 45 | Acts as the metrics and event hub: it receives the metrics reported by each instance and collects and organizes the statistics, providing data support for decisions on scheduling, fault tolerance, and scaling. 46 | 47 | ### Planner 48 | Performs policy analysis and decision-making: based on the metrics reported by the Event Plane (including instance runtime metrics and machine load metrics), it analyzes service scaling needs and whether hot instances should be expanded, and outputs resource-adjustment and instance-optimization strategies. -------------------------------------------------------------------------------- /xllm/core/framework/model/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | # Define the base dependencies 5 | set(BASE_DEPS 6 | :common 7 | :flags 8 | :layers 9 | :prefix_cache 10 | :block 11 | :processors 12 | :chat_template 13 | glog::glog 14 | torch 15 | torch_python 16 | ) 17 | 18 | if(USE_NPU) 19 | list(APPEND BASE_DEPS :platform_npu) 20 | endif() 21 | 22 | # Define the library 23 | cc_library( 24 | NAME 25 | model 26 | HDRS 27 | causal_lm.h 28 | causal_vlm.h 29 | dit_model.h 30 | embedding_lm.h 31 | embedding_vlm.h 32 | mm_embedding_vlm.h 33 | model_args.h 34 | npu_dp_ep_padding.h 35 | model_input_params.h 36 | SRCS 37 | npu_dp_ep_padding.cpp 38 | DEPS 39 | ${BASE_DEPS} 40 | ) 41 | target_link_libraries(model PRIVATE :kv_cache) 42 | 43 | if(USE_NPU) 44 | cc_test( 45 | NAME 46 | npu_dp_ep_padding_test 47 | SRCS 48 | npu_dp_ep_padding_test.cpp 49 | DEPS 50 | :flags 51 | :parallel_state 52 | torch 53 | model 54 | absl::synchronization 55 | absl::time 56
| GTest::gtest_main 57 | ) 58 | 59 | target_link_libraries(npu_dp_ep_padding_test 60 | PUBLIC Python::Python 61 | $<$:ascendcl> 62 | $<$:hccl> 63 | $<$:c_sec> 64 | $<$:nnopbase>) 65 | endif() 66 | -------------------------------------------------------------------------------- /xllm/core/framework/prefix_cache/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_binary) 2 | include(cc_library) 3 | include(cc_test) 4 | 5 | cc_library( 6 | NAME 7 | prefix_cache 8 | HDRS 9 | prefix_cache.h 10 | prefix_cache_with_upload.h 11 | prefix_cache_factory.h 12 | SRCS 13 | prefix_cache.cpp 14 | prefix_cache_with_upload.cpp 15 | prefix_cache_factory.cpp 16 | DEPS 17 | $<$:torch_npu> 18 | $<$:graph> 19 | :request 20 | :common 21 | glog::glog 22 | Boost::serialization 23 | SMHasherSupport 24 | torch 25 | ) 26 | 27 | 28 | if(USE_NPU) 29 | cc_test( 30 | NAME 31 | prefix_test 32 | SRCS 33 | prefix_cache_test.cpp 34 | DEPS 35 | :flags 36 | :kv_cache 37 | :prefix_cache 38 | :block 39 | absl::random_random 40 | Boost::serialization 41 | GTest::gtest_main 42 | ) 43 | 44 | target_link_libraries(prefix_test PRIVATE brpc OpenSSL::SSL OpenSSL::Crypto Folly::folly) 45 | add_dependencies(prefix_test brpc-static) 46 | endif() 47 | 48 | cc_binary( 49 | NAME 50 | prefix_cache_benchmark 51 | SRCS 52 | prefix_cache_benchmark.cpp 53 | DEPS 54 | :kv_cache 55 | :prefix_cache 56 | :block 57 | benchmark::benchmark 58 | benchmark::benchmark_main 59 | ) 60 | 61 | target_link_libraries(prefix_cache_benchmark PRIVATE brpc OpenSSL::SSL OpenSSL::Crypto) 62 | add_dependencies(prefix_cache_benchmark brpc-static) 63 | -------------------------------------------------------------------------------- /xllm/core/kernels/mlu/random_sample.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include "mlu_ops_api.h" 17 | 18 | namespace xllm::kernel::mlu { 19 | 20 | torch::Tensor random_sample(const torch::Tensor& probs) { 21 | torch::Tensor flat_probs; 22 | if (probs.dim() == 3) { 23 | flat_probs = probs.reshape({-1, probs.size(2)}); 24 | } else { 25 | flat_probs = probs; 26 | } 27 | auto output = 28 | torch::empty({flat_probs.size(0), 1}, 29 | torch::dtype(torch::kInt64).device(probs.device())); 30 | tmo::torch_api::random_sample(flat_probs, output, true, torch::Generator()); 31 | if (probs.dim() == 3) { 32 | return output.reshape({probs.size(0), probs.size(1)}); 33 | } 34 | return output.flatten(); 35 | } 36 | 37 | } // namespace xllm::kernel::mlu -------------------------------------------------------------------------------- /xllm/core/layers/npu/loader/qwen3_decoder_loader.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 
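
The MLU sampler in xllm/core/kernels/mlu/random_sample.cpp above flattens an optional [batch, beam, vocab] probability tensor to 2-D before sampling one token id per row, then restores the shape. A sketch of just that shape handling, with torch::multinomial standing in for the on-device kernel tmo::torch_api::random_sample (a stated assumption for illustration):

```cpp
#include <iostream>
#include <torch/torch.h>

torch::Tensor random_sample_sketch(const torch::Tensor& probs) {
  // Collapse [batch, beam, vocab] to [batch * beam, vocab] if needed.
  torch::Tensor flat =
      probs.dim() == 3 ? probs.reshape({-1, probs.size(2)}) : probs;
  // One sampled token id per row; the real code calls the MLU kernel here.
  torch::Tensor out = torch::multinomial(flat, /*num_samples=*/1);
  if (probs.dim() == 3) {
    return out.reshape({probs.size(0), probs.size(1)});
  }
  return out.flatten();
}

int main() {
  auto probs = torch::softmax(torch::randn({2, 3, 16}), /*dim=*/-1);
  std::cout << random_sample_sketch(probs).sizes() << std::endl;  // [2, 3]
}
```
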
2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | #include "base_loader.h" 22 | 23 | namespace xllm { 24 | namespace layer { 25 | 26 | class Qwen3DecoderLoader : public BaseLoader { 27 | public: 28 | Qwen3DecoderLoader(uint64_t weight_count, 29 | const ModelContext& context, 30 | bool enableAddNorm); 31 | 32 | void load_state_dict(const StateDict& state_dict) override; 33 | void verify_loaded_weights() const override; 34 | void merge_loaded_weights() override; 35 | 36 | protected: 37 | torch::Tensor at_placeholder_; 38 | bool enableAddNorm_; 39 | int rank_id_; 40 | }; 41 | 42 | } // namespace layer 43 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/framework/request/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | cc_library( 5 | NAME 6 | request 7 | HDRS 8 | dit_request.h 9 | dit_request_params.h 10 | finish_reason.h 11 | incremental_decoder.h 12 | mm_codec.h 13 | mm_data.h 14 | mm_handler.h 15 | mm_input.h 16 | request_base.h 17 | request.h 18 | dit_request.h 19 | request_output.h 20 | dit_request_output.h 21 | dit_request_params.h 22 | request_params.h 23 | sequence.h 24 | sequence_logprob_state.h 25 | sequence_kv_state.h 26 | sequences_group.h 27 | request_state.h 28 | stopping_checker.h 29 | priority_comparator.h 30 | SRCS 31 | dit_request.cpp 32 | finish_reason.cpp 33 | incremental_decoder.cpp 34 | mm_codec.cpp 35 | mm_data.cpp 36 | mm_handler.cpp 37 | mm_input.cpp 38 | request.cpp 39 | dit_request.cpp 40 | request_output.cpp 41 | dit_request_output.cpp 42 | request_params.cpp 43 | dit_request_params.cpp 44 | sequence.cpp 45 | sequence_logprob_state.cpp 46 | sequence_kv_state.cpp 47 | sequences_group.cpp 48 | request_state.cpp 49 | stopping_checker.cpp 50 | priority_comparator.cpp 51 | DEPS 52 | :kv_cache 53 | :prefix_cache 54 | :block 55 | :tokenizer 56 | :chat_template 57 | glog::glog 58 | absl::strings 59 | absl::time 60 | proto::xllm_proto 61 | torch 62 | ${OpenCV_LIBS} 63 | ) 64 | 65 | -------------------------------------------------------------------------------- /xllm/core/framework/eplb/eplb_policy_test.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include "eplb_policy.h" 17 | 18 | #include 19 | #include 20 | #include 21 | 22 | namespace xllm { 23 | 24 | TEST(EplbPolicyTest, Build) { 25 | std::string rank_table_file; 26 | EplbPolicy eplb_policy(5, 4, 1); 27 | std::vector<torch::Tensor> tensors; 28 | tensors.push_back(torch::arange(0, 16)); 29 | 30 | auto expert_load = torch::stack(tensors, 0); 31 | expert_load[0] = 32 | torch::tensor({100, 100, 100, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 100}); 33 | auto [rebalance_expert, enable_update_vec] = 34 | eplb_policy.rebalance_experts(expert_load); 35 | LOG(INFO) << "rebalance_expert:" << rebalance_expert; 36 | } 37 | 38 | } // namespace xllm 39 | -------------------------------------------------------------------------------- /xllm/core/util/device_name_utils.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | 19 | #include 20 | 21 | #include 22 | 23 | namespace xllm { 24 | 25 | class DeviceNameUtils { 26 | public: 27 | static std::vector<torch::Device> parse_devices( 28 | const std::string& device_str); 29 | 30 | template <typename T> 31 | static std::string to_string(const std::vector<T>& items) { 32 | std::stringstream ss; 33 | for (size_t i = 0; i < items.size(); ++i) { 34 | const auto& item = items[i]; 35 | if (i == 0) { 36 | ss << item; 37 | } else { 38 | ss << "," << item; 39 | } 40 | } 41 | return ss.str(); 42 | } 43 | }; 44 | 45 | } // namespace xllm 46 | -------------------------------------------------------------------------------- /xllm/core/common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | cc_library( 5 | NAME 6 | common 7 | HDRS 8 | etcd_client.h 9 | global_flags.h 10 | instance_name.h 11 | macros.h 12 | message.h 13 | metrics.h 14 | $<$:mspti_helper.h> 15 | options.h 16 | rate_limiter.h 17 | types.h 18 | device_monitor.h 19 | version_singleton.h 20 | SRCS 21 | etcd_client.cpp 22 | global_flags.cpp 23 | metrics.cpp 24 | $<$:mspti_helper.cpp> 25 | options.cpp 26 | rate_limiter.cpp 27 | device_monitor.cpp 28 | DEPS 29 | util 30 | absl::random_random 31 | absl::strings 32 | torch 33 | $<$:torch_npu> 34 | $<$:mspti> 35 | $<$:ms_tools_ext> 36 | Boost::serialization 37 | cpprest 38 | etcd-cpp-api 39 | $<$:torch_mlu> 40 | ) 41 | 42 | cc_library( 43 | NAME 44 | flags 45 | HDRS 46 | global_flags.h 47 | SRCS 48 | global_flags.cpp 49 | DEPS 50 | gflags::gflags 51 | ) 52 | 53 | cc_test( 54 | NAME 55 | common_test 56 | SRCS 57 | rate_limiter_test.cpp 58 | DEPS 59 | common 60 |
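
The EplbPolicy test above feeds rebalance_experts an expert_load tensor with one row per layer and one column per expert, skewing a few experts hot. A small self-contained sketch that builds the same input and computes a simple max/mean imbalance ratio (the ratio metric is illustrative, not the policy's internal criterion):

```cpp
#include <iostream>
#include <torch/torch.h>

int main() {
  // One layer, 16 experts; experts 0, 1, 2 and 15 are hot, as in the test.
  auto expert_load = torch::tensor(
      {{100, 100, 100, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 100}},
      torch::kFloat);
  auto imbalance = expert_load.max() / expert_load.mean();
  std::cout << "imbalance ratio: " << imbalance.item<float>() << std::endl;
  // ~3.67 here; a rebalancing policy would replicate or move the hot experts
  // so that per-device load approaches the mean.
}
```
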
absl::synchronization 61 | absl::time 62 | GTest::gtest_main 63 | gflags::gflags 64 | glog::glog 65 | ) 66 | target_link_libraries(common PRIVATE OpenSSL::SSL OpenSSL::Crypto protobuf::libprotobuf) 67 | add_dependencies(common brpc-static) 68 | 69 | 70 | -------------------------------------------------------------------------------- /xllm/core/common/interruption_bus.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | namespace xllm { 22 | class ForwardInterruptedException : public std::exception {}; 23 | 24 | class InterruptionBus { 25 | public: 26 | void subscribe(std::function func) { observers_.push_back(func); } 27 | 28 | void publish(bool interruption) { 29 | for (auto it = observers_.begin(); it != observers_.end(); ++it) { 30 | auto& observer = *it; 31 | observer(interruption); 32 | } 33 | } 34 | 35 | static InterruptionBus& get_instance() { 36 | static InterruptionBus instance; 37 | return instance; 38 | } 39 | 40 | private: 41 | std::vector> observers_; 42 | }; 43 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/layers/common/layer_utils.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
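
Usage of the InterruptionBus singleton shown above. The std::function template argument was lost in extraction, but publish() makes the observer signature evidently void(bool); the subscriber below is illustrative:

```cpp
#include <iostream>
// #include "core/common/interruption_bus.h"  // header shown above

int main() {
  auto& bus = xllm::InterruptionBus::get_instance();
  bus.subscribe([](bool interrupted) {
    if (interrupted) {
      std::cout << "forward interrupted, unwinding" << std::endl;
    }
  });
  bus.publish(true);  // fans the flag out to every registered observer
}
```
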
14 | ==============================================================================*/ 15 | 16 | #include "layer_utils.h" 17 | 18 | #include "framework/parallel_state/parallel_state.h" 19 | 20 | namespace xllm { 21 | namespace layer { 22 | 23 | void update_dummy_run_input(int64_t dp_rank, 24 | torch::Tensor& positions, 25 | ModelInputParams& input_params) { 26 | auto& dp_ranks = input_params.dp_global_token_nums; 27 | bool is_dummy_run = dp_ranks[dp_rank] == 0; 28 | for (size_t i = 0; i < dp_ranks.size(); i++) { 29 | if (dp_ranks[i] == 0) { 30 | dp_ranks[i] = 1; 31 | } 32 | } 33 | if (is_dummy_run) { 34 | positions = torch::tensor({1}).to(torch::kInt32).to(positions.device()); 35 | } 36 | } 37 | 38 | } // namespace layer 39 | } // namespace xllm 40 | -------------------------------------------------------------------------------- /xllm/core/layers/deepseek_v2_decoder_layer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "config.h" 19 | 20 | namespace xllm { 21 | namespace layer { 22 | 23 | // DeepSeek V3.2 uses a different structure, but 24 | // it is still compatible with DeepSeek V2. 25 | class DeepseekV2DecoderLayer 26 | : public torch::nn::ModuleHolder<DeepseekV2DecoderLayerImpl> { 27 | public: 28 | using torch::nn::ModuleHolder<DeepseekV2DecoderLayerImpl>::ModuleHolder; 29 | using Impl __attribute__((__unused__)) = DeepseekV2DecoderLayerImpl; 30 | 31 | DeepseekV2DecoderLayer(const ModelContext& context, const int32_t layer_id) 32 | : ModuleHolder( 33 | std::make_shared<DeepseekV2DecoderLayerImpl>(context, layer_id)) {} 34 | }; 35 | 36 | } // namespace layer 37 | } // namespace xllm 38 | -------------------------------------------------------------------------------- /xllm/parser/reasoning_parser.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
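
The dummy-run padding in layer_utils.cpp above is easiest to see with concrete numbers; a distilled, self-contained version of the loop:

```cpp
#include <cstdio>
#include <vector>

int main() {
  // dp_global_token_nums: tokens contributed by each data-parallel rank.
  std::vector<int> dp_global_token_nums = {7, 0, 12, 0};
  const int dp_rank = 1;  // this rank contributed nothing -> dummy run
  bool is_dummy_run = dp_global_token_nums[dp_rank] == 0;
  for (auto& n : dp_global_token_nums) {
    if (n == 0) n = 1;  // pad zeros so collectives stay shape-consistent
  }
  // {7, 0, 12, 0} -> {7, 1, 12, 1}; the dummy rank then feeds a single
  // placeholder position tensor through the layer.
  std::printf("dummy=%d padded={%d,%d,%d,%d}\n", is_dummy_run,
              dp_global_token_nums[0], dp_global_token_nums[1],
              dp_global_token_nums[2], dp_global_token_nums[3]);
}
```
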
15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | 22 | #include "parser/detector_registry.h" 23 | 24 | namespace xllm { 25 | 26 | class ReasoningParser { 27 | public: 28 | ReasoningParser(const std::string& model_type, 29 | bool stream_reasoning = true, 30 | bool force_reasoning = false); 31 | 32 | // Non-streaming call: one-time parsing 33 | ReasoningResult parse_non_stream(const std::string& text); 34 | // Streaming call: incremental parsing 35 | ReasoningResult parse_stream_chunk(const std::string& chunk_text); 36 | 37 | private: 38 | std::unique_ptr detector_; 39 | }; 40 | } // namespace xllm -------------------------------------------------------------------------------- /docs/en/features/topk_topp.md: -------------------------------------------------------------------------------- 1 | # Topk & Topp Operator Optimization 2 | 3 | ## Background 4 | 5 | In natural language generation tasks, the topK and topP sampling strategies are widely used to control the diversity and quality of generated text. However, in small models, the computation time for these two strategies is relatively long. This is mainly due to the fewer parameters in small models, which leads to reduced efficiency in sorting and filtering when processing probability distributions, thereby affecting generation speed. Therefore, optimizing the implementation of topK and topP in small models can enhance their sampling efficiency. 6 | 7 | ## Feature Introduction 8 | 9 | The implementation of the topKtopP operator merges multiple small operators, such as sorting, topK, softmax, and topP, into a single large operator, thereby improving computational efficiency and performance. 10 | 11 | ## User Interface 12 | 13 | ### Operator Call API 14 | 15 | ```c++ 16 | void top_k_top_p(torch::Tensor& logits, 17 | const torch::Tensor& topK, 18 | const torch::Tensor& topP); 19 | ``` 20 | 21 | - `logits`: The input logits tensor containing the model's output scores. 22 | - `topK`: The threshold tensor for selecting the top K probabilities. 23 | - `topP`: The threshold tensor for selecting the cumulative probabilities. 24 | 25 | ## Performance Effect 26 | 27 | * After using the topKtopP fused operator, in the qwen2-0.5B model, TTOT **decreased by 37%**, and TTFT **increased by 10%**. -------------------------------------------------------------------------------- /xllm/core/common/instance_name.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
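
A hedged call sketch for the fused top_k_top_p API documented in docs/en/features/topk_topp.md above. Only the signature comes from the doc; the per-row threshold layout ([batch]-shaped topK/topP tensors) and the omitted namespace qualification are assumptions:

```cpp
#include <torch/torch.h>
// top_k_top_p is declared in xllm/core/kernels/npu/xllm_ops/top_k_top_p.h
// (see the tree above); namespace qualification is omitted here.

void filter_logits(torch::Tensor& logits /*[batch, vocab], on device*/) {
  const int64_t batch = logits.size(0);
  auto top_k = torch::full({batch}, 50, logits.options().dtype(torch::kInt32));
  auto top_p = torch::full({batch}, 0.95, logits.options());
  // One fused kernel in place of separate sort, top-k, softmax, top-p ops;
  // logits outside the kept set are masked in place before sampling.
  top_k_top_p(logits, top_k, top_p);
}
```
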
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | 20 | namespace xllm { 21 | 22 | class InstanceName { 23 | public: 24 | static InstanceName* name() { 25 | static InstanceName n; 26 | return &n; 27 | } 28 | 29 | void set_name(const std::string& name) { 30 | name_ = name; 31 | name_hash_ = std::to_string(std::hash{}(name_)); 32 | } 33 | 34 | std::string get_name() const { return name_; } 35 | 36 | std::string get_name_hash() const { return name_hash_; } 37 | 38 | private: 39 | InstanceName() {} 40 | InstanceName(const InstanceName&) = delete; 41 | InstanceName& operator=(const InstanceName&) = delete; 42 | 43 | private: 44 | std::string name_; 45 | std::string name_hash_; 46 | }; 47 | 48 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/layers/npu/buffer/atb_workspace.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | 20 | #include 21 | #include 22 | 23 | #include "atb_buffer.h" 24 | 25 | namespace xllm { 26 | 27 | class AtbWorkspace { 28 | public: 29 | AtbWorkspace() = default; 30 | 31 | AtbWorkspace(at::Device device); 32 | 33 | ~AtbWorkspace(); 34 | 35 | AtbWorkspace(const AtbWorkspace&) = delete; 36 | 37 | AtbWorkspace& operator=(const AtbWorkspace&) = delete; 38 | 39 | AtbWorkspace(AtbWorkspace&&) = default; 40 | 41 | AtbWorkspace& operator=(AtbWorkspace&&) = default; 42 | 43 | void* get_workspace_buffer(uint64_t bufferSize); 44 | 45 | private: 46 | static std::map> buffer_map_; 47 | }; 48 | 49 | } // namespace xllm 50 | -------------------------------------------------------------------------------- /xllm/server/xllm_server_registry.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
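
Usage of the InstanceName singleton above (its std::hash template argument, presumably std::string, was eaten by extraction). The instance name string is illustrative:

```cpp
#include <iostream>
// #include "core/common/instance_name.h"  // header shown above

int main() {
  xllm::InstanceName::name()->set_name("prefill-0.cluster.local:9000");
  std::cout << xllm::InstanceName::name()->get_name() << "\n"
            << xllm::InstanceName::name()->get_name_hash() << std::endl;
}
```
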
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | #include "xllm_server.h" 22 | 23 | namespace xllm { 24 | 25 | class ServerRegistry { 26 | public: 27 | static ServerRegistry& get_instance() { 28 | static ServerRegistry instance; 29 | return instance; 30 | } 31 | 32 | XllmServer* register_server(const std::string& name); 33 | void unregister_server(const std::string& name); 34 | XllmServer* get_server(const std::string& name); 35 | 36 | private: 37 | ServerRegistry() = default; 38 | ~ServerRegistry() = default; 39 | DISALLOW_COPY_AND_ASSIGN(ServerRegistry); 40 | 41 | std::unordered_map> servers_; 42 | std::mutex mutex_; 43 | }; 44 | 45 | } // namespace xllm 46 | -------------------------------------------------------------------------------- /docs/en/features/ppmatmul.md: -------------------------------------------------------------------------------- 1 | # PpMatmul Operator Optimization 2 | 3 | ## Background 4 | 5 | In the inference of large models, matrix multiplication accounts for a high proportion and takes a long time. We have optimized the implementation of the matrix multiplication operator. 6 | 7 | ## Feature Introduction 8 | 9 | The PpMatmul operator uses a Tiling strategy to decompose matrix multiplication into multiple smaller matrix multiplication tasks. However, when the number of tiles is small, tasks cannot be evenly distributed across all NPU cores, leading to the tail effect problem, which affects computational efficiency. We optimize the performance of the PpMatmul operator by prefetching memory or redistributing tasks. 10 | 11 | ## User Interface 12 | 13 | ### Operator Direct Call API 14 | 15 | ```cpp 16 | aclnnStatus aclnnPpMatmulOptGetWorkspaceSize( 17 | const aclTensor *a, 18 | const aclTensor *b, 19 | const aclTensor *out, 20 | uint64_t *workspaceSize, 21 | aclOpExecutor **executor); 22 | 23 | aclnnStatus aclnnPpMatmulOpt( 24 | void *workspace, 25 | uint64_t workspaceSize, 26 | aclOpExecutor *executor, 27 | aclrtStream stream); 28 | ``` 29 | 30 | - `a`: Input matrix A. 31 | - `b`: Input matrix B. 32 | - `out`: Output matrix, storing the computation result. 33 | 34 | ## Performance Effect 35 | 36 | For cases with a small number of tiles (e.g., when M is small, corresponding to a small batch size), there is an **18%** performance improvement of the operator compared to before optimization when (TP=4). -------------------------------------------------------------------------------- /xllm/core/platform/npu/npu_layer_synchronizer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
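
The two aclnnPpMatmulOpt entry points documented above follow the standard two-phase aclnn calling convention: query the workspace size, allocate, then launch. A sketch under the assumption that a, b, out and stream have already been created (error handling trimmed):

```cpp
uint64_t workspace_size = 0;
aclOpExecutor* executor = nullptr;
// Phase 1: size the scratch buffer and obtain an executor for this call.
aclnnStatus st =
    aclnnPpMatmulOptGetWorkspaceSize(a, b, out, &workspace_size, &executor);

void* workspace = nullptr;
if (st == ACL_SUCCESS && workspace_size > 0) {
  aclrtMalloc(&workspace, workspace_size, ACL_MEM_MALLOC_HUGE_FIRST);
}
// Phase 2: enqueue the optimized matmul on the stream.
st = aclnnPpMatmulOpt(workspace, workspace_size, executor, stream);
aclrtSynchronizeStream(stream);  // wait for the matmul before reading out
if (workspace != nullptr) {
  aclrtFree(workspace);
}
```
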
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include <acl/acl.h> 19 | 20 | #include <atomic> 21 | #include <vector> 22 | 23 | namespace xllm { 24 | 25 | class NPULayerSynchronizerImpl { 26 | public: 27 | NPULayerSynchronizerImpl(const int64_t num_layers, 28 | const int32_t timeout = -1); 29 | virtual ~NPULayerSynchronizerImpl(); 30 | 31 | aclrtEvent* get_event(const int64_t layer_index); 32 | std::atomic<bool>* get_event_flag(const int64_t layer_index); 33 | bool synchronize_layer(const int64_t layer_index); 34 | uint32_t get_event_size() { return events_.size(); } 35 | 36 | private: 37 | std::vector<aclrtEvent> events_; 38 | std::vector<std::atomic<bool>> event_record_flags_; 39 | const int32_t timeout_; 40 | }; 41 | 42 | } // namespace xllm 43 | -------------------------------------------------------------------------------- /xllm/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib.util 2 | import os 3 | import xllm 4 | import sys 5 | 6 | def get_python_version(): 7 | return f"{sys.version_info.major}{sys.version_info.minor}" 8 | 9 | install_path_x86 = os.path.dirname(xllm.__file__) + f"/xllm_export.cpython-{get_python_version()}-x86_64-linux-gnu.so" 10 | install_path_arm = os.path.dirname(xllm.__file__) + f"/xllm_export.cpython-{get_python_version()}-aarch64-linux-gnu.so" 11 | if os.path.exists(install_path_x86): 12 | install_path = install_path_x86 13 | elif os.path.exists(install_path_arm): 14 | install_path = install_path_arm 15 | else: 16 | raise ValueError("cannot open shared object file: No such file or directory, required ", install_path_x86, " or ", install_path_arm) 17 | export_so_path = os.path.abspath(install_path) 18 | spec = importlib.util.spec_from_file_location("xllm_export", export_so_path) 19 | xllm_export = importlib.util.module_from_spec(spec) 20 | 21 | from xllm.pybind.embedding import Embedding 22 | from xllm.pybind.llm import LLM 23 | from xllm.pybind.vlm import VLM 24 | from xllm.pybind.args import ArgumentParser 25 | from xllm_export import (LLMMaster, Options, RequestParams, RequestOutput, 26 | SequenceOutput, Status, StatusCode, MMType, MMData) 27 | 28 | __all__ = [ 29 | "ArgumentParser", 30 | "Embedding", 31 | "LLM", 32 | "LLMMaster", 33 | "VLM", 34 | "VLMMaster", 35 | "Options", 36 | "RequestParams", 37 | "RequestOutput", 38 | "SequenceOutput", 39 | "Status", 40 | "StatusCode", 41 | ] 42 | -------------------------------------------------------------------------------- /xllm/core/framework/request/finish_reason.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License.
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | namespace xllm { 22 | class FinishReason { 23 | public: 24 | enum Value : uint8_t { NONE = 0, STOP = 1, LENGTH, FUNCTION_CALL }; 25 | 26 | FinishReason() = default; 27 | FinishReason(Value v) : value(v) {} 28 | operator Value() const { return value; } 29 | explicit operator bool() const = delete; 30 | 31 | bool operator==(FinishReason rhs) const { return value == rhs.value; } 32 | bool operator!=(FinishReason rhs) const { return value != rhs.value; } 33 | 34 | bool operator==(Value v) const { return value == v; } 35 | bool operator!=(Value v) const { return value != v; } 36 | 37 | std::optional to_string(); 38 | 39 | private: 40 | Value value; 41 | }; 42 | } // namespace xllm 43 | -------------------------------------------------------------------------------- /xllm/core/util/pretty_print.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | ==============================================================================*/ 16 | 17 | #include "pretty_print.h" 18 | 19 | #include 20 | #include 21 | #include 22 | 23 | namespace xllm { 24 | 25 | std::string readable_size(size_t bytes) { 26 | static const std::array suffixes = { 27 | "B", "KB", "MB", "GB", "TB"}; 28 | const size_t bytes_in_kb = 1024; 29 | double size = static_cast(bytes); 30 | size_t suffix_index = 0; 31 | while (size >= bytes_in_kb && suffix_index < suffixes.size() - 1) { 32 | size /= bytes_in_kb; 33 | ++suffix_index; 34 | } 35 | std::stringstream stream; 36 | stream << std::fixed << std::setprecision(2) << size << " " 37 | << suffixes.at(suffix_index); 38 | return stream.str(); 39 | } 40 | 41 | } // namespace xllm 42 | -------------------------------------------------------------------------------- /xllm/api_service/models_service_impl.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
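
Walking the loop in readable_size() above: divide by 1024 and advance one suffix until the value drops below 1024, then print with two decimals. Expected outputs, assuming util/pretty_print.h (listed in the tree) declares the function:

```cpp
#include <iostream>
// #include "util/pretty_print.h"  // declares xllm::readable_size, defined above

int main() {
  std::cout << xllm::readable_size(512) << "\n";         // "512.00 B"
  std::cout << xllm::readable_size(1536) << "\n";        // "1.50 KB"
  std::cout << xllm::readable_size(3ULL << 30) << "\n";  // "3.00 GB"
}
```
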
15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | 19 | #include 20 | 21 | #include "core/common/macros.h" 22 | #include "models.pb.h" 23 | 24 | namespace xllm { 25 | 26 | class ModelsServiceImpl final { 27 | public: 28 | ModelsServiceImpl(const std::vector& model_names, 29 | const std::vector& model_versions); 30 | 31 | bool list_models(const proto::ModelListRequest* request, 32 | proto::ModelListResponse* response); 33 | std::string list_model_versions(); 34 | 35 | private: 36 | DISALLOW_COPY_AND_ASSIGN(ModelsServiceImpl); 37 | 38 | std::vector model_names_; 39 | std::vector model_versions_; 40 | uint32_t created_; 41 | }; 42 | 43 | } // namespace xllm 44 | -------------------------------------------------------------------------------- /xllm/core/kernels/npu/xllm_ops/acltensor_utils.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | #include 22 | 23 | #include "acl/acl.h" 24 | #include "util/tensor_helper.h" 25 | 26 | namespace xllm_ops_utils { 27 | struct type_info { 28 | static aclDataType get_acl_type(const torch::ScalarType& dtype); 29 | }; 30 | 31 | void create_acltensor(aclTensor** tensor, const torch::Tensor& tensor_data); 32 | void check_tensor(const torch::Tensor& t, 33 | const std::string& name, 34 | const std::string& func_name = ""); 35 | void check_tensor_shapes_equal(const torch::Tensor& a, 36 | const torch::Tensor& b, 37 | const std::string& func_name = ""); 38 | } // namespace xllm_ops_utils -------------------------------------------------------------------------------- /xllm/core/util/timer.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | ==============================================================================*/ 16 | 17 | #include "timer.h" 18 | 19 | #include 20 | #include 21 | 22 | namespace xllm { 23 | 24 | Timer::Timer() : start_(absl::Now()) {} 25 | 26 | // reset the timer 27 | void Timer::reset() { start_ = absl::Now(); } 28 | 29 | // get the elapsed time in seconds 30 | double Timer::elapsed_seconds() const { 31 | return absl::ToDoubleSeconds(absl::Now() - start_); 32 | } 33 | 34 | // get the elapsed time in milliseconds 35 | double Timer::elapsed_milliseconds() const { 36 | return absl::ToDoubleMilliseconds(absl::Now() - start_); 37 | } 38 | 39 | // get the elapsed time in microseconds 40 | double Timer::elapsed_microseconds() const { 41 | return absl::ToDoubleMicroseconds(absl::Now() - start_); 42 | } 43 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/framework/dit_cache/dit_non_cache.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | #include "dit_cache_impl.h" 18 | 19 | namespace xllm { 20 | 21 | class DiTNonCache : public DitCacheImpl { 22 | public: 23 | DiTNonCache() = default; 24 | ~DiTNonCache() override = default; 25 | 26 | DiTNonCache(const DiTNonCache&) = delete; 27 | DiTNonCache& operator=(const DiTNonCache&) = delete; 28 | DiTNonCache(DiTNonCache&&) = default; 29 | DiTNonCache& operator=(DiTNonCache&&) = default; 30 | 31 | void init(const DiTCacheConfig& cfg) override; 32 | 33 | bool on_before_block(const CacheBlockIn& blockin) override; 34 | CacheBlockOut on_after_block(const CacheBlockIn& blockin) override; 35 | 36 | bool on_before_step(const CacheStepIn& stepin) override; 37 | CacheStepOut on_after_step(const CacheStepIn& stepin) override; 38 | }; 39 | 40 | } // namespace xllm 41 | -------------------------------------------------------------------------------- /xllm/core/framework/xtensor/options.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
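
Usage of the Timer defined in xllm/core/util/timer.cpp above (declared in util/timer.h per the tree): construction starts the clock, reset() restarts it, and the elapsed_* accessors report the window in different units:

```cpp
#include <chrono>
#include <iostream>
#include <thread>
// #include "util/timer.h"  // declares xllm::Timer, defined above

int main() {
  xllm::Timer timer;  // start_ = absl::Now() at construction
  std::this_thread::sleep_for(std::chrono::milliseconds(25));
  std::cout << timer.elapsed_milliseconds() << " ms" << std::endl;  // ~25
  timer.reset();  // new measurement window
  std::cout << timer.elapsed_seconds() << " s" << std::endl;        // ~0
}
```
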
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | 20 | #include 21 | 22 | #include "common/macros.h" 23 | 24 | namespace xllm { 25 | namespace xtensor { 26 | struct Options { 27 | // devices for xtensor manager pool 28 | PROPERTY(std::vector, devices); 29 | 30 | // num of layers 31 | PROPERTY(int64_t, num_layers) = 0; 32 | 33 | // total pages for xtensor manager 34 | PROPERTY(int64_t, num_total_pages) = 0; 35 | 36 | // key or value cache size in bytes per token 37 | PROPERTY(int64_t, cache_size_per_token) = 0; 38 | 39 | // Index ID for internal server ID, which must be set different values 40 | // if the model supports multiple version or there are multiple models. 41 | PROPERTY(int64_t, server_idx) = 0; 42 | }; 43 | } // namespace xtensor 44 | } // namespace xllm -------------------------------------------------------------------------------- /docs/mkdocs/overrides/.icons/gitcodeai.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /xllm/api_service/api_service_impl.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | #include 18 | 19 | #include 20 | 21 | #include "call.h" 22 | #include "core/distributed_runtime/llm_master.h" 23 | 24 | namespace xllm { 25 | 26 | template 27 | class APIServiceImpl { 28 | public: 29 | APIServiceImpl(const std::vector& models) 30 | : models_(models.begin(), models.end()) { 31 | CHECK(!models_.empty()); 32 | } 33 | virtual ~APIServiceImpl() = default; 34 | 35 | void process_async(std::shared_ptr call) { 36 | std::shared_ptr call_cast = std::dynamic_pointer_cast(call); 37 | process_async_impl(call_cast); 38 | } 39 | 40 | virtual void process_async_impl(std::shared_ptr call) = 0; 41 | 42 | protected: 43 | absl::flat_hash_set models_; 44 | }; 45 | 46 | } // namespace xllm 47 | -------------------------------------------------------------------------------- /xllm/core/layers/npu/loader/qwen3_decoder_manual_loader.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | #include "base_manual_loader.h" 22 | #include "core/layers/npu/npu_base_layer.h" 23 | 24 | namespace xllm { 25 | namespace layer { 26 | 27 | class Qwen3DecoderManualLoader : public BaseManualLoader { 28 | public: 29 | Qwen3DecoderManualLoader(uint64_t weight_count, 30 | const ModelContext& context, 31 | bool enableAddNorm); 32 | 33 | void load_state_dict(const StateDict& state_dict) override; 34 | void verify_loaded_weights() const override; 35 | void merge_loaded_weights() override; 36 | 37 | protected: 38 | void merge_host_at_weights(); 39 | at::Tensor at_placeholder_; 40 | bool enableAddNorm_; 41 | int rank_id_; 42 | }; 43 | 44 | } // namespace layer 45 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/framework/dit_cache/dit_cache.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include "dit_cache.h" 17 | 18 | namespace xllm { 19 | 20 | bool DiTCache::init(const DiTCacheConfig& cfg) { 21 | active_cache_ = create_dit_cache(cfg); 22 | if (!active_cache_) { 23 | return false; 24 | } 25 | active_cache_->init(cfg); 26 | return true; 27 | } 28 | 29 | bool DiTCache::on_before_block(const CacheBlockIn& blockin) { 30 | return active_cache_->on_before_block(blockin); 31 | } 32 | 33 | CacheBlockOut DiTCache::on_after_block(const CacheBlockIn& blockin) { 34 | return active_cache_->on_after_block(blockin); 35 | } 36 | 37 | bool DiTCache::on_before_step(const CacheStepIn& stepin) { 38 | return active_cache_->on_before_step(stepin); 39 | } 40 | 41 | CacheStepOut DiTCache::on_after_step(const CacheStepIn& stepin) { 42 | return active_cache_->on_after_step(stepin); 43 | } 44 | 45 | } // namespace xllm 46 | -------------------------------------------------------------------------------- /xllm/parser/reasoning_parser.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
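
dit_cache.cpp above delegates every hook to the implementation selected by create_dit_cache(): init once, then bracket each denoising step and each transformer block with before/after hooks. A hedged sketch of that call protocol; DiTCacheConfig and the Cache*In/Out structs are opaque here, so their construction is elided and the boolean semantics are implementation-specific:

```cpp
// Hedged usage sketch of the DiTCache hook protocol implemented above.
xllm::DiTCacheConfig cfg;   // selects an implementation (e.g. DiTNonCache)
xllm::DiTCache cache;
bool ok = cache.init(cfg);  // false if no implementation could be created

xllm::CacheStepIn step_in;  // one denoising step's inputs (fields elided)
bool step_hint = cache.on_before_step(step_in);  // impl-specific hint
// ... run the transformer blocks, bracketing each with
// on_before_block(...) / on_after_block(...) ...
xllm::CacheStepOut step_out = cache.on_after_step(step_in);
```
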
13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | ==============================================================================*/ 16 | 17 | #include "xllm/parser/reasoning_parser.h" 18 | 19 | namespace xllm { 20 | ReasoningParser::ReasoningParser(const std::string& model_type, 21 | bool stream_reasoning, 22 | bool force_reasoning) { 23 | detector_ = DetectorRegistry::getInstance().getDetector( 24 | model_type, stream_reasoning, force_reasoning); 25 | } 26 | 27 | ReasoningResult ReasoningParser::parse_non_stream(const std::string& text) { 28 | return detector_->detect_and_parse(const_cast(text)); 29 | } 30 | 31 | ReasoningResult ReasoningParser::parse_stream_chunk( 32 | const std::string& chunk_text) { 33 | return detector_->parse_streaming_increment( 34 | const_cast(chunk_text)); 35 | } 36 | 37 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/framework/batch/mposition.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | 20 | #include 21 | #include 22 | 23 | namespace xllm { 24 | 25 | class Sequence; 26 | struct ModelArgs; 27 | 28 | class MPositionHelper { 29 | public: 30 | MPositionHelper(Sequence& seq, const ModelArgs& args) 31 | : seq_(seq), args_(args) {} 32 | 33 | torch::Tensor get_positions(); 34 | 35 | private: 36 | std::tuple get_positions_p( 37 | torch::Tensor image_grid_thw, 38 | torch::Tensor video_grid_thw, 39 | torch::Tensor second_per_grid_ts); 40 | std::tuple get_positions_glm( 41 | torch::Tensor image_grid_thw, 42 | torch::Tensor video_grid_thw); 43 | 44 | torch::Tensor get_positions_d(); 45 | 46 | private: 47 | Sequence& seq_; 48 | const ModelArgs& args_; 49 | }; 50 | 51 | } // namespace xllm 52 | -------------------------------------------------------------------------------- /xllm/core/distributed_runtime/pd_ooc_service_impl.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
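
Usage of the ReasoningParser above in both modes. The "qwen3" model_type string and the <think> tag format are assumptions for illustration; which detector actually handles the text depends on DetectorRegistry:

```cpp
#include <string>
// #include "xllm/parser/reasoning_parser.h"  // shown above

int main() {
  // One-shot: split a finished completion into reasoning and answer parts.
  xllm::ReasoningParser parser("qwen3", /*stream_reasoning=*/true);
  auto result = parser.parse_non_stream("<think>plan the answer</think>42");

  // Incremental: feed decoder chunks as they arrive.
  xllm::ReasoningParser stream_parser("qwen3");
  stream_parser.parse_stream_chunk("<think>plan the ");
  stream_parser.parse_stream_chunk("answer</think>4");
  stream_parser.parse_stream_chunk("2");
}
```
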
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "disagg_pd.pb.h" 19 | #include "disagg_pd_service_impl.h" 20 | 21 | namespace xllm { 22 | 23 | class Engine; 24 | class Request; 25 | class PDOOCScheduler; 26 | 27 | // a class to handle disagg_pd OOC requests 28 | class PDOOCServiceImpl final : public DisaggPDServiceImpl { 29 | public: 30 | explicit PDOOCServiceImpl(PDOOCScheduler* scheduler, Engine* engine); 31 | ~PDOOCServiceImpl() = default; 32 | 33 | virtual void decode_recv_multi_generations( 34 | const proto::DisaggGenerationsRequests* request, 35 | proto::Status* response); 36 | 37 | virtual void prefill_recv_pull_signal(const proto::PullSignal* request, 38 | proto::Status* response); 39 | 40 | private: 41 | PDOOCScheduler* pd_ooc_scheduler_; // not owned 42 | }; 43 | 44 | } // namespace xllm 45 | --------------------------------------------------------------------------------
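
Finally, usage of the ServerRegistry singleton from xllm/server/xllm_server_registry.h above; the server name is illustrative:

```cpp
// #include "server/xllm_server_registry.h"  // header shown above

void example() {
  auto& registry = xllm::ServerRegistry::get_instance();
  xllm::XllmServer* server = registry.register_server("http_api");
  // ... configure and start the server ...
  xllm::XllmServer* same = registry.get_server("http_api");  // lookup by name
  registry.unregister_server("http_api");
}
```
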