├── examples ├── __init__.py ├── generate_embedding.py └── generate.py ├── version.txt ├── xllm ├── pybind │ ├── __init__.py │ ├── CMakeLists.txt │ └── util.py ├── core │ ├── kernels │ │ ├── npu │ │ │ ├── CMakeLists.txt │ │ │ ├── xllm_ops │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── replace_token.h │ │ │ │ ├── top_k_top_p.h │ │ │ │ ├── beam_search.h │ │ │ │ └── acltensor_utils.h │ │ │ ├── matmul.cpp │ │ │ └── active.cpp │ │ ├── mlu │ │ │ ├── CMakeLists.txt │ │ │ ├── gather_split.cpp │ │ │ └── random_sample.cpp │ │ ├── cuda │ │ │ ├── CMakeLists.txt │ │ │ └── matmul.cpp │ │ ├── CMakeLists.txt │ │ └── ilu │ │ │ ├── CMakeLists.txt │ │ │ ├── matmul.cpp │ │ │ ├── activation.cpp │ │ │ └── rope.cpp │ ├── framework │ │ ├── tokenizer │ │ │ ├── tokenizers │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── Cargo.toml │ │ │ ├── CMakeLists.txt │ │ │ └── tokenizer_factory.h │ │ ├── state_dict │ │ │ ├── safetensors │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── Cargo.toml │ │ │ └── CMakeLists.txt │ │ ├── prefix_cache │ │ │ ├── prefix_cache_factory.h │ │ │ ├── prefix_cache_factory.cpp │ │ │ ├── prefix_cache_with_upload.h │ │ │ └── CMakeLists.txt │ │ ├── chat_template │ │ │ └── CMakeLists.txt │ │ ├── dit_cache │ │ │ ├── CMakeLists.txt │ │ │ ├── dit_non_cache.h │ │ │ └── dit_cache.cpp │ │ ├── eplb │ │ │ ├── CMakeLists.txt │ │ │ └── eplb_policy_test.cpp │ │ ├── xtensor │ │ │ ├── CMakeLists.txt │ │ │ ├── phy_page.cpp │ │ │ ├── phy_page.h │ │ │ └── options.h │ │ ├── batch │ │ │ ├── CMakeLists.txt │ │ │ └── mposition.h │ │ ├── request │ │ │ ├── dit_request_params.h │ │ │ ├── finish_reason.cpp │ │ │ ├── CMakeLists.txt │ │ │ └── finish_reason.h │ │ ├── kv_cache │ │ │ ├── CMakeLists.txt │ │ │ └── kv_cache_event.h │ │ ├── parallel_state │ │ │ └── CMakeLists.txt │ │ ├── CMakeLists.txt │ │ ├── block │ │ │ └── CMakeLists.txt │ │ ├── sampling │ │ │ └── CMakeLists.txt │ │ └── model │ │ │ └── CMakeLists.txt │ ├── layers │ │ ├── ilu │ │ │ └── CMakeLists.txt │ │ ├── cuda │ │ │ └── CMakeLists.txt │ │ ├── mlu │ │ │ └── CMakeLists.txt │ │ ├── common │ │ │ ├── tests │ │ │ │ └── CMakeLists.txt │ │ │ ├── layer_utils.h │ │ │ ├── activation.h │ │ │ ├── CMakeLists.txt │ │ │ ├── activation.cpp │ │ │ └── layer_utils.cpp │ │ ├── npu │ │ │ ├── loader │ │ │ │ ├── lm_head_loader.h │ │ │ │ ├── column_parallel_linear_loader.h │ │ │ │ ├── word_embedding_loader.h │ │ │ │ ├── rms_norm_loader.h │ │ │ │ ├── llama_decoder_loader.h │ │ │ │ ├── siglip_encoder_loader.h │ │ │ │ ├── qwen3_decoder_loader.h │ │ │ │ └── qwen3_decoder_manual_loader.h │ │ │ └── buffer │ │ │ │ ├── atb_buffer.h │ │ │ │ └── atb_workspace.h │ │ ├── lm_head.h │ │ ├── glm4_decoder_layer.h │ │ ├── word_embedding.h │ │ ├── pos_embedding.h │ │ ├── llama_decoder_layer.h │ │ ├── qwen2_decoder_layer.h │ │ ├── qwen3_decoder_layer.h │ │ ├── glm4_vision_encode_layer.h │ │ ├── qwen2_vision_encode_layer.h │ │ ├── qwen3_vision_encode_layer.h │ │ ├── qwen3_moe_decoder_layer.h │ │ ├── qwen2dot5_vision_encode_layer.h │ │ ├── siglip_encoder_layer.h │ │ └── deepseek_v2_decoder_layer.h │ ├── CMakeLists.txt │ ├── platform │ │ ├── npu │ │ │ ├── CMakeLists.txt │ │ │ └── npu_layer_synchronizer.h │ │ └── CMakeLists.txt │ ├── scheduler │ │ ├── profile │ │ │ └── CMakeLists.txt │ │ ├── scheduler_factory.h │ │ └── CMakeLists.txt │ ├── distributed_runtime │ │ ├── spawn_worker_server │ │ │ └── CMakeLists.txt │ │ └── pd_ooc_service_impl.h │ ├── common │ │ ├── rate_limiter_test.cpp │ │ ├── rate_limiter.h │ │ ├── CMakeLists.txt │ │ ├── interruption_bus.h │ │ └── instance_name.h │ ├── util │ │ ├── pretty_print.h │ │ ├── uuid.h │ │ ├── 
timer.h │ │ ├── uuid.cpp │ │ ├── net.h │ │ ├── type_traits.h │ │ ├── device_name_utils.h │ │ ├── pretty_print.cpp │ │ └── timer.cpp │ └── runtime │ │ └── dit_executor.cpp ├── models │ ├── CMakeLists.txt │ └── llm │ │ └── npu │ │ └── llama3.h ├── cc_api │ ├── examples │ │ ├── start-llm-instance.sh │ │ └── service_request.h │ ├── macros.h │ └── README.md ├── server │ ├── CMakeLists.txt │ └── xllm_server_registry.h ├── parser │ ├── CMakeLists.txt │ ├── reasoning_parser.h │ └── reasoning_parser.cpp ├── launch_xllm.py ├── proto │ ├── CMakeLists.txt │ ├── tensor.proto │ ├── rerank.proto │ ├── models.proto │ └── xtensor_manager.proto ├── function_call │ ├── partial_json_parser │ │ ├── CMakeLists.txt │ │ └── include │ │ │ └── partial_json_parser │ │ │ └── options.h │ └── CMakeLists.txt ├── processors │ ├── CMakeLists.txt │ ├── input_processor.h │ └── pywarpper_image_processor.h ├── api_service │ ├── CMakeLists.txt │ ├── call.h │ ├── qwen3_rerank_service_impl.h │ ├── models_service_impl.h │ └── api_service_impl.h └── __init__.py ├── .style.yapf ├── third_party └── .clang-format ├── cmake ├── CMakeTestRustCompiler.cmake ├── CMakeRustCompiler.cmake.in └── CMakeDetermineRustCompiler.cmake ├── docs ├── assets │ ├── logo.png │ ├── xllm_arch.png │ ├── logo_with_llm.png │ ├── moe_eplevel1.jpg │ ├── moe_eplevel2.jpg │ ├── service_arch.png │ ├── wechat_qrcode.jpg │ ├── pd_architecture.jpg │ ├── eplb_architecture.png │ ├── groupmatmul_performance.png │ ├── globalkvcache_architecture.png │ ├── multi_streams_architecture.jpg │ └── async_schedule_architecture.jpg ├── zh │ ├── xLLM_Technical_Report_zh.pdf │ ├── features │ │ ├── basics.md │ │ ├── continuous_scheduler.md │ │ ├── zero_evict_scheduler.md │ │ ├── multimodal.md │ │ ├── chunked_scheduler.md │ │ ├── prefix_cache.md │ │ ├── topk_topP.md │ │ ├── multi_streams.md │ │ ├── ppmatmul.md │ │ ├── moe_params.md │ │ ├── acl_graph.md │ │ ├── xtensor_memory.md │ │ ├── eplb.md │ │ ├── async_schedule.md │ │ ├── global_kvcache.md │ │ ├── groupgemm.md │ │ └── xllm_service_overview.md │ ├── .readthedocs.yaml │ └── index.md ├── mkdocs │ ├── javascripts │ │ └── mathjax.js │ ├── stylesheets │ │ └── extra.css │ └── overrides │ │ └── .icons │ │ ├── email-fill.svg │ │ └── gitcodeai.svg ├── en │ ├── features │ │ ├── continuous_scheduler.md │ │ ├── zero_evict_scheduler.md │ │ ├── chunked_scheduler.md │ │ ├── prefix_cache.md │ │ ├── topk_topp.md │ │ └── ppmatmul.md │ └── .readthedocs.yaml └── requirements.txt ├── .clang-format ├── .pre-commit-config.yaml ├── cibuild ├── install │ ├── install_ninja.sh │ ├── install_user.sh │ ├── install_ccache.sh │ ├── install_cmake.sh │ ├── install_gcc.sh │ ├── install_python.sh │ └── install_base.sh ├── build_mlu.sh └── build_npu.sh ├── MANIFEST.in ├── .github └── ISSUE_TEMPLATE │ ├── question.yaml │ ├── bug-report.yaml │ └── feature-request.yml ├── .gitignore ├── tools └── README.md ├── .gitmodules └── CONTRIBUTING_zh.md /examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /version.txt: -------------------------------------------------------------------------------- 1 | 0.7.0 2 | -------------------------------------------------------------------------------- /xllm/pybind/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.style.yapf: 
-------------------------------------------------------------------------------- 1 | [style] 2 | based_on_style = google 3 | -------------------------------------------------------------------------------- /third_party/.clang-format: -------------------------------------------------------------------------------- 1 | DisableFormat: true 2 | SortIncludes: Never -------------------------------------------------------------------------------- /cmake/CMakeTestRustCompiler.cmake: -------------------------------------------------------------------------------- 1 | set(CMAKE_Rust_COMPILER_WORKS 1 CACHE INTERNAL "") 2 | -------------------------------------------------------------------------------- /xllm/core/kernels/npu/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | add_subdirectory(xllm_ops) -------------------------------------------------------------------------------- /docs/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/assets/logo.png -------------------------------------------------------------------------------- /docs/assets/xllm_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/assets/xllm_arch.png -------------------------------------------------------------------------------- /docs/assets/logo_with_llm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/assets/logo_with_llm.png -------------------------------------------------------------------------------- /docs/assets/moe_eplevel1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/assets/moe_eplevel1.jpg -------------------------------------------------------------------------------- /docs/assets/moe_eplevel2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/assets/moe_eplevel2.jpg -------------------------------------------------------------------------------- /docs/assets/service_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/assets/service_arch.png -------------------------------------------------------------------------------- /docs/assets/wechat_qrcode.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/assets/wechat_qrcode.jpg -------------------------------------------------------------------------------- /docs/assets/pd_architecture.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/assets/pd_architecture.jpg -------------------------------------------------------------------------------- /docs/assets/eplb_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/assets/eplb_architecture.png -------------------------------------------------------------------------------- /docs/zh/xLLM_Technical_Report_zh.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/zh/xLLM_Technical_Report_zh.pdf -------------------------------------------------------------------------------- /docs/assets/groupmatmul_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/assets/groupmatmul_performance.png -------------------------------------------------------------------------------- /docs/assets/globalkvcache_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/assets/globalkvcache_architecture.png -------------------------------------------------------------------------------- /docs/assets/multi_streams_architecture.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/assets/multi_streams_architecture.jpg -------------------------------------------------------------------------------- /docs/assets/async_schedule_architecture.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jd-opensource/xllm/HEAD/docs/assets/async_schedule_architecture.jpg -------------------------------------------------------------------------------- /docs/zh/features/basics.md: -------------------------------------------------------------------------------- 1 | # 基础知识 2 | 3 | - xLLM使用一卡一进程模式,多卡之间使用rpc进行函数调用,模型计算过程中的数据通信使用device集合通信库。 4 | 5 | - HCCL/LCCL是高性能集合通信,提供单机多卡以及多机多卡间的数据并行、模型并行集合通信方案。 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /xllm/core/framework/tokenizer/tokenizers/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cargo_library) 2 | 3 | cargo_library( 4 | NAME 5 | rust_tokenizers 6 | HDRS 7 | tokenizers.h 8 | ) 9 | 10 | -------------------------------------------------------------------------------- /xllm/core/framework/state_dict/safetensors/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cargo_library) 2 | 3 | cargo_library( 4 | NAME 5 | rust_safetensors 6 | HDRS 7 | safetensors.h 8 | ) 9 | 10 | -------------------------------------------------------------------------------- /xllm/core/layers/ilu/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | cc_library( 4 | NAME 5 | ilu_layers 6 | HDRS 7 | attention.h 8 | SRCS 9 | attention.cpp 10 | DEPS 11 | :common_layers 12 | ) 13 | -------------------------------------------------------------------------------- /xllm/models/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | # Define the library 4 | cc_library( 5 | NAME 6 | models 7 | HDRS 8 | model_registry.h 9 | models.h 10 | SRCS 11 | model_registry.cpp 12 | DEPS 13 | :model 14 | ) 15 | -------------------------------------------------------------------------------- /xllm/core/layers/cuda/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | cc_library( 4 | NAME 5 | cuda_layers 6 | HDRS 7 | attention.h 8 | flashinfer_workspace.h 9 | SRCS 10 | attention.cpp 11 | 
flashinfer_workspace.cpp 12 | DEPS 13 | :common_layers 14 | ) 15 | -------------------------------------------------------------------------------- /xllm/core/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(common) 2 | add_subdirectory(distributed_runtime) 3 | add_subdirectory(framework) 4 | add_subdirectory(kernels) 5 | add_subdirectory(layers) 6 | add_subdirectory(platform) 7 | add_subdirectory(runtime) 8 | add_subdirectory(scheduler) 9 | add_subdirectory(util) -------------------------------------------------------------------------------- /xllm/core/platform/npu/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | cc_library( 4 | NAME 5 | platform_npu 6 | HDRS 7 | npu_layer_synchronizer.h 8 | SRCS 9 | npu_layer_synchronizer.cpp 10 | DEPS 11 | torch_npu 12 | glog::glog 13 | torch 14 | ascendcl 15 | ) -------------------------------------------------------------------------------- /docs/zh/features/continuous_scheduler.md: -------------------------------------------------------------------------------- 1 | # continuous调度器 2 | 3 | ## 功能介绍 4 | xLLM实现了支持continuous batching的调度策略。continuous batching是一种动态批处理策略,它不等待批次填满,而是在有请求时就开始处理,同时持续接收新请求并将其加入正在执行的批次中,从而在保持高吞吐量的同时显著降低延迟。 5 | 6 | ## 使用方式 7 | continuous batching调度策略已在xLLM中实现,如果不开启其它调度策略,则默认使用continuous batching。 8 | 9 | -------------------------------------------------------------------------------- /xllm/core/framework/state_dict/safetensors/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rust_safetensors" 3 | version = "0.6.0" 4 | edition = "2021" 5 | 6 | [lib] 7 | name = "rust_safetensors" 8 | crate-type = ["staticlib"] 9 | 10 | [dependencies] 11 | thiserror = "1.0" 12 | safetensors = "0.6.0" 13 | 14 | -------------------------------------------------------------------------------- /xllm/core/framework/tokenizer/tokenizers/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rust_tokenizers" 3 | version = "0.21.0" 4 | edition = "2018" 5 | 6 | [lib] 7 | name = "rust_tokenizers" 8 | crate-type = ["staticlib"] 9 | 10 | [dependencies] 11 | tokenizers = { version = "0.21.0", default-features = false, features = ["onig"] } 12 | -------------------------------------------------------------------------------- /xllm/core/framework/prefix_cache/prefix_cache_factory.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include <memory> 3 | 4 | #include "prefix_cache.h" 5 | 6 | namespace xllm { 7 | 8 | std::unique_ptr<PrefixCache> create_prefix_cache( 9 | const int32_t block_size, 10 | const bool& enable_cache_upload = false); 11 | 12 | } // namespace xllm 13 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | Language: Cpp 2 | BasedOnStyle: Google 3 | UseTab: Never 4 | IndentWidth: 2 5 | ColumnLimit: 80 6 | 7 | BinPackParameters: false 8 | BinPackArguments: false 9 | ExperimentalAutoDetectBinPacking: false 10 | AllowAllParametersOfDeclarationOnNextLine: false 11 | DerivePointerAlignment: false 12 | PointerAlignment: Left 13 | ... 
14 | -------------------------------------------------------------------------------- /xllm/cc_api/examples/start-llm-instance.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | clear 4 | 5 | # export ASDOPS_LOG_LEVEL=DEBUG 6 | # export ASDOPS_LOG_TO_STDOUT=1 7 | export ASCEND_RT_VISIBLE_DEVICES=12 8 | python3 -c "import torch; import torch_npu; torch_npu.npu.set_device('npu:0')" 9 | 10 | # build/single_llm_instance 11 | build/multiple_llm_instances 12 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # pre-commit install 2 | # pre-commit run --all-files 3 | 4 | repos: 5 | - repo: https://github.com/pre-commit/mirrors-clang-format 6 | rev: v20.1.6 7 | hooks: 8 | - id: clang-format 9 | types_or: [c++, c, cuda] 10 | exclude: ^(cibuild/|tools/|third_party/|cmake/|build/|.*\.ptx\.h$) 11 | 12 | -------------------------------------------------------------------------------- /cibuild/install/install_ninja.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | [ -n "$NINJA_VERSION" ] 6 | 7 | url="https://github.com/ninja-build/ninja/releases/download/v${NINJA_VERSION}/ninja-linux.zip" 8 | 9 | pushd /tmp 10 | wget --no-verbose --output-document=ninja-linux.zip "$url" 11 | unzip ninja-linux.zip -d /usr/local/bin 12 | rm -f ninja-linux.zip 13 | popd -------------------------------------------------------------------------------- /xllm/server/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | cc_library( 4 | NAME 5 | xllm_server 6 | HDRS 7 | xllm_server.h 8 | xllm_server_registry.h 9 | SRCS 10 | xllm_server.cpp 11 | xllm_server_registry.cpp 12 | DEPS 13 | :api_service 14 | :request 15 | absl::strings 16 | glog::glog 17 | proto::xllm_proto 18 | ) 19 | -------------------------------------------------------------------------------- /xllm/core/layers/mlu/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | cc_library( 4 | NAME 5 | mlu_layers 6 | HDRS 7 | attention.h 8 | deepseek_v2_attention.h 9 | deepseek_v2_decoder_layer_impl.h 10 | SRCS 11 | attention.cpp 12 | deepseek_v2_attention.cpp 13 | deepseek_v2_decoder_layer_impl.cpp 14 | DEPS 15 | :common_layers 16 | ) 17 | -------------------------------------------------------------------------------- /cibuild/install/install_user.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | # mirror jenkins user in container 6 | echo "jenkins:x:1000:1000::/var/lib/jenkins:" >> /etc/passwd 7 | echo "jenkins:x:1000:" >> /etc/group 8 | # needed on focal or newer 9 | echo "jenkins:*:19110:0:99999:7:::" >>/etc/shadow 10 | 11 | # allow sudo 12 | echo 'jenkins ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/jenkins -------------------------------------------------------------------------------- /xllm/core/scheduler/profile/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | include(cc_library) 3 | include(cc_test) 4 | 5 | cc_library( 6 | NAME 7 | profile 8 | HDRS 9 | profile_manager.h 10 | time_predictor.h 11 | SRCS 12 | profile_manager.cpp 13 | time_predictor.cpp 14 | DEPS 15 | :batch 16 | :request 17 | :runtime 18 | glog::glog 19 | 
absl::time 20 | ) 21 | -------------------------------------------------------------------------------- /xllm/parser/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | # Define the library 5 | cc_library( 6 | NAME 7 | reasoning 8 | HDRS 9 | reasoning_detector.h 10 | reasoning_parser.h 11 | detector_registry.h 12 | SRCS 13 | reasoning_detector.cpp 14 | reasoning_parser.cpp 15 | detector_registry.cpp 16 | DEPS 17 | absl::strings 18 | glog::glog 19 | ) -------------------------------------------------------------------------------- /cibuild/install/install_ccache.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | [ -n "$CCACHE_VERSION" ] 6 | 7 | ARCH=$(uname -m) 8 | url=https://github.com/ccache/ccache/releases/download/v${CCACHE_VERSION}/ccache-${CCACHE_VERSION}-linux-${ARCH}.tar.xz 9 | 10 | pushd /tmp 11 | curl -L "$url" | xz -d | tar -x 12 | cp ./ccache-${CCACHE_VERSION}-linux-${ARCH}/ccache /usr/bin/ccache 13 | popd 14 | 15 | # set max cache size to 25GiB 16 | /usr/bin/ccache -M 25Gi -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include MANIFEST.in 2 | include CMakeLists.txt 3 | include LICENSE 4 | include .gitmodules 5 | recursive-include src *.* 6 | recursive-include xllm *.py 7 | recursive-include examples *.py 8 | recursive-include third_party * 9 | recursive-include docs *.* 10 | recursive-include tools *.* 11 | recursive-include scripts *.* 12 | recursive-include proto *.* 13 | prune */__pycache__ 14 | global-exclude *.o *.so *.dylib *.a .git *.pyc *.swp 15 | -------------------------------------------------------------------------------- /xllm/core/kernels/mlu/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | file(GLOB_RECURSE MLU_HEADER_FILES 4 | "${CMAKE_CURRENT_LIST_DIR}/*.h" 5 | ) 6 | 7 | file(GLOB_RECURSE MLU_SOURCE_FILES 8 | "${CMAKE_CURRENT_LIST_DIR}/*.cpp" 9 | ) 10 | 11 | cc_library( 12 | NAME 13 | mlu_kernels 14 | HDRS 15 | ${MLU_HEADER_FILES} 16 | SRCS 17 | ${MLU_SOURCE_FILES} 18 | DEPS 19 | torch 20 | cnclep 21 | torch_mlu_ops 22 | python3.10 23 | ) 24 | -------------------------------------------------------------------------------- /docs/zh/features/zero_evict_scheduler.md: -------------------------------------------------------------------------------- 1 | # zero_evict调度器 2 | 3 | ## 功能介绍 4 | xLLM支持zero_evict调度策略。zero_evict调度策略是一种尽可能减少请求淘汰率的调度算法,可以减少淘汰请求的prefill计算,减少TPOT。 5 | 这种调度算法通过模拟轮次,检测请求是否可以被调度且不导致其它请求被淘汰。 6 | 7 | ## 使用方式 8 | 上述策略已在xLLM实现,并向外暴露gflag参数,控制功能的开关。 9 | 10 | - 开启zero_evict策略,并设置max_decode_token_per_sequence。 11 | ``` 12 | --use_zero_evict=true 13 | --max_decode_token_per_sequence=256 14 | ``` 15 | 16 | ## 性能效果 17 | 开启zero_evict之后,在Qwen3-8B模型上,限制E2E时延,TPOT时延 **下降27%**。 -------------------------------------------------------------------------------- /xllm/launch_xllm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import subprocess 4 | import sys 5 | import xllm 6 | 7 | 8 | def launch_xllm(): 9 | system = platform.system() 10 | binary_name = { 11 | "Linux": "xllm", 12 | # "Windows" 13 | # "Darwin" 14 | }.get(system, "xllm") 15 | 16 | bin_path = os.path.join(os.path.dirname(xllm.__file__), binary_name) 17 | 18 | 
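    # Run the bundled native binary, forwarding all CLI arguments unchanged
    # and surfacing its exit code to the caller.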
result = subprocess.run([str(bin_path)] + sys.argv[1:]) 19 | return result.returncode 20 | -------------------------------------------------------------------------------- /xllm/proto/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(proto_library) 2 | 3 | proto_library( 4 | NAME 5 | xllm_proto 6 | SRCS 7 | tensor.proto 8 | common.proto 9 | rec.proto 10 | completion.proto 11 | chat.proto 12 | multimodal.proto 13 | embedding.proto 14 | rerank.proto 15 | models.proto 16 | worker.proto 17 | disagg_pd.proto 18 | xllm_service.proto 19 | xservice.proto 20 | image_generation.proto 21 | xtensor_manager.proto 22 | ) 23 | -------------------------------------------------------------------------------- /xllm/core/framework/state_dict/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_test) 2 | include(cc_library) 3 | 4 | include_directories(..) 5 | 6 | add_subdirectory(safetensors) 7 | 8 | cc_library( 9 | NAME 10 | state_dict 11 | HDRS 12 | state_dict.h 13 | utils.h 14 | rec_vocab_dict.h 15 | SRCS 16 | state_dict.cpp 17 | utils.cpp 18 | rec_vocab_dict.cpp 19 | DEPS 20 | rust_safetensors 21 | torch 22 | glog::glog 23 | Folly::folly 24 | util 25 | ) 26 | 27 | -------------------------------------------------------------------------------- /xllm/core/kernels/npu/xllm_ops/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | cc_library( 4 | NAME 5 | xllm_ops 6 | HDRS 7 | replace_token.h 8 | top_k_top_p.h 9 | acltensor_utils.h 10 | beam_search.h 11 | SRCS 12 | replace_token.cpp 13 | top_k_top_p.cpp 14 | acltensor_utils.cpp 15 | beam_search.cpp 16 | DEPS 17 | atb 18 | torch_npu 19 | gflags::gflags 20 | nlohmann_json::nlohmann_json 21 | opapi 22 | spdlog::spdlog 23 | ) 24 | -------------------------------------------------------------------------------- /docs/mkdocs/javascripts/mathjax.js: -------------------------------------------------------------------------------- 1 | window.MathJax = { 2 | tex: { 3 | inlineMath: [["\\(", "\\)"]], 4 | displayMath: [["\\[", "\\]"]], 5 | processEscapes: true, 6 | processEnvironments: true 7 | }, 8 | options: { 9 | ignoreHtmlClass: ".*|", 10 | processHtmlClass: "arithmatex" 11 | } 12 | }; 13 | 14 | document$.subscribe(() => { 15 | MathJax.startup.output.clearCache() 16 | MathJax.typesetClear() 17 | MathJax.texReset() 18 | MathJax.typesetPromise() 19 | }) -------------------------------------------------------------------------------- /xllm/function_call/partial_json_parser/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | cc_library( 5 | NAME 6 | partial_json_parser 7 | SRCS 8 | src/parser.cpp 9 | INCLUDES 10 | include 11 | DEPS 12 | nlohmann_json::nlohmann_json 13 | ) 14 | 15 | cc_test( 16 | NAME 17 | partial_json_parser_test 18 | SRCS 19 | test/test_examples.cpp 20 | test/test_property_based.cpp 21 | DEPS 22 | :partial_json_parser 23 | GTest::gtest 24 | GTest::gtest_main 25 | ) -------------------------------------------------------------------------------- /xllm/core/framework/chat_template/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | cc_library ( 5 | NAME 6 | chat_template 7 | HDRS 8 | jinja_chat_template.h 9 | SRCS 10 | jinja_chat_template.cpp 11 | DEPS 12 | :minja 13 | 
:tokenizer 14 | nlohmann_json::nlohmann_json 15 | glog::glog 16 | ) 17 | 18 | cc_test ( 19 | NAME 20 | chat_template_test 21 | SRCS 22 | jinja_chat_template_test.cpp 23 | DEPS 24 | :chat_template 25 | GTest::gtest_main 26 | ) 27 | 28 | -------------------------------------------------------------------------------- /cibuild/install/install_cmake.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | [ -n "$CMAKE_VERSION" ] 6 | 7 | # Remove existing CMake installation 8 | rm -f /usr/local/bin/cmake 9 | 10 | path="v${CMAKE_VERSION}" 11 | file="cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz" 12 | 13 | # Download and install specific CMake version in /usr/local 14 | pushd /tmp 15 | wget -q "https://github.com/Kitware/CMake/releases/download/${path}/${file}" 16 | tar -C /usr/local --strip-components 1 --no-same-owner -zxf ${file} 17 | rm -f cmake-*.tar.gz 18 | popd -------------------------------------------------------------------------------- /xllm/core/framework/dit_cache/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | cc_library( 5 | NAME 6 | dit_cache 7 | HDRS 8 | dit_cache_type.h 9 | dit_cache_config.h 10 | dit_cache_impl.h 11 | dit_cache.h 12 | dit_non_cache.h 13 | fbcache.h 14 | fbcache_taylorseer.h 15 | taylorseer.h 16 | SRCS 17 | dit_cache_impl.cpp 18 | dit_cache.cpp 19 | dit_non_cache.cpp 20 | fbcache.cpp 21 | fbcache_taylorseer.cpp 22 | taylorseer.cpp 23 | DEPS 24 | torch 25 | glog::glog 26 | Folly::folly 27 | ) -------------------------------------------------------------------------------- /cmake/CMakeRustCompiler.cmake.in: -------------------------------------------------------------------------------- 1 | 2 | # ported from https://github.com/Devolutions/CMakeRust 3 | set(CMAKE_Rust_COMPILER "@CMAKE_Rust_COMPILER@") 4 | set(CMAKE_Rust_COMPILER_ID "@CMAKE_Rust_COMPILER_ID@") 5 | set(CMAKE_Rust_COMPILER_VERSION "@CMAKE_Rust_COMPILER_VERSION@") 6 | set(CMAKE_Rust_COMPILER_LOADED @CMAKE_Rust_COMPILER_LOADED@) 7 | set(CMAKE_Rust_PLATFORM_ID "@CMAKE_Rust_PLATFORM_ID@") 8 | 9 | SET(CMAKE_Rust_SOURCE_FILE_EXTENSIONS rs) 10 | SET(CMAKE_Rust_LINKER_PREFERENCE 40) 11 | set(CMAKE_Rust_COMPILER_ENV_VAR "RUSTC") 12 | 13 | -------------------------------------------------------------------------------- /xllm/core/kernels/cuda/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | file(GLOB_RECURSE CUDA_HEADER_FILES 4 | "${CMAKE_CURRENT_LIST_DIR}/*.h" 5 | "${CMAKE_CURRENT_LIST_DIR}/*.cuh" 6 | ) 7 | 8 | file(GLOB_RECURSE CUDA_SOURCE_FILES 9 | "${CMAKE_CURRENT_LIST_DIR}/*.cpp" 10 | "${CMAKE_CURRENT_LIST_DIR}/*.cu" 11 | ) 12 | 13 | cc_library( 14 | NAME 15 | cuda_kernels 16 | HDRS 17 | ${CUDA_HEADER_FILES} 18 | SRCS 19 | ${CUDA_SOURCE_FILES} 20 | DEPS 21 | tvm_ffi 22 | torch 23 | :util 24 | :platform 25 | ) 26 | -------------------------------------------------------------------------------- /docs/zh/features/multimodal.md: -------------------------------------------------------------------------------- 1 | # 多模态支持 2 | 本文档主要介绍xLLM推理引擎中多模态的支持进展,包括支持模型及模态类型,以及离在线接口等。 3 | 4 | ## 支持模型 5 | - Qwen2.5-VL: 包括7B/32B/72B。 6 | - Qwen3-VL: 包括2B/4B/8B/32B。 7 | - Qwen3-VL-MoE: 包括A3B/A22B。 8 | - MiniCPM-V-2_6: 7B。 9 | 10 | ## 模态类型 11 | - 图片: 支持单图、多图的输入,以及图片+Prompt组合、纯文本Prompt等输入方式。 12 | 13 | 14 | !!! 
warning "注意事项" 15 | - 目前多模态后端不支持prefix cache以及chunk prefill,正在支持中。 16 | - 目前,xLLM统一基于JinJa渲染ChatTemplate,部署MiniCPM-V-2_6,模型目录需提供ChatTemplate文件。 17 | - 图片支持Base64输入以及图片Url。 18 | - 目前多模态模型主要支持了图片模态,视频、音频等模态正在推进中。 19 | 20 | -------------------------------------------------------------------------------- /xllm/core/distributed_runtime/spawn_worker_server/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_binary) 2 | 3 | cc_binary( 4 | NAME 5 | spawn_worker 6 | HDRS 7 | spawn_worker_server.h 8 | SRCS 9 | spawn_worker_server.cpp 10 | spawn_worker_server_process.cpp 11 | DEPS 12 | :models 13 | :model 14 | :distributed_runtime 15 | absl::strings 16 | xllm_kernels 17 | ascendcl 18 | nnopbase 19 | atb 20 | atb_customize 21 | c_sec 22 | spdlog::spdlog 23 | ) 24 | 25 | add_dependencies(export_module spawn_worker) 26 | -------------------------------------------------------------------------------- /docs/zh/features/chunked_scheduler.md: -------------------------------------------------------------------------------- 1 | # chunked调度器 2 | 3 | ## 功能介绍 4 | xLLM支持chunked prefill调度策略。Chunked prefill是一种优化大语言模型推理的技术,将长prompt分割成多个较小的chunk进行分批处理,而不是一次性处理整个prompt。 5 | 这种方法可以有效降低显存峰值使用量,提高Device利用率,并且能够更好地与decode阶段的请求进行调度和混合处理。 6 | 7 | ## 使用方式 8 | 上述策略已在xLLM实现,并向外暴露gflag参数,控制功能的开关。 9 | 10 | - 开启chunked prefill,并设置chunked_size,如果不手动设置chunked size,则默认等于max_tokens_per_batch。 11 | ```bash 12 | --enable_chunked_prefill=true 13 | --max_tokens_per_chunk_for_prefill=20480 # optional 14 | ``` 15 | 16 | ## 性能效果 17 | 开启chunked_prefill之后,在Qwen3-8B模型上,限制TPOT 50ms,TTFT时延 **下降46%**。 18 | -------------------------------------------------------------------------------- /xllm/core/framework/prefix_cache/prefix_cache_factory.cpp: -------------------------------------------------------------------------------- 1 | #include "prefix_cache_factory.h" 2 | 3 | #include 4 | #include 5 | 6 | #include "prefix_cache_with_upload.h" 7 | 8 | namespace xllm { 9 | 10 | std::unique_ptr create_prefix_cache( 11 | int32_t block_size, 12 | const bool& enable_cache_upload) { 13 | if (enable_cache_upload) { 14 | return std::make_unique(block_size); 15 | } 16 | return std::make_unique(block_size); 17 | } 18 | 19 | } // namespace xllm 20 | -------------------------------------------------------------------------------- /xllm/core/kernels/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | if(USE_NPU) 4 | add_subdirectory(npu) 5 | endif() 6 | 7 | if(USE_MLU) 8 | add_subdirectory(mlu) 9 | endif() 10 | 11 | if(USE_CUDA) 12 | add_subdirectory(cuda) 13 | endif() 14 | 15 | if(USE_ILU) 16 | add_subdirectory(ilu) 17 | endif() 18 | 19 | cc_library( 20 | NAME 21 | kernels 22 | HDRS 23 | param.h 24 | ops_api.h 25 | SRCS 26 | ops_api.cpp 27 | DEPS 28 | torch 29 | $<$:torch_npu_kernels> 30 | $<$:mlu_kernels> 31 | $<$:cuda_kernels> 32 | $<$:ilu_kernels> 33 | ) -------------------------------------------------------------------------------- /docs/zh/features/prefix_cache.md: -------------------------------------------------------------------------------- 1 | # prefix cache 优化 2 | 3 | ## 功能介绍 4 | xLLM支持prefix_cache匹配。prefix_cache基于mermer_hash,使用lru淘汰策略,提供更极致的匹配效率,同时提高prefix_cache命中率。 5 | 同时对prefix_cache进行了优化,支持continuous_scheduler、chunked_scheduler和zero_evict_scheduler,在prefill之后即更新 6 | prefix_cache,提高匹配时效性,同时对于chunked_scheduler,支持多阶段chunked_prefill匹配,减少计算量并尽可能减少kv_cache占用。 7 | 8 | ## 使用方式 9 | prefix_cache已在xLLM实现,并向外暴露gflag参数,控制功能的开关。 
10 | 11 | - 开启prefix_cache功能。 12 | ``` 13 | --enable_prefix_cache=true 14 | ``` 15 | 16 | ## 性能效果 17 | 开启prefix_cache之后,在Qwen3-8B模型上,限制TPOT 50ms,E2E时延 **下降10%**。 18 | 19 | !!! warning "注意" 20 | 暂不支持PD分离调度器 -------------------------------------------------------------------------------- /xllm/core/platform/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | cc_library( 4 | NAME 5 | platform 6 | HDRS 7 | stream.h 8 | device.h 9 | vmm_api.h 10 | SRCS 11 | stream.cpp 12 | device.cpp 13 | vmm_api.cpp 14 | DEPS 15 | torch 16 | $<$<BOOL:${USE_NPU}>:torch_npu> 17 | $<$<BOOL:${USE_NPU}>:ascendcl> 18 | $<$<BOOL:${USE_MLU}>:torch_mlu> 19 | $<$<BOOL:${USE_MLU}>:cnrt> 20 | $<$<BOOL:${USE_MLU}>:cndrv> 21 | $<$,$>:cuda> 22 | $<$,$>:cudart> 23 | ) 24 | 25 | if(USE_NPU) 26 | add_subdirectory(npu) 27 | endif() 28 | -------------------------------------------------------------------------------- /xllm/proto/tensor.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | option go_package = "jd.com/jd-infer/xllm;xllm"; 4 | package xllm.proto; 5 | 6 | message TensorContents 7 | { 8 | repeated bool bool_contents = 1; 9 | repeated int32 int_contents = 2; 10 | repeated int64 int64_contents = 3; 11 | repeated uint32 uint_contents = 4; 12 | repeated uint64 uint64_contents = 5; 13 | repeated float fp32_contents = 6; 14 | repeated double fp64_contents = 7; 15 | repeated bytes bytes_contents = 8; 16 | } 17 | 18 | message Tensor { 19 | string name = 1; 20 | string datatype = 2; 21 | repeated int64 shape = 3; 22 | TensorContents contents = 4; 23 | } -------------------------------------------------------------------------------- /docs/en/features/continuous_scheduler.md: -------------------------------------------------------------------------------- 1 | # Continuous Scheduler 2 | 3 | ## Feature Introduction 4 | xLLM implements a scheduling strategy that supports continuous batching. Continuous batching is a dynamic batching strategy that does not wait for a batch to be filled. Instead, it starts processing as soon as requests are available, while continuously accepting new requests and adding them to the currently executing batch. This approach significantly reduces latency while maintaining high throughput. 5 | 6 | ## Usage 7 | The continuous batching scheduling strategy is implemented in xLLM. If no other scheduling strategies are enabled, continuous batching is used by default. 
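For intuition, the following is a minimal, self-contained C++ sketch of such a loop. All names here (`Request`, `step`, `kMaxBatchSize`) are illustrative only and do not correspond to xLLM's actual scheduler API:

```cpp
#include <cstddef>
#include <deque>
#include <vector>

struct Request {
  bool finished = false;  // set once the sequence emits its final token
};

constexpr std::size_t kMaxBatchSize = 256;

// One model forward pass: prefill for newly admitted requests and
// one decode step for requests already in flight (declaration only).
void step(std::vector<Request*>& batch);

void continuous_batching_loop(std::deque<Request*>& waiting,
                              std::vector<Request*>& running) {
  while (!waiting.empty() || !running.empty()) {
    // Admit new requests as soon as they arrive instead of waiting
    // for a fixed-size batch to fill up.
    while (!waiting.empty() && running.size() < kMaxBatchSize) {
      running.push_back(waiting.front());
      waiting.pop_front();
    }
    step(running);
    // Retire finished sequences immediately so their slots can be
    // reused by incoming requests; this keeps latency low while
    // sustaining high throughput.
    std::erase_if(running, [](Request* r) { return r->finished; });
  }
}
```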
-------------------------------------------------------------------------------- /xllm/core/kernels/ilu/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | set(CMAKE_CUDA_ARCHITECTURES ivcore11) 3 | file(GLOB_RECURSE ILU_HEADER_FILES 4 | "${CMAKE_CURRENT_LIST_DIR}/*.h" 5 | ) 6 | 7 | file(GLOB_RECURSE ILU_SOURCE_FILES 8 | "${CMAKE_CURRENT_LIST_DIR}/*.cpp" 9 | "${CMAKE_CURRENT_LIST_DIR}/*.cu" 10 | ) 11 | 12 | find_package(Python3 REQUIRED COMPONENTS Interpreter Development) 13 | 14 | cc_library( 15 | NAME 16 | ilu_kernels 17 | HDRS 18 | ${ILU_HEADER_FILES} 19 | SRCS 20 | ${ILU_SOURCE_FILES} 21 | DEPS 22 | torch 23 | :util 24 | ixformer_kernels 25 | ixformer 26 | ${Python3_LIBRARIES} 27 | cuinfer 28 | ) 29 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.yaml: -------------------------------------------------------------------------------- 1 | name: ❓ Question 2 | description: Submit a question 3 | title: "[Question]: " 4 | labels: ["question"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't already been addressed. Please search: [existing issues](https://github.com/jd-opensource/xllm/issues). 11 | - type: textarea 12 | attributes: 13 | label: ❓ Describe the question 14 | description: | 15 | Please provide a clear and concise description of your question. 16 | validations: 17 | required: true 18 | - type: markdown 19 | attributes: 20 | value: > 21 | Thanks for contributing 🎉! 22 | -------------------------------------------------------------------------------- /docs/zh/features/topk_topP.md: -------------------------------------------------------------------------------- 1 | # Topk&Topp算子优化 2 | 3 | ## 背景 4 | 在自然语言生成任务中,topK和topP采样策略被广泛应用于控制生成文本的多样性和质量。然而,在小模型中,这两种策略的计算耗时相对较长。这主要是由于小模型的参数较少,导致在处理概率分布时,排序和筛选的效率降低,从而影响了生成速度。因此,优化小模型中topK和topP的实现,可以提升其采样效率。 5 | 6 | 7 | ## 功能介绍 8 | 9 | topKtopP算子的实现将排序、topK、softmax和topP等多个小算子融合为一个大算子,从而提高了计算效率和性能。 10 | 11 | 12 | ## 用户接口 13 | ### 算子调用API 14 | ```c++ 15 | void top_k_top_p(torch::Tensor& logits, 16 | const torch::Tensor& topK, 17 | const torch::Tensor& topP); 18 | ``` 19 | 20 | - `logits`: 输入的logits张量,包含模型的输出分数。 21 | - `topK`: 用于选择的前K个概率的阈值张量。 22 | - `topP`: 用于选择的累积概率的阈值张量。 23 | 24 | 25 | ## 性能效果 26 | 27 | * 使用topKtopP融合算子后,在qwen2-0.5B模型中,TPOT **下降37%**,TTFT **提升10%**。 28 | -------------------------------------------------------------------------------- /xllm/pybind/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(pybind_extension) 2 | 3 | pybind_extension( 4 | NAME 5 | xllm_export 6 | COPTS 7 | -DPY_MODULE_NAME=xllm_export 8 | SRCS 9 | bind.cpp 10 | DEFINES 11 | PYBIND11_DETAILED_ERROR_MESSAGES=1 12 | LINKDIRS 13 | ${TORCH_INSTALL_PREFIX}/lib 14 | DEPS 15 | :master 16 | :request 17 | :util 18 | absl::strings 19 | brpc 20 | gflags::gflags 21 | glog::glog 22 | Python::Module 23 | torch_python 24 | torch 25 | c10 26 | ) 27 | target_link_options(xllm_export PRIVATE -Wl,-Bsymbolic) 28 | target_link_libraries(common PRIVATE leveldb::leveldb ZLIB::ZLIB OpenSSL::SSL OpenSSL::Crypto protobuf::libprotobuf) 29 | add_dependencies(common brpc-static) 30 | 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Visual Studio Code 2 | /.vscode* 3 | 4 | # 
Idea 5 | /.idea 6 | /cmake-build-debug/ 7 | /cmake-build-release/ 8 | 9 | # CMake 10 | /build* 11 | 12 | # vcpkg 13 | /.vcpkg* 14 | 15 | # cache 16 | /.*cache 17 | 18 | # deps 19 | /.deps 20 | 21 | # libtorch 22 | /libtorch 23 | 24 | # tests 25 | /Testing* 26 | 27 | # rust 28 | Cargo.lock 29 | 30 | 31 | # distribution / packaging 32 | .Python 33 | build/ 34 | dist/ 35 | eggs/ 36 | .eggs/ 37 | sdist/ 38 | wheels/ 39 | *.egg-info/ 40 | .installed.cfg 41 | *.egg 42 | MANIFEST 43 | 44 | # Python module builds 45 | *.egg-info/ 46 | xllm/*.pyd 47 | xllm/*.so 48 | xllm/version.py 49 | **/__pycache__/* 50 | 51 | # compile_commands.json from nvbench 52 | compile_commands.json 53 | 54 | # ascend kernel meta files 55 | /kernel_meta 56 | 57 | # local files 58 | /local -------------------------------------------------------------------------------- /docs/en/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | # Set the OS, Python version, and other tools you might need 8 | build: 9 | os: ubuntu-24.04 10 | tools: 11 | python: "3.13" 12 | jobs: 13 | pre_build: 14 | - cp -r docs/en/* docs/ 15 | - find docs/ -name "*.md" -exec sed -i 's#../assets/#assets/#g' {} \; 16 | 17 | # Build documentation with Mkdocs 18 | mkdocs: 19 | configuration: mkdocs_en.yml 20 | 21 | # Optionally, but recommended, 22 | # declare the Python requirements required to build your documentation 23 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 24 | python: 25 | install: 26 | - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /docs/zh/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | # Set the OS, Python version, and other tools you might need 8 | build: 9 | os: ubuntu-24.04 10 | tools: 11 | python: "3.13" 12 | jobs: 13 | pre_build: 14 | - cp -r docs/zh/* docs/ 15 | - find docs/ -name "*.md" -exec sed -i 's#../assets/#assets/#g' {} \; 16 | 17 | # Build documentation with Mkdocs 18 | mkdocs: 19 | configuration: mkdocs_zh.yml 20 | 21 | # Optionally, but recommended, 22 | # declare the Python requirements required to build your documentation 23 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 24 | python: 25 | install: 26 | - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /docs/zh/features/multi_streams.md: -------------------------------------------------------------------------------- 1 | # 多流并行 2 | 3 | ## 背景 4 | 大模型分布式推理场景中需要引入额外的通信操作,将不同设备上的计算结果聚合在一起。以Deepseek这类大规模的MoE模型为例,分布式规模通常较大,通信开销也会随之变大。若计算和通信都采用同一个stream,在通信进行的同时,device计算资源会被闲置浪费,必须等待通信完成才能开始后面的计算。 5 | 6 | 7 | ## 功能介绍 8 | xLLM在模型图层支持了多流并行功能,将输入的batch拆分成2个micro batches,一个流执行一个micro batch的计算操作,另一个流执行另一个micro batch的通信操作,计算和通信同时执行,从而掩盖通信开销。 9 | ![多流并行架构](../../assets/multi_streams_architecture.jpg) 10 | 11 | 12 | ## 使用方式 13 | 14 | xLLM中提供了gflags参数`enable_multi_stream_parallel`,默认false,如需开启在xLLM的服务启动脚本中设置为true即可,示例如下: 15 | ```shell 16 | --enable_multi_stream_parallel=true 17 | ``` 18 | 19 | 20 | ## 性能效果 21 | prefill双流并行开启后,基本可掩盖75%以上的通信开销,在DeepSeek-R1模型上,只输出1个token的情况下 22 | 23 | 
- TTFT下降 **7%** 24 | - 吞吐 **提升7%** 25 | 26 | 27 | !!! warning "注意" 28 | 双流并行目前只支持prefill阶段,请求输入越长,收益越大。 29 | 目前仅支持DeepSeek、Qwen3 dense(非MoE)模型。 -------------------------------------------------------------------------------- /cibuild/build_mlu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | function error() { 5 | echo "Require build command, e.g. python setup.py build" 6 | exit 1 7 | } 8 | 9 | IMAGE="cambricon-base/pytorch:v25.06.0-torch2.7.1-torchmlu1.27.2-ubuntu22.04-py310_xllm251104" 10 | 11 | RUN_OPTS=( 12 | --rm 13 | -t 14 | --privileged 15 | --ipc=host 16 | --network=host 17 | --pid=host 18 | --shm-size '128gb' 19 | -v /export/home:/export/home 20 | -v /usr/bin/cnmon:/usr/bin/cnmon 21 | -v /export/home/mlu_vcpkg_cache:/root/.cache/vcpkg # cached vcpkg installed dir 22 | -w /export/home 23 | ) 24 | 25 | CMD="$*" 26 | [[ -z "${CMD}" ]] && error 27 | 28 | [[ ! -x $(command -v docker) ]] && echo "ERROR: 'docker' command is missing." && exit 1 29 | 30 | docker run "${RUN_OPTS[@]}" "${IMAGE}" bash -c "set -euo pipefail; cd $(pwd); ${CMD}" 31 | -------------------------------------------------------------------------------- /xllm/proto/rerank.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | option go_package = "jd.com/jd-infer/xllm;xllm"; 4 | package xllm.proto; 5 | 6 | import "common.proto"; 7 | 8 | message RerankRequest { 9 | string model = 1; 10 | string query = 2; 11 | repeated string documents = 3; 12 | optional int32 top_n = 4; 13 | optional int32 truncate_prompt_tokens = 5; 14 | 15 | optional string user = 6; 16 | 17 | optional string service_request_id = 7; 18 | } 19 | 20 | message RerankDocument { 21 | string text = 1; 22 | } 23 | 24 | message RerankResult { 25 | int32 index = 1; 26 | 27 | RerankDocument document = 2; 28 | 29 | float relevance_score = 3; 30 | } 31 | 32 | message RerankResponse { 33 | string id = 1; 34 | 35 | string model = 2; 36 | 37 | Usage usage = 3; 38 | 39 | repeated RerankResult results = 4; 40 | } 41 | 42 | -------------------------------------------------------------------------------- /xllm/processors/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | # Define the base dependencies 5 | set(BASE_DEPS 6 | :common 7 | :layers 8 | :kv_cache 9 | :prefix_cache 10 | :block 11 | :chat_template 12 | glog::glog 13 | torch 14 | torch_python 15 | ) 16 | 17 | 18 | # Define the library 19 | cc_library( 20 | NAME 21 | processors 22 | HDRS 23 | image_processor.h 24 | clip_image_processor.h 25 | minicpmv_image_processor.h 26 | qwen2_vl_image_processor.h 27 | glm4v_image_processor.h 28 | pywarpper_image_processor.h 29 | input_processor.h 30 | SRCS 31 | image_processor.cpp 32 | clip_image_processor.cpp 33 | minicpmv_image_processor.cpp 34 | qwen2_vl_image_processor.cpp 35 | glm4v_image_processor.cpp 36 | pywarpper_image_processor.cpp 37 | DEPS 38 | ${BASE_DEPS} 39 | ) 40 | -------------------------------------------------------------------------------- /docs/zh/features/ppmatmul.md: -------------------------------------------------------------------------------- 1 | # PpMatmul 算子优化 2 | 3 | ## 背景 4 | 5 | 针对大模型推理中矩阵乘法占比高、耗时长的问题,优化了矩阵乘法算子的实现。 6 | 7 | ## 功能介绍 8 | 9 | PpMatmul 算子使用 Tiling 切分策略,将矩阵乘法分解为多个小的矩阵乘法任务。然而当 tile 数量较小时任务无法被均匀分配到所有 npu 核心上,导致 tail effect 问题,影响计算效率。我们通过预取内存或重新划分任务的方式,优化 PpMatmul 算子的性能。 10 | 11 | ## 用户接口 12 | 13 | ### 算子直调 
API 14 | 15 | ```cpp 16 | aclnnStatus aclnnPpMatmulOptGetWorkspaceSize( 17 | const aclTensor *a, 18 | const aclTensor *b, 19 | const aclTensor *out, 20 | uint64_t *workspaceSize, 21 | aclOpExecutor **executor); 22 | 23 | aclnnStatus aclnnPpMatmulOpt( 24 | void *workspace, 25 | uint64_t workspaceSize, 26 | aclOpExecutor *executor, 27 | aclrtStream stream); 28 | ``` 29 | 30 | - `a`: 输入矩阵 A。 31 | - `b`: 输入矩阵 B。 32 | - `out`: 输出矩阵,存储计算结果。 33 | 34 | ## 性能效果 35 | 36 | 对于 tile 数量较小的情况(例如 M 较小,对应于 batch size 较小的情况),在(TP=4)时,算子较优化前有 **18%** 的性能提升。 -------------------------------------------------------------------------------- /cmake/CMakeDetermineRustCompiler.cmake: -------------------------------------------------------------------------------- 1 | # ported from https://github.com/Devolutions/CMakeRust 2 | if(NOT CMAKE_Rust_COMPILER) 3 | find_package(Rust) 4 | if(RUST_FOUND) 5 | set(CMAKE_Rust_COMPILER "${RUSTC_EXECUTABLE}") 6 | set(CMAKE_Rust_COMPILER_ID "Rust") 7 | set(CMAKE_Rust_COMPILER_VERSION "${RUST_VERSION}") 8 | set(CMAKE_Rust_PLATFORM_ID "Rust") 9 | endif() 10 | endif() 11 | 12 | message(STATUS "Cargo Home: ${CARGO_HOME}") 13 | message(STATUS "Rust Compiler Version: ${RUSTC_VERSION}") 14 | 15 | mark_as_advanced(CMAKE_Rust_COMPILER) 16 | 17 | if(CMAKE_Rust_COMPILER) 18 | set(CMAKE_Rust_COMPILER_LOADED 1) 19 | endif(CMAKE_Rust_COMPILER) 20 | 21 | configure_file(${CMAKE_CURRENT_LIST_DIR}/CMakeRustCompiler.cmake.in 22 | ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/${CMAKE_VERSION}/CMakeRustCompiler.cmake IMMEDIATE @ONLY) 23 | 24 | set(CMAKE_Rust_COMPILER_ENV_VAR "RUSTC") 25 | 26 | -------------------------------------------------------------------------------- /xllm/proto/models.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | option go_package = "jd.com/jd-infer/xllm;xllm"; 4 | package xllm.proto; 5 | 6 | message ModelCard { 7 | // The model identifier, which can be referenced in the API endpoints. 8 | optional string id = 1; 9 | 10 | // The Unix timestamp (in seconds) when the model was created. 11 | optional uint32 created = 2; 12 | 13 | // the object type, which is always "model". 14 | optional string object = 3; 15 | 16 | // the organization that owns the model. 17 | optional string owned_by = 4 [json_name = "owned_by"]; 18 | } 19 | 20 | message ModelList { 21 | optional string object = 1; 22 | repeated ModelCard data = 2; 23 | } 24 | 25 | message ModelListRequest { 26 | // The model identifier. 27 | // string model = 1; 28 | } 29 | 30 | message ModelListResponse { 31 | // The list of models. 
32 | repeated ModelCard data = 1; 33 | } -------------------------------------------------------------------------------- /xllm/pybind/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import psutil 3 | import signal 4 | import socket 5 | import sys 6 | 7 | def terminate_process(pid, timeout=30): 8 | try: 9 | parent = psutil.Process(pid) 10 | except psutil.NoSuchProcess: 11 | return 12 | 13 | children = parent.children(recursive=True) 14 | procs = children + [parent] 15 | 16 | for p in procs: 17 | try: 18 | p.terminate() 19 | except psutil.NoSuchProcess: 20 | pass 21 | 22 | gone, alive = psutil.wait_procs(procs, timeout=timeout) 23 | for p in alive: 24 | try: 25 | p.kill() 26 | except psutil.NoSuchProcess: 27 | pass 28 | 29 | def get_free_port(): 30 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: 31 | s.bind(('0.0.0.0', 0)) 32 | _, port = s.getsockname() 33 | return port 34 | 35 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.yaml: -------------------------------------------------------------------------------- 1 | name: 🐛 Bug report 2 | description: Raise an issue here if you find a bug. 3 | title: "[Bug]: " 4 | labels: ["bug"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't already been addressed. Please search: [existing issues](https://github.com/jd-opensource/xllm/issues). 11 | - type: textarea 12 | attributes: 13 | label: Your environment 14 | description: | 15 | Please describe the environment you are running in. 16 | validations: 17 | required: true 18 | - type: textarea 19 | attributes: 20 | label: 🐛 Describe the bug 21 | description: | 22 | Please provide a clear and concise description of what the bug is. 23 | validations: 24 | required: true 25 | - type: markdown 26 | attributes: 27 | value: > 28 | Thanks for reporting the bug! -------------------------------------------------------------------------------- /cibuild/install/install_gcc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | [ -n "$GCC_VERSION" ] 6 | 7 | install_ubuntu() { 8 | # Need the official toolchain repo to get alternate packages 9 | add-apt-repository ppa:ubuntu-toolchain-r/test 10 | apt-get update 11 | apt-get install -y g++-$GCC_VERSION 12 | update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-"$GCC_VERSION" 50 13 | update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-"$GCC_VERSION" 50 14 | update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-"$GCC_VERSION" 50 15 | 16 | 17 | # Cleanup package manager 18 | apt-get autoclean && apt-get clean 19 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 20 | } 21 | 22 | 23 | ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') 24 | case "$ID" in 25 | ubuntu) 26 | install_ubuntu 27 | ;; 28 | *) 29 | echo "Unable to determine OS..." 
30 | exit 1 31 | ;; 32 | esac -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | # mkdocs-material 3 | # mkdocs-minify-plugin 4 | # python-markdown-math 5 | # regex 6 | # ruff 7 | 8 | # jinja2~=3.1 9 | # markdown~=3.2 10 | # mkdocs~=1.6 11 | # mkdocs-material-extensions~=1.3 12 | # pygments~=2.16 13 | # pymdown-extensions~=10.2 14 | 15 | # # Requirements for plugins 16 | # babel~=2.10 17 | # colorama~=0.4 18 | # paginate~=0.5 19 | # backrefs~=5.7.post1 20 | # requests~=2.26 21 | 22 | # Requirements for core 23 | jinja2 24 | markdown 25 | mkdocs 26 | mkdocs-material 27 | mkdocs-material-extensions 28 | pygments 29 | pymdown-extensions 30 | mkdocs-minify-plugin 31 | python-markdown-math 32 | 33 | mkdocs-git-revision-date-localized-plugin 34 | # Requirements for plugins 35 | babel 36 | colorama 37 | paginate 38 | backrefs 39 | requests 40 | 41 | # Temporarily pin click until this is resolved in MkDocs, see 42 | # https://github.com/mkdocs/mkdocs/issues/4014#issuecomment-3146508306 43 | click -------------------------------------------------------------------------------- /docs/zh/features/moe_params.md: -------------------------------------------------------------------------------- 1 | # EP并行 2 | ## 背景介绍 3 | 在部署DeepSeek-R1 671B参数规模模型时,传统分布式部署面临显存利用率低、通信开销大、硬件成本高昂等核心瓶颈,因此需要引入ep并行。 4 | + 在同等资源下,单张卡上的Expert越少,可用于KV Cache的显存越多,可Cache的token个数越多。 5 | + 因MLA的特性,同等资源下TP Size越小,冗余的KV Cache就越少,可Cache的token个数越多。 6 | + 采用大规模ep并行部署,可以将同一个expert的token计算集中到同一设备上,提高硬件利用率。 7 | ## 参数设置 8 | + dp_size:设置Attention部分的dp规模大小,默认值为1,可设置为2的指数倍,当dp_size不等于卡数时,dp组内为tp并行. 9 | + ep_size:设置MoE部分的ep规模大小,默认值为1,可设置为2的指数倍,当ep_size不等于卡数时,ep组内为tp并行. 10 | + enable_mla:默认为false,当模型使用mla时需要设置为true. 11 | + expert_parallel_degree:ep并行相关参数,不开启ep时默认设置为0,开启ep时默认为1,此时为ep level1,当ep_size等于卡数时可以设置为2开启ep level2.(完整启动参数示例见下方代码块) 
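下面给出一个与上文参数对应的启动参数示例(以64卡部署、attn部分dp32tp2、moe部分ep32tp2为例)。该示例仅作示意:flag名称沿用本文的参数名,具体名称与取值请以实际版本的gflags定义为准:
```bash
--dp_size=32                # 32个dp组,组内为tp并行(64卡/32组 = tp2)
--ep_size=32                # 32个ep组,组内为tp并行
--enable_mla=true           # DeepSeek-R1使用MLA,需要开启
--expert_parallel_degree=1  # ep_size(32)不等于卡数(64),使用ep level1
```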
12 | ## 方案设计 13 | + 当开启ep时,默认为ep level1,此时attn与moe部分计算完成后,通过All Gather全卡通讯将数据发送到下一阶段,以64卡attn部分dp32tp2 moe部分ep32tp2为例,执行流程如下: 14 | ![ep level1执行流程](../../assets/moe_eplevel1.jpg) 15 | + 当ep_size设置为卡数时,可以开启ep level2,此时attn部分与moe部分之间通讯变为ALL2ALL,只向需要的卡发送数据,降低通讯量与通讯开销,以64卡部署为例,执行流程如下: 16 | ![ep level2执行流程](../../assets/moe_eplevel2.jpg) 17 | -------------------------------------------------------------------------------- /docs/mkdocs/stylesheets/extra.css: -------------------------------------------------------------------------------- 1 | :root > * { 2 | --md-primary-fg-color: #F1002B; 3 | --md-primary-fg-color--light: #F1002B; 4 | --md-primary-fg-color--dark: #af0510; 5 | 6 | --md-accent-fg-color: #F1002B; 7 | --md-accent-fg-color--light: #F1002B; 8 | --md-accent-fg-color--dark: #af0510; 9 | } 10 | 11 | /* :root > * { 12 | --md-footer-bg-color: var(--md-primary-fg-color); 13 | --md-footer-fg-color: var(--md-primary-bg-color); 14 | --md-footer-fg-color--light: var(--md-primary-bg-color--light); 15 | --md-footer-fg-color--lighter: var(--md-primary-bg-color--lighter); 16 | } */ 17 | 18 | [data-md-color-scheme="jd"] { 19 | --md-primary-fg-color: #FB002B; 20 | --md-primary-fg-color--light: #FB002B; 21 | --md-primary-fg-color--dark: #af0510; 22 | 23 | --md-accent-fg-color: #FB002B; 24 | --md-accent-fg-color--light: #FB002B; 25 | --md-accent-fg-color--dark: #af0510; 26 | } 27 | 28 | -------------------------------------------------------------------------------- /xllm/models/llm/npu/llama3.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "llama.h" 19 | 20 | namespace xllm { 21 | // register the causal model 22 | REGISTER_CAUSAL_MODEL(llama3, LlamaForCausalLM); 23 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/common/rate_limiter_test.cpp: -------------------------------------------------------------------------------- 1 | #include "rate_limiter.h" 2 | 3 | #include <gtest/gtest.h> 4 | 5 | #include "global_flags.h" 6 | 7 | namespace xllm { 8 | 9 | TEST(RequestLimiterTest, Basic) { 10 | // Set the maximum number of concurrent requests to 1. 11 | FLAGS_max_concurrent_requests = 1; 12 | RateLimiter rate_limiter; 13 | // The current number of concurrent requests is 0, no rate limiting is 14 | // applied. 15 | EXPECT_EQ(rate_limiter.is_limited(), false); 16 | // The current number of concurrent requests is 1, rate limiting is applied. 17 | EXPECT_EQ(rate_limiter.is_limited(), true); 18 | // Decrease the number of concurrent requests by one, changing the concurrency 19 | // from 1 to 0. 20 | rate_limiter.decrease_one_request(); 21 | // The current number of concurrent requests is 0, no rate limiting is 22 | // applied. 
23 | EXPECT_EQ(rate_limiter.is_limited(), false); 24 | } 25 | 26 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/framework/eplb/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_binary) 2 | include(cc_library) 3 | include(cc_test) 4 | 5 | include_directories( 6 | ${CMAKE_SOURCE_DIR}/xllm/core/kernels/ascend 7 | ${CMAKE_SOURCE_DIR}/xllm/core/kernels/ascend/core/include 8 | ) 9 | 10 | cc_library( 11 | NAME 12 | eplb 13 | HDRS 14 | eplb_executor.h 15 | eplb_manager.h 16 | eplb_policy.h 17 | expert_weight_buffer_shm.h 18 | expert_buffer_manager.h 19 | SRCS 20 | eplb_executor.cpp 21 | eplb_manager.cpp 22 | eplb_policy.cpp 23 | expert_weight_buffer_shm.cpp 24 | expert_buffer_manager.cpp 25 | DEPS 26 | :request 27 | :common 28 | glog::glog 29 | torch 30 | :platform 31 | ) 32 | 33 | set(TEST_SRCS 34 | eplb_policy_test.cpp 35 | ) 36 | 37 | cc_test( 38 | NAME 39 | eplb_policy_test 40 | SRCS 41 | ${TEST_SRCS} 42 | DEPS 43 | torch 44 | :eplb 45 | fmt::fmt 46 | GTest::gtest_main 47 | ) 48 | 49 | -------------------------------------------------------------------------------- /xllm/core/framework/tokenizer/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | add_subdirectory(tokenizers) 5 | 6 | cc_library( 7 | NAME 8 | tokenizer 9 | HDRS 10 | tokenizer_args.h 11 | tokenizer.h 12 | tokenizer_factory.h 13 | tiktoken_tokenizer.h 14 | sentencepiece_tokenizer.h 15 | fast_tokenizer.h 16 | tokenizer_proxy.h 17 | rec_tokenizer.h 18 | SRCS 19 | tokenizer_factory.cpp 20 | tiktoken_tokenizer.cpp 21 | sentencepiece_tokenizer.cpp 22 | fast_tokenizer.cpp 23 | tokenizer_proxy.cpp 24 | rec_tokenizer.cpp 25 | DEPS 26 | :common 27 | :sentencepiece 28 | absl::flat_hash_map 29 | absl::strings 30 | glog::glog 31 | rust_tokenizers 32 | re2::re2 33 | ) 34 | 35 | cc_test( 36 | NAME 37 | fast_tokenizer_test 38 | SRCS 39 | tests/fast_tokenizer_tests.cpp 40 | DEPS 41 | :tokenizer 42 | glog::glog 43 | GTest::gtest_main 44 | ) 45 | 46 | -------------------------------------------------------------------------------- /xllm/core/util/pretty_print.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | #include <string> 19 | 20 | namespace xllm { 21 | 22 | std::string readable_size(size_t bytes); 23 | 24 | } // namespace xllm 25 | -------------------------------------------------------------------------------- /docs/en/features/zero_evict_scheduler.md: -------------------------------------------------------------------------------- 1 | # Zero Evict Scheduler 2 | 3 | ## Feature Introduction 4 | xLLM supports the zero evict scheduling strategy. The zero evict scheduling strategy is an algorithm designed to minimize request eviction rates, reducing the need for prefill recomputation on evicted requests and consequently improving TPOT (Time Per Output Token). 5 | This scheduling algorithm employs simulation rounds to detect whether a request can be scheduled without causing the eviction of other requests. 6 | 7 | ## Usage 8 | The aforementioned strategy has been implemented in xLLM and is exposed through gflags parameters to control the feature's on/off state. 9 | 10 | - Enable the zero evict strategy and set the maximum decode tokens per sequence. 11 | ``` 12 | --use_zero_evict=true 13 | --max_decode_token_per_sequence=256 14 | ``` 15 | 16 | ## Performance Impact 17 | After enabling zero evict, on the Qwen3-8B model with an E2E latency constraint, the TPOT latency **decreased by 27%**. -------------------------------------------------------------------------------- /xllm/core/framework/xtensor/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_binary) 2 | include(cc_library) 3 | 4 | cc_library( 5 | NAME 6 | xtensor 7 | HDRS 8 | options.h 9 | phy_page.h 10 | phy_page_pool.h 11 | xtensor.h 12 | xtensor_manager.h 13 | xtensor_manager_client.h 14 | remote_xtensor_manager.h 15 | xtensor_manager_service.h 16 | xtensor_manager_server.h 17 | xtensor_manager_pool.h 18 | multi_layer_xtensor.h 19 | multi_layer_xtensor_transfer.h 20 | SRCS 21 | phy_page.cpp 22 | phy_page_pool.cpp 23 | xtensor.cpp 24 | xtensor_manager.cpp 25 | xtensor_manager_client.cpp 26 | remote_xtensor_manager.cpp 27 | xtensor_manager_service.cpp 28 | xtensor_manager_server.cpp 29 | xtensor_manager_pool.cpp 30 | multi_layer_xtensor.cpp 31 | multi_layer_xtensor_transfer.cpp 32 | DEPS 33 | torch 34 | :request 35 | :common 36 | glog::glog 37 | proto::xllm_proto 38 | :collective_service 39 | :platform 40 | ) -------------------------------------------------------------------------------- /xllm/proto/xtensor_manager.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | option go_package = "jd.com/jd-infer/xllm;xllm"; 4 | package xllm.proto; 5 | option cc_enable_arenas = true; 6 | option cc_generic_services = true; 7 | 8 | import "common.proto"; 9 | 10 | message SeqId { 11 | int32 seq_id = 1; 12 | } 13 | 14 | message AllocatePagesRequest { 15 | int32 seq_id = 1; 16 | uint64 num_tokens = 2; 17 | } 18 | 19 | message NumPages { 20 | uint64 num_pages = 1; 21 | } 22 | 23 | message Utilization { 24 | double utilization = 1; 25 | } 26 | 27 | // PageManager receives actions from the master engine.
28 | service DistributeXTensorManager { 29 | rpc Hello (Status) returns (Status); 30 | rpc Allocate (AllocatePagesRequest) returns (Status); 31 | rpc Deallocate (SeqId) returns (Empty); 32 | rpc Cache (SeqId) returns (Empty); 33 | rpc NumFreePagesPerLayer (Empty) returns (NumPages); 34 | rpc NumUsedPagesPerLayer (Empty) returns (NumPages); 35 | rpc KvCacheUtilization (Empty) returns (Utilization); 36 | } -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | # NPU Timeline Generation Guide 2 | ## Prerequisites 3 | - Python environment 4 | - Chrome browser (for visualization) 5 | ## Implementation Steps 6 | ### 1. Code Modification 7 | #### Register the subscriber 8 | Add the following at the beginning of your program: 9 | ```cpp 10 | MsptiMetrics::register_subscriber(); 11 | ``` 12 | #### Add tracing to ACLNN functions (works for msprof as well) 13 | Insert the following macro in your ACLNN functions where you want to measure performance: 14 | ```cpp 15 | LLM_MSTX_RANGE(); 16 | ``` 17 | #### Release the subscriber 18 | Add this at the end of your program: 19 | ```cpp 20 | MsptiMetrics::release_subscriber(); 21 | ``` 22 | ### 2. Log Processing 23 | After running your program, process the generated log file using the timeline script: 24 | ```bash 25 | python npu_timeline.py -i custom_log.log -o custom_output.json 26 | ``` 27 | ### 3. Visualization 28 | Open Chrome browser 29 | Navigate to: chrome://tracing 30 | Load the generated JSON file: custom_output.json -------------------------------------------------------------------------------- /xllm/core/framework/prefix_cache/prefix_cache_with_upload.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <vector> 4 | 5 | #include "prefix_cache.h" 6 | #include "util/double_buffer.h" 7 | 8 | namespace xllm { 9 | class PrefixCacheWithUpload final : public PrefixCache { 10 | public: 11 | explicit PrefixCacheWithUpload(uint32_t block_size); 12 | 13 | ~PrefixCacheWithUpload(); 14 | 15 | // insert the token ids and blocks into the prefix tree 16 | // and set hash key to the corresponding block 17 | // return the length of new inserted tokens 18 | size_t insert(const Slice<int32_t>& token_ids, 19 | std::vector<Block>& blocks) override; 20 | 21 | // evict blocks held by the prefix cache 22 | // return the actual number of evicted blocks 23 | size_t evict(size_t n_blocks) override; 24 | 25 | virtual KvCacheEvent* get_upload_kvcache_events() override; 26 | 27 | private: 28 | ThreadPool threadpool_; 29 | 30 | DoubleBuffer<KvCacheEvent> db_kvcache_events_; 31 | }; 32 | 33 | } // namespace xllm 34 | -------------------------------------------------------------------------------- /xllm/core/layers/common/tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_test) 2 | 3 | if(USE_MLU) 4 | list(APPEND TEST_SRCS indexer_tests.cpp mla_tests.cpp deepseek_v2_decoder_layer_tests.cpp) 5 | endif() 6 | 7 | # Add test for common test 8 | cc_test( 9 | NAME 10 | layer_test 11 | SRCS 12 | dense_mlp_tests.cpp 13 | fused_moe_tests.cpp 14 | tests_utils.cpp 15 | ${TEST_SRCS} 16 | DEPS 17 | :common_layers 18 | :parallel_state 19 | :model 20 | :model_context 21 | :state_dict 22 | glog::glog 23 | torch 24 | GTest::gtest_main 25 | ) 26 | 27 | # Add test for DeepEP 28 | # This test must exist individually, because it contains forked processes 29 | # which do not allow
any device init on the main process 30 | cc_test( 31 | NAME 32 | deep_ep_test 33 | SRCS 34 | deep_ep_tests.cpp 35 | tests_utils.cpp 36 | DEPS 37 | :common_layers 38 | :parallel_state 39 | :model 40 | :model_context 41 | :state_dict 42 | GTest::gtest_main 43 | torch 44 | glog::glog 45 | ) 46 | -------------------------------------------------------------------------------- /xllm/cc_api/macros.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | namespace xllm { 19 | 20 | #ifdef XLLM_CAPI_WEAK 21 | #define XLLM_CAPI_EXPORT \ 22 | __attribute__((visibility("default"))) __attribute((weak)) 23 | #else 24 | #define XLLM_CAPI_EXPORT __attribute__((visibility("default"))) 25 | #endif // XLLM_CAPI_WEAK 26 | 27 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/framework/batch/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_binary) 2 | include(cc_library) 3 | include(cc_test) 4 | 5 | cc_library( 6 | NAME 7 | batch 8 | HDRS 9 | dit_batch.h 10 | batch.h 11 | batch_factory.h 12 | batch_input_builder.h 13 | mposition.h 14 | SRCS 15 | dit_batch.cpp 16 | batch.cpp 17 | batch_factory.cpp 18 | batch_input_builder.cpp 19 | mposition.cpp 20 | beam_search.h 21 | DEPS 22 | :request 23 | :runtime 24 | :common 25 | glog::glog 26 | ) 27 | 28 | cc_test( 29 | NAME 30 | batch_test 31 | SRCS 32 | batch_test.cpp 33 | DEPS 34 | :batch 35 | absl::time 36 | GTest::gtest_main 37 | $<$:torch_npu> 38 | ) 39 | target_link_libraries(batch_test 40 | PUBLIC 41 | Python::Python 42 | $<$:ascendcl> 43 | $<$:hccl> 44 | $<$:c_sec> 45 | $<$:nnopbase>) 46 | 47 | -------------------------------------------------------------------------------- /docs/en/features/chunked_scheduler.md: -------------------------------------------------------------------------------- 1 | # Chunked Scheduler 2 | 3 | ## Feature Introduction 4 | xLLM supports the chunked prefill scheduling strategy. Chunked prefill is a technique that optimizes large language model inference by splitting long prompts into smaller chunks for batch processing, rather than processing the entire prompt at once. 5 | This method can effectively reduce peak GPU memory usage, improve device utilization, and better schedule and mix processing with requests from the decode stage. 6 | 7 | ## Usage 8 | The aforementioned strategy has been implemented in xLLM and is exposed through gflags parameters to control the feature's on/off state. 9 | 10 | - Enable chunked prefill and set the chunk size; if the chunk size is not set, it defaults to max_tokens_per_batch.
11 | ```bash 12 | --enable_chunked_prefill=true 13 | --max_tokens_per_chunk_for_prefill=20480 # optional 14 | ``` 15 | 16 | 17 | 18 | ## Performance Impact 19 | After enabling chunked prefill, on the Qwen3-8B model with a TPOT constraint of 50ms, the TTFT latency **decreased by 46%**. -------------------------------------------------------------------------------- /examples/generate_embedding.py: -------------------------------------------------------------------------------- 1 | # python examples/generate_embedding.py --model='/path/models/Qwen3-8B' --devices='npu:0' 2 | # python generate_embedding.py --model='/path/models/Qwen3-8B' --devices='npu:0,npu:1' 3 | 4 | from xllm import ArgumentParser, Embedding, RequestParams 5 | 6 | # Create an EmbeddingLM. 7 | parser = ArgumentParser() 8 | emb = Embedding(**vars(parser.parse_args())) 9 | 10 | # Create request params, including sampling params 11 | request_params = RequestParams() 12 | request_params.is_embeddings = True 13 | request_params.max_tokens = 1 14 | 15 | inputs = [ 16 | "Hello, my name is", 17 | "The president of the United States is", 18 | "The capital of France is", 19 | "The future of AI is", 20 | ] 21 | 22 | outputs = emb.embedding(inputs, request_params, True) 23 | 24 | # Print the outputs. 25 | for i, output in enumerate(outputs): 26 | input_str = output.prompt 27 | generated_embedding = output.outputs[0].embeddings 28 | print(f"Input: {input_str!r}, Generated embedding: {generated_embedding!r}") 29 | 30 | emb.finish() 31 | 32 | -------------------------------------------------------------------------------- /cibuild/build_npu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | function error() { 5 | echo "Require build command, e.g. python setup.py build" 6 | exit 1 7 | } 8 | 9 | IMAGE="quay.io/jd_xllm/xllm-ai:xllm-dev-hb-rc2-x86" 10 | 11 | RUN_OPTS=( 12 | --rm 13 | -t 14 | --privileged 15 | --ipc=host 16 | --network=host 17 | --device=/dev/davinci0 18 | --device=/dev/davinci_manager 19 | --device=/dev/devmm_svm 20 | --device=/dev/hisi_hdc 21 | -v /var/queue_schedule:/var/queue_schedule 22 | -v /usr/local/Ascend/driver:/usr/local/Ascend/driver 23 | -v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi 24 | -v /usr/local/sbin/:/usr/local/sbin/ 25 | -v /export/home:/export/home 26 | -v /export/home/npu_vcpkg_cache:/root/.cache/vcpkg # cached vcpkg installed dir 27 | -v /etc/hccn.conf:/etc/hccn.conf 28 | -w /export/home 29 | ) 30 | 31 | CMD="$*" 32 | [[ -z "${CMD}" ]] && error 33 | 34 | [[ ! -x $(command -v docker) ]] && echo "ERROR: 'docker' command is missing."
&& exit 1 35 | 36 | docker run "${RUN_OPTS[@]}" "${IMAGE}" bash -c "set -euo pipefail; cd $(pwd); ${CMD}" 37 | -------------------------------------------------------------------------------- /xllm/core/framework/request/dit_request_params.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "dit_request_output.h" 11 | #include "dit_request_state.h" 12 | #include "image_generation.pb.h" 13 | #include "request.h" 14 | #include "tensor.pb.h" 15 | namespace xllm { 16 | 17 | struct DiTRequestParams { 18 | DiTRequestParams() = default; 19 | DiTRequestParams(const proto::ImageGenerationRequest& request, 20 | const std::string& x_rid, 21 | const std::string& x_rtime); 22 | 23 | bool verify_params(DiTOutputCallback callback) const; 24 | 25 | // request id 26 | std::string request_id; 27 | std::string x_request_id; 28 | std::string x_request_time; 29 | 30 | std::string model; 31 | 32 | DiTInputParams input_params; 33 | // Mandatory: Generation control parameters (encapsulates all fields related 34 | // to "image generation process") 35 | DiTGenerationParams generation_params; 36 | }; 37 | 38 | } // namespace xllm -------------------------------------------------------------------------------- /docs/zh/features/acl_graph.md: -------------------------------------------------------------------------------- 1 | # ACLGraph 2 | 3 | 4 | ## Feature Introduction 5 | 6 | To optimize host-side scheduling performance, NPU recently introduced ACLGraph, a graph-mode solution similar to CUDA Graph. Compared with the traditional mode, where the CPU submits many small tasks and the NPU frequently launches small kernels, ACLGraph lets the CPU submit one large task and the NPU then streams the small kernels internally, significantly reducing launch time and NPU bubbles. 7 | 8 | To use ACLGraph in the xLLM engine, we implemented the following features: 9 | ### Dynamic-dimension parameterization 10 | - Key dynamic dimensions (such as batch size and sequence length) are passed as inputs to the whole graph, improving flexibility. During graph memory allocation and kernel configuration, these dynamic parameters are used to compute the actual required values, e.g. the block table size via $block\_table\_size = batch\_size \times (max\_seq\_len / block\_size)$. At graph launch time, the actual batch size and maximum sequence length are passed in as parameters to ensure that kernels access data with the correct strides. 11 | 12 | ### Memory pool shared across shapes 13 | - To avoid the waste of giving each shape its own memory buffers (input, output, and intermediate tensors), we use an expandable memory pool. All shapes share the pool's base address; each shape uses a different offset from that base address. 14 | 15 | 16 | ## Usage 17 | 18 | The features above are implemented inside the xLLM engine and are transparent to users; no knowledge of the internals is needed, simply enable the feature where applicable via the gflags parameter `enable_aclgraph`. It defaults to false; to enable it, set it to true in the xLLM service launch script, for example: 19 | ```shell 20 | --enable_aclgraph=true 21 | ``` 22 | 23 | 24 | ## Performance Impact 25 | - With ACLGraph enabled, decode-stage throughput **improves by 8%-10%** on models such as Qwen3-0.6B and Qwen3-1.7B. 26 | 27 | !!! warning "Note" 28 | - When adding ACLGraph support for a new model, check whether the kernels used in the computation implement dynamic-dimension parameterization. If not, the kernels need to be reimplemented. 29 | 30 | !!!
tip "Future Plans" 31 | * Support adapting the communication operations between attention DP and FFN EP in MoE models to different shapes. 32 | -------------------------------------------------------------------------------- /xllm/function_call/partial_json_parser/include/partial_json_parser/options.h: -------------------------------------------------------------------------------- 1 | #ifndef PARTIAL_JSON_PARSER_OPTIONS_H 2 | #define PARTIAL_JSON_PARSER_OPTIONS_H 3 | 4 | namespace partial_json_parser { 5 | 6 | // TypeOptions enum that matches the Go implementation exactly 7 | enum TypeOptions { 8 | STR = 1 << 0, // 1 9 | NUM = 1 << 1, // 2 10 | ARR = 1 << 2, // 4 11 | OBJ = 1 << 3, // 8 12 | NULL_TYPE = 1 << 4, // 16 (using NULL_TYPE to avoid conflict with NULL macro) 13 | BOOL = 1 << 5, // 32 14 | NAN_TYPE = 1 << 6, // 64 (using NAN_TYPE to avoid conflict with NAN macro) 15 | INFINITY_TYPE = 16 | 1 17 | << 7, // 128 (using INFINITY_TYPE to avoid conflict with INFINITY macro) 18 | NEG_INFINITY = 1 << 8, // 256 19 | 20 | // Composite options - exactly matching Go implementation 21 | INF = INFINITY_TYPE | NEG_INFINITY, 22 | SPECIAL = NULL_TYPE | BOOL | INF | NAN_TYPE, 23 | ATOM = STR | NUM | SPECIAL, 24 | COLLECTION = ARR | OBJ, 25 | ALL = ATOM | COLLECTION 26 | }; 27 | 28 | } // namespace partial_json_parser 29 | 30 | #endif // PARTIAL_JSON_PARSER_OPTIONS_H -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.yml: -------------------------------------------------------------------------------- 1 | name: 🚀 Feature request 2 | description: Submit a request for a new feature 3 | title: "[Feature]: " 4 | labels: ["feature"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't already been addressed. Please search: [existing issues](https://github.com/jd-opensource/xllm/issues). 11 | - type: textarea 12 | attributes: 13 | label: 🚀 The motivation and feature 14 | description: > 15 | A clear and concise description of the feature proposal. Please outline the motivation for the proposal. 16 | validations: 17 | required: true 18 | - type: textarea 19 | attributes: 20 | label: Alternatives 21 | description: > 22 | A description of any alternative solutions or features you've considered, if any. 23 | - type: textarea 24 | attributes: 25 | label: Additional context 26 | description: > 27 | Add any other context or screenshots about the feature request. 28 | - type: markdown 29 | attributes: 30 | value: > 31 | Thanks for contributing 🎉! 32 | -------------------------------------------------------------------------------- /xllm/core/common/rate_limiter.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License.
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include <atomic> 19 | 20 | namespace xllm { 21 | 22 | class RateLimiter final { 23 | public: 24 | RateLimiter() = default; 25 | 26 | ~RateLimiter() = default; 27 | 28 | bool is_limited(); 29 | 30 | void decrease_one_request(); 31 | 32 | private: 33 | std::atomic<int32_t> num_concurrent_requests_{0}; 34 | }; 35 | 36 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/kernels/cuda/matmul.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include "cuda_ops_api.h" 17 | 18 | namespace xllm::kernel::cuda { 19 | 20 | torch::Tensor matmul(torch::Tensor a, 21 | torch::Tensor b, 22 | std::optional<torch::Tensor> bias) { 23 | namespace F = torch::nn::functional; 24 | return F::linear(a, b, bias.value_or(torch::Tensor())); 25 | } 26 | 27 | } // namespace xllm::kernel::cuda -------------------------------------------------------------------------------- /xllm/core/kernels/ilu/matmul.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include "ilu_ops_api.h" 17 | 18 | namespace xllm::kernel::ilu { 19 | 20 | torch::Tensor matmul(torch::Tensor a, 21 | torch::Tensor b, 22 | std::optional<torch::Tensor> bias) { 23 | namespace F = torch::nn::functional; 24 | return F::linear(a, b, bias.value_or(torch::Tensor())); 25 | } 26 | 27 | } // namespace xllm::kernel::ilu -------------------------------------------------------------------------------- /xllm/processors/input_processor.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | 20 | #include 21 | 22 | #include "core/framework/request/mm_data.h" 23 | 24 | namespace xllm { 25 | 26 | class InputProcessor { 27 | public: 28 | virtual ~InputProcessor() = default; 29 | 30 | virtual void process(std::string& prompt, const MMData& mm_data) = 0; 31 | }; 32 | 33 | } // namespace xllm 34 | -------------------------------------------------------------------------------- /cibuild/install/install_python.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | PYTHON_VERSION="$1" 6 | shift 7 | 8 | NO_RC_PYTHON_VERSION="${PYTHON_VERSION%rc*}" 9 | 10 | url="https://www.python.org/ftp/python/${NO_RC_PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz" 11 | 12 | pushd /tmp 13 | wget "$url" 14 | tar xvzf "Python-${PYTHON_VERSION}.tgz" 15 | cd "Python-${PYTHON_VERSION}" 16 | 17 | # Extract major and minor version number 18 | MAJOR=$(echo "${PYTHON_VERSION}" | cut -d . -f 1) 19 | MINOR=$(echo "${PYTHON_VERSION}" | cut -d . -f 2) 20 | 21 | INSTALL_FOLDER="/opt/python/cp${MAJOR}${MINOR}-cp${MAJOR}${MINOR}" 22 | 23 | ./configure \ 24 | --enable-shared \ 25 | --enable-ipv6 \ 26 | --prefix=${INSTALL_FOLDER} \ 27 | LDFLAGS=-Wl,-rpath=${INSTALL_FOLDER}/lib,--disable-new-dtags 28 | 29 | make -j$(nproc) install 30 | # upgrade pip, setuptools and wheel 31 | ${INSTALL_FOLDER}/bin/python3 -m pip install --upgrade pip setuptools wheel 32 | # create symlinks 33 | cp ${INSTALL_FOLDER}/bin/pip3 ${INSTALL_FOLDER}/bin/pip 34 | ln -s ${INSTALL_FOLDER}/bin/python3 ${INSTALL_FOLDER}/bin/python 35 | 36 | rm -rf "Python-${PYTHON_VERSION}" 37 | popd 38 | 39 | -------------------------------------------------------------------------------- /xllm/core/framework/xtensor/phy_page.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "phy_page.h" 17 | 18 | namespace xllm { 19 | PhyPage::PhyPage(torch::Device device) : device_(device) { 20 | int32_t device_id = device_.index(); 21 | 22 | // create a physical memory handle for the device 23 | vmm::create_phy_mem_handle(phy_handle_, device_id); 24 | } 25 | 26 | PhyPage::~PhyPage() { vmm::release_phy_mem_handle(phy_handle_); } 27 | } // namespace xllm -------------------------------------------------------------------------------- /docs/en/features/prefix_cache.md: -------------------------------------------------------------------------------- 1 | # Prefix Cache Optimization 2 | 3 | ## Feature Introduction 4 | xLLM supports prefix cache matching. The prefix cache is based on `murmur_hash` and uses an LRU eviction policy, delivering superior matching efficiency and increased prefix cache hit rates. 5 | Additionally, the prefix cache has been optimized to support the `continuous_scheduler`, `chunked_scheduler`, and `zero_evict_scheduler`. The cache is updated immediately after prefill operations, enhancing matching timeliness. For the `chunked_scheduler`, multi-stage chunked prefill matching is supported, reducing computational overhead and minimizing KV cache usage as much as possible. 6 | 7 | ## Usage 8 | The prefix cache is implemented in xLLM and exposed through gflags parameters to control its functionality. 9 | 10 | - Enable prefix cache with specific policy and settings: 11 | ``` 12 | --enable_prefix_cache=true 13 | ``` 14 | 15 | ## Performance Impact 16 | After enabling prefix cache, on the Qwen3-8B model with a TPOT constraint of 50ms, the E2E latency **decreased by 10%**. 17 | 18 | !!! warning "Note" 19 | PD separation scheduler is not currently supported. 
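To make the matching scheme above concrete, here is a minimal, self-contained C++ sketch of hash-chained block matching with LRU eviction. It is illustrative only, not xLLM's implementation: the names (`SimplePrefixCache`, `chain_hash`, `touch`) are invented for this example, and a simple FNV-style mix stands in for the `murmur_hash` the real cache uses.

```cpp
// Illustrative sketch only. Each fixed-size block of token ids is keyed by a
// hash that chains in the previous block's hash, so a key hit on block k
// implies the whole k-block prefix matches; blocks are evicted in LRU order.
#include <cstdint>
#include <iostream>
#include <list>
#include <unordered_map>
#include <vector>

class SimplePrefixCache {
 public:
  explicit SimplePrefixCache(size_t block_size) : block_size_(block_size) {}

  // Insert all full blocks of `tokens`; returns the number of new blocks.
  size_t insert(const std::vector<int32_t>& tokens) {
    size_t inserted = 0;
    uint64_t prev = 0;
    for (size_t i = 0; i + block_size_ <= tokens.size(); i += block_size_) {
      prev = chain_hash(prev, tokens, i);
      if (map_.find(prev) == map_.end()) {
        lru_.push_front(prev);
        map_[prev] = lru_.begin();
        ++inserted;
      } else {
        touch(prev);  // refresh LRU position on re-insert
      }
    }
    return inserted;
  }

  // Return how many leading tokens are already cached (block granularity).
  size_t match(const std::vector<int32_t>& tokens) {
    size_t matched = 0;
    uint64_t prev = 0;
    for (size_t i = 0; i + block_size_ <= tokens.size(); i += block_size_) {
      prev = chain_hash(prev, tokens, i);
      if (map_.find(prev) == map_.end()) break;
      touch(prev);
      matched += block_size_;
    }
    return matched;
  }

  // Evict up to n least-recently-used blocks; returns how many were evicted.
  size_t evict(size_t n) {
    size_t evicted = 0;
    while (evicted < n && !lru_.empty()) {
      map_.erase(lru_.back());
      lru_.pop_back();
      ++evicted;
    }
    return evicted;
  }

 private:
  // xLLM uses murmur_hash here; an FNV-style mix is a stand-in.
  uint64_t chain_hash(uint64_t prev, const std::vector<int32_t>& t, size_t b) {
    uint64_t h = prev ^ 1469598103934665603ULL;  // fold in the parent's hash
    for (size_t j = b; j < b + block_size_; ++j) {
      h = (h ^ static_cast<uint32_t>(t[j])) * 1099511628211ULL;
    }
    return h;
  }

  void touch(uint64_t key) {
    lru_.splice(lru_.begin(), lru_, map_[key]);  // move to MRU position
  }

  size_t block_size_;
  std::list<uint64_t> lru_;  // front = most recently used
  std::unordered_map<uint64_t, std::list<uint64_t>::iterator> map_;
};

int main() {
  SimplePrefixCache cache(/*block_size=*/4);
  std::vector<int32_t> a = {1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<int32_t> b = {1, 2, 3, 4, 9, 9, 9, 9};
  cache.insert(a);
  std::cout << cache.match(b) << "\n";  // prints 4: only block 0 matches
  return 0;
}
```

Because each block's key chains in its parent's key, equal keys imply equal prefixes, which is what lets the scheduler reuse KV cache at block granularity immediately after prefill.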
-------------------------------------------------------------------------------- /docs/mkdocs/overrides/.icons/email-fill.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /xllm/core/framework/kv_cache/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_binary) 2 | include(cc_library) 3 | include(cc_test) 4 | 5 | 6 | cc_library( 7 | NAME 8 | kv_cache 9 | HDRS 10 | embedding_allocator.h 11 | $<$:hccl_kv_cache_transfer.h> 12 | kv_cache.h 13 | kv_cache_event.h 14 | kv_cache_transfer.h 15 | $<$:llm_data_dist_transfer.h> 16 | $<$:spec_kv_cache_transfer.h> 17 | kv_cache_store.h 18 | hierarchy_kv_cache_transfer.h 19 | SRCS 20 | embedding_allocator.cpp 21 | $<$:hccl_kv_cache_transfer.cpp> 22 | kv_cache.cpp 23 | kv_cache_transfer.cpp 24 | $<$:llm_data_dist_transfer.cpp> 25 | $<$:spec_kv_cache_transfer.cpp> 26 | kv_cache_store.cpp 27 | hierarchy_kv_cache_transfer.cpp 28 | DEPS 29 | :common 30 | $<$:graph> 31 | glog::glog 32 | $<$:hccl_transfer> 33 | $<$:llm_datadist> 34 | torch 35 | $<$:torch_npu> 36 | mooncake_store 37 | :xtensor 38 | $<$:platform_npu> 39 | ) 40 | -------------------------------------------------------------------------------- /xllm/core/framework/parallel_state/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | cc_library( 5 | NAME 6 | parallel_state 7 | HDRS 8 | mapping_npu.h 9 | parallel_args.h 10 | parallel_state.h 11 | process_group.h 12 | $<$:npu_process_group.h> 13 | $<$:mlu_process_group.h> 14 | $<$:cuda_process_group.h> 15 | $<$:ilu_process_group.h> 16 | collective_communicator.h 17 | SRCS 18 | mapping_npu.cpp 19 | parallel_state.cpp 20 | process_group.cpp 21 | $<$:npu_process_group.cpp> 22 | collective_communicator.cpp 23 | DEPS 24 | :common 25 | torch 26 | $<$:torch_mlu> 27 | $<$:hccl> 28 | glog::glog 29 | ) 30 | 31 | if(USE_NPU) 32 | cc_test( 33 | NAME 34 | mapping_npu_test 35 | SRCS 36 | mapping_npu_test.cpp 37 | DEPS 38 | parallel_state 39 | absl::synchronization 40 | absl::time 41 | GTest::gtest_main 42 | xllm_kernels 43 | ascendcl 44 | atb 45 | c_sec 46 | spdlog::spdlog 47 | ) 48 | endif() 49 | -------------------------------------------------------------------------------- /xllm/core/kernels/npu/xllm_ops/replace_token.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | #include 18 | #include 19 | 20 | #include 21 | 22 | #include "acl/acl.h" 23 | #include "aclnn_replace_token.h" 24 | #include "acltensor_utils.h" 25 | #include "util/tensor_helper.h" 26 | 27 | namespace xllm_ops { 28 | void replace_token(torch::Tensor& forked, torch::Tensor& lastStepOutPut); 29 | } // namespace xllm_ops 30 | -------------------------------------------------------------------------------- /xllm/core/layers/common/layer_utils.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | #include "framework/model/model_input_params.h" 18 | #include "framework/parallel_state/parallel_args.h" 19 | 20 | namespace xllm { 21 | namespace layer { 22 | 23 | void update_dummy_run_input(int64_t dp_rank, 24 | torch::Tensor& positions, 25 | ModelInputParams& input_params); 26 | 27 | } // namespace layer 28 | } // namespace xllm 29 | -------------------------------------------------------------------------------- /xllm/core/layers/npu/loader/lm_head_loader.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "base_loader.h" 17 | 18 | namespace xllm { 19 | namespace layer { 20 | class LmHeadLoader : public BaseLoader { 21 | public: 22 | LmHeadLoader(uint64_t weight_count, const ModelContext& context); 23 | 24 | void load_state_dict(const StateDict& state_dict) override; 25 | void verify_loaded_weights(const std::string& weight_str) const override; 26 | }; 27 | } // namespace layer 28 | } // namespace xllm 29 | -------------------------------------------------------------------------------- /docs/zh/features/xtensor_memory.md: -------------------------------------------------------------------------------- 1 | # xTensor Memory Management 2 | 3 | ## Background 4 | 5 | Current LLM inference engines use a block-based approach that allocates one large chunk of contiguous device memory up front to store the KV cache. This scatters the KV cache across discrete blocks and prevents dynamic growing/shrinking. 6 | 7 | Both GPU and NPU provide virtual memory management (VMM) APIs. The VMM API decouples the allocation of virtual addresses from physical addresses and maps physical memory onto virtual memory on demand, enabling elastic physical-memory allocation while keeping virtual memory contiguous. 8 | 9 | On top of the VMM API, we implemented contiguous KV cache storage with on-demand physical memory allocation, as well as an attention operator for the decode stage that works on the contiguous KV cache. 10 | 11 | ## Main Interfaces 12 | * `PhyPage`: wraps a physical page. 13 | * `XTensor`: wraps virtual memory. 14 | * `PageAllocator`: manages allocation and reclamation of `PhyPage`s on one device. 15 | * `PageManager`: manages mapping and unmapping between virtual and physical memory on one device. 16 | * `PageManagerPool`: manages the `PageManager`s of all devices. 17 | 18 | ## Usage 19 | Simply add the following gflag parameter when launching xLLM: 20 | 21 | ```bash 22 | --enable_continuous_kvcache=true 23 | ``` 24 | 25 | !!! warning "Note" 26 | This approach does not yet support prefix caching, chunked prefill, disaggregated PD, or speculative decoding; disable them when using it: 27 | ```bash 28 | --enable_prefix_cache=false 29 | --enable_chunked_prefill=false 30 | --enable_disagg_pd=false 31 | --num_speculative_tokens=0 32 | ``` 33 | 34 | !!! tip "Future Plans" 35 | * Use the VMM API to manage the KV cache and activations together and dynamically balance the physical memory used by each. 36 | * Use the VMM API to dynamically resize the KV caches of multiple LLMs sharing the same GPUs for efficient load balancing. 37 | 38 | -------------------------------------------------------------------------------- /docs/zh/features/eplb.md: -------------------------------------------------------------------------------- 1 | # MoE Load Balancing 2 | 3 | ## Background 4 | 5 | MoE models rely on dynamic routing to assign tokens to experts, but in real deployments uneven data distribution leads to imbalanced expert load (some experts overloaded, others idle). Adjusting expert redundancy (e.g. adding/removing replicas) costs extra device memory and may hurt inference latency due to weight migration, so doing it efficiently and smoothly is a major challenge. To address this, we combine an expert-redundancy strategy (replicating hot experts) with hierarchical and global dynamic load balancing to achieve dynamic MoE load balancing. 6 | 7 | ## Feature Introduction 8 | The xLLM EPLB feature is implemented by three modules: 9 | - eplb manager: collects expert load statistics and manages expert distribution updates; uses a layer-by-layer update mechanism that decides whether to update a layer based on how its expert load changes. 10 | - eplb executor: executes the actual expert distribution updates. 11 | - eplb policy: the strategy that generates the new expert placement table. 12 | The overall architecture is shown below: 13 | ![xLLM eplb](../../assets/eplb_architecture.png) 14 | 15 | ## Usage 16 | Simply add the gflag parameters below when launching xLLM, 17 | replacing the values with your actual device count; ep_size must match the number of devices. 18 | 19 | - xLLM provides the gflags parameter `enable_eplb`, which defaults to false; to enable dynamic expert load balancing, set it to true in the xLLM service launch script. 20 | - `expert_parallel_degree` and `ep_size` are MoE-related parameters: `expert_parallel_degree` must be set to `2`, and `ep_size` must match the actual number of NPU/GPU devices. See [moe_params](./moe_params.md). 21 | - `eplb_update_interval` is the interval between expert distribution updates, in seconds; the default is 1000.
22 | - Expert distribution updates use a load-driven, layer-by-layer mechanism: a layer is updated when the similarity between its two most recent expert-load snapshots falls below `eplb_update_threshold`, which defaults to 1 and takes values in (0,1). 23 | 24 | ```bash 25 | --enable_eplb=true 26 | --expert_parallel_degree=2 27 | --ep_size=16 28 | --eplb_update_interval=2000 29 | --eplb_update_threshold=0.9 30 | ``` 31 | 32 | ## Future Work 33 | * Adopt a finer-grained expert update mechanism. 34 | 35 | * Integrate with the scheduling layer to achieve better load balancing by regrouping request batches. 
 -------------------------------------------------------------------------------- /examples/generate.py: -------------------------------------------------------------------------------- 1 | # python examples/generate.py --model='/path/models/Qwen2-7B-Instruct' --devices='npu:0' 2 | # python generate.py --model='/path/models/Qwen2-7B-Instruct' --devices='npu:0,npu:1' 3 | 4 | from xllm import ArgumentParser, LLM, RequestParams 5 | 6 | # Create an LLM. 7 | parser = ArgumentParser() 8 | llm = LLM(**vars(parser.parse_args())) 9 | 10 | # Create request params, including sampling params 11 | request_params = RequestParams() 12 | request_params.temperature = 0.8 13 | request_params.top_p = 0.95 14 | request_params.max_tokens = 10 15 | 16 | # Generate texts from the prompts. The output is a list of RequestOutput 17 | # objects that contain the prompt, generated text, and other information. 18 | prompts = [ 19 | "Hello, my name is", 20 | "The president of the United States is", 21 | "The capital of France is", 22 | "The future of AI is", 23 | ] 24 | 25 | outputs = llm.generate(prompts, request_params, True) 26 | 27 | # Print the outputs. 28 | for i, output in enumerate(outputs): 29 | prompt = output.prompt 30 | generated_text = output.outputs[0].text 31 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 32 | 33 | llm.finish() 34 | 35 | -------------------------------------------------------------------------------- /xllm/processors/pywarpper_image_processor.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License.
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | 20 | #include "image_processor.h" 21 | 22 | namespace xllm { 23 | 24 | struct MMData; 25 | 26 | class PyWarpperImageProcessor : public ImageProcessor { 27 | public: 28 | PyWarpperImageProcessor(const ModelArgs&); 29 | ~PyWarpperImageProcessor() override = default; 30 | 31 | bool process(const MMInput& mm_inputs, MMData& mm_datas) override; 32 | }; 33 | 34 | } // namespace xllm 35 | -------------------------------------------------------------------------------- /docs/zh/features/async_schedule.md: -------------------------------------------------------------------------------- 1 | # Asynchronous Scheduling 2 | 3 | ## Background 4 | LLM inference can be divided into 3 stages: the CPU runs scheduling and prepares model inputs, the device computes, and the CPU processes the outputs. 5 | Because decoding is sequential, the input of step i+1 depends on the output of step i, 6 | so the 3 stages must execute serially. While the CPU runs stages 1 and 3, the device sits idle, creating bubbles and underutilizing resources. 7 | 8 | 9 | 10 | ## Feature Introduction 11 | 12 | xLLM supports asynchronous scheduling at the framework layer: while the device computes step i, the CPU runs the scheduling work for step i+1 ahead of time, so the device can start step i+1 immediately after finishing step i, eliminating the bubble. 13 | Concretely, after launching the step-i computation the CPU does not wait for the device to finish; it constructs fake tokens for the step-i requests and uses them to run the step-i+1 scheduling, allocate KV cache, and so on. When the device starts the step-i+1 computation, the fake tokens are replaced with the true tokens produced by step i, ensuring correctness. A separate CPU thread processes the step-i results and returns them to the client. 14 | 15 | The overall architecture is shown in the figure. In the implementation, the CPU-side stage-1 and stage-3 work runs on separate thread pools, and RPC and other function calls use non-blocking C++ futures and promises, yielding a fully asynchronous runtime. ![Asynchronous scheduling](../../assets/async_schedule_architecture.jpg) 16 | 17 | 18 | ## Usage 19 | 20 | xLLM provides the gflags parameter `enable_schedule_overlap`, which defaults to true; to disable it, set it to false in the xLLM service launch script, for example: 21 | ```shell 22 | --enable_schedule_overlap=false 23 | ``` 24 | 25 | 26 | ## Performance Impact 27 | - With asynchronous scheduling enabled, device idle time between two steps is around 200us, roughly the time of one kernel launch. 28 | - On the DeepSeek-R1-Distill-Qwen-1.5B model with a 50ms TPOT constraint, throughput **improves by 17%**. 29 | 30 | 31 | !!! warning "Note" 32 | - Asynchronous scheduling computes one extra step on the server side. In scenarios with few output tokens, or one-shot outputs such as embedding models, this hurts server throughput, so asynchronous scheduling is forcibly disabled there. 33 | - VLM models are still being adapted; asynchronous scheduling is forcibly disabled for them for now. -------------------------------------------------------------------------------- /docs/zh/features/global_kvcache.md: -------------------------------------------------------------------------------- 1 | # Global Multi-Level KV Cache 2 | ## Background 3 | In the decode stage, autoregressive generation makes large language models (LLMs) access the historical KV cache frequently, so memory bandwidth becomes the bottleneck. As model sizes and context windows grow (e.g. 128K tokens consume over 40GB of device memory), single-device memory pressure rises sharply. Existing solutions (such as vLLM) have clear limitations with long contexts: prefill time soars and memory-bandwidth contention in the decode stage is severe; to meet SLOs (TTFT<2s, TBT<100ms) they often over-provision resources, leaving GPU utilization below 40% and making cross-server resources hard to use. We therefore propose a distributed global multi-level KV cache management system with a unified storage-compute architecture to break through single-machine resource limits. 4 | 5 | ## Feature Introduction 6 | The xLLM global KV cache feature is implemented by three modules: 7 | - etcd: cluster service registration, load information synchronization, and global cache state management 8 | - xLLM Service: schedules requests and manages all compute instances 9 | - xLLM: the compute instance serving requests 10 | 11 | The overall architecture is shown below: 12 | ![xLLM global multi-level KV cache](../../assets/globalkvcache_architecture.png) 13 | ## Usage Example 14 | ### Preparation 15 | #### Install dependencies 16 | - **xLLM**: see [Compilation](../getting_started/compile.md) 17 | - **xLLM Service**: see [PD-disaggregated deployment](../getting_started/PD_disagg.md) 18 | 19 | ### Usage 20 | 1. Launch etcd: 21 | ```bash 22 | ./etcd --listen-peer-urls=http://0.0.0.0:10999 --listen-client-urls=http://0.0.0.0:10998 23 | ``` 24 | 2. Launch xLLM Service: 25 | ```bash 26 | ./xllm_master_serving --etcd_addr="127.0.0.1:10998" --http_server_port 28888 --rpc_server_port 28889 --tokenizer_path=/path/to/tokenizer_config_dir/ 27 | ``` 28 | 3. 
Add the following gflag parameters when launching xLLM: 29 | ```bash 30 | --enable_service_routing=true 31 | --enable_cache_upload=true 32 | # PD disaggregation does not yet support global KV cache management 33 | --enable_disagg_pd=false 34 | ``` -------------------------------------------------------------------------------- /xllm/core/framework/xtensor/phy_page.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | #include <torch/torch.h> 18 | 19 | #include "platform/vmm_api.h" 20 | 21 | namespace xllm { 22 | class PhyPage { 23 | public: 24 | PhyPage(torch::Device device); 25 | 26 | ~PhyPage(); 27 | 28 | const torch::Device& device() const { return device_; } 29 | 30 | PhyMemHandle get_phy_handle() const { return phy_handle_; } 31 | 32 | private: 33 | torch::Device device_; 34 | PhyMemHandle phy_handle_; 35 | }; 36 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/layers/npu/loader/column_parallel_linear_loader.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include "base_loader.h" 17 | 18 | namespace xllm { 19 | namespace layer { 20 | class ColumParallelLinearLoader : public BaseLoader { 21 | public: 22 | ColumParallelLinearLoader(uint64_t weight_count, const ModelContext& context); 23 | 24 | void load_state_dict(const StateDict& state_dict) override; 25 | void verify_loaded_weights(const std::string& weight_str) const override; 26 | }; 27 | } // namespace layer 28 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/layers/lm_head.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "config.h" 19 | 20 | namespace xllm { 21 | namespace layer { 22 | 23 | class LmHead : public torch::nn::ModuleHolder<LmHeadImpl> { 24 | public: 25 | using torch::nn::ModuleHolder<LmHeadImpl>::ModuleHolder; 26 | using Impl __attribute__((__unused__)) = LmHeadImpl; 27 | 28 | LmHead(const ModelContext& context) 29 | : ModuleHolder(std::make_shared<LmHeadImpl>(context)) {} 30 | }; 31 | 32 | } // namespace layer 33 | } // namespace xllm 34 | -------------------------------------------------------------------------------- /xllm/core/util/uuid.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | #include <absl/random/random.h> 19 | 20 | #include <string> 21 | 22 | namespace xllm { 23 | 24 | class ShortUUID { 25 | public: 26 | ShortUUID() = default; 27 | 28 | std::string random(size_t len = 0); 29 | 30 | private: 31 | std::string alphabet_ = 32 | "23456789ABCDEFGHJKLMNPQRSTUVWXYZ" 33 | "abcdefghijkmnopqrstuvwxyz"; 34 | absl::BitGen gen_; 35 | }; 36 | 37 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/api_service/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | cc_library( 4 | NAME 5 | api_service 6 | HDRS 7 | api_service.h 8 | api_service_impl.h 9 | call.h 10 | completion_service_impl.h 11 | rec_completion_service_impl.h 12 | chat_service_impl.h 13 | embedding_service_impl.h 14 | image_generation_service_impl.h 15 | rerank_service_impl.h 16 | qwen3_rerank_service_impl.h 17 | non_stream_call.h 18 | service_impl_factory.h 19 | stream_call.h 20 | models_service_impl.h 21 | stream_output_parser.h 22 | mm_service_utils.h 23 | SRCS 24 | api_service.cpp 25 | call.cpp 26 | completion_service_impl.cpp 27 | rec_completion_service_impl.cpp 28 | chat_service_impl.cpp 29 | embedding_service_impl.cpp 30 | image_generation_service_impl.cpp 31 | models_service_impl.cpp 32 | rerank_service_impl.cpp 33 | stream_output_parser.cpp 34 | qwen3_rerank_service_impl.cpp 35 | DEPS 36 | :master 37 | :chat_template 38 | :util 39 | glog::glog 40 | proto::xllm_proto 41 | absl::flat_hash_set 42 | absl::random_random 43 | :function_call 44 | :reasoning 45 | torch 46 | $<$:torch_npu> 47 | ) 48 | 49 | -------------------------------------------------------------------------------- /xllm/core/kernels/npu/xllm_ops/top_k_top_p.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | #include 18 | #include 19 | 20 | #include 21 | 22 | #include "acl/acl.h" 23 | #include "aclnnop/aclnn_apply_top_k_top_p.h" 24 | #include "acltensor_utils.h" 25 | #include "util/tensor_helper.h" 26 | 27 | namespace xllm_ops { 28 | void top_k_top_p(torch::Tensor& logits, 29 | const torch::Tensor& topK, 30 | const torch::Tensor& topP); 31 | } // namespace xllm_ops -------------------------------------------------------------------------------- /xllm/core/scheduler/scheduler_factory.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "runtime/xservice_client.h" 19 | #include "scheduler/continuous_scheduler.h" 20 | #include "scheduler/dit_scheduler.h" 21 | 22 | namespace xllm { 23 | 24 | std::unique_ptr<ContinuousScheduler> create_continuous_scheduler( 25 | Engine* engine, 26 | ContinuousScheduler::Options options); 27 | 28 | std::unique_ptr<DiTScheduler> create_dit_scheduler( 29 | DiTEngine* engine, 30 | DiTScheduler::Options options); 31 | 32 | } // namespace xllm 33 | -------------------------------------------------------------------------------- /xllm/core/framework/kv_cache/kv_cache_event.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | #include <unordered_set> 18 | 19 | #include "util/hash_util.h" 20 | 21 | namespace xllm { 22 | 23 | struct KvCacheEvent { 24 | std::unordered_set 25 | stored_cache; 26 | std::unordered_set 27 | removed_cache; 28 | 29 | void clear() { 30 | stored_cache.clear(); 31 | removed_cache.clear(); 32 | } 33 | }; 34 | 35 | } // namespace xllm 36 | -------------------------------------------------------------------------------- /xllm/core/kernels/npu/matmul.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License.
14 | ==============================================================================*/ 15 | 16 | #include "npu_ops_api.h" 17 | #include "ops_npu/npu_ops.h" 18 | 19 | namespace xllm::kernel::npu { 20 | 21 | torch::Tensor matmul(const torch::Tensor& a, 22 | const torch::Tensor& b, 23 | const std::optional<torch::Tensor>& bias) { 24 | if (!bias.has_value()) { 25 | return torch::nn::functional::linear(a, b); 26 | } else { 27 | return torch::nn::functional::linear(a, b, bias.value()); 28 | } 29 | } 30 | 31 | } // namespace xllm::kernel::npu 32 | -------------------------------------------------------------------------------- /xllm/core/layers/common/activation.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include <torch/torch.h> 19 | 20 | #include <string> 21 | 22 | namespace xllm { 23 | namespace layer { 24 | 25 | class ActivationImpl : public torch::nn::Module { 26 | public: 27 | ActivationImpl(const std::string& act_mode, bool is_gated); 28 | 29 | void forward(torch::Tensor& input, torch::Tensor& output); 30 | 31 | private: 32 | std::string act_mode_; 33 | bool is_gated_; 34 | }; 35 | TORCH_MODULE(Activation); 36 | 37 | } // namespace layer 38 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/layers/glm4_decoder_layer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Licensed under the Apache License, Version 2.0 (the "License"); 3 | you may not use this file except in compliance with the License. 4 | You may obtain a copy of the License at 5 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 6 | Unless required by applicable law or agreed to in writing, software 7 | distributed under the License is distributed on an "AS IS" BASIS, 8 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | See the License for the specific language governing permissions and 10 | limitations under the License.
11 | ==============================================================================*/
12 | #pragma once
13 |
14 | #include "config.h"
15 |
16 | namespace xllm {
17 | namespace layer {
18 |
19 | class Glm4DecoderLayer : public torch::nn::ModuleHolder<Glm4DecoderLayerImpl> {
20 | public:
21 | using torch::nn::ModuleHolder<Glm4DecoderLayerImpl>::ModuleHolder;
22 | using Impl __attribute__((__unused__)) = Glm4DecoderLayerImpl;
23 | Glm4DecoderLayer(const ModelContext& context)
24 | : ModuleHolder(std::make_shared<Glm4DecoderLayerImpl>(context)) {}
25 | };
26 |
27 | } // namespace layer
28 | } // namespace xllm
-------------------------------------------------------------------------------- /xllm/core/kernels/ilu/activation.cpp: --------------------------------------------------------------------------------
1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved.
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #include "ilu_ops_api.h"
17 |
18 | using namespace ixformer;
19 |
20 | namespace xllm::kernel::ilu {
21 |
22 | void act_and_mul(torch::Tensor out,
23 | torch::Tensor input,
24 | const std::string& act_mode) {
25 | if (act_mode == "silu") {
26 | infer::silu_and_mul(input, out);
27 | } else {
28 | // Only the silu path is implemented here; keep the message in sync.
29 | LOG(FATAL) << "Unsupported act mode: " << act_mode
30 | << ", only silu is supported";
31 | }
32 | }
33 | } // namespace xllm::kernel::ilu
34 |
-------------------------------------------------------------------------------- /xllm/core/kernels/npu/active.cpp: --------------------------------------------------------------------------------
1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved.
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #include
17 | #include
18 |
19 | #include "npu_ops_api.h"
20 | #include "ops_npu/npu_ops.h"
21 |
22 | namespace xllm::kernel::npu {
23 |
24 | torch::Tensor active(const torch::Tensor& input, const std::string& act_mode) {
25 | if (act_mode != "silu" && act_mode != "swiglu") {
26 | LOG(FATAL) << "Only silu and swiglu activations are supported in NPU active";
27 | }
28 | return at_npu::native::custom_ops::npu_swiglu(input);
29 | }
30 | } // namespace xllm::kernel::npu
-------------------------------------------------------------------------------- /xllm/core/layers/word_embedding.h: --------------------------------------------------------------------------------
1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved.
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #pragma once
17 |
18 | #include "config.h"
19 |
20 | namespace xllm {
21 | namespace layer {
22 |
23 | class WordEmbedding : public torch::nn::ModuleHolder<WordEmbeddingImpl> {
24 | public:
25 | using torch::nn::ModuleHolder<WordEmbeddingImpl>::ModuleHolder;
26 | using Impl __attribute__((__unused__)) = WordEmbeddingImpl;
27 | WordEmbedding(const ModelContext& context)
28 | : ModuleHolder(std::make_shared<WordEmbeddingImpl>(context)) {}
29 | };
30 |
31 | } // namespace layer
32 | } // namespace xllm
33 |
-------------------------------------------------------------------------------- /xllm/cc_api/examples/service_request.h: --------------------------------------------------------------------------------
1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved.
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #include "llm.h"
17 |
18 | namespace xllm {
19 | namespace cc_api_test {
20 | // Send Completion request and print the inference result.
21 | void run_completion_request(const std::string& model_name,
22 | xllm::LLM* llm_instance);
23 |
24 | // Send ChatCompletion request and print the inference result.
25 | void run_chat_completion_request(const std::string& model_name, 26 | xllm::LLM* llm_instance); 27 | } // namespace cc_api_test 28 | } // namespace xllm 29 | -------------------------------------------------------------------------------- /xllm/core/kernels/mlu/gather_split.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include "mlu_ops_api.h" 17 | 18 | namespace xllm::kernel::mlu { 19 | 20 | void gather_split(const torch::Tensor& input, 21 | const torch::Tensor& gather_index, 22 | const torch::Tensor& valid_token_num, 23 | const torch::Tensor& output_head, 24 | const torch::Tensor& output_tail) { 25 | tmo::torch_api::gather_split( 26 | output_head, output_tail, input, gather_index, valid_token_num); 27 | } 28 | 29 | } // namespace xllm::kernel::mlu 30 | -------------------------------------------------------------------------------- /xllm/core/util/timer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | 19 | #include 20 | 21 | namespace xllm { 22 | 23 | class Timer final { 24 | public: 25 | Timer(); 26 | 27 | // reset the timer 28 | void reset(); 29 | 30 | // get the elapsed time. 
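// Measured from construction or the most recent reset(). These accessors
// are const and do not restart the timer, so successive readings can be
// taken from the same Timer.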
31 | double elapsed_seconds() const; 32 | double elapsed_milliseconds() const; 33 | double elapsed_microseconds() const; 34 | 35 | private: 36 | // the start time of the timer 37 | absl::Time start_; 38 | }; 39 | 40 | } // namespace xllm -------------------------------------------------------------------------------- /cibuild/install/install_base.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | install_ubuntu() { 6 | deploy_deps="libffi-dev libbz2-dev libreadline-dev libncurses5-dev libncursesw5-dev libgdbm-dev libsqlite3-dev uuid-dev tk-dev" 7 | # Install common dependencies 8 | apt-get update 9 | apt-get install -y --no-install-recommends \ 10 | ${deploy_deps} \ 11 | build-essential \ 12 | zip \ 13 | pkg-config \ 14 | libssl-dev \ 15 | software-properties-common \ 16 | curl \ 17 | git \ 18 | wget \ 19 | sudo \ 20 | vim \ 21 | jq \ 22 | libtool \ 23 | unzip \ 24 | gdb 25 | 26 | # Cleanup package manager 27 | apt-get autoclean && apt-get clean 28 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 29 | } 30 | 31 | install_almalinux() { 32 | yum -y update 33 | yum -y install \ 34 | zip \ 35 | wget \ 36 | curl \ 37 | perl \ 38 | sudo \ 39 | vim \ 40 | jq \ 41 | libtool \ 42 | unzip 43 | 44 | # Cleanup 45 | yum clean all 46 | } 47 | 48 | ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') 49 | case "$ID" in 50 | ubuntu) 51 | install_ubuntu 52 | ;; 53 | almalinux) 54 | install_almalinux 55 | ;; 56 | *) 57 | echo "Unable to determine OS..." 58 | exit 1 59 | ;; 60 | esac -------------------------------------------------------------------------------- /xllm/core/layers/pos_embedding.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "config.h" 19 | 20 | namespace xllm { 21 | namespace layer { 22 | 23 | class PosEmbedding : public torch::nn::ModuleHolder { 24 | public: 25 | using torch::nn::ModuleHolder::ModuleHolder; 26 | using Impl __attribute__((__unused__)) = RotaryEmbeddingImpl; 27 | 28 | PosEmbedding(const ModelContext& context) 29 | : ModuleHolder(std::make_shared(context)) {} 30 | }; 31 | 32 | } // namespace layer 33 | } // namespace xllm 34 | -------------------------------------------------------------------------------- /xllm/core/framework/tokenizer/tokenizer_factory.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at
6 |
7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #pragma once
17 |
18 | #include "fast_tokenizer.h"
19 | #include "rec_tokenizer.h"
20 | #include "sentencepiece_tokenizer.h"
21 | #include "tiktoken_tokenizer.h"
22 | #include "tokenizer_args.h"
23 | #include "tokenizer_proxy.h"
24 |
25 | namespace xllm {
26 |
27 | class TokenizerFactory {
28 | public:
29 | static std::unique_ptr<Tokenizer> create_tokenizer(
30 | const std::string& model_weights_path,
31 | TokenizerArgs tokenizer_args,
32 | bool proxy = true);
33 | };
34 |
35 | } // namespace xllm
36 |
-------------------------------------------------------------------------------- /xllm/core/layers/npu/loader/word_embedding_loader.h: --------------------------------------------------------------------------------
1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved.
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #pragma once
17 |
18 | #include
19 | #include
20 |
21 | #include "core/layers/npu/npu_base_layer.h"
22 |
23 | namespace xllm {
24 | namespace layer {
25 |
26 | class WordEmbeddingLoader : public BaseLoader {
27 | public:
28 | WordEmbeddingLoader(uint64_t weight_count, const ModelContext& context);
29 |
30 | void load_state_dict(const StateDict& state_dict) override;
31 | void verify_loaded_weights(const std::string& prefix) const override;
32 | };
33 |
34 | } // namespace layer
35 | } // namespace xllm
-------------------------------------------------------------------------------- /xllm/core/runtime/dit_executor.cpp: --------------------------------------------------------------------------------
1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved.
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #include "dit_executor.h"
17 |
18 | #include
19 |
20 | #include "common/metrics.h"
21 |
22 | namespace xllm {
23 |
24 | DiTExecutor::DiTExecutor(DiTModel* model, const runtime::Options& options)
25 | : model_(model), options_(options) {}
26 |
27 | DiTForwardInput DiTExecutor::prepare_inputs(DiTBatch& batch) {
28 | return batch.prepare_forward_input();
29 | }
30 |
31 | DiTForwardOutput DiTExecutor::forward(const DiTForwardInput& input) {
32 | return model_->forward(input);
33 | }
34 |
35 | } // namespace xllm
36 |
-------------------------------------------------------------------------------- /xllm/core/util/uuid.cpp: --------------------------------------------------------------------------------
1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved.
2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved.
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | https://github.com/jd-opensource/xllm/blob/main/LICENSE
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | ==============================================================================*/
16 |
17 | #include "uuid.h"
18 |
19 | #include
20 |
21 | namespace xllm {
22 |
23 | std::string ShortUUID::random(size_t len) {
24 | if (len == 0) {
25 | len = 22;
26 | }
27 |
28 | std::string uuid(len, ' ');
29 | for (size_t i = 0; i < len; i++) {
30 | const size_t rand = absl::Uniform<size_t>(
31 | absl::IntervalClosedOpen, gen_, 0, alphabet_.size());
32 | uuid[i] = alphabet_[rand];
33 | }
34 | return uuid;
35 | }
36 |
37 | } // namespace xllm
-------------------------------------------------------------------------------- /docs/zh/features/groupgemm.md: --------------------------------------------------------------------------------
1 | # GroupGEMM Operator Optimization
2 |
3 | ## Background
4 | The Mixture of Experts (MoE) architecture has become an important paradigm for scaling large language models; its core idea is to dynamically route input tokens to different expert sub-networks for processing. During inference, the GroupGEMM operator is the key compute unit of the MoE architecture: it efficiently executes the parallel matrix multiplications of multiple experts and dominates overall inference time.
5 |
6 | ## Feature Overview
7 | Since the current GroupGEMM performance bottleneck is I/O bound, an optimization is proposed that replaces data copies with index reordering: the repeated copying of token vectors is removed in favor of maintaining an index table of expert assignments. Through these row indices, tokens are mapped directly onto the corresponding expert compute units, and token dispatch is fused with the matrix multiplication into a single kernel.
8 |
9 |
10 | ## User Interface
11 |
12 | ### Direct Operator Call API
13 | ```c++
14 | aclnnStatus aclnnIndexGroupMatmulGetWorkspaceSize(
15 | const aclTensorList *x,
16 | const aclTensorList *weight,
17 | const aclTensorList *scale,
18 | const aclTensorList *perTokenScale,
19 | const aclTensor *groupList,
20 | const aclTensorList *out,
21 | uint64_t *workspaceSize,
22 | aclOpExecutor **executor);
23 |
24 | aclnnStatus aclnnIndexGroupMatmul(
25 | void *workspace,
26 | uint64_t workspaceSize,
27 | aclOpExecutor *executor,
28 | aclrtStream stream);
29 | ```
30 |
31 | - `x`: list of input tensors containing the data to process.
32 | - `weight`: weight tensors containing the model parameters.
33 | - `scale`: scaling factors used to adjust the values of the input tensors.
34 | - `perTokenScale`: per-token scaling factors for dynamic adjustment.
35 | - `groupList`: list of expert groups indicating which experts participate in the computation.
36 | - `out`: list of output tensors that stores the results.
37 |
38 | ## Performance
39 | ![groupmatmul](../../assets/groupmatmul_performance.png)
40 |
41 | * The optimized GroupMatmul operator shows a clear advantage in compute time; in particular with k = 128 and m = 64, as shown in the figure, the optimized operator **reduces latency by 50%**.
42 |
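For reference, below is a minimal host-side sketch of the standard two-phase aclnn calling convention this API follows (first query the workspace size, then launch the kernel). The wrapper function, its boolean return convention, and the generated op header name are illustrative assumptions; construction of the `aclTensorList`/`aclTensor` arguments is elided.

```c++
#include <acl/acl.h>

#include "aclnn_index_group_matmul.h"  // assumed name of the generated op header

// Hedged sketch: run the fused token-dispatch + grouped matmul on a stream.
bool run_index_group_matmul(const aclTensorList* x,
                            const aclTensorList* weight,
                            const aclTensorList* scale,
                            const aclTensorList* per_token_scale,
                            const aclTensor* group_list,
                            const aclTensorList* out,
                            aclrtStream stream) {
  uint64_t workspace_size = 0;
  aclOpExecutor* executor = nullptr;
  // Phase 1: query the required scratch-space size and build the executor.
  if (aclnnIndexGroupMatmulGetWorkspaceSize(x, weight, scale, per_token_scale,
                                            group_list, out, &workspace_size,
                                            &executor) != ACL_SUCCESS) {
    return false;
  }
  // Allocate device memory for the workspace if the op needs any.
  void* workspace = nullptr;
  if (workspace_size > 0 &&
      aclrtMalloc(&workspace, workspace_size, ACL_MEM_MALLOC_HUGE_FIRST) !=
          ACL_SUCCESS) {
    return false;
  }
  // Phase 2: launch the fused kernel built in phase 1.
  const bool ok =
      aclnnIndexGroupMatmul(workspace, workspace_size, executor, stream) ==
      ACL_SUCCESS;
  // Wait for completion before releasing the workspace buffer.
  aclrtSynchronizeStream(stream);
  if (workspace != nullptr) {
    aclrtFree(workspace);
  }
  return ok;
}
```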
-------------------------------------------------------------------------------- /xllm/core/framework/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | include_directories(.) 5 | if(USE_NPU) 6 | include_directories( 7 | ${CMAKE_SOURCE_DIR}/third_party/spdlog/include 8 | ) 9 | endif() 10 | add_subdirectory(batch) 11 | add_subdirectory(block) 12 | add_subdirectory(chat_template) 13 | add_subdirectory(kv_cache) 14 | add_subdirectory(model) 15 | add_subdirectory(parallel_state) 16 | add_subdirectory(prefix_cache) 17 | add_subdirectory(request) 18 | add_subdirectory(sampling) 19 | add_subdirectory(state_dict) 20 | add_subdirectory(tokenizer) 21 | add_subdirectory(eplb) 22 | add_subdirectory(xtensor) 23 | add_subdirectory(dit_cache) 24 | 25 | 26 | cc_library( 27 | NAME 28 | model_loader 29 | HDRS 30 | hf_model_loader.h 31 | dit_model_context.h 32 | dit_model_loader.h 33 | model_loader.h 34 | SRCS 35 | hf_model_loader.cpp 36 | dit_model_context.cpp 37 | dit_model_loader.cpp 38 | model_loader.cpp 39 | DEPS 40 | :common 41 | :model 42 | :models 43 | $<$:npu_layers> 44 | :tokenizer 45 | torch 46 | ) 47 | 48 | cc_library( 49 | NAME 50 | model_context 51 | HDRS 52 | model_context.h 53 | SRCS 54 | model_context.cpp 55 | DEPS 56 | torch 57 | $<$:torch_npu> 58 | ) 59 | -------------------------------------------------------------------------------- /xllm/core/framework/block/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_binary) 2 | include(cc_library) 3 | include(cc_test) 4 | 5 | cc_library( 6 | NAME 7 | block 8 | HDRS 9 | block.h 10 | block_manager.h 11 | block_manager_pool.h 12 | block_manager_impl.h 13 | concurrent_block_manager_impl.h 14 | hierarchy_block_manager_pool.h 15 | SRCS 16 | block.cpp 17 | block_manager_pool.cpp 18 | concurrent_block_manager_impl.cpp 19 | block_manager_impl.cpp 20 | hierarchy_block_manager_pool.cpp 21 | DEPS 22 | $<$:torch_npu> 23 | $<$:graph> 24 | :request 25 | :common 26 | glog::glog 27 | Boost::serialization 28 | SMHasherSupport 29 | torch 30 | ) 31 | target_link_libraries(block PRIVATE Folly::folly) 32 | 33 | if(USE_NPU) 34 | set(TEST_SRCS 35 | block_manager_test.cpp 36 | ) 37 | 38 | cc_test( 39 | NAME 40 | block_test 41 | SRCS 42 | ${TEST_SRCS} 43 | DEPS 44 | :block 45 | :flags 46 | :kv_cache 47 | :prefix_cache 48 | absl::random_random 49 | Boost::serialization 50 | GTest::gtest_main 51 | ) 52 | 53 | target_link_libraries(block_test PRIVATE brpc OpenSSL::SSL OpenSSL::Crypto ascendcl Folly::folly) 54 | add_dependencies(block_test brpc-static) 55 | endif() -------------------------------------------------------------------------------- /xllm/core/layers/llama_decoder_layer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "config.h" 19 | 20 | namespace xllm { 21 | namespace layer { 22 | 23 | class LlamaDecoderLayer 24 | : public torch::nn::ModuleHolder { 25 | public: 26 | using torch::nn::ModuleHolder::ModuleHolder; 27 | using Impl __attribute__((__unused__)) = LlamaDecoderLayerImpl; 28 | 29 | LlamaDecoderLayer(const ModelContext& context) 30 | : ModuleHolder(std::make_shared(context)) {} 31 | }; 32 | 33 | } // namespace layer 34 | } // namespace xllm 35 | -------------------------------------------------------------------------------- /xllm/core/layers/qwen2_decoder_layer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "config.h" 19 | 20 | namespace xllm { 21 | namespace layer { 22 | 23 | class Qwen2DecoderLayer 24 | : public torch::nn::ModuleHolder { 25 | public: 26 | using torch::nn::ModuleHolder::ModuleHolder; 27 | using Impl __attribute__((__unused__)) = Qwen2DecoderLayerImpl; 28 | 29 | Qwen2DecoderLayer(const ModelContext& context) 30 | : ModuleHolder(std::make_shared(context)) {} 31 | }; 32 | 33 | } // namespace layer 34 | } // namespace xllm 35 | -------------------------------------------------------------------------------- /xllm/core/layers/qwen3_decoder_layer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "config.h" 19 | 20 | namespace xllm { 21 | namespace layer { 22 | 23 | class Qwen3DecoderLayer 24 | : public torch::nn::ModuleHolder { 25 | public: 26 | using torch::nn::ModuleHolder::ModuleHolder; 27 | using Impl __attribute__((__unused__)) = Qwen3DecoderLayerImpl; 28 | 29 | Qwen3DecoderLayer(const ModelContext& context) 30 | : ModuleHolder(std::make_shared(context)) {} 31 | }; 32 | 33 | } // namespace layer 34 | } // namespace xllm 35 | -------------------------------------------------------------------------------- /xllm/core/layers/npu/buffer/atb_buffer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | 20 | #include "atb/atb_infer.h" 21 | 22 | namespace xllm { 23 | 24 | class AtbBuffer { 25 | public: 26 | explicit AtbBuffer(uint64_t bufferSize, at::Device device); 27 | ~AtbBuffer(); 28 | void* get_buffer(uint64_t bufferSize); 29 | 30 | private: 31 | torch::Tensor create_attensor(uint64_t bufferSize) const; 32 | 33 | private: 34 | uint64_t buffer_size_ = 0; 35 | torch::Tensor at_tensor_; 36 | at::Device device_; 37 | 38 | at::TensorOptions options_; 39 | }; 40 | 41 | } // namespace xllm 42 | -------------------------------------------------------------------------------- /xllm/function_call/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | add_subdirectory(partial_json_parser) 5 | 6 | cc_library ( 7 | NAME 8 | function_call 9 | HDRS 10 | core_types.h 11 | base_format_detector.h 12 | qwen25_detector.h 13 | kimik2_detector.h 14 | deepseekv3_detector.h 15 | glm45_detector.h 16 | function_call_parser.h 17 | function_call.h 18 | utils.h 19 | SRCS 20 | base_format_detector.cpp 21 | qwen25_detector.cpp 22 | kimik2_detector.cpp 23 | deepseekv3_detector.cpp 24 | glm45_detector.cpp 25 | function_call_parser.cpp 26 | utils.cpp 27 | DEPS 28 | nlohmann_json::nlohmann_json 29 | glog::glog 30 | proto::xllm_proto 31 | partial_json_parser 32 | common 33 | ) 34 | 35 | function(add_detector_test TEST_NAME) 36 | cc_test( 37 | NAME 38 | ${TEST_NAME} 39 | SRCS 40 | ${TEST_NAME}.cpp 41 | DEPS 42 | :function_call 43 | GTest::gtest 44 | GTest::gtest_main 45 | nlohmann_json::nlohmann_json 46 | ) 47 | endfunction() 48 | 49 | add_detector_test(qwen25_detector_test) 50 | add_detector_test(kimik2_detector_test) 51 | add_detector_test(deepseekv3_detector_test) 52 | add_detector_test(glm45_detector_test) 53 | 54 | -------------------------------------------------------------------------------- /xllm/core/util/net.h: -------------------------------------------------------------------------------- 1 | 
/* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | namespace xllm { 22 | namespace net { 23 | 24 | std::string get_local_ip_addr(); 25 | int get_local_free_port(); 26 | uint64_t convert_ip_port_to_uint64(const std::string& ip, uint16_t port); 27 | void parse_host_port_from_addr(const std::string& addr, 28 | std::string& host, 29 | int& port); 30 | 31 | std::string extract_ip(const std::string& input); 32 | std::string extract_port(const std::string& input); 33 | } // namespace net 34 | } // namespace xllm 35 | -------------------------------------------------------------------------------- /xllm/core/layers/npu/loader/rms_norm_loader.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | #include "base_loader.h" 22 | 23 | namespace xllm { 24 | namespace layer { 25 | 26 | class RMSNORMLoader : public BaseLoader { 27 | public: 28 | RMSNORMLoader(uint64_t weight_count, const ModelContext& context); 29 | 30 | void load_state_dict(const StateDict& state_dict) override; 31 | 32 | void verify_loaded_weights(const std::string& weight_str) const override; 33 | 34 | protected: 35 | int rank_id_; 36 | torch::ScalarType dtype_; 37 | }; 38 | 39 | } // namespace layer 40 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/scheduler/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | 5 | add_subdirectory(profile) 6 | 7 | cc_library( 8 | NAME 9 | scheduler 10 | HDRS 11 | chunked_prefill_scheduler.h 12 | zero_eviction_scheduler.h 13 | continuous_scheduler.h 14 | disagg_pd_scheduler.h 15 | pd_ooc_scheduler.h 16 | async_response_processor.h 17 | scheduler.h 18 | dit_scheduler.h 19 | prefill_only_scheduler.h 20 | scheduler_factory.h 21 | decode_priority_queue.h 22 | perf_model.h 23 | SRCS 24 | chunked_prefill_scheduler.cpp 25 | zero_eviction_scheduler.cpp 26 | continuous_scheduler.cpp 27 | disagg_pd_scheduler.cpp 28 | pd_ooc_scheduler.cpp 29 | async_response_processor.cpp 30 | dit_scheduler.cpp 31 | prefill_only_scheduler.cpp 32 | scheduler_factory.cpp 33 | perf_model.cpp 34 | DEPS 35 | :batch 36 | :request 37 | :runtime 38 | :profile 39 | glog::glog 40 | Folly::folly 41 | absl::time 42 | absl::synchronization 43 | ) 44 | 45 | cc_test( 46 | NAME 47 | chunked_prefill_scheduler_test 48 | continuous_scheduler_test 49 | SRCS 50 | chunked_prefill_scheduler_test.cpp 51 | continuous_scheduler_test.cpp 52 | DEPS 53 | :scheduler 54 | GTest::gtest_main 55 | $<$:nnopbase> 56 | ) 57 | 58 | -------------------------------------------------------------------------------- /xllm/core/util/type_traits.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | ==============================================================================*/
16 |
17 | #pragma once
18 | #include <optional>
19 |
20 | namespace xllm {
21 |
22 | template <typename value_type>
23 | struct remove_optional {
24 | using type = value_type;
25 | };
26 |
27 | // specialization for optional
28 | template <typename value_type>
29 | struct remove_optional<std::optional<value_type>> {
30 | using type = value_type;
31 | };
32 |
33 | /// alias template for remove_optional
34 | template <typename value_type>
35 | using remove_optional_t = typename remove_optional<value_type>::type;
36 |
37 | } // namespace xllm
-------------------------------------------------------------------------------- /xllm/core/layers/glm4_vision_encode_layer.h: --------------------------------------------------------------------------------
1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved.
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #pragma once
17 |
18 | #include "config.h"
19 |
20 | namespace xllm {
21 | namespace layer {
22 |
23 | class Glm4VisionEncoderLayer
24 | : public torch::nn::ModuleHolder<Glm4VisionEncoderLayerImpl> {
25 | public:
26 | using torch::nn::ModuleHolder<Glm4VisionEncoderLayerImpl>::ModuleHolder;
27 | using Impl __attribute__((__unused__)) = Glm4VisionEncoderLayerImpl;
28 |
29 | Glm4VisionEncoderLayer(const ModelContext& context)
30 | : ModuleHolder(std::make_shared<Glm4VisionEncoderLayerImpl>(context)) {}
31 | };
32 |
33 | } // namespace layer
34 | } // namespace xllm
35 |
-------------------------------------------------------------------------------- /xllm/core/layers/npu/loader/llama_decoder_loader.h: --------------------------------------------------------------------------------
1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved.
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | #include "base_loader.h" 22 | 23 | namespace xllm { 24 | namespace layer { 25 | 26 | class LlamaDecoderLoader : public BaseLoader { 27 | public: 28 | LlamaDecoderLoader(uint64_t weight_count, const ModelContext& context); 29 | 30 | void load_state_dict(const StateDict& state_dict) override; 31 | void verify_loaded_weights() const override; 32 | void merge_loaded_weights() override; 33 | 34 | bool enableAddNorm_; 35 | int rank_id_; 36 | }; 37 | 38 | } // namespace layer 39 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/layers/qwen2_vision_encode_layer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "config.h" 19 | 20 | namespace xllm { 21 | namespace layer { 22 | 23 | class Qwen2VisionEncoderLayer 24 | : public torch::nn::ModuleHolder { 25 | public: 26 | using torch::nn::ModuleHolder::ModuleHolder; 27 | using Impl __attribute__((__unused__)) = Qwen2VisionEncoderLayerImpl; 28 | 29 | Qwen2VisionEncoderLayer(const ModelContext& context) 30 | : ModuleHolder(std::make_shared(context)) {} 31 | }; 32 | 33 | } // namespace layer 34 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/framework/request/finish_reason.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "finish_reason.h" 17 | 18 | #include 19 | 20 | namespace xllm { 21 | 22 | std::optional FinishReason::to_string() { 23 | switch (value) { 24 | case Value::NONE: 25 | return std::nullopt; 26 | case Value::STOP: 27 | return "stop"; 28 | case Value::LENGTH: 29 | return "length"; 30 | case Value::FUNCTION_CALL: 31 | return "function_call"; 32 | default: 33 | LOG(WARNING) << "Unknown finish reason: " << static_cast(value); 34 | } 35 | return std::nullopt; 36 | } 37 | 38 | } // namespace xllm 39 | -------------------------------------------------------------------------------- /xllm/core/layers/common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | 3 | cc_library( 4 | NAME 5 | common_layers 6 | HDRS 7 | qwen2_attention.h 8 | qwen2_vision_attention.h 9 | rms_norm.h 10 | rotary_embedding.h 11 | rotary_embedding_util.h 12 | fused_moe.h 13 | dense_mlp.h 14 | qwen2_decoder_layer.h 15 | qwen2_5_vision_layer.h 16 | qwen3_moe_decoder_layer.h 17 | linear.h 18 | word_embedding_impl.h 19 | layer_utils.h 20 | indexer.h 21 | deep_ep.h 22 | activation.h 23 | attention_metadata.h 24 | SRCS 25 | qwen2_attention.cpp 26 | qwen2_vision_attention.cpp 27 | rms_norm.cpp 28 | rotary_embedding.cpp 29 | rotary_embedding_util.cpp 30 | fused_moe.cpp 31 | dense_mlp.cpp 32 | qwen2_decoder_layer.cpp 33 | qwen2_5_vision_layer.cpp 34 | qwen3_moe_decoder_layer.cpp 35 | linear.cpp 36 | word_embedding_impl.cpp 37 | layer_utils.cpp 38 | indexer.cpp 39 | deep_ep.cpp 40 | activation.cpp 41 | attention_metadata.cpp 42 | DEPS 43 | "-Wl,--whole-archive" 44 | "-Wl,--no-whole-archive" 45 | :kv_cache 46 | :prefix_cache 47 | :block 48 | :parallel_state 49 | :state_dict 50 | :model 51 | :kernels 52 | glog::glog 53 | gflags::gflags 54 | torch 55 | :platform 56 | ) 57 | 58 | add_subdirectory(tests) 59 | -------------------------------------------------------------------------------- /xllm/core/layers/qwen3_vision_encode_layer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "config.h" 19 | 20 | namespace xllm { 21 | namespace layer { 22 | 23 | class Qwen3VisionEncoderLayer 24 | : public torch::nn::ModuleHolder { 25 | public: 26 | using torch::nn::ModuleHolder::ModuleHolder; 27 | using Impl __attribute__((__unused__)) = Qwen3VisionEncoderLayerImpl; 28 | 29 | Qwen3VisionEncoderLayer(const ModelContext& context) 30 | : ModuleHolder(std::make_shared(context)) {} 31 | }; 32 | 33 | } // namespace layer 34 | } // namespace xllm 35 | -------------------------------------------------------------------------------- /xllm/api_service/call.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | 20 | #include 21 | 22 | namespace xllm { 23 | 24 | class Call { 25 | public: 26 | Call(brpc::Controller* controller); 27 | virtual ~Call() = default; 28 | 29 | std::string get_x_request_id() { return x_request_id_; } 30 | std::string get_x_request_time() { return x_request_time_; } 31 | 32 | virtual bool is_disconnected() const = 0; 33 | 34 | protected: 35 | void init(); 36 | 37 | protected: 38 | brpc::Controller* controller_; 39 | 40 | std::string x_request_id_; 41 | std::string x_request_time_; 42 | }; 43 | 44 | } // namespace xllm 45 | -------------------------------------------------------------------------------- /xllm/core/framework/sampling/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | cc_library( 5 | NAME 6 | sampler 7 | HDRS 8 | sampling_params.h 9 | logits_utils.h 10 | rejection_sampler.h 11 | sampler.h 12 | beam_searcher.h 13 | rec_constrained_decoding.h 14 | SRCS 15 | sampling_params.cpp 16 | logits_utils.cpp 17 | rejection_sampler.cpp 18 | sampler.cpp 19 | beam_searcher.cpp 20 | rec_constrained_decoding.cpp 21 | DEPS 22 | :common 23 | glog::glog 24 | torch 25 | :kernels 26 | $<$:xllm_ops> 27 | ) 28 | 29 | cc_test( 30 | NAME 31 | sampler_test 32 | SRCS 33 | rejection_sampler_test.cpp 34 | rejection_sampler.cpp 35 | sampling_params_test.cpp 36 | DEPS 37 | absl::strings 38 | GTest::gtest_main 39 | :flags 40 | :sampler 41 | glog::glog 42 | ) 43 | target_link_libraries(sampler_test PRIVATE brpc OpenSSL::SSL OpenSSL::Crypto leveldb::leveldb ZLIB::ZLIB protobuf::libprotobuf) 44 | target_link_libraries(sampler_test 45 | PUBLIC 46 | Python::Python 47 | $<$:ascendcl> 48 | $<$:hccl> 49 | $<$:c_sec> 50 | $<$:nnopbase>) 51 | add_dependencies(sampler_test brpc-static) -------------------------------------------------------------------------------- /xllm/core/layers/qwen3_moe_decoder_layer.h: 
-------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "config.h" 19 | 20 | namespace xllm { 21 | namespace layer { 22 | 23 | class Qwen3MoeDecoderLayer 24 | : public torch::nn::ModuleHolder { 25 | public: 26 | using torch::nn::ModuleHolder::ModuleHolder; 27 | using Impl __attribute__((__unused__)) = Qwen3MoeDecoderLayerImpl; 28 | 29 | Qwen3MoeDecoderLayer(const ModelContext& context, int32_t layer_id) 30 | : Qwen3MoeDecoderLayer( 31 | std::make_shared(context, layer_id)) {} 32 | }; 33 | 34 | } // namespace layer 35 | } // namespace xllm 36 | -------------------------------------------------------------------------------- /xllm/core/kernels/ilu/rope.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include "ilu_ops_api.h" 17 | #include "utils.h" 18 | 19 | namespace xllm::kernel::ilu { 20 | 21 | void apply_rope_pos_ids_cos_sin_cache(torch::Tensor& query, 22 | torch::Tensor& key, 23 | torch::Tensor& cos_sin_cache, 24 | torch::Tensor& positions, 25 | bool interleave) { 26 | const int64_t head_size = cos_sin_cache.size(-1) / 2; 27 | infer::vllm_rotary_embedding( 28 | positions, query, key, head_size, cos_sin_cache, !interleave); 29 | } 30 | 31 | } // namespace xllm::kernel::ilu 32 | -------------------------------------------------------------------------------- /xllm/cc_api/README.md: -------------------------------------------------------------------------------- 1 | ### How to compile xllm dynamic library 2 | Run the following command in root directory: 3 | ``` 4 | python setup.py build --device a3 --generate-so true 5 | ``` 6 | 7 | If you want to debug, it needs to set DEBUG environment variable. 8 | ``` 9 | export DEBUG=1 10 | ``` 11 | 12 | ### How to install dynamic library 13 | Run installation script xllm/cc_api/install.sh, headers and dynamic library will be installed in /usr/local/xllm directory. 
14 | ``` 15 | cd xllm/cc_api 16 | 17 | sh install.sh 18 | ``` 19 | 20 | You will see the following files in /usr/local/xllm directory: 21 | ``` 22 | [root@A03-R40-I189-101-4100046 cc_api]# tree /usr/local/xllm 23 | /usr/local/xllm 24 | |-- include 25 | | |-- llm.h 26 | | |-- macros.h 27 | | `-- types.h 28 | `-- lib 29 | |-- libcust_opapi.so 30 | `-- libxllm.so 31 | 32 | 3 directories, 5 files 33 | ``` 34 | 35 | ### How to run cc_api examples 36 | It provides two examples which use cc_api to create xllm instance and run inference. The single_llm_instance.cpp creates one instance which is used in most LLM scenes. The multiple_llm_instances.cpp creates two instances which is used in multiple-models scene or one model with multiple versions. 37 | 38 | You can follow the commands to compile and run these examples: 39 | ``` 40 | cd examples && mkdir build 41 | cd build && cmake .. && make && cd .. 42 | 43 | sh start-llm-instance.sh 44 | ``` -------------------------------------------------------------------------------- /xllm/core/kernels/npu/xllm_ops/beam_search.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | #include 18 | #include 19 | 20 | #include 21 | 22 | #include "acl/acl.h" 23 | #include "aclnn_beam_search.h" 24 | #include "acltensor_utils.h" 25 | #include "util/tensor_helper.h" 26 | 27 | namespace xllm_ops { 28 | void beam_search(const torch::Tensor& logprobs, 29 | const torch::Tensor& top_tokens, 30 | const torch::Tensor& top_logprobs, 31 | torch::Tensor& src_seq_idxes, 32 | torch::Tensor& out_logprobs, 33 | torch::Tensor& out_token_ids); 34 | } // namespace xllm_ops -------------------------------------------------------------------------------- /xllm/core/layers/qwen2dot5_vision_encode_layer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "config.h" 19 | 20 | namespace xllm { 21 | namespace layer { 22 | 23 | class Qwen2dot5VisionEncoderLayer 24 | : public torch::nn::ModuleHolder { 25 | public: 26 | using torch::nn::ModuleHolder::ModuleHolder; 27 | using Impl __attribute__((__unused__)) = Qwen2dot5VisionEncoderLayerImpl; 28 | 29 | Qwen2dot5VisionEncoderLayer(const ModelContext& context) 30 | : ModuleHolder( 31 | std::make_shared(context)) {} 32 | }; 33 | 34 | } // namespace layer 35 | } // namespace xllm 36 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/brpc"] 2 | path = third_party/brpc 3 | url = https://gitcode.com/xLLM-AI/brpc.git 4 | [submodule "third_party/cpprestsdk"] 5 | path = third_party/cpprestsdk 6 | url = https://gitcode.com/xLLM-AI/cpprestsdk.git 7 | [submodule "third_party/hccl_transfer"] 8 | path = third_party/hccl_transfer 9 | url = https://gitcode.com/xLLM-AI/hccl_transfer.git 10 | [submodule "third_party/minja"] 11 | path = third_party/minja 12 | url = https://gitcode.com/xLLM-AI/minja.git 13 | [submodule "third_party/sentencepiece"] 14 | path = third_party/sentencepiece 15 | url = https://gitcode.com/xLLM-AI/sentencepiece.git 16 | [submodule "third_party/smhasher"] 17 | path = third_party/smhasher 18 | url = https://gitcode.com/xLLM-AI/smhasher.git 19 | [submodule "third_party/xllm_ops"] 20 | path = third_party/xllm_ops 21 | url = https://gitcode.com/xLLM-AI/xllm_ops.git 22 | [submodule "third_party/etcd_cpp_apiv3"] 23 | path = third_party/etcd_cpp_apiv3 24 | url = https://gitcode.com/xLLM-AI/etcd-cpp-apiv3.git 25 | [submodule "third_party/spdlog"] 26 | path = third_party/spdlog 27 | url = https://gitcode.com/xLLM-AI/spdlog.git 28 | [submodule "third_party/Mooncake"] 29 | path = third_party/Mooncake 30 | url = https://gitcode.com/xLLM-AI/Mooncake.git 31 | [submodule "third_party/torch_npu_ops"] 32 | path = third_party/torch_npu_ops 33 | url = https://gitcode.com/xLLM-AI/torch_npu_ops.git 34 | -------------------------------------------------------------------------------- /xllm/core/layers/siglip_encoder_layer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/
15 |
16 | #pragma once
17 |
18 | #include "config.h"
19 |
20 | namespace xllm {
21 | namespace layer {
22 |
23 | class SiglipEncoderLayer
24 | : public torch::nn::ModuleHolder<SiglipEncoderLayerImpl> {
25 | public:
26 | using torch::nn::ModuleHolder<SiglipEncoderLayerImpl>::ModuleHolder;
27 | using Impl __attribute__((__unused__)) = SiglipEncoderLayerImpl;
28 |
29 | SiglipEncoderLayer(const ModelContext& context,
30 | const std::string& prefix = "")
31 | : ModuleHolder(
32 | std::make_shared<SiglipEncoderLayerImpl>(context, prefix)) {}
33 | };
34 |
35 | } // namespace layer
36 | } // namespace xllm
37 |
-------------------------------------------------------------------------------- /xllm/core/layers/npu/loader/siglip_encoder_loader.h: --------------------------------------------------------------------------------
1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved.
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | #pragma once
16 |
17 | #include
18 | #include
19 |
20 | #include "base_loader.h"
21 |
22 | namespace xllm {
23 | namespace layer {
24 |
25 | class SiglipEncoderUpLoader : public BaseLoader {
26 | public:
27 | explicit SiglipEncoderUpLoader(const ModelContext& context);
28 |
29 | void load_state_dict(const StateDict& state_dict) override;
30 | };
31 |
32 | class SiglipEncoderDownLoader : public BaseLoader {
33 | public:
34 | explicit SiglipEncoderDownLoader(const ModelContext& context);
35 |
36 | void load_state_dict(const StateDict& state_dict) override;
37 | };
38 |
39 | } // namespace layer
40 | } // namespace xllm
-------------------------------------------------------------------------------- /CONTRIBUTING_zh.md: --------------------------------------------------------------------------------
1 |
14 |
15 | [English](./CONTRIBUTING.md) | [中文](./CONTRIBUTING_zh.md)
16 |
17 | # xLLM Contribution Guide
18 |
19 | xLLM is committed to providing an open XX for every user and developer, so whether you are an XX developer or a user focused on XX, you are welcome to join our project.
20 | You can contribute to the project in the following ways:
21 |
22 | + Write/translate/revise documentation
23 | + Ask or answer questions
24 | + Provide usage or test examples
25 | + Offer suggestions or other comments
26 | + Take part in [issues](https://github.com/xxx/xLLM/issues) or [discussions](https://github.com/xxx/xLLM/discussions)
27 | + Submit pull requests
28 | + Share related research or application scenarios
29 | + Anything else that helps xLLM
30 |
31 | If you would like to take part in developing xLLM, please see the tips below:
32 |
33 | ## 1. Pick an issue to contribute to
34 | + You can pick issues with the `PR welcome` label, including:
35 | + Reproducible bugs
36 | + Features planned for implementation
37 |
38 | ## 2. Set up the development environment
39 | + Before developing, you can consult our **[documentation](http://xxx/docs/)**
40 | + For environment configuration, see the **[Readme file](/README.md)**
41 |
42 | ## 3. Building and running the project
43 | + You can run the following examples:
44 |
45 | ## 4. Testing
46 |
47 | After a PR is submitted, we format the code and run further tests.
48 | Our tests are still far from complete, so contributions to testing are very welcome!
-------------------------------------------------------------------------------- /docs/zh/index.md: --------------------------------------------------------------------------------
1 | ---
2 | hide:
3 | - navigation
4 | ---
5 |
10 |
11 |
12 | xLLM 13 |
14 | 15 | ## Introduction 16 | 17 | **xLLM** is an efficient and easy-to-use open-source intelligent inference framework that provides enterprise-grade service guarantees and high-performance engine compute for model inference on domestic chips. 18 | 19 | #### Background 20 | Large language models with tens of billions to trillions of parameters are being rapidly deployed in core business scenarios such as intelligent customer service, real-time recommendation, and content generation, making efficient support for domestic compute hardware a core requirement of low-cost inference deployment. Existing inference engines struggle to adapt to the architectural characteristics of specialized accelerators such as domestic chips: low utilization of hardware compute units, load imbalance and communication bottlenecks under MoE architectures, and difficult KV cache management all constrain efficient request inference and system scalability. The xLLM inference engine improves resource utilization across the whole "communication - compute - storage" pipeline, providing key technical support for deploying large language models at scale in real business. 21 | 22 | --- 23 | 24 | ## Core Features 25 | xLLM delivers powerful intelligent computing capability: compute optimization in the hardware system and algorithm-driven decision control jointly accelerate the inference process, enabling high-throughput, low-latency distributed inference services. 26 | 27 | ### Full-Graph / Multi-Level Pipelined Execution Orchestration 28 | 29 | - Asynchronous decoupled scheduling at the framework scheduling layer reduces compute bubbles; 30 | - Asynchronous parallelism of computation and communication at the model graph layer overlaps compute with communication; 31 | - Deep pipelining of heterogeneous compute units at the operator kernel layer overlaps compute with memory access. 32 | 33 | ### Graph Execution Optimization for Dynamic Shapes 34 | 35 | - Dynamic-size adaptation based on parameterization and multi-graph caching improves the flexibility of static graphs; 36 | - A managed device memory pool keeps addresses safe and reusable; 37 | - Performance-critical custom operators (such as *PageAttention*, *AllReduce*) are integrated and adapted. 38 | 39 | ### MoE Operator Optimization 40 | 41 | - *GroupMatmul* optimization improves compute efficiency; 42 | - *Chunked Prefill* optimization supports long-sequence inputs. 43 | 44 | ### Efficient Device Memory Optimization 45 | 46 | - Mapping management between discrete physical memory and contiguous virtual memory; 47 | - On-demand memory allocation reduces fragmentation and waste; 48 | - Intelligent memory scheduling increases page reuse and lowers allocation latency; 49 | - Corresponding operator adaptation for domestic chips. 50 | 51 | ### Global Multi-Level KV Cache Management 52 | 53 | - Intelligent KV offloading and prefetching across multi-level caches; 54 | - A KV-cache-centric distributed storage architecture; 55 | - Intelligent routing of KV transfers between nodes. 56 | 57 | ### Algorithm Optimization 58 | 59 | - Speculative decoding optimization, with multi-core parallelism improving efficiency; 60 | - Dynamic load balancing of MoE experts enables efficient adjustment of expert placement. 61 | 62 | 63 | -------------------------------------------------------------------------------- /xllm/api_service/qwen3_rerank_service_impl.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "api_service/rerank_service_impl.h" 19 | 20 | namespace xllm { 21 | using RerankCall = NonStreamCall; 22 | 23 | // a class to handle rerank requests 24 | class Qwen3RerankServiceImpl final : public RerankServiceImpl { 25 | public: 26 | Qwen3RerankServiceImpl(LLMMaster* master, 27 | const std::vector<std::string>& models); 28 | 29 | // brpc call_data needs to use shared_ptr 30 | void process_async_impl(std::shared_ptr<RerankCall> call) override; 31 | 32 | private: 33 | DISALLOW_COPY_AND_ASSIGN(Qwen3RerankServiceImpl); 34 | }; 35 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/layers/common/activation.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License.
14 | ==============================================================================*/ 15 | 16 | #include "activation.h" 17 | 18 | #include "kernels/ops_api.h" 19 | 20 | namespace xllm { 21 | namespace layer { 22 | 23 | ActivationImpl::ActivationImpl(const std::string& act_mode, bool is_gated) 24 | : act_mode_(act_mode), is_gated_(is_gated) {} 25 | 26 | void ActivationImpl::forward(torch::Tensor& input, torch::Tensor& output) { 27 | xllm::kernel::ActivationParams activation_params; 28 | activation_params.input = input; 29 | activation_params.output = output; 30 | activation_params.act_mode = act_mode_; 31 | activation_params.is_gated = is_gated_; 32 | xllm::kernel::active(activation_params); 33 | } 34 | 35 | } // namespace layer 36 | } // namespace xllm -------------------------------------------------------------------------------- /docs/zh/features/xllm_service_overview.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # xLLM Service 4 | [:simple-github: xLLM Service](https://github.com/jd-opensource/xllm-service) 5 | 6 | ## Introduction 7 | 8 | **xLLM-service** is a service-layer framework built on the xLLM inference engine, providing highly efficient, fault-tolerant, and flexible large-model inference services for clustered deployments. 9 | 10 | xLLM-service aims to solve key challenges in enterprise service scenarios: 11 | 12 | - How to guarantee the SLA of online services while improving the resource utilization of offline tasks in mixed online/offline deployments. 13 | - How to adapt to dynamically changing request loads in real business, such as drastic fluctuations in input/output lengths. 14 | - How to resolve performance bottlenecks for multimodal model requests. 15 | - How to ensure high reliability of the cluster's compute instances. 16 | 17 | #### Background 18 | Large language models with tens of billions to trillions of parameters are being rapidly deployed in core business scenarios such as intelligent customer service, real-time recommendation, and content generation, making efficient support for domestic compute hardware a core requirement of low-cost inference deployment. Existing inference engines struggle to adapt to the architectural characteristics of specialized accelerators such as domestic chips: low utilization of hardware compute units, load imbalance and communication bottlenecks under MoE architectures, and difficult KV cache management all constrain efficient request inference and system scalability. xLLM-service together with the xLLM inference engine improves end-to-end efficiency, providing key technical support for deploying large language models at scale in real business. 19 | 20 | --- 21 | 22 | ## Overall Architecture 23 | The overall architecture of xLLM-service is shown below: 24 | 25 | ![1](../../assets/service_arch.png) 26 | 27 | ## Core Components 28 | 29 | ### ETCD Cluster 30 | Manages metadata, including the storage and management of model, xllm instance, and request metadata. It also provides xllm node registration and discovery. 31 | 32 | ### Fault Tolerance 33 | xLLM-service provides fault-tolerance management to guarantee service quality and stability. 34 | 35 | ### Global Scheduler 36 | Implements globally aware scheduling: based on the current system state, requests are dispatched precisely to the optimal instance, effectively improving overall service responsiveness and resource utilization. 37 | 38 | ### Global KV Cache Manager 39 | Responsible for global KV cache management; its core capabilities include distributed KV cache awareness, prefix matching, and dynamic KV cache migration, optimizing the efficiency of cache resource usage. 40 | 41 | ### Instance Manager 42 | Focuses on full-lifecycle instance management. Every xllm instance must register with this module after startup; based on preset policies, the module provides scheduling adaptation, fault handling, and other support for instances. 43 | 44 | ### Event Plane 45 | Acts as the metrics and event hub: it receives the metrics reported by each instance and collects and organizes the statistics, providing data support for decisions on scheduling, fault tolerance, and scaling. 46 | 47 | ### Planner 48 | Performs policy analysis and decision-making: based on the metrics reported by the Event Plane (including instance runtime metrics and machine load metrics), it analyzes service scaling needs and whether hot instances should be expanded, and outputs resource-adjustment and instance-optimization strategies. -------------------------------------------------------------------------------- /xllm/core/framework/model/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | # Define the base dependencies 5 | set(BASE_DEPS 6 | :common 7 | :flags 8 | :layers 9 | :prefix_cache 10 | :block 11 | :processors 12 | :chat_template 13 | glog::glog 14 | torch 15 | torch_python 16 | ) 17 | 18 | if(USE_NPU) 19 | list(APPEND BASE_DEPS :platform_npu) 20 | endif() 21 | 22 | # Define the library 23 | cc_library( 24 | NAME 25 | model 26 | HDRS 27 | causal_lm.h 28 | causal_vlm.h 29 | dit_model.h 30 | embedding_lm.h 31 | embedding_vlm.h 32 | mm_embedding_vlm.h 33 | model_args.h 34 | npu_dp_ep_padding.h 35 | model_input_params.h 36 | SRCS 37 | npu_dp_ep_padding.cpp 38 | DEPS 39 | ${BASE_DEPS} 40 | ) 41 | target_link_libraries(model PRIVATE :kv_cache) 42 | 43 | if(USE_NPU) 44 | cc_test( 45 | NAME 46 | npu_dp_ep_padding_test 47 | SRCS 48 | npu_dp_ep_padding_test.cpp 49 | DEPS 50 | :flags 51 | :parallel_state 52 | torch 53 | model 54 | absl::synchronization 55 | absl::time 56
| GTest::gtest_main 57 | ) 58 | 59 | target_link_libraries(npu_dp_ep_padding_test 60 | PUBLIC Python::Python 61 | $<$:ascendcl> 62 | $<$:hccl> 63 | $<$:c_sec> 64 | $<$:nnopbase>) 65 | endif() 66 | -------------------------------------------------------------------------------- /xllm/core/framework/prefix_cache/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_binary) 2 | include(cc_library) 3 | include(cc_test) 4 | 5 | cc_library( 6 | NAME 7 | prefix_cache 8 | HDRS 9 | prefix_cache.h 10 | prefix_cache_with_upload.h 11 | prefix_cache_factory.h 12 | SRCS 13 | prefix_cache.cpp 14 | prefix_cache_with_upload.cpp 15 | prefix_cache_factory.cpp 16 | DEPS 17 | $<$:torch_npu> 18 | $<$:graph> 19 | :request 20 | :common 21 | glog::glog 22 | Boost::serialization 23 | SMHasherSupport 24 | torch 25 | ) 26 | 27 | 28 | if(USE_NPU) 29 | cc_test( 30 | NAME 31 | prefix_test 32 | SRCS 33 | prefix_cache_test.cpp 34 | DEPS 35 | :flags 36 | :kv_cache 37 | :prefix_cache 38 | :block 39 | absl::random_random 40 | Boost::serialization 41 | GTest::gtest_main 42 | ) 43 | 44 | target_link_libraries(prefix_test PRIVATE brpc OpenSSL::SSL OpenSSL::Crypto Folly::folly) 45 | add_dependencies(prefix_test brpc-static) 46 | endif() 47 | 48 | cc_binary( 49 | NAME 50 | prefix_cache_benchmark 51 | SRCS 52 | prefix_cache_benchmark.cpp 53 | DEPS 54 | :kv_cache 55 | :prefix_cache 56 | :block 57 | benchmark::benchmark 58 | benchmark::benchmark_main 59 | ) 60 | 61 | target_link_libraries(prefix_cache_benchmark PRIVATE brpc OpenSSL::SSL OpenSSL::Crypto) 62 | add_dependencies(prefix_cache_benchmark brpc-static) 63 | -------------------------------------------------------------------------------- /xllm/core/kernels/mlu/random_sample.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include "mlu_ops_api.h" 17 | 18 | namespace xllm::kernel::mlu { 19 | 20 | torch::Tensor random_sample(const torch::Tensor& probs) { 21 | torch::Tensor flat_probs; 22 | if (probs.dim() == 3) { 23 | flat_probs = probs.reshape({-1, probs.size(2)}); 24 | } else { 25 | flat_probs = probs; 26 | } 27 | auto output = 28 | torch::empty({flat_probs.size(0), 1}, 29 | torch::dtype(torch::kInt64).device(probs.device())); 30 | tmo::torch_api::random_sample(flat_probs, output, true, torch::Generator()); 31 | if (probs.dim() == 3) { 32 | return output.reshape({probs.size(0), probs.size(1)}); 33 | } 34 | return output.flatten(); 35 | } 36 | 37 | } // namespace xllm::kernel::mlu -------------------------------------------------------------------------------- /xllm/core/layers/npu/loader/qwen3_decoder_loader.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 
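
The MLU sampler in xllm/core/kernels/mlu/random_sample.cpp above flattens an optional [batch, beam, vocab] probability tensor to 2-D before sampling one token id per row, then restores the shape. A sketch of just that shape handling, with torch::multinomial standing in for the on-device kernel tmo::torch_api::random_sample (a stated assumption for illustration):

```cpp
#include <iostream>
#include <torch/torch.h>

torch::Tensor random_sample_sketch(const torch::Tensor& probs) {
  // Collapse [batch, beam, vocab] to [batch * beam, vocab] if needed.
  torch::Tensor flat =
      probs.dim() == 3 ? probs.reshape({-1, probs.size(2)}) : probs;
  // One sampled token id per row; the real code calls the MLU kernel here.
  torch::Tensor out = torch::multinomial(flat, /*num_samples=*/1);
  if (probs.dim() == 3) {
    return out.reshape({probs.size(0), probs.size(1)});
  }
  return out.flatten();
}

int main() {
  auto probs = torch::softmax(torch::randn({2, 3, 16}), /*dim=*/-1);
  std::cout << random_sample_sketch(probs).sizes() << std::endl;  // [2, 3]
}
```
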
2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | #include "base_loader.h" 22 | 23 | namespace xllm { 24 | namespace layer { 25 | 26 | class Qwen3DecoderLoader : public BaseLoader { 27 | public: 28 | Qwen3DecoderLoader(uint64_t weight_count, 29 | const ModelContext& context, 30 | bool enableAddNorm); 31 | 32 | void load_state_dict(const StateDict& state_dict) override; 33 | void verify_loaded_weights() const override; 34 | void merge_loaded_weights() override; 35 | 36 | protected: 37 | torch::Tensor at_placeholder_; 38 | bool enableAddNorm_; 39 | int rank_id_; 40 | }; 41 | 42 | } // namespace layer 43 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/framework/request/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | cc_library( 5 | NAME 6 | request 7 | HDRS 8 | dit_request.h 9 | dit_request_params.h 10 | finish_reason.h 11 | incremental_decoder.h 12 | mm_codec.h 13 | mm_data.h 14 | mm_handler.h 15 | mm_input.h 16 | request_base.h 17 | request.h 18 | dit_request.h 19 | request_output.h 20 | dit_request_output.h 21 | dit_request_params.h 22 | request_params.h 23 | sequence.h 24 | sequence_logprob_state.h 25 | sequence_kv_state.h 26 | sequences_group.h 27 | request_state.h 28 | stopping_checker.h 29 | priority_comparator.h 30 | SRCS 31 | dit_request.cpp 32 | finish_reason.cpp 33 | incremental_decoder.cpp 34 | mm_codec.cpp 35 | mm_data.cpp 36 | mm_handler.cpp 37 | mm_input.cpp 38 | request.cpp 39 | dit_request.cpp 40 | request_output.cpp 41 | dit_request_output.cpp 42 | request_params.cpp 43 | dit_request_params.cpp 44 | sequence.cpp 45 | sequence_logprob_state.cpp 46 | sequence_kv_state.cpp 47 | sequences_group.cpp 48 | request_state.cpp 49 | stopping_checker.cpp 50 | priority_comparator.cpp 51 | DEPS 52 | :kv_cache 53 | :prefix_cache 54 | :block 55 | :tokenizer 56 | :chat_template 57 | glog::glog 58 | absl::strings 59 | absl::time 60 | proto::xllm_proto 61 | torch 62 | ${OpenCV_LIBS} 63 | ) 64 | 65 | -------------------------------------------------------------------------------- /xllm/core/framework/eplb/eplb_policy_test.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include "eplb_policy.h" 17 | 18 | #include 19 | #include 20 | #include 21 | 22 | namespace xllm { 23 | 24 | TEST(EplbPolicyTest, Build) { 25 | std::string rank_table_file; 26 | EplbPolicy eplb_policy(5, 4, 1); 27 | std::vector<torch::Tensor> tensors; 28 | tensors.push_back(torch::arange(0, 16)); 29 | 30 | auto expert_load = torch::stack(tensors, 0); 31 | expert_load[0] = 32 | torch::tensor({100, 100, 100, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 100}); 33 | auto [rebalance_expert, enable_update_vec] = 34 | eplb_policy.rebalance_experts(expert_load); 35 | LOG(INFO) << "rebalance_expert:" << rebalance_expert; 36 | } 37 | 38 | } // namespace xllm 39 | -------------------------------------------------------------------------------- /xllm/core/util/device_name_utils.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | 19 | #include 20 | 21 | #include 22 | 23 | namespace xllm { 24 | 25 | class DeviceNameUtils { 26 | public: 27 | static std::vector<torch::Device> parse_devices( 28 | const std::string& device_str); 29 | 30 | template <typename T> 31 | static std::string to_string(const std::vector<T>& items) { 32 | std::stringstream ss; 33 | for (size_t i = 0; i < items.size(); ++i) { 34 | const auto& item = items[i]; 35 | if (i == 0) { 36 | ss << item; 37 | } else { 38 | ss << "," << item; 39 | } 40 | } 41 | return ss.str(); 42 | } 43 | }; 44 | 45 | } // namespace xllm 46 | -------------------------------------------------------------------------------- /xllm/core/common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cc_library) 2 | include(cc_test) 3 | 4 | cc_library( 5 | NAME 6 | common 7 | HDRS 8 | etcd_client.h 9 | global_flags.h 10 | instance_name.h 11 | macros.h 12 | message.h 13 | metrics.h 14 | $<$:mspti_helper.h> 15 | options.h 16 | rate_limiter.h 17 | types.h 18 | device_monitor.h 19 | version_singleton.h 20 | SRCS 21 | etcd_client.cpp 22 | global_flags.cpp 23 | metrics.cpp 24 | $<$:mspti_helper.cpp> 25 | options.cpp 26 | rate_limiter.cpp 27 | device_monitor.cpp 28 | DEPS 29 | util 30 | absl::random_random 31 | absl::strings 32 | torch 33 | $<$:torch_npu> 34 | $<$:mspti> 35 | $<$:ms_tools_ext> 36 | Boost::serialization 37 | cpprest 38 | etcd-cpp-api 39 | $<$:torch_mlu> 40 | ) 41 | 42 | cc_library( 43 | NAME 44 | flags 45 | HDRS 46 | global_flags.h 47 | SRCS 48 | global_flags.cpp 49 | DEPS 50 | gflags::gflags 51 | ) 52 | 53 | cc_test( 54 | NAME 55 | common_test 56 | SRCS 57 | rate_limiter_test.cpp 58 | DEPS 59 | common 60 |
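
The EplbPolicy test above feeds rebalance_experts an expert_load tensor with one row per layer and one column per expert, skewing a few experts hot. A small self-contained sketch that builds the same input and computes a simple max/mean imbalance ratio (the ratio metric is illustrative, not the policy's internal criterion):

```cpp
#include <iostream>
#include <torch/torch.h>

int main() {
  // One layer, 16 experts; experts 0, 1, 2 and 15 are hot, as in the test.
  auto expert_load = torch::tensor(
      {{100, 100, 100, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 100}},
      torch::kFloat);
  auto imbalance = expert_load.max() / expert_load.mean();
  std::cout << "imbalance ratio: " << imbalance.item<float>() << std::endl;
  // ~3.67 here; a rebalancing policy would replicate or move the hot experts
  // so that per-device load approaches the mean.
}
```
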
absl::synchronization 61 | absl::time 62 | GTest::gtest_main 63 | gflags::gflags 64 | glog::glog 65 | ) 66 | target_link_libraries(common PRIVATE OpenSSL::SSL OpenSSL::Crypto protobuf::libprotobuf) 67 | add_dependencies(common brpc-static) 68 | 69 | 70 | -------------------------------------------------------------------------------- /xllm/core/common/interruption_bus.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | namespace xllm { 22 | class ForwardInterruptedException : public std::exception {}; 23 | 24 | class InterruptionBus { 25 | public: 26 | void subscribe(std::function func) { observers_.push_back(func); } 27 | 28 | void publish(bool interruption) { 29 | for (auto it = observers_.begin(); it != observers_.end(); ++it) { 30 | auto& observer = *it; 31 | observer(interruption); 32 | } 33 | } 34 | 35 | static InterruptionBus& get_instance() { 36 | static InterruptionBus instance; 37 | return instance; 38 | } 39 | 40 | private: 41 | std::vector> observers_; 42 | }; 43 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/layers/common/layer_utils.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
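
Usage of the InterruptionBus singleton shown above. The std::function template argument was lost in extraction, but publish() makes the observer signature evidently void(bool); the subscriber below is illustrative:

```cpp
#include <iostream>
// #include "core/common/interruption_bus.h"  // header shown above

int main() {
  auto& bus = xllm::InterruptionBus::get_instance();
  bus.subscribe([](bool interrupted) {
    if (interrupted) {
      std::cout << "forward interrupted, unwinding" << std::endl;
    }
  });
  bus.publish(true);  // fans the flag out to every registered observer
}
```
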
14 | ==============================================================================*/ 15 | 16 | #include "layer_utils.h" 17 | 18 | #include "framework/parallel_state/parallel_state.h" 19 | 20 | namespace xllm { 21 | namespace layer { 22 | 23 | void update_dummy_run_input(int64_t dp_rank, 24 | torch::Tensor& positions, 25 | ModelInputParams& input_params) { 26 | auto& dp_ranks = input_params.dp_global_token_nums; 27 | bool is_dummy_run = dp_ranks[dp_rank] == 0; 28 | for (size_t i = 0; i < dp_ranks.size(); i++) { 29 | if (dp_ranks[i] == 0) { 30 | dp_ranks[i] = 1; 31 | } 32 | } 33 | if (is_dummy_run) { 34 | positions = torch::tensor({1}).to(torch::kInt32).to(positions.device()); 35 | } 36 | } 37 | 38 | } // namespace layer 39 | } // namespace xllm 40 | -------------------------------------------------------------------------------- /xllm/core/layers/deepseek_v2_decoder_layer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "config.h" 19 | 20 | namespace xllm { 21 | namespace layer { 22 | 23 | // DeepSeek V3.2 uses a different structure, but 24 | // it is still compatible with DeepSeek V2. 25 | class DeepseekV2DecoderLayer 26 | : public torch::nn::ModuleHolder<DeepseekV2DecoderLayerImpl> { 27 | public: 28 | using torch::nn::ModuleHolder<DeepseekV2DecoderLayerImpl>::ModuleHolder; 29 | using Impl __attribute__((__unused__)) = DeepseekV2DecoderLayerImpl; 30 | 31 | DeepseekV2DecoderLayer(const ModelContext& context, const int32_t layer_id) 32 | : ModuleHolder( 33 | std::make_shared<DeepseekV2DecoderLayerImpl>(context, layer_id)) {} 34 | }; 35 | 36 | } // namespace layer 37 | } // namespace xllm 38 | -------------------------------------------------------------------------------- /xllm/parser/reasoning_parser.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
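
The dummy-run padding in layer_utils.cpp above is easiest to see with concrete numbers; a distilled, self-contained version of the loop:

```cpp
#include <cstdio>
#include <vector>

int main() {
  // dp_global_token_nums: tokens contributed by each data-parallel rank.
  std::vector<int> dp_global_token_nums = {7, 0, 12, 0};
  const int dp_rank = 1;  // this rank contributed nothing -> dummy run
  bool is_dummy_run = dp_global_token_nums[dp_rank] == 0;
  for (auto& n : dp_global_token_nums) {
    if (n == 0) n = 1;  // pad zeros so collectives stay shape-consistent
  }
  // {7, 0, 12, 0} -> {7, 1, 12, 1}; the dummy rank then feeds a single
  // placeholder position tensor through the layer.
  std::printf("dummy=%d padded={%d,%d,%d,%d}\n", is_dummy_run,
              dp_global_token_nums[0], dp_global_token_nums[1],
              dp_global_token_nums[2], dp_global_token_nums[3]);
}
```
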
15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | 22 | #include "parser/detector_registry.h" 23 | 24 | namespace xllm { 25 | 26 | class ReasoningParser { 27 | public: 28 | ReasoningParser(const std::string& model_type, 29 | bool stream_reasoning = true, 30 | bool force_reasoning = false); 31 | 32 | // Non-streaming call: one-time parsing 33 | ReasoningResult parse_non_stream(const std::string& text); 34 | // Streaming call: incremental parsing 35 | ReasoningResult parse_stream_chunk(const std::string& chunk_text); 36 | 37 | private: 38 | std::unique_ptr detector_; 39 | }; 40 | } // namespace xllm -------------------------------------------------------------------------------- /docs/en/features/topk_topp.md: -------------------------------------------------------------------------------- 1 | # Topk & Topp Operator Optimization 2 | 3 | ## Background 4 | 5 | In natural language generation tasks, the topK and topP sampling strategies are widely used to control the diversity and quality of generated text. However, in small models, the computation time for these two strategies is relatively long. This is mainly due to the fewer parameters in small models, which leads to reduced efficiency in sorting and filtering when processing probability distributions, thereby affecting generation speed. Therefore, optimizing the implementation of topK and topP in small models can enhance their sampling efficiency. 6 | 7 | ## Feature Introduction 8 | 9 | The implementation of the topKtopP operator merges multiple small operators, such as sorting, topK, softmax, and topP, into a single large operator, thereby improving computational efficiency and performance. 10 | 11 | ## User Interface 12 | 13 | ### Operator Call API 14 | 15 | ```c++ 16 | void top_k_top_p(torch::Tensor& logits, 17 | const torch::Tensor& topK, 18 | const torch::Tensor& topP); 19 | ``` 20 | 21 | - `logits`: The input logits tensor containing the model's output scores. 22 | - `topK`: The threshold tensor for selecting the top K probabilities. 23 | - `topP`: The threshold tensor for selecting the cumulative probabilities. 24 | 25 | ## Performance Effect 26 | 27 | * After using the topKtopP fused operator, in the qwen2-0.5B model, TTOT **decreased by 37%**, and TTFT **increased by 10%**. -------------------------------------------------------------------------------- /xllm/core/common/instance_name.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
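
A hedged call sketch for the fused top_k_top_p API documented in docs/en/features/topk_topp.md above. Only the signature comes from the doc; the per-row threshold layout ([batch]-shaped topK/topP tensors) and the omitted namespace qualification are assumptions:

```cpp
#include <torch/torch.h>
// top_k_top_p is declared in xllm/core/kernels/npu/xllm_ops/top_k_top_p.h
// (see the tree above); namespace qualification is omitted here.

void filter_logits(torch::Tensor& logits /*[batch, vocab], on device*/) {
  const int64_t batch = logits.size(0);
  auto top_k = torch::full({batch}, 50, logits.options().dtype(torch::kInt32));
  auto top_p = torch::full({batch}, 0.95, logits.options());
  // One fused kernel in place of separate sort, top-k, softmax, top-p ops;
  // logits outside the kept set are masked in place before sampling.
  top_k_top_p(logits, top_k, top_p);
}
```
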
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | 20 | namespace xllm { 21 | 22 | class InstanceName { 23 | public: 24 | static InstanceName* name() { 25 | static InstanceName n; 26 | return &n; 27 | } 28 | 29 | void set_name(const std::string& name) { 30 | name_ = name; 31 | name_hash_ = std::to_string(std::hash{}(name_)); 32 | } 33 | 34 | std::string get_name() const { return name_; } 35 | 36 | std::string get_name_hash() const { return name_hash_; } 37 | 38 | private: 39 | InstanceName() {} 40 | InstanceName(const InstanceName&) = delete; 41 | InstanceName& operator=(const InstanceName&) = delete; 42 | 43 | private: 44 | std::string name_; 45 | std::string name_hash_; 46 | }; 47 | 48 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/layers/npu/buffer/atb_workspace.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | 20 | #include 21 | #include 22 | 23 | #include "atb_buffer.h" 24 | 25 | namespace xllm { 26 | 27 | class AtbWorkspace { 28 | public: 29 | AtbWorkspace() = default; 30 | 31 | AtbWorkspace(at::Device device); 32 | 33 | ~AtbWorkspace(); 34 | 35 | AtbWorkspace(const AtbWorkspace&) = delete; 36 | 37 | AtbWorkspace& operator=(const AtbWorkspace&) = delete; 38 | 39 | AtbWorkspace(AtbWorkspace&&) = default; 40 | 41 | AtbWorkspace& operator=(AtbWorkspace&&) = default; 42 | 43 | void* get_workspace_buffer(uint64_t bufferSize); 44 | 45 | private: 46 | static std::map> buffer_map_; 47 | }; 48 | 49 | } // namespace xllm 50 | -------------------------------------------------------------------------------- /xllm/server/xllm_server_registry.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
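
Usage of the InstanceName singleton above (its std::hash template argument, presumably std::string, was eaten by extraction). The instance name string is illustrative:

```cpp
#include <iostream>
// #include "core/common/instance_name.h"  // header shown above

int main() {
  xllm::InstanceName::name()->set_name("prefill-0.cluster.local:9000");
  std::cout << xllm::InstanceName::name()->get_name() << "\n"
            << xllm::InstanceName::name()->get_name_hash() << std::endl;
}
```
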
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | #include "xllm_server.h" 22 | 23 | namespace xllm { 24 | 25 | class ServerRegistry { 26 | public: 27 | static ServerRegistry& get_instance() { 28 | static ServerRegistry instance; 29 | return instance; 30 | } 31 | 32 | XllmServer* register_server(const std::string& name); 33 | void unregister_server(const std::string& name); 34 | XllmServer* get_server(const std::string& name); 35 | 36 | private: 37 | ServerRegistry() = default; 38 | ~ServerRegistry() = default; 39 | DISALLOW_COPY_AND_ASSIGN(ServerRegistry); 40 | 41 | std::unordered_map> servers_; 42 | std::mutex mutex_; 43 | }; 44 | 45 | } // namespace xllm 46 | -------------------------------------------------------------------------------- /docs/en/features/ppmatmul.md: -------------------------------------------------------------------------------- 1 | # PpMatmul Operator Optimization 2 | 3 | ## Background 4 | 5 | In the inference of large models, matrix multiplication accounts for a high proportion and takes a long time. We have optimized the implementation of the matrix multiplication operator. 6 | 7 | ## Feature Introduction 8 | 9 | The PpMatmul operator uses a Tiling strategy to decompose matrix multiplication into multiple smaller matrix multiplication tasks. However, when the number of tiles is small, tasks cannot be evenly distributed across all NPU cores, leading to the tail effect problem, which affects computational efficiency. We optimize the performance of the PpMatmul operator by prefetching memory or redistributing tasks. 10 | 11 | ## User Interface 12 | 13 | ### Operator Direct Call API 14 | 15 | ```cpp 16 | aclnnStatus aclnnPpMatmulOptGetWorkspaceSize( 17 | const aclTensor *a, 18 | const aclTensor *b, 19 | const aclTensor *out, 20 | uint64_t *workspaceSize, 21 | aclOpExecutor **executor); 22 | 23 | aclnnStatus aclnnPpMatmulOpt( 24 | void *workspace, 25 | uint64_t workspaceSize, 26 | aclOpExecutor *executor, 27 | aclrtStream stream); 28 | ``` 29 | 30 | - `a`: Input matrix A. 31 | - `b`: Input matrix B. 32 | - `out`: Output matrix, storing the computation result. 33 | 34 | ## Performance Effect 35 | 36 | For cases with a small number of tiles (e.g., when M is small, corresponding to a small batch size), there is an **18%** performance improvement of the operator compared to before optimization when (TP=4). -------------------------------------------------------------------------------- /xllm/core/platform/npu/npu_layer_synchronizer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
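
The two aclnnPpMatmulOpt entry points documented above follow the standard two-phase aclnn calling convention: query the workspace size, allocate, then launch. A sketch under the assumption that a, b, out and stream have already been created (error handling trimmed):

```cpp
uint64_t workspace_size = 0;
aclOpExecutor* executor = nullptr;
// Phase 1: size the scratch buffer and obtain an executor for this call.
aclnnStatus st =
    aclnnPpMatmulOptGetWorkspaceSize(a, b, out, &workspace_size, &executor);

void* workspace = nullptr;
if (st == ACL_SUCCESS && workspace_size > 0) {
  aclrtMalloc(&workspace, workspace_size, ACL_MEM_MALLOC_HUGE_FIRST);
}
// Phase 2: enqueue the optimized matmul on the stream.
st = aclnnPpMatmulOpt(workspace, workspace_size, executor, stream);
aclrtSynchronizeStream(stream);  // wait for the matmul before reading out
if (workspace != nullptr) {
  aclrtFree(workspace);
}
```
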
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include <acl/acl.h> 19 | 20 | #include <atomic> 21 | #include <vector> 22 | 23 | namespace xllm { 24 | 25 | class NPULayerSynchronizerImpl { 26 | public: 27 | NPULayerSynchronizerImpl(const int64_t num_layers, 28 | const int32_t timeout = -1); 29 | virtual ~NPULayerSynchronizerImpl(); 30 | 31 | aclrtEvent* get_event(const int64_t layer_index); 32 | std::atomic<bool>* get_event_flag(const int64_t layer_index); 33 | bool synchronize_layer(const int64_t layer_index); 34 | uint32_t get_event_size() { return events_.size(); } 35 | 36 | private: 37 | std::vector<aclrtEvent> events_; 38 | std::vector<std::atomic<bool>> event_record_flags_; 39 | const int32_t timeout_; 40 | }; 41 | 42 | } // namespace xllm 43 | -------------------------------------------------------------------------------- /xllm/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib.util 2 | import os 3 | import xllm 4 | import sys 5 | 6 | def get_python_version(): 7 | return f"{sys.version_info.major}{sys.version_info.minor}" 8 | 9 | install_path_x86 = os.path.dirname(xllm.__file__) + f"/xllm_export.cpython-{get_python_version()}-x86_64-linux-gnu.so" 10 | install_path_arm = os.path.dirname(xllm.__file__) + f"/xllm_export.cpython-{get_python_version()}-aarch64-linux-gnu.so" 11 | if os.path.exists(install_path_x86): 12 | install_path = install_path_x86 13 | elif os.path.exists(install_path_arm): 14 | install_path = install_path_arm 15 | else: 16 | raise ValueError("cannot open shared object file: No such file or directory, required ", install_path_x86, " or ", install_path_arm) 17 | export_so_path = os.path.abspath(install_path) 18 | spec = importlib.util.spec_from_file_location("xllm_export", export_so_path) 19 | xllm_export = importlib.util.module_from_spec(spec) 20 | 21 | from xllm.pybind.embedding import Embedding 22 | from xllm.pybind.llm import LLM 23 | from xllm.pybind.vlm import VLM 24 | from xllm.pybind.args import ArgumentParser 25 | from xllm_export import (LLMMaster, Options, RequestParams, RequestOutput, 26 | SequenceOutput, Status, StatusCode, MMType, MMData) 27 | 28 | __all__ = [ 29 | "ArgumentParser", 30 | "Embedding", 31 | "LLM", 32 | "LLMMaster", 33 | "VLM", 34 | "VLMMaster", 35 | "Options", 36 | "RequestParams", 37 | "RequestOutput", 38 | "SequenceOutput", 39 | "Status", 40 | "StatusCode", 41 | ] 42 | -------------------------------------------------------------------------------- /xllm/core/framework/request/finish_reason.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License.
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | namespace xllm { 22 | class FinishReason { 23 | public: 24 | enum Value : uint8_t { NONE = 0, STOP = 1, LENGTH, FUNCTION_CALL }; 25 | 26 | FinishReason() = default; 27 | FinishReason(Value v) : value(v) {} 28 | operator Value() const { return value; } 29 | explicit operator bool() const = delete; 30 | 31 | bool operator==(FinishReason rhs) const { return value == rhs.value; } 32 | bool operator!=(FinishReason rhs) const { return value != rhs.value; } 33 | 34 | bool operator==(Value v) const { return value == v; } 35 | bool operator!=(Value v) const { return value != v; } 36 | 37 | std::optional to_string(); 38 | 39 | private: 40 | Value value; 41 | }; 42 | } // namespace xllm 43 | -------------------------------------------------------------------------------- /xllm/core/util/pretty_print.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | ==============================================================================*/ 16 | 17 | #include "pretty_print.h" 18 | 19 | #include 20 | #include 21 | #include 22 | 23 | namespace xllm { 24 | 25 | std::string readable_size(size_t bytes) { 26 | static const std::array suffixes = { 27 | "B", "KB", "MB", "GB", "TB"}; 28 | const size_t bytes_in_kb = 1024; 29 | double size = static_cast(bytes); 30 | size_t suffix_index = 0; 31 | while (size >= bytes_in_kb && suffix_index < suffixes.size() - 1) { 32 | size /= bytes_in_kb; 33 | ++suffix_index; 34 | } 35 | std::stringstream stream; 36 | stream << std::fixed << std::setprecision(2) << size << " " 37 | << suffixes.at(suffix_index); 38 | return stream.str(); 39 | } 40 | 41 | } // namespace xllm 42 | -------------------------------------------------------------------------------- /xllm/api_service/models_service_impl.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
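
Walking the loop in readable_size() above: divide by 1024 and advance one suffix until the value drops below 1024, then print with two decimals. Expected outputs, assuming util/pretty_print.h (listed in the tree) declares the function:

```cpp
#include <iostream>
// #include "util/pretty_print.h"  // declares xllm::readable_size, defined above

int main() {
  std::cout << xllm::readable_size(512) << "\n";         // "512.00 B"
  std::cout << xllm::readable_size(1536) << "\n";        // "1.50 KB"
  std::cout << xllm::readable_size(3ULL << 30) << "\n";  // "3.00 GB"
}
```
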
15 | ==============================================================================*/ 16 | 17 | #pragma once 18 | 19 | #include 20 | 21 | #include "core/common/macros.h" 22 | #include "models.pb.h" 23 | 24 | namespace xllm { 25 | 26 | class ModelsServiceImpl final { 27 | public: 28 | ModelsServiceImpl(const std::vector& model_names, 29 | const std::vector& model_versions); 30 | 31 | bool list_models(const proto::ModelListRequest* request, 32 | proto::ModelListResponse* response); 33 | std::string list_model_versions(); 34 | 35 | private: 36 | DISALLOW_COPY_AND_ASSIGN(ModelsServiceImpl); 37 | 38 | std::vector model_names_; 39 | std::vector model_versions_; 40 | uint32_t created_; 41 | }; 42 | 43 | } // namespace xllm 44 | -------------------------------------------------------------------------------- /xllm/core/kernels/npu/xllm_ops/acltensor_utils.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | #include 22 | 23 | #include "acl/acl.h" 24 | #include "util/tensor_helper.h" 25 | 26 | namespace xllm_ops_utils { 27 | struct type_info { 28 | static aclDataType get_acl_type(const torch::ScalarType& dtype); 29 | }; 30 | 31 | void create_acltensor(aclTensor** tensor, const torch::Tensor& tensor_data); 32 | void check_tensor(const torch::Tensor& t, 33 | const std::string& name, 34 | const std::string& func_name = ""); 35 | void check_tensor_shapes_equal(const torch::Tensor& a, 36 | const torch::Tensor& b, 37 | const std::string& func_name = ""); 38 | } // namespace xllm_ops_utils -------------------------------------------------------------------------------- /xllm/core/util/timer.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | ==============================================================================*/ 16 | 17 | #include "timer.h" 18 | 19 | #include 20 | #include 21 | 22 | namespace xllm { 23 | 24 | Timer::Timer() : start_(absl::Now()) {} 25 | 26 | // reset the timer 27 | void Timer::reset() { start_ = absl::Now(); } 28 | 29 | // get the elapsed time in seconds 30 | double Timer::elapsed_seconds() const { 31 | return absl::ToDoubleSeconds(absl::Now() - start_); 32 | } 33 | 34 | // get the elapsed time in milliseconds 35 | double Timer::elapsed_milliseconds() const { 36 | return absl::ToDoubleMilliseconds(absl::Now() - start_); 37 | } 38 | 39 | // get the elapsed time in microseconds 40 | double Timer::elapsed_microseconds() const { 41 | return absl::ToDoubleMicroseconds(absl::Now() - start_); 42 | } 43 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/framework/dit_cache/dit_non_cache.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | #include "dit_cache_impl.h" 18 | 19 | namespace xllm { 20 | 21 | class DiTNonCache : public DitCacheImpl { 22 | public: 23 | DiTNonCache() = default; 24 | ~DiTNonCache() override = default; 25 | 26 | DiTNonCache(const DiTNonCache&) = delete; 27 | DiTNonCache& operator=(const DiTNonCache&) = delete; 28 | DiTNonCache(DiTNonCache&&) = default; 29 | DiTNonCache& operator=(DiTNonCache&&) = default; 30 | 31 | void init(const DiTCacheConfig& cfg) override; 32 | 33 | bool on_before_block(const CacheBlockIn& blockin) override; 34 | CacheBlockOut on_after_block(const CacheBlockIn& blockin) override; 35 | 36 | bool on_before_step(const CacheStepIn& stepin) override; 37 | CacheStepOut on_after_step(const CacheStepIn& stepin) override; 38 | }; 39 | 40 | } // namespace xllm 41 | -------------------------------------------------------------------------------- /xllm/core/framework/xtensor/options.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
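
Usage of the Timer defined in xllm/core/util/timer.cpp above (declared in util/timer.h per the tree): construction starts the clock, reset() restarts it, and the elapsed_* accessors report the window in different units:

```cpp
#include <chrono>
#include <iostream>
#include <thread>
// #include "util/timer.h"  // declares xllm::Timer, defined above

int main() {
  xllm::Timer timer;  // start_ = absl::Now() at construction
  std::this_thread::sleep_for(std::chrono::milliseconds(25));
  std::cout << timer.elapsed_milliseconds() << " ms" << std::endl;  // ~25
  timer.reset();  // new measurement window
  std::cout << timer.elapsed_seconds() << " s" << std::endl;        // ~0
}
```
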
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | 20 | #include 21 | 22 | #include "common/macros.h" 23 | 24 | namespace xllm { 25 | namespace xtensor { 26 | struct Options { 27 | // devices for xtensor manager pool 28 | PROPERTY(std::vector, devices); 29 | 30 | // num of layers 31 | PROPERTY(int64_t, num_layers) = 0; 32 | 33 | // total pages for xtensor manager 34 | PROPERTY(int64_t, num_total_pages) = 0; 35 | 36 | // key or value cache size in bytes per token 37 | PROPERTY(int64_t, cache_size_per_token) = 0; 38 | 39 | // Index ID for internal server ID, which must be set different values 40 | // if the model supports multiple version or there are multiple models. 41 | PROPERTY(int64_t, server_idx) = 0; 42 | }; 43 | } // namespace xtensor 44 | } // namespace xllm -------------------------------------------------------------------------------- /docs/mkdocs/overrides/.icons/gitcodeai.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /xllm/api_service/api_service_impl.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | #include 18 | 19 | #include 20 | 21 | #include "call.h" 22 | #include "core/distributed_runtime/llm_master.h" 23 | 24 | namespace xllm { 25 | 26 | template 27 | class APIServiceImpl { 28 | public: 29 | APIServiceImpl(const std::vector& models) 30 | : models_(models.begin(), models.end()) { 31 | CHECK(!models_.empty()); 32 | } 33 | virtual ~APIServiceImpl() = default; 34 | 35 | void process_async(std::shared_ptr call) { 36 | std::shared_ptr call_cast = std::dynamic_pointer_cast(call); 37 | process_async_impl(call_cast); 38 | } 39 | 40 | virtual void process_async_impl(std::shared_ptr call) = 0; 41 | 42 | protected: 43 | absl::flat_hash_set models_; 44 | }; 45 | 46 | } // namespace xllm 47 | -------------------------------------------------------------------------------- /xllm/core/layers/npu/loader/qwen3_decoder_manual_loader.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | #include "base_manual_loader.h" 22 | #include "core/layers/npu/npu_base_layer.h" 23 | 24 | namespace xllm { 25 | namespace layer { 26 | 27 | class Qwen3DecoderManualLoader : public BaseManualLoader { 28 | public: 29 | Qwen3DecoderManualLoader(uint64_t weight_count, 30 | const ModelContext& context, 31 | bool enableAddNorm); 32 | 33 | void load_state_dict(const StateDict& state_dict) override; 34 | void verify_loaded_weights() const override; 35 | void merge_loaded_weights() override; 36 | 37 | protected: 38 | void merge_host_at_weights(); 39 | at::Tensor at_placeholder_; 40 | bool enableAddNorm_; 41 | int rank_id_; 42 | }; 43 | 44 | } // namespace layer 45 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/framework/dit_cache/dit_cache.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include "dit_cache.h" 17 | 18 | namespace xllm { 19 | 20 | bool DiTCache::init(const DiTCacheConfig& cfg) { 21 | active_cache_ = create_dit_cache(cfg); 22 | if (!active_cache_) { 23 | return false; 24 | } 25 | active_cache_->init(cfg); 26 | return true; 27 | } 28 | 29 | bool DiTCache::on_before_block(const CacheBlockIn& blockin) { 30 | return active_cache_->on_before_block(blockin); 31 | } 32 | 33 | CacheBlockOut DiTCache::on_after_block(const CacheBlockIn& blockin) { 34 | return active_cache_->on_after_block(blockin); 35 | } 36 | 37 | bool DiTCache::on_before_step(const CacheStepIn& stepin) { 38 | return active_cache_->on_before_step(stepin); 39 | } 40 | 41 | CacheStepOut DiTCache::on_after_step(const CacheStepIn& stepin) { 42 | return active_cache_->on_after_step(stepin); 43 | } 44 | 45 | } // namespace xllm 46 | -------------------------------------------------------------------------------- /xllm/parser/reasoning_parser.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | Copyright 2024 The ScaleLLM Authors. All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
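
dit_cache.cpp above delegates every hook to the implementation selected by create_dit_cache(): init once, then bracket each denoising step and each transformer block with before/after hooks. A hedged sketch of that call protocol; DiTCacheConfig and the Cache*In/Out structs are opaque here, so their construction is elided and the boolean semantics are implementation-specific:

```cpp
// Hedged usage sketch of the DiTCache hook protocol implemented above.
xllm::DiTCacheConfig cfg;   // selects an implementation (e.g. DiTNonCache)
xllm::DiTCache cache;
bool ok = cache.init(cfg);  // false if no implementation could be created

xllm::CacheStepIn step_in;  // one denoising step's inputs (fields elided)
bool step_hint = cache.on_before_step(step_in);  // impl-specific hint
// ... run the transformer blocks, bracketing each with
// on_before_block(...) / on_after_block(...) ...
xllm::CacheStepOut step_out = cache.on_after_step(step_in);
```
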
13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | ==============================================================================*/ 16 | 17 | #include "xllm/parser/reasoning_parser.h" 18 | 19 | namespace xllm { 20 | ReasoningParser::ReasoningParser(const std::string& model_type, 21 | bool stream_reasoning, 22 | bool force_reasoning) { 23 | detector_ = DetectorRegistry::getInstance().getDetector( 24 | model_type, stream_reasoning, force_reasoning); 25 | } 26 | 27 | ReasoningResult ReasoningParser::parse_non_stream(const std::string& text) { 28 | return detector_->detect_and_parse(const_cast(text)); 29 | } 30 | 31 | ReasoningResult ReasoningParser::parse_stream_chunk( 32 | const std::string& chunk_text) { 33 | return detector_->parse_streaming_increment( 34 | const_cast(chunk_text)); 35 | } 36 | 37 | } // namespace xllm -------------------------------------------------------------------------------- /xllm/core/framework/batch/mposition.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include 19 | 20 | #include 21 | #include 22 | 23 | namespace xllm { 24 | 25 | class Sequence; 26 | struct ModelArgs; 27 | 28 | class MPositionHelper { 29 | public: 30 | MPositionHelper(Sequence& seq, const ModelArgs& args) 31 | : seq_(seq), args_(args) {} 32 | 33 | torch::Tensor get_positions(); 34 | 35 | private: 36 | std::tuple get_positions_p( 37 | torch::Tensor image_grid_thw, 38 | torch::Tensor video_grid_thw, 39 | torch::Tensor second_per_grid_ts); 40 | std::tuple get_positions_glm( 41 | torch::Tensor image_grid_thw, 42 | torch::Tensor video_grid_thw); 43 | 44 | torch::Tensor get_positions_d(); 45 | 46 | private: 47 | Sequence& seq_; 48 | const ModelArgs& args_; 49 | }; 50 | 51 | } // namespace xllm 52 | -------------------------------------------------------------------------------- /xllm/core/distributed_runtime/pd_ooc_service_impl.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2025 The xLLM Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://github.com/jd-opensource/xllm/blob/main/LICENSE 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
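
Usage of the ReasoningParser above in both modes. The "qwen3" model_type string and the <think> tag format are assumptions for illustration; which detector actually handles the text depends on DetectorRegistry:

```cpp
#include <string>
// #include "xllm/parser/reasoning_parser.h"  // shown above

int main() {
  // One-shot: split a finished completion into reasoning and answer parts.
  xllm::ReasoningParser parser("qwen3", /*stream_reasoning=*/true);
  auto result = parser.parse_non_stream("<think>plan the answer</think>42");

  // Incremental: feed decoder chunks as they arrive.
  xllm::ReasoningParser stream_parser("qwen3");
  stream_parser.parse_stream_chunk("<think>plan the ");
  stream_parser.parse_stream_chunk("answer</think>4");
  stream_parser.parse_stream_chunk("2");
}
```
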
14 | ==============================================================================*/ 15 | 16 | #pragma once 17 | 18 | #include "disagg_pd.pb.h" 19 | #include "disagg_pd_service_impl.h" 20 | 21 | namespace xllm { 22 | 23 | class Engine; 24 | class Request; 25 | class PDOOCScheduler; 26 | 27 | // a class to handle disagg_pd OOC requests 28 | class PDOOCServiceImpl final : public DisaggPDServiceImpl { 29 | public: 30 | explicit PDOOCServiceImpl(PDOOCScheduler* scheduler, Engine* engine); 31 | ~PDOOCServiceImpl() = default; 32 | 33 | virtual void decode_recv_multi_generations( 34 | const proto::DisaggGenerationsRequests* request, 35 | proto::Status* response); 36 | 37 | virtual void prefill_recv_pull_signal(const proto::PullSignal* request, 38 | proto::Status* response); 39 | 40 | private: 41 | PDOOCScheduler* pd_ooc_scheduler_; // not owned 42 | }; 43 | 44 | } // namespace xllm 45 | --------------------------------------------------------------------------------
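
Finally, usage of the ServerRegistry singleton from xllm/server/xllm_server_registry.h above; the server name is illustrative:

```cpp
// #include "server/xllm_server_registry.h"  // header shown above

void example() {
  auto& registry = xllm::ServerRegistry::get_instance();
  xllm::XllmServer* server = registry.register_server("http_api");
  // ... configure and start the server ...
  xllm::XllmServer* same = registry.get_server("http_api");  // lookup by name
  registry.unregister_server("http_api");
}
```
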