├── tests ├── __init__.py ├── e2e │ ├── __init__.py │ ├── nightly │ │ ├── ops │ │ │ ├── __init__.py │ │ │ ├── triton │ │ │ │ └── __init__.py │ │ │ ├── test_gating_top_k_softmax.py │ │ │ ├── test_bgmv_shrink.py │ │ │ └── test_bgmv_expand.py │ │ └── multi_node │ │ │ ├── __init__.py │ │ │ └── config │ │ │ ├── __init__.py │ │ │ └── models │ │ │ └── DeepSeek-V3_2-Exp-bf16.yaml │ ├── singlecard │ │ ├── __init__.py │ │ ├── pooling │ │ │ ├── __init__.py │ │ │ └── test_classification.py │ │ └── test_quantization.py │ ├── vllm_interface │ │ ├── vllm_test.cfg │ │ └── singlecard │ │ │ └── test_sampler.py │ ├── models │ │ ├── configs │ │ │ ├── ERNIE-4.5-21B-A3B-PT.yaml │ │ │ ├── Qwen2.5-Omni-7B.yaml │ │ │ ├── Qwen3-8B.yaml │ │ │ ├── Qwen3-VL-8B-Instruct.yaml │ │ │ ├── Llama-3.2-3B-Instruct.yaml │ │ │ ├── InternVL3_5-8B-hf.yaml │ │ │ ├── llava-onevision-qwen2-0.5b-ov-hf.yaml │ │ │ ├── Qwen3-Omni-30B-A3B-Instruct.yaml │ │ │ ├── Qwen2-Audio-7B-Instruct.yaml │ │ │ ├── Qwen3-8B-W8A8.yaml │ │ │ ├── Qwen3-VL-30B-A3B-Instruct.yaml │ │ │ ├── Molmo-7B-D-0924.yaml │ │ │ ├── internlm3-8b-instruct.yaml │ │ │ ├── gemma-3-4b-it.yaml │ │ │ ├── Qwen3-Next-80B-A3B-Instruct.yaml │ │ │ ├── accuracy.txt │ │ │ ├── Qwen3-30B-A3B-W8A8.yaml │ │ │ └── Qwen3-30B-A3B.yaml │ │ └── report_template.md │ ├── prompts │ │ └── example.txt │ ├── multicard │ │ ├── test_ilama_lora_tp2.py │ │ ├── test_expert_parallel.py │ │ ├── test_data_parallel_tp2.py │ │ └── test_chunk_gated_delta_rule.py │ └── run_doctests.sh └── ut │ ├── __init__.py │ ├── distributed │ └── test_determin_expert_map_all.py │ ├── ops │ └── expert_map.json │ ├── fake_weight │ └── config.json │ ├── eplb │ └── core │ │ └── policy │ │ ├── test_policy_factor.py │ │ └── test_policy_abstract.py │ ├── conftest.py │ ├── sample │ └── test_sampler.py │ └── base.py ├── vllm_ascend ├── core │ └── __init__.py ├── eplb │ ├── __init__.py │ ├── core │ │ ├── __init__.py │ │ └── policy │ │ │ ├── __init__.py │ │ │ ├── policy_random.py │ │ │ ├── policy_abstract.py │ │ │ └── policy_factory.py │ └── adaptor │ │ ├── __init__.py │ │ └── abstract_adaptor.py ├── lora │ └── __init__.py ├── attention │ └── __init__.py ├── sample │ └── __init__.py ├── worker │ ├── __init__.py │ └── v2 │ │ ├── __init__.py │ │ ├── input_batch.py │ │ └── utils.py ├── xlite │ ├── __init__.py │ ├── xlite_worker.py │ └── xlite_model_runner.py ├── compilation │ ├── __init__.py │ ├── passes │ │ └── __init__.py │ └── npugraph_ex_passes │ │ └── __init__.py ├── kv_offload │ └── __init__.py ├── model_loader │ ├── __init__.py │ └── netloader │ │ ├── executor │ │ └── __init__.py │ │ ├── interaction │ │ └── __init__.py │ │ └── __init__.py ├── ops │ ├── fused_moe │ │ └── __init__.py │ ├── triton │ │ ├── __init__.py │ │ ├── fla │ │ │ └── __init__.py │ │ ├── mamba │ │ │ └── __init__.py │ │ ├── linearnorm │ │ │ └── __init__.py │ │ └── triton_utils.py │ └── activation.py ├── quantization │ ├── __init__.py │ └── compressed_tensors │ │ └── __init__.py ├── device_allocator │ └── __init__.py ├── distributed │ ├── kvpool │ │ ├── __init__.py │ │ └── backend │ │ │ ├── __init__.py │ │ │ └── backend.py │ ├── cpu_offload_manager │ │ └── __init__.py │ ├── device_communicators │ │ └── __init__.py │ └── __init__.py ├── _cann_ops_custom │ └── .gitkeep ├── patch │ ├── platform │ │ ├── patch_sched_yield.py │ │ ├── __init__.py │ │ └── patch_ec_connector.py │ └── worker │ │ ├── patch_rejection_sampler.py │ │ ├── patch_triton.py │ │ ├── patch_module.py │ │ ├── patch_rope.py │ │ ├── patch_minicpm.py │ │ ├── patch_bert.py │ │ └── __init__.py ├── __init__.py 
├── flash_common3_context.py └── spec_decode │ └── __init__.py ├── packages.txt ├── docs ├── requirements-test.txt ├── source │ ├── assets │ │ ├── eplb.png │ │ ├── workflow.png │ │ ├── deployment.png │ │ ├── multi_node_dp_kimi.png │ │ ├── multi_node_dp_deepseek.png │ │ ├── disaggregated_prefill_pull.png │ │ └── disaggregated_prefill_push.png │ ├── logos │ │ ├── vllm-ascend-logo-text-dark.png │ │ └── vllm-ascend-logo-text-light.png │ ├── user_guide │ │ ├── feature_guide │ │ │ ├── images │ │ │ │ ├── eplb_img.png │ │ │ │ ├── netloader_flowchart.png │ │ │ │ ├── structured_output_1.png │ │ │ │ └── netloader_timing_diagram.png │ │ │ ├── index.md │ │ │ ├── structured_output.md │ │ │ └── lora.md │ │ ├── configuration │ │ │ ├── index.md │ │ │ └── env_vars.md │ │ └── support_matrix │ │ │ └── index.md │ ├── developer_guide │ │ ├── evaluation │ │ │ └── index.md │ │ ├── performance_and_debug │ │ │ └── index.md │ │ └── feature_guide │ │ │ ├── index.md │ │ │ └── add_custom_aclnn_op.md │ ├── tutorials │ │ └── index.md │ ├── locale │ │ └── zh_CN │ │ │ └── LC_MESSAGES │ │ │ ├── tutorials │ │ │ └── index.po │ │ │ ├── developer_guide │ │ │ ├── evaluation │ │ │ │ ├── index.po │ │ │ │ └── accuracy_report │ │ │ │ │ └── index.po │ │ │ ├── performance_and_debug │ │ │ │ └── index.po │ │ │ ├── modeling │ │ │ │ ├── adding_a_new_multimodal_model.po │ │ │ │ └── index.po │ │ │ └── feature_guide │ │ │ │ └── index.po │ │ │ └── user_guide │ │ │ ├── configuration │ │ │ ├── env_vars.po │ │ │ └── index.po │ │ │ ├── feature_guide │ │ │ └── index.po │ │ │ └── support_matrix │ │ │ └── index.po │ └── community │ │ └── user_stories │ │ └── llamafactory.md ├── requirements-docs.txt ├── README.md └── Makefile ├── .github ├── ISSUE_TEMPLATE │ ├── config.yml │ ├── 100-documentation.yml │ ├── 800-others.yml │ ├── 500-feature-request.yml │ ├── 110-user-story.yml │ ├── 750-RFC.yml │ ├── 600-new-model.yml │ ├── 200-installation.yml │ └── 300-usage.yml ├── dependabot.yml ├── workflows │ ├── matchers │ │ ├── mypy.json │ │ └── actionlint.json │ ├── bot_merge_conflict.yaml │ ├── schedule_test_vllm_main.yaml │ └── _pre_commit.yml ├── actionlint.yaml ├── labeler.yml ├── PULL_REQUEST_TEMPLATE.md ├── Dockerfile.nightly.a2 ├── Dockerfile.nightly.a3 └── Dockerfile.buildwheel ├── benchmarks ├── requirements-bench.txt └── tests │ ├── latency-tests.json │ └── throughput-tests.json ├── .gitmodules ├── requirements-lint.txt ├── csrc ├── dispatch_ffn_combine │ ├── op_kernel │ │ ├── utils │ │ │ ├── const_args.hpp │ │ │ ├── layout3d.hpp │ │ │ ├── select_helper.hpp │ │ │ ├── copy_gm_to_l1_custom.hpp │ │ │ └── dispatch_policy_custom.hpp │ │ └── unpermute │ │ │ └── moe_token_unpermute_tiling.h │ └── op_host │ │ ├── tiling_args.h │ │ ├── dispatch_ffn_combine_proto.cpp │ │ └── error_log.h ├── kernels │ ├── math_utils.h │ ├── types.h │ └── utils.h ├── dispatch_layout │ ├── op_kernel │ │ ├── dispatch_layout_tiling.h │ │ └── dispatch_layout.cpp │ └── op_host │ │ └── CMakeLists.txt ├── notify_dispatch │ ├── op_kernel │ │ └── notify_dispatch_tiling.h │ └── op_host │ │ ├── aclnn_notify_dispatch.h │ │ └── CMakeLists.txt ├── utils │ └── inc │ │ ├── aclnn_util.h │ │ ├── error │ │ └── ops_error.h │ │ └── fallback_comm.h ├── cmake │ └── intf.cmake ├── dispatch_gmm_combine_decode │ └── op_kernel │ │ ├── dispatch_gmm_combine_decode │ │ ├── gemm │ │ │ ├── block │ │ │ │ └── block_mmad.h │ │ │ └── dispatch_policy.h │ │ └── epilogue │ │ │ ├── block │ │ │ └── block_epilogue.h │ │ │ └── dispatch_policy.h │ │ └── dispatch_gmm_combine_decode_base.h ├── mla_preprocess │ └── 
op_kernel │ │ └── kernel │ │ ├── layout.h │ │ ├── common.h │ │ ├── hardware.h │ │ ├── set_fpc.h │ │ └── iterators │ │ └── l1_to_fb_iterator.inc ├── moe_combine_normal │ ├── op_kernel │ │ ├── moe_combine_normal_tiling.h │ │ └── moe_combine_normal.cpp │ └── op_host │ │ └── CMakeLists.txt ├── aclnn_torch_adapter │ ├── NPUBridge.h │ └── NPUBridge.cpp ├── moe_dispatch_normal │ ├── op_host │ │ ├── aclnn_moe_dispatch_normal.h │ │ └── CMakeLists.txt │ └── op_kernel │ │ └── moe_dispatch_normal_tiling.h ├── grouped_matmul_swiglu_quant_weight_nz_tensor_list │ └── op_host │ │ └── grouped_matmul_swiglu_quant_weight_nz_tensor_list.h ├── sparse_flash_attention │ └── op_host │ │ └── CMakeLists.txt ├── matmul_allreduce_add_rmsnorm │ └── op_host │ │ └── aclnn_matmul_allreduce_add_rmsnorm.h └── lightning_indexer │ └── op_host │ └── CMakeLists.txt ├── .gemini └── config.yaml ├── CONTRIBUTING.md ├── .readthedocs.yaml ├── requirements-dev.txt ├── requirements.txt ├── tools ├── send_request.py ├── sphinx-lint.sh ├── check_repo.sh ├── png-lint.sh ├── mypy.sh ├── actionlint.sh ├── send_mm_request.py └── shellcheck.sh ├── mypy.ini ├── examples ├── run_dp_server.sh ├── chat_templates │ └── template_qwen2_audio.jinja ├── external_online_dp │ └── run_dp_template.sh └── offline_inference_npu.py ├── codecov.yml ├── pyproject.toml ├── DCO └── format.sh /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/ut/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/eplb/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/lora/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e/nightly/ops/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e/singlecard/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/attention/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/eplb/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/sample/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/worker/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/worker/v2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/xlite/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/compilation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/eplb/adaptor/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/kv_offload/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/model_loader/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/ops/fused_moe/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/ops/triton/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/ops/triton/fla/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e/nightly/multi_node/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e/nightly/ops/triton/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e/singlecard/pooling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/compilation/passes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/device_allocator/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/eplb/core/policy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/ops/triton/mamba/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e/nightly/multi_node/config/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/distributed/kvpool/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /vllm_ascend/ops/triton/linearnorm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /packages.txt: -------------------------------------------------------------------------------- 1 | git 2 | vim 3 | wget 4 | jq 5 | curl 6 | -------------------------------------------------------------------------------- /tests/ut/distributed/test_determin_expert_map_all.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/compilation/npugraph_ex_passes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/distributed/cpu_offload_manager/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/distributed/device_communicators/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/distributed/kvpool/backend/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /vllm_ascend/model_loader/netloader/executor/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/model_loader/netloader/interaction/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm_ascend/quantization/compressed_tensors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/requirements-test.txt: -------------------------------------------------------------------------------- 1 | pytest-asyncio 2 | pytest-mock 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | -------------------------------------------------------------------------------- /benchmarks/requirements-bench.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | datasets 3 | modelscope 4 | tabulate -------------------------------------------------------------------------------- 
/docs/source/assets/eplb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/vllm-ascend/HEAD/docs/source/assets/eplb.png -------------------------------------------------------------------------------- /docs/source/assets/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/vllm-ascend/HEAD/docs/source/assets/workflow.png -------------------------------------------------------------------------------- /docs/source/assets/deployment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/vllm-ascend/HEAD/docs/source/assets/deployment.png -------------------------------------------------------------------------------- /docs/source/assets/multi_node_dp_kimi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/vllm-ascend/HEAD/docs/source/assets/multi_node_dp_kimi.png -------------------------------------------------------------------------------- /docs/source/assets/multi_node_dp_deepseek.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/vllm-ascend/HEAD/docs/source/assets/multi_node_dp_deepseek.png -------------------------------------------------------------------------------- /docs/source/assets/disaggregated_prefill_pull.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/vllm-ascend/HEAD/docs/source/assets/disaggregated_prefill_pull.png -------------------------------------------------------------------------------- /docs/source/assets/disaggregated_prefill_push.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/vllm-ascend/HEAD/docs/source/assets/disaggregated_prefill_push.png -------------------------------------------------------------------------------- /docs/source/logos/vllm-ascend-logo-text-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/vllm-ascend/HEAD/docs/source/logos/vllm-ascend-logo-text-dark.png -------------------------------------------------------------------------------- /docs/source/logos/vllm-ascend-logo-text-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/vllm-ascend/HEAD/docs/source/logos/vllm-ascend-logo-text-light.png -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "csrc/third_party/catlass"] 2 | path = csrc/third_party/catlass 3 | url = https://gitcode.com/cann/catlass.git 4 | branch = catlass-v1-stable 5 | -------------------------------------------------------------------------------- /docs/source/user_guide/feature_guide/images/eplb_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/vllm-ascend/HEAD/docs/source/user_guide/feature_guide/images/eplb_img.png -------------------------------------------------------------------------------- /requirements-lint.txt: 
-------------------------------------------------------------------------------- 1 | # formatting 2 | pre-commit==4.0.1 3 | 4 | # type checking 5 | mypy==1.11.1 6 | types-PyYAML 7 | types-regex 8 | types-requests 9 | types-setuptools 10 | -------------------------------------------------------------------------------- /docs/source/user_guide/feature_guide/images/netloader_flowchart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/vllm-ascend/HEAD/docs/source/user_guide/feature_guide/images/netloader_flowchart.png -------------------------------------------------------------------------------- /docs/source/user_guide/feature_guide/images/structured_output_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/vllm-ascend/HEAD/docs/source/user_guide/feature_guide/images/structured_output_1.png -------------------------------------------------------------------------------- /docs/source/user_guide/feature_guide/images/netloader_timing_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/vllm-ascend/HEAD/docs/source/user_guide/feature_guide/images/netloader_timing_diagram.png -------------------------------------------------------------------------------- /docs/source/developer_guide/evaluation/index.md: -------------------------------------------------------------------------------- 1 | # Accuracy 2 | 3 | :::{toctree} 4 | :caption: Accuracy 5 | :maxdepth: 1 6 | using_evalscope 7 | using_lm_eval 8 | using_ais_bench 9 | using_opencompass 10 | ::: 11 | -------------------------------------------------------------------------------- /tests/e2e/vllm_interface/vllm_test.cfg: -------------------------------------------------------------------------------- 1 | # Base docker image used to build the vllm-ascend e2e test image, which is built in the vLLM repository 2 | BASE_IMAGE_NAME="quay.io/ascend/cann:8.3.rc2-910b-ubuntu22.04-py3.11" 3 | -------------------------------------------------------------------------------- /csrc/dispatch_ffn_combine/op_kernel/utils/const_args.hpp: -------------------------------------------------------------------------------- 1 | 2 | #ifndef CONST_ARGS_HPP 3 | #define CONST_ARGS_HPP 4 | constexpr static uint64_t MB_SIZE = 1024 * 1024UL; 5 | constexpr static int32_t NUMS_PER_FLAG = 16; 6 | #endif -------------------------------------------------------------------------------- /docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | sphinx-argparse 3 | sphinx-book-theme 4 | sphinx-copybutton 5 | sphinx-design 6 | sphinx-togglebutton 7 | myst-parser 8 | msgspec 9 | sphinx-substitution-extensions 10 | sphinx-intl -------------------------------------------------------------------------------- /docs/source/user_guide/configuration/index.md: -------------------------------------------------------------------------------- 1 | # Configuration Guide 2 | 3 | This section provides a detailed configuration guide of vLLM Ascend. 
4 | 5 | :::{toctree} 6 | :caption: Configuration Guide 7 | :maxdepth: 1 8 | env_vars 9 | additional_config 10 | ::: 11 | -------------------------------------------------------------------------------- /docs/source/user_guide/support_matrix/index.md: -------------------------------------------------------------------------------- 1 | # Features and Models 2 | 3 | This section provides a detailed matrix of the features and models supported by vLLM Ascend. 4 | 5 | :::{toctree} 6 | :caption: Support Matrix 7 | :maxdepth: 1 8 | supported_models 9 | supported_features 10 | ::: 11 | -------------------------------------------------------------------------------- /tests/e2e/models/configs/ERNIE-4.5-21B-A3B-PT.yaml: -------------------------------------------------------------------------------- 1 | model_name: "PaddlePaddle/ERNIE-4.5-21B-A3B-PT" 2 | hardware: "Atlas A2 Series" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,flexible-extract" 7 | value: 0.71 8 | num_fewshot: 5 9 | trust_remote_code: True 10 | -------------------------------------------------------------------------------- /tests/e2e/models/configs/Qwen2.5-Omni-7B.yaml: -------------------------------------------------------------------------------- 1 | model_name: "Qwen/Qwen2.5-Omni-7B" 2 | hardware: "Atlas A2 Series" 3 | model: "vllm-vlm" 4 | tasks: 5 | - name: "mmmu_val" 6 | metrics: 7 | - name: "acc,none" 8 | value: 0.52 9 | max_model_len: 8192 10 | gpu_memory_utilization: 0.7 11 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | # Check for updates to GitHub Actions every week 7 | interval: "weekly" 8 | open-pull-requests-limit: 2 9 | reviewers: 10 | - "Yikun" 11 | -------------------------------------------------------------------------------- /docs/source/developer_guide/performance_and_debug/index.md: -------------------------------------------------------------------------------- 1 | # Performance and Debug 2 | 3 | ::::{toctree} 4 | :caption: Performance and Debug 5 | :maxdepth: 1 6 | performance_benchmark 7 | profile_execute_duration 8 | optimization_and_tuning 9 | service_profiling_guide 10 | msprobe_guide 11 | :::: 12 | -------------------------------------------------------------------------------- /vllm_ascend/_cann_ops_custom/.gitkeep: -------------------------------------------------------------------------------- 1 | # This folder is reserved for the installation of custom aclnn operators tailored for vLLM-Ascend. 2 | # Source code of the operators can be found in the `src` folder. 3 | # The operators are compiled into a custom CANN software package and installed to this folder automatically.
4 | -------------------------------------------------------------------------------- /tests/e2e/models/configs/Qwen3-8B.yaml: -------------------------------------------------------------------------------- 1 | model_name: "Qwen/Qwen3-8B" 2 | hardware: "Atlas A2 Series" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.765 8 | - name: "exact_match,flexible-extract" 9 | value: 0.81 10 | num_fewshot: 5 11 | enable_thinking: False 12 | -------------------------------------------------------------------------------- /tests/e2e/models/configs/Qwen3-VL-8B-Instruct.yaml: -------------------------------------------------------------------------------- 1 | model_name: "Qwen/Qwen3-VL-8B-Instruct" 2 | hardware: "Atlas A2 Series" 3 | model: "vllm-vlm" 4 | tasks: 5 | - name: "mmmu_val" 6 | metrics: 7 | - name: "acc,none" 8 | value: 0.55 9 | max_model_len: 8192 10 | batch_size: 32 11 | gpu_memory_utilization: 0.7 12 | -------------------------------------------------------------------------------- /tests/e2e/models/configs/Llama-3.2-3B-Instruct.yaml: -------------------------------------------------------------------------------- 1 | model_name: "LLM-Research/Llama-3.2-3B-Instruct" 2 | hardware: "Atlas A2 Series" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.71 8 | - name: "exact_match,flexible-extract" 9 | value: 0.76 10 | num_fewshot: 5 11 | -------------------------------------------------------------------------------- /.gemini/config.yaml: -------------------------------------------------------------------------------- 1 | # https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github 2 | have_fun: false # Just review the code 3 | code_review: 4 | comment_severity_threshold: HIGH # Reduce quantity of comments 5 | pull_request_opened: 6 | summary: false # Don't summarize the PR in a separate comment 7 | -------------------------------------------------------------------------------- /tests/e2e/models/configs/InternVL3_5-8B-hf.yaml: -------------------------------------------------------------------------------- 1 | model_name: "OpenGVLab/InternVL3_5-8B-hf" 2 | runner: "linux-aarch64-a2-1" 3 | hardware: "Atlas A2 Series" 4 | model: "vllm-vlm" 5 | tasks: 6 | - name: "mmmu_val" 7 | metrics: 8 | - name: "acc,none" 9 | value: 0.58 10 | max_model_len: 40960 11 | trust_remote_code: True 12 | -------------------------------------------------------------------------------- /tests/e2e/models/configs/llava-onevision-qwen2-0.5b-ov-hf.yaml: -------------------------------------------------------------------------------- 1 | model_name: "llava-hf/llava-onevision-qwen2-0.5b-ov-hf" 2 | hardware: "Atlas A2 Series" 3 | model: "vllm-vlm" 4 | tasks: 5 | - name: "ceval-valid" 6 | metrics: 7 | - name: "acc,none" 8 | value: 0.42 9 | trust_remote_code: True 10 | gpu_memory_utilization: 0.8 11 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to vLLM Ascend 2 | 3 | You may find information about contributing to vLLM Ascend on [Developer Guide - Contributing](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution/index.html), including a step-by-step guide to help you set up the development environment, contribute your first PR, and test locally.
4 | -------------------------------------------------------------------------------- /docs/source/user_guide/configuration/env_vars.md: -------------------------------------------------------------------------------- 1 | # Environment Variables 2 | 3 | vllm-ascend uses the following environment variables to configure the system: 4 | 5 | :::{literalinclude} ../../../../vllm_ascend/envs.py 6 | :language: python 7 | :start-after: begin-env-vars-definition 8 | :end-before: end-env-vars-definition 9 | ::: 10 | -------------------------------------------------------------------------------- /tests/e2e/models/configs/Qwen3-Omni-30B-A3B-Instruct.yaml: -------------------------------------------------------------------------------- 1 | model_name: "Qwen/Qwen3-Omni-30B-A3B-Instruct" 2 | hardware: "Atlas A2 Series" 3 | model: "vllm-vlm" 4 | tasks: 5 | - name: "mmmu_val" 6 | metrics: 7 | - name: "acc,none" 8 | value: 0.52 9 | max_model_len: 8192 10 | tensor_parallel_size: 4 11 | enable_expert_parallel: True 12 | -------------------------------------------------------------------------------- /csrc/dispatch_ffn_combine/op_host/tiling_args.h: -------------------------------------------------------------------------------- 1 | #ifndef TILING_ARGS_H 2 | #define TILING_ARGS_H 3 | #include 4 | 5 | namespace Moe { 6 | constexpr uint64_t COMBINE_STATE_WIN_OFFSET = 3U * 1024UL * 1024UL; 7 | constexpr uint64_t NOTIFY_DISPATCH_WIN_OFFSET = 204U * 1024UL * 1024UL; 8 | } // namespace Moe 9 | #endif // TILING_ARGS_H 10 | -------------------------------------------------------------------------------- /tests/e2e/models/configs/Qwen2-Audio-7B-Instruct.yaml: -------------------------------------------------------------------------------- 1 | model_name: "Qwen/Qwen2-Audio-7B-Instruct" 2 | hardware: "Atlas A2 Series" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.44 8 | - name: "exact_match,flexible-extract" 9 | value: 0.45 10 | num_fewshot: 5 11 | gpu_memory_utilization: 0.8 12 | -------------------------------------------------------------------------------- /tests/e2e/models/configs/Qwen3-8B-W8A8.yaml: -------------------------------------------------------------------------------- 1 | model_name: "vllm-ascend/Qwen3-8B-W8A8" 2 | hardware: "Atlas A2 Series" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.80 8 | - name: "exact_match,flexible-extract" 9 | value: 0.82 10 | num_fewshot: 5 11 | enable_thinking: False 12 | quantization: ascend 13 | -------------------------------------------------------------------------------- /.github/workflows/matchers/mypy.json: -------------------------------------------------------------------------------- 1 | { 2 | "problemMatcher": [ 3 | { 4 | "owner": "mypy", 5 | "pattern": [ 6 | { 7 | "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$", 8 | "file": 1, 9 | "line": 2, 10 | "severity": 3, 11 | "message": 4 12 | } 13 | ] 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /tests/e2e/models/configs/Qwen3-VL-30B-A3B-Instruct.yaml: -------------------------------------------------------------------------------- 1 | model_name: "Qwen/Qwen3-VL-30B-A3B-Instruct" 2 | hardware: "Atlas A2 Series" 3 | model: "vllm-vlm" 4 | tasks: 5 | - name: "mmmu_val" 6 | metrics: 7 | - name: "acc,none" 8 | value: 0.58 9 | max_model_len: 8192 10 | tensor_parallel_size: 2 11 | gpu_memory_utilization: 0.7 12 | enable_expert_parallel: True 13 | 
-------------------------------------------------------------------------------- /csrc/kernels/math_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef KERNEL_MATH_UTILS_H 2 | #define KERNEL_MATH_UTILS_H 3 | #include 4 | 5 | namespace device_utils { 6 | 7 | template 8 | __aicore__ __force_inline__ T RoundUp(const T &val) 9 | { 10 | return (val + roundVal - 1) / roundVal * roundVal; 11 | } 12 | 13 | }; // namespace device_utils 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /tests/e2e/models/configs/Molmo-7B-D-0924.yaml: -------------------------------------------------------------------------------- 1 | model_name: "LLM-Research/Molmo-7B-D-0924" 2 | hardware: "Atlas A2 Series" 3 | model: "vllm-vlm" 4 | tasks: 5 | - name: "ceval-valid" 6 | metrics: 7 | - name: "acc,none" 8 | value: 0.71 9 | max_model_len: 4096 10 | trust_remote_code: True 11 | apply_chat_template: False 12 | fewshot_as_multiturn: False 13 | gpu_memory_utilization: 0.8 14 | -------------------------------------------------------------------------------- /tests/e2e/models/configs/internlm3-8b-instruct.yaml: -------------------------------------------------------------------------------- 1 | model_name: "Shanghai_AI_Laboratory/internlm3-8b-instruct" 2 | hardware: "Atlas A2 Series" 3 | tasks: 4 | - name: "ceval-valid" 5 | metrics: 6 | - name: "acc,none" 7 | value: 0.42 8 | num_fewshot: 5 9 | max_model_len: 2048 10 | trust_remote_code: True 11 | dtype: "bfloat16" 12 | apply_chat_template: False 13 | fewshot_as_multiturn: False 14 | -------------------------------------------------------------------------------- /tests/ut/ops/expert_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "moe_layer_count": 3 | 1, 4 | "layer_list": [{ 5 | "layer_id": 6 | 0, 7 | "device_count": 8 | 2, 9 | "device_list": [{ 10 | "device_id": 0, 11 | "device_expert": [7, 2, 0, 3, 5] 12 | }, { 13 | "device_id": 1, 14 | "device_expert": [6, 1, 4, 7, 2] 15 | }] 16 | }] 17 | } 18 | -------------------------------------------------------------------------------- /tests/e2e/models/configs/gemma-3-4b-it.yaml: -------------------------------------------------------------------------------- 1 | model_name: "LLM-Research/gemma-3-4b-it" 2 | hardware: "Atlas A2 Series" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.59 8 | - name: "exact_match,flexible-extract" 9 | value: 0.59 10 | num_fewshot: 5 11 | apply_chat_template: False 12 | fewshot_as_multiturn: False 13 | gpu_memory_utilization: 0.7 14 | enforce_eager: True 15 | -------------------------------------------------------------------------------- /tests/e2e/models/configs/Qwen3-Next-80B-A3B-Instruct.yaml: -------------------------------------------------------------------------------- 1 | model_name: "Qwen/Qwen3-Next-80B-A3B-Instruct" 2 | hardware: "Atlas A2 Series" 3 | model: "vllm" 4 | tasks: 5 | - name: "ceval-valid_accountant" 6 | metrics: 7 | - name: "acc,none" 8 | value: 0.98 9 | max_model_len: 4096 10 | tensor_parallel_size: 4 11 | gpu_memory_utilization: 0.7 12 | enable_expert_parallel: True 13 | enforce_eager: True 14 | batch_size: 1 15 | num_fewshot: 5 16 | -------------------------------------------------------------------------------- /tests/e2e/models/configs/accuracy.txt: -------------------------------------------------------------------------------- 1 | Qwen3-30B-A3B.yaml 2 | Qwen3-8B.yaml 3 | 
Qwen2-7B.yaml 4 | Qwen2-Audio-7B-Instruct.yaml 5 | Qwen3-VL-30B-A3B-Instruct.yaml 6 | Qwen3-VL-8B-Instruct.yaml 7 | Qwen2.5-Omni-7B.yaml 8 | Qwen3-Omni-30B-A3B-Instruct.yaml 9 | InternVL3_5-8B-hf.yaml 10 | ERNIE-4.5-21B-A3B-PT.yaml 11 | gemma-3-4b-it.yaml 12 | internlm3-8b-instruct.yaml 13 | Molmo-7B-D-0924.yaml 14 | llava-onevision-qwen2-0.5b-ov-hf.yaml 15 | Llama-3.2-3B-Instruct.yaml 16 | -------------------------------------------------------------------------------- /vllm_ascend/patch/platform/patch_sched_yield.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import vllm.distributed.utils 4 | from vllm.platforms import CpuArchEnum, Platform 5 | 6 | is_arm = (Platform.get_cpu_architecture() == CpuArchEnum.ARM) 7 | 8 | USE_SCHED_YIELD = ( 9 | ((sys.version_info[:3] >= (3, 11, 1)) or 10 | (sys.version_info[:2] == (3, 10) and sys.version_info[2] >= 8)) 11 | and not is_arm) 12 | 13 | vllm.distributed.utils.USE_SCHED_YIELD = USE_SCHED_YIELD 14 | -------------------------------------------------------------------------------- /docs/source/developer_guide/feature_guide/index.md: -------------------------------------------------------------------------------- 1 | # Feature Guide 2 | 3 | This section provides an overview of the features implemented in vLLM Ascend. Developers can refer to this guide to understand how vLLM Ascend works. 4 | 5 | :::{toctree} 6 | :caption: Feature Guide 7 | :maxdepth: 1 8 | patch 9 | ModelRunner_prepare_inputs 10 | disaggregated_prefill 11 | eplb_swift_balancer.md 12 | Multi_Token_Prediction 13 | ACL_Graph 14 | KV_Cache_Pool_Guide 15 | add_custom_aclnn_op 16 | ::: 17 | -------------------------------------------------------------------------------- /tests/e2e/models/configs/Qwen3-30B-A3B-W8A8.yaml: -------------------------------------------------------------------------------- 1 | model_name: "vllm-ascend/Qwen3-30B-A3B-W8A8" 2 | hardware: "Atlas A2 Series" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.9 8 | - name: "exact_match,flexible-extract" 9 | value: 0.8 10 | num_fewshot: 5 11 | gpu_memory_utilization: 0.7 12 | enable_expert_parallel: True 13 | tensor_parallel_size: 2 14 | apply_chat_template: False 15 | fewshot_as_multiturn: False 16 | quantization: ascend 17 | -------------------------------------------------------------------------------- /docs/source/user_guide/feature_guide/index.md: -------------------------------------------------------------------------------- 1 | # Feature Guide 2 | 3 | This section provides a detailed usage guide of vLLM Ascend features. 
4 | 5 | :::{toctree} 6 | :caption: Feature Guide 7 | :maxdepth: 1 8 | graph_mode 9 | quantization 10 | quantization-llm-compressor 11 | sleep_mode 12 | structured_output 13 | lora 14 | eplb_swift_balancer 15 | netloader 16 | dynamic_batch 17 | kv_pool 18 | external_dp 19 | large_scale_ep 20 | ucm_deployment 21 | Fine_grained_TP 22 | speculative_decoding 23 | ::: 24 | -------------------------------------------------------------------------------- /.github/workflows/matchers/actionlint.json: -------------------------------------------------------------------------------- 1 | { 2 | "problemMatcher": [ 3 | { 4 | "owner": "actionlint", 5 | "pattern": [ 6 | { 7 | "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$", 8 | "file": 1, 9 | "line": 2, 10 | "column": 3, 11 | "message": 4, 12 | "code": 5 13 | } 14 | ] 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /csrc/dispatch_layout/op_kernel/dispatch_layout_tiling.h: -------------------------------------------------------------------------------- 1 | #ifndef DISPATCH_LAYOUT_TILING_H 2 | #define DISPATCH_LAYOUT_TILING_H 3 | 4 | #include "kernel_tiling/kernel_tiling.h" 5 | 6 | struct DispatchLayoutInfo { 7 | uint32_t numTokens; 8 | uint32_t numRanks; 9 | uint32_t numExperts; 10 | uint32_t numTopk; 11 | uint64_t totalUbSize; 12 | }; 13 | 14 | struct DispatchLayoutTilingData { 15 | Mc2InitTiling mc2InitTiling; 16 | Mc2CcTiling mc2CcTiling1; 17 | DispatchLayoutInfo dispatchLayoutInfo; 18 | }; 19 | 20 | #endif 21 | -------------------------------------------------------------------------------- /tests/e2e/models/configs/Qwen3-30B-A3B.yaml: -------------------------------------------------------------------------------- 1 | model_name: "Qwen/Qwen3-30B-A3B" 2 | hardware: "Atlas A2 Series" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.89 8 | - name: "exact_match,flexible-extract" 9 | value: 0.85 10 | - name: "ceval-valid" 11 | metrics: 12 | - name: "acc,none" 13 | value: 0.84 14 | num_fewshot: 5 15 | gpu_memory_utilization: 0.6 16 | enable_expert_parallel: True 17 | tensor_parallel_size: 2 18 | apply_chat_template: False 19 | fewshot_as_multiturn: False 20 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # vLLM Ascend Plugin documents 2 | 3 | Live doc: https://vllm-ascend.readthedocs.io 4 | 5 | ## Build the docs 6 | 7 | ```bash 8 | # Install dependencies. 9 | pip install -r requirements-docs.txt 10 | 11 | # Build the docs. 
12 | make clean 13 | make html 14 | 15 | # Build the docs with translation 16 | make intl 17 | 18 | # Open the docs with your browser 19 | python -m http.server -d _build/html/ 20 | ``` 21 | 22 | Launch your browser and open: 23 | - English version: http://localhost:8000 24 | - Chinese version: http://localhost:8000/zh_CN 25 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | 6 | build: 7 | os: ubuntu-22.04 8 | tools: 9 | python: "3.12" 10 | 11 | sphinx: 12 | configuration: docs/source/conf.py 13 | fail_on_warning: true 14 | # If using Sphinx, optionally build your docs in additional formats such as PDF 15 | formats: [] 16 | 17 | # Optionally declare the Python requirements required to build your docs 18 | python: 19 | install: 20 | - requirements: docs/requirements-docs.txt 21 | -------------------------------------------------------------------------------- /docs/source/tutorials/index.md: -------------------------------------------------------------------------------- 1 | # Tutorials 2 | 3 | :::{toctree} 4 | :caption: Deployment 5 | :maxdepth: 1 6 | Qwen2.5-Omni.md 7 | Qwen2.5-7B 8 | Qwen3-Dense 9 | Qwen-VL-Dense.md 10 | Qwen3-30B-A3B.md 11 | Qwen3-235B-A22B.md 12 | Qwen3-VL-235B-A22B-Instruct.md 13 | Qwen3-Coder-30B-A3B 14 | Qwen3_embedding 15 | Qwen3_reranker 16 | Qwen3-8B-W4A8 17 | Qwen3-32B-W4A4 18 | Qwen3-Next 19 | DeepSeek-V3.1.md 20 | DeepSeek-V3.2.md 21 | DeepSeek-R1.md 22 | Kimi-K2-Thinking 23 | pd_disaggregation_mooncake_single_node 24 | pd_disaggregation_mooncake_multi_node 25 | ray 26 | 310p 27 | ::: 28 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements-lint.txt 2 | -r requirements.txt 3 | modelscope 4 | openai 5 | pytest >= 6.0,<9.0.0 6 | pytest-asyncio 7 | pytest-mock 8 | lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d 9 | types-jsonschema 10 | xgrammar 11 | zmq 12 | types-psutil 13 | pytest-cov 14 | regex 15 | sentence_transformers 16 | ray>=2.47.1,<=2.48.0 17 | protobuf>3.20.0 18 | librosa 19 | soundfile 20 | pytest_mock 21 | msserviceprofiler>=1.2.2 22 | mindstudio-probe>=8.3.0 23 | arctic-inference==0.1.1 24 | xlite 25 | uc-manager -------------------------------------------------------------------------------- /benchmarks/tests/latency-tests.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "test_name": "latency_qwen3_8B_tp1", 4 | "parameters": { 5 | "model": "Qwen/Qwen3-8B", 6 | "tensor_parallel_size": 1, 7 | "load_format": "dummy", 8 | "max_model_len": 16384, 9 | "num_iters_warmup": 5, 10 | "num_iters": 15 11 | } 12 | }, 13 | { 14 | "test_name": "latency_qwen2_5_7B_tp1", 15 | "parameters": { 16 | "model": "Qwen/Qwen2.5-7B-Instruct", 17 | "tensor_parallel_size": 1, 18 | "load_format": "dummy", 19 | "num_iters_warmup": 5, 20 | "num_iters": 15 21 | } 22 | } 23 | ] 24 | -------------------------------------------------------------------------------- /csrc/notify_dispatch/op_kernel/notify_dispatch_tiling.h: -------------------------------------------------------------------------------- 1 | #ifndef NOTIFY_DISPATCH_TILING_H 2 | 
#define NOTIFY_DISPATCH_TILING_H 3 | 4 | #include "kernel_tiling/kernel_tiling.h" 5 | 6 | struct NotifyDispatchInfo { 7 | uint32_t rankSize; 8 | uint32_t rankId; 9 | uint32_t localRankSize; 10 | uint32_t localRankId; 11 | uint32_t sendCount; 12 | uint32_t numTokens; 13 | uint32_t aivNum; 14 | uint64_t totalUbSize; 15 | }; 16 | 17 | struct NotifyDispatchTilingData { 18 | Mc2InitTiling mc2InitTiling; 19 | Mc2CcTiling mc2CcTiling1; 20 | NotifyDispatchInfo notifyDispatchInfo; 21 | }; 22 | 23 | #endif -------------------------------------------------------------------------------- /vllm_ascend/patch/worker/patch_rejection_sampler.py: -------------------------------------------------------------------------------- 1 | import vllm.v1.sample.rejection_sampler as rs 2 | 3 | from vllm_ascend.sample.rejection_sampler import (apply_sampling_constraints, 4 | expand_batch_to_tokens, 5 | rejection_sample) 6 | 7 | # TODO: delete this patch after apply_sampling_constraints and rejection_sample 8 | # are extracted as class methods of RejectionSampler 9 | rs.apply_sampling_constraints = apply_sampling_constraints 10 | rs.rejection_sample = rejection_sample 11 | rs.expand_batch_to_tokens = expand_batch_to_tokens 12 | -------------------------------------------------------------------------------- /csrc/dispatch_ffn_combine/op_kernel/utils/layout3d.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LAYOUT_3D_HPP 2 | #define LAYOUT_3D_HPP 3 | #include "kernel_operator.h" 4 | #include "catlass/catlass.hpp" 5 | class Layout3D { 6 | int64_t strides[2]; 7 | public: 8 | CATLASS_DEVICE 9 | Layout3D() {} 10 | CATLASS_DEVICE 11 | Layout3D(int64_t stride0, int64_t stride1) { 12 | strides[0] = stride0; 13 | strides[1] = stride1; 14 | } 15 | CATLASS_DEVICE 16 | int64_t operator() (int64_t dim0, int64_t dim1, int64_t dim2) { 17 | return dim0 * strides[0] + dim1 * strides[1] + dim2; 18 | } 19 | }; 20 | #endif // LAYOUT_3D_HPP 21 | -------------------------------------------------------------------------------- /.github/actionlint.yaml: -------------------------------------------------------------------------------- 1 | self-hosted-runner: 2 | # Labels of self-hosted runners, as an array of strings.
3 | labels: 4 | - linux-aarch64-a2-0 5 | - linux-aarch64-a2-1 6 | - linux-aarch64-a2-2 7 | - linux-aarch64-a2-4 8 | - linux-aarch64-a2-8 9 | - linux-arm64-npu-static-8 10 | - linux-aarch64-310p-1 11 | - linux-aarch64-310p-2 12 | - linux-aarch64-310p-4 13 | - ubuntu-24.04-arm 14 | - linux-aarch64-a3-1 15 | - linux-aarch64-a3-2 16 | - linux-aarch64-a3-4 17 | - linux-aarch64-a3-8 18 | - linux-amd64-cpu-0 19 | - linux-amd64-cpu-8 20 | - linux-amd64-cpu-16 21 | - linux-aarch64-a3-0 22 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Should be mirrored in pyproject.toml 2 | cmake>=3.26 3 | decorator 4 | einops 5 | numpy<2.0.0 6 | packaging 7 | pip 8 | pybind11 9 | pyyaml 10 | scipy 11 | pandas 12 | setuptools>=64 13 | setuptools-scm>=8 14 | torch==2.8.0 15 | torchvision 16 | wheel 17 | pandas-stubs 18 | opencv-python-headless<=4.11.0.86 # Required to avoid numpy version conflict with vllm 19 | compressed_tensors>=0.11.0 20 | 21 | # requirements for disaggregated prefill 22 | msgpack 23 | quart 24 | 25 | # Required for N-gram speculative decoding 26 | numba 27 | 28 | # Install torch_npu 29 | #--pre 30 | #--extra-index-url https://mirrors.huaweicloud.com/ascend/repos/pypi 31 | torch-npu==2.8.0 32 | 33 | transformers<=4.57.1 34 | fastapi<0.124.0 35 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/100-documentation.yml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation 2 | description: Report an issue related to https://vllm-ascend.readthedocs.org 3 | title: "[Doc]: " 4 | labels: ["documentation"] 5 | 6 | body: 7 | - type: textarea 8 | attributes: 9 | label: 📚 The doc issue 10 | description: > 11 | A clear and concise description of what content in https://vllm-ascend.readthedocs.org/ is an issue. 12 | validations: 13 | required: true 14 | - type: textarea 15 | attributes: 16 | label: Suggest a potential alternative/fix 17 | description: > 18 | Tell us how we could improve the documentation in this regard. 19 | - type: markdown 20 | attributes: 21 | value: > 22 | Thanks for contributing 🎉!
23 | -------------------------------------------------------------------------------- /tools/send_request.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import requests 4 | 5 | data: dict[str, Any] = { 6 | "messages": [{ 7 | "role": "user", 8 | "content": "", 9 | }], 10 | } 11 | 12 | 13 | def send_text_request(prompt, model, server, request_args=None): 14 | data["messages"][0]["content"] = prompt 15 | data["model"] = model 16 | url = server.url_for("v1", "chat", "completions") 17 | if request_args: 18 | data.update(request_args) 19 | response = requests.post(url, json=data) 20 | print("Status Code:", response.status_code) 21 | response_json = response.json() 22 | print("Response:", response_json) 23 | assert response_json["choices"][0]["message"]["content"], "empty response" 24 | -------------------------------------------------------------------------------- /csrc/dispatch_layout/op_kernel/dispatch_layout.cpp: -------------------------------------------------------------------------------- 1 | #include "kernel_operator.h" 2 | #include "dispatch_layout.h" 3 | #include "dispatch_layout_tiling.h" 4 | 5 | 6 | extern "C" __global__ __aicore__ void dispatch_layout(GM_ADDR topkIdx, GM_ADDR numTokensPerRank, GM_ADDR numTokensPerExpert, 7 | GM_ADDR isTokenInRank, GM_ADDR workspace, GM_ADDR tiling) 8 | { 9 | REGISTER_TILING_DEFAULT(DispatchLayoutTilingData); 10 | GET_TILING_DATA_WITH_STRUCT(DispatchLayoutTilingData, tilingData, tiling); 11 | 12 | TPipe pipe; 13 | 14 | DispatchLayout op; 15 | op.Init(topkIdx, numTokensPerRank, numTokensPerExpert, isTokenInRank, workspace, &pipe, &tilingData); 16 | op.Process(); 17 | } 18 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | ; warn_return_any = True 3 | warn_unused_configs = True 4 | 5 | ; Suppress all missing import errors from torch_npu for mypy. 
6 | [mypy-torch_npu.*] 7 | ignore_missing_imports = True 8 | 9 | [mypy-torchair.*] 10 | ignore_missing_imports = True 11 | 12 | [mypy-transformers.*] 13 | ignore_missing_imports = True 14 | 15 | [mypy-lm_eval.*] 16 | ignore_missing_imports = True 17 | 18 | [mypy-compressed_tensors.*] 19 | ignore_missing_imports = True 20 | 21 | [mypy-datasets.*] 22 | ignore_missing_imports = True 23 | 24 | [mypy-llmcompressor.*] 25 | ignore_missing_imports = True 26 | 27 | [mypy-msprobe.*] 28 | ignore_missing_imports = True 29 | 30 | [mypy-xlite.*] 31 | ignore_missing_imports = True 32 | 33 | [mypy-ucm.*] 34 | ignore_missing_imports = True -------------------------------------------------------------------------------- /tests/ut/fake_weight/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "facebook/opt-125m", 3 | "activation_dropout": 0.0, 4 | "activation_function": "relu", 5 | "architectures": [ 6 | "OPTForCausalLM" 7 | ], 8 | "attention_dropout": 0.0, 9 | "bos_token_id": 2, 10 | "do_layer_norm_before": true, 11 | "dropout": 0.1, 12 | "eos_token_id": 2, 13 | "ffn_dim": 3072, 14 | "hidden_size": 768, 15 | "init_std": 0.02, 16 | "layerdrop": 0.0, 17 | "max_position_embeddings": 2048, 18 | "model_type": "opt", 19 | "num_attention_heads": 12, 20 | "num_hidden_layers": 12, 21 | "pad_token_id": 1, 22 | "prefix": "", 23 | "torch_dtype": "float16", 24 | "transformers_version": "4.21.0.dev0", 25 | "use_cache": true, 26 | "vocab_size": 50272, 27 | "word_embed_proj_dim": 768 28 | } 29 | -------------------------------------------------------------------------------- /vllm_ascend/distributed/kvpool/backend/backend.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | from vllm.config import ParallelConfig 4 | 5 | 6 | class Backend(ABC): 7 | 8 | def __init__(self, parallel_config: ParallelConfig): 9 | pass 10 | 11 | def set_device(self): 12 | pass 13 | 14 | def register_buffer(self, ptrs: list[int], lengths: list[int]): 15 | pass 16 | 17 | @abstractmethod 18 | def exists(self, keys: list[str]) -> list[int]: 19 | pass 20 | 21 | @abstractmethod 22 | def put(self, keys: list[str], addrs: list[list[int]], 23 | sizes: list[list[int]]): 24 | pass 25 | 26 | @abstractmethod 27 | def get(self, keys: list[str], addrs: list[list[int]], 28 | sizes: list[list[int]]): 29 | pass 30 | -------------------------------------------------------------------------------- /tests/e2e/prompts/example.txt: -------------------------------------------------------------------------------- 1 | vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. 2 | Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020. 3 | Compare and contrast artificial intelligence with human intelligence in terms of processing information. 4 | Describe the basic components of a neural network and how it can be trained. 5 | Write a short story about a robot that dreams for the first time. 6 | Analyze the impact of the COVID-19 pandemic on global economic structures and future business models. 7 | Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies. 8 | Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.' 
9 | -------------------------------------------------------------------------------- /csrc/dispatch_ffn_combine/op_kernel/utils/select_helper.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SELECT_HELPER_HPP 2 | #define SELECT_HELPER_HPP 3 | 4 | #include "catlass/layout/layout.hpp" 5 | using namespace AscendC; 6 | using namespace Catlass; 7 | 8 | template 9 | struct LayoutBInitializer { 10 | CATLASS_DEVICE 11 | static Layout create(uint32_t k, uint32_t n) { 12 | return Layout{k, n}; 13 | } 14 | }; 15 | 16 | template 17 | struct LayoutBInitializer> 19 | > { 20 | CATLASS_DEVICE 21 | static Layout create(uint32_t k, uint32_t n) { 22 | return Layout::template MakeLayout(k, n); 23 | } 24 | }; 25 | #endif // SELECT_HELPER_HPP -------------------------------------------------------------------------------- /csrc/utils/inc/aclnn_util.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025 Huawei Technologies Co., Ltd. 3 | * This file is a part of the CANN Open Software. 4 | * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 5 | * Please refer to the License for details. You may not use this file except in compliance with the License. 6 | * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, 7 | * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 8 | * See LICENSE in the root of the software repository for the full text of the License. 9 | */ 10 | #ifndef OP_API_INC_ACLNN_UTIL_H 11 | #define OP_API_INC_ACLNN_UTIL_H 12 | 13 | #define ACLNN_API __attribute__((visibility("default"))) 14 | #endif // OP_API_INC_ACLNN_UTIL_H -------------------------------------------------------------------------------- /csrc/cmake/intf.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Huawei Technologies Co., Ltd. 2 | # This file is a part of the CANN Open Software. 3 | # Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 4 | # Please refer to the License for details. You may not use this file except in compliance with the License. 5 | # THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, 6 | # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 7 | # See LICENSE in the root of the software repository for the full text of the License. 8 | # ====================================================================================================================== 9 | 10 | if (BUILD_OPEN_PROJECT) 11 | include(${OPS_ADV_CMAKE_DIR}/intf_pub.cmake) 12 | endif () 13 | -------------------------------------------------------------------------------- /.github/workflows/bot_merge_conflict.yaml: -------------------------------------------------------------------------------- 1 | name: Merge Conflict Labeler 2 | on: 3 | # So that PRs touching the same files as the push are updated 4 | push: 5 | # So that the `dirtyLabel` is removed if conflicts are resolved 6 | # We recommend `pull_request_target` so that GitHub secrets are available.
7 | # In `pull_request` we wouldn't be able to change labels of fork PRs 8 | pull_request_target: 9 | types: [synchronize] 10 | 11 | jobs: 12 | main: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: check if prs are dirty 16 | uses: eps1lon/actions-label-merge-conflict@v3 17 | with: 18 | dirtyLabel: "merge-conflicts" 19 | repoToken: "${{ secrets.GITHUB_TOKEN }}" 20 | commentOnDirty: "This pull request has conflicts, please resolve those before we can evaluate the pull request." 21 | -------------------------------------------------------------------------------- /csrc/dispatch_gmm_combine_decode/op_kernel/dispatch_gmm_combine_decode/gemm/block/block_mmad.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025 Huawei Technologies Co., Ltd. 3 | * This file is a part of the CANN Open Software. 4 | * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 5 | * Please refer to the License for details. You may not use this file except in compliance with the License. 6 | * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, 7 | * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 8 | * See LICENSE in the root of the software repository for the full text of the License. 9 | */ 10 | #pragma once 11 | #include "catlass/gemm/block/block_mmad.hpp" 12 | 13 | #include "block_mmad_preload_async_with_callback_resident_a.h" 14 | -------------------------------------------------------------------------------- /csrc/kernels/types.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | namespace vllm_ascend { 20 | enum struct AscendType { 21 | FP16 = 0, 22 | BF16 = 1, 23 | FP32 = 2, 24 | }; 25 | } -------------------------------------------------------------------------------- /vllm_ascend/model_loader/netloader/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | 18 | def register_netloader(): 19 | """Register the NetLoader plugin.""" 20 | from .netloader import ModelNetLoaderElastic # noqa 21 | -------------------------------------------------------------------------------- /examples/run_dp_server.sh: -------------------------------------------------------------------------------- 1 | 2 | export HCCL_IF_IP=2.0.0.0 3 | export GLOO_SOCKET_IFNAME="eth0" 4 | export TP_SOCKET_IFNAME="eth0" 5 | export HCCL_SOCKET_IFNAME="eth0" 6 | 7 | export OMP_PROC_BIND=false 8 | export OMP_NUM_THREADS=10 9 | 10 | export VLLM_USE_MODELSCOPE=true 11 | 12 | export ASCEND_LAUNCH_BLOCKING=0 13 | 14 | vllm serve Qwen/Qwen1.5-MoE-A2.7B \ 15 | --host 0.0.0.0 \ 16 | --port 20002 \ 17 | --served-model-name Qwen \ 18 | --data-parallel-size 2 \ 19 | --data-parallel-size-local 2 \ 20 | --data-parallel-address 2.0.0.0 \ 21 | --data-parallel-rpc-port 13389 \ 22 | --tensor-parallel-size 4 \ 23 | --enable-expert-parallel \ 24 | --no-enable-prefix-caching \ 25 | --max-num-seqs 16 \ 26 | --max-model-len 4096 \ 27 | --max-num-batched-tokens 4096 \ 28 | --gpu-memory-utilization 0.9 \ 29 | --trust-remote-code \ 30 | --enforce-eager 31 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | 22 | intl: 23 | sphinx-intl build 24 | @$(SPHINXBUILD) -b html -D language=zh_CN "$(SOURCEDIR)" "$(BUILDDIR)/html/zh-cn" $(SPHINXOPTS) $(O) 25 | 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/800-others.yml: -------------------------------------------------------------------------------- 1 | name: 🎲 Others 2 | description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues. 3 | title: "[Misc]: " 4 | labels: ["misc"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm-ascend/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | - type: textarea 12 | attributes: 13 | label: Anything you want to discuss about vllm on ascend. 14 | description: > 15 | Anything you want to discuss about vllm on ascend. 16 | validations: 17 | required: true 18 | - type: markdown 19 | attributes: 20 | value: > 21 | Thanks for contributing 🎉! 
22 | -------------------------------------------------------------------------------- /csrc/mla_preprocess/op_kernel/kernel/layout.h: -------------------------------------------------------------------------------- 1 | /* Adapted from 2 | * https://gitee.com/ascend/ascend-transformer-boost.git 3 | * 4 | * Copyright (c) 2024 Huawei Technologies Co., Ltd. 5 | * This file is a part of the CANN Open Software. 6 | * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 7 | * Please refer to the License for details. You may not use this file except in compliance with the License. 8 | * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, 9 | * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 10 | * See LICENSE in the root of the software repository for the full text of the License. 11 | */ 12 | 13 | #ifndef INCLUDE_LAYOUT_H 14 | #define INCLUDE_LAYOUT_H 15 | 16 | enum class DataFormat { ND = 0, NZ, ZN, ZZ, NN, VECTOR }; 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /.github/labeler.yml: -------------------------------------------------------------------------------- 1 | --- 2 | documentation: 3 | - changed-files: 4 | - any-glob-to-any-file: 5 | - 'docs/**' 6 | - '**/*.md' 7 | 8 | ci/build: 9 | - changed-files: 10 | - any-glob-to-any-file: 11 | - '.github/actions/*.yaml' 12 | - '.github/workflows/*.yaml' 13 | 14 | 'module:tests': 15 | - changed-files: 16 | - any-glob-to-any-file: 17 | - 'tests/**' 18 | 19 | 'module:tools': 20 | - changed-files: 21 | - any-glob-to-any-file: 22 | - 'tools/**' 23 | 24 | 'module:ops': 25 | - changed-files: 26 | - any-glob-to-any-file: 27 | - 'vllm_ascend/ops/**' 28 | 29 | 'module:quantization': 30 | - changed-files: 31 | - any-glob-to-any-file: 32 | - 'vllm_ascend/quantization/**' 33 | 34 | 'module:core': 35 | - changed-files: 36 | - any-glob-to-any-file: 37 | - 'vllm_ascend/*.py' 38 | 39 | -------------------------------------------------------------------------------- /csrc/dispatch_gmm_combine_decode/op_kernel/dispatch_gmm_combine_decode/epilogue/block/block_epilogue.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025 Huawei Technologies Co., Ltd. 3 | * This file is a part of the CANN Open Software. 4 | * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 5 | * Please refer to the License for details. You may not use this file except in compliance with the License. 6 | * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, 7 | * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 8 | * See LICENSE in the root of the software repository for the full text of the License. 9 | */ 10 | #pragma once 11 | #include "catlass/epilogue/block/block_epilogue.hpp" 12 | 13 | #include "block_epilogue_per_token_dequant_swiglu.h" 14 | #include "block_epilogue_per_token_dequant.hpp" 15 | -------------------------------------------------------------------------------- /docs/source/locale/zh_CN/LC_MESSAGES/tutorials/index.po: -------------------------------------------------------------------------------- 1 | # SOME DESCRIPTIVE TITLE. 2 | # Copyright (C) 2025, vllm-ascend team 3 | # This file is distributed under the same license as the vllm-ascend 4 | # package. 5 | # FIRST AUTHOR , 2025. 
6 | # 7 | #, fuzzy 8 | msgid "" 9 | msgstr "" 10 | "Project-Id-Version: vllm-ascend\n" 11 | "Report-Msgid-Bugs-To: \n" 12 | "POT-Creation-Date: 2025-07-18 09:01+0800\n" 13 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" 14 | "Last-Translator: FULL NAME \n" 15 | "Language-Team: zh_CN \n" 16 | "Language: zh_CN\n" 17 | "MIME-Version: 1.0\n" 18 | "Content-Type: text/plain; charset=utf-8\n" 19 | "Content-Transfer-Encoding: 8bit\n" 20 | "Plural-Forms: nplurals=1; plural=0;\n" 21 | "Generated-By: Babel 2.17.0\n" 22 | 23 | #: ../../tutorials/index.md:3 24 | msgid "Deployment" 25 | msgstr "部署" 26 | 27 | #: ../../tutorials/index.md:1 28 | msgid "Tutorials" 29 | msgstr "教程" 30 | -------------------------------------------------------------------------------- /docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/index.po: -------------------------------------------------------------------------------- 1 | # SOME DESCRIPTIVE TITLE. 2 | # Copyright (C) 2025, vllm-ascend team 3 | # This file is distributed under the same license as the vllm-ascend 4 | # package. 5 | # FIRST AUTHOR , 2025. 6 | # 7 | #, fuzzy 8 | msgid "" 9 | msgstr "" 10 | "Project-Id-Version: vllm-ascend\n" 11 | "Report-Msgid-Bugs-To: \n" 12 | "POT-Creation-Date: 2025-07-18 09:01+0800\n" 13 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" 14 | "Last-Translator: FULL NAME \n" 15 | "Language-Team: zh_CN \n" 16 | "Language: zh_CN\n" 17 | "MIME-Version: 1.0\n" 18 | "Content-Type: text/plain; charset=utf-8\n" 19 | "Content-Transfer-Encoding: 8bit\n" 20 | "Plural-Forms: nplurals=1; plural=0;\n" 21 | "Generated-By: Babel 2.17.0\n" 22 | 23 | #: ../../developer_guide/evaluation/index.md:1 24 | #: ../../developer_guide/evaluation/index.md:3 25 | msgid "Accuracy" 26 | msgstr "准确性" 27 | -------------------------------------------------------------------------------- /tests/ut/eplb/core/policy/test_policy_factor.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm_ascend.eplb.core.policy.policy_abstract import DynamicConfig 4 | from vllm_ascend.eplb.core.policy.policy_dynamic_ep import DynamicEplb 5 | from vllm_ascend.eplb.core.policy.policy_dynamic_ep_v2 import DynamicEplbV2 6 | from vllm_ascend.eplb.core.policy.policy_factory import PolicyFactory 7 | from vllm_ascend.eplb.core.policy.policy_random import RandomLoadBalance 8 | 9 | 10 | @pytest.fixture 11 | def dummy_config(): 12 | return DynamicConfig() 13 | 14 | 15 | @pytest.mark.parametrize("policy_type, expected_class", [ 16 | (0, RandomLoadBalance), 17 | (1, DynamicEplb), 18 | (2, DynamicEplbV2), 19 | (999, RandomLoadBalance), 20 | ]) 21 | def test_generate_policy(policy_type, expected_class, dummy_config): 22 | policy_instance = PolicyFactory.generate_policy(policy_type, dummy_config) 23 | assert isinstance(policy_instance, expected_class) 24 | -------------------------------------------------------------------------------- /tools/sphinx-lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 5 | # Copyright 2023 The vLLM team. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 
9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # This file is a part of the vllm-ascend project. 19 | # Adapted from https://github.com/vllm-project/vllm/tree/main/tools 20 | # 21 | 22 | sphinx-lint --disable trailing-whitespace,missing-final-newline docs 23 | -------------------------------------------------------------------------------- /csrc/moe_combine_normal/op_kernel/moe_combine_normal_tiling.h: -------------------------------------------------------------------------------- 1 | #ifndef MOE_COMBINE_NORMAL_TILING_H 2 | #define MOE_COMBINE_NORMAL_TILING_H 3 | 4 | #include 5 | #include "kernel_tiling/kernel_tiling.h" 6 | 7 | // a3 8 | struct MoeCombineNormalInfo { 9 | uint32_t epWorldSize; 10 | uint32_t tpWorldSize; 11 | uint32_t epRankId; 12 | uint32_t tpRankId; 13 | uint32_t expertShardType; 14 | uint32_t moeExpertNum; 15 | uint32_t moeExpertPerRankNum; 16 | uint32_t globalBs; 17 | uint32_t bs; 18 | uint32_t k; 19 | uint32_t h; 20 | uint32_t aivNum; 21 | uint64_t totalUbSize; 22 | uint64_t totalWinSize; 23 | float armAvgFactor; 24 | float epsilon; 25 | }; 26 | struct MoeCombineNormalTilingData { 27 | Mc2InitTiling mc2InitTiling; 28 | Mc2CcTiling mc2CcTiling1; 29 | Mc2CcTiling mc2CcTiling2; 30 | MoeCombineNormalInfo moeCombineNormalInfo; 31 | }; 32 | 33 | #endif //MOE_COMBINE_NORMAL_TILING_H -------------------------------------------------------------------------------- /vllm_ascend/patch/worker/patch_triton.py: -------------------------------------------------------------------------------- 1 | import vllm.model_executor.layers.mamba.ops.causal_conv1d 2 | 3 | from vllm_ascend.ops.triton.fla.chunk import chunk_gated_delta_rule 4 | from vllm_ascend.ops.triton.fla.layernorm_guard import LayerNormFn 5 | from vllm_ascend.ops.triton.fla.sigmoid_gating import \ 6 | fused_recurrent_gated_delta_rule_fwd_kernel 7 | from vllm_ascend.ops.triton.mamba.causal_conv1d import ( 8 | causal_conv1d_fn, causal_conv1d_update_npu) 9 | 10 | vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_update = causal_conv1d_update_npu 11 | vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_fn = causal_conv1d_fn 12 | vllm.model_executor.layers.fla.ops.fused_recurrent.fused_recurrent_gated_delta_rule_fwd_kernel = fused_recurrent_gated_delta_rule_fwd_kernel 13 | vllm.model_executor.layers.fla.ops.layernorm_guard.LayerNormFn = LayerNormFn 14 | vllm.model_executor.layers.fla.ops.chunk_gated_delta_rule = chunk_gated_delta_rule 15 | -------------------------------------------------------------------------------- /docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/performance_and_debug/index.po: -------------------------------------------------------------------------------- 1 | # SOME DESCRIPTIVE TITLE. 2 | # Copyright (C) 2025, vllm-ascend team 3 | # This file is distributed under the same license as the vllm-ascend 4 | # package. 5 | # FIRST AUTHOR , 2025. 
6 | # 7 | #, fuzzy 8 | msgid "" 9 | msgstr "" 10 | "Project-Id-Version: vllm-ascend\n" 11 | "Report-Msgid-Bugs-To: \n" 12 | "POT-Creation-Date: 2025-07-18 09:01+0800\n" 13 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" 14 | "Last-Translator: FULL NAME \n" 15 | "Language-Team: zh_CN \n" 16 | "Language: zh_CN\n" 17 | "MIME-Version: 1.0\n" 18 | "Content-Type: text/plain; charset=utf-8\n" 19 | "Content-Transfer-Encoding: 8bit\n" 20 | "Plural-Forms: nplurals=1; plural=0;\n" 21 | "Generated-By: Babel 2.17.0\n" 22 | 23 | #: ../../developer_guide/performance_and_debug/index.md:1 24 | #: ../../developer_guide/performance_and_debug/index.md:3 25 | msgid "Performance and Debug" 26 | msgstr "性能和调试" 27 | -------------------------------------------------------------------------------- /csrc/aclnn_torch_adapter/NPUBridge.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020, Huawei Technologies Co., Ltd 2 | // All rights reserved. 3 | // 4 | // This source code is licensed under the BSD-style license found in the 5 | // LICENSE file in the root directory of this source tree. 6 | 7 | #pragma once 8 | #include 9 | #include "NPUStorageImpl.h" 10 | 11 | namespace vllm_ascend 12 | { 13 | 14 | class NPUBridge 15 | { 16 | public: 17 | // at::tensor to NPUStorageImpl 18 | static NPUStorageImpl *GetNpuStorageImpl(const at::Tensor &tensor); 19 | 20 | // c10::StorageImpl to NPUStorageImpl 21 | static NPUStorageImpl *GetNpuStorageImpl(c10::StorageImpl *storageImpl); 22 | 23 | // c10::Storage to NPUStorageImpl 24 | static NPUStorageImpl *GetNpuStorageImpl(c10::Storage &&storage); 25 | 26 | // tensor to NPUStorageDesc 27 | static NPUStorageDesc &GetNpuStorageImplDesc(const at::Tensor &tensor); 28 | }; 29 | } 30 | -------------------------------------------------------------------------------- /docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/accuracy_report/index.po: -------------------------------------------------------------------------------- 1 | # SOME DESCRIPTIVE TITLE. 2 | # Copyright (C) 2025, vllm-ascend team 3 | # This file is distributed under the same license as the vllm-ascend 4 | # package. 5 | # FIRST AUTHOR , 2025. 6 | # 7 | #, fuzzy 8 | msgid "" 9 | msgstr "" 10 | "Project-Id-Version: vllm-ascend\n" 11 | "Report-Msgid-Bugs-To: \n" 12 | "POT-Creation-Date: 2025-07-18 09:01+0800\n" 13 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" 14 | "Last-Translator: FULL NAME \n" 15 | "Language-Team: zh_CN \n" 16 | "Language: zh_CN\n" 17 | "MIME-Version: 1.0\n" 18 | "Content-Type: text/plain; charset=utf-8\n" 19 | "Content-Transfer-Encoding: 8bit\n" 20 | "Plural-Forms: nplurals=1; plural=0;\n" 21 | "Generated-By: Babel 2.17.0\n" 22 | 23 | #: ../../developer_guide/evaluation/accuracy_report/index.md:1 24 | #: ../../developer_guide/evaluation/accuracy_report/index.md:3 25 | msgid "Accuracy Report" 26 | msgstr "准确性报告" 27 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # This file is a part of the vllm-ascend project. 16 | # 17 | 18 | coverage: 19 | status: 20 | # Patch coverage is mandatory and must be >= 80% 21 | patch: 22 | default: 23 | target: 80% 24 | # non-voting 25 | project: 26 | default: 27 | # non-voting 28 | informational: true 29 | -------------------------------------------------------------------------------- /docs/source/locale/zh_CN/LC_MESSAGES/user_guide/configuration/env_vars.po: -------------------------------------------------------------------------------- 1 | # Translations template for PROJECT. 2 | # Copyright (C) 2025 ORGANIZATION 3 | # This file is distributed under the same license as the PROJECT project. 4 | # FIRST AUTHOR , 2025. 5 | # 6 | #, fuzzy 7 | msgid "" 8 | msgstr "" 9 | "Project-Id-Version: PROJECT VERSION\n" 10 | "Report-Msgid-Bugs-To: EMAIL@ADDRESS\n" 11 | "POT-Creation-Date: 2025-07-18 09:01+0800\n" 12 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" 13 | "Last-Translator: FULL NAME \n" 14 | "Language-Team: LANGUAGE \n" 15 | "MIME-Version: 1.0\n" 16 | "Content-Type: text/plain; charset=utf-8\n" 17 | "Content-Transfer-Encoding: 8bit\n" 18 | "Generated-By: Babel 2.17.0\n" 19 | 20 | #: ../../user_guide/configuration/env_vars.md:1 21 | msgid "Environment Variables" 22 | msgstr "环境变量" 23 | 24 | #: ../../user_guide/configuration/env_vars.md:3 25 | msgid "" 26 | "vllm-ascend uses the following environment variables to configure the " 27 | "system:" 28 | msgstr "vllm-ascend 使用以下环境变量来配置系统:" 29 | -------------------------------------------------------------------------------- /tests/ut/conftest.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 3 | # Copyright 2023 The vLLM team. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # This file is a part of the vllm-ascend project. 
17 | # 18 | 19 | from vllm_ascend.utils import adapt_patch # noqa E402 20 | from vllm_ascend.utils import register_ascend_customop 21 | 22 | adapt_patch() 23 | adapt_patch(True) 24 | 25 | # Register Ascend CustomOps here because the unit tests rely on them. 26 | register_ascend_customop() 27 | -------------------------------------------------------------------------------- /tests/ut/eplb/core/policy/test_policy_abstract.py: -------------------------------------------------------------------------------- 1 | # test_policy_abstract.py 2 | from vllm_ascend.eplb.core.policy.policy_abstract import (DynamicConfig, 3 | EplbPolicy) 4 | 5 | 6 | class DummyPolicy(EplbPolicy): 7 | 8 | def rebalance_experts(self, current_expert_table, expert_workload): 9 | return 1, current_expert_table 10 | 11 | 12 | def test_dynamic_config_attributes(): 13 | config = DynamicConfig() 14 | assert config.placement_policy is None 15 | assert config.max_transferred_expert_per_layer == 100 16 | assert config.ep_worldsize == 64 17 | assert config.num_die_per_host == 8 18 | 19 | 20 | def test_eplb_policy_init_and_method(): 21 | config = DynamicConfig() 22 | policy = DummyPolicy(config) 23 | 24 | assert policy.config == config 25 | 26 | expert_table = [[0, 1, 2]] 27 | workload = [10] 28 | res, new_table = policy.rebalance_experts(expert_table, workload) 29 | 30 | assert res == 1 31 | assert new_table == expert_table 32 | -------------------------------------------------------------------------------- /docs/source/user_guide/feature_guide/structured_output.md: -------------------------------------------------------------------------------- 1 | # Structured Output Guide 2 | 3 | ## Overview 4 | 5 | ### What is structured output? 6 | 7 | LLMs can be unpredictable when you need output in specific formats. Think of asking a model to generate JSON without guidance: it might produce plausible-looking text that breaks the JSON specification. **Structured Output (also known as Guided Decoding)** enables LLMs to generate outputs that follow a desired structure while preserving the non-deterministic nature of the system. 8 | 9 | In simple terms, structured decoding gives LLMs a "template" to follow. Users provide a schema that "influences" the model output, ensuring compliance with the desired structure. 10 | 11 | ![structured decoding](./images/structured_output_1.png) 12 | 13 | ## Usage in vllm-ascend 14 | 15 | Currently, the structured output feature in vllm-ascend is used in exactly the same way as in vLLM. 16 | 17 | Find more examples and explanations of these usages in the [vLLM official documentation](https://docs.vllm.ai/en/stable/features/structured_outputs/). A minimal offline-inference sketch is shown below.
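The following sketch constrains generation to a JSON schema. It is illustrative rather than vllm-ascend-specific: it assumes a vLLM release where `SamplingParams` accepts a `guided_decoding` argument (newer releases rename this to `structured_outputs`), and `Qwen/Qwen2.5-7B-Instruct` is only a placeholder model name.

```python
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

# JSON schema the generated text must conform to.
person_schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "integer"},
    },
    "required": ["name", "age"],
}

llm = LLM(model="Qwen/Qwen2.5-7B-Instruct")  # placeholder model name
params = SamplingParams(
    max_tokens=128,
    guided_decoding=GuidedDecodingParams(json=person_schema),
)
outputs = llm.generate(
    ["Describe a person named Alice, aged 30, as JSON."],
    params,
)
print(outputs[0].outputs[0].text)  # output is constrained to person_schema
```

Regex, choice, and grammar constraints follow the same pattern via the corresponding `GuidedDecodingParams` fields.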
18 | -------------------------------------------------------------------------------- /examples/chat_templates/template_qwen2_audio.jinja: -------------------------------------------------------------------------------- 1 | {% set audio_count = namespace(value=0) %} 2 | {% for message in messages %} 3 | {% if loop.first and message['role'] != 'system' %} 4 | <|im_start|>system\nYou are a helpful assistant.<|im_end|>\n 5 | {% endif %} 6 | <|im_start|>{{ message['role'] }}\n 7 | {% if message['content'] is string %} 8 | {{ message['content'] }}<|im_end|>\n 9 | {% else %} 10 | {% for content in message['content'] %} 11 | {% if 'audio' in content or 'audio_url' in content or message['type'] == 'audio' or content['type'] == 'audio' %} 12 | {% set audio_count.value = audio_count.value + 1 %} 13 | Audio {{ audio_count.value }}: <|audio_bos|><|AUDIO|><|audio_eos|>\n 14 | {% elif 'text' in content %} 15 | {{ content['text'] }} 16 | {% endif %} 17 | {% endfor %} 18 | <|im_end|>\n 19 | {% endif %} 20 | {% endfor %} 21 | {% if add_generation_prompt %} 22 | <|im_start|>assistant\n 23 | {% endif %} 24 | -------------------------------------------------------------------------------- /csrc/moe_combine_normal/op_kernel/moe_combine_normal.cpp: -------------------------------------------------------------------------------- 1 | #include "kernel_operator.h" 2 | #include "lib/matmul_intf.h" 3 | #include "moe_combine_normal.h" 4 | #include "moe_combine_normal_tiling.h" 5 | using namespace AscendC; 6 | using namespace MoeCombineNormalImpl; 7 | 8 | extern "C" __global__ __aicore__ void moe_combine_normal(GM_ADDR recvX, GM_ADDR tokenSrcInfo, GM_ADDR epRecvCount, 9 | GM_ADDR topkWeights, GM_ADDR tpRecvCount, GM_ADDR XOut, 10 | GM_ADDR workspaceGM, GM_ADDR tilingGM) 11 | 12 | { 13 | REGISTER_TILING_DEFAULT(MoeCombineNormalTilingData); 14 | TPipe pipe; 15 | 16 | #if (ORIG_DTYPE_RECV_X == DT_BF16 || ORIG_DTYPE_RECV_X == DT_FLOAT16) 17 | GET_TILING_DATA_WITH_STRUCT(MoeCombineNormalTilingData, tilingData, tilingGM); 18 | MoeCombineNormal op; 19 | op.Init(recvX, tokenSrcInfo, epRecvCount, topkWeights, tpRecvCount, XOut, workspaceGM, &pipe, &tilingData); 20 | op.Process(); 21 | #endif 22 | } -------------------------------------------------------------------------------- /csrc/moe_dispatch_normal/op_host/aclnn_moe_dispatch_normal.h: -------------------------------------------------------------------------------- 1 | #ifndef ACLNN_MOE_DISPATCH_NORMAL_H_ 2 | #define ACLNN_MOE_DISPATCH_NORMAL_H_ 3 | 4 | #include "aclnn/acl_meta.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | __attribute__((visibility("default"))) aclnnStatus aclnnMoeDispatchNormalGetWorkspaceSize(const aclTensor *x, 11 | const aclTensor *topkIdx, const aclTensor *sendOffset, const aclTensor *sendTokenIdx, const aclTensor *recvOffset, 12 | const aclTensor *recvCount, char *groupEp, int64_t epWorldSize, int64_t epRankId, char *groupTpOptional, 13 | int64_t tpWorldSize, int64_t tpRankId, int64_t moeExpertNum, int64_t quantMode, int64_t globalBs, 14 | const aclTensor *recvX, const aclTensor *recvXScales, const aclTensor *assistInfoForCombine, 15 | uint64_t *workspaceSize, aclOpExecutor **executor); 16 | 17 | __attribute__((visibility("default"))) aclnnStatus aclnnMoeDispatchNormal( 18 | void *workspace, uint64_t workspaceSize, aclOpExecutor *executor, aclrtStream stream); 19 | 20 | #ifdef __cplusplus 21 | } 22 | #endif 23 | 24 | #endif -------------------------------------------------------------------------------- 
/csrc/aclnn_torch_adapter/NPUBridge.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020, Huawei Technologies Co., Ltd 2 | // All rights reserved. 3 | // 4 | // This source code is licensed under the BSD-style license found in the 5 | // LICENSE file in the root directory of this source tree. 6 | 7 | #include "NPUBridge.h" 8 | 9 | namespace vllm_ascend 10 | { 11 | NPUStorageImpl *NPUBridge::GetNpuStorageImpl(c10::StorageImpl *storageImpl) 12 | { 13 | return static_cast(storageImpl); 14 | } 15 | 16 | NPUStorageImpl *NPUBridge::GetNpuStorageImpl(c10::Storage &&storage) 17 | { 18 | return static_cast(storage.unsafeGetStorageImpl()); 19 | } 20 | 21 | NPUStorageImpl *NPUBridge::GetNpuStorageImpl(const at::Tensor &tensor) 22 | { 23 | return static_cast(tensor.storage().unsafeGetStorageImpl()); 24 | } 25 | 26 | NPUStorageDesc &NPUBridge::GetNpuStorageImplDesc(const at::Tensor &tensor) 27 | { 28 | return static_cast(tensor.storage().unsafeGetStorageImpl())->npu_desc_; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/adding_a_new_multimodal_model.po: -------------------------------------------------------------------------------- 1 | # SOME DESCRIPTIVE TITLE. 2 | # Copyright (C) 2025, vllm-ascend team 3 | # This file is distributed under the same license as the vllm-ascend 4 | # package. 5 | # FIRST AUTHOR , 2025. 6 | # 7 | #, fuzzy 8 | msgid "" 9 | msgstr "" 10 | "Project-Id-Version: vllm-ascend\n" 11 | "Report-Msgid-Bugs-To: \n" 12 | "POT-Creation-Date: 2025-07-18 09:01+0800\n" 13 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" 14 | "Last-Translator: FULL NAME \n" 15 | "Language-Team: zh_CN \n" 16 | "Language: zh_CN\n" 17 | "MIME-Version: 1.0\n" 18 | "Content-Type: text/plain; charset=utf-8\n" 19 | "Content-Transfer-Encoding: 8bit\n" 20 | "Plural-Forms: nplurals=1; plural=0;\n" 21 | "Generated-By: Babel 2.17.0\n" 22 | 23 | #: ../../developer_guide/modeling/adding_a_new_multimodal_model.md:1 24 | msgid "Adding a New Multi-Modal Model" 25 | msgstr "添加新的多模态模型" 26 | 27 | #: ../../developer_guide/modeling/adding_a_new_multimodal_model.md:3 28 | msgid "**_Comming soon ..._**" 29 | msgstr "**_敬请期待 ..._**" 30 | -------------------------------------------------------------------------------- /tests/e2e/multicard/test_ilama_lora_tp2.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from modelscope import snapshot_download # type: ignore 3 | 4 | from tests.e2e.conftest import VllmRunner 5 | from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT, 6 | MODEL_PATH, do_sample) 7 | 8 | 9 | @pytest.mark.parametrize("distributed_executor_backend", ["mp"]) 10 | def test_ilama_lora_tp2(distributed_executor_backend, ilama_lora_files): 11 | with VllmRunner(snapshot_download(MODEL_PATH), 12 | enable_lora=True, 13 | max_loras=4, 14 | dtype="half", 15 | max_model_len=1024, 16 | max_num_seqs=16, 17 | tensor_parallel_size=2, 18 | distributed_executor_backend=distributed_executor_backend, 19 | enforce_eager=False) as vllm_model: 20 | output = do_sample(vllm_model.model, ilama_lora_files, lora_id=2) 21 | 22 | for i in range(len(EXPECTED_LORA_OUTPUT)): 23 | assert output[i] == EXPECTED_LORA_OUTPUT[i] 24 | -------------------------------------------------------------------------------- /csrc/utils/inc/error/ops_error.h: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2024 Huawei Technologies Co., Ltd. 3 | * This file is a part of the CANN Open Software. 4 | * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 5 | * Please refer to the License for details. You may not use this file except in compliance with the License. 6 | * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, 7 | * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 8 | * See LICENSE in the root of the software repository for the full text of the License. 9 | */ 10 | 11 | /*! 12 | * \file ops_error.h 13 | * \brief 14 | */ 15 | 16 | #pragma once 17 | 18 | #include "log/ops_log.h" 19 | 20 | /* Basic error reporting */ 21 | #define OPS_REPORT_VECTOR_INNER_ERR(OPS_DESC, ...) OPS_INNER_ERR_STUB("E89999", OPS_DESC, __VA_ARGS__) 22 | #define OPS_REPORT_CUBE_INNER_ERR(OPS_DESC, ...) OPS_INNER_ERR_STUB("E69999", OPS_DESC, __VA_ARGS__) 23 | 24 | /* Conditional error reporting */ 25 | #define OPS_ERR_IF(COND, LOG_FUNC, EXPR) OPS_LOG_STUB_IF(COND, LOG_FUNC, EXPR) 26 | -------------------------------------------------------------------------------- /docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/index.po: -------------------------------------------------------------------------------- 1 | # SOME DESCRIPTIVE TITLE. 2 | # Copyright (C) 2025, vllm-ascend team 3 | # This file is distributed under the same license as the vllm-ascend 4 | # package. 5 | # FIRST AUTHOR , 2025. 6 | # 7 | #, fuzzy 8 | msgid "" 9 | msgstr "" 10 | "Project-Id-Version: vllm-ascend\n" 11 | "Report-Msgid-Bugs-To: \n" 12 | "POT-Creation-Date: 2025-07-18 09:01+0800\n" 13 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" 14 | "Last-Translator: FULL NAME \n" 15 | "Language-Team: zh_CN \n" 16 | "Language: zh_CN\n" 17 | "MIME-Version: 1.0\n" 18 | "Content-Type: text/plain; charset=utf-8\n" 19 | "Content-Transfer-Encoding: 8bit\n" 20 | "Plural-Forms: nplurals=1; plural=0;\n" 21 | "Generated-By: Babel 2.17.0\n" 22 | 23 | #: ../../user_guide/feature_guide/index.md:1 24 | #: ../../user_guide/feature_guide/index.md:5 25 | msgid "Feature Guide" 26 | msgstr "功能指南" 27 | 28 | #: ../../user_guide/feature_guide/index.md:3 29 | msgid "This section provides a detailed usage guide of vLLM Ascend features." 30 | msgstr "本节提供了 vLLM Ascend 功能的详细使用指南。" 31 | -------------------------------------------------------------------------------- /docs/source/locale/zh_CN/LC_MESSAGES/user_guide/configuration/index.po: -------------------------------------------------------------------------------- 1 | # SOME DESCRIPTIVE TITLE. 2 | # Copyright (C) 2025, vllm-ascend team 3 | # This file is distributed under the same license as the vllm-ascend 4 | # package. 5 | # FIRST AUTHOR , 2025.
6 | # 7 | #, fuzzy 8 | msgid "" 9 | msgstr "" 10 | "Project-Id-Version: vllm-ascend\n" 11 | "Report-Msgid-Bugs-To: \n" 12 | "POT-Creation-Date: 2025-07-18 09:01+0800\n" 13 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" 14 | "Last-Translator: FULL NAME \n" 15 | "Language-Team: zh_CN \n" 16 | "Language: zh_CN\n" 17 | "MIME-Version: 1.0\n" 18 | "Content-Type: text/plain; charset=utf-8\n" 19 | "Content-Transfer-Encoding: 8bit\n" 20 | "Plural-Forms: nplurals=1; plural=0;\n" 21 | "Generated-By: Babel 2.17.0\n" 22 | 23 | #: ../../user_guide/configuration/index.md:1 24 | #: ../../user_guide/configuration/index.md:5 25 | msgid "Configuration Guide" 26 | msgstr "配置指南" 27 | 28 | #: ../../user_guide/configuration/index.md:3 29 | msgid "This section provides a detailed configuration guide of vLLM Ascend." 30 | msgstr "本节提供了 vLLM Ascend 的详细配置指南。" 31 | -------------------------------------------------------------------------------- /docs/source/locale/zh_CN/LC_MESSAGES/user_guide/support_matrix/index.po: -------------------------------------------------------------------------------- 1 | # Translations template for PROJECT. 2 | # Copyright (C) 2025 ORGANIZATION 3 | # This file is distributed under the same license as the PROJECT project. 4 | # FIRST AUTHOR , 2025. 5 | # 6 | #, fuzzy 7 | msgid "" 8 | msgstr "" 9 | "Project-Id-Version: PROJECT VERSION\n" 10 | "Report-Msgid-Bugs-To: EMAIL@ADDRESS\n" 11 | "POT-Creation-Date: 2025-07-18 09:01+0800\n" 12 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" 13 | "Last-Translator: FULL NAME \n" 14 | "Language-Team: LANGUAGE \n" 15 | "MIME-Version: 1.0\n" 16 | "Content-Type: text/plain; charset=utf-8\n" 17 | "Content-Transfer-Encoding: 8bit\n" 18 | "Generated-By: Babel 2.17.0\n" 19 | 20 | #: ../../user_guide/support_matrix/index.md:5 21 | msgid "Support Matrix" 22 | msgstr "支持矩阵" 23 | 24 | #: ../../user_guide/support_matrix/index.md:1 25 | msgid "Features and models" 26 | msgstr "特性与模型" 27 | 28 | #: ../../user_guide/support_matrix/index.md:3 29 | msgid "This section provides a detailed supported matrix by vLLM Ascend." 30 | msgstr "本节提供了 vLLM Ascend 的详细支持矩阵。" 31 | -------------------------------------------------------------------------------- /tests/e2e/run_doctests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # This file is a part of the vllm-ascend project. 18 | # 19 | 20 | set -eo errexit 21 | 22 | . $(dirname "$0")/common.sh 23 | 24 | export VLLM_USE_MODELSCOPE=true 25 | 26 | _info "====> Start Quickstart test" 27 | . "${SCRIPT_DIR}/doctests/001-quickstart-test.sh" 28 | 29 | _info "====> Start pip binary installation test" 30 | . "${SCRIPT_DIR}/doctests/002-pip-binary-installation-test.sh" 31 | 32 | _info "Doctest passed." 
33 | -------------------------------------------------------------------------------- /docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/index.po: -------------------------------------------------------------------------------- 1 | # SOME DESCRIPTIVE TITLE. 2 | # Copyright (C) 2025, vllm-ascend team 3 | # This file is distributed under the same license as the vllm-ascend 4 | # package. 5 | # FIRST AUTHOR , 2025. 6 | # 7 | #, fuzzy 8 | msgid "" 9 | msgstr "" 10 | "Project-Id-Version: vllm-ascend\n" 11 | "Report-Msgid-Bugs-To: \n" 12 | "POT-Creation-Date: 2025-07-18 09:01+0800\n" 13 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" 14 | "Last-Translator: FULL NAME \n" 15 | "Language-Team: zh_CN \n" 16 | "Language: zh_CN\n" 17 | "MIME-Version: 1.0\n" 18 | "Content-Type: text/plain; charset=utf-8\n" 19 | "Content-Transfer-Encoding: 8bit\n" 20 | "Plural-Forms: nplurals=1; plural=0;\n" 21 | "Generated-By: Babel 2.17.0\n" 22 | 23 | #: ../../developer_guide/modeling/index.md:1 24 | #: ../../developer_guide/modeling/index.md:5 25 | msgid "Modeling" 26 | msgstr "新模型" 27 | 28 | #: ../../developer_guide/modeling/index.md:3 29 | msgid "" 30 | "This section provides tutorials of how to implement and register a new model" 31 | " into vllm-ascend." 32 | msgstr "本节提供了如何在 vllm-ascend 中实现并注册新模型的教程。" 33 | -------------------------------------------------------------------------------- /examples/external_online_dp/run_dp_template.sh: -------------------------------------------------------------------------------- 1 | export HCCL_IF_IP=your_ip_here 2 | export GLOO_SOCKET_IFNAME=your_socket_ifname_here 3 | export TP_SOCKET_IFNAME=your_socket_ifname_here 4 | export HCCL_SOCKET_IFNAME=your_socket_ifname_here 5 | export VLLM_LOGGING_LEVEL="info" 6 | export OMP_PROC_BIND=false 7 | export OMP_NUM_THREADS=10 8 | export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True 9 | export HCCL_DETERMINISTIC=True 10 | export HCCL_BUFFSIZE=1024 11 | export TASK_QUEUE_ENABLE=1 12 | 13 | export ASCEND_RT_VISIBLE_DEVICES=$1 14 | 15 | vllm serve model_path \ 16 | --host 0.0.0.0 \ 17 | --port $2 \ 18 | --data-parallel-size $3 \ 19 | --data-parallel-rank $4 \ 20 | --data-parallel-address $5 \ 21 | --data-parallel-rpc-port $6 \ 22 | --tensor-parallel-size $7 \ 23 | --enable-expert-parallel \ 24 | --seed 1024 \ 25 | --served-model-name dsv3 \ 26 | --max-model-len 8192 \ 27 | --max-num-batched-tokens 2048 \ 28 | --max-num-seqs 16 \ 29 | --trust-remote-code \ 30 | --gpu-memory-utilization 0.9 \ 31 | --quantization ascend \ 32 | --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}' 33 | -------------------------------------------------------------------------------- /vllm_ascend/patch/worker/patch_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | # torch_npu.argsort does not support bool yet; support will be added in the future. 5 | # TODO: Once the argsort operator is ready, this patch must be removed. 6 | def _argsort(tensor, *args, **kwargs): 7 | if tensor.dtype == torch.bool: 8 | # If the sort is not stable, it will produce redundant outputs.
9 | kwargs["stable"] = True 10 | return torch.argsort(tensor.to(torch.int32), *args, **kwargs) 11 | else: 12 | return torch.argsort(tensor, *args, **kwargs) 13 | 14 | 15 | class _TorchWrapper: 16 | 17 | def __init__(self): 18 | self._raw_torch = torch 19 | 20 | def __getattr__(self, name): 21 | if name == "argsort": 22 | return _argsort 23 | else: 24 | return getattr(self._raw_torch, name) 25 | 26 | 27 | _is_patched = False 28 | 29 | 30 | # Patch argsort only for the torch module referenced in gdn_attn. 31 | def patch_torch_npu_argsort(): 32 | global _is_patched 33 | if not _is_patched: 34 | import vllm.v1.attention.backends.gdn_attn as gdn_attn 35 | gdn_attn.torch = _TorchWrapper() 36 | _is_patched = True 37 | -------------------------------------------------------------------------------- /vllm_ascend/ops/triton/triton_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | 3 | import torch 4 | from vllm.triton_utils import HAS_TRITON, triton 5 | 6 | _NUM_AICORE = -1 7 | _NUM_VECTORCORE = -1 8 | 9 | 10 | def init_device_properties_triton(): 11 | global _NUM_AICORE, _NUM_VECTORCORE 12 | if _NUM_AICORE == -1 and HAS_TRITON: 13 | device_properties: Dict[str, Any] = ( 14 | triton.runtime.driver.active.utils.get_device_properties( 15 | torch.npu.current_device())) 16 | _NUM_AICORE = device_properties.get("num_aicore", -1) 17 | _NUM_VECTORCORE = device_properties.get("num_vectorcore", -1) 18 | assert _NUM_AICORE > 0 and _NUM_VECTORCORE > 0, "Failed to detect device properties." 19 | 20 | 21 | def get_aicore_num(): 22 | global _NUM_AICORE 23 | assert _NUM_AICORE > 0, "Device properties not initialized. Please call init_device_properties_triton() first." 24 | return _NUM_AICORE 25 | 26 | 27 | def get_vectorcore_num(): 28 | global _NUM_VECTORCORE 29 | assert _NUM_VECTORCORE > 0, "Device properties not initialized. Please call init_device_properties_triton() first." 30 | return _NUM_VECTORCORE 31 | -------------------------------------------------------------------------------- /docs/source/community/user_stories/llamafactory.md: -------------------------------------------------------------------------------- 1 | # LLaMA-Factory 2 | 3 | **Introduction** 4 | 5 | [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory) is an easy-to-use and efficient platform for training and fine-tuning large language models. With LLaMA-Factory, you can fine-tune hundreds of pre-trained models locally without writing any code. 6 | 7 | LLaMA-Factory users need to evaluate the model and run inference after fine-tuning. 8 | 9 | **Business challenge** 10 | 11 | LLaMA-Factory uses Transformers to perform inference on Ascend NPUs, but the speed is slow. 12 | 13 | **Benefits with vLLM Ascend** 14 | 15 | With the joint efforts of LLaMA-Factory and vLLM Ascend ([LLaMA-Factory#7739](https://github.com/hiyouga/LLaMA-Factory/pull/7739)), LLaMA-Factory has achieved significant performance gains during model inference. Benchmark results show that its inference speed is now up to 2× faster than the Transformers implementation. 16 | 17 | **Learn more** 18 | 19 | See more details about LLaMA-Factory and how it uses vLLM Ascend for inference on Ascend NPUs in [LLaMA-Factory Ascend NPU Inference](https://llamafactory.readthedocs.io/en/latest/advanced/npu_inference.html).
20 | -------------------------------------------------------------------------------- /benchmarks/tests/throughput-tests.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "test_name": "throughput_qwen3_8B_tp1", 4 | "parameters": { 5 | "model": "Qwen/Qwen3-8B", 6 | "tensor_parallel_size": 1, 7 | "load_format": "dummy", 8 | "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", 9 | "num_prompts": 200, 10 | "backend": "vllm" 11 | } 12 | }, 13 | { 14 | "test_name": "throughput_qwen2_5vl_7B_tp1", 15 | "parameters": { 16 | "model": "Qwen/Qwen2.5-VL-7B-Instruct", 17 | "tensor_parallel_size": 1, 18 | "backend": "vllm-chat", 19 | "dataset_name": "hf", 20 | "hf_split": "train", 21 | "max_model_len": 16384, 22 | "dataset_path": "lmarena-ai/vision-arena-bench-v0.1", 23 | "num_prompts": 200 24 | } 25 | }, 26 | { 27 | "test_name": "throughput_qwen2_5_7B_tp1", 28 | "parameters": { 29 | "model": "Qwen/Qwen2.5-7B-Instruct", 30 | "tensor_parallel_size": 1, 31 | "load_format": "dummy", 32 | "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", 33 | "num_prompts": 200, 34 | "backend": "vllm" 35 | } 36 | } 37 | ] 38 | 39 | -------------------------------------------------------------------------------- /vllm_ascend/eplb/core/policy/policy_random.py: -------------------------------------------------------------------------------- 1 | # Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 2 | # TODO: Once https://github.com/vllm-project/vllm/pull/24069 is merged in vllm, remove this policy. 3 | import copy 4 | import random 5 | 6 | from .policy_abstract import DynamicConfig, EplbPolicy 7 | 8 | random.seed(42) 9 | 10 | 11 | class RandomLoadBalance(EplbPolicy): 12 | 13 | def __init__(self, config: DynamicConfig): 14 | super().__init__(config) 15 | 16 | def rebalance_experts(self, current_expert_table, expert_workload): 17 | new_table = copy.deepcopy(current_expert_table) 18 | num_layers = len(current_expert_table) 19 | 20 | for i in range(num_layers): 21 | # randomly choose two cards 22 | # indices = random.sample(range(num_card), 2) 23 | indices = [3, 1] 24 | 25 | # swap redundant experts 26 | expert_id_to_exchange = new_table[i][indices[0]][-1].clone() 27 | new_table[i][indices[0]][-1] = new_table[i][indices[1]][-1] 28 | new_table[i][indices[1]][-1] = expert_id_to_exchange 29 | 30 | return 1, [-i for i in range(num_layers)], new_table 31 | -------------------------------------------------------------------------------- /csrc/moe_dispatch_normal/op_kernel/moe_dispatch_normal_tiling.h: -------------------------------------------------------------------------------- 1 | #ifndef MOE_DISPATCH_NORMAL_TILING_H 2 | #define MOE_DISPATCH_NORMAL_TILING_H 3 | 4 | struct MoeDispatchNormalInfo { 5 | uint32_t epWorldSize; // epWorldSize 6 | uint32_t tpWorldSize; // tpWorldSize 7 | uint32_t epRankId; // epRankId 8 | uint32_t tpRankId; // tpRankId 9 | uint32_t moeExpertNum; // moe expert number 10 | uint32_t quantMode; // quant mode 11 | uint32_t globalBs; // globalBs = BS * worldSize 12 | uint32_t bs; // bs 13 | uint32_t k; // k 14 | uint32_t h; // h 15 | uint32_t aivNum; // aivNum 16 | bool isQuant; // whether quant or not 17 | bool reserved2; // reserved 18 | bool reserved3; // reserved 19 | uint64_t totalUbSize; // total UB size 20 | uint64_t totalWinSize; 21 | }; 22 | 23 | struct MoeDispatchNormalTilingData { 24 | Mc2InitTiling mc2InitTiling; 25 | Mc2CcTiling mc2CcTiling1;
26 | Mc2CcTiling mc2CcTiling2; 27 | MoeDispatchNormalInfo moeDispatchNormalInfo; 28 | }; 29 | 30 | #endif -------------------------------------------------------------------------------- /vllm_ascend/xlite/xlite_worker.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 3 | # Copyright 2023 The vLLM team. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | from vllm_ascend.worker.worker import NPUWorker 17 | from vllm_ascend.xlite.xlite_model_runner import XliteModelRunner 18 | 19 | 20 | class XliteWorker(NPUWorker): 21 | """Xlite worker based on NPUWorker. Only xlite-specific code should be added to this class.""" 22 | 23 | def init_device(self): 24 | """Override init_device to initialize the xlite model runner.""" 25 | self.device = self._init_device() 26 | self.model_runner = XliteModelRunner(self.vllm_config, self.device) 27 | -------------------------------------------------------------------------------- /csrc/mla_preprocess/op_kernel/kernel/common.h: -------------------------------------------------------------------------------- 1 | /* Adapted from 2 | * https://gitee.com/ascend/ascend-transformer-boost.git 3 | * 4 | * Copyright (c) 2024 Huawei Technologies Co., Ltd. 5 | * This file is a part of the CANN Open Software. 6 | * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 7 | * Please refer to the License for details. You may not use this file except in compliance with the License. 8 | * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, 9 | * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 10 | * See LICENSE in the root of the software repository for the full text of the License. 11 | */ 12 | #ifndef INCLUDE_COMMON_H 13 | #define INCLUDE_COMMON_H 14 | 15 | #define CONST_2 2 16 | 17 | #define SET_FLAG(trigger, waiter, e) AscendC::SetFlag((e)) 18 | #define WAIT_FLAG(trigger, waiter, e) AscendC::WaitFlag((e)) 19 | #define PIPE_BARRIER(pipe) AscendC::PipeBarrier() 20 | 21 | #ifndef __force_inline__ 22 | #define __force_inline__ inline __attribute__((always_inline)) 23 | #endif 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /vllm_ascend/patch/platform/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 3 | # This file is a part of the vllm-ascend project. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import os 18 | 19 | import vllm_ascend.patch.platform.patch_distributed # noqa 20 | import vllm_ascend.patch.platform.patch_ec_connector # noqa 21 | import vllm_ascend.patch.platform.patch_mamba_config # noqa 22 | import vllm_ascend.patch.platform.patch_sched_yield # noqa 23 | 24 | if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv( 25 | "EXPERT_MAP_RECORD", "false") == "true": 26 | import vllm_ascend.patch.platform.patch_multiproc_executor # noqa 27 | -------------------------------------------------------------------------------- /vllm_ascend/worker/v2/input_batch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from vllm.v1.worker.gpu.input_batch import InputBuffers 4 | 5 | 6 | class AscendInputBuffers(InputBuffers): 7 | """Input buffers for Ascend NPUs.""" 8 | 9 | def __init__( 10 | self, 11 | max_num_reqs: int, 12 | max_num_tokens: int, 13 | inputs_embeds_size: int, 14 | vocab_size: int, 15 | dtype: torch.dtype, 16 | device: torch.device, 17 | pin_memory: bool, 18 | ): 19 | super().__init__( 20 | max_num_reqs, 21 | max_num_tokens, 22 | inputs_embeds_size, 23 | vocab_size, 24 | dtype, 25 | device, 26 | pin_memory, 27 | ) 28 | # Create seq_lens_cpu and seq_lens_np. 29 | # The NPU attention backend still needs seq_lens on the CPU side. 30 | self.seq_lens_cpu: torch.Tensor = torch.zeros( 31 | max_num_reqs, 32 | dtype=torch.int32, 33 | device="cpu", 34 | ) 35 | # seq_lens_np and seq_lens_cpu share the same memory. 36 | # define seq_lens_np for easier calculation with numpy. 37 | self.seq_lens_np: np.ndarray = self.seq_lens_cpu.numpy() 38 | -------------------------------------------------------------------------------- /tools/check_repo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 5 | # Copyright 2023 The vLLM team. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # This file is a part of the vllm-ascend project. 19 | # Adapted from https://github.com/vllm-project/vllm/tree/main/tools 20 | # 21 | 22 | # Checks whether the repo is clean and whether tags are available (necessary to correctly produce vllm version at build time) 23 | 24 | if ! git diff --quiet; then 25 | echo "Repo is dirty" >&2 26 | 27 | exit 1 28 | fi 29 | 30 | if ! git describe --tags; then 31 | echo "No tags are present. Is this a shallow clone?
git fetch --unshallow --tags" >&2 32 | 33 | exit 1 34 | fi 35 | -------------------------------------------------------------------------------- /docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/index.po: -------------------------------------------------------------------------------- 1 | # SOME DESCRIPTIVE TITLE. 2 | # Copyright (C) 2025, vllm-ascend team 3 | # This file is distributed under the same license as the vllm-ascend 4 | # package. 5 | # FIRST AUTHOR , 2025. 6 | # 7 | #, fuzzy 8 | msgid "" 9 | msgstr "" 10 | "Project-Id-Version: vllm-ascend\n" 11 | "Report-Msgid-Bugs-To: \n" 12 | "POT-Creation-Date: 2025-07-18 09:01+0800\n" 13 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" 14 | "Last-Translator: FULL NAME \n" 15 | "Language-Team: zh_CN \n" 16 | "Language: zh_CN\n" 17 | "MIME-Version: 1.0\n" 18 | "Content-Type: text/plain; charset=utf-8\n" 19 | "Content-Transfer-Encoding: 8bit\n" 20 | "Plural-Forms: nplurals=1; plural=0;\n" 21 | "Generated-By: Babel 2.17.0\n" 22 | 23 | #: ../../developer_guide/feature_guide/index.md:1 24 | #: ../../developer_guide/feature_guide/index.md:5 25 | msgid "Feature Guide" 26 | msgstr "功能指南" 27 | 28 | #: ../../developer_guide/feature_guide/index.md:3 29 | msgid "" 30 | "This section provides an overview of the features implemented in vLLM " 31 | "Ascend. Developers can refer to this guide to understand how vLLM Ascend " 32 | "works." 33 | msgstr "本节概述了 vLLM Ascend 中实现的功能。开发者可以参考本指南以了解 vLLM Ascend 的工作原理。" 34 | -------------------------------------------------------------------------------- /tests/e2e/nightly/ops/test_gating_top_k_softmax.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch_npu 4 | 5 | 6 | @pytest.mark.parametrize( 7 | 'B', 8 | [1, 16, 64, 128, 32768], 9 | ) 10 | @pytest.mark.parametrize( 11 | 'D', 12 | [8, 16, 32, 64, 128], 13 | ) 14 | @pytest.mark.parametrize( 15 | 'top_k', 16 | [1, 2, 4, 8], 17 | ) 18 | @pytest.mark.parametrize( 19 | "dtype, atol, rtol", 20 | [ 21 | (torch.float16, 1e-3, 1e-3), 22 | (torch.bfloat16, 1e-3, 1e-3), 23 | ], 24 | ) 25 | def test_gating_top_k_softmax(B: int, D: int, top_k: int, dtype, atol, rtol): 26 | x = torch.rand((B, D), dtype=dtype).to("npu") 27 | # finished = torch.randint(1, size=(B,), dtype=torch.bool).to("npu") 28 | finished = None 29 | y, expert_idx, row_idx = torch_npu.npu_moe_gating_top_k_softmax(x, 30 | finished, 31 | k=top_k) 32 | 33 | topk_weights = x.softmax(dim=-1) 34 | topk_weights, topk_ids = topk_weights.topk(top_k, dim=-1) 35 | topk_ids = topk_ids.to(torch.int32) 36 | assert torch.allclose(y, topk_weights, atol=atol, rtol=rtol) 37 | assert torch.allclose(expert_idx, topk_ids, atol=atol, rtol=rtol) 38 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 6 | ### What this PR does / why we need it? 7 | 15 | 16 | ### Does this PR introduce _any_ user-facing change? 17 | 21 | 22 | ### How was this patch tested?
23 | 28 | -------------------------------------------------------------------------------- /tests/e2e/singlecard/pooling/test_classification.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from modelscope import snapshot_download  # type: ignore[import-untyped] 3 | from transformers import AutoModelForSequenceClassification 4 | 5 | from tests.e2e.conftest import HfRunner, VllmRunner 6 | 7 | 8 | def test_classify_correctness() -> None: 9 | 10 | model_name = snapshot_download("Howeee/Qwen2.5-1.5B-apeach") 11 | 12 | prompts = [ 13 | "Hello, my name is", 14 | "The president of the United States is", 15 | "The capital of France is", 16 | "The future of AI is what", 17 | ] 18 | with VllmRunner( 19 | model_name, 20 | runner="pooling", 21 | max_model_len=None, 22 | cudagraph_capture_sizes=[4], 23 | ) as vllm_runner: 24 | vllm_outputs = vllm_runner.classify(prompts) 25 | 26 | with HfRunner(model_name, 27 | dtype="float32", 28 | auto_cls=AutoModelForSequenceClassification) as hf_runner: 29 | hf_outputs = hf_runner.classify(prompts) 30 | 31 | for hf_output, vllm_output in zip(hf_outputs, vllm_outputs): 32 | hf_output = torch.tensor(hf_output) 33 | vllm_output = torch.tensor(vllm_output) 34 | assert torch.allclose(hf_output, vllm_output, rtol=1e-2) 35 | -------------------------------------------------------------------------------- /tests/e2e/multicard/test_expert_parallel.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from tests.e2e.conftest import VllmRunner 4 | from tests.e2e.model_utils import check_outputs_equal 5 | 6 | 7 | @pytest.mark.parametrize("model_name", ["deepseek-ai/DeepSeek-V2-Lite-Chat"]) 8 | def test_deepseek_correctness_ep(model_name): 9 | example_prompts = [ 10 | "Hello, my name is", 11 | "The president of the United States is", 12 | "The capital of France is", 13 | "The future of AI is", 14 | ] 15 | max_tokens = 5 16 | 17 | # FIXME: Strangely, chunked prefill can lead to different results; investigate further. 18 | with VllmRunner(model_name, tensor_parallel_size=2, 19 | enforce_eager=False) as vllm_model: 20 | tp_output = vllm_model.generate_greedy(example_prompts, max_tokens) 21 | 22 | with VllmRunner(model_name, 23 | tensor_parallel_size=2, 24 | enable_expert_parallel=True, 25 | enforce_eager=False) as vllm_model: 26 | ep_output = vllm_model.generate_greedy(example_prompts, max_tokens) 27 | 28 | check_outputs_equal( 29 | outputs_0_lst=ep_output, 30 | outputs_1_lst=tp_output, 31 | name_0="ep_output", 32 | name_1="tp_output", 33 | ) 34 | -------------------------------------------------------------------------------- /docs/source/developer_guide/feature_guide/add_custom_aclnn_op.md: -------------------------------------------------------------------------------- 1 | # Adding a custom aclnn operation 2 | 3 | This document describes how to add a custom aclnn operation to vllm-ascend. 4 | 5 | ## How does a custom aclnn operation work in vllm-ascend? 6 | 7 | Custom aclnn operations are built and installed into the `vllm_ascend/cann_ops_custom` directory during the build process of vllm-ascend. The aclnn operators are then bound to the `torch.ops._C_ascend` module, enabling users to invoke them in vllm-ascend Python code. 8 | 9 | To enable custom operations, use the following code: 10 | 11 | ```python 12 | from vllm_ascend.utils import enable_custom_op 13 | 14 | enable_custom_op() 15 | ``` 16 | 17 | ## How to add a custom aclnn operation?
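
Before walking through the build-side changes, it helps to see what the end state looks like from the Python side. The sketch below is illustrative only: `my_custom_op` is a hypothetical operator name standing in for whichever operator you bind in `csrc/torch_binding.cpp`.

```python
import torch
import torch_npu  # noqa: F401  # registers the NPU device with torch

from vllm_ascend.utils import enable_custom_op

# Loads the kernels built into vllm_ascend/cann_ops_custom and exposes the
# bound operators on the torch.ops._C_ascend namespace.
enable_custom_op()

# `my_custom_op` is a placeholder; substitute the name you registered in
# csrc/torch_binding.cpp, with whatever signature you gave it there.
x = torch.randn(4, 128).npu()
out = torch.ops._C_ascend.my_custom_op(x)
```

The steps to add a new operation are: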
18 | 19 | - Create a new operation folder under the `csrc` directory 20 | - Create `op_host` and `op_kernel` directories for the host and kernel source code 21 | - Add build options in `csrc/build_aclnn.sh` for the supported SOCs. Note that multiple ops should be separated with `;`, e.g. `CUSTOM_OPS=op1;op2;op3` 22 | - Bind the aclnn operators to the `torch.ops._C_ascend` module in `csrc/torch_binding.cpp` 23 | - Write a meta implementation in `csrc/torch_binding_meta.cpp` so the op can be captured into an ACL graph 24 | 25 | After a successful build of vllm-ascend, the custom aclnn operation can be invoked in Python code. -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | # Should be mirrored in requirements.txt 3 | requires = [ 4 | "attrs", 5 | "cmake>=3.26", 6 | "decorator", 7 | "einops", 8 | "googleapis-common-protos", 9 | "numpy<2.0.0", 10 | "packaging", 11 | "pip", 12 | "pybind11", 13 | "pyyaml", 14 | "scipy", 15 | "pandas", 16 | "pandas-stubs", 17 | "psutil", 18 | "setuptools>=64", 19 | "setuptools-scm>=8", 20 | "transformers<=4.57.1", 21 | "torch-npu==2.8.0", 22 | "torch==2.8.0", 23 | "torchvision", 24 | "wheel", 25 | "msgpack", 26 | "quart", 27 | "numba", 28 | "fastapi<0.124.0", 29 | "opencv-python-headless<=4.11.0.86", # Required to avoid numpy version conflict with vllm 30 | "compressed_tensors>=0.11.0" 31 | ] 32 | build-backend = "setuptools.build_meta" 33 | 34 | [tool.pymarkdown] 35 | plugins.md004.style = "sublist" # ul-style 36 | plugins.md007.indent = 4 # ul-indent 37 | plugins.md007.start_indented = true # ul-indent 38 | plugins.md013.enabled = false # line-length 39 | plugins.md041.enabled = false # first-line-h1 40 | plugins.md033.enabled = false # inline-html 41 | plugins.md046.enabled = false # code-block-style 42 | plugins.md024.allow_different_nesting = true # no-duplicate-headers 43 | plugins.md029.enabled = false # ol-prefix 44 | -------------------------------------------------------------------------------- /vllm_ascend/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # This file is a part of the vllm-ascend project.
16 | # 17 | 18 | 19 | def register(): 20 | """Register the NPU platform.""" 21 | 22 | return "vllm_ascend.platform.NPUPlatform" 23 | 24 | 25 | def register_connector(): 26 | from vllm_ascend.distributed import register_connector 27 | register_connector() 28 | 29 | 30 | def register_model_loader(): 31 | from .model_loader.netloader import register_netloader 32 | register_netloader() 33 | 34 | 35 | def register_service_profiling(): 36 | from .profiling_config import generate_service_profiling_config 37 | generate_service_profiling_config() 38 | -------------------------------------------------------------------------------- /csrc/utils/inc/fallback_comm.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2024 Huawei Technologies Co., Ltd. 3 | * This file is a part of the CANN Open Software. 4 | * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 5 | * Please refer to the License for details. You may not use this file except in compliance with the License. 6 | * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, 7 | * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 8 | * See LICENSE in the root of the software repository for the full text of the License. 9 | */ 10 | 11 | /*! 12 | * \file fallback_comm.h 13 | * \brief 14 | */ 15 | 16 | #ifndef INC_EXTERNAL_GRAPH_FALLBACK_COMMON_H_ 17 | #define INC_EXTERNAL_GRAPH_FALLBACK_COMMON_H_ 18 | 19 | #include "aclnn/aclnn_base.h" 20 | #include "exe_graph/runtime/op_execute_context.h" 21 | #include "exe_graph/runtime/tensor.h" 22 | #include "register/op_impl_registry.h" 23 | #include "runtime/base.h" 24 | 25 | #ifdef __cplusplus 26 | extern "C" { 27 | #endif 28 | 29 | namespace fallback { 30 | 31 | aclDataType ToAclDataType(ge::DataType dtype); 32 | } // namespace fallback 33 | 34 | #ifdef __cplusplus 35 | } 36 | #endif 37 | 38 | #endif // INC_EXTERNAL_GRAPH_FALLBACK_COMMON_H_ 39 | -------------------------------------------------------------------------------- /tools/png-lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 5 | # Copyright 2023 The vLLM team. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # This file is a part of the vllm-ascend project. 19 | # Adapted from https://github.com/vllm-project/vllm/tree/main/tools 20 | # 21 | 22 | # Ensure that *.excalidraw.png files have the excalidraw metadata 23 | # embedded in them. This ensures they can be loaded back into 24 | # the tool and edited in the future. 25 | 26 | find . -iname '*.excalidraw.png' | while read -r file; do 27 | if git check-ignore -q "$file"; then 28 | continue 29 | fi 30 | if ! grep -q "excalidraw+json" "$file"; then 31 | echo "$file was not exported from excalidraw with 'Embed Scene' enabled." 
32 | exit 1 33 | fi 34 | done 35 | -------------------------------------------------------------------------------- /vllm_ascend/patch/worker/patch_rope.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 3 | # This file is a part of the vllm-ascend project. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | import torch 19 | import torch.nn as nn 20 | from vllm.model_executor.layers.rotary_embedding.base import \ 21 | RotaryEmbeddingBase 22 | 23 | 24 | class AscendRotaryEmbeddingBase(nn.Module): 25 | 26 | def get_cos_sin(self, seqlen: int) -> tuple[torch.Tensor, torch.Tensor]: 27 | cos_sin = self.cos_sin_cache[:seqlen] 28 | cos, sin = cos_sin.chunk(2, dim=-1) 29 | return cos, sin 30 | 31 | 32 | # NOTE: This will be removed once vllm-ascend is aligned with the latest vllm main. 33 | RotaryEmbeddingBase.get_cos_sin = AscendRotaryEmbeddingBase.get_cos_sin 34 | -------------------------------------------------------------------------------- /docs/source/user_guide/feature_guide/lora.md: -------------------------------------------------------------------------------- 1 | # LoRA Adapters Guide 2 | 3 | ## Overview 4 | vllm-ascend supports LoRA just as vLLM does. The usage and more details can be found in the [vLLM official documentation](https://docs.vllm.ai/en/latest/features/lora.html). 5 | 6 | You can refer to [Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html#list-of-text-only-language-models) to find out which models support LoRA in vLLM. 7 | 8 | You can now run LoRA in ACLGraph mode. Please refer to the [Graph Mode Guide](./graph_mode.md) for better LoRA performance. 9 | 10 | Model download addresses: 11 | - base model: https://www.modelscope.cn/models/vllm-ascend/Llama-2-7b-hf/files 12 | - LoRA model: https://www.modelscope.cn/models/vllm-ascend/llama-2-7b-sql-lora-test/files 13 | 14 | ## Example 15 | We provide a simple LoRA example here, which enables ACLGraph mode by default. 16 | 17 | ```shell 18 | vllm serve meta-llama/Llama-2-7b \ 19 | --enable-lora \ 20 | --lora-modules '{"name": "sql-lora", "path": "/path/to/lora", "base_model_name": "meta-llama/Llama-2-7b"}' 21 | ``` 22 | 23 | ## Custom LoRA Operators 24 | 25 | We have implemented LoRA-related AscendC operators, such as bgmv_shrink, bgmv_expand, sgmv_shrink and sgmv_expand. You can find them under the `csrc/kernels` directory of the [vllm-ascend repo](https://github.com/vllm-project/vllm-ascend.git). 26 | -------------------------------------------------------------------------------- /tools/mypy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 5 | # Copyright 2023 The vLLM team.
6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # This file is a part of the vllm-ascend project. 19 | # Adapted from https://github.com/vllm-project/vllm/tree/main/tools 20 | # 21 | 22 | CI=${1:-0} 23 | PYTHON_VERSION=${2:-local} 24 | 25 | if [ "$CI" -eq 1 ]; then 26 | set -e 27 | fi 28 | 29 | if [ "$PYTHON_VERSION" == "local" ]; then 30 | PYTHON_VERSION=$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') 31 | fi 32 | 33 | run_mypy() { 34 | echo "Running mypy on $1" 35 | mypy --check-untyped-defs --follow-imports skip --python-version "${PYTHON_VERSION}" "$@" 36 | } 37 | 38 | run_mypy vllm_ascend 39 | run_mypy examples 40 | run_mypy tests 41 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/500-feature-request.yml: -------------------------------------------------------------------------------- 1 | name: 🚀 Feature request 2 | description: Submit a proposal/request for a new vllm-ascend feature 3 | title: "[Feature]: " 4 | labels: ["feature request"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't already been addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm-ascend/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | - type: textarea 12 | attributes: 13 | label: 🚀 The feature, motivation and pitch 14 | description: > 15 | A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too. 16 | validations: 17 | required: true 18 | - type: textarea 19 | attributes: 20 | label: Alternatives 21 | description: > 22 | A description of any alternative solutions or features you've considered, if any. 23 | - type: textarea 24 | attributes: 25 | label: Additional context 26 | description: > 27 | Add any other context or screenshots about the feature request. 28 | - type: markdown 29 | attributes: 30 | value: > 31 | Thanks for contributing 🎉!
32 | -------------------------------------------------------------------------------- /tests/ut/sample/test_sampler.py: -------------------------------------------------------------------------------- 1 | from unittest import mock 2 | 3 | import torch 4 | 5 | from tests.ut.base import TestBase 6 | from vllm_ascend.sample.sampler import AscendSampler, AscendTopKTopPSampler 7 | 8 | 9 | class TestAscendSampler(TestBase): 10 | 11 | def test_init_with_raw_logprobs(self): 12 | sampler = AscendSampler(logprobs_mode="raw_logprobs") 13 | self.assertEqual(sampler.logprobs_mode, "raw_logprobs") 14 | self.assertTrue(hasattr(sampler, 'topk_topp_sampler')) 15 | self.assertIsInstance(sampler.topk_topp_sampler, AscendTopKTopPSampler) 16 | 17 | 18 | class TestAscendTopKTopPSampler(TestBase): 19 | 20 | @mock.patch("vllm_ascend.sample.sampler.random_sample") 21 | @mock.patch("torch_npu.npu_top_k_top_p") 22 | def test_npu_topk_topp_called_when_optimized(self, mock_npu_op, 23 | mock_random_sample): 24 | mock_npu_op.return_value = (torch.randn(1, 3)) 25 | mock_random_sample.return_value = torch.randn(3) 26 | sampler = AscendTopKTopPSampler() 27 | 28 | logits = torch.tensor([[1.0, 2.0, 3.0]]) 29 | k = torch.tensor([2]) 30 | p = torch.tensor([0.9]) 31 | generators = {0: torch.Generator()} 32 | generators[0].manual_seed(42) 33 | 34 | sampler.forward_native(logits, generators, k, p) 35 | mock_npu_op.assert_called_once_with(logits, p, k) 36 | -------------------------------------------------------------------------------- /vllm_ascend/patch/worker/patch_minicpm.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 3 | # This file is a part of the vllm-ascend project. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | import torch 19 | from vllm.model_executor.models.minicpm import MiniCPMAttention 20 | 21 | 22 | def forward( 23 | self, 24 | positions: torch.Tensor, 25 | hidden_states: torch.Tensor, 26 | ) -> torch.Tensor: 27 | qkv, _ = self.qkv_proj(hidden_states) 28 | q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) 29 | q, k = self.rotary_emb(positions, q, k) 30 | attn_output = self.attn(q, k, v) 31 | output, _ = self.o_proj(attn_output) 32 | return output 33 | 34 | 35 | # The type conversion in the forward function is deleted to support the rope operator. 
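# Assigning the plain function to the class attribute monkey-patches every
# MiniCPMAttention instance at once, without subclassing the upstream model.
# Keeping q and k in their original dtype lets the Ascend rope kernel run
# directly, avoiding the float32 round-trip the upstream forward performs.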
36 | MiniCPMAttention.forward = forward 37 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/110-user-story.yml: -------------------------------------------------------------------------------- 1 | name: 📚 User Story 2 | description: Apply for a user story to be displayed on https://vllm-ascend.readthedocs.io/en/latest/community/user_stories/index.html 3 | title: "[User Story]: " 4 | labels: ["user-story"] 5 | 6 | body: 7 | - type: textarea 8 | attributes: 9 | label: 📚 Title 10 | description: > 11 | A clear title describing your user story. 12 | validations: 13 | required: true 14 | - type: textarea 15 | attributes: 16 | label: About / Introduction 17 | description: > 18 | A brief introduction to the background of your use case, such as your scenario, hardware size, etc. 19 | - type: textarea 20 | attributes: 21 | label: Business Challenges 22 | description: > 23 | Tell us what kind of challenges you faced in this user story. 24 | - type: textarea 25 | attributes: 26 | label: Solving challenges with vLLM Ascend and benefits 27 | description: > 28 | Tell us how vLLM Ascend helped you overcome the challenges, including details like how you used it, which version you used, hardware info, etc., and what kind of benefits you got from using vLLM Ascend 29 | - type: textarea 30 | attributes: 31 | label: Extra Info 32 | description: > 33 | Any extra information you want to include in this story 34 | - type: markdown 35 | attributes: 36 | value: > 37 | Thanks for contributing 🎉! 38 | -------------------------------------------------------------------------------- /csrc/dispatch_gmm_combine_decode/op_kernel/dispatch_gmm_combine_decode/epilogue/dispatch_policy.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025 Huawei Technologies Co., Ltd. 3 | * This file is a part of the CANN Open Software. 4 | * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 5 | * Please refer to the License for details. You may not use this file except in compliance with the License. 6 | * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, 7 | * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 8 | * See LICENSE in the root of the software repository for the full text of the License.
9 | */ 10 | #pragma once 11 | #include "catlass/epilogue/dispatch_policy.hpp" 12 | 13 | namespace Catlass::Epilogue { 14 | 15 | template <uint32_t UB_STAGES_, uint32_t EXEC_FLAG_> 16 | struct EpilogueAtlasA2PerTokenDequantSwiglu { 17 | using ArchTag = Arch::AtlasA2; 18 | static constexpr uint32_t UB_STAGES = UB_STAGES_; 19 | static constexpr uint32_t EXEC_FLAG = EXEC_FLAG_; 20 | }; 21 | 22 | template <uint32_t UB_STAGES_, uint32_t EXEC_FLAG_> 23 | struct EpilogueAtlasA2PerTokenDequantCombine { 24 | using ArchTag = Arch::AtlasA2; 25 | static constexpr uint32_t UB_STAGES = UB_STAGES_; 26 | static constexpr uint32_t EXEC_FLAG = EXEC_FLAG_; 27 | }; 28 | 29 | } // namespace Catlass::Epilogue 30 | -------------------------------------------------------------------------------- /vllm_ascend/worker/v2/utils.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | 3 | import torch 4 | 5 | 6 | @contextmanager 7 | def torch_cuda_wrapper(): 8 | ori_event = torch.cuda.Event 9 | ori_stream = torch.cuda.Stream 10 | ori_default_stream = torch.cuda.default_stream 11 | ori_current_stream = torch.cuda.current_stream 12 | ori_graph_pool_handle = torch.cuda.graph_pool_handle 13 | ori_cuda_graph_cls = torch.cuda.CUDAGraph 14 | ori_cuda_graph_func = torch.cuda.graph 15 | try: 16 | torch.cuda.Event = torch.npu.Event 17 | torch.cuda.Stream = torch.npu.Stream 18 | torch.cuda.default_stream = torch.npu.default_stream 19 | torch.cuda.current_stream = torch.npu.current_stream 20 | torch.cuda.graph_pool_handle = torch.npu.graph_pool_handle 21 | torch.cuda.CUDAGraph = torch.npu.NpuGraph 22 | torch.cuda.graph = torch.npu.graph 23 | yield 24 | finally: 25 | # Restore the original torch.cuda attributes, so calling CUDA ops 26 | # in an NPU environment still raises an error. 27 | torch.cuda.Event = ori_event 28 | torch.cuda.Stream = ori_stream 29 | torch.cuda.default_stream = ori_default_stream 30 | torch.cuda.current_stream = ori_current_stream 31 | torch.cuda.graph_pool_handle = ori_graph_pool_handle 32 | torch.cuda.CUDAGraph = ori_cuda_graph_cls 33 | torch.cuda.graph = ori_cuda_graph_func 34 | -------------------------------------------------------------------------------- /tests/e2e/nightly/ops/test_bgmv_shrink.py: -------------------------------------------------------------------------------- 1 | import gc 2 | 3 | import torch 4 | 5 | from vllm_ascend.utils import enable_custom_op 6 | 7 | enable_custom_op() 8 | 9 | DEFAULT_ATOL = 1e-3 10 | DEFAULT_RTOL = 1e-3 11 | 12 | 13 | def bgmv_shrink_cpu_impl(x: torch.Tensor, w: torch.Tensor, 14 | indices: torch.Tensor, y: torch.Tensor, 15 | scaling: float) -> torch.Tensor: 16 | W = w[indices, :, :].transpose(-1, -2).to(torch.float32) 17 | z = torch.bmm(x.unsqueeze(1).to(torch.float32), W).squeeze() 18 | y[:, :] += z * scaling 19 | return y 20 | 21 | 22 | @torch.inference_mode() 23 | def test_bgmv_shrink(): 24 | B = 1 25 | x = torch.randn([B, 128], dtype=torch.float16) 26 | w = torch.randn([64, 16, 128], dtype=torch.float16) 27 | indices = torch.zeros([B], dtype=torch.int64) 28 | y = torch.zeros([B, 16]) 29 | 30 | x_npu = x.npu() 31 | w_npu = w.npu() 32 | indices_npu = indices.npu() 33 | y_npu = y.npu() 34 | 35 | y = bgmv_shrink_cpu_impl(x, w, indices, y, 0.5) 36 | torch.ops._C_ascend.bgmv_shrink(x_npu, w_npu, indices_npu, y_npu, 0.5) 37 | 38 | # Compare the results.
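# The NPU kernel result is moved back to CPU and checked against the
# float32 reference within fp16-friendly tolerances.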
39 | torch.testing.assert_close(y_npu.cpu(), 40 | y, 41 | atol=DEFAULT_ATOL, 42 | rtol=DEFAULT_RTOL) 43 | gc.collect() 44 | torch.npu.empty_cache() 45 | torch.npu.reset_peak_memory_stats() 46 | -------------------------------------------------------------------------------- /vllm_ascend/patch/platform/patch_ec_connector.py: -------------------------------------------------------------------------------- 1 | import vllm.distributed.ec_transfer.ec_connector.example_connector 2 | from safetensors.torch import load_file 3 | from vllm.distributed.ec_transfer.ec_connector.example_connector import ( 4 | ECConnectorMetadata, ECExampleConnector) 5 | from vllm.logger import logger 6 | 7 | 8 | class AscendECExampleConnector(ECExampleConnector): 9 | 10 | def start_load_caches(self, encoder_cache, **kwargs) -> None: 11 | metadata: ECConnectorMetadata = self._get_connector_metadata() 12 | assert encoder_cache is not None 13 | if metadata is None: 14 | logger.warning("In connector.start_load_caches, " 15 | "but the connector metadata is None") 16 | return 17 | assert isinstance(metadata, ECConnectorMetadata) 18 | # Load the EC for each mm data 19 | for mm_data in metadata.mm_datas: 20 | if mm_data.mm_hash in encoder_cache: 21 | continue 22 | filename = self._generate_filename_debug(mm_data.mm_hash) 23 | ec_cache = load_file(filename)["ec_cache"].npu() 24 | encoder_cache[mm_data.mm_hash] = ec_cache 25 | logger.debug("Successfully loaded encoder cache for hash %s", 26 | mm_data.mm_hash) 27 | 28 | 29 | vllm.distributed.ec_transfer.ec_connector.example_connector.ECExampleConnector = AscendECExampleConnector 30 | -------------------------------------------------------------------------------- /tools/actionlint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 5 | # Copyright 2023 The vLLM team. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # This file is a part of the vllm-ascend project. 19 | # Adapted from https://github.com/vllm-project/vllm/tree/main/tools 20 | # 21 | export SHELLCHECK_OPTS="--exclude=SC2046,SC2006,SC2086" 22 | 23 | if command -v actionlint &> /dev/null; then 24 | actionlint .github/workflows/*.yml .github/workflows/*.yaml 25 | exit 0 26 | elif [ -x ./actionlint ]; then 27 | ./actionlint .github/workflows/*.yml .github/workflows/*.yaml 28 | exit 0 29 | fi 30 | 31 | # Download a binary to the current directory - v1.7.3 32 | bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash) 33 | ./actionlint .github/workflows/*.yml .github/workflows/*.yaml 34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/750-RFC.yml: -------------------------------------------------------------------------------- 1 | name: 💬 Request for comments (RFC).
2 | description: Ask for feedback on major architectural changes or design choices. 3 | title: "[RFC]: " 4 | labels: ["RFC"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Please take a look at previous [RFCs](https://github.com/vllm-project/vllm-ascend/issues?q=label%3ARFC+sort%3Aupdated-desc) for reference. 11 | - type: textarea 12 | attributes: 13 | label: Motivation. 14 | description: > 15 | The motivation of the RFC. 16 | validations: 17 | required: true 18 | - type: textarea 19 | attributes: 20 | label: Proposed Change. 21 | description: > 22 | The proposed change of the RFC. 23 | validations: 24 | required: true 25 | - type: textarea 26 | attributes: 27 | label: Feedback Period. 28 | description: > 29 | The feedback period of the RFC. Usually at least one week. 30 | validations: 31 | required: false 32 | - type: textarea 33 | attributes: 34 | label: CC List. 35 | description: > 36 | The list of people you want to CC. 37 | validations: 38 | required: false 39 | - type: textarea 40 | attributes: 41 | label: Any Other Things. 42 | description: > 43 | Any other things you would like to mention, such as feature branch request. 44 | validations: 45 | required: false 46 | - type: markdown 47 | attributes: 48 | value: > 49 | Thanks for contributing 🎉! 50 | -------------------------------------------------------------------------------- /tests/e2e/singlecard/test_quantization.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 3 | # Copyright 2023 The vLLM team. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # This file is a part of the vllm-ascend project. 17 | # 18 | from modelscope import snapshot_download # type: ignore[import-untyped] 19 | 20 | from tests.e2e.conftest import VllmRunner 21 | 22 | 23 | def test_quant_W8A8(): 24 | max_tokens = 5 25 | example_prompts = [ 26 | "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs." 27 | ] 28 | with VllmRunner( 29 | snapshot_download("vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"), 30 | max_model_len=8192, 31 | enforce_eager=False, 32 | gpu_memory_utilization=0.7, 33 | quantization="ascend", 34 | ) as vllm_model: 35 | vllm_model.generate_greedy(example_prompts, max_tokens) 36 | -------------------------------------------------------------------------------- /tests/e2e/multicard/test_data_parallel_tp2.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run `pytest tests/e2e/multicard/test_data_parallel_tp2.py`. 
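The test launches examples/offline_data_parallel.py in a subprocess with
data-parallel size 2 and tensor-parallel size 2 (4 NPUs in total), then
checks that both DP ranks processed prompts and produced generated text.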
3 | """ 4 | 5 | import os 6 | import subprocess 7 | import sys 8 | from unittest.mock import patch 9 | 10 | import pytest 11 | 12 | MODELS = ["Qwen/Qwen3-0.6B"] 13 | 14 | 15 | @pytest.mark.parametrize("model", MODELS) 16 | @pytest.mark.parametrize("max_tokens", [32]) 17 | @patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3"}) 18 | def test_qwen_inference_dp2_tp2(model, max_tokens): 19 | script = "examples/offline_data_parallel.py" 20 | 21 | env = os.environ.copy() 22 | 23 | cmd = [ 24 | sys.executable, 25 | script, 26 | "--model", 27 | model, 28 | "--dp-size", 29 | "2", 30 | "--tp-size", 31 | "2", 32 | "--node-size", 33 | "1", 34 | "--node-rank", 35 | "0", 36 | "--trust-remote-code", 37 | ] 38 | 39 | print(f"Running subprocess: {' '.join(cmd)}") 40 | proc = subprocess.run(cmd, 41 | env=env, 42 | stdout=subprocess.PIPE, 43 | stderr=subprocess.STDOUT, 44 | timeout=600) 45 | output = proc.stdout.decode(errors='ignore') 46 | 47 | print(output) 48 | 49 | assert "DP rank 0 needs to process" in output 50 | assert "DP rank 1 needs to process" in output 51 | assert "Generated text:" in output 52 | assert proc.returncode == 0 53 | -------------------------------------------------------------------------------- /DCO: -------------------------------------------------------------------------------- 1 | Developer Certificate of Origin 2 | Version 1.1 3 | 4 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 5 | 6 | Everyone is permitted to copy and distribute verbatim copies of this 7 | license document, but changing it is not allowed. 8 | 9 | 10 | Developer's Certificate of Origin 1.1 11 | 12 | By making a contribution to this project, I certify that: 13 | 14 | (a) The contribution was created in whole or in part by me and I 15 | have the right to submit it under the open source license 16 | indicated in the file; or 17 | 18 | (b) The contribution is based upon previous work that, to the best 19 | of my knowledge, is covered under an appropriate open source 20 | license and I have the right under that license to submit that 21 | work with modifications, whether created in whole or in part 22 | by me, under the same open source license (unless I am 23 | permitted to submit under a different license), as indicated 24 | in the file; or 25 | 26 | (c) The contribution was provided directly to me by some other 27 | person who certified (a), (b) or (c) and I have not modified 28 | it. 29 | 30 | (d) I understand and agree that this project and the contribution 31 | are public and that a record of the contribution (including all 32 | personal information I submit with it, including my sign-off) is 33 | maintained indefinitely and may be redistributed consistent with 34 | this project or the open source license(s) involved. 35 | -------------------------------------------------------------------------------- /csrc/dispatch_ffn_combine/op_kernel/utils/copy_gm_to_l1_custom.hpp: -------------------------------------------------------------------------------- 1 | #ifndef COPY_GM_TO_L1_CUSTOM_HPP 2 | #define COPY_GM_TO_L1_CUSTOM_HPP 3 | 4 | namespace Catlass::Gemm::Tile { 5 | /// Partial specialization for nZ in and nZ out. 
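/// This specialization copies a contiguous vector from global memory (GM)
/// into L1 with a single AscendC::DataCopy call: blockCount is fixed to 1
/// and blockLen counts the source length in 32-byte C0 chunks.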
6 | template < 7 | class ArchTag, 8 | class Element 9 | > 10 | struct CopyGmToL1<ArchTag, Gemm::GemmType<Element, layout::VectorLayout>> { 11 | using LayoutDst = layout::VectorLayout; 12 | using LayoutSrc = layout::VectorLayout; 13 | 14 | static constexpr uint32_t ELE_NUM_PER_C0 = BYTE_PER_C0 / sizeof(Element); // int64, 32/8=4 15 | 16 | // Methods 17 | 18 | CATLASS_DEVICE 19 | CopyGmToL1() {} 20 | 21 | CATLASS_DEVICE 22 | void operator()( 23 | AscendC::LocalTensor<Element> const &dstTensor, 24 | AscendC::GlobalTensor<Element> const &srcTensor, 25 | LayoutDst const &layoutDst, LayoutSrc const &layoutSrc) 26 | { 27 | uint32_t blockCount = 1; 28 | uint32_t blockLen = CeilDiv<ELE_NUM_PER_C0>(layoutSrc.shape(0)); 29 | 30 | AscendC::DataCopyParams repeatParams; 31 | 32 | repeatParams.blockCount = blockCount; 33 | repeatParams.blockLen = blockLen; 34 | repeatParams.srcStride = 0; 35 | repeatParams.dstStride = 0; 36 | AscendC::DataCopy(dstTensor, srcTensor, repeatParams); 37 | } 38 | }; 39 | } 40 | #endif // COPY_GM_TO_L1_CUSTOM_HPP -------------------------------------------------------------------------------- /.github/workflows/schedule_test_vllm_main.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # This file is a part of the vllm-ascend project. 16 | # 17 | name: vLLM Main Schedule Test 18 | 19 | on: 20 | # Run the full e2e tests every 4 hours 21 | schedule: 22 | - cron: '0 */4 * * *' 23 | workflow_dispatch: 24 | 25 | # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly 26 | # declared as "shell: bash -el {0}" on steps that need to be properly activated. 27 | # It's used to activate ascend-toolkit environment variables. 28 | defaults: 29 | run: 30 | shell: bash -el {0} 31 | 32 | jobs: 33 | e2e-test: 34 | uses: ./.github/workflows/_e2e_test.yaml 35 | with: 36 | vllm: main 37 | runner: linux-aarch64-a2 38 | image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11 39 | type: full 40 | -------------------------------------------------------------------------------- /tools/send_mm_request.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import os 3 | 4 | import requests 5 | from modelscope import snapshot_download  # type: ignore 6 | 7 | mm_dir = snapshot_download("vllm-ascend/mm_request", repo_type='dataset') 8 | image_path = os.path.join(mm_dir, "test_mm2.jpg") 9 | with open(image_path, 'rb') as image_file: 10 | image_data = base64.b64encode(image_file.read()).decode('utf-8') 11 | 12 | data = { 13 | "messages": [{ 14 | "role": 15 | "user", 16 | "content": [{ 17 | "type": "text", 18 | "text": "What is the content of this image?"
19 | }, { 20 | "type": "image_url", 21 | "image_url": { 22 | "url": f"data:image/jpeg;base64,{image_data}" 23 | } 24 | }] 25 | }], 26 | "eos_token_id": [1, 106], 27 | "pad_token_id": 28 | 0, 29 | "top_k": 30 | 64, 31 | "top_p": 32 | 0.95, 33 | "max_tokens": 34 | 8192, 35 | "stream": 36 | False 37 | } 38 | 39 | headers = {'Accept': 'application/json', 'Content-Type': 'application/json'} 40 | 41 | 42 | def send_image_request(model, server): 43 | data["model"] = model 44 | url = server.url_for("v1", "chat", "completions") 45 | response = requests.post(url, headers=headers, json=data) 46 | print("Status Code:", response.status_code) 47 | response_json = response.json() 48 | print("Response:", response_json) 49 | assert response_json["choices"][0]["message"]["content"], "empty response" 50 | -------------------------------------------------------------------------------- /csrc/dispatch_gmm_combine_decode/op_kernel/dispatch_gmm_combine_decode_base.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025 Huawei Technologies Co., Ltd. 3 | * This file is a part of the CANN Open Software. 4 | * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 5 | * Please refer to the License for details. You may not use this file except in compliance with the License. 6 | * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, 7 | * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 8 | * See LICENSE in the root of the software repository for the full text of the License. 9 | */ 10 | #ifndef DISPATCH_GMM_COMBINE_DECODE_BASE_H 11 | #define DISPATCH_GMM_COMBINE_DECODE_BASE_H 12 | 13 | #include "../common/moe_distribute_base.h" 14 | 15 | #define TemplateMC2TypeClass typename ExpandXType, typename ExpandIdxType, bool IsNeedReduceScatter, uint32_t EXEC_FLAG 16 | #define TemplateMC2TypeFunc ExpandXType, ExpandIdxType, IsNeedReduceScatter, EXEC_FLAG 17 | #define TemplateDispatchTypeClass \ 18 | typename XType, typename ExpandXOutType, bool StaticQuant, bool DynamicQuant, bool IsSmoothScaleExist, \ 19 | bool IsNeedAllgater, uint32_t EXEC_FLAG 20 | #define TemplateDispatchTypeFunc XType, ExpandXOutType, StaticQuant, DynamicQuant, IsSmoothScaleExist, IsNeedAllgater, EXEC_FLAG 21 | #endif // DISPATCH_GMM_COMBINE_DECODE_BASE_H 22 | -------------------------------------------------------------------------------- /tests/ut/base.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | # This file is a part of the vllm-ascend project. 14 | # 15 | 16 | import unittest 17 | 18 | import pytest 19 | 20 | from vllm_ascend.utils import adapt_patch, register_ascend_customop 21 | 22 | 23 | class TestBase(unittest.TestCase): 24 | 25 | def __init__(self, *args, **kwargs): 26 | # adapt patch by default. 
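# adapt_patch(True) applies the platform-level patches and the bare
# adapt_patch() call applies the worker-level patches, so unit tests run
# against the same patched modules as a real vllm-ascend process.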
27 | adapt_patch(True) 28 | adapt_patch() 29 | register_ascend_customop() 30 | super().setUp() 31 | super(TestBase, self).__init__(*args, **kwargs) 32 | 33 | 34 | class PytestBase: 35 | """Base class for pytest-based tests. 36 | because pytest mocker and parametrize usage are not compatible with unittest. 37 | so we need to use a separate base class for pytest tests. 38 | """ 39 | 40 | @pytest.fixture(autouse=True) 41 | def setup(self): 42 | adapt_patch(True) 43 | adapt_patch() 44 | register_ascend_customop() 45 | -------------------------------------------------------------------------------- /vllm_ascend/flash_common3_context.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Optional 3 | 4 | import torch 5 | from vllm.model_executor.layers.linear import LinearBase 6 | 7 | 8 | @dataclass 9 | class FlashCommon3Context: 10 | gate: Optional[LinearBase] = None 11 | topk_weights: Optional[torch.Tensor] = None 12 | topk_ids: Optional[torch.Tensor] = None 13 | row_idx: Optional[torch.Tensor] = None 14 | shared_experts: Optional[torch.nn.Module] = None 15 | shared_out: Optional[torch.Tensor] = None 16 | 17 | 18 | _flash_common3_context: Optional[FlashCommon3Context] = None 19 | 20 | 21 | def get_flash_common3_context() -> Optional[FlashCommon3Context]: 22 | return _flash_common3_context 23 | 24 | 25 | def set_flash_common3_context( 26 | topk_weights: Optional[torch.Tensor] = None, 27 | topk_ids: Optional[torch.Tensor] = None, 28 | shared_experts: Optional[torch.nn.Module] = None, 29 | shared_out: Optional[torch.Tensor] = None, 30 | ): 31 | global _flash_common3_context 32 | if _flash_common3_context is None: 33 | _flash_common3_context = FlashCommon3Context() 34 | 35 | if topk_weights is not None: 36 | _flash_common3_context.topk_weights = topk_weights 37 | if topk_ids is not None: 38 | _flash_common3_context.topk_ids = topk_ids 39 | if shared_experts is not None: 40 | _flash_common3_context.shared_experts = shared_experts 41 | if shared_out is not None: 42 | _flash_common3_context.shared_out = shared_out 43 | -------------------------------------------------------------------------------- /tests/e2e/vllm_interface/singlecard/test_sampler.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 3 | # This file is a part of the vllm-ascend project. 4 | # Adapted from vllm/tests/entrypoints/llm/test_guided_generate.py 5 | # Copyright 2023 The vLLM team. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | # 19 | from vllm import SamplingParams 20 | 21 | from tests.e2e.conftest import VllmRunner 22 | 23 | 24 | def test_models_topk() -> None: 25 | example_prompts = [ 26 | "The capital of France is", 27 | ] 28 | sampling_params = SamplingParams(max_tokens=10, 29 | temperature=0.0, 30 | top_k=10, 31 | top_p=0.9) 32 | 33 | with VllmRunner("Qwen/Qwen3-0.6B", 34 | max_model_len=4096, 35 | gpu_memory_utilization=0.7) as runner: 36 | runner.generate(example_prompts, sampling_params) 37 | -------------------------------------------------------------------------------- /tests/e2e/multicard/test_chunk_gated_delta_rule.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from tests.ut.base import PytestBase 4 | from vllm_ascend.ops.triton.fla.chunk import chunk_gated_delta_rule 5 | 6 | 7 | class TestChunkGatedDeltaRule(PytestBase): 8 | 9 | def test_triton_fusion_ops(self, mock_moe_env): 10 | q = torch.randn(1, 17, 4, 128, dtype=torch.bfloat16).npu() 11 | k = torch.randn(1, 17, 4, 128, dtype=torch.bfloat16).npu() 12 | v = torch.randn(1, 17, 8, 128, dtype=torch.bfloat16).npu() 13 | g = torch.randn(1, 17, 8, dtype=torch.float32).npu() 14 | beta = torch.randn(1, 17, 8, dtype=torch.bfloat16).npu() 15 | initial_state = torch.randn(3, 8, 128, 128, dtype=torch.bfloat16).npu() 16 | q_start_loc = torch.arange(0, 4, dtype=torch.int).npu() 17 | 18 | ( 19 | core_attn_out_non_spec, 20 | last_recurrent_state, 21 | ) = chunk_gated_delta_rule(q=q, 22 | k=k, 23 | v=v, 24 | g=g, 25 | beta=beta, 26 | initial_state=initial_state, 27 | output_final_state=True, 28 | cu_seqlens=q_start_loc, 29 | head_first=False, 30 | use_qk_l2norm_in_kernel=True) 31 | 32 | assert core_attn_out_non_spec.shape == (1, 17, 8, 128) 33 | assert last_recurrent_state.shape == (3, 8, 128, 128) 34 | -------------------------------------------------------------------------------- /.github/workflows/_pre_commit.yml: -------------------------------------------------------------------------------- 1 | name: pre-commit 2 | 3 | on: 4 | workflow_call: 5 | inputs: 6 | vllm: 7 | required: true 8 | type: string 9 | 10 | permissions: 11 | contents: read 12 | 13 | jobs: 14 | pre-commit: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout vllm-project/vllm-ascend repo 18 | uses: actions/checkout@v6 19 | - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 20 | with: 21 | python-version: "3.11" 22 | - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" 23 | - run: echo "::add-matcher::.github/workflows/matchers/mypy.json" 24 | - name: Checkout vllm-project/vllm repo 25 | uses: actions/checkout@v6 26 | with: 27 | repository: vllm-project/vllm 28 | path: ./vllm-empty 29 | ref: ${{ inputs.vllm }} 30 | - name: Install vllm 31 | working-directory: vllm-empty 32 | run: | 33 | pip install -r requirements/build.txt --extra-index-url https://download.pytorch.org/whl/cpu 34 | VLLM_TARGET_DEVICE=empty pip install .
35 | - name: Install vllm-ascend dev 36 | run: | 37 | pip install -r requirements-dev.txt --extra-index-url https://download.pytorch.org/whl/cpu 38 | - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 39 | env: 40 | SHELLCHECK_OPTS: "--exclude=SC2046,SC2006,SC2086" # Exclude SC2046, SC2006, SC2086 for actionlint 41 | with: 42 | extra_args: --all-files --hook-stage manual 43 | -------------------------------------------------------------------------------- /vllm_ascend/xlite/xlite_model_runner.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 3 | # Copyright 2023 The vLLM team. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # This file is a part of the vllm-ascend project. 17 | # Adapted from vllm-project/vllm/vllm/worker/gpu_model_runner.py 18 | # isort: skip_file 19 | import torch.nn as nn 20 | from vllm.v1.kv_cache_interface import KVCacheConfig 21 | from vllm_ascend.worker.model_runner_v1 import NPUModelRunner 22 | 23 | 24 | class XliteModelRunner(NPUModelRunner): 25 | 26 | def get_model(self) -> nn.Module: 27 | return self.model.unwrap() 28 | 29 | def load_model(self) -> None: 30 | super().load_model() 31 | from vllm_ascend.xlite.xlite import XliteWrapper 32 | self.model = XliteWrapper(self.model, self.vllm_config) 33 | 34 | def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: 35 | super().initialize_kv_cache(kv_cache_config) 36 | self.model.register_kv_caches(self.kv_caches) 37 | -------------------------------------------------------------------------------- /csrc/grouped_matmul_swiglu_quant_weight_nz_tensor_list/op_host/grouped_matmul_swiglu_quant_weight_nz_tensor_list.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025 Huawei Technologies Co., Ltd. 3 | * This file is a part of the CANN Open Software. 4 | * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 5 | * Please refer to the License for details. You may not use this file except in compliance with the License. 6 | * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, 7 | * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 8 | * See LICENSE in the root of the software repository for the full text of the License. 
9 | */ 10 | #ifndef OP_API_INC_LEVEL0_OP_GROUPED_MATMUL_SWIGLU_QUANT_WEIGHT_NZ_TENSOR_LIST_OP_H 11 | #define OP_API_INC_LEVEL0_OP_GROUPED_MATMUL_SWIGLU_QUANT_WEIGHT_NZ_TENSOR_LIST_OP_H 12 | 13 | #include "opdev/op_executor.h" 14 | 15 | namespace l0op { 16 | const std::tuple<aclTensor *, aclTensor *> GroupedMatmulSwigluQuantWeightNzTensorList(const aclTensor *x, 17 | const aclTensorList *weight, 18 | const aclTensorList *perChannelScale, 19 | const aclTensor *perTokenScale, 20 | const aclTensor *groupList, 21 | aclOpExecutor *executor); 22 | } 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /tests/e2e/nightly/ops/test_bgmv_expand.py: -------------------------------------------------------------------------------- 1 | import gc 2 | 3 | import torch 4 | 5 | from vllm_ascend.utils import enable_custom_op 6 | 7 | enable_custom_op() 8 | 9 | DEFAULT_ATOL = 1e-3 10 | DEFAULT_RTOL = 1e-3 11 | 12 | 13 | def bgmv_expand_cpu_impl(x: torch.Tensor, w: torch.Tensor, 14 | indices: torch.Tensor, y: torch.Tensor, 15 | slice_offset: int, slice_size: int) -> torch.Tensor: 16 | W = w[indices, :, :].transpose(-1, -2).to(torch.float32) 17 | z = torch.bmm(x.unsqueeze(1).to(torch.float32), W).squeeze() 18 | y[:, slice_offset:slice_offset + slice_size] += z 19 | return y 20 | 21 | 22 | @torch.inference_mode() 23 | def test_bgmv_expand(): 24 | B = 1 25 | x = torch.randn([B, 16], dtype=torch.float) 26 | w = torch.randn([64, 128, 16], dtype=torch.float16) 27 | indices = torch.zeros([B], dtype=torch.int64) 28 | y = torch.randn([B, 128 * 3], dtype=torch.float16) 29 | 30 | x_npu = x.npu() 31 | w_npu = w.npu() 32 | indices_npu = indices.npu() 33 | y_npu = y.npu() 34 | 35 | y_out = bgmv_expand_cpu_impl(x, w, indices, y, 0, 128) 36 | y_out_npu = torch.ops._C_ascend.bgmv_expand(x_npu, w_npu, indices_npu, 37 | y_npu, 0, 128) 38 | 39 | # Compare the results. 40 | torch.testing.assert_close(y_out_npu.cpu(), 41 | y_out, 42 | atol=DEFAULT_ATOL, 43 | rtol=DEFAULT_RTOL) 44 | gc.collect() 45 | torch.npu.empty_cache() 46 | torch.npu.reset_peak_memory_stats() 47 | -------------------------------------------------------------------------------- /vllm_ascend/eplb/core/policy/policy_abstract.py: -------------------------------------------------------------------------------- 1 | # Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 2 | # TODO: Once https://github.com/vllm-project/vllm/pull/24069 is merged in vllm, remove this policy. 3 | from abc import abstractmethod 4 | 5 | 6 | class DynamicConfig: 7 | placement_policy = None 8 | 9 | max_transferred_expert_per_layer = 100  # Maximum number of experts that can be migrated per layer on a single host 10 | ep_worldsize = 64  # Total number of dies across the entire cluster where experts are distributed 11 | num_die_per_host = 8  # Number of dies on each host machine 12 | 13 | 14 | class EplbPolicy: 15 | 16 | def __init__(self, config: DynamicConfig): 17 | self.config = config 18 | 19 | @abstractmethod 20 | def rebalance_experts(self, current_expert_table, expert_workload): 21 | """ 22 | Pass in the weights and return expert replication and placement under relevant constraints.
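Given the observed per-expert workload, an implementation decides, per
layer, which experts to replicate and on which rank each replica lives.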
23 | INPUT: 24 | current_expert_table: [layerId, rankId, expert_num_i] 25 | expert_workload = expert_table[layer0][rankId][expert_num_i] 26 | 27 | RETURNED: (res, expert_table) 28 | res: 29 | 1 -- table_changed 30 | 0 -- not_changed 31 | 32 | expert_table: [layerId, rankId, expert_num_i] 33 | expert_num_i --- [0, MaxExpertPerRank] 34 | expertID = expert_table[layer0][rankId][expert_num_i] 35 | array_values: 36 | [0, 1, 2, 3, 248] 37 | [4, 5, 6, 7, 254] 38 | [8, 9, 10, 11, 71] 39 | ... 40 | [252, 253, 254, 255, 0] 41 | """ 42 | pass 43 | -------------------------------------------------------------------------------- /csrc/dispatch_ffn_combine/op_kernel/unpermute/moe_token_unpermute_tiling.h: -------------------------------------------------------------------------------- 1 | #ifndef MOE_TOKEN_UNPERMUTE_TILING 2 | #define MOE_TOKEN_UNPERMUTE_TILING 3 | 4 | struct MoeTokenUnpermuteTilingData { 5 | int64_t hidden_size; 6 | int64_t top_k; 7 | int64_t num_out_tokens; 8 | int64_t hidden_splited_length; 9 | int64_t hidden_splited_num; 10 | int64_t hidden_splited_remain; 11 | int64_t tokens_core_length; 12 | int64_t tokens_core_remain; 13 | int64_t tokens_splited_length; 14 | int64_t tokens_splited_num; 15 | int64_t tokens_splited_remain; 16 | int64_t buffer_num; 17 | }; 18 | 19 | __forceinline__ [host, aicore] void 20 | MoeTokenUnpermuteTiling(int32_t m, int32_t n, int32_t topK, MoeTokenUnpermuteTilingData &tilingData, uint32_t coreNum) 21 | { 22 | #define I64(x) static_cast<int64_t>(x) 23 | tilingData.hidden_size = I64(n); 24 | tilingData.top_k = I64(topK); 25 | tilingData.num_out_tokens = I64(m); 26 | tilingData.hidden_splited_length = tilingData.hidden_size; 27 | tilingData.hidden_splited_num = 1; 28 | tilingData.hidden_splited_remain = 0; 29 | uint32_t outTokens = m / topK; 30 | tilingData.tokens_core_length = I64(outTokens / coreNum); 31 | tilingData.tokens_core_remain = I64(outTokens % coreNum); 32 | tilingData.tokens_splited_length = I64(min(tilingData.tokens_core_length, 600)); 33 | tilingData.tokens_splited_num = I64(tilingData.tokens_core_length / tilingData.tokens_splited_length); 34 | tilingData.tokens_splited_remain = I64(tilingData.tokens_core_length % tilingData.tokens_splited_length); 35 | tilingData.buffer_num = 4; 36 | } 37 | 38 | #endif -------------------------------------------------------------------------------- /tests/e2e/models/report_template.md: -------------------------------------------------------------------------------- 1 | # {{ model_name }} 2 | 3 | - **vLLM Version**: vLLM: {{ vllm_version }} ([{{ vllm_commit[:7] }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})), **vLLM Ascend Version**: {{ vllm_ascend_version }} ([{{ vllm_ascend_commit[:7] }}](https://github.com/vllm-project/vllm-ascend/commit/{{ vllm_ascend_commit }})) 4 | - **Software Environment**: **CANN**: {{ cann_version }}, **PyTorch**: {{ torch_version }}, **torch-npu**: {{ torch_npu_version }} 5 | - **Hardware Environment**: {{ hardware }} 6 | - **Parallel mode**: {{ parallel_mode }} 7 | - **Execution mode**: {{ execution_model }} 8 | 9 | **Command**: 10 | 11 | ```bash 12 | export MODEL_ARGS={{ model_args }} 13 | lm_eval --model {{ model_type }} --model_args $MODEL_ARGS \ 14 | --tasks {{ datasets }} \ 15 | {%- if apply_chat_template is defined and (apply_chat_template|string|lower in ["true", "1"]) %} 16 | --apply_chat_template \ 17 | {%- endif %} 18 | {%- if fewshot_as_multiturn is defined and (fewshot_as_multiturn|string|lower in ["true", "1"]) %} 19 | --fewshot_as_multiturn \ 20 | {%- endif
%} 21 | {%- if num_fewshot is defined and num_fewshot != "N/A" %} 22 | --num_fewshot {{ num_fewshot }} \ 23 | {%- endif %} 24 | {%- if limit is defined and limit != "N/A" %} 25 | --limit {{ limit }} \ 26 | {%- endif %} 27 | --batch_size {{ batch_size }} 28 | ``` 29 | 30 | | Task | Metric | Value | Stderr | 31 | |-----------------------|-------------|----------:|-------:| 32 | {% for row in rows -%} 33 | | {{ row.task }} | {{ row.metric }} | {{ row.value }} | ± {{ "%.4f" | format(row.stderr | float) }} | 34 | {% endfor %} 35 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/600-new-model.yml: -------------------------------------------------------------------------------- 1 | name: 🤗 Support request for a new model from huggingface/modelscope/modelers on Ascend 2 | description: Submit a proposal/request for a new model from huggingface/modelscope/modelers on Ascend 3 | title: "[New Model]: " 4 | labels: ["new model"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm-ascend/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | 12 | #### We also highly recommend you read https://vllm-ascend.readthedocs.io/en/latest/user_guide/supported_models.html first to know which models are already supported. 13 | - type: textarea 14 | attributes: 15 | label: The model to consider. 16 | description: > 17 | A huggingface/modelscope/modelers url pointing to the model, e.g. https://huggingface.co/openai-community/gpt2 . 18 | validations: 19 | required: true 20 | - type: textarea 21 | attributes: 22 | label: The closest model vllm already supports. 23 | description: > 24 | Here is the list of models already supported by vllm: https://vllm-ascend.readthedocs.io/en/latest/user_guide/supported_models.html . Which model is the most similar to the model you want to add support for? 25 | - type: textarea 26 | attributes: 27 | label: What's your difficulty in supporting the model you want? 28 | description: > 29 | For example, any new operators or new architecture? 30 | - type: markdown 31 | attributes: 32 | value: > 33 | Thanks for contributing 🎉! 34 | -------------------------------------------------------------------------------- /format.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 5 | # Copyright 2023 The vLLM team. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # This file is a part of the vllm-ascend project. 19 | # Adapted from https://github.com/vllm-project/vllm/tree/main/tools 20 | # 21 | 22 | check_command() { 23 | if ! 
command -v "$1" &> /dev/null; then 24 | echo "❓❓$1 is not installed, please run:" 25 | echo "# Install lint deps" 26 | echo "pip install -r requirements-lint.txt" 27 | echo "# (optional) Enable git commit pre check" 28 | echo "pre-commit install" 29 | echo "" 30 | echo "See step by step contribution guide:" 31 | echo "https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution" 32 | exit 1 33 | fi 34 | } 35 | 36 | check_command pre-commit 37 | 38 | # TODO: cleanup SC exclude 39 | export SHELLCHECK_OPTS="--exclude=SC2046,SC2006,SC2086" 40 | if [[ "$1" != 'ci' ]]; then 41 | pre-commit run --all-files 42 | else 43 | pre-commit run --all-files --hook-stage manual 44 | fi 45 | -------------------------------------------------------------------------------- /vllm_ascend/ops/activation.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # This file is a part of the vllm-ascend project. 16 | # 17 | 18 | import torch 19 | from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul 20 | 21 | 22 | class AscendQuickGELU(QuickGELU): 23 | 24 | def forward_oot(self, x: torch.Tensor) -> torch.Tensor: 25 | import torch_npu 26 | 27 | out = torch_npu.npu_fast_gelu(x) 28 | return out 29 | 30 | 31 | class AscendSiluAndMul(SiluAndMul): 32 | 33 | def forward_oot(self, x: torch.Tensor) -> torch.Tensor: 34 | import torch_npu 35 | 36 | from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type 37 | 38 | torch.ops.vllm.maybe_prefetch_mlp_down_proj(x) 39 | if get_ascend_device_type() == AscendDeviceType._310P: 40 | out = torch_npu.npu_swiglu(x.to(torch.float32)).to(torch.float16) 41 | else: 42 | out = torch_npu.npu_swiglu(x) 43 | torch.ops.vllm.maybe_wait_prefetch_done(out) 44 | return out 45 | -------------------------------------------------------------------------------- /csrc/dispatch_ffn_combine/op_host/dispatch_ffn_combine_proto.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025 Huawei Technologies Co., Ltd. 3 | * This file is a part of the CANN Open Software. 4 | * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 5 | * Please refer to the License for details. You may not use this file except in compliance with the License. 6 | * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, 7 | * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 8 | * See LICENSE in the root of the software repository for the full text of the License. 9 | */ 10 | 11 | /*! 
12 | * \file dispatch_ffn_combine_proto.cpp 13 | * \brief 14 | */ 15 | #include <graph/utils/type_utils.h> 16 | #include <register/op_impl_registry.h> 17 | // #include "../../common/ophost/op_util.h" 18 | // #include "../../common/ophost/hcom_topo_info.h" 19 | // #include "log/ops_log.h" 20 | 21 | using namespace ge; 22 | namespace ops { 23 | const size_t ATTR_GROUP = 0; 24 | const size_t ATTR_RANK_SIZE = 1; 25 | const size_t SUPPORT_DIM_SIZE = 2; 26 | 27 | static ge::graphStatus InferShapeDispatchFFNCombine(gert::InferShapeContext* context) { 28 | return ge::GRAPH_SUCCESS; 29 | } 30 | 31 | static ge::graphStatus InferDataTypeDispatchFFNCombine(gert::InferDataTypeContext* context) { 32 | // auto d_type = context->GetInputDataType(0); 33 | // context->SetOutputDataType(0, d_type); 34 | return ge::GRAPH_SUCCESS; 35 | } 36 | 37 | IMPL_OP_INFERSHAPE(DispatchFFNCombine) 38 | .InferShape(InferShapeDispatchFFNCombine) 39 | .InferDataType(InferDataTypeDispatchFFNCombine); 40 | } // namespace ops 41 | -------------------------------------------------------------------------------- /csrc/dispatch_gmm_combine_decode/op_kernel/dispatch_gmm_combine_decode/gemm/dispatch_policy.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025 Huawei Technologies Co., Ltd. 3 | * This file is a part of the CANN Open Software. 4 | * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 5 | * Please refer to the License for details. You may not use this file except in compliance with the License. 6 | * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, 7 | * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 8 | * See LICENSE in the root of the software repository for the full text of the License. 9 | */ 10 | #pragma once 11 | #include "catlass/gemm/dispatch_policy.hpp" 12 | 13 | namespace Catlass::Gemm { 14 | 15 | template <uint32_t PRELOAD_STAGES_, uint32_t L1A_STAGES_, uint32_t L1B_STAGES_, uint32_t L0A_STAGES_, 16 | uint32_t L0B_STAGES_, uint32_t L0C_STAGES_, bool ENABLE_UNIT_FLAG_, bool ENABLE_SHUFFLE_K_> 17 | struct MmadAtlasA2PreloadAsyncWithCallbackResidentA : public MmadAtlasA2Async { 18 | static constexpr uint32_t PRELOAD_STAGES = PRELOAD_STAGES_; // Stages of emitting load instruction in advance 19 | static constexpr uint32_t L1A_STAGES = L1A_STAGES_; 20 | static constexpr uint32_t L1B_STAGES = L1B_STAGES_; 21 | static constexpr uint32_t L0A_STAGES = L0A_STAGES_; 22 | static constexpr uint32_t L0B_STAGES = L0B_STAGES_; 23 | static constexpr uint32_t L0C_STAGES = L0C_STAGES_; 24 | static constexpr bool ENABLE_UNIT_FLAG = ENABLE_UNIT_FLAG_; 25 | static constexpr bool ENABLE_SHUFFLE_K = ENABLE_SHUFFLE_K_; 26 | }; 27 | 28 | } // namespace Catlass::Gemm 29 | -------------------------------------------------------------------------------- /csrc/sparse_flash_attention/op_host/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # This program is free software, you can redistribute it and/or modify it. 2 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. 3 | # This file is a part of the CANN Open Software. 4 | # Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). 5 | # Please refer to the License for details. You may not use this file except in compliance with the License. 6 | # THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 7 | # See LICENSE in the root of the software repository for the full text of the License. 
8 | # ====================================================================================================================== 9 | 10 | add_ops_compile_options( 11 | OP_NAME SparseFlashAttention 12 | OPTIONS --cce-auto-sync=off 13 | -Wno-deprecated-declarations 14 | -Werror 15 | -fpermissive 16 | ) 17 | 18 | set(sparse_flash_attention_depends transformer/attention/sparse_flash_attention PARENT_SCOPE) 19 | target_sources(op_host_aclnn PRIVATE 20 | sparse_flash_attention_def.cpp 21 | ) 22 | 23 | target_sources(optiling PRIVATE 24 | sparse_flash_attention_tiling.cpp 25 | ) 26 | 27 | if (NOT BUILD_OPEN_PROJECT) 28 | target_sources(opmaster_ct PRIVATE 29 | sparse_flash_attention_tiling.cpp 30 | ) 31 | endif () 32 | 33 | target_sources(opsproto PRIVATE 34 | sparse_flash_attention_proto.cpp 35 | ) 36 | 37 | target_include_directories(optiling PRIVATE 38 | ${CMAKE_CURRENT_SOURCE_DIR} 39 | ) 40 | -------------------------------------------------------------------------------- /csrc/mla_preprocess/op_kernel/kernel/hardware.h: -------------------------------------------------------------------------------- 1 | /* Adapted from 2 | * https://gitee.com/ascend/ascend-transformer-boost.git 3 | * 4 | * Copyright (c) 2024 Huawei Technologies Co., Ltd. 5 | * This file is a part of the CANN Open Software. 6 | * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 7 | * Please refer to the License for details. You may not use this file except in compliance with the License. 8 | * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, 9 | * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 10 | * See LICENSE in the root of the software repository for the full text of the License. 11 | */ 12 | #ifndef INCLUDE_HARDWARE_H 13 | #define INCLUDE_HARDWARE_H 14 | 15 | enum class ArchType { ASCEND_V220, ASCEND_V200, ASCEND_M200 }; 16 | 17 | template <ArchType ArchTag> 18 | struct HardwareInfo { 19 | static uint32_t const l2BW = 5; 20 | static uint32_t const hbmBW = 1; 21 | static uint32_t const supportMix = 0; 22 | static uint32_t const l1Size = 512 * 1024; 23 | static uint32_t const l0ASize = 64 * 1024; 24 | static uint32_t const l0BSize = 64 * 1024; 25 | static uint32_t const l0CSize = 128 * 1024; 26 | static uint32_t const l2Size = 192 * 1024 * 1024; 27 | static uint32_t const biasSize = 1024; 28 | static uint32_t const fixBufSize = 7 * 1024; 29 | static uint32_t const ubSize = 192 * 1024; 30 | static uint32_t const fractalSize = 512; 31 | static uint32_t const l1l0BlockSize = 32; 32 | static uint32_t const btBlockSize = 64; 33 | static uint32_t const fbBlockSize = 128; 34 | }; 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /vllm_ascend/eplb/core/policy/policy_factory.py: -------------------------------------------------------------------------------- 1 | # Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 2 | # Todo: Once https://github.com/vllm-project/vllm/pull/24069 is merged in vllm, remove this factory. 
3 | from .policy_abstract import DynamicConfig, EplbPolicy 4 | from .policy_dynamic_ep import DynamicEplb 5 | from .policy_dynamic_ep_v2 import DynamicEplbV2 6 | from .policy_flashlb import FlashLB, warm_up 7 | from .policy_random import RandomLoadBalance 8 | 9 | 10 | class PolicyFactory: 11 | 12 | @staticmethod 13 | def generate_policy(policy_type: int, config: DynamicConfig) -> EplbPolicy: 14 | policy = { 15 | # Constraint when applying Dynamic EPLB policy V2: 16 | # if a redundant expert exists, only one redundant expert 17 | # can be placed on each NPU, and its physical expert index must be 0 18 | 19 | # Applies greedy device-to-device (d2d) expert weight update composition 20 | 0: 21 | RandomLoadBalance, # RandomLoadBalance: shuffle last physical expert on NPU 1 and 3 22 | 1: 23 | DynamicEplb, # Dynamic EPLB policy: overall expert replacement based on current moe load 24 | 2: 25 | DynamicEplbV2, # Dynamic EPLB policy V2: expert replacement with constrained number of expert shuffle 26 | 3: 27 | FlashLB, # FlashLB EPLB policy: expert replacement based on Joint Optimization, Multi-Shot Enhancement and Incremental Adjustment 28 | } 29 | policy_class = policy.get(policy_type, RandomLoadBalance) 30 | policy_instance = policy_class(config) 31 | if policy_type == 3: 32 | warm_up() 33 | return policy_instance 34 | -------------------------------------------------------------------------------- /tests/e2e/nightly/multi_node/config/models/DeepSeek-V3_2-Exp-bf16.yaml: -------------------------------------------------------------------------------- 1 | test_name: "test DeepSeek-V3.2-Exp-bf16 multi-dp" 2 | model: "Yanguan/DeepSeek-V3.2-Exp-bf16" 3 | num_nodes: 2 4 | npu_per_node: 16 5 | env_common: 6 | VLLM_USE_MODELSCOPE: true 7 | OMP_PROC_BIND: false 8 | OMP_NUM_THREADS: 100 9 | HCCL_BUFFSIZE: 1024 10 | SERVER_PORT: 8080 11 | VLLM_ASCEND_ENABLE_MLAPO: 0 12 | 13 | deployment: 14 | - 15 | server_cmd: > 16 | vllm serve "Yanguan/DeepSeek-V3.2-Exp-bf16" 17 | --host 0.0.0.0 18 | --port $SERVER_PORT 19 | --data-parallel-address $LOCAL_IP 20 | --data-parallel-size 2 21 | --data-parallel-size-local 1 22 | --data-parallel-rpc-port 13389 23 | --tensor-parallel-size 16 24 | --seed 1024 25 | --enable-expert-parallel 26 | --max-num-seqs 16 27 | --max-model-len 17450 28 | --max-num-batched-tokens 17450 29 | --trust-remote-code 30 | --no-enable-prefix-caching 31 | --gpu-memory-utilization 0.9 32 | 33 | - 34 | server_cmd: > 35 | vllm serve "Yanguan/DeepSeek-V3.2-Exp-bf16" 36 | --headless 37 | --data-parallel-size 2 38 | --data-parallel-size-local 1 39 | --data-parallel-start-rank 1 40 | --data-parallel-address $MASTER_IP 41 | --data-parallel-rpc-port 13389 42 | --tensor-parallel-size 16 43 | --seed 1024 44 | --max-num-seqs 16 45 | --max-model-len 17450 46 | --max-num-batched-tokens 17450 47 | --enable-expert-parallel 48 | --trust-remote-code 49 | --no-enable-prefix-caching 50 | --gpu-memory-utilization 0.92 51 | benchmarks: 52 | -------------------------------------------------------------------------------- /vllm_ascend/eplb/adaptor/abstract_adaptor.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # This file is a part of the vllm-ascend project. 16 | # 17 | # Todo: Once https://github.com/vllm-project/vllm/issues/22246 is resolved in vllm, remove this adaptor. 18 | from abc import abstractmethod 19 | from typing import Any 20 | 21 | 22 | class EplbAdaptor(): 23 | 24 | def __init__(self, **args): 25 | pass 26 | 27 | @abstractmethod 28 | def get_rank_expert_workload(self): 29 | raise NotImplementedError 30 | 31 | @abstractmethod 32 | def get_init_expert_map(self, num_moe_layers: Any) -> Any: 33 | raise NotImplementedError 34 | 35 | @abstractmethod 36 | def do_update_expert_map(self, layer_id: Any, 37 | updated_expert_map: Any) -> Any: 38 | raise NotImplementedError 39 | 40 | @abstractmethod 41 | def do_update_expert_weight(self, layer_id: Any, 42 | local_expert_to_replace: Any, 43 | buffer_tensor_id: Any) -> Any: 44 | raise NotImplementedError 45 | -------------------------------------------------------------------------------- /csrc/kernels/utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | #include "kernel_type.h" 19 | namespace vllm_ascend { 20 | 21 | template <typename scalar_t> struct AccType; 22 | 23 | #if (__CCE_AICORE__ >= 220) 24 | template <> struct AccType<bfloat16_t> { 25 | using type = float; 26 | }; 27 | #endif 28 | 29 | template <> struct AccType<half> { 30 | using type = half; 31 | }; 32 | 33 | template <> struct AccType<float> { 34 | using type = float; 35 | }; 36 | 37 | template <> struct AccType<int8_t> { 38 | using type = int; 39 | }; 40 | 41 | template <typename scalar_t> 42 | __aicore__ inline void local_mem_copy(AscendC::LocalTensor<scalar_t> dst, AscendC::LocalTensor<scalar_t> src, int size) 43 | { 44 | constexpr int loadSize = 256 / sizeof(scalar_t); 45 | int loopCnt = size / loadSize; 46 | int tailSize = size % loadSize; 47 | if (loopCnt) 48 | AscendC::Copy(dst, src, loadSize, loopCnt, {1, 1, 8, 8}); 49 | AscendC::Copy(dst[loopCnt * loadSize], src[loopCnt * loadSize], tailSize, 1, {1, 1, 8, 8}); 50 | } 51 | } // namespace vllm_ascend -------------------------------------------------------------------------------- /vllm_ascend/spec_decode/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 3 | # Copyright 2023 The vLLM team. 
4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # This file is a part of the vllm-ascend project. 17 | # Adapted from vllm-project/vllm/vllm/worker/gpu_model_runner.py 18 | # 19 | from vllm_ascend.spec_decode.eagle_proposer import EagleProposer 20 | from vllm_ascend.spec_decode.mtp_proposer import MtpProposer 21 | from vllm_ascend.spec_decode.ngram_proposer import NgramProposer 22 | from vllm_ascend.spec_decode.suffix_proposer import SuffixDecodingProposer 23 | 24 | 25 | def get_spec_decode_method(method, vllm_config, device, runner): 26 | if method == "ngram": 27 | return NgramProposer(vllm_config, device, runner) 28 | elif method in ("eagle", "eagle3"): 29 | return EagleProposer(vllm_config, device, runner) 30 | elif method == "mtp": 31 | return MtpProposer(vllm_config, device, runner) 32 | elif method == 'suffix': 33 | return SuffixDecodingProposer(vllm_config, device, runner) 34 | else: 35 | raise ValueError("Unknown speculative decoding method: " 36 | f"{method}") 37 | -------------------------------------------------------------------------------- /csrc/notify_dispatch/op_host/aclnn_notify_dispatch.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef ACLNN_NOTIFY_DISPATCH_H_ 3 | #define ACLNN_NOTIFY_DISPATCH_H_ 4 | 5 | #include "aclnn/acl_meta.h" 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | /* function: aclnnNotifyDispatchGetWorkspaceSize 12 | * parameters : 13 | * sendData : required 14 | * tokenPerExpertData : required 15 | * sendCount : required 16 | * numTokens : required 17 | * commGroup : required 18 | * rankSize : required 19 | * rankId : required 20 | * localRankSize : required 21 | * localRankId : required 22 | * sendDataOffset : required 23 | * recvData : required 24 | * workspaceSize : size of workspace(output). 25 | * executor : executor context(output). 26 | */ 27 | __attribute__((visibility("default"))) 28 | aclnnStatus aclnnNotifyDispatchGetWorkspaceSize( 29 | const aclTensor *sendData, 30 | const aclTensor *tokenPerExpertData, 31 | int64_t sendCount, 32 | int64_t numTokens, 33 | char *commGroup, 34 | int64_t rankSize, 35 | int64_t rankId, 36 | int64_t localRankSize, 37 | int64_t localRankId, 38 | const aclTensor *sendDataOffset, 39 | const aclTensor *recvData, 40 | uint64_t *workspaceSize, 41 | aclOpExecutor **executor); 42 | 43 | /* function: aclnnNotifyDispatch 44 | * parameters : 45 | * workspace : workspace memory addr(input). 46 | * workspaceSize : size of workspace(input). 47 | * executor : executor context(input). 48 | * stream : acl stream. 
49 | */ 50 | __attribute__((visibility("default"))) 51 | aclnnStatus aclnnNotifyDispatch( 52 | void *workspace, 53 | uint64_t workspaceSize, 54 | aclOpExecutor *executor, 55 | aclrtStream stream); 56 | 57 | #ifdef __cplusplus 58 | } 59 | #endif 60 | 61 | #endif 62 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/200-installation.yml: -------------------------------------------------------------------------------- 1 | name: 🛠️ Installation 2 | description: Report an issue here when you hit errors during installation. 3 | title: "[Installation]: " 4 | labels: ["installation"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm-ascend/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | - type: textarea 12 | attributes: 13 | label: Your current environment 14 | description: | 15 | Please run the following and paste the output below. 16 | ```sh 17 | npu-smi info 18 | cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info 19 | wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py 20 | # For security purposes, please feel free to check the contents of collect_env.py before running it. 21 | python collect_env.py 22 | ``` 23 | It is suggested to download and execute the latest script, as vllm frequently updates the diagnostic information needed to respond to issues accurately and quickly. 24 | value: | 25 | ```text 26 | The output of `python collect_env.py` 27 | ``` 28 | validations: 29 | required: true 30 | - type: textarea 31 | attributes: 32 | label: How you are installing vllm and vllm-ascend 33 | description: | 34 | Paste the full command you are trying to execute. 35 | value: | 36 | ```sh 37 | pip install -vvv vllm vllm-ascend 38 | ``` 39 | - type: markdown 40 | attributes: 41 | value: > 42 | Thanks for contributing 🎉! 43 | -------------------------------------------------------------------------------- /csrc/matmul_allreduce_add_rmsnorm/op_host/aclnn_matmul_allreduce_add_rmsnorm.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #ifndef ACLNN_MATMUL_ALLREDUCE_ADD_RMSNORM 18 | #define ACLNN_MATMUL_ALLREDUCE_ADD_RMSNORM 19 | 20 | #include "aclnn/acl_meta.h" 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | 26 | __attribute__((visibility("default"))) aclnnStatus aclnnMatmulAllreduceAddRmsnormGetWorkspaceSize( 27 | const aclTensor *x1, 28 | const aclTensor *x2, 29 | const aclTensor *residual, 30 | const aclTensor *gamma, 31 | char *groupTp, 32 | int64_t tpRankSize, 33 | int64_t tpRankId, 34 | double epsilon, 35 | bool isTransB, 36 | bool isGatherAddOut, 37 | const aclTensor *y, 38 | const aclTensor *addOut, 39 | uint64_t *workspaceSize, 40 | aclOpExecutor **executor); 41 | 42 | __attribute__((visibility("default"))) aclnnStatus aclnnMatmulAllreduceAddRmsnorm( 43 | void *workspace, 44 | uint64_t workspaceSize, 45 | aclOpExecutor *executor, 46 | aclrtStream stream); 47 | 48 | #ifdef __cplusplus 49 | } 50 | #endif 51 | 52 | #endif 53 | -------------------------------------------------------------------------------- /vllm_ascend/patch/worker/patch_bert.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 3 | # This file is a part of the vllm-ascend project. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | import torch 19 | from vllm.model_executor.models import bert 20 | 21 | # aclgraph does not support shift operator for now 22 | # TODO: revert me when aclgraph supports shift operator 23 | TOKEN_TYPE_SHIFT = 30 24 | TOKEN_TYPE_MULTIPLIER = 1 << 30 25 | TOKEN_MASK = TOKEN_TYPE_MULTIPLIER - 1 26 | 27 | 28 | def _encode_token_type_ids(input_ids: torch.Tensor, 29 | token_type_ids: torch.Tensor) -> None: 30 | # input_ids can be padded to the right 31 | input_ids[:token_type_ids.shape[0]].bitwise_or_(token_type_ids * 32 | TOKEN_TYPE_MULTIPLIER) 33 | 34 | 35 | def _decode_token_type_ids(input_ids: torch.Tensor) -> torch.Tensor: 36 | 37 | token_type_ids = input_ids // TOKEN_TYPE_MULTIPLIER 38 | 39 | input_ids.bitwise_and_(TOKEN_MASK) 40 | 41 | return token_type_ids 42 | 43 | 44 | bert._encode_token_type_ids = _encode_token_type_ids 45 | bert._decode_token_type_ids = _decode_token_type_ids 46 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/300-usage.yml: -------------------------------------------------------------------------------- 1 | name: 💻 Usage 2 | description: Raise an issue here if you don't know how to use vllm on Ascend. 3 | title: "[Usage]: " 4 | labels: ["usage"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm-ascend/issues?q=is%3Aissue+sort%3Acreated-desc+). 
11 | - type: textarea 12 | attributes: 13 | label: Your current environment 14 | description: | 15 | Please run the following and paste the output below. 16 | ```sh 17 | npu-smi info 18 | cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info 19 | wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py 20 | # For security purposes, please feel free to check the contents of collect_env.py before running it. 21 | python collect_env.py 22 | ``` 23 | It is suggested to download and execute the latest script, as vllm frequently updates the diagnostic information needed to respond to issues accurately and quickly. 24 | value: | 25 | ```text 26 | The output of the above commands 27 | ``` 28 | validations: 29 | required: true 30 | - type: textarea 31 | attributes: 32 | label: How would you like to use vllm on ascend 33 | description: | 34 | A detailed description of how you want to use vllm on ascend. 35 | value: | 36 | I want to run inference of a [specific model](put link here). I don't know how to integrate it with vllm. 37 | - type: markdown 38 | attributes: 39 | value: > 40 | Thanks for contributing 🎉! 41 | -------------------------------------------------------------------------------- /csrc/dispatch_ffn_combine/op_host/error_log.h: -------------------------------------------------------------------------------- 1 | #ifndef OPS_BUILT_IN_OP_TILING_ERROR_LOG_H_ 2 | #define OPS_BUILT_IN_OP_TILING_ERROR_LOG_H_ 3 | 4 | #include <cstdio> 5 | #include "toolchain/slog.h" 6 | 7 | #define OP_LOGI(opname, ...) 8 | #define OP_LOGW(opname, ...) \ 9 | do { \ 10 | printf("[WARN][%s] ", (opname)); \ 11 | printf(__VA_ARGS__); \ 12 | printf("\n"); \ 13 | } while (0) 14 | 15 | #define OP_LOGE_WITHOUT_REPORT(opname, ...) \ 16 | do { \ 17 | printf("[ERRORx][%s] ", (opname)); \ 18 | printf(__VA_ARGS__); \ 19 | printf("\n"); \ 20 | } while (0) 21 | 22 | #define OP_LOGE(opname, ...) \ 23 | do { \ 24 | printf("[ERROR][%s] ", (opname)); \ 25 | printf(__VA_ARGS__); \ 26 | printf("\n"); \ 27 | } while (0) 28 | 29 | #define OP_LOGD(opname, ...) 30 | 31 | namespace optiling { 32 | 33 | #define VECTOR_INNER_ERR_REPORT_TILIING(op_name, err_msg, ...) \ 34 | do { \ 35 | OP_LOGE_WITHOUT_REPORT(op_name, err_msg, ##__VA_ARGS__); \ 36 | } while (0) 37 | 38 | #define OP_TILING_CHECK(cond, log_func, expr) \ 39 | do { \ 40 | if (cond) { \ 41 | log_func; \ 42 | expr; \ 43 | } \ 44 | } while (0) 45 | } // namespace optiling 46 | 47 | #endif // OPS_BUILT_IN_OP_TILING_ERROR_LOG_H_ 48 | -------------------------------------------------------------------------------- /csrc/mla_preprocess/op_kernel/kernel/set_fpc.h: -------------------------------------------------------------------------------- 1 | /* Adapted from 2 | * https://gitee.com/ascend/ascend-transformer-boost.git 3 | * 4 | * Copyright (c) 2024 Huawei Technologies Co., Ltd. 5 | * This file is a part of the CANN Open Software. 6 | * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 7 | * Please refer to the License for details. You may not use this file except in compliance with the License. 8 | * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, 9 | * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 10 | * See LICENSE in the root of the software repository for the full text of the License. 
11 | */ 12 | #ifndef INCLUDE_SET_FPC_H 13 | #define INCLUDE_SET_FPC_H 14 | 15 | #include "hardware.h" 16 | #include "kernel_tensor.h" 17 | 18 | ///////////////////////////////////////////////////// 19 | // SetQuantPreAddr 20 | ///////////////////////////////////////////////////// 21 | template <ArchType ArchTag, typename DataType> 22 | struct SetQuantPreAddr { 23 | __aicore__ SetQuantPreAddr(AscendC::LocalTensor<uint64_t> quantPreTensor) {}; 24 | }; 25 | 26 | template <typename DataType> 27 | struct SetQuantPreAddr<ArchType::ASCEND_V220, DataType> { 28 | static constexpr uint32_t QUANT_PRE_ADDR_MASK = 0xffff; 29 | static constexpr uint32_t USELESS_BIT_NUM = 7; 30 | static constexpr uint32_t QUANT_PRE_BIT_POS_IN_FPC = 8; 31 | 32 | __aicore__ SetQuantPreAddr(AscendC::LocalTensor<uint64_t> quantPreTensor) 33 | { 34 | uint64_t quantPreAddr = (uint64_t)(__fbuf__ uint64_t *)quantPreTensor.GetPhyAddr(); 35 | AscendC::SetFixPipeConfigImpl(quantPreTensor); 36 | }; 37 | }; 38 | #endif 39 | -------------------------------------------------------------------------------- /csrc/lightning_indexer/op_host/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # This program is free software, you can redistribute it and/or modify it. 2 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. 3 | # This file is a part of the CANN Open Software. 4 | # Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). 5 | # Please refer to the License for details. You may not use this file except in compliance with the License. 6 | # THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 7 | # See LICENSE in the root of the software repository for the full text of the License. 8 | # ====================================================================================================================== 9 | 10 | add_ops_compile_options( 11 | OP_NAME LightningIndexer 12 | OPTIONS --cce-auto-sync=off 13 | -Wno-deprecated-declarations 14 | -Werror 15 | -mllvm -cce-aicore-hoist-movemask=false 16 | --op_relocatable_kernel_binary=true 17 | ) 18 | 19 | set(lightning_indexer_depends transformer/attention/lightning_indexer PARENT_SCOPE) 20 | 21 | target_sources(op_host_aclnn PRIVATE 22 | lightning_indexer_def.cpp 23 | ) 24 | 25 | target_sources(optiling PRIVATE 26 | lightning_indexer_tiling.cpp 27 | ) 28 | 29 | if (NOT BUILD_OPEN_PROJECT) 30 | target_sources(opmaster_ct PRIVATE 31 | lightning_indexer_tiling.cpp 32 | ) 33 | endif () 34 | 35 | target_include_directories(optiling PRIVATE 36 | ${CMAKE_CURRENT_SOURCE_DIR} 37 | ) 38 | 39 | target_sources(opsproto PRIVATE 40 | lightning_indexer_proto.cpp 41 | ) 42 | 43 | -------------------------------------------------------------------------------- /.github/Dockerfile.nightly.a2: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # This file is a part of the vllm-ascend project. 16 | # 17 | 18 | FROM quay.io/ascend/vllm-ascend:main 19 | 20 | ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" 21 | ARG AIS_BENCH_TAG="v3.0-20250930-master" 22 | ARG AIS_BENCH_URL="https://gitee.com/aisbench/benchmark.git" 23 | 24 | # Define environments 25 | ENV DEBIAN_FRONTEND=noninteractive 26 | 27 | WORKDIR /workspace 28 | 29 | RUN pip config set global.index-url ${PIP_INDEX_URL} 30 | 31 | # Install requirements-dev.txt for tests 32 | RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ 33 | cd /vllm-workspace/vllm-ascend && \ 34 | python3 -m pip install -r requirements-dev.txt && \ 35 | python3 -m pip cache purge 36 | 37 | # Install benchmark tools 38 | RUN git clone -b ${AIS_BENCH_TAG} --depth 1 ${AIS_BENCH_URL} /vllm-workspace/vllm-ascend/benchmark && \ 39 | cd /vllm-workspace/vllm-ascend/benchmark && \ 40 | pip install -e . -r requirements/api.txt -r requirements/extra.txt && \ 41 | python3 -m pip cache purge 42 | 43 | CMD ["/bin/bash"] 44 | -------------------------------------------------------------------------------- /.github/Dockerfile.nightly.a3: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # This file is a part of the vllm-ascend project. 16 | # 17 | 18 | FROM quay.io/ascend/vllm-ascend:main-a3 19 | 20 | ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" 21 | ARG AIS_BENCH_TAG="v3.0-20250930-master" 22 | ARG AIS_BENCH_URL="https://gitee.com/aisbench/benchmark.git" 23 | 24 | # Define environments 25 | ENV DEBIAN_FRONTEND=noninteractive 26 | 27 | WORKDIR /workspace 28 | 29 | RUN pip config set global.index-url ${PIP_INDEX_URL} 30 | 31 | # Install requirements-dev.txt for tests 32 | RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ 33 | cd /vllm-workspace/vllm-ascend && \ 34 | python3 -m pip install -r requirements-dev.txt && \ 35 | python3 -m pip cache purge 36 | 37 | # Install benchmark tools 38 | RUN git clone -b ${AIS_BENCH_TAG} --depth 1 ${AIS_BENCH_URL} /vllm-workspace/vllm-ascend/benchmark && \ 39 | cd /vllm-workspace/vllm-ascend/benchmark && \ 40 | pip install -e . -r requirements/api.txt -r requirements/extra.txt && \ 41 | python3 -m pip cache purge 42 | 43 | CMD ["/bin/bash"] 44 | -------------------------------------------------------------------------------- /tools/shellcheck.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 5 | # Copyright 2023 The vLLM team. 
6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # This file is a part of the vllm-ascend project. 19 | # Adapted from https://github.com/vllm-project/vllm/tree/main/tools 20 | # 21 | 22 | set -e 23 | 24 | scversion="stable" 25 | 26 | if [ -d "shellcheck-${scversion}" ]; then 27 | PATH="$PATH:$(pwd)/shellcheck-${scversion}" 28 | export PATH 29 | fi 30 | 31 | if ! [ -x "$(command -v shellcheck)" ]; then 32 | if [ "$(uname -s)" != "Linux" ] || [ "$(uname -m)" != "x86_64" ]; then 33 | echo "Please install shellcheck: https://github.com/koalaman/shellcheck?tab=readme-ov-file#installing" 34 | exit 1 35 | fi 36 | 37 | # automatic local install if linux x86_64 38 | wget -qO- "https://github.com/koalaman/shellcheck/releases/download/${scversion?}/shellcheck-${scversion?}.linux.x86_64.tar.xz" | tar -xJv 39 | PATH="$PATH:$(pwd)/shellcheck-${scversion}" 40 | export PATH 41 | fi 42 | 43 | # should enable this 44 | # find . -path ./.git -prune -o -name "*.sh" -print0 \ 45 | # | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck -s bash "{}"' 46 | -------------------------------------------------------------------------------- /.github/Dockerfile.buildwheel: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # This file is a part of the vllm-ascend project. 16 | # 17 | ARG PY_VERSION=3.11 18 | FROM quay.io/ascend/manylinux:8.3.rc2-910b-manylinux_2_28-py${PY_VERSION} 19 | 20 | ARG SOC_VERSION="ascend910b1" 21 | 22 | # Define environments 23 | ENV DEBIAN_FRONTEND=noninteractive 24 | ENV SOC_VERSION=$SOC_VERSION 25 | RUN yum update -y && \ 26 | yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \ 27 | rm -rf /var/cache/yum 28 | 29 | WORKDIR /workspace 30 | 31 | COPY . 
/workspace/vllm-ascend/ 32 | 33 | # Install req 34 | RUN python3 -m pip install -r vllm-ascend/requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu/ && \ 35 | python3 -m pip install twine 36 | 37 | # Install vllm-ascend 38 | RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ 39 | source /usr/local/Ascend/nnal/atb/set_env.sh && \ 40 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ 41 | cd vllm-ascend && \ 42 | python3 setup.py bdist_wheel && \ 43 | ls -l dist 44 | 45 | CMD ["/bin/bash"] 46 | -------------------------------------------------------------------------------- /csrc/dispatch_ffn_combine/op_kernel/utils/dispatch_policy_custom.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DISPATCH_POLICY_CUSTOM_HPP 2 | #define DISPATCH_POLICY_CUSTOM_HPP 3 | 4 | namespace Catlass::Gemm { 5 | template <bool ENABLE_UNIT_FLAG_, bool ENABLE_SHUFFLE_K_> 6 | struct MmadAtlasA2PreloadFixpipeQuant : public MmadAtlasA2 { 7 | static constexpr uint32_t STAGES = 2; 8 | static constexpr bool ENABLE_UNIT_FLAG = ENABLE_UNIT_FLAG_; 9 | static constexpr bool ENABLE_SHUFFLE_K = ENABLE_SHUFFLE_K_; 10 | }; 11 | 12 | template <uint32_t PRELOAD_STAGES_, uint32_t L1_STAGES_, uint32_t L0A_STAGES_, 13 | uint32_t L0B_STAGES_, uint32_t L0C_STAGES_, bool ENABLE_UNIT_FLAG_, bool ENABLE_SHUFFLE_K_> 14 | struct MmadAtlasA2PreloadAsyncFixpipe : 15 | public MmadAtlasA2PreloadAsync< 16 | PRELOAD_STAGES_, 17 | L1_STAGES_, 18 | L0A_STAGES_, 19 | L0B_STAGES_, 20 | L0C_STAGES_, 21 | ENABLE_UNIT_FLAG_, 22 | ENABLE_SHUFFLE_K_ 23 | > { 24 | }; 25 | } 26 | 27 | namespace Catlass::Epilogue { 28 | 29 | template <uint32_t UB_STAGES_> 30 | struct EpilogueAtlasA2UnQuant { 31 | using ArchTag = Arch::AtlasA2; 32 | static constexpr uint32_t UB_STAGES = UB_STAGES_; 33 | }; 34 | 35 | template <uint32_t UB_STAGES_> 36 | struct EpilogueAtlasA2PerTokenDequantQuant { 37 | using ArchTag = Arch::AtlasA2; 38 | static constexpr uint32_t UB_STAGES = UB_STAGES_; 39 | }; 40 | 41 | template <uint32_t UB_STAGES_> 42 | struct EpilogueAtlasA2PerTokenDequantSwigluQuant { 43 | using ArchTag = Arch::AtlasA2; 44 | static constexpr uint32_t UB_STAGES = UB_STAGES_; 45 | }; 46 | } 47 | #endif // DISPATCH_POLICY_CUSTOM_HPP -------------------------------------------------------------------------------- /vllm_ascend/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 3 | # This file is a part of the vllm-ascend project. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | from vllm.distributed.kv_transfer.kv_connector.factory import \ 19 | KVConnectorFactory 20 | 21 | 22 | def register_connector(): 23 | KVConnectorFactory.register_connector( 24 | "MooncakeConnectorV1", "vllm_ascend.distributed.mooncake_connector", 25 | "MooncakeConnector") 26 | 27 | KVConnectorFactory.register_connector( 28 | "MooncakeConnectorStoreV1", 29 | "vllm_ascend.distributed.kvpool.ascend_store_connector", 30 | "AscendStoreConnector") 31 | 32 | KVConnectorFactory.register_connector( 33 | "AscendStoreConnector", 34 | "vllm_ascend.distributed.kvpool.ascend_store_connector", 35 | "AscendStoreConnector") 36 | 37 | KVConnectorFactory.register_connector( 38 | "MooncakeLayerwiseConnector", 39 | "vllm_ascend.distributed.mooncake_layerwise_connector", 40 | "MooncakeLayerwiseConnector") 41 | 42 | KVConnectorFactory.register_connector( 43 | "UCMConnector", "vllm_ascend.distributed.ucm_connector", 44 | "UCMConnectorV1") 45 | -------------------------------------------------------------------------------- /csrc/dispatch_layout/op_host/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. 2 | # This file is a part of the CANN Open Software. 3 | # Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 4 | # Please refer to the License for details. You may not use this file except in compliance with the License. 5 | # THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, 6 | # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 7 | # See LICENSE in the root of the software repository for the full text of the License. 8 | # ====================================================================================================================== 9 | 10 | add_ops_compile_options( 11 | OP_NAME DispatchLayout 12 | OPTIONS --cce-auto-sync=off 13 | -Wno-deprecated-declarations 14 | -Werror 15 | ) 16 | 17 | target_sources(op_host_aclnnInner PRIVATE 18 | dispatch_layout.cpp 19 | ) 20 | 21 | target_sources(opapi PRIVATE 22 | aclnn_dispatch_layout.cpp 23 | ) 24 | 25 | if (NOT BUILD_OPEN_PROJECT) 26 | target_sources(aclnn_ops_train PRIVATE 27 | aclnn_dispatch_layout.cpp 28 | ) 29 | 30 | target_sources(aclnn_ops_infer PRIVATE 31 | aclnn_dispatch_layout.cpp 32 | ) 33 | endif () 34 | 35 | target_sources(optiling PRIVATE 36 | dispatch_layout_tiling.cpp 37 | ) 38 | 39 | target_include_directories(optiling PRIVATE 40 | ${CMAKE_CURRENT_SOURCE_DIR} 41 | ) 42 | 43 | target_sources(opsproto PRIVATE) 44 | 45 | file(GLOB _GMM_Aclnn_header "${CMAKE_CURRENT_SOURCE_DIR}/aclnn_dispatch_layout.h") 46 | 47 | install(FILES ${_GMM_Aclnn_header} 48 | DESTINATION ${ACLNN_INC_INSTALL_DIR} OPTIONAL 49 | ) 50 | -------------------------------------------------------------------------------- /csrc/notify_dispatch/op_host/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. 2 | # This file is a part of the CANN Open Software. 3 | # Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 4 | # Please refer to the License for details. You may not use this file except in compliance with the License. 
5 | # THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, 6 | # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 7 | # See LICENSE in the root of the software repository for the full text of the License. 8 | # ====================================================================================================================== 9 | 10 | add_ops_compile_options( 11 | OP_NAME NotifyDispatch 12 | OPTIONS --cce-auto-sync=off 13 | -Wno-deprecated-declarations 14 | -Werror 15 | ) 16 | 17 | target_sources(op_host_aclnnInner PRIVATE 18 | notify_dispatch.cpp 19 | ) 20 | 21 | target_sources(opapi PRIVATE 22 | aclnn_notify_dispatch.cpp 23 | ) 24 | 25 | if (NOT BUILD_OPEN_PROJECT) 26 | target_sources(aclnn_ops_train PRIVATE 27 | aclnn_notify_dispatch.cpp 28 | ) 29 | 30 | target_sources(aclnn_ops_infer PRIVATE 31 | aclnn_notify_dispatch.cpp 32 | ) 33 | endif () 34 | 35 | target_sources(optiling PRIVATE 36 | notify_dispatch_tiling.cpp 37 | ) 38 | 39 | target_include_directories(optiling PRIVATE 40 | ${CMAKE_CURRENT_SOURCE_DIR} 41 | ) 42 | 43 | target_sources(opsproto PRIVATE) 44 | 45 | file(GLOB _GMM_Aclnn_header "${CMAKE_CURRENT_SOURCE_DIR}/aclnn_notify_dispatch.h") 46 | 47 | install(FILES ${_GMM_Aclnn_header} 48 | DESTINATION ${ACLNN_INC_INSTALL_DIR} OPTIONAL 49 | ) 50 | -------------------------------------------------------------------------------- /vllm_ascend/patch/worker/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 3 | # This file is a part of the vllm-ascend project. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | from vllm.triton_utils import HAS_TRITON 19 | 20 | if HAS_TRITON: 21 | import vllm_ascend.patch.worker.patch_triton 22 | 23 | # isort: off 24 | import vllm_ascend.patch.platform.patch_sched_yield # noqa 25 | import vllm_ascend.patch.worker.patch_bert # noqa 26 | import vllm_ascend.patch.worker.patch_distributed # noqa 27 | import vllm_ascend.patch.worker.patch_deepseek # noqa 28 | import vllm_ascend.patch.worker.patch_weight_loader # noqa 29 | import vllm_ascend.patch.worker.patch_multimodal_merge # noqa 30 | import vllm_ascend.patch.worker.patch_minicpm # noqa 31 | import vllm_ascend.patch.worker.patch_qwen2_5_vl # noqa 32 | import vllm_ascend.patch.worker.patch_qwen2_5_omni # noqa 33 | import vllm_ascend.patch.worker.patch_qwen3_vl # noqa 34 | import vllm_ascend.patch.worker.patch_rope # noqa 35 | import vllm_ascend.patch.worker.patch_qwen3_next # noqa 36 | import vllm_ascend.patch.worker.patch_qwen3_next_mtp # noqa 37 | import vllm_ascend.patch.worker.patch_rejection_sampler # noqa 38 | -------------------------------------------------------------------------------- /csrc/moe_combine_normal/op_host/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. 2 | # This file is a part of the CANN Open Software. 3 | # Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 4 | # Please refer to the License for details. You may not use this file except in compliance with the License. 5 | # THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, 6 | # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 7 | # See LICENSE in the root of the software repository for the full text of the License. 8 | # ====================================================================================================================== 9 | 10 | add_ops_compile_options( 11 | OP_NAME MoeCombineNormal 12 | OPTIONS --cce-auto-sync=off 13 | -Wno-deprecated-declarations 14 | -Werror 15 | ) 16 | 17 | target_sources(op_host_aclnnInner PRIVATE 18 | moe_combine_normal.cpp 19 | ) 20 | 21 | target_sources(opapi PRIVATE 22 | aclnn_moe_combine_normal.cpp 23 | ) 24 | 25 | if (NOT BUILD_OPEN_PROJECT) 26 | target_sources(aclnn_ops_train PRIVATE 27 | aclnn_moe_combine_normal.cpp 28 | ) 29 | 30 | target_sources(aclnn_ops_infer PRIVATE 31 | aclnn_moe_combine_normal.cpp 32 | ) 33 | endif () 34 | 35 | target_sources(optiling PRIVATE 36 | moe_combine_normal_tiling.cpp 37 | ) 38 | 39 | target_include_directories(optiling PRIVATE 40 | ${CMAKE_CURRENT_SOURCE_DIR} 41 | ) 42 | 43 | target_sources(opsproto PRIVATE) 44 | 45 | file(GLOB _GMM_Aclnn_header "${CMAKE_CURRENT_SOURCE_DIR}/aclnn_moe_combine_normal.h") 46 | 47 | install(FILES ${_GMM_Aclnn_header} 48 | DESTINATION ${ACLNN_INC_INSTALL_DIR} OPTIONAL 49 | ) 50 | -------------------------------------------------------------------------------- /csrc/moe_dispatch_normal/op_host/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. 2 | # This file is a part of the CANN Open Software. 3 | # Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 4 | # Please refer to the License for details. You may not use this file except in compliance with the License. 
5 | # THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, 6 | # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 7 | # See LICENSE in the root of the software repository for the full text of the License. 8 | # ====================================================================================================================== 9 | 10 | add_ops_compile_options( 11 | OP_NAME MoeDispatchNormal 12 | OPTIONS --cce-auto-sync=off 13 | -Wno-deprecated-declarations 14 | -Werror 15 | ) 16 | 17 | target_sources(op_host_aclnnInner PRIVATE 18 | moe_dispatch_normal.cpp 19 | ) 20 | 21 | target_sources(opapi PRIVATE 22 | aclnn_moe_dispatch_normal.cpp 23 | ) 24 | 25 | if (NOT BUILD_OPEN_PROJECT) 26 | target_sources(aclnn_ops_train PRIVATE 27 | aclnn_moe_dispatch_normal.cpp 28 | ) 29 | 30 | target_sources(aclnn_ops_infer PRIVATE 31 | aclnn_moe_dispatch_normal.cpp 32 | ) 33 | endif () 34 | 35 | target_sources(optiling PRIVATE 36 | moe_dispatch_normal_tiling.cpp 37 | ) 38 | 39 | target_include_directories(optiling PRIVATE 40 | ${CMAKE_CURRENT_SOURCE_DIR} 41 | ) 42 | 43 | target_sources(opsproto PRIVATE) 44 | 45 | file(GLOB _GMM_Aclnn_header "${CMAKE_CURRENT_SOURCE_DIR}/aclnn_moe_dispatch_normal.h") 46 | 47 | install(FILES ${_GMM_Aclnn_header} 48 | DESTINATION ${ACLNN_INC_INSTALL_DIR} OPTIONAL 49 | ) 50 | -------------------------------------------------------------------------------- /examples/offline_inference_npu.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 3 | # This file is a part of the vllm-ascend project. 4 | # Adapted from vllm-project/vllm/examples/offline_inference/basic.py 5 | # Copyright 2023 The vLLM team. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # isort: skip_file 21 | import os 22 | 23 | os.environ["VLLM_USE_MODELSCOPE"] = "True" 24 | os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" 25 | 26 | from vllm import LLM, SamplingParams 27 | 28 | 29 | def main(): 30 | prompts = [ 31 | "Hello, my name is", 32 | "The president of the United States is", 33 | "The capital of France is", 34 | "The future of AI is", 35 | ] 36 | 37 | # Create a sampling params object. 38 | sampling_params = SamplingParams(max_tokens=100, temperature=0.0) 39 | # Create an LLM. 40 | llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct") 41 | 42 | # Generate texts from the prompts. 
43 | outputs = llm.generate(prompts, sampling_params) 44 | for output in outputs: 45 | prompt = output.prompt 46 | generated_text = output.outputs[0].text 47 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 48 | 49 | 50 | if __name__ == "__main__": 51 | main() 52 | -------------------------------------------------------------------------------- /csrc/mla_preprocess/op_kernel/kernel/iterators/l1_to_fb_iterator.inc: -------------------------------------------------------------------------------- 1 | /* Adapted from 2 | * https://gitee.com/ascend/ascend-transformer-boost.git 3 | * 4 | * Copyright (c) 2024 Huawei Technologies Co., Ltd. 5 | * This file is a part of the CANN Open Software. 6 | * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 7 | * Please refer to the License for details. You may not use this file except in compliance with the License. 8 | * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, 9 | * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 10 | * See LICENSE in the root of the software repository for the full text of the License. 11 | */ 12 | #include "../iterator.h" 13 | 14 | ///////////////////////////////////////////////////// 15 | // l1_to_fb 16 | ///////////////////////////////////////////////////// 17 | 18 | // Partial specialization for V220 19 | template <typename DataType> 20 | struct l1_to_fb<ArchType::ASCEND_V220, DataType> { 21 | __aicore__ l1_to_fb(AscendC::LocalTensor<DataType> &dst, 22 | AscendC::LocalTensor<DataType> &src, 23 | uint16_t burstNum, 24 | uint16_t burstLen, 25 | uint16_t srcGap, 26 | uint16_t dstGap) 27 | { 28 | dst.address_.logicPos = static_cast<uint8_t>(AscendC::TPosition::C2PIPE2GM); 29 | AscendC::DataCopy(dst, 30 | src, 31 | AscendC::DataCopyParams(burstNum, // nBurst 32 | burstLen, // lenBurst 33 | srcGap, // srcGap 34 | dstGap)); // dstGap); 35 | } 36 | }; 37 | --------------------------------------------------------------------------------
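
A note on the token-type packing used in `vllm_ascend/patch/worker/patch_bert.py` above: because aclgraph does not yet support shift operators, the patch folds `token_type_ids` into the high bits of `input_ids` with a plain multiply (the `1 << 30` multiplier is precomputed on the host) and later recovers both halves with integer division and a bitwise mask. The following is a minimal, self-contained round-trip check of that arithmetic in plain PyTorch on CPU; the tensor values are hypothetical and chosen only for illustration.

```python
import torch

TOKEN_TYPE_MULTIPLIER = 1 << 30          # token type is stored from bit 30 upward
TOKEN_MASK = TOKEN_TYPE_MULTIPLIER - 1   # low 30 bits keep the vocab id

# Hypothetical batch: vocab ids are far below 2**30; input_ids may be
# padded on the right, so token_type_ids can be shorter than input_ids.
input_ids = torch.tensor([101, 2023, 2003, 102, 0, 0], dtype=torch.int64)
token_type_ids = torch.tensor([0, 0, 1, 1], dtype=torch.int64)
expected_ids = input_ids.clone()

# Encode: fold the token types into the high bits of the leading entries
# (slicing returns a view, so the in-place OR mutates input_ids itself).
input_ids[:token_type_ids.shape[0]].bitwise_or_(token_type_ids *
                                                TOKEN_TYPE_MULTIPLIER)

# Decode: integer division recovers the type, masking restores the id.
decoded_types = input_ids // TOKEN_TYPE_MULTIPLIER
input_ids.bitwise_and_(TOKEN_MASK)

assert torch.equal(input_ids, expected_ids)
assert torch.equal(decoded_types[:token_type_ids.shape[0]], token_type_ids)
print("token-type round-trip OK")
```

For non-negative ids the multiply/divide pair is equivalent to `<< 30` / `>> 30`, which is exactly why the patch can avoid the unsupported shift ops inside the captured graph.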