├── .buildkite ├── check-wheel-size.py ├── generate_index.py ├── lm-eval-harness │ ├── configs │ │ ├── DeepSeek-V2-Lite-Chat.yaml │ │ ├── Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml │ │ ├── Meta-Llama-3-70B-Instruct.yaml │ │ ├── Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml │ │ ├── Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct-FP8.yaml │ │ ├── Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml │ │ ├── Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct.yaml │ │ ├── Meta-Llama-3-8B-QQQ.yaml │ │ ├── Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml │ │ ├── Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml │ │ ├── Minitron-4B-Base-FP8.yaml │ │ ├── Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml │ │ ├── Mixtral-8x7B-Instruct-v0.1-FP8.yaml │ │ ├── Mixtral-8x7B-Instruct-v0.1.yaml │ │ ├── Qwen1.5-MoE-W4A16-compressed-tensors.yaml │ │ ├── Qwen2-1.5B-Instruct-FP8W8.yaml │ │ ├── Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml │ │ ├── Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml │ │ ├── Qwen2-57B-A14-Instruct.yaml │ │ ├── Qwen2.5-1.5B-Instruct.yaml │ │ ├── Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml │ │ ├── SparseLlama3.1_2of4_fp8_compressed.yaml │ │ ├── models-large.txt │ │ └── models-small.txt │ ├── conftest.py │ ├── run-lm-eval-gsm-hf-baseline.sh │ ├── run-lm-eval-gsm-vllm-baseline.sh │ └── test_lm_eval_correctness.py ├── nightly-benchmarks │ ├── README.md │ ├── benchmark-pipeline.yaml │ ├── nightly-annotation.md │ ├── nightly-descriptions.md │ ├── nightly-pipeline.yaml │ ├── performance-benchmarks-descriptions.md │ ├── scripts │ │ ├── convert-results-json-to-markdown.py │ │ ├── download-tokenizer.py │ │ ├── generate-nightly-markdown.py │ │ ├── get-lmdeploy-modelname.py │ │ ├── launch-server.sh │ │ ├── nightly-annotate.sh │ │ ├── run-nightly-benchmarks.sh │ │ ├── run-performance-benchmarks.sh │ │ ├── summary-nightly-results.py │ │ └── wait-for-image.sh │ └── tests │ │ ├── genai-perf-tests.json │ │ ├── latency-tests.json │ │ ├── nightly-tests.json │ │ ├── serving-tests.json │ │ └── throughput-tests.json ├── pyproject.toml ├── release-pipeline.yaml ├── scripts │ ├── annotate-release.sh │ ├── hardware_ci │ │ ├── run-amd-test.sh │ │ ├── run-cpu-test-ppc64le.sh │ │ ├── run-cpu-test-s390x.sh │ │ ├── run-cpu-test.sh │ │ ├── run-gh200-test.sh │ │ ├── run-hpu-test.sh │ │ ├── run-neuron-test.sh │ │ ├── run-tpu-v1-test.sh │ │ └── run-xpu-test.sh │ ├── run-benchmarks.sh │ ├── run-multi-node-test.sh │ └── upload-wheels.sh ├── test-pipeline.yaml └── test-template.j2 ├── .clang-format ├── .dockerignore ├── .github ├── CODEOWNERS ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── 100-documentation.yml │ ├── 200-installation.yml │ ├── 300-usage.yml │ ├── 400-bug-report.yml │ ├── 450-ci-failure.yml │ ├── 500-feature-request.yml │ ├── 600-new-model.yml │ ├── 700-performance-discussion.yml │ ├── 750-RFC.yml │ └── config.yml ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml ├── mergify.yml ├── scripts │ └── cleanup_pr_body.sh └── workflows │ ├── add_label_automerge.yml │ ├── cleanup_pr_body.yml │ ├── matchers │ ├── actionlint.json │ └── mypy.json │ ├── pre-commit.yml │ ├── publish.yml │ ├── scripts │ ├── build.sh │ ├── create_release.js │ ├── cuda-install.sh │ ├── env.sh │ └── pytorch-install.sh │ └── stale.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── .shellcheckrc ├── 
.yapfignore ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── DCO ├── LICENSE ├── MANIFEST.in ├── README.md ├── RELEASE.md ├── ROCm_performance.md ├── SECURITY.md ├── benchmarks ├── P3L.py ├── P3L_mling.py ├── README.md ├── auto_tune.sh ├── backend_request_func.py ├── benchmark_dataset.py ├── benchmark_latency.py ├── benchmark_long_document_qa_throughput.py ├── benchmark_prefix_caching.py ├── benchmark_prioritization.py ├── benchmark_serving.py ├── benchmark_serving_structured_output.py ├── benchmark_throughput.py ├── benchmark_utils.py ├── cutlass_benchmarks │ ├── sparse_benchmarks.py │ ├── utils.py │ ├── w8a8_benchmarks.py │ └── weight_shapes.py ├── disagg_benchmarks │ ├── disagg_overhead_benchmark.sh │ ├── disagg_performance_benchmark.sh │ ├── disagg_prefill_proxy_server.py │ ├── round_robin_proxy.py │ └── visualize_benchmark_results.py ├── fused_kernels │ └── layernorm_rms_benchmarks.py ├── kernels │ ├── bench_fp8_gemm.py │ ├── benchmark_aqlm.py │ ├── benchmark_bitblas.py │ ├── benchmark_cutlass_fp4_moe.py │ ├── benchmark_grouped_gemm_cutlass.py │ ├── benchmark_layernorm.py │ ├── benchmark_lora.py │ ├── benchmark_machete.py │ ├── benchmark_marlin.py │ ├── benchmark_moe.py │ ├── benchmark_moe_permute_unpermute.py │ ├── benchmark_paged_attention.py │ ├── benchmark_quant.py │ ├── benchmark_rmsnorm.py │ ├── benchmark_rope.py │ ├── benchmark_shapes.py │ ├── benchmark_w8a8_block_fp8.py │ ├── deepgemm │ │ ├── README.md │ │ └── benchmark_fp8_block_dense_gemm.py │ ├── graph_machete_bench.py │ ├── moe_tune_script.sh │ ├── requirements.txt │ ├── utils.py │ └── weight_shapes.py ├── overheads │ └── benchmark_hashing.py ├── profiling │ ├── README.md │ ├── benchmark_latency.py │ └── benchmark_throughput.py ├── pyproject.toml ├── run_structured_output_benchmark.sh ├── sonnet.txt └── structured_schemas │ └── structured_schema_1.json ├── cmake ├── cpu_extension.cmake ├── external_projects │ ├── flashmla.cmake │ └── vllm_flash_attn.cmake ├── hipify.py └── utils.cmake ├── csrc ├── activation_kernels.cu ├── attention │ ├── attention_dtypes.h │ ├── attention_generic.cuh │ ├── attention_kernels.cuh │ ├── attention_utils.cuh │ ├── dtype_bfloat16.cuh │ ├── dtype_float16.cuh │ ├── dtype_float32.cuh │ ├── dtype_fp8.cuh │ ├── merge_attn_states.cu │ ├── mla │ │ ├── cutlass_mla_entry.cu │ │ └── cutlass_mla_kernels.cu │ ├── paged_attention_v1.cu │ ├── paged_attention_v2.cu │ └── vertical_slash_index.cu ├── cache.h ├── cache_kernels.cu ├── core │ ├── exception.hpp │ ├── math.hpp │ ├── registration.h │ └── scalar_type.hpp ├── cpu │ ├── activation.cpp │ ├── attention.cpp │ ├── cache.cpp │ ├── cpu_types.hpp │ ├── cpu_types_arm.hpp │ ├── cpu_types_vsx.hpp │ ├── cpu_types_vxe.hpp │ ├── cpu_types_x86.hpp │ ├── dnnl_helper.hpp │ ├── layernorm.cpp │ ├── mla_decode.cpp │ ├── pos_encoding.cpp │ ├── quant.cpp │ ├── shm.cpp │ ├── torch_bindings.cpp │ └── utils.cpp ├── cuda_compat.h ├── cuda_utils.h ├── cuda_utils_kernels.cu ├── cuda_view.cu ├── cumem_allocator.cpp ├── custom_all_reduce.cu ├── custom_all_reduce.cuh ├── custom_all_reduce_test.cu ├── cutlass_extensions │ ├── common.cpp │ ├── common.hpp │ ├── cute_utils.cuh │ ├── epilogue │ │ ├── broadcast_load_epilogue_array_c3x.hpp │ │ ├── broadcast_load_epilogue_c2x.hpp │ │ ├── broadcast_load_epilogue_c3x.hpp │ │ ├── scaled_mm_epilogues_c2x.hpp │ │ └── scaled_mm_epilogues_c3x.hpp │ ├── gemm │ │ ├── collective │ │ │ ├── collective_builder.hpp │ │ │ ├── fp8_accumulation.hpp │ │ │ └── sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp │ │ └── 
dispatch_policy.hpp │ ├── torch_utils.hpp │ ├── vllm_collective_builder.cuh │ ├── vllm_custom_types.cuh │ ├── vllm_cutlass_library_extension.py │ ├── vllm_numeric_conversion.cuh │ └── vllm_type_utils.cuh ├── dispatch_utils.h ├── layernorm_kernels.cu ├── layernorm_quant_kernels.cu ├── mamba │ ├── causal_conv1d │ │ ├── causal_conv1d.cu │ │ ├── causal_conv1d.h │ │ └── static_switch.h │ └── mamba_ssm │ │ ├── selective_scan.h │ │ ├── selective_scan_fwd.cu │ │ └── static_switch.h ├── moe │ ├── marlin_moe_wna16 │ │ ├── .gitignore │ │ ├── generate_kernels.py │ │ ├── kernel.h │ │ ├── marlin_template.h │ │ └── ops.cu │ ├── moe_align_sum_kernels.cu │ ├── moe_ops.h │ ├── moe_permute_unpermute_op.cu │ ├── moe_wna16.cu │ ├── moe_wna16_utils.h │ ├── permute_unpermute_kernels │ │ ├── dispatch.h │ │ ├── moe_permute_unpermute_kernel.cu │ │ ├── moe_permute_unpermute_kernel.h │ │ └── moe_permute_unpermute_kernel.inl │ ├── topk_softmax_kernels.cu │ └── torch_bindings.cpp ├── ops.h ├── permute_cols.cu ├── pos_encoding_kernels.cu ├── prepare_inputs │ ├── advance_step.cu │ └── advance_step.cuh ├── quantization │ ├── activation_kernels.cu │ ├── aqlm │ │ └── gemm_kernels.cu │ ├── awq │ │ ├── dequantize.cuh │ │ └── gemm_kernels.cu │ ├── compressed_tensors │ │ └── int8_quant_kernels.cu │ ├── cutlass_w8a8 │ │ ├── Epilogues.md │ │ ├── c3x │ │ │ ├── cutlass_gemm_caller.cuh │ │ │ ├── scaled_mm.cuh │ │ │ ├── scaled_mm_azp_sm90_int8.cu │ │ │ ├── scaled_mm_blockwise_sm100_fp8.cu │ │ │ ├── scaled_mm_blockwise_sm100_fp8_dispatch.cuh │ │ │ ├── scaled_mm_blockwise_sm90_fp8.cu │ │ │ ├── scaled_mm_blockwise_sm90_fp8_dispatch.cuh │ │ │ ├── scaled_mm_helper.hpp │ │ │ ├── scaled_mm_kernels.hpp │ │ │ ├── scaled_mm_sm100_fp8.cu │ │ │ ├── scaled_mm_sm100_fp8_dispatch.cuh │ │ │ ├── scaled_mm_sm90_fp8.cu │ │ │ ├── scaled_mm_sm90_fp8_dispatch.cuh │ │ │ ├── scaled_mm_sm90_int8.cu │ │ │ └── scaled_mm_sm90_int8_dispatch.cuh │ │ ├── moe │ │ │ ├── get_group_starts.cuh │ │ │ ├── grouped_mm_c3x.cu │ │ │ ├── grouped_mm_c3x.cuh │ │ │ └── moe_data.cu │ │ ├── scaled_mm_c2x.cu │ │ ├── scaled_mm_c2x.cuh │ │ ├── scaled_mm_c2x_sm75_dispatch.cuh │ │ ├── scaled_mm_c2x_sm80_dispatch.cuh │ │ ├── scaled_mm_c2x_sm89_fp8_dispatch.cuh │ │ ├── scaled_mm_c2x_sm89_int8_dispatch.cuh │ │ ├── scaled_mm_c3x_sm100.cu │ │ ├── scaled_mm_c3x_sm90.cu │ │ └── scaled_mm_entry.cu │ ├── fp4 │ │ ├── nvfp4_blockwise_moe_kernel.cu │ │ ├── nvfp4_experts_quant.cu │ │ ├── nvfp4_quant_entry.cu │ │ ├── nvfp4_quant_kernels.cu │ │ ├── nvfp4_scaled_mm_entry.cu │ │ └── nvfp4_scaled_mm_kernels.cu │ ├── fp8 │ │ ├── amd │ │ │ └── quant_utils.cuh │ │ ├── common.cu │ │ ├── common.cuh │ │ └── nvidia │ │ │ └── quant_utils.cuh │ ├── fused_kernels │ │ ├── fused_layernorm_dynamic_per_token_quant.cu │ │ ├── layernorm_utils.cuh │ │ └── quant_conversions.cuh │ ├── gguf │ │ ├── dequantize.cuh │ │ ├── ggml-common.h │ │ ├── gguf_kernel.cu │ │ ├── mmq.cuh │ │ ├── mmvq.cuh │ │ ├── moe.cuh │ │ ├── moe_vec.cuh │ │ └── vecdotq.cuh │ ├── gptq │ │ ├── compat.cuh │ │ ├── matrix_view.cuh │ │ ├── q_gemm.cu │ │ ├── qdq_2.cuh │ │ ├── qdq_3.cuh │ │ ├── qdq_4.cuh │ │ ├── qdq_8.cuh │ │ └── qdq_util.cuh │ ├── gptq_allspark │ │ ├── allspark_qgemm_w8a16.cu │ │ ├── allspark_repack.cu │ │ └── allspark_utils.cuh │ ├── gptq_marlin │ │ ├── .gitignore │ │ ├── awq_marlin_repack.cu │ │ ├── dequant.h │ │ ├── generate_kernels.py │ │ ├── gptq_marlin.cu │ │ ├── gptq_marlin_repack.cu │ │ ├── kernel.h │ │ ├── marlin.cuh │ │ ├── marlin_dtypes.cuh │ │ └── marlin_template.h │ ├── machete │ │ ├── Readme.md │ │ ├── generate.py │ │ ├── 
machete_collective_builder.cuh │ │ ├── machete_interleaving_utils.cuh │ │ ├── machete_mainloop.cuh │ │ ├── machete_mm_kernel.cuh │ │ ├── machete_mm_launcher.cuh │ │ ├── machete_prepack_kernel.cuh │ │ ├── machete_prepack_launcher.cuh │ │ ├── machete_prepacked_layout.cuh │ │ └── machete_pytorch.cu │ ├── marlin │ │ ├── dense │ │ │ ├── LICENSE │ │ │ ├── common │ │ │ │ ├── base.h │ │ │ │ └── mem.h │ │ │ └── marlin_cuda_kernel.cu │ │ ├── qqq │ │ │ └── marlin_qqq_gemm_kernel.cu │ │ └── sparse │ │ │ ├── LICENSE │ │ │ ├── common │ │ │ ├── base.h │ │ │ ├── mem.h │ │ │ └── mma.h │ │ │ └── marlin_24_cuda_kernel.cu │ ├── utils.cuh │ └── vectorization.cuh ├── rocm │ ├── attention.cu │ ├── custom.cu │ ├── fused_kernels.cu │ ├── ops.h │ ├── skinny_gemms.cu │ └── torch_bindings.cpp ├── sampler.cu ├── sparse │ └── cutlass │ │ ├── sparse_compressor_c3x.cuh │ │ ├── sparse_scaled_mm_c3x.cu │ │ ├── sparse_scaled_mm_c3x.cuh │ │ └── sparse_scaled_mm_entry.cu ├── torch_bindings.cpp └── type_convert.cuh ├── docker ├── Dockerfile ├── Dockerfile.arm ├── Dockerfile.cpu ├── Dockerfile.hpu ├── Dockerfile.neuron ├── Dockerfile.nightly_torch ├── Dockerfile.ppc64le ├── Dockerfile.rocm ├── Dockerfile.rocm_base ├── Dockerfile.s390x ├── Dockerfile.tpu └── Dockerfile.xpu ├── docs ├── .nav.yml ├── README.md ├── api │ ├── README.md │ └── vllm │ │ └── .meta.yml ├── assets │ ├── contributing │ │ └── dockerfile-stages-dependency.png │ ├── deployment │ │ ├── anything-llm-chat-with-doc.png │ │ ├── anything-llm-chat-without-doc.png │ │ ├── anything-llm-provider.png │ │ ├── anything-llm-upload-doc.png │ │ ├── architecture_helm_deployment.png │ │ ├── chatbox-chat.png │ │ ├── chatbox-settings.png │ │ ├── dify-chat.png │ │ ├── dify-create-chatbot.png │ │ ├── dify-settings.png │ │ ├── open_webui.png │ │ └── streamlit-chat.png │ ├── design │ │ ├── arch_overview │ │ │ ├── entrypoints.excalidraw.png │ │ │ └── llm_engine.excalidraw.png │ │ ├── hierarchy.png │ │ └── v1 │ │ │ ├── metrics │ │ │ ├── intervals-1.png │ │ │ ├── intervals-2.png │ │ │ └── intervals-3.png │ │ │ └── prefix_caching │ │ │ ├── example-time-1.png │ │ │ ├── example-time-3.png │ │ │ ├── example-time-4.png │ │ │ ├── example-time-5.png │ │ │ ├── example-time-6.png │ │ │ ├── example-time-7.png │ │ │ ├── free.png │ │ │ └── overview.png │ ├── features │ │ └── disagg_prefill │ │ │ ├── abstraction.jpg │ │ │ └── overview.jpg │ ├── kernel │ │ ├── k_vecs.png │ │ ├── key.png │ │ ├── logits_vec.png │ │ ├── q_vecs.png │ │ ├── query.png │ │ ├── v_vec.png │ │ └── value.png │ └── logos │ │ ├── vllm-logo-only-light.ico │ │ ├── vllm-logo-only-light.png │ │ ├── vllm-logo-text-dark.png │ │ └── vllm-logo-text-light.png ├── cli │ └── README.md ├── community │ ├── meetups.md │ └── sponsors.md ├── configuration │ ├── README.md │ ├── conserving_memory.md │ ├── engine_args.md │ ├── env_vars.md │ ├── model_resolution.md │ ├── optimization.md │ └── serve_args.md ├── contributing │ ├── README.md │ ├── benchmarks.md │ ├── ci-failures.md │ ├── deprecation_policy.md │ ├── dockerfile │ │ └── dockerfile.md │ ├── model │ │ ├── README.md │ │ ├── basic.md │ │ ├── multimodal.md │ │ ├── registration.md │ │ └── tests.md │ ├── profiling.md │ └── vulnerability_management.md ├── deployment │ ├── docker.md │ ├── frameworks │ │ ├── anything-llm.md │ │ ├── autogen.md │ │ ├── bentoml.md │ │ ├── cerebrium.md │ │ ├── chatbox.md │ │ ├── dify.md │ │ ├── dstack.md │ │ ├── haystack.md │ │ ├── helm.md │ │ ├── litellm.md │ │ ├── lobe-chat.md │ │ ├── lws.md │ │ ├── modal.md │ │ ├── open-webui.md │ │ ├── 
retrieval_augmented_generation.md │ │ ├── skypilot.md │ │ ├── streamlit.md │ │ └── triton.md │ ├── integrations │ │ ├── kserve.md │ │ ├── kubeai.md │ │ ├── llamastack.md │ │ ├── llmaz.md │ │ └── production-stack.md │ ├── k8s.md │ └── nginx.md ├── design │ ├── arch_overview.md │ ├── automatic_prefix_caching.md │ ├── huggingface_integration.md │ ├── kernel │ │ └── paged_attention.md │ ├── mm_processing.md │ ├── multiprocessing.md │ ├── plugin_system.md │ └── v1 │ │ ├── metrics.md │ │ ├── prefix_caching.md │ │ └── torch_compile.md ├── dev-docker │ └── README.md ├── features │ ├── automatic_prefix_caching.md │ ├── compatibility_matrix.md │ ├── disagg_prefill.md │ ├── lora.md │ ├── multimodal_inputs.md │ ├── prompt_embeds.md │ ├── quantization │ │ ├── README.md │ │ ├── auto_awq.md │ │ ├── bitblas.md │ │ ├── bnb.md │ │ ├── fp8.md │ │ ├── gguf.md │ │ ├── gptqmodel.md │ │ ├── int4.md │ │ ├── int8.md │ │ ├── modelopt.md │ │ ├── quantized_kvcache.md │ │ ├── quark.md │ │ ├── supported_hardware.md │ │ └── torchao.md │ ├── reasoning_outputs.md │ ├── spec_decode.md │ ├── structured_outputs.md │ └── tool_calling.md ├── getting_started │ ├── installation │ │ ├── .nav.yml │ │ ├── README.md │ │ ├── ai_accelerator.md │ │ ├── ai_accelerator │ │ │ ├── hpu-gaudi.inc.md │ │ │ ├── neuron.inc.md │ │ │ └── tpu.inc.md │ │ ├── cpu.md │ │ ├── cpu │ │ │ ├── apple.inc.md │ │ │ ├── arm.inc.md │ │ │ ├── build.inc.md │ │ │ ├── s390x.inc.md │ │ │ └── x86.inc.md │ │ ├── device.template.md │ │ ├── gpu.md │ │ ├── gpu │ │ │ ├── cuda.inc.md │ │ │ ├── rocm.inc.md │ │ │ └── xpu.inc.md │ │ └── python_env_setup.inc.md │ └── quickstart.md ├── mkdocs │ ├── hooks │ │ ├── generate_examples.py │ │ ├── remove_announcement.py │ │ └── url_schemes.py │ ├── javascript │ │ └── run_llm_widget.js │ ├── overrides │ │ └── main.html │ └── stylesheets │ │ └── extra.css ├── models │ ├── extensions │ │ ├── fastsafetensor.md │ │ ├── runai_model_streamer.md │ │ └── tensorizer.md │ ├── generative_models.md │ ├── pooling_models.md │ └── supported_models.md ├── serving │ ├── distributed_serving.md │ ├── integrations │ │ ├── langchain.md │ │ └── llamaindex.md │ ├── offline_inference.md │ └── openai_compatible_server.md ├── training │ ├── rlhf.md │ └── trl.md └── usage │ ├── README.md │ ├── faq.md │ ├── metrics.md │ ├── reproducibility.md │ ├── security.md │ ├── troubleshooting.md │ ├── usage_stats.md │ └── v1_guide.md ├── examples ├── offline_inference │ ├── audio_language.py │ ├── automatic_prefix_caching.py │ ├── basic │ │ ├── README.md │ │ ├── basic.py │ │ ├── chat.py │ │ ├── classify.py │ │ ├── embed.py │ │ ├── generate.py │ │ └── score.py │ ├── batch_llm_inference.py │ ├── chat_with_tools.py │ ├── context_extension.py │ ├── data_parallel.py │ ├── disaggregated-prefill-v1 │ │ ├── README.md │ │ ├── decode_example.py │ │ ├── prefill_example.py │ │ └── run.sh │ ├── disaggregated_prefill.py │ ├── eagle.py │ ├── embed_jina_embeddings_v3.py │ ├── embed_matryoshka_fy.py │ ├── encoder_decoder.py │ ├── encoder_decoder_multimodal.py │ ├── llm_engine_example.py │ ├── load_sharded_state.py │ ├── lora_with_quantization_inference.py │ ├── metrics.py │ ├── mistral-small.py │ ├── mlpspeculator.py │ ├── multilora_inference.py │ ├── neuron.py │ ├── neuron_eagle.py │ ├── neuron_int8_quantization.py │ ├── neuron_multimodal.py │ ├── neuron_speculation.py │ ├── openai_batch │ │ ├── README.md │ │ └── openai_example_batch.jsonl │ ├── prefix_caching.py │ ├── prithvi_geospatial_mae.py │ ├── profiling.py │ ├── profiling_tpu │ │ ├── README.md │ │ └── profiling.py │ ├── 
prompt_embed_inference.py │ ├── qwen2_5_omni │ │ ├── README.md │ │ └── only_thinker.py │ ├── qwen_1m.py │ ├── reproducibility.py │ ├── rlhf.py │ ├── rlhf_colocate.py │ ├── rlhf_utils.py │ ├── save_sharded_state.py │ ├── simple_profiling.py │ ├── structured_outputs.py │ ├── torchrun_example.py │ ├── tpu.py │ ├── vision_language.py │ ├── vision_language_embedding.py │ └── vision_language_multi_image.py ├── online_serving │ ├── api_client.py │ ├── chart-helm │ │ ├── .helmignore │ │ ├── Chart.yaml │ │ ├── README.md │ │ ├── ct.yaml │ │ ├── lintconf.yaml │ │ ├── templates │ │ │ ├── _helpers.tpl │ │ │ ├── configmap.yaml │ │ │ ├── custom-objects.yaml │ │ │ ├── deployment.yaml │ │ │ ├── hpa.yaml │ │ │ ├── job.yaml │ │ │ ├── poddisruptionbudget.yaml │ │ │ ├── pvc.yaml │ │ │ ├── secrets.yaml │ │ │ └── service.yaml │ │ ├── values.schema.json │ │ └── values.yaml │ ├── cohere_rerank_client.py │ ├── disaggregated_prefill.sh │ ├── disaggregated_serving │ │ ├── README.md │ │ ├── disagg_proxy_demo.py │ │ └── kv_events.sh │ ├── gradio_openai_chatbot_webserver.py │ ├── gradio_webserver.py │ ├── jinaai_rerank_client.py │ ├── kv_events_subscriber.py │ ├── multi-node-serving.sh │ ├── multi_instance_data_parallel.py │ ├── openai_chat_completion_client.py │ ├── openai_chat_completion_client_for_multimodal.py │ ├── openai_chat_completion_client_with_tools.py │ ├── openai_chat_completion_client_with_tools_required.py │ ├── openai_chat_completion_structured_outputs.py │ ├── openai_chat_completion_structured_outputs_structural_tag.py │ ├── openai_chat_completion_structured_outputs_with_reasoning.py │ ├── openai_chat_completion_tool_calls_with_reasoning.py │ ├── openai_chat_completion_with_reasoning.py │ ├── openai_chat_completion_with_reasoning_streaming.py │ ├── openai_chat_embedding_client_for_multimodal.py │ ├── openai_classification_client.py │ ├── openai_completion_client.py │ ├── openai_cross_encoder_score.py │ ├── openai_embedding_client.py │ ├── openai_embedding_matryoshka_fy.py │ ├── openai_pooling_client.py │ ├── openai_transcription_client.py │ ├── opentelemetry │ │ ├── README.md │ │ └── dummy_client.py │ ├── prometheus_grafana │ │ ├── README.md │ │ ├── docker-compose.yaml │ │ ├── grafana.json │ │ └── prometheus.yaml │ ├── prompt_embed_inference_with_openai_client.py │ ├── ray_serve_deepseek.py │ ├── retrieval_augmented_generation_with_langchain.py │ ├── retrieval_augmented_generation_with_llamaindex.py │ ├── run_cluster.sh │ ├── sagemaker-entrypoint.sh │ ├── streamlit_openai_chatbot_webserver.py │ └── utils.py ├── others │ ├── lmcache │ │ ├── README.md │ │ ├── cpu_offload_lmcache.py │ │ ├── disagg_prefill_lmcache_v0.py │ │ ├── disagg_prefill_lmcache_v1 │ │ │ ├── configs │ │ │ │ ├── lmcache-decoder-config.yaml │ │ │ │ └── lmcache-prefiller-config.yaml │ │ │ ├── disagg_example_nixl.sh │ │ │ ├── disagg_proxy_server.py │ │ │ └── disagg_vllm_launcher.sh │ │ └── kv_cache_sharing_lmcache_v1.py │ ├── logging_configuration.md │ └── tensorize_vllm_model.py ├── pyproject.toml ├── template_alpaca.jinja ├── template_baichuan.jinja ├── template_chatglm.jinja ├── template_chatglm2.jinja ├── template_chatml.jinja ├── template_dse_qwen2_vl.jinja ├── template_falcon.jinja ├── template_falcon_180b.jinja ├── template_inkbot.jinja ├── template_teleflm.jinja ├── template_vlm2vec.jinja ├── tool_chat_template_deepseekr1.jinja ├── tool_chat_template_deepseekv3.jinja ├── tool_chat_template_granite.jinja ├── tool_chat_template_granite_20b_fc.jinja ├── tool_chat_template_hermes.jinja ├── tool_chat_template_internlm2_tool.jinja ├── 
tool_chat_template_llama3.1_json.jinja ├── tool_chat_template_llama3.2_json.jinja ├── tool_chat_template_llama3.2_pythonic.jinja ├── tool_chat_template_llama4_json.jinja ├── tool_chat_template_llama4_pythonic.jinja ├── tool_chat_template_mistral.jinja ├── tool_chat_template_mistral3.jinja ├── tool_chat_template_mistral_parallel.jinja ├── tool_chat_template_phi4_mini.jinja └── tool_chat_template_toolace.jinja ├── find_cuda_init.py ├── format.sh ├── mkdocs.yaml ├── pyproject.toml ├── requirements ├── build.txt ├── common.txt ├── cpu.txt ├── cuda.txt ├── dev.txt ├── docs.txt ├── hpu.txt ├── lint.txt ├── neuron.txt ├── nightly_torch_test.txt ├── rocm-build.txt ├── rocm-test.txt ├── rocm.txt ├── test.in ├── test.txt ├── tpu.txt └── xpu.txt ├── setup.py ├── tests ├── __init__.py ├── async_engine │ ├── __init__.py │ ├── api_server_async_engine.py │ ├── conftest.py │ ├── test_api_server.py │ ├── test_async_llm_engine.py │ └── test_request_tracker.py ├── basic_correctness │ ├── __init__.py │ ├── test_basic_correctness.py │ ├── test_chunked_prefill.py │ ├── test_cpu_offload.py │ ├── test_cumem.py │ └── test_preemption.py ├── benchmarks │ ├── __init__.py │ ├── test_latency_cli.py │ ├── test_serve_cli.py │ └── test_throughput_cli.py ├── build_cython.py ├── compile │ ├── __init__.py │ ├── backend.py │ ├── conftest.py │ ├── piecewise │ │ ├── __init__.py │ │ ├── test_full_cudagraph.py │ │ ├── test_simple.py │ │ └── test_toy_llama.py │ ├── test_async_tp.py │ ├── test_basic_correctness.py │ ├── test_full_graph.py │ ├── test_functionalization.py │ ├── test_fusion.py │ ├── test_pass_manager.py │ ├── test_sequence_parallelism.py │ ├── test_silu_mul_quant_fusion.py │ └── test_wrapper.py ├── config │ ├── test_config.yaml │ └── test_config_with_model.yaml ├── conftest.py ├── core │ ├── __init__.py │ ├── block │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── e2e │ │ │ ├── __init__.py │ │ │ ├── conftest.py │ │ │ ├── test_correctness.py │ │ │ └── test_correctness_sliding_window.py │ │ ├── test_block_manager.py │ │ ├── test_block_table.py │ │ ├── test_common.py │ │ ├── test_cpu_gpu_block_allocator.py │ │ ├── test_naive_block.py │ │ └── test_prefix_caching_block.py │ ├── conftest.py │ ├── test_chunked_prefill_scheduler.py │ ├── test_num_computed_tokens_update.py │ ├── test_scheduler.py │ ├── test_scheduler_encoder_decoder.py │ ├── test_serialization.py │ └── utils.py ├── detokenizer │ ├── __init__.py │ ├── conftest.py │ ├── test_disable_detokenization.py │ ├── test_stop_checker.py │ ├── test_stop_reason.py │ └── test_stop_strings.py ├── distributed │ ├── __init__.py │ ├── conftest.py │ ├── test_ca_buffer_sharing.py │ ├── test_comm_ops.py │ ├── test_custom_all_reduce.py │ ├── test_distributed_oot.py │ ├── test_events.py │ ├── test_expert_parallel.py │ ├── test_multi_node_assignment.py │ ├── test_pipeline_parallel.py │ ├── test_pipeline_partition.py │ ├── test_pp_cudagraph.py │ ├── test_pynccl.py │ ├── test_same_node.py │ ├── test_sequence_parallel.py │ ├── test_shm_broadcast.py │ ├── test_torchrun_example.py │ └── test_utils.py ├── encoder_decoder │ ├── __init__.py │ └── test_e2e_correctness.py ├── engine │ ├── __init__.py │ ├── conftest.py │ ├── test_arg_utils.py │ ├── test_computed_prefix_blocks.py │ ├── test_executor.py │ ├── test_multi_step_output_processor.py │ ├── test_multiproc_workers.py │ ├── test_options.py │ └── test_short_mm_context.py ├── entrypoints │ ├── __init__.py │ ├── conftest.py │ ├── llm │ │ ├── __init__.py │ │ ├── test_accuracy.py │ │ ├── test_chat.py │ │ ├── test_collective_rpc.py │ │ ├── 
test_encode.py │ │ ├── test_generate.py │ │ ├── test_generate_multiple_loras.py │ │ ├── test_gpu_utilization.py │ │ ├── test_guided_generate.py │ │ ├── test_lazy_outlines.py │ │ └── test_prompt_validation.py │ ├── offline_mode │ │ ├── __init__.py │ │ └── test_offline_mode.py │ ├── openai │ │ ├── __init__.py │ │ ├── correctness │ │ │ ├── __init__.py │ │ │ ├── test_lmeval.py │ │ │ ├── test_mteb.py │ │ │ └── test_transcription_api_correctness.py │ │ ├── test_async_tokenization.py │ │ ├── test_audio.py │ │ ├── test_basic.py │ │ ├── test_chat.py │ │ ├── test_chat_echo.py │ │ ├── test_chat_logit_bias_validation.py │ │ ├── test_chat_template.py │ │ ├── test_chat_with_tool_reasoning.py │ │ ├── test_chunked_prompt.py │ │ ├── test_classification.py │ │ ├── test_cli_args.py │ │ ├── test_completion.py │ │ ├── test_completion_with_function_calling.py │ │ ├── test_completion_with_prompt_embeds.py │ │ ├── test_embedding.py │ │ ├── test_embedding_dimensions.py │ │ ├── test_encoder_decoder.py │ │ ├── test_lora_adapters.py │ │ ├── test_lora_resolvers.py │ │ ├── test_metrics.py │ │ ├── test_models.py │ │ ├── test_oot_registration.py │ │ ├── test_openai_schema.py │ │ ├── test_pooling.py │ │ ├── test_prompt_validation.py │ │ ├── test_rerank.py │ │ ├── test_return_tokens_as_ids.py │ │ ├── test_root_path.py │ │ ├── test_run_batch.py │ │ ├── test_score.py │ │ ├── test_serving_chat.py │ │ ├── test_serving_models.py │ │ ├── test_shutdown.py │ │ ├── test_sleep.py │ │ ├── test_tensorizer_entrypoint.py │ │ ├── test_tokenization.py │ │ ├── test_transcription_validation.py │ │ ├── test_truncation.py │ │ ├── test_video.py │ │ ├── test_vision.py │ │ ├── test_vision_embedding.py │ │ └── tool_parsers │ │ │ ├── __init__.py │ │ │ ├── test_llama4_pythonic_tool_parser.py │ │ │ ├── test_pythonic_tool_parser.py │ │ │ └── utils.py │ ├── test_api_server_process_manager.py │ ├── test_chat_utils.py │ └── test_ssl_cert_refresher.py ├── fastsafetensors_loader │ ├── __init__.py │ ├── test_fastsafetensors_loader.py │ └── test_weight_utils.py ├── kernels │ ├── __init__.py │ ├── allclose_default.py │ ├── attention │ │ ├── conftest.py │ │ ├── test_attention.py │ │ ├── test_attention_selector.py │ │ ├── test_blocksparse_attention.py │ │ ├── test_cache.py │ │ ├── test_cascade_flash_attn.py │ │ ├── test_encoder_decoder_attn.py │ │ ├── test_flash_attn.py │ │ ├── test_flashinfer.py │ │ ├── test_flashmla.py │ │ ├── test_lightning_attn.py │ │ ├── test_merge_attn_states.py │ │ ├── test_mha_attn.py │ │ ├── test_mla_decode_cpu.py │ │ ├── test_prefix_prefill.py │ │ ├── test_rocm_attention_selector.py │ │ ├── test_triton_decode_attention.py │ │ └── test_triton_unified_attention.py │ ├── core │ │ ├── test_activation.py │ │ ├── test_fused_quant_layernorm.py │ │ ├── test_layernorm.py │ │ ├── test_opcheck.py │ │ ├── test_permute_cols.py │ │ ├── test_pos_encoding.py │ │ ├── test_rotary_embedding.py │ │ └── test_uva.py │ ├── mamba │ │ ├── test_causal_conv1d.py │ │ ├── test_mamba_mixer2.py │ │ ├── test_mamba_ssm.py │ │ └── test_mamba_ssm_ssd.py │ ├── moe │ │ ├── __init__.py │ │ ├── deepep_utils.py │ │ ├── test_batched_moe.py │ │ ├── test_cutlass_moe.py │ │ ├── test_deepep_deepgemm_moe.py │ │ ├── test_deepep_moe.py │ │ ├── test_moe.py │ │ ├── test_moe_permute_unpermute.py │ │ ├── test_nvfp4_moe.py │ │ ├── test_pplx_moe.py │ │ ├── test_rocm_aiter_topk.py │ │ └── test_triton_moe_ptpc_fp8.py │ ├── quant_utils.py │ ├── quantization │ │ ├── nvfp4_utils.py │ │ ├── test_allspark_gemm.py │ │ ├── test_aqlm.py │ │ ├── test_awq.py │ │ ├── test_awq_triton.py │ │ ├── 
test_block_fp8.py │ │ ├── test_block_int8.py │ │ ├── test_cutlass_2of4_sparse.py │ │ ├── test_cutlass_scaled_mm.py │ │ ├── test_fp8_quant.py │ │ ├── test_ggml.py │ │ ├── test_gguf.py │ │ ├── test_gptq.py │ │ ├── test_int8_kernel.py │ │ ├── test_int8_quant.py │ │ ├── test_machete_mm.py │ │ ├── test_marlin_gemm.py │ │ ├── test_nvfp4_quant.py │ │ ├── test_nvfp4_scaled_mm.py │ │ ├── test_rocm_skinny_gemms.py │ │ └── test_triton_scaled_mm.py │ ├── test_apply_repetition_penalties.py │ ├── test_cutlass_mla_decode.py │ ├── test_fused_quant_activation.py │ ├── test_triton_flash_attention.py │ └── utils.py ├── kv_transfer │ ├── test_disagg.py │ ├── test_lookup_buffer.py │ ├── test_lookup_buffer.sh │ ├── test_module.py │ ├── test_send_recv.py │ └── test_send_recv.sh ├── lora │ ├── __init__.py │ ├── conftest.py │ ├── test_add_lora.py │ ├── test_baichuan.py │ ├── test_chatglm3_tp.py │ ├── test_layers.py │ ├── test_llama_tp.py │ ├── test_lora_allowed_token_ids.py │ ├── test_lora_checkpoints.py │ ├── test_lora_functions.py │ ├── test_lora_huggingface.py │ ├── test_lora_manager.py │ ├── test_minicpmv_tp.py │ ├── test_mixtral.py │ ├── test_peft_helper.py │ ├── test_phi.py │ ├── test_punica_ops.py │ ├── test_quant_model.py │ ├── test_qwen2vl.py │ ├── test_resolver.py │ ├── test_tokenizer_group.py │ ├── test_transfomers_model.py │ ├── test_utils.py │ ├── test_worker.py │ └── utils.py ├── metrics │ ├── __init__.py │ └── test_metrics.py ├── mistral_tool_use │ ├── __init__.py │ ├── conftest.py │ ├── test_mistral_tool_calls.py │ └── utils.py ├── model_executor │ ├── __init__.py │ ├── conftest.py │ ├── test_enabled_custom_ops.py │ ├── test_guided_processors.py │ ├── test_logits_processor.py │ ├── test_model_load_with_params.py │ └── test_weight_utils.py ├── models │ ├── __init__.py │ ├── fixtures │ │ ├── mistral_small_3_chat.json │ │ └── pixtral_chat.json │ ├── language │ │ ├── __init__.py │ │ ├── generation │ │ │ ├── __init__.py │ │ │ ├── test_bart.py │ │ │ ├── test_common.py │ │ │ ├── test_granite.py │ │ │ ├── test_granitemoehybrid.py │ │ │ ├── test_hybrid.py │ │ │ ├── test_mistral.py │ │ │ └── test_phimoe.py │ │ └── pooling │ │ │ ├── __init__.py │ │ │ ├── embed_utils.py │ │ │ ├── mteb_utils.py │ │ │ ├── test_baai.py │ │ │ ├── test_classification.py │ │ │ ├── test_embedding.py │ │ │ ├── test_gritlm.py │ │ │ ├── test_gte.py │ │ │ ├── test_intfloat.py │ │ │ ├── test_jina.py │ │ │ ├── test_nomic.py │ │ │ ├── test_nomic_max_model_len.py │ │ │ ├── test_scoring.py │ │ │ ├── test_snowflake_arctic_embed.py │ │ │ └── test_truncation_control.py │ ├── multimodal │ │ ├── __init__.py │ │ ├── generation │ │ │ ├── __init__.py │ │ │ ├── test_common.py │ │ │ ├── test_florence2.py │ │ │ ├── test_granite_speech.py │ │ │ ├── test_interleaved.py │ │ │ ├── test_mllama.py │ │ │ ├── test_phi4mm.py │ │ │ ├── test_pixtral.py │ │ │ ├── test_qwen2_vl.py │ │ │ ├── test_ultravox.py │ │ │ ├── test_whisper.py │ │ │ └── vlm_utils │ │ │ │ ├── __init__.py │ │ │ │ ├── builders.py │ │ │ │ ├── case_filtering.py │ │ │ │ ├── core.py │ │ │ │ ├── custom_inputs.py │ │ │ │ ├── model_utils.py │ │ │ │ ├── runners.py │ │ │ │ └── types.py │ │ ├── pooling │ │ │ ├── __init__.py │ │ │ ├── test_dse_qwen2_vl.py │ │ │ ├── test_intern_vit.py │ │ │ ├── test_llava_next.py │ │ │ └── test_phi3v.py │ │ └── processing │ │ │ ├── __init__.py │ │ │ ├── test_common.py │ │ │ ├── test_h2ovl.py │ │ │ ├── test_idefics3.py │ │ │ ├── test_internvl.py │ │ │ ├── test_llama4.py │ │ │ ├── test_llava_next.py │ │ │ ├── test_llava_onevision.py │ │ │ ├── test_minimax_vl_01.py │ │ │ ├── 
test_mllama.py │ │ │ ├── test_phi3v.py │ │ │ ├── test_phi4mm.py │ │ │ ├── test_qwen2_vl.py │ │ │ └── test_smolvlm.py │ ├── quantization │ │ ├── __init__.py │ │ ├── test_aqlm.py │ │ ├── test_awq.py │ │ ├── test_bitblas.py │ │ ├── test_fp8.py │ │ ├── test_gguf.py │ │ ├── test_gptq_bitblas.py │ │ ├── test_gptq_marlin.py │ │ ├── test_gptq_marlin_24.py │ │ ├── test_modelopt.py │ │ ├── test_mxfp4.py │ │ └── test_nvfp4.py │ ├── registry.py │ ├── test_initialization.py │ ├── test_oot_registration.py │ ├── test_registry.py │ ├── test_transformers.py │ ├── test_utils.py │ ├── test_vision.py │ └── utils.py ├── mq_llm_engine │ ├── __init__.py │ ├── conftest.py │ ├── test_abort.py │ ├── test_error_handling.py │ ├── test_load.py │ └── utils.py ├── multi_step │ ├── __init__.py │ ├── test_correctness_async_llm.py │ └── test_correctness_llm.py ├── multimodal │ ├── __init__.py │ ├── assets │ │ ├── image1.png │ │ ├── image2.png │ │ └── rgba.png │ ├── test_hasher.py │ ├── test_image.py │ ├── test_inputs.py │ ├── test_processing.py │ ├── test_utils.py │ ├── test_video.py │ └── utils.py ├── neuron │ ├── 1_core │ │ ├── test_activation.py │ │ ├── test_block_table.py │ │ ├── test_cache.py │ │ ├── test_layernorm.py │ │ ├── test_logits_processor.py │ │ ├── test_neuron_model_runner.py │ │ ├── test_neuron_quant.py │ │ ├── test_prefix_prefill.py │ │ └── test_rotary_embedding.py │ └── 2_core │ │ ├── test_comm_ops.py │ │ ├── test_eagle.py │ │ ├── test_mistral.py │ │ └── test_multi_lora.py ├── plugins │ ├── lora_resolvers │ │ ├── __init__.py │ │ └── test_filesystem_resolver.py │ ├── vllm_add_dummy_model │ │ ├── setup.py │ │ └── vllm_add_dummy_model │ │ │ ├── __init__.py │ │ │ ├── my_gemma_embedding.py │ │ │ ├── my_llava.py │ │ │ └── my_opt.py │ └── vllm_add_dummy_platform │ │ ├── setup.py │ │ └── vllm_add_dummy_platform │ │ ├── __init__.py │ │ ├── dummy_attention_backend.py │ │ └── dummy_platform.py ├── plugins_tests │ ├── conftest.py │ ├── test_platform_plugins.py │ └── test_scheduler_plugins.py ├── prefix_caching │ ├── __init__.py │ ├── test_disable_sliding_window.py │ └── test_prefix_caching.py ├── prompt_adapter │ ├── test_bloom.py │ ├── test_multi_adapter_inference.py │ └── test_pa_lora.py ├── prompts │ ├── example.txt │ └── summary.txt ├── quantization │ ├── __init__.py │ ├── test_auto_round.py │ ├── test_bitsandbytes.py │ ├── test_compressed_tensors.py │ ├── test_configs.py │ ├── test_cpu_offload.py │ ├── test_experts_int8.py │ ├── test_fp8.py │ ├── test_gptq_dynamic.py │ ├── test_ipex_quant.py │ ├── test_lm_head.py │ ├── test_ptpc_fp8.py │ ├── test_quark.py │ ├── test_register_quantization_config.py │ ├── test_torchao.py │ └── utils.py ├── reasoning │ ├── __init__.py │ ├── test_deepseekr1_reasoning_parser.py │ ├── test_granite_reasoning_parser.py │ ├── test_qwen3_reasoning_parser.py │ └── utils.py ├── runai_model_streamer_test │ ├── __init__.py │ ├── test_runai_model_streamer_loader.py │ └── test_weight_utils.py ├── samplers │ ├── __init__.py │ ├── test_beam_search.py │ ├── test_ignore_eos.py │ ├── test_logits_processor.py │ ├── test_logprobs.py │ ├── test_no_bad_words.py │ ├── test_ranks.py │ ├── test_rejection_sampler.py │ ├── test_sampler.py │ ├── test_seeded_generate.py │ └── test_typical_acceptance_sampler.py ├── spec_decode │ ├── __init__.py │ ├── conftest.py │ ├── e2e │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── test_compatibility.py │ │ ├── test_eagle_correctness.py │ │ ├── test_integration.py │ │ ├── test_integration_dist_tp2.py │ │ ├── test_integration_dist_tp4.py │ │ ├── test_logprobs.py │ │ ├── 
test_medusa_correctness.py │ │ ├── test_mlp_correctness.py │ │ ├── test_mtp_correctness.py │ │ ├── test_multistep_correctness.py │ │ ├── test_ngram_correctness.py │ │ └── test_seed.py │ ├── test_batch_expansion.py │ ├── test_dynamic_spec_decode.py │ ├── test_memory_usage.py │ ├── test_metrics.py │ ├── test_multi_step_worker.py │ ├── test_ngram_worker.py │ ├── test_scorer.py │ ├── test_spec_decode_worker.py │ ├── test_utils.py │ └── utils.py ├── standalone_tests │ ├── lazy_imports.py │ └── python_only_compile.sh ├── system_messages │ └── sonnet3.5_nov2024.txt ├── tensorizer_loader │ ├── __init__.py │ ├── conftest.py │ └── test_tensorizer.py ├── test_cache_block_hashing.py ├── test_config.py ├── test_embedded_commit.py ├── test_inputs.py ├── test_logger.py ├── test_outputs.py ├── test_regression.py ├── test_sampling_params.py ├── test_scalartype.py ├── test_seed_behavior.py ├── test_sequence.py ├── test_sharded_state_loader.py ├── test_triton_utils.py ├── test_utils.py ├── test_version.py ├── test_vllm_port.py ├── tokenization │ ├── __init__.py │ ├── test_cached_tokenizer.py │ ├── test_detokenize.py │ ├── test_get_eos.py │ ├── test_mistral_tokenizer.py │ ├── test_tokenizer.py │ ├── test_tokenizer_group.py │ └── test_tokenizer_registry.py ├── tool_use │ ├── __init__.py │ ├── conftest.py │ ├── test_chat_completion_request_validations.py │ ├── test_chat_completions.py │ ├── test_jamba_tool_parser.py │ ├── test_parallel_tool_calls.py │ ├── test_tool_calls.py │ ├── test_tool_choice_required.py │ └── utils.py ├── tpu │ ├── __init__.py │ ├── lora │ │ ├── __init__.py │ │ └── test_lora.py │ ├── test_compilation.py │ ├── test_custom_dispatcher.py │ ├── test_moe_pallas.py │ └── test_quantization_accuracy.py ├── tracing │ ├── __init__.py │ └── test_tracing.py ├── utils.py ├── v1 │ ├── __init__.py │ ├── core │ │ ├── test_kv_cache_utils.py │ │ ├── test_prefix_caching.py │ │ ├── test_scheduler.py │ │ ├── test_scheduler_e2e.py │ │ └── test_specialized_manager.py │ ├── e2e │ │ ├── __init__.py │ │ ├── test_cascade_attention.py │ │ ├── test_correctness_sliding_window.py │ │ └── test_spec_decode.py │ ├── engine │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── test_async_llm.py │ │ ├── test_engine_args.py │ │ ├── test_engine_core.py │ │ ├── test_engine_core_client.py │ │ ├── test_llm_engine.py │ │ ├── test_output_processor.py │ │ └── utils.py │ ├── entrypoints │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── llm │ │ │ ├── __init__.py │ │ │ └── test_struct_output_generate.py │ │ └── openai │ │ │ ├── test_chat_completion.py │ │ │ ├── test_completion.py │ │ │ └── test_multi_api_servers.py │ ├── kv_connector │ │ ├── nixl_integration │ │ │ ├── run_accuracy_test.sh │ │ │ ├── run_edge_case_test.sh │ │ │ ├── test_accuracy.py │ │ │ ├── test_edge_cases.py │ │ │ └── toy_proxy_server.py │ │ └── unit │ │ │ ├── __init__.py │ │ │ ├── test_multi_connector.py │ │ │ ├── test_nixl_connector.py │ │ │ ├── test_remote_decode_lifecycle.py │ │ │ ├── test_remote_prefill_lifecycle.py │ │ │ └── utils.py │ ├── metrics │ │ └── test_ray_metrics.py │ ├── sample │ │ ├── __init__.py │ │ ├── test_logprobs.py │ │ ├── test_logprobs_e2e.py │ │ ├── test_rejection_sampler.py │ │ ├── test_sampler.py │ │ ├── test_sampling_params_e2e.py │ │ ├── test_topk_topp_sampler.py │ │ └── utils.py │ ├── shutdown │ │ ├── test_delete.py │ │ ├── test_forward_error.py │ │ ├── test_processor_error.py │ │ ├── test_startup_error.py │ │ └── utils.py │ ├── spec_decode │ │ ├── test_eagle.py │ │ ├── test_max_len.py │ │ └── test_ngram.py │ ├── structured_output │ │ ├── 
__init__.py │ │ └── test_utils.py │ ├── test_async_llm_dp.py │ ├── test_metrics_reader.py │ ├── test_oracle.py │ ├── test_serial_utils.py │ ├── test_utils.py │ ├── tpu │ │ ├── __init__.py │ │ ├── test_basic.py │ │ ├── test_mha_attn.py │ │ ├── test_multimodal.py │ │ ├── test_pallas.py │ │ ├── test_perf.py │ │ ├── test_sampler.py │ │ ├── test_spmd_model_weight_loading.py │ │ ├── test_topk_topp_sampler.py │ │ ├── test_tpu_qkv_linear.py │ │ └── worker │ │ │ ├── __init__.py │ │ │ └── test_tpu_model_runner.py │ └── worker │ │ ├── __init__.py │ │ ├── test_gpu_input_batch.py │ │ └── test_gpu_model_runner.py ├── vllm_test_utils │ ├── setup.py │ └── vllm_test_utils │ │ ├── __init__.py │ │ ├── blame.py │ │ └── monitor.py ├── weight_loading │ ├── models-large.txt │ ├── models.txt │ ├── run_model_weight_loading_test.sh │ └── test_weight_loading.py └── worker │ ├── __init__.py │ ├── conftest.py │ ├── test_encoder_decoder_model_runner.py │ ├── test_model_input.py │ ├── test_model_runner.py │ ├── test_profile.py │ └── test_swap.py ├── tools ├── check_repo.sh ├── check_spdx_header.py ├── check_triton_import.py ├── enforce_regex_import.py ├── ep_kernels │ ├── README.md │ ├── install_python_libraries.sh │ ├── install_system_drivers.sh │ └── install_system_libraries.sh ├── install_nixl.sh ├── mypy.sh ├── png-lint.sh ├── profiler │ ├── print_layerwise_table.py │ └── visualize_layerwise_profile.py ├── report_build_time_ninja.py ├── shellcheck.sh └── update-dockerfile-graph.sh ├── use_existing_torch.py └── vllm ├── __init__.py ├── _custom_ops.py ├── _ipex_ops.py ├── adapter_commons ├── __init__.py ├── layers.py ├── models.py ├── request.py ├── utils.py └── worker_manager.py ├── assets ├── __init__.py ├── audio.py ├── base.py ├── image.py └── video.py ├── attention ├── __init__.py ├── backends │ ├── __init__.py │ ├── abstract.py │ ├── blocksparse_attn.py │ ├── cpu_mla.py │ ├── dual_chunk_flash_attn.py │ ├── flash_attn.py │ ├── flashinfer.py │ ├── flashmla.py │ ├── hpu_attn.py │ ├── ipex_attn.py │ ├── mla │ │ ├── __init__.py │ │ └── common.py │ ├── pallas.py │ ├── placeholder_attn.py │ ├── rocm_aiter_mla.py │ ├── rocm_flash_attn.py │ ├── torch_sdpa.py │ ├── triton_mla.py │ ├── utils.py │ └── xformers.py ├── layer.py ├── ops │ ├── __init__.py │ ├── blocksparse_attention │ │ ├── __init__.py │ │ ├── blocksparse_attention_kernel.py │ │ ├── interface.py │ │ └── utils.py │ ├── chunked_prefill_paged_decode.py │ ├── flashmla.py │ ├── hpu_paged_attn.py │ ├── ipex_attn.py │ ├── merge_attn_states.py │ ├── nki_flash_attn.py │ ├── paged_attn.py │ ├── prefix_prefill.py │ ├── rocm_aiter_mla.py │ ├── rocm_aiter_paged_attn.py │ ├── triton_decode_attention.py │ ├── triton_flash_attention.py │ ├── triton_merge_attn_states.py │ └── triton_unified_attention.py ├── selector.py └── utils │ └── fa_utils.py ├── beam_search.py ├── benchmarks ├── __init__.py ├── datasets.py ├── endpoint_request_func.py ├── latency.py ├── serve.py ├── throughput.py └── utils.py ├── collect_env.py ├── compilation ├── __init__.py ├── activation_quant_fusion.py ├── backends.py ├── base_piecewise_backend.py ├── collective_fusion.py ├── compiler_interface.py ├── counter.py ├── cuda_piecewise_backend.py ├── decorators.py ├── fix_functionalization.py ├── fusion.py ├── fx_utils.py ├── inductor_pass.py ├── monitor.py ├── multi_output_match.py ├── noop_elimination.py ├── pass_manager.py ├── sequence_parallelism.py ├── torch25_custom_graph_pass.py ├── vllm_inductor_pass.py └── wrapper.py ├── config.py ├── connections.py ├── core ├── __init__.py ├── block │ ├── 
__init__.py │ ├── block_table.py │ ├── common.py │ ├── cpu_gpu_block_allocator.py │ ├── interfaces.py │ ├── naive_block.py │ ├── prefix_caching_block.py │ └── utils.py ├── block_manager.py ├── evictor.py ├── interfaces.py ├── placeholder_block_space_manager.py └── scheduler.py ├── device_allocator ├── __init__.py └── cumem.py ├── distributed ├── __init__.py ├── communication_op.py ├── device_communicators │ ├── __init__.py │ ├── all2all.py │ ├── base_device_communicator.py │ ├── cpu_communicator.py │ ├── cuda_communicator.py │ ├── cuda_wrapper.py │ ├── custom_all_reduce.py │ ├── custom_all_reduce_utils.py │ ├── hpu_communicator.py │ ├── neuron_communicator.py │ ├── pynccl.py │ ├── pynccl_wrapper.py │ ├── shm_broadcast.py │ ├── tpu_communicator.py │ └── xpu_communicator.py ├── kv_events.py ├── kv_transfer │ ├── README.md │ ├── __init__.py │ ├── disagg_prefill_workflow.jpg │ ├── kv_connector │ │ ├── __init__.py │ │ ├── base.py │ │ ├── factory.py │ │ ├── lmcache_connector.py │ │ ├── mooncake_store_connector.py │ │ ├── simple_connector.py │ │ ├── utils.py │ │ └── v1 │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── lmcache_connector.py │ │ │ ├── multi_connector.py │ │ │ ├── nixl_connector.py │ │ │ └── shared_storage_connector.py │ ├── kv_connector_agent.py │ ├── kv_lookup_buffer │ │ ├── __init__.py │ │ ├── base.py │ │ ├── mooncake_store.py │ │ └── simple_buffer.py │ ├── kv_pipe │ │ ├── __init__.py │ │ ├── base.py │ │ ├── mooncake_pipe.py │ │ └── pynccl_pipe.py │ └── kv_transfer_state.py ├── parallel_state.py ├── tpu_distributed_utils.py └── utils.py ├── engine ├── __init__.py ├── arg_utils.py ├── async_llm_engine.py ├── async_timeout.py ├── llm_engine.py ├── metrics.py ├── metrics_types.py ├── multiprocessing │ ├── __init__.py │ ├── client.py │ └── engine.py ├── output_processor │ ├── __init__.py │ ├── interfaces.py │ ├── multi_step.py │ ├── single_step.py │ ├── stop_checker.py │ └── util.py └── protocol.py ├── entrypoints ├── __init__.py ├── api_server.py ├── chat_utils.py ├── cli │ ├── __init__.py │ ├── benchmark │ │ ├── __init__.py │ │ ├── base.py │ │ ├── latency.py │ │ ├── main.py │ │ ├── serve.py │ │ └── throughput.py │ ├── collect_env.py │ ├── main.py │ ├── openai.py │ ├── run_batch.py │ ├── serve.py │ └── types.py ├── launcher.py ├── llm.py ├── logger.py ├── openai │ ├── __init__.py │ ├── api_server.py │ ├── cli_args.py │ ├── logits_processors.py │ ├── protocol.py │ ├── run_batch.py │ ├── serving_chat.py │ ├── serving_classification.py │ ├── serving_completion.py │ ├── serving_embedding.py │ ├── serving_engine.py │ ├── serving_models.py │ ├── serving_pooling.py │ ├── serving_score.py │ ├── serving_tokenization.py │ ├── serving_transcription.py │ └── tool_parsers │ │ ├── __init__.py │ │ ├── abstract_tool_parser.py │ │ ├── deepseekv3_tool_parser.py │ │ ├── granite_20b_fc_tool_parser.py │ │ ├── granite_tool_parser.py │ │ ├── hermes_tool_parser.py │ │ ├── internlm2_tool_parser.py │ │ ├── jamba_tool_parser.py │ │ ├── llama4_pythonic_tool_parser.py │ │ ├── llama_tool_parser.py │ │ ├── mistral_tool_parser.py │ │ ├── phi4mini_tool_parser.py │ │ ├── pythonic_tool_parser.py │ │ └── utils.py ├── score_utils.py ├── ssl.py └── utils.py ├── env_override.py ├── envs.py ├── executor ├── __init__.py ├── executor_base.py ├── mp_distributed_executor.py ├── msgspec_utils.py ├── multiproc_worker_utils.py ├── ray_distributed_executor.py ├── ray_utils.py └── uniproc_executor.py ├── forward_context.py ├── inputs ├── __init__.py ├── data.py ├── parse.py ├── preprocess.py └── registry.py ├── jsontree.py ├── 
logger.py ├── logging_utils ├── __init__.py ├── dump_input.py └── formatter.py ├── logits_process.py ├── lora ├── __init__.py ├── fully_sharded_layers.py ├── layers.py ├── lora.py ├── models.py ├── ops │ ├── __init__.py │ ├── torch_ops │ │ ├── __init__.py │ │ └── lora_ops.py │ ├── triton_ops │ │ ├── __init__.py │ │ ├── kernel_utils.py │ │ ├── lora_expand_op.py │ │ ├── lora_kernel_metadata.py │ │ ├── lora_shrink_op.py │ │ └── utils.py │ └── xla_ops │ │ ├── __init__.py │ │ └── lora_ops.py ├── peft_helper.py ├── punica_wrapper │ ├── __init__.py │ ├── punica_base.py │ ├── punica_cpu.py │ ├── punica_gpu.py │ ├── punica_hpu.py │ ├── punica_selector.py │ ├── punica_tpu.py │ └── utils.py ├── request.py ├── resolver.py ├── utils.py └── worker_manager.py ├── model_executor ├── __init__.py ├── custom_op.py ├── guided_decoding │ ├── __init__.py │ ├── guidance_decoding.py │ ├── guidance_logits_processors.py │ ├── guided_fields.py │ ├── lm_format_enforcer_decoding.py │ ├── outlines_decoding.py │ ├── outlines_logits_processors.py │ ├── utils.py │ └── xgrammar_decoding.py ├── layers │ ├── __init__.py │ ├── activation.py │ ├── fused_moe │ │ ├── __init__.py │ │ ├── batched_deep_gemm_moe.py │ │ ├── batched_triton_or_deep_gemm_moe.py │ │ ├── configs │ │ │ ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json │ │ │ ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=128,N=1024,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=128,N=192,device_name=NVIDIA_H20.json │ │ │ ├── E=128,N=192,device_name=NVIDIA_H200.json │ │ │ ├── E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=128,N=384,device_name=NVIDIA_H20.json │ │ │ ├── E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=128,N=384,device_name=NVIDIA_H200.json │ │ │ ├── E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=128,N=768,device_name=NVIDIA_H20.json │ │ │ ├── E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=128,N=768,device_name=NVIDIA_H200.json │ │ │ ├── E=128,N=96,device_name=NVIDIA_H20.json │ │ │ ├── E=16,N=1024,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=16,N=1024,device_name=NVIDIA_H100.json │ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── 
E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json │ │ │ ├── E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json │ │ │ ├── E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json │ │ │ ├── E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json │ │ │ ├── E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json │ │ │ ├── E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json │ │ │ ├── E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json │ │ │ ├── E=60,N=1408,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=60,N=176,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=60,N=352,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=60,N=704,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json │ │ │ ├── E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=1280,device_name=NVIDIA_H200.json │ │ │ ├── 
E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=2560,device_name=NVIDIA_H200.json │ │ │ ├── E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=320,device_name=NVIDIA_H200.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_H200.json │ │ │ ├── E=64,N=896,device_name=NVIDIA_H20.json │ │ │ ├── E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=14336,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=14336,device_name=AMD_Instinct_MI325X.json │ │ │ ├── E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=14336,device_name=NVIDIA_H200.json │ │ │ ├── E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=16384,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=16384,device_name=AMD_Instinct_MI325X.json │ │ │ ├── E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=1792,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=1792,device_name=AMD_Instinct_MI325X.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_H200.json │ │ │ ├── E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=2048,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=2048,device_name=AMD_Instinct_MI325X.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H200.json │ │ │ ├── E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=3584,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=3584,device_name=AMD_Instinct_MI325X.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H200.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_L40S.json │ │ │ ├── 
E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=4096,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=4096,device_name=AMD_Instinct_MI325X.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H200.json │ │ │ ├── E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=7168,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=7168,device_name=AMD_Instinct_MI325X.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H200.json │ │ │ ├── E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=8192,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=8192,device_name=AMD_Instinct_MI325X.json │ │ │ ├── E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ └── README │ │ ├── cutlass_moe.py │ │ ├── deep_gemm_moe.py │ │ ├── deepep_ht_prepare_finalize.py │ │ ├── deepep_ll_prepare_finalize.py │ │ ├── fused_batched_moe.py │ │ ├── fused_marlin_moe.py │ │ ├── fused_moe.py │ │ ├── layer.py │ │ ├── modular_kernel.py │ │ ├── moe_align_block_size.py │ │ ├── moe_pallas.py │ │ ├── moe_permute_unpermute.py │ │ ├── moe_torch_iterative.py │ │ ├── pplx_prepare_finalize.py │ │ ├── prepare_finalize.py │ │ ├── rocm_aiter_fused_moe.py │ │ ├── triton_deep_gemm_moe.py │ │ └── utils.py │ ├── layernorm.py │ ├── lightning_attn.py │ ├── linear.py │ ├── logits_processor.py │ ├── mamba │ │ ├── __init__.py │ │ ├── mamba2_metadata.py │ │ ├── mamba_mixer.py │ │ ├── mamba_mixer2.py │ │ └── ops │ │ │ ├── __init__.py │ │ │ ├── causal_conv1d.py │ │ │ ├── mamba_ssm.py │ │ │ ├── ssd_bmm.py │ │ │ ├── ssd_chunk_scan.py │ │ │ ├── ssd_chunk_state.py │ │ │ ├── ssd_combined.py │ │ │ └── ssd_state_passing.py │ ├── pooler.py │ ├── quantization │ │ ├── __init__.py │ │ ├── aqlm.py │ │ ├── auto_round.py │ │ ├── awq.py │ │ ├── awq_marlin.py │ │ ├── awq_triton.py │ │ ├── base_config.py │ │ ├── bitblas.py │ │ ├── bitsandbytes.py │ │ ├── compressed_tensors │ │ │ ├── __init__.py │ │ │ ├── compressed_tensors.py │ │ │ ├── compressed_tensors_moe.py │ │ │ ├── schemes │ │ │ │ ├── __init__.py │ │ │ │ ├── compressed_tensors_24.py │ │ │ │ ├── compressed_tensors_scheme.py │ │ │ │ ├── compressed_tensors_w4a16_24.py │ │ │ │ ├── compressed_tensors_w4a16_nvfp4.py │ │ │ │ ├── compressed_tensors_w8a16_fp8.py │ │ │ │ ├── compressed_tensors_w8a8_fp8.py │ │ │ │ ├── compressed_tensors_w8a8_int8.py │ │ │ │ └── compressed_tensors_wNa16.py │ │ │ ├── triton_scaled_mm.py │ │ │ └── utils.py │ │ ├── deepspeedfp.py │ │ ├── experts_int8.py │ │ ├── fbgemm_fp8.py │ │ ├── fp8.py │ │ ├── gguf.py │ │ ├── gptq.py │ │ ├── gptq_bitblas.py │ │ ├── gptq_marlin.py │ │ ├── gptq_marlin_24.py │ │ ├── hqq_marlin.py │ │ ├── ipex_quant.py │ │ ├── kernels │ │ │ ├── __init__.py │ │ │ ├── mixed_precision │ │ │ │ ├── 
MPLinearKernel.py │ │ │ │ ├── __init__.py │ │ │ │ ├── allspark.py │ │ │ │ ├── bitblas.py │ │ │ │ ├── exllama.py │ │ │ │ ├── machete.py │ │ │ │ └── marlin.py │ │ │ └── scaled_mm │ │ │ │ ├── ScaledMMLinearKernel.py │ │ │ │ ├── __init__.py │ │ │ │ ├── aiter.py │ │ │ │ ├── cutlass.py │ │ │ │ ├── triton.py │ │ │ │ └── xla.py │ │ ├── kv_cache.py │ │ ├── marlin.py │ │ ├── modelopt.py │ │ ├── moe_wna16.py │ │ ├── neuron_quant.py │ │ ├── ptpc_fp8.py │ │ ├── qqq.py │ │ ├── quark │ │ │ ├── __init__.py │ │ │ ├── quark.py │ │ │ ├── quark_moe.py │ │ │ ├── schemes │ │ │ │ ├── __init__.py │ │ │ │ ├── quark_scheme.py │ │ │ │ ├── quark_w4a4_mxfp4.py │ │ │ │ ├── quark_w8a8_fp8.py │ │ │ │ └── quark_w8a8_int8.py │ │ │ └── utils.py │ │ ├── schema.py │ │ ├── torchao.py │ │ ├── tpu_int8.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── allspark_utils.py │ │ │ ├── bitblas_utils.py │ │ │ ├── configs │ │ │ ├── N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── 
N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── 
N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── 
N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── 
N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── 
N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ └── N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── fp8_utils.py │ │ │ ├── gptq_utils.py │ │ │ ├── int8_utils.py │ │ │ ├── layer_utils.py │ │ │ ├── machete_utils.py │ │ │ ├── marlin_utils.py │ │ │ ├── marlin_utils_fp4.py │ │ │ ├── marlin_utils_fp8.py │ │ │ ├── marlin_utils_test.py │ │ │ ├── marlin_utils_test_24.py │ │ │ ├── marlin_utils_test_qqq.py │ │ │ ├── mxfp4_utils.py │ │ │ ├── nvfp4_emulation_utils.py │ │ │ ├── quant_utils.py │ │ │ └── w8a8_utils.py │ ├── rejection_sampler.py │ ├── resampler.py │ ├── rotary_embedding.py │ ├── sampler.py │ ├── spec_decode_base_sampler.py │ ├── typical_acceptance_sampler.py │ ├── utils.py │ └── vocab_parallel_embedding.py ├── model_loader │ ├── __init__.py │ ├── base_loader.py │ ├── bitsandbytes_loader.py │ ├── default_loader.py │ ├── dummy_loader.py │ ├── gguf_loader.py │ ├── neuron.py │ ├── neuronx_distributed.py │ ├── runai_streamer_loader.py │ ├── sharded_state_loader.py │ ├── tensorizer.py │ ├── tensorizer_loader.py │ ├── tpu.py │ ├── utils.py │ └── weight_utils.py ├── models │ ├── 
__init__.py │ ├── adapters.py │ ├── aimv2.py │ ├── arctic.py │ ├── aria.py │ ├── aya_vision.py │ ├── baichuan.py │ ├── bamba.py │ ├── bart.py │ ├── bert.py │ ├── bert_with_rope.py │ ├── blip.py │ ├── blip2.py │ ├── bloom.py │ ├── chameleon.py │ ├── chatglm.py │ ├── clip.py │ ├── commandr.py │ ├── constant_size_cache.py │ ├── dbrx.py │ ├── deepseek.py │ ├── deepseek_mtp.py │ ├── deepseek_v2.py │ ├── deepseek_vl2.py │ ├── eagle.py │ ├── exaone.py │ ├── fairseq2_llama.py │ ├── falcon.py │ ├── falcon_h1.py │ ├── florence2.py │ ├── fuyu.py │ ├── gemma.py │ ├── gemma2.py │ ├── gemma3.py │ ├── gemma3_mm.py │ ├── glm.py │ ├── glm4.py │ ├── glm4v.py │ ├── gpt2.py │ ├── gpt_bigcode.py │ ├── gpt_j.py │ ├── gpt_neox.py │ ├── granite.py │ ├── granite_speech.py │ ├── granitemoe.py │ ├── granitemoehybrid.py │ ├── granitemoeshared.py │ ├── gritlm.py │ ├── grok1.py │ ├── h2ovl.py │ ├── idefics2_vision_model.py │ ├── idefics3.py │ ├── interfaces.py │ ├── interfaces_base.py │ ├── intern_vit.py │ ├── internlm2.py │ ├── internlm2_ve.py │ ├── internvl.py │ ├── jais.py │ ├── jamba.py │ ├── kimi_vl.py │ ├── llama.py │ ├── llama4.py │ ├── llama_eagle.py │ ├── llama_eagle3.py │ ├── llava.py │ ├── llava_next.py │ ├── llava_next_video.py │ ├── llava_onevision.py │ ├── mamba.py │ ├── mamba2.py │ ├── mamba_cache.py │ ├── medusa.py │ ├── mimo.py │ ├── mimo_mtp.py │ ├── minicpm.py │ ├── minicpm3.py │ ├── minicpm_eagle.py │ ├── minicpmo.py │ ├── minicpmv.py │ ├── minimax_cache.py │ ├── minimax_text_01.py │ ├── minimax_vl_01.py │ ├── mistral3.py │ ├── mixtral.py │ ├── mixtral_quant.py │ ├── mllama.py │ ├── mllama4.py │ ├── mlp_speculator.py │ ├── modernbert.py │ ├── module_mapping.py │ ├── molmo.py │ ├── moonvit.py │ ├── mpt.py │ ├── nemotron.py │ ├── nemotron_nas.py │ ├── nvlm_d.py │ ├── olmo.py │ ├── olmo2.py │ ├── olmoe.py │ ├── opt.py │ ├── orion.py │ ├── ovis.py │ ├── paligemma.py │ ├── persimmon.py │ ├── phi.py │ ├── phi3.py │ ├── phi3_small.py │ ├── phi3v.py │ ├── phi4mm.py │ ├── phi4mm_audio.py │ ├── phi4mm_utils.py │ ├── phimoe.py │ ├── pixtral.py │ ├── plamo2.py │ ├── prithvi_geospatial_mae.py │ ├── qwen.py │ ├── qwen2.py │ ├── qwen2_5_omni_thinker.py │ ├── qwen2_5_vl.py │ ├── qwen2_audio.py │ ├── qwen2_moe.py │ ├── qwen2_rm.py │ ├── qwen2_vl.py │ ├── qwen3.py │ ├── qwen3_moe.py │ ├── qwen_vl.py │ ├── registry.py │ ├── roberta.py │ ├── siglip.py │ ├── skyworkr1v.py │ ├── smolvlm.py │ ├── solar.py │ ├── stablelm.py │ ├── starcoder2.py │ ├── tarsier.py │ ├── telechat2.py │ ├── teleflm.py │ ├── transformers.py │ ├── ultravox.py │ ├── utils.py │ ├── vision.py │ ├── whisper.py │ └── zamba2.py ├── parameter.py ├── pooling_metadata.py ├── sampling_metadata.py └── utils.py ├── multimodal ├── __init__.py ├── audio.py ├── base.py ├── hasher.py ├── image.py ├── inputs.py ├── parse.py ├── processing.py ├── profiling.py ├── registry.py ├── utils.py └── video.py ├── outputs.py ├── platforms ├── __init__.py ├── cpu.py ├── cuda.py ├── hpu.py ├── interface.py ├── neuron.py ├── rocm.py ├── tpu.py └── xpu.py ├── plugins ├── __init__.py └── lora_resolvers │ ├── README.md │ ├── __init__.py │ └── filesystem_resolver.py ├── pooling_params.py ├── profiler ├── __init__.py ├── layerwise_profile.py └── utils.py ├── prompt_adapter ├── __init__.py ├── layers.py ├── models.py ├── request.py ├── utils.py └── worker_manager.py ├── py.typed ├── reasoning ├── __init__.py ├── abs_reasoning_parsers.py ├── deepseek_r1_reasoning_parser.py ├── granite_reasoning_parser.py └── qwen3_reasoning_parser.py ├── sampling_params.py ├── scalar_type.py ├── 
scripts.py ├── sequence.py ├── spec_decode ├── __init__.py ├── batch_expansion.py ├── draft_model_runner.py ├── interfaces.py ├── medusa_worker.py ├── metrics.py ├── mlp_speculator_worker.py ├── mqa_scorer.py ├── multi_step_worker.py ├── ngram_worker.py ├── proposer_worker_base.py ├── smaller_tp_proposer_worker.py ├── spec_decode_worker.py ├── target_model_runner.py ├── top1_proposer.py └── util.py ├── test_utils.py ├── third_party ├── __init__.py └── pynvml.py ├── tracing.py ├── transformers_utils ├── __init__.py ├── chat_templates │ ├── __init__.py │ ├── registry.py │ ├── template_basic.jinja │ ├── template_blip2.jinja │ ├── template_chatml.jinja │ ├── template_deepseek_vl2.jinja │ └── template_fuyu.jinja ├── config.py ├── configs │ ├── __init__.py │ ├── arctic.py │ ├── chatglm.py │ ├── cohere2.py │ ├── dbrx.py │ ├── deepseek_vl2.py │ ├── eagle.py │ ├── exaone.py │ ├── falcon.py │ ├── h2ovl.py │ ├── internvl.py │ ├── jais.py │ ├── kimi_vl.py │ ├── medusa.py │ ├── minimax_text_01.py │ ├── minimax_vl_01.py │ ├── mllama.py │ ├── mlp_speculator.py │ ├── moonvit.py │ ├── mpt.py │ ├── nemotron.py │ ├── nvlm_d.py │ ├── ovis.py │ ├── skyworkr1v.py │ ├── solar.py │ ├── telechat2.py │ └── ultravox.py ├── detokenizer.py ├── detokenizer_utils.py ├── processor.py ├── processors │ ├── __init__.py │ ├── deepseek_vl2.py │ └── ovis.py ├── s3_utils.py ├── tokenizer.py ├── tokenizer_base.py ├── tokenizer_group.py ├── tokenizers │ ├── __init__.py │ └── mistral.py └── utils.py ├── triton_utils ├── __init__.py └── importing.py ├── usage ├── __init__.py └── usage_lib.py ├── utils.py ├── v1 ├── __init__.py ├── attention │ ├── __init__.py │ └── backends │ │ ├── __init__.py │ │ ├── cpu_attn.py │ │ ├── flash_attn.py │ │ ├── flashinfer.py │ │ ├── mla │ │ ├── __init__.py │ │ ├── common.py │ │ ├── cutlass_mla.py │ │ ├── flashmla.py │ │ ├── rocm_aiter_mla.py │ │ └── triton_mla.py │ │ ├── pallas.py │ │ ├── triton_attn.py │ │ └── utils.py ├── core │ ├── __init__.py │ ├── block_pool.py │ ├── encoder_cache_manager.py │ ├── kv_cache_manager.py │ ├── kv_cache_utils.py │ ├── sched │ │ ├── __init__.py │ │ ├── interface.py │ │ ├── output.py │ │ ├── scheduler.py │ │ └── utils.py │ └── single_type_kv_cache_manager.py ├── engine │ ├── __init__.py │ ├── async_llm.py │ ├── coordinator.py │ ├── core.py │ ├── core_client.py │ ├── detokenizer.py │ ├── exceptions.py │ ├── llm_engine.py │ ├── logprobs.py │ ├── mm_input_cache.py │ ├── output_processor.py │ ├── parallel_sampling.py │ └── processor.py ├── executor │ ├── __init__.py │ ├── abstract.py │ ├── multiproc_executor.py │ └── ray_distributed_executor.py ├── kv_cache_interface.py ├── metrics │ ├── __init__.py │ ├── loggers.py │ ├── prometheus.py │ ├── ray_wrappers.py │ ├── reader.py │ └── stats.py ├── outputs.py ├── request.py ├── sample │ ├── __init__.py │ ├── metadata.py │ ├── ops │ │ ├── __init__.py │ │ ├── bad_words.py │ │ ├── penalties.py │ │ └── topk_topp_sampler.py │ ├── rejection_sampler.py │ ├── sampler.py │ └── tpu │ │ ├── __init__.py │ │ ├── metadata.py │ │ └── sampler.py ├── serial_utils.py ├── spec_decode │ ├── __init__.py │ ├── eagle.py │ ├── medusa.py │ ├── metadata.py │ ├── metrics.py │ ├── ngram_proposer.py │ └── utils.py ├── structured_output │ ├── __init__.py │ ├── backend_guidance.py │ ├── backend_types.py │ ├── backend_xgrammar.py │ ├── request.py │ └── utils.py ├── utils.py └── worker │ ├── __init__.py │ ├── block_table.py │ ├── cpu_model_runner.py │ ├── cpu_worker.py │ ├── gpu_input_batch.py │ ├── gpu_model_runner.py │ ├── gpu_worker.py │ ├── 
lora_model_runner_mixin.py │ ├── tpu_model_runner.py │ ├── tpu_worker.py │ ├── utils.py │ └── worker_base.py ├── version.py ├── vllm_flash_attn └── .gitkeep └── worker ├── __init__.py ├── cache_engine.py ├── cpu_enc_dec_model_runner.py ├── cpu_model_runner.py ├── cpu_pooling_model_runner.py ├── cpu_worker.py ├── enc_dec_model_runner.py ├── hpu_model_runner.py ├── hpu_worker.py ├── model_runner.py ├── model_runner_base.py ├── multi_step_hpu_worker.py ├── multi_step_model_runner.py ├── multi_step_neuron_model_runner.py ├── multi_step_neuronx_distributed_model_runner.py ├── multi_step_tpu_worker.py ├── multi_step_worker.py ├── neuron_model_runner.py ├── neuron_worker.py ├── neuronx_distributed_model_runner.py ├── pooling_model_runner.py ├── tpu_model_runner.py ├── tpu_worker.py ├── utils.py ├── worker.py ├── worker_base.py ├── xpu_model_runner.py └── xpu_worker.py /.buildkite/generate_index.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import argparse 5 | import os 6 | 7 | template = """ 8 | 9 | 10 |

<h1>Links for vLLM</h1>
11 | <a href="../{wheel_html_escaped}">{wheel}</a><br/>
12 | 13 | 14 | """ 15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("--wheel", help="The wheel path.", required=True) 18 | args = parser.parse_args() 19 | 20 | filename = os.path.basename(args.wheel) 21 | 22 | with open("index.html", "w") as f: 23 | print(f"Generated index.html for {args.wheel}") 24 | # cloudfront requires escaping the '+' character 25 | f.write( 26 | template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B")) 27 | ) 28 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2 3 | model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.671 9 | - name: "exact_match,flexible-extract" 10 | value: 0.664 11 | limit: 1000 12 | num_fewshot: 5 13 | trust_remote_code: True -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml: -------------------------------------------------------------------------------- 1 | # For hf script, without -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 3 | model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.905 9 | - name: "exact_match,flexible-extract" 10 | value: 0.905 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml: -------------------------------------------------------------------------------- 1 | # For hf script, without -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5 3 | model_name: "meta-llama/Meta-Llama-3-70B-Instruct" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.892 9 | - name: "exact_match,flexible-extract" 10 | value: 0.892 11 | limit: 250 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1 3 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.752 9 | - name: "exact_match,flexible-extract" 10 | value: 0.754 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1 3 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.753 9 | - name: "exact_match,flexible-extract" 10 | value: 0.753 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1 3 | model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.755 9 | - name: "exact_match,flexible-extract" 10 | value: 0.755 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1 3 | model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.753 9 | - name: "exact_match,flexible-extract" 10 | value: 0.753 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1 3 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.764 9 | - name: "exact_match,flexible-extract" 10 | value: 0.764 11 | limit: 250 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1 3 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.728 9 | - name: "exact_match,flexible-extract" 10 | value: 0.728 11 | limit: 250 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1 3 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.758 9 | - name: "exact_match,flexible-extract" 10 | value: 0.759 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml: -------------------------------------------------------------------------------- 1 | # For hf script, without -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 3 | model_name: "meta-llama/Meta-Llama-3-8B-Instruct" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.756 9 | - name: "exact_match,flexible-extract" 10 | value: 0.752 11 | limit: 250 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1 3 | model_name: "HandH1998/QQQ-Llama-3-8b-g128" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.419 9 | - name: "exact_match,flexible-extract" 10 | value: 0.416 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1 2 | model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.335 8 | - name: "exact_match,flexible-extract" 9 | value: 0.323 10 | limit: 1319 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 3 | model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.356 9 | - name: "exact_match,flexible-extract" 10 | value: 0.358 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1 3 | model_name: "mgoin/Minitron-4B-Base-FP8" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.231 9 | - name: "exact_match,flexible-extract" 10 | value: 0.22 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8 3 | model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.86 9 | - name: "exact_match,flexible-extract" 10 | value: 0.86 11 | limit: 250 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 
2 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4 3 | model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.624 9 | - name: "exact_match,flexible-extract" 10 | value: 0.624 11 | limit: 250 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml: -------------------------------------------------------------------------------- 1 | # For hf script, without -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 3 | model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.616 9 | - name: "exact_match,flexible-extract" 10 | value: 0.632 11 | limit: 250 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1 3 | model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.30 9 | - name: "exact_match,flexible-extract" 10 | value: 0.465 11 | limit: 1319 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1 3 | model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.578 9 | - name: "exact_match,flexible-extract" 10 | value: 0.585 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 3 | model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.593 9 | - name: "exact_match,flexible-extract" 10 | value: 0.588 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1 3 | model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.595 9 | - name: "exact_match,flexible-extract" 10 | value: 0.582 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4 3 | model_name: "Qwen/Qwen2-57B-A14B-Instruct" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.792 9 | - name: "exact_match,flexible-extract" 10 | value: 0.824 11 | limit: 250 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml: -------------------------------------------------------------------------------- 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1 2 | model_name: "Qwen/Qwen2.5-1.5B-Instruct" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.54 8 | - name: "exact_match,flexible-extract" 9 | value: 0.59 10 | limit: 1319 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml: -------------------------------------------------------------------------------- 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1 2 | model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.47 8 | - name: "exact_match,flexible-extract" 9 | value: 0.64 10 | limit: 1319 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 
2 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2 3 | model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.6353 9 | - name: "exact_match,flexible-extract" 10 | value: 0.637 11 | limit: null 12 | num_fewshot: null 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/models-large.txt: -------------------------------------------------------------------------------- 1 | Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml 2 | Meta-Llama-3-70B-Instruct.yaml 3 | Mixtral-8x7B-Instruct-v0.1.yaml 4 | Qwen2-57B-A14-Instruct.yaml 5 | DeepSeek-V2-Lite-Chat.yaml 6 | Meta-Llama-3-8B-QQQ.yaml 7 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/models-small.txt: -------------------------------------------------------------------------------- 1 | Qwen2.5-1.5B-Instruct.yaml 2 | Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml 3 | Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml 4 | Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml 5 | Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml 6 | Qwen1.5-MoE-W4A16-compressed-tensors.yaml 7 | -------------------------------------------------------------------------------- /.buildkite/nightly-benchmarks/nightly-annotation.md: -------------------------------------------------------------------------------- 1 | 2 | ## Description 3 | 4 | This file contains the downloading link for benchmarking results. 5 | 6 | - [benchmarking pipeline](artifact://nightly-pipeline.yaml) 7 | - [benchmarking results](artifact://results.zip) 8 | - [benchmarking code](artifact://nightly-benchmarks.zip) 9 | 10 | Please download the visualization scripts in the post 11 | 12 | ## Results reproduction 13 | 14 | - Find the docker we use in `benchmarking pipeline` 15 | - Deploy the docker, and inside the docker: 16 | - Download `nightly-benchmarks.zip`. 17 | - In the same folder, run the following code: 18 | 19 | ```console 20 | export HF_TOKEN= 21 | apt update 22 | apt install -y git 23 | unzip nightly-benchmarks.zip 24 | VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh 25 | ``` 26 | 27 | And the results will be inside `./benchmarks/results`. 
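
The reproduction steps above can be collected into a single wrapper script. The sketch below only re-packages the documented flow: the `nightly-benchmarks.zip` artifact, the `VLLM_SOURCE_CODE_LOC` variable, the `run-nightly-benchmarks.sh` path, and the results location all come from the annotation above; the `set -euo pipefail` guard, the `HF_TOKEN` pre-check, and the assumption that the zip sits in the current working directory inside the benchmarking docker image are additions.

```bash
#!/bin/bash
# Sketch: reproduce the nightly benchmarks inside the benchmarking docker image,
# following the steps documented above. Assumes nightly-benchmarks.zip has already
# been downloaded into the current directory.
set -euo pipefail

# The benchmarks need a Hugging Face token; fail early if it is not set.
: "${HF_TOKEN:?Set HF_TOKEN to your Hugging Face token before running}"
export HF_TOKEN

apt update
apt install -y git

# Unpack the benchmarking code (assumes the unzip tool is available in the image).
unzip -o nightly-benchmarks.zip

# Point the harness at the unpacked source tree and run the full nightly suite.
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh

echo "Results written to ./benchmarks/results"
```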
28 | -------------------------------------------------------------------------------- /.buildkite/nightly-benchmarks/scripts/download-tokenizer.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import argparse 5 | 6 | from transformers import AutoTokenizer 7 | 8 | 9 | def main(model, cachedir): 10 | # Load the tokenizer and save it to the specified directory 11 | tokenizer = AutoTokenizer.from_pretrained(model) 12 | tokenizer.save_pretrained(cachedir) 13 | print(f"Tokenizer saved to {cachedir}") 14 | 15 | 16 | if __name__ == "__main__": 17 | parser = argparse.ArgumentParser( 18 | description="Download and save Hugging Face tokenizer" 19 | ) 20 | parser.add_argument("--model", type=str, required=True, help="Name of the model") 21 | parser.add_argument( 22 | "--cachedir", type=str, required=True, help="Directory to save the tokenizer" 23 | ) 24 | 25 | args = parser.parse_args() 26 | main(args.model, args.cachedir) 27 | -------------------------------------------------------------------------------- /.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from lmdeploy.serve.openai.api_client import APIClient 5 | 6 | api_client = APIClient("http://localhost:8000") 7 | model_name = api_client.available_models[0] 8 | 9 | print(model_name) 10 | -------------------------------------------------------------------------------- /.buildkite/nightly-benchmarks/scripts/wait-for-image.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token) 3 | if [[ "$BUILDKITE_BRANCH" == "main" ]]; then 4 | URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT" 5 | else 6 | URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT" 7 | fi 8 | 9 | TIMEOUT_SECONDS=10 10 | 11 | retries=0 12 | while [ $retries -lt 1000 ]; do 13 | if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then 14 | exit 0 15 | fi 16 | 17 | echo "Waiting for image to be available..." 
18 | 19 | retries=$((retries + 1)) 20 | sleep 5 21 | done 22 | 23 | exit 1 24 | -------------------------------------------------------------------------------- /.buildkite/nightly-benchmarks/tests/genai-perf-tests.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "test_name": "llama8B_tp1_genai_perf", 4 | "qps_list": [4,8,16,32], 5 | "common_parameters": { 6 | "model": "meta-llama/Meta-Llama-3-8B-Instruct", 7 | "tp": 1, 8 | "port": 8000, 9 | "num_prompts": 500, 10 | "reuse_server": false 11 | }, 12 | "vllm_server_parameters": { 13 | "disable_log_stats": "", 14 | "disable_log_requests": "", 15 | "gpu_memory_utilization": 0.9, 16 | "num_scheduler_steps": 10, 17 | "max_num_seqs": 512, 18 | "dtype": "bfloat16" 19 | }, 20 | "genai_perf_input_parameters": { 21 | } 22 | } 23 | ] -------------------------------------------------------------------------------- /.buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script build the CPU docker image and run the offline inference inside the container. 4 | # It serves a sanity check for compilation and basic model usage. 5 | set -ex 6 | 7 | # Setup cleanup 8 | remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; } 9 | trap remove_docker_container EXIT 10 | remove_docker_container 11 | 12 | # Try building the docker image 13 | docker build -t cpu-test -f docker/Dockerfile.s390x . 14 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | UseTab: Never 3 | IndentWidth: 2 4 | ColumnLimit: 80 5 | 6 | # Force pointers to the type for C++. 
7 | DerivePointerAlignment: false 8 | PointerAlignment: Left 9 | 10 | # Reordering #include statements can (and currently will) introduce errors 11 | SortIncludes: false 12 | 13 | # Style choices 14 | AlignConsecutiveAssignments: false 15 | AlignConsecutiveDeclarations: false 16 | IndentPPDirectives: BeforeHash 17 | 18 | IncludeCategories: 19 | - Regex: '^<' 20 | Priority: 4 21 | - Regex: '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/' 22 | Priority: 3 23 | - Regex: '^"(qoda|\.\.)/' 24 | Priority: 2 25 | - Regex: '.*' 26 | Priority: 1 27 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | /.venv 2 | /build 3 | dist 4 | vllm/*.so 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | .mypy_cache 12 | 13 | # Distribution / packaging 14 | .Python 15 | /build/ 16 | cmake-build-*/ 17 | CMakeUserPresets.json 18 | develop-eggs/ 19 | /dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/about-codeowners/ 2 | # for more info about CODEOWNERS file 3 | 4 | * @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang 5 | 6 | /csrc/ @charlifu @mawong-amd @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang 7 | /vllm/ @charlifu @mawong-amd @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang 8 | 9 | fused_moe @divakar-amd @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang 10 | 11 | /tests/ @Alexei-V-Ivanov-AMD @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang 12 | /.buildkite/ @Alexei-V-Ivanov-AMD @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang 13 | 14 | /benchmarks/profiling @AdrianAbeyta @dllehr-amd @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang 15 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [vllm-project] 2 | open_collective: vllm 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Questions 4 | url: https://discuss.vllm.ai 5 | about: Ask questions and discuss with other vLLM community members 6 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Please direct your PRs to the upstream vllm (https://github.com/vllm-project/vllm.git) 2 | 3 | Accepting PRs into the ROCm fork (https://github.com/ROCm/vllm) will require a clear previously communicated exception 4 | -------------------------------------------------------------------------------- /.github/workflows/add_label_automerge.yml: -------------------------------------------------------------------------------- 1 | name: Add label on auto-merge enabled 2 | permissions: 3 | pull-requests: write 4 | on: 5 | pull_request_target: 6 | types: 7 | - auto_merge_enabled 8 | jobs: 
9 | add-label-on-auto-merge: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Add label 13 | uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 14 | with: 15 | script: | 16 | github.rest.issues.addLabels({ 17 | owner: context.repo.owner, 18 | repo: context.repo.repo, 19 | issue_number: context.issue.number, 20 | labels: ['ready'] 21 | }) 22 | env: 23 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 24 | -------------------------------------------------------------------------------- /.github/workflows/cleanup_pr_body.yml: -------------------------------------------------------------------------------- 1 | name: Cleanup PR Body 2 | 3 | on: 4 | pull_request_target: 5 | types: [opened, reopened, edited] 6 | 7 | permissions: 8 | pull-requests: write 9 | 10 | jobs: 11 | update-description: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - name: Checkout repository 16 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 17 | 18 | - name: Set up Python 19 | uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 20 | with: 21 | python-version: '3.12' 22 | 23 | - name: Install Python dependencies 24 | run: | 25 | python3 -m pip install --upgrade pip 26 | python3 -m pip install regex 27 | 28 | - name: Update PR description 29 | env: 30 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 31 | run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}" 32 | -------------------------------------------------------------------------------- /.github/workflows/matchers/actionlint.json: -------------------------------------------------------------------------------- 1 | { 2 | "problemMatcher": [ 3 | { 4 | "owner": "actionlint", 5 | "pattern": [ 6 | { 7 | "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$", 8 | "file": 1, 9 | "line": 2, 10 | "column": 3, 11 | "message": 4, 12 | "code": 5 13 | } 14 | ] 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /.github/workflows/matchers/mypy.json: -------------------------------------------------------------------------------- 1 | { 2 | "problemMatcher": [ 3 | { 4 | "owner": "mypy", 5 | "pattern": [ 6 | { 7 | "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$", 8 | "file": 1, 9 | "line": 2, 10 | "severity": 3, 11 | "message": 4 12 | } 13 | ] 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: pre-commit 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: [main] 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | pre-commit: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 16 | - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 17 | with: 18 | python-version: "3.12" 19 | - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" 20 | - run: echo "::add-matcher::.github/workflows/matchers/mypy.json" 21 | - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 22 | with: 23 | extra_args: --all-files --hook-stage manual 24 | -------------------------------------------------------------------------------- /.github/workflows/scripts/build.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | 4 | python_executable=python3 5 | 6 | # Update paths 7 | # Install requirements 8 | $python_executable -m pip install -r requirements/rocm.txt 9 | 10 | # Limit the number of parallel jobs to avoid OOM 11 | export MAX_JOBS=1 12 | # Make sure release wheels are built for the following architectures 13 | export PYTORCH_ROCM_ARCH="gfx90a;gfx942" 14 | 15 | rm -f "$(which sccache)" 16 | 17 | export MAX_JOBS=32 18 | 19 | # Build 20 | $python_executable setup.py bdist_wheel --dist-dir=dist 21 | -------------------------------------------------------------------------------- /.github/workflows/scripts/create_release.js: -------------------------------------------------------------------------------- 1 | // Uses GitHub's API to create the release and wait for result. 2 | // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately. 3 | 4 | module.exports = async (github, context, core) => { 5 | try { 6 | const response = await github.rest.repos.createRelease({ 7 | draft: false, 8 | generate_release_notes: true, 9 | name: process.env.RELEASE_TAG, 10 | owner: context.repo.owner, 11 | prerelease: true, 12 | repo: context.repo.repo, 13 | tag_name: process.env.RELEASE_TAG, 14 | }); 15 | 16 | core.setOutput('upload_url', response.data.upload_url); 17 | } catch (error) { 18 | core.setFailed(error.message); 19 | } 20 | } -------------------------------------------------------------------------------- /.github/workflows/scripts/cuda-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Replace '.' with '-' ex: 11.8 -> 11-8 4 | cuda_version=$(echo "$1" | tr "." "-") 5 | # Removes '-' and '.' 
ex: ubuntu-20.04 -> ubuntu2004 6 | OS=$(echo "$2" | tr -d ".\-") 7 | 8 | # Installs CUDA 9 | wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb" 10 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 11 | rm cuda-keyring_1.1-1_all.deb 12 | sudo apt -qq update 13 | sudo apt -y install "cuda-${cuda_version}" "cuda-nvcc-${cuda_version}" "cuda-libraries-dev-${cuda_version}" 14 | sudo apt clean 15 | 16 | # Test nvcc 17 | PATH=/usr/local/cuda-$1/bin:${PATH} 18 | nvcc --version 19 | 20 | # Log gcc, g++, c++ versions 21 | gcc --version 22 | g++ --version 23 | c++ --version 24 | -------------------------------------------------------------------------------- /.github/workflows/scripts/pytorch-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python_executable=python$1 4 | pytorch_version=$2 5 | cuda_version=$3 6 | 7 | # Install torch 8 | $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya 9 | $python_executable -m pip install torch=="${pytorch_version}+cu${cuda_version//./}" --extra-index-url "https://download.pytorch.org/whl/cu${cuda_version//./}" 10 | 11 | # Print version information 12 | $python_executable --version 13 | $python_executable -c "import torch; print('PyTorch:', torch.__version__)" 14 | $python_executable -c "import torch; print('CUDA:', torch.version.cuda)" 15 | $python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)" 16 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | 6 | build: 7 | os: ubuntu-22.04 8 | tools: 9 | python: "3.12" 10 | 11 | mkdocs: 12 | configuration: mkdocs.yaml 13 | 14 | # Optionally declare the Python requirements required to build your docs 15 | python: 16 | install: 17 | - requirements: requirements/docs.txt 18 | -------------------------------------------------------------------------------- /.shellcheckrc: -------------------------------------------------------------------------------- 1 | # rules currently disabled: 2 | # 3 | # SC1091 (info): Not following: was not specified as input (see shellcheck -x) 4 | # SC2004 (style): $/${} is unnecessary on arithmetic variables. 5 | # SC2129 (style): Consider using { cmd1; cmd2; } >> file instead of individual redirects. 6 | # SC2155 (warning): Declare and assign separately to avoid masking return values. 7 | # SC2164 (warning): Use 'cd ... || exit' or 'cd ... || return' in case cd fails. 8 | # 9 | disable=SC1091,SC2004,SC2129,SC2155,SC2164 10 | -------------------------------------------------------------------------------- /.yapfignore: -------------------------------------------------------------------------------- 1 | collect_env.py 2 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to vLLM 2 | 3 | You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing). 
4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include requirements/common.txt 3 | include requirements/cuda.txt 4 | include requirements/rocm.txt 5 | include requirements/neuron.txt 6 | include requirements/cpu.txt 7 | include CMakeLists.txt 8 | 9 | recursive-include cmake * 10 | recursive-include csrc * 11 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Reporting a Vulnerability 4 | 5 | If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. 6 | 7 | Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). 8 | 9 | --- 10 | 11 | Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations. 12 | 13 | Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models. 14 | -------------------------------------------------------------------------------- /benchmarks/kernels/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas -------------------------------------------------------------------------------- /benchmarks/structured_schemas/structured_schema_1.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "name": { "type": "string" }, 5 | "email": { "type": "string" }, 6 | "street": { "type": "string" }, 7 | "city": { "type": "string" }, 8 | "state": { "type": "string" }, 9 | "zip": { "type": "string" }, 10 | "phone": { "type": "string" }, 11 | "website": { "type": "string" }, 12 | "company": { "type": "string" }, 13 | "age": { "type": "integer" } 14 | }, 15 | "required": [ 16 | "name", 17 | "email" 18 | ] 19 | } 20 | -------------------------------------------------------------------------------- /csrc/attention/attention_dtypes.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.cuh" 4 | #include "dtype_float16.cuh" 5 | #include "dtype_float32.cuh" 6 | #include "dtype_bfloat16.cuh" 7 | #include "dtype_fp8.cuh" 8 | -------------------------------------------------------------------------------- /csrc/attention/dtype_fp8.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.cuh" 4 | 5 | #include 6 | #ifdef ENABLE_FP8 7 | #ifndef USE_ROCM 8 | #include 9 | #endif // USE_ROCM 10 | #endif // ENABLE_FP8 11 | 12 | namespace vllm { 13 | 14 | enum class Fp8KVCacheDataType { 15 | kAuto = 0, 16 | kFp8E4M3 = 1, 17 | kFp8E5M2 = 2, 18 | }; 19 | 20 | // fp8 vector types for quantization of kv cache 21 | template <> 22 | struct Vec { 23 | using Type = uint8_t; 24 | }; 25 | 26 | template <> 27 | 
struct Vec { 28 | using Type = uint16_t; 29 | }; 30 | 31 | template <> 32 | struct Vec { 33 | using Type = uint32_t; 34 | }; 35 | 36 | template <> 37 | struct Vec { 38 | using Type = uint2; 39 | }; 40 | 41 | } // namespace vllm 42 | -------------------------------------------------------------------------------- /csrc/core/exception.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define VLLM_IMPLIES(p, q) (!(p) || (q)) 4 | -------------------------------------------------------------------------------- /csrc/core/math.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | inline constexpr uint32_t next_pow_2(uint32_t const num) { 7 | if (num <= 1) return num; 8 | return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); 9 | } 10 | 11 | template 12 | static inline constexpr auto div_ceil(A a, B b) { 13 | return (a + b - 1) / b; 14 | } 15 | 16 | // Round a down to the next multiple of b. The caller is responsible for making 17 | // sure that b is non-zero 18 | template 19 | inline constexpr T round_to_previous_multiple_of(T a, T b) { 20 | return a % b == 0 ? a : (a / b) * b; 21 | } 22 | 23 | // Round a up to the next multiple of b. The caller is responsible for making 24 | // sure that b is non-zero 25 | template 26 | inline constexpr T round_to_next_multiple_of(T a, T b) { 27 | return a % b == 0 ? a : ((a / b) + 1) * b; 28 | } 29 | -------------------------------------------------------------------------------- /csrc/cpu/cpu_types.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPU_TYPES_HPP 2 | #define CPU_TYPES_HPP 3 | 4 | #if defined(__x86_64__) 5 | // x86 implementation 6 | #include "cpu_types_x86.hpp" 7 | #elif defined(__POWER9_VECTOR__) 8 | // ppc implementation 9 | #include "cpu_types_vsx.hpp" 10 | #elif defined(__s390x__) 11 | // s390 implementation 12 | #include "cpu_types_vxe.hpp" 13 | #elif defined(__aarch64__) 14 | // arm implementation 15 | #include "cpu_types_arm.hpp" 16 | #else 17 | #warning "unsupported vLLM cpu implementation" 18 | #endif 19 | 20 | #endif -------------------------------------------------------------------------------- /csrc/cutlass_extensions/common.cpp: -------------------------------------------------------------------------------- 1 | #include "cutlass_extensions/common.hpp" 2 | 3 | int32_t get_sm_version_num() { 4 | int32_t major_capability, minor_capability; 5 | cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor, 6 | 0); 7 | cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor, 8 | 0); 9 | int32_t version_num = major_capability * 10 + minor_capability; 10 | return version_num; 11 | } -------------------------------------------------------------------------------- /csrc/moe/marlin_moe_wna16/.gitignore: -------------------------------------------------------------------------------- 1 | kernel_*.cu -------------------------------------------------------------------------------- /csrc/prepare_inputs/advance_step.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace prepare_inputs { 13 | 14 | static constexpr int max_threads = 256; 15 | static constexpr bool logging = false; 16 | 17 | constexpr int div_ceil(int a, int b) { return (a + b - 1) 
/ b; } 18 | 19 | } // namespace prepare_inputs 20 | -------------------------------------------------------------------------------- /csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu: -------------------------------------------------------------------------------- 1 | #include "scaled_mm_kernels.hpp" 2 | #include "scaled_mm_blockwise_sm100_fp8_dispatch.cuh" 3 | #include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" 4 | 5 | namespace vllm { 6 | 7 | void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out, 8 | torch::Tensor const& a, 9 | torch::Tensor const& b, 10 | torch::Tensor const& a_scales, 11 | torch::Tensor const& b_scales) { 12 | if (out.dtype() == torch::kBFloat16) { 13 | cutlass_gemm_blockwise_sm100_fp8_dispatch( 14 | out, a, b, a_scales, b_scales); 15 | 16 | } else { 17 | TORCH_CHECK(out.dtype() == torch::kFloat16); 18 | cutlass_gemm_blockwise_sm100_fp8_dispatch( 19 | out, a, b, a_scales, b_scales); 20 | } 21 | } 22 | 23 | } // namespace vllm 24 | -------------------------------------------------------------------------------- /csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu: -------------------------------------------------------------------------------- 1 | 2 | #include "scaled_mm_kernels.hpp" 3 | #include "scaled_mm_blockwise_sm90_fp8_dispatch.cuh" 4 | #include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" 5 | 6 | namespace vllm { 7 | 8 | void cutlass_scaled_mm_blockwise_sm90_fp8(torch::Tensor& out, 9 | torch::Tensor const& a, 10 | torch::Tensor const& b, 11 | torch::Tensor const& a_scales, 12 | torch::Tensor const& b_scales) { 13 | if (out.dtype() == torch::kBFloat16) { 14 | cutlass_gemm_blockwise_sm90_fp8_dispatch( 15 | out, a, b, a_scales, b_scales); 16 | 17 | } else { 18 | TORCH_CHECK(out.dtype() == torch::kFloat16); 19 | cutlass_gemm_blockwise_sm90_fp8_dispatch( 20 | out, a, b, a_scales, b_scales); 21 | } 22 | } 23 | 24 | } // namespace vllm -------------------------------------------------------------------------------- /csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu: -------------------------------------------------------------------------------- 1 | #include "c3x/scaled_mm_helper.hpp" 2 | #include "c3x/scaled_mm_kernels.hpp" 3 | 4 | /* 5 | This file defines quantized GEMM operations using the CUTLASS 3.x API, for 6 | NVIDIA GPUs with sm100 (Blackwell). 
7 | */ 8 | 9 | #if defined ENABLE_SCALED_MM_SM100 && ENABLE_SCALED_MM_SM100 10 | 11 | void cutlass_scaled_mm_sm100(torch::Tensor& c, torch::Tensor const& a, 12 | torch::Tensor const& b, 13 | torch::Tensor const& a_scales, 14 | torch::Tensor const& b_scales, 15 | std::optional const& bias) { 16 | dispatch_scaled_mm(c, a, b, a_scales, b_scales, bias, 17 | vllm::cutlass_scaled_mm_sm100_fp8, 18 | nullptr, // int8 not supported on SM100 19 | vllm::cutlass_scaled_mm_blockwise_sm100_fp8); 20 | } 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /csrc/quantization/gptq/qdq_8.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Copied from https://github.com/turboderp/exllamav2 3 | */ 4 | 5 | #ifndef _qdq_8_cuh 6 | #define _qdq_8_cuh 7 | 8 | #include "qdq_util.cuh" 9 | 10 | namespace vllm { 11 | namespace gptq { 12 | 13 | __forceinline__ __device__ void shuffle_8bit_4(uint32_t* q, int stride) {} 14 | 15 | __forceinline__ __device__ void dequant_8bit_8(const uint32_t q_0, 16 | const uint32_t q_1, 17 | half2 (&dq)[4], int stride, 18 | const uint32_t zero) { 19 | half dqh[8]; 20 | for (int i = 0; i < 4; i++) dqh[i] = dq_ns(exb(q_0, i * 8, 0xff), zero); 21 | for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), zero); 22 | 23 | for (int i = 0; i < 4; i++) 24 | dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); 25 | } 26 | 27 | } // namespace gptq 28 | } // namespace vllm 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /csrc/quantization/gptq_marlin/.gitignore: -------------------------------------------------------------------------------- 1 | kernel_*.cu -------------------------------------------------------------------------------- /csrc/rocm/custom.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | // declare templates for front (cpp) and back (cuda) sides of function: 6 | // template 7 | 8 | void LLGemm_Silu(void* in_a, void* in_b, void* out_c, const int M, const int K, 9 | cudaStream_t stream, const int rows_per_block); 10 | void LLMM_Silu(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c, 11 | const int64_t rows_per_block) { 12 | auto M = in_a.size(0); 13 | auto K = in_a.size(1); 14 | LLGemm_Silu(in_a.data_ptr(), in_b.data_ptr(), out_c.data_ptr(), M, K, 15 | at::cuda::getCurrentCUDAStream(), rows_per_block); 16 | } 17 | -------------------------------------------------------------------------------- /docker/Dockerfile.hpu: -------------------------------------------------------------------------------- 1 | FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest 2 | 3 | COPY ./ /workspace/vllm 4 | 5 | WORKDIR /workspace/vllm 6 | 7 | RUN pip install -v -r requirements/hpu.txt 8 | 9 | ENV no_proxy=localhost,127.0.0.1 10 | ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true 11 | 12 | RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install 13 | 14 | # install development dependencies (for testing) 15 | RUN python3 -m pip install -e tests/vllm_test_utils 16 | 17 | WORKDIR /workspace/ 18 | 19 | RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks 20 | 21 | ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] 22 | -------------------------------------------------------------------------------- /docs/api/vllm/.meta.yml: 
-------------------------------------------------------------------------------- 1 | search: 2 | boost: 0.5 3 | -------------------------------------------------------------------------------- /docs/assets/contributing/dockerfile-stages-dependency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/contributing/dockerfile-stages-dependency.png -------------------------------------------------------------------------------- /docs/assets/deployment/anything-llm-chat-with-doc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/anything-llm-chat-with-doc.png -------------------------------------------------------------------------------- /docs/assets/deployment/anything-llm-chat-without-doc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/anything-llm-chat-without-doc.png -------------------------------------------------------------------------------- /docs/assets/deployment/anything-llm-provider.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/anything-llm-provider.png -------------------------------------------------------------------------------- /docs/assets/deployment/anything-llm-upload-doc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/anything-llm-upload-doc.png -------------------------------------------------------------------------------- /docs/assets/deployment/architecture_helm_deployment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/architecture_helm_deployment.png -------------------------------------------------------------------------------- /docs/assets/deployment/chatbox-chat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/chatbox-chat.png -------------------------------------------------------------------------------- /docs/assets/deployment/chatbox-settings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/chatbox-settings.png -------------------------------------------------------------------------------- /docs/assets/deployment/dify-chat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/dify-chat.png -------------------------------------------------------------------------------- /docs/assets/deployment/dify-create-chatbot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/dify-create-chatbot.png -------------------------------------------------------------------------------- /docs/assets/deployment/dify-settings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/dify-settings.png -------------------------------------------------------------------------------- /docs/assets/deployment/open_webui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/open_webui.png -------------------------------------------------------------------------------- /docs/assets/deployment/streamlit-chat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/streamlit-chat.png -------------------------------------------------------------------------------- /docs/assets/design/arch_overview/entrypoints.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/arch_overview/entrypoints.excalidraw.png -------------------------------------------------------------------------------- /docs/assets/design/arch_overview/llm_engine.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/arch_overview/llm_engine.excalidraw.png -------------------------------------------------------------------------------- /docs/assets/design/hierarchy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/hierarchy.png -------------------------------------------------------------------------------- /docs/assets/design/v1/metrics/intervals-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/metrics/intervals-1.png -------------------------------------------------------------------------------- /docs/assets/design/v1/metrics/intervals-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/metrics/intervals-2.png -------------------------------------------------------------------------------- /docs/assets/design/v1/metrics/intervals-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/metrics/intervals-3.png -------------------------------------------------------------------------------- /docs/assets/design/v1/prefix_caching/example-time-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/prefix_caching/example-time-1.png -------------------------------------------------------------------------------- /docs/assets/design/v1/prefix_caching/example-time-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/prefix_caching/example-time-3.png -------------------------------------------------------------------------------- /docs/assets/design/v1/prefix_caching/example-time-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/prefix_caching/example-time-4.png -------------------------------------------------------------------------------- /docs/assets/design/v1/prefix_caching/example-time-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/prefix_caching/example-time-5.png -------------------------------------------------------------------------------- /docs/assets/design/v1/prefix_caching/example-time-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/prefix_caching/example-time-6.png -------------------------------------------------------------------------------- /docs/assets/design/v1/prefix_caching/example-time-7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/prefix_caching/example-time-7.png -------------------------------------------------------------------------------- /docs/assets/design/v1/prefix_caching/free.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/prefix_caching/free.png -------------------------------------------------------------------------------- /docs/assets/design/v1/prefix_caching/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/prefix_caching/overview.png -------------------------------------------------------------------------------- /docs/assets/features/disagg_prefill/abstraction.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/features/disagg_prefill/abstraction.jpg -------------------------------------------------------------------------------- /docs/assets/features/disagg_prefill/overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/features/disagg_prefill/overview.jpg -------------------------------------------------------------------------------- /docs/assets/kernel/k_vecs.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/kernel/k_vecs.png -------------------------------------------------------------------------------- /docs/assets/kernel/key.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/kernel/key.png -------------------------------------------------------------------------------- /docs/assets/kernel/logits_vec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/kernel/logits_vec.png -------------------------------------------------------------------------------- /docs/assets/kernel/q_vecs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/kernel/q_vecs.png -------------------------------------------------------------------------------- /docs/assets/kernel/query.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/kernel/query.png -------------------------------------------------------------------------------- /docs/assets/kernel/v_vec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/kernel/v_vec.png -------------------------------------------------------------------------------- /docs/assets/kernel/value.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/kernel/value.png -------------------------------------------------------------------------------- /docs/assets/logos/vllm-logo-only-light.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/logos/vllm-logo-only-light.ico -------------------------------------------------------------------------------- /docs/assets/logos/vllm-logo-only-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/logos/vllm-logo-only-light.png -------------------------------------------------------------------------------- /docs/assets/logos/vllm-logo-text-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/logos/vllm-logo-text-dark.png -------------------------------------------------------------------------------- /docs/assets/logos/vllm-logo-text-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/logos/vllm-logo-text-light.png -------------------------------------------------------------------------------- /docs/community/sponsors.md: -------------------------------------------------------------------------------- 1 | # Sponsors 2 | 3 | vLLM is a community project. 
Our compute resources for development and testing are supported by the following organizations. Thank you for your support! 4 | 5 | 6 | 7 | 8 | Cash Donations: 9 | 10 | - a16z 11 | - Dropbox 12 | - Sequoia Capital 13 | - Skywork AI 14 | - ZhenFund 15 | 16 | Compute Resources: 17 | 18 | - AMD 19 | - Anyscale 20 | - AWS 21 | - Crusoe Cloud 22 | - Databricks 23 | - DeepInfra 24 | - Google Cloud 25 | - Intel 26 | - Lambda Lab 27 | - Nebius 28 | - Novita AI 29 | - NVIDIA 30 | - Replicate 31 | - Roblox 32 | - RunPod 33 | - Trainy 34 | - UC Berkeley 35 | - UC San Diego 36 | 37 | Slack Sponsor: Anyscale 38 | 39 | We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM. 40 | -------------------------------------------------------------------------------- /docs/configuration/README.md: -------------------------------------------------------------------------------- 1 | # Configuration Options 2 | 3 | This section lists the most common options for running vLLM. 4 | 5 | There are three main levels of configuration, from highest priority to lowest priority: 6 | 7 | - [Request parameters][completions-api] and [input arguments][sampling-params] 8 | - [Engine arguments](./engine_args.md) 9 | - [Environment variables](./env_vars.md) 10 | -------------------------------------------------------------------------------- /docs/configuration/env_vars.md: -------------------------------------------------------------------------------- 1 | # Environment Variables 2 | 3 | vLLM uses the following environment variables to configure the system: 4 | 5 | !!! warning 6 | Please note that `VLLM_PORT` and `VLLM_HOST_IP` set the port and ip for vLLM's **internal usage**. It is not the port and ip for the API server. If you use `--host $VLLM_HOST_IP` and `--port $VLLM_PORT` to start the API server, it will not work. 7 | 8 | All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables). 9 | 10 | ```python 11 | --8<-- "vllm/envs.py:env-vars-definition" 12 | ``` 13 | -------------------------------------------------------------------------------- /docs/contributing/model/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Adding a New Model 3 | --- 4 | [](){ #new-model } 5 | 6 | This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM. 7 | 8 | Contents: 9 | 10 | - [Basic](basic.md) 11 | - [Registration](registration.md) 12 | - [Tests](tests.md) 13 | - [Multimodal](multimodal.md) 14 | 15 | !!! note 16 | The complexity of adding a new model depends heavily on the model's architecture. 17 | The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. 18 | However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. 19 | 20 | !!! 
tip 21 | If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues) 22 | or ask on our [developer slack](https://slack.vllm.ai). 23 | We will be happy to help you out! 24 | -------------------------------------------------------------------------------- /docs/deployment/frameworks/bentoml.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: BentoML 3 | --- 4 | [](){ #deployment-bentoml } 5 | 6 | [BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes. 7 | 8 | For details, see the tutorial [vLLM inference in the BentoML documentation](https://docs.bentoml.com/en/latest/use-cases/large-language-models/vllm.html). 9 | -------------------------------------------------------------------------------- /docs/deployment/frameworks/lobe-chat.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Lobe Chat 3 | --- 4 | [](){ #deployment-lobe-chat } 5 | 6 | [Lobe Chat](https://github.com/lobehub/lobe-chat) is an open-source, modern-design ChatGPT/LLMs UI/Framework. 7 | 8 | It supports speech synthesis, multi-modal input, and an extensible (function call) plugin system. 9 | 10 | One-click FREE deployment of your private OpenAI ChatGPT/Claude/Gemini/Groq/Ollama chat application. 11 | 12 | It supports vLLM as an AI model provider to efficiently serve large language models. 13 | 14 | For details, see the tutorial [Using vLLM in LobeChat](https://lobehub.com/docs/usage/providers/vllm). 15 | -------------------------------------------------------------------------------- /docs/deployment/frameworks/modal.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Modal 3 | --- 4 | [](){ #deployment-modal } 5 | 6 | vLLM can be run on cloud GPUs with [Modal](https://modal.com), a serverless computing platform designed for fast auto-scaling. 7 | 8 | For details on how to deploy vLLM on Modal, see [this tutorial in the Modal documentation](https://modal.com/docs/examples/vllm_inference). 9 | -------------------------------------------------------------------------------- /docs/deployment/frameworks/open-webui.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Open WebUI 3 | --- 4 | [](){ #deployment-open-webui } 5 | 6 | 1. Install [Docker](https://docs.docker.com/engine/install/) 7 | 8 | 2. Start the vLLM server with a supported chat completion model, e.g. 9 | 10 | ```console 11 | vllm serve qwen/Qwen1.5-0.5B-Chat 12 | ``` 13 | 14 | 1. Start the [Open WebUI](https://github.com/open-webui/open-webui) docker container (replace the vllm serve host and vllm serve port): 15 | 16 | ```console 17 | docker run -d -p 3000:8080 \ 18 | --name open-webui \ 19 | -v open-webui:/app/backend/data \ 20 | -e OPENAI_API_BASE_URL=http://<vllm serve host>:<vllm serve port>/v1 \ 21 | --restart always \ 22 | ghcr.io/open-webui/open-webui:main 23 | ``` 24 | 25 | 1. Open it in the browser (port 3000 by default, per the `-p 3000:8080` mapping above): 26 | 27 | At the top of the web page, you can see the model `qwen/Qwen1.5-0.5B-Chat`.
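If the model does not show up, it can help to confirm that the vLLM endpoint itself is reachable before debugging Open WebUI. A minimal sanity check with the OpenAI Python client, assuming vLLM is listening on `localhost:8000` (use whatever host and port you passed to `vllm serve`), looks roughly like this:

```python
# Query the vLLM OpenAI-compatible endpoint directly.
# Assumption: vLLM is listening on localhost:8000; adjust base_url as needed.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# The served model should be listed here, e.g. qwen/Qwen1.5-0.5B-Chat.
print([model.id for model in client.models.list()])

response = client.chat.completions.create(
    model="qwen/Qwen1.5-0.5B-Chat",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(response.choices[0].message.content)
```

If the model list comes back empty or the request fails, fix the vLLM server or the `OPENAI_API_BASE_URL` value before looking at Open WebUI itself.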
28 | 29 | ![](../../assets/deployment/open_webui.png) 30 | -------------------------------------------------------------------------------- /docs/deployment/frameworks/triton.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: NVIDIA Triton 3 | --- 4 | [](){ #deployment-triton } 5 | 6 | The [Triton Inference Server](https://github.com/triton-inference-server) hosts a tutorial demonstrating how to quickly deploy a simple [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) model using vLLM. Please see [Deploying a vLLM model in Triton](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton) for more details. 7 | -------------------------------------------------------------------------------- /docs/deployment/integrations/kserve.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: KServe 3 | --- 4 | [](){ #deployment-kserve } 5 | 6 | vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving. 7 | 8 | Please see [this guide](https://kserve.github.io/website/latest/modelserving/v1beta1/llm/huggingface/) for more details on using vLLM with KServe. 9 | -------------------------------------------------------------------------------- /docs/deployment/integrations/kubeai.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: KubeAI 3 | --- 4 | [](){ #deployment-kubeai } 5 | 6 | [KubeAI](https://github.com/substratusai/kubeai) is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies. 7 | 8 | Please see the Installation Guides for environment specific instructions: 9 | 10 | - [Any Kubernetes Cluster](https://www.kubeai.org/installation/any/) 11 | - [EKS](https://www.kubeai.org/installation/eks/) 12 | - [GKE](https://www.kubeai.org/installation/gke/) 13 | 14 | Once you have KubeAI installed, you can 15 | [configure text generation models](https://www.kubeai.org/how-to/configure-text-generation-models/) 16 | using vLLM. 17 | -------------------------------------------------------------------------------- /docs/deployment/integrations/llmaz.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: llmaz 3 | --- 4 | [](){ #deployment-llmaz } 5 | 6 | [llmaz](https://github.com/InftyAI/llmaz) is an easy-to-use and advanced inference platform for large language models on Kubernetes, aimed for production use. It uses vLLM as the default model serving backend. 7 | 8 | Please refer to the [Quick Start](https://github.com/InftyAI/llmaz?tab=readme-ov-file#quick-start) for more details. 9 | -------------------------------------------------------------------------------- /docs/features/quantization/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Quantization 3 | --- 4 | [](){ #quantization-index } 5 | 6 | Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices. 
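The pages listed below cover each method in detail. As a rough sketch of the common pattern, a pre-quantized checkpoint can be loaded with the offline `LLM` API as follows; the checkpoint name is only an illustrative example, and the explicit `quantization` argument is often optional because vLLM can usually detect the method from the model config:

```python
# Load a pre-quantized checkpoint (AWQ in this example) with the offline API.
from vllm import LLM, SamplingParams

llm = LLM(
    model="TheBloke/Llama-2-7b-Chat-AWQ",  # illustrative AWQ checkpoint
    quantization="awq",  # usually optional: detected from the model config
)

outputs = llm.generate(
    ["Quantization lets you"],
    SamplingParams(temperature=0.8, max_tokens=32),
)
print(outputs[0].outputs[0].text)
```

The same argument is available as `--quantization` when launching `vllm serve`.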
7 | 8 | Contents: 9 | 10 | - [Supported_Hardware](supported_hardware.md) 11 | - [Auto_Awq](auto_awq.md) 12 | - [Bnb](bnb.md) 13 | - [Bitblas](bitblas.md) 14 | - [Gguf](gguf.md) 15 | - [Gptqmodel](gptqmodel.md) 16 | - [Int4](int4.md) 17 | - [Int8](int8.md) 18 | - [Fp8](fp8.md) 19 | - [Modelopt](modelopt.md) 20 | - [Quark](quark.md) 21 | - [Quantized_Kvcache](quantized_kvcache.md) 22 | - [Torchao](torchao.md) 23 | -------------------------------------------------------------------------------- /docs/getting_started/installation/.nav.yml: -------------------------------------------------------------------------------- 1 | nav: 2 | - README.md 3 | - gpu.md 4 | - cpu.md 5 | - ai_accelerator.md -------------------------------------------------------------------------------- /docs/getting_started/installation/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Installation 3 | --- 4 | [](){ #installation-index } 5 | 6 | vLLM supports the following hardware platforms: 7 | 8 | - [GPU](gpu.md) 9 | - [NVIDIA CUDA](gpu.md#nvidia-cuda) 10 | - [AMD ROCm](gpu.md#amd-rocm) 11 | - [Intel XPU](gpu.md#intel-xpu) 12 | - [CPU](cpu.md) 13 | - [Intel/AMD x86](cpu.md#intelamd-x86) 14 | - [ARM AArch64](cpu.md#arm-aarch64) 15 | - [Apple silicon](cpu.md#apple-silicon) 16 | - [IBM Z (S390X)](cpu.md#ibm-z-s390x) 17 | - [Other AI accelerators](ai_accelerator.md) 18 | - [Google TPU](ai_accelerator.md#google-tpu) 19 | - [Intel Gaudi](ai_accelerator.md#intel-gaudi) 20 | - [AWS Neuron](ai_accelerator.md#aws-neuron) 21 | -------------------------------------------------------------------------------- /docs/getting_started/installation/device.template.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ## Requirements 4 | 5 | ## Set up using Python 6 | 7 | ### Pre-built wheels 8 | 9 | ### Build wheel from source 10 | 11 | ## Set up using Docker 12 | 13 | ### Pre-built images 14 | 15 | ### Build image from source 16 | 17 | ## Extra information 18 | -------------------------------------------------------------------------------- /docs/getting_started/installation/python_env_setup.inc.md: -------------------------------------------------------------------------------- 1 | It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. 
After installing `uv`, you can create a new Python environment and install vLLM using the following commands: 2 | 3 | ```console 4 | uv venv --python 3.12 --seed 5 | source .venv/bin/activate 6 | ``` 7 | -------------------------------------------------------------------------------- /docs/mkdocs/hooks/remove_announcement.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import os 4 | from typing import Literal 5 | 6 | 7 | def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): 8 | # see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa 9 | if os.getenv('READTHEDOCS_VERSION_TYPE') == "tag": 10 | # remove the warning banner if the version is a tagged release 11 | docs_dir = os.path.dirname(__file__) 12 | announcement_path = os.path.join(docs_dir, 13 | "mkdocs/overrides/main.html") 14 | # The file might be removed already if the build is triggered multiple 15 | # times (readthedocs build both HTML and PDF versions separately) 16 | if os.path.exists(announcement_path): 17 | os.remove(announcement_path) 18 | -------------------------------------------------------------------------------- /docs/mkdocs/javascript/run_llm_widget.js: -------------------------------------------------------------------------------- 1 | // Add RunLLM widget 2 | document.addEventListener("DOMContentLoaded", function () { 3 | var script = document.createElement("script"); 4 | script.type = "module"; 5 | script.id = "runllm-widget-script" 6 | 7 | script.src = "https://widget.runllm.com"; 8 | 9 | script.setAttribute("version", "stable"); 10 | script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget. 11 | script.setAttribute("runllm-name", "vLLM"); 12 | script.setAttribute("runllm-position", "BOTTOM_RIGHT"); 13 | script.setAttribute("runllm-position-y", "120px"); 14 | script.setAttribute("runllm-position-x", "20px"); 15 | script.setAttribute("runllm-assistant-id", "207"); 16 | 17 | script.async = true; 18 | document.head.appendChild(script); 19 | }); 20 | -------------------------------------------------------------------------------- /docs/mkdocs/overrides/main.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block announce %} 4 |

You are viewing the latest developer preview docs. Click here to view docs for the latest stable release.

 5 | {% endblock %} 6 | -------------------------------------------------------------------------------- /docs/models/extensions/fastsafetensor.md: -------------------------------------------------------------------------------- 1 | Loading model weights with fastsafetensors 2 | =================================================================== 3 | 4 | Using the fastsafetensors library enables loading model weights directly into GPU memory by leveraging GPU Direct Storage. See [their GitHub repository](https://github.com/foundation-model-stack/fastsafetensors) for more details. 5 | To enable this feature, set the environment variable ``USE_FASTSAFETENSOR`` to ``true``. 6 | -------------------------------------------------------------------------------- /docs/serving/integrations/langchain.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: LangChain 3 | --- 4 | [](){ #serving-langchain } 5 | 6 | vLLM is also available via [LangChain](https://github.com/langchain-ai/langchain). 7 | 8 | To install LangChain, run: 9 | 10 | ```console 11 | pip install langchain langchain_community -q 12 | ``` 13 | 14 | To run inference on a single GPU or multiple GPUs, use the `VLLM` class from `langchain`. 15 | 16 | ```python 17 | from langchain_community.llms import VLLM 18 | 19 | llm = VLLM(model="mosaicml/mpt-7b", 20 | trust_remote_code=True, # mandatory for hf models 21 | max_new_tokens=128, 22 | top_k=10, 23 | top_p=0.95, 24 | temperature=0.8, 25 | # tensor_parallel_size=... # for distributed inference 26 | ) 27 | 28 | print(llm("What is the capital of France ?")) 29 | ``` 30 | 31 | Please refer to this [Tutorial](https://python.langchain.com/docs/integrations/llms/vllm) for more details. 32 | -------------------------------------------------------------------------------- /docs/serving/integrations/llamaindex.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: LlamaIndex 3 | --- 4 | [](){ #serving-llamaindex } 5 | 6 | vLLM is also available via [LlamaIndex](https://github.com/run-llama/llama_index). 7 | 8 | To install LlamaIndex, run: 9 | 10 | ```console 11 | pip install llama-index-llms-vllm -q 12 | ``` 13 | 14 | To run inference on a single GPU or multiple GPUs, use the `Vllm` class from `llamaindex`. 15 | 16 | ```python 17 | from llama_index.llms.vllm import Vllm 18 | 19 | llm = Vllm( 20 | model="microsoft/Orca-2-7b", 21 | tensor_parallel_size=4, 22 | max_new_tokens=100, 23 | vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5}, 24 | ) 25 | ``` 26 | 27 | Please refer to this [Tutorial](https://docs.llamaindex.ai/en/latest/examples/llm/vllm/) for more details. 28 | -------------------------------------------------------------------------------- /docs/training/rlhf.md: -------------------------------------------------------------------------------- 1 | # Reinforcement Learning from Human Feedback 2 | 3 | Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviors. 4 | 5 | vLLM can be used to generate completions for RLHF. The best way to do this is with libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF) and [verl](https://github.com/volcengine/verl).
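For a feel of the generation side on its own, here is a minimal sketch using nothing but vLLM; the model name and the reward function are placeholders (a real setup would score candidates with a trained reward model and feed the scores back to the trainer):

```python
# Generation side of an RLHF-style loop: sample several candidate
# completions per prompt with vLLM, then score them externally.
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # small placeholder model
params = SamplingParams(n=4, temperature=1.0, max_tokens=64)

prompts = ["Explain why the sky is blue.", "Write a haiku about GPUs."]
outputs = llm.generate(prompts, params)

def reward(text: str) -> float:
    # Placeholder: a real setup would call a trained reward model here.
    return float(len(text.split()))

for request_output in outputs:
    best = max(request_output.outputs, key=lambda c: reward(c.text))
    print(f"{request_output.prompt!r} -> reward {reward(best.text):.1f}: {best.text[:60]!r}")
```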
6 | 7 | See the following basic examples to get started if you don't want to use an existing library: 8 | 9 | - [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](../examples/offline_inference/rlhf.md) 10 | - [Training and inference processes are colocated on the same GPUs using Ray](../examples/offline_inference/rlhf_colocate.md) 11 | - [Utilities for performing RLHF with vLLM](../examples/offline_inference/rlhf_utils.md) 12 | -------------------------------------------------------------------------------- /docs/usage/README.md: -------------------------------------------------------------------------------- 1 | # Using vLLM 2 | 3 | vLLM supports the following usage patterns: 4 | 5 | - [Inference and Serving](../serving/offline_inference.md): Run a single instance of a model. 6 | - [Deployment](../deployment/docker.md): Scale up model instances for production. 7 | - [Training](../training/rlhf.md): Train or fine-tune a model. 8 | -------------------------------------------------------------------------------- /examples/offline_inference/disaggregated-prefill-v1/README.md: -------------------------------------------------------------------------------- 1 | # Disaggregated Prefill V1 2 | 3 | This example contains scripts that demonstrate disaggregated prefill in the offline setting of vLLM. 4 | 5 | ## Files 6 | 7 | - `run.sh` - A helper script that will run `prefill_example.py` and `decode_example.py` sequentially. 8 | - Make sure you are in the `examples/offline_inference/disaggregated-prefill-v1` directory before running `run.sh`. 9 | - `prefill_example.py` - A script which performs prefill only, saving the KV state to the `local_storage` directory and the prompts to `output.txt`. 10 | - `decode_example.py` - A script which performs decode only, loading the KV state from the `local_storage` directory and the prompts from `output.txt`. 
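For readers who want a feel for the prefill side without opening the scripts, the sketch below condenses what this README describes: run only the prefill pass and let a KV-transfer connector persist the cache under `local_storage`. The `KVTransferConfig` fields, connector name, and model choice here are assumptions based on this description rather than a copy of `prefill_example.py`, so refer to the script itself for the exact configuration:

```python
# Prefill-only pass: compute the KV cache and let the KV connector write it
# to local_storage for the decode script to reuse.
# NOTE: the KVTransferConfig fields and connector name below are assumptions
# based on this README, not copied from prefill_example.py.
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

prompts = ["Hello, my name is", "The capital of France is"]

llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",  # illustrative model choice
    enforce_eager=True,
    kv_transfer_config=KVTransferConfig(
        kv_connector="SharedStorageConnector",
        kv_role="kv_both",
        kv_connector_extra_config={"shared_storage_path": "local_storage"},
    ),
)

# max_tokens=1: only the prefill pass matters here; decoding happens in the
# companion decode script once it has loaded the saved KV state.
llm.generate(prompts, SamplingParams(temperature=0, max_tokens=1))
```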
11 | -------------------------------------------------------------------------------- /examples/offline_inference/disaggregated-prefill-v1/run.sh: -------------------------------------------------------------------------------- 1 | rm -rf local_storage/ 2 | 3 | if [ -f "output.txt" ]; then 4 | rm output.txt 5 | fi 6 | 7 | # The directory of current script 8 | SCRIPT_DIR=$(dirname "$(readlink -f "$0")") 9 | 10 | VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/prefill_example.py" 11 | VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/decode_example.py" 12 | -------------------------------------------------------------------------------- /examples/offline_inference/openai_batch/openai_example_batch.jsonl: -------------------------------------------------------------------------------- 1 | {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} 2 | {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} 3 | -------------------------------------------------------------------------------- /examples/online_serving/chart-helm/.helmignore: -------------------------------------------------------------------------------- 1 | *.png 2 | .git/ 3 | ct.yaml 4 | lintconf.yaml 5 | values.schema.json 6 | /workflows -------------------------------------------------------------------------------- /examples/online_serving/chart-helm/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: chart-vllm 3 | description: Chart vllm 4 | 5 | # A chart can be either an 'application' or a 'library' chart. 6 | # 7 | # Application charts are a collection of templates that can be packaged into versioned archives 8 | # to be deployed. 9 | # 10 | # Library charts provide useful utilities or functions for the chart developer. They're included as 11 | # a dependency of application charts to inject those utilities and functions into the rendering 12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 13 | type: application 14 | 15 | # This is the chart version. This version number should be incremented each time you make changes 16 | # to the chart and its templates, including the app version. 
17 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 18 | version: 0.0.1 19 | 20 | maintainers: 21 | - name: mfournioux 22 | -------------------------------------------------------------------------------- /examples/online_serving/chart-helm/ct.yaml: -------------------------------------------------------------------------------- 1 | chart-dirs: 2 | - charts 3 | validate-maintainers: false -------------------------------------------------------------------------------- /examples/online_serving/chart-helm/templates/configmap.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.configs -}} 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: "{{ .Release.Name }}-configs" 6 | namespace: {{ .Release.Namespace }} 7 | data: 8 | {{- with .Values.configs }} 9 | {{- toYaml . | nindent 2 }} 10 | {{- end }} 11 | {{- end -}} -------------------------------------------------------------------------------- /examples/online_serving/chart-helm/templates/custom-objects.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.customObjects }} 2 | {{- range .Values.customObjects }} 3 | {{- tpl (. | toYaml) $ }} 4 | --- 5 | {{- end }} 6 | {{- end }} -------------------------------------------------------------------------------- /examples/online_serving/chart-helm/templates/poddisruptionbudget.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: policy/v1 2 | kind: PodDisruptionBudget 3 | metadata: 4 | name: "{{ .Release.Name }}-pdb" 5 | namespace: {{ .Release.Namespace }} 6 | spec: 7 | maxUnavailable: {{ default 1 .Values.maxUnavailablePodDisruptionBudget }} -------------------------------------------------------------------------------- /examples/online_serving/chart-helm/templates/pvc.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.extraInit }} 2 | apiVersion: v1 3 | kind: PersistentVolumeClaim 4 | metadata: 5 | name: "{{ .Release.Name }}-storage-claim" 6 | namespace: {{ .Release.Namespace }} 7 | spec: 8 | accessModes: 9 | - ReadWriteOnce 10 | resources: 11 | requests: 12 | storage: {{ .Values.extraInit.pvcStorage }} 13 | {{- end }} -------------------------------------------------------------------------------- /examples/online_serving/chart-helm/templates/secrets.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Secret 3 | metadata: 4 | name: "{{ .Release.Name }}-secrets" 5 | namespace: {{ .Release.Namespace }} 6 | type: Opaque 7 | data: 8 | {{- range $key, $val := .Values.secrets }} 9 | {{ $key }}: {{ $val | b64enc | quote }} 10 | {{- end }} -------------------------------------------------------------------------------- /examples/online_serving/chart-helm/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: "{{ .Release.Name }}-service" 5 | namespace: {{ .Release.Namespace }} 6 | spec: 7 | type: ClusterIP 8 | ports: 9 | - name: {{ include "chart.service-port-name" . }} 10 | port: {{ include "chart.service-port" . }} 11 | targetPort: {{ include "chart.container-port-name" . }} 12 | protocol: TCP 13 | selector: 14 | {{- include "chart.labels" . 
| nindent 4 }} -------------------------------------------------------------------------------- /examples/online_serving/disaggregated_serving/README.md: -------------------------------------------------------------------------------- 1 | # Disaggregated Serving 2 | 3 | This example contains scripts that demonstrate the disaggregated serving features of vLLM. 4 | 5 | ## Files 6 | 7 | - `disagg_proxy_demo.py` - Demonstrates XpYd (X prefill instances, Y decode instances). 8 | - `kv_events.sh` - Demonstrates KV cache event publishing. 9 | -------------------------------------------------------------------------------- /examples/online_serving/prometheus_grafana/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # docker-compose.yaml 2 | version: "3" 3 | 4 | services: 5 | prometheus: 6 | image: prom/prometheus:latest 7 | extra_hosts: 8 | - "host.docker.internal:host-gateway" # allow a direct connection from container to the local machine 9 | ports: 10 | - "9090:9090" # the default port used by Prometheus 11 | volumes: 12 | - ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml # mount Prometheus config file 13 | 14 | grafana: 15 | image: grafana/grafana:latest 16 | depends_on: 17 | - prometheus 18 | ports: 19 | - "3000:3000" # the default port used by Grafana 20 | -------------------------------------------------------------------------------- /examples/online_serving/prometheus_grafana/prometheus.yaml: -------------------------------------------------------------------------------- 1 | # prometheus.yaml 2 | global: 3 | scrape_interval: 5s 4 | evaluation_interval: 30s 5 | 6 | scrape_configs: 7 | - job_name: vllm 8 | static_configs: 9 | - targets: 10 | - 'host.docker.internal:8000' 11 | -------------------------------------------------------------------------------- /examples/online_serving/utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | from openai import APIConnectionError, OpenAI 4 | from openai.pagination import SyncPage 5 | from openai.types.model import Model 6 | 7 | 8 | def get_first_model(client: OpenAI) -> str: 9 | """ 10 | Get the first model from the vLLM server. 11 | """ 12 | try: 13 | models: SyncPage[Model] = client.models.list() 14 | except APIConnectionError as e: 15 | raise RuntimeError( 16 | "Failed to get the list of models from the vLLM server at " 17 | f"{client.base_url} with API key {client.api_key}. Check\n" 18 | "1. the server is running\n" 19 | "2. the server URL is correct\n" 20 | "3. 
the API key is correct" 21 | ) from e 22 | 23 | if len(models.data) == 0: 24 | raise RuntimeError(f"No models found on the vLLM server at {client.base_url}") 25 | 26 | return models.data[0].id 27 | -------------------------------------------------------------------------------- /examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml: -------------------------------------------------------------------------------- 1 | local_cpu: False 2 | max_local_cpu_size: 0 3 | #local_disk: 4 | max_local_disk_size: 0 5 | remote_serde: NULL 6 | 7 | enable_nixl: True 8 | nixl_role: "receiver" 9 | nixl_peer_host: "localhost" 10 | nixl_peer_port: 55555 11 | nixl_buffer_size: 1073741824 # 1GB 12 | nixl_buffer_device: "cuda" 13 | nixl_enable_gc: True 14 | -------------------------------------------------------------------------------- /examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml: -------------------------------------------------------------------------------- 1 | local_cpu: False 2 | max_local_cpu_size: 0 3 | #local_disk: 4 | max_local_disk_size: 0 5 | remote_serde: NULL 6 | 7 | enable_nixl: True 8 | nixl_role: "sender" 9 | nixl_peer_host: "localhost" 10 | nixl_peer_port: 55555 11 | nixl_buffer_size: 1073741824 # 1GB 12 | nixl_buffer_device: "cuda" 13 | nixl_enable_gc: True 14 | -------------------------------------------------------------------------------- /examples/template_alpaca.jinja: -------------------------------------------------------------------------------- 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 2 | 3 | {% for message in messages %} 4 | {% if message['role'] == 'user' %} 5 | ### Instruction: 6 | {{ message['content']|trim -}} 7 | {% if not loop.last %} 8 | 9 | 10 | {% endif %} 11 | {% elif message['role'] == 'assistant' %} 12 | ### Response: 13 | {{ message['content']|trim -}} 14 | {% if not loop.last %} 15 | 16 | 17 | {% endif %} 18 | {% elif message['role'] == 'user_context' %} 19 | ### Input: 20 | {{ message['content']|trim -}} 21 | {% if not loop.last %} 22 | 23 | 24 | {% endif %} 25 | {% endif %} 26 | {% endfor %} 27 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} 28 | ### Response: 29 | {% endif %} -------------------------------------------------------------------------------- /examples/template_baichuan.jinja: -------------------------------------------------------------------------------- 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 2 | 3 | {%- for message in messages -%} 4 | {%- if message['role'] == 'user' -%} 5 | {{- '' + message['content'] -}} 6 | {%- elif message['role'] == 'assistant' -%} 7 | {{- '' + message['content'] -}} 8 | {%- endif -%} 9 | {%- endfor -%} 10 | 11 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 12 | {{- '' -}} 13 | {% endif %} -------------------------------------------------------------------------------- /examples/template_chatglm.jinja: -------------------------------------------------------------------------------- 1 | {%- set counter = namespace(index=0) -%} 2 | {%- for message in messages -%} 3 | {%- if message['role'] == 'user' -%} 4 | {{- '[Round ' + counter.index|string + ']\n问:' + message['content'] -}} 5 | {%- set counter.index = counter.index + 1 -%} 6 | {%- endif -%} 7 | {%- if message['role'] == 'assistant' 
-%} 8 | {{- '\n答:' + message['content'] -}} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n' -}} 11 | {%- endif -%} 12 | {%- endif -%} 13 | {%- endfor -%} 14 | 15 | 16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 17 | {{- '\n答:' -}} 18 | {%- endif -%} -------------------------------------------------------------------------------- /examples/template_chatglm2.jinja: -------------------------------------------------------------------------------- 1 | {%- set counter = namespace(index=1) -%} 2 | {%- for message in messages -%} 3 | {%- if message['role'] == 'user' -%} 4 | {{- '[Round ' + counter.index|string + ']\n\n问:' + message['content'] -}} 5 | {%- set counter.index = counter.index + 1 -%} 6 | {%- endif -%} 7 | {%- if message['role'] == 'assistant' -%} 8 | {{- '\n\n答:' + message['content'] -}} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n\n' -}} 11 | {%- endif -%} 12 | {%- endif -%} 13 | {%- endfor -%} 14 | 15 | 16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 17 | {{- '\n\n答:' -}} 18 | {%- endif -%} -------------------------------------------------------------------------------- /examples/template_chatml.jinja: -------------------------------------------------------------------------------- 1 | {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} 2 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} -------------------------------------------------------------------------------- /examples/template_falcon.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {%- if message['role'] == 'user' -%} 3 | {{- 'User: ' + message['content'] -}} 4 | {%- elif message['role'] == 'assistant' -%} 5 | {{- 'Assistant: ' + message['content'] -}} 6 | {%- endif -%} 7 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 8 | {{- '\n' -}} 9 | {%- endif -%} 10 | {%- endfor -%} 11 | 12 | 13 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 14 | {{- 'Assistant:' -}} 15 | {% endif %} -------------------------------------------------------------------------------- /examples/template_falcon_180b.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {%- if message['role'] == 'system' -%} 3 | {{- 'System: ' + message['content'] -}} 4 | {%- elif message['role'] == 'user' -%} 5 | {{- 'User: ' + message['content'] -}} 6 | {%- elif message['role'] == 'assistant' -%} 7 | {{- 'Falcon: ' + message['content'] -}} 8 | {%- endif -%} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n' -}} 11 | {%- endif -%} 12 | {%- endfor -%} 13 | 14 | 15 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 16 | {{- 'Falcon:' -}} 17 | {% endif %} -------------------------------------------------------------------------------- /examples/template_teleflm.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages %} 2 | {%- if message['role'] == 'user' %} 3 | {{- '<_user>' + message['content']|trim }} 4 | {%- elif message['role'] == 'system' %} 5 | {{- '<_system>' + message['content']|trim }} 6 | {%- elif 
message['role'] == 'assistant' %} 7 | {{- '<_bot>' + message['content'] }} 8 | {%- endif %} 9 | {%- endfor %} 10 | {%- if add_generation_prompt %} 11 | {{- '<_bot>' }} 12 | {%- endif %} 13 | -------------------------------------------------------------------------------- /examples/template_vlm2vec.jinja: -------------------------------------------------------------------------------- 1 | {%- if messages | length > 1 -%} 2 | {{ raise_exception('Embedding models should only embed one message at a time') }} 3 | {%- endif -%} 4 | 5 | {% set vars = namespace(parts=[], next_image_id=1) %} 6 | {%- for message in messages -%} 7 | {%- for content in message['content'] -%} 8 | {%- if content['type'] == 'text' -%} 9 | {%- set vars.parts = vars.parts + [content['text']] %} 10 | {%- elif content['type'] == 'image' -%} 11 | {%- set vars.parts = vars.parts + ['<|image_{i:d}|>'.format(i=vars.next_image_id)] %} 12 | {%- set vars.next_image_id = vars.next_image_id + 1 %} 13 | {%- endif -%} 14 | {%- endfor -%} 15 | {%- endfor -%} 16 | {{ vars.parts | join(' ') }} 17 | -------------------------------------------------------------------------------- /format.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "vLLM linting system has been moved from format.sh to pre-commit hooks." 4 | echo "Please run 'pip install -r requirements/lint.txt', followed by" 5 | echo "'pre-commit install' to install the pre-commit hooks." 6 | echo "Then linters will run automatically before each commit." -------------------------------------------------------------------------------- /requirements/build.txt: -------------------------------------------------------------------------------- 1 | # Should be mirrored in pyproject.toml 2 | cmake>=3.26.1 3 | ninja 4 | packaging>=24.2 5 | setuptools>=77.0.3,<80.0.0 6 | setuptools-scm>=8 7 | torch==2.7.0 8 | wheel 9 | jinja2>=3.1.6 10 | regex 11 | -------------------------------------------------------------------------------- /requirements/cuda.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r common.txt 3 | 4 | numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding 5 | numba == 0.61.2; python_version > '3.9' 6 | 7 | # Dependencies for NVIDIA GPUs 8 | ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1. 9 | torch==2.7.0 10 | torchaudio==2.7.0 11 | # These must be updated alongside torch 12 | torchvision==0.22.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version 13 | # https://github.com/facebookresearch/xformers/releases/tag/v0.0.30 14 | xformers==0.0.30; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7 15 | -------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- 1 | -r lint.txt 2 | -r test.txt 3 | 4 | # Avoid adding requirements directly to this file. 5 | # Instead, modify the two files referenced above. 
6 | -------------------------------------------------------------------------------- /requirements/docs.txt: -------------------------------------------------------------------------------- 1 | mkdocs 2 | mkdocs-api-autonav 3 | mkdocs-material 4 | mkdocstrings-python 5 | mkdocs-gen-files 6 | mkdocs-awesome-nav 7 | python-markdown-math 8 | regex 9 | ruff 10 | -------------------------------------------------------------------------------- /requirements/hpu.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r common.txt 3 | 4 | # Dependencies for HPU code 5 | ray 6 | triton==3.1.0 7 | pandas 8 | numpy==1.26.4 9 | tabulate 10 | setuptools>=77.0.3,<80.0.0 11 | setuptools-scm>=8 12 | vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@f1f6624 13 | -------------------------------------------------------------------------------- /requirements/lint.txt: -------------------------------------------------------------------------------- 1 | # formatting 2 | pre-commit==4.0.1 3 | -------------------------------------------------------------------------------- /requirements/neuron.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r common.txt 3 | 4 | # Dependencies for Neuron devices 5 | packaging>=24.2 6 | setuptools>=77.0.3,<80.0.0 7 | torch-neuronx >= 2.5.0 8 | neuronx-cc>=2.0.0a0 9 | torchvision # Required for Llama3.2 multimodal image preprocessing 10 | -------------------------------------------------------------------------------- /requirements/rocm-build.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r common.txt 3 | 4 | --extra-index-url https://download.pytorch.org/whl/rocm6.2.4 5 | torch==2.7.0 6 | torchvision==0.22.0 7 | torchaudio==2.7.0 8 | 9 | triton==3.2 10 | cmake>=3.26.1,<4 11 | packaging>=24.2 12 | setuptools>=77.0.3,<80.0.0 13 | setuptools-scm>=8 14 | wheel 15 | jinja2>=3.1.6 16 | amdsmi==6.2.4 17 | -------------------------------------------------------------------------------- /requirements/rocm-test.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r common.txt 3 | 4 | # entrypoints test 5 | # librosa==0.10.2.post1 # required by audio tests in entrypoints/openai 6 | audioread==3.0.1 7 | cffi==1.17.1 8 | decorator==5.2.1 9 | lazy-loader==0.4 10 | platformdirs==4.3.6 11 | pooch==1.8.2 12 | #pycparse==2.22 13 | soundfile==0.13.1 14 | soxr==0.5.0.post1 15 | librosa==0.10.2.post1 16 | 17 | # entrypoints test 18 | #vllm[video] # required by entrypoints/openai/test_video.py 19 | decord==0.6.0 20 | 21 | # entrypoints test 22 | #sentence-transformers # required by entrypoints/openai/test_score.py 23 | sentence-transformers==3.4.1 24 | 25 | # Basic Models Test 26 | matplotlib==3.10.3 27 | 28 | # Multi-Modal Models Test (Extended) 3 29 | blobfile==3.0.0 30 | 31 | 32 | -------------------------------------------------------------------------------- /requirements/rocm.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r common.txt 3 | 4 | numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. 
Required for N-gram speculative decoding 5 | numba == 0.61.2; python_version > '3.9' 6 | 7 | # Dependencies for AMD GPUs 8 | boto3 9 | botocore 10 | datasets 11 | ray>=2.10.0,<2.45.0 12 | peft 13 | pytest-asyncio 14 | tensorizer>=2.9.0 15 | setuptools-scm>=8 16 | setuptools>=77.0.3,<80.0.0 17 | runai-model-streamer==0.11.0 18 | runai-model-streamer-s3==0.11.0 19 | -------------------------------------------------------------------------------- /requirements/xpu.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r common.txt 3 | 4 | ray>=2.9 5 | cmake>=3.26.1 6 | packaging>=24.2 7 | setuptools-scm>=8 8 | setuptools>=77.0.3,<80.0.0 9 | wheel 10 | jinja2>=3.1.6 11 | datasets # for benchmark scripts 12 | 13 | torch==2.7.0+xpu 14 | torchaudio 15 | torchvision 16 | pytorch-triton-xpu 17 | --extra-index-url=https://download.pytorch.org/whl/xpu 18 | 19 | # Please refer to the XPU doc: intel-extension-for-pytorch 2.6.10+xpu must be installed manually because some of its dependencies conflict with torch 2.6.0+xpu. 20 | # FIXME: This will be fixed in ipex 2.7; the note is left here for awareness. 21 | intel-extension-for-pytorch==2.7.10+xpu 22 | oneccl_bind_pt==2.7.0+xpu 23 | --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ 24 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/__init__.py -------------------------------------------------------------------------------- /tests/async_engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/async_engine/__init__.py -------------------------------------------------------------------------------- /tests/async_engine/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import pytest 4 | 5 | 6 | @pytest.fixture(scope="function", autouse=True) 7 | def use_v0_only(monkeypatch): 8 | """ 9 | Since this module is V0 only, set VLLM_USE_V1=0 for 10 | all tests in the module. 
11 | """ 12 | monkeypatch.setenv('VLLM_USE_V1', '0') 13 | -------------------------------------------------------------------------------- /tests/basic_correctness/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/basic_correctness/__init__.py -------------------------------------------------------------------------------- /tests/basic_correctness/test_cpu_offload.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from ..utils import compare_two_settings 5 | 6 | 7 | def test_cpu_offload(): 8 | compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [], 9 | ["--cpu-offload-gb", "1"]) 10 | -------------------------------------------------------------------------------- /tests/benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/benchmarks/__init__.py -------------------------------------------------------------------------------- /tests/benchmarks/test_latency_cli.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import subprocess 4 | 5 | import pytest 6 | 7 | MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" 8 | 9 | 10 | @pytest.mark.benchmark 11 | def test_bench_latency(): 12 | command = [ 13 | "vllm", "bench", "latency", "--model", MODEL_NAME, "--input-len", "32", 14 | "--output-len", "1", "--enforce-eager", "--load-format", "dummy" 15 | ] 16 | result = subprocess.run(command, capture_output=True, text=True) 17 | print(result.stdout) 18 | print(result.stderr) 19 | 20 | assert result.returncode == 0, f"Benchmark failed: {result.stderr}" 21 | -------------------------------------------------------------------------------- /tests/benchmarks/test_throughput_cli.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import subprocess 4 | 5 | import pytest 6 | 7 | MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" 8 | 9 | 10 | @pytest.mark.benchmark 11 | def test_bench_throughput(): 12 | command = [ 13 | "vllm", "bench", "throughput", "--model", MODEL_NAME, "--input-len", 14 | "32", "--output-len", "1", "--enforce-eager", "--load-format", "dummy" 15 | ] 16 | result = subprocess.run(command, capture_output=True, text=True) 17 | print(result.stdout) 18 | print(result.stderr) 19 | 20 | assert result.returncode == 0, f"Benchmark failed: {result.stderr}" 21 | -------------------------------------------------------------------------------- /tests/compile/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/compile/__init__.py -------------------------------------------------------------------------------- /tests/compile/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import pytest 4 | 5 | 
6 | # TEST V1: this should be removed. Right now V1 overrides 7 | # all the torch compile logic. We should re-enable this 8 | # as we add torch compile support back to V1. 9 | @pytest.fixture(scope="function", autouse=True) 10 | def use_v0_only(monkeypatch): 11 | """ 12 | Since this module is V0 only, set VLLM_USE_V1=0 for 13 | all tests in the module. 14 | """ 15 | monkeypatch.setenv('VLLM_USE_V1', '0') 16 | -------------------------------------------------------------------------------- /tests/compile/piecewise/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/compile/piecewise/__init__.py -------------------------------------------------------------------------------- /tests/config/test_config.yaml: -------------------------------------------------------------------------------- 1 | port: 12312 2 | served_model_name: mymodel 3 | tensor_parallel_size: 2 4 | trust_remote_code: true 5 | multi_step_stream_outputs: false 6 | -------------------------------------------------------------------------------- /tests/config/test_config_with_model.yaml: -------------------------------------------------------------------------------- 1 | # Same as test_config.yaml but with model specified 2 | model: config-model 3 | port: 12312 4 | served_model_name: mymodel 5 | tensor_parallel_size: 2 6 | trust_remote_code: true 7 | multi_step_stream_outputs: false 8 | -------------------------------------------------------------------------------- /tests/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/core/__init__.py -------------------------------------------------------------------------------- /tests/core/block/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/core/block/__init__.py -------------------------------------------------------------------------------- /tests/core/block/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import pytest 5 | 6 | 7 | @pytest.fixture() 8 | def should_do_global_cleanup_after_test() -> bool: 9 | """Disable the global cleanup fixture for tests in this directory. This 10 | provides a ~10x speedup for unit tests that don't load a model to GPU. 11 | 12 | This requires that tests in this directory clean up after themselves if they 13 | use the GPU. 
14 | """ 15 | return False 16 | -------------------------------------------------------------------------------- /tests/core/block/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/core/block/e2e/__init__.py -------------------------------------------------------------------------------- /tests/core/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import pytest 4 | 5 | 6 | @pytest.fixture(scope="function", autouse=True) 7 | def use_v0_only(monkeypatch): 8 | """ 9 | Since this module is V0 only, set VLLM_USE_V1=0 for 10 | all tests in the module. 11 | """ 12 | monkeypatch.setenv('VLLM_USE_V1', '0') 13 | -------------------------------------------------------------------------------- /tests/detokenizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/detokenizer/__init__.py -------------------------------------------------------------------------------- /tests/detokenizer/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import pytest 4 | 5 | 6 | @pytest.fixture(autouse=True) 7 | def v1(run_with_both_engines): 8 | # Simple autouse wrapper to run both engines for each test 9 | # This can be promoted up to conftest.py to run for every 10 | # test in a package 11 | pass 12 | -------------------------------------------------------------------------------- /tests/distributed/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/distributed/__init__.py -------------------------------------------------------------------------------- /tests/distributed/test_distributed_oot.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from ..entrypoints.openai.test_oot_registration import ( 5 | run_and_test_dummy_opt_api_server) 6 | 7 | 8 | def test_distributed_oot(dummy_opt_path: str): 9 | run_and_test_dummy_opt_api_server(dummy_opt_path, tp=2) 10 | -------------------------------------------------------------------------------- /tests/encoder_decoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/encoder_decoder/__init__.py -------------------------------------------------------------------------------- /tests/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/engine/__init__.py -------------------------------------------------------------------------------- /tests/engine/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to 
the vLLM project 3 | import pytest 4 | 5 | 6 | @pytest.fixture(scope="function", autouse=True) 7 | def use_v0_only(monkeypatch): 8 | """ 9 | Since this module is V0 only, set VLLM_USE_V1=0 for 10 | all tests in the module. 11 | """ 12 | monkeypatch.setenv('VLLM_USE_V1', '0') 13 | -------------------------------------------------------------------------------- /tests/entrypoints/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/entrypoints/__init__.py -------------------------------------------------------------------------------- /tests/entrypoints/llm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/entrypoints/llm/__init__.py -------------------------------------------------------------------------------- /tests/entrypoints/llm/test_prompt_validation.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import pytest 5 | 6 | from vllm import LLM 7 | 8 | 9 | @pytest.fixture(autouse=True) 10 | def v1(run_with_both_engines): 11 | # Simple autouse wrapper to run both engines for each test 12 | # This can be promoted up to conftest.py to run for every 13 | # test in a package 14 | pass 15 | 16 | 17 | def test_empty_prompt(): 18 | llm = LLM(model="openai-community/gpt2", enforce_eager=True) 19 | with pytest.raises(ValueError, match='decoder prompt cannot be empty'): 20 | llm.generate([""]) 21 | 22 | 23 | @pytest.mark.skip_v1 24 | def test_out_of_vocab_token(): 25 | llm = LLM(model="openai-community/gpt2", enforce_eager=True) 26 | with pytest.raises(ValueError, match='out of vocabulary'): 27 | llm.generate({"prompt_token_ids": [999999]}) 28 | -------------------------------------------------------------------------------- /tests/entrypoints/offline_mode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/entrypoints/offline_mode/__init__.py -------------------------------------------------------------------------------- /tests/entrypoints/openai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/entrypoints/openai/__init__.py -------------------------------------------------------------------------------- /tests/entrypoints/openai/correctness/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/entrypoints/openai/correctness/__init__.py -------------------------------------------------------------------------------- /tests/entrypoints/openai/tool_parsers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/entrypoints/openai/tool_parsers/__init__.py -------------------------------------------------------------------------------- /tests/fastsafetensors_loader/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/fastsafetensors_loader/__init__.py -------------------------------------------------------------------------------- /tests/fastsafetensors_loader/test_fastsafetensors_loader.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm import SamplingParams 5 | from vllm.config import LoadFormat 6 | 7 | test_model = "openai-community/gpt2" 8 | 9 | prompts = [ 10 | "Hello, my name is", 11 | "The president of the United States is", 12 | "The capital of France is", 13 | "The future of AI is", 14 | ] 15 | # Create a sampling params object. 16 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0) 17 | 18 | 19 | def test_model_loader_download_files(vllm_runner): 20 | with vllm_runner(test_model, 21 | load_format=LoadFormat.FASTSAFETENSORS) as llm: 22 | deserialized_outputs = llm.generate(prompts, sampling_params) 23 | assert deserialized_outputs 24 | -------------------------------------------------------------------------------- /tests/kernels/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/kernels/__init__.py -------------------------------------------------------------------------------- /tests/kernels/allclose_default.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import torch 5 | 6 | # Reference default values of atol and rtol are from 7 | # https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67 8 | default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5} 9 | default_rtol = { 10 | torch.float16: 1e-3, 11 | torch.bfloat16: 1.6e-2, 12 | torch.float: 1.3e-6 13 | } 14 | 15 | 16 | def get_default_atol(output) -> float: 17 | return default_atol[output.dtype] 18 | 19 | 20 | def get_default_rtol(output) -> float: 21 | return default_rtol[output.dtype] 22 | -------------------------------------------------------------------------------- /tests/kernels/attention/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import pytest 5 | 6 | from vllm.utils import (create_kv_caches_with_random, 7 | create_kv_caches_with_random_flash) 8 | 9 | 10 | @pytest.fixture() 11 | def kv_cache_factory(): 12 | return create_kv_caches_with_random 13 | 14 | 15 | @pytest.fixture() 16 | def kv_cache_factory_flashinfer(): 17 | return create_kv_caches_with_random_flash 18 | -------------------------------------------------------------------------------- /tests/kernels/core/test_opcheck.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | """ 4 | Tests for miscellaneous utilities 5 | """ 6 | 7 | import torch 8 | 9 | from tests.kernels.utils import opcheck 10 | 11 | 12 | def test_convert_fp8_opcheck(): 13 | data = torch.randn((256, 256), dtype=torch.float32, device="cuda") 14 | result = 
torch.empty_like(data, dtype=torch.float8_e4m3fn) 15 | opcheck(torch.ops._C_cache_ops.convert_fp8, (result, data, 1.0, "fp8")) 16 | 17 | 18 | # TODO: Add this back, currently fails with 19 | # csrc/cuda_utils_kernels.cu:15 'invalid argument' 20 | # @pytest.mark.skipif(not current_platform.is_cuda(), 21 | # reason="Only supported for CUDA") 22 | # def test_cuda_utils_opcheck(): 23 | # opcheck(torch.ops._C_cuda_utils.get_device_attribute, (0, 0)) 24 | # opcheck( 25 | # torch.ops._C_cuda_utils. 26 | # get_max_shared_memory_per_block_device_attribute, (0, )) 27 | -------------------------------------------------------------------------------- /tests/kernels/core/test_permute_cols.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import pytest 5 | import torch 6 | 7 | from tests.kernels.utils import opcheck 8 | from vllm._custom_ops import permute_cols 9 | 10 | 11 | @pytest.mark.parametrize('shape', [(1, 512), (544, 4096), (67, 8192)]) 12 | @pytest.mark.parametrize('dtype', [torch.bfloat16, torch.float16]) 13 | def test_permute_cols(shape, dtype): 14 | x = torch.randn(shape, dtype=dtype).cuda() 15 | perm = torch.randperm(x.shape[1]).to(torch.int).cuda() 16 | opcheck(torch.ops._C.permute_cols, (x, perm)) 17 | y = permute_cols(x, perm) 18 | torch.testing.assert_close(y, x[:, perm]) -------------------------------------------------------------------------------- /tests/kernels/moe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/kernels/moe/__init__.py -------------------------------------------------------------------------------- /tests/kv_transfer/test_lookup_buffer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | RANK=0 python3 test_lookup_buffer.py & 3 | PID0=$! 4 | RANK=1 python3 test_lookup_buffer.py & 5 | PID1=$! 6 | 7 | wait $PID0 8 | wait $PID1 9 | -------------------------------------------------------------------------------- /tests/kv_transfer/test_send_recv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RANK=0 python3 test_send_recv.py & 4 | PID0=$! 5 | RANK=1 python3 test_send_recv.py & 6 | PID1=$! 
7 | 8 | wait $PID0 9 | wait $PID1 10 | -------------------------------------------------------------------------------- /tests/lora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/lora/__init__.py -------------------------------------------------------------------------------- /tests/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/metrics/__init__.py -------------------------------------------------------------------------------- /tests/mistral_tool_use/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/mistral_tool_use/__init__.py -------------------------------------------------------------------------------- /tests/model_executor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/model_executor/__init__.py -------------------------------------------------------------------------------- /tests/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/__init__.py -------------------------------------------------------------------------------- /tests/models/language/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/language/__init__.py -------------------------------------------------------------------------------- /tests/models/language/generation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/language/generation/__init__.py -------------------------------------------------------------------------------- /tests/models/language/pooling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/language/pooling/__init__.py -------------------------------------------------------------------------------- /tests/models/multimodal/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/multimodal/__init__.py -------------------------------------------------------------------------------- /tests/models/multimodal/generation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/multimodal/generation/__init__.py -------------------------------------------------------------------------------- /tests/models/multimodal/generation/vlm_utils/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/multimodal/generation/vlm_utils/__init__.py -------------------------------------------------------------------------------- /tests/models/multimodal/pooling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/multimodal/pooling/__init__.py -------------------------------------------------------------------------------- /tests/models/multimodal/processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/multimodal/processing/__init__.py -------------------------------------------------------------------------------- /tests/models/quantization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/quantization/__init__.py -------------------------------------------------------------------------------- /tests/mq_llm_engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/mq_llm_engine/__init__.py -------------------------------------------------------------------------------- /tests/mq_llm_engine/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import pytest 4 | 5 | 6 | @pytest.fixture(scope="function", autouse=True) 7 | def use_v0_only(monkeypatch): 8 | """ 9 | Since this module is V0 only, set VLLM_USE_V1=0 for 10 | all tests in the module. 
11 | """ 12 | monkeypatch.setenv('VLLM_USE_V1', '0') 13 | -------------------------------------------------------------------------------- /tests/multi_step/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/multi_step/__init__.py -------------------------------------------------------------------------------- /tests/multimodal/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/multimodal/__init__.py -------------------------------------------------------------------------------- /tests/multimodal/assets/image1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/multimodal/assets/image1.png -------------------------------------------------------------------------------- /tests/multimodal/assets/image2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/multimodal/assets/image2.png -------------------------------------------------------------------------------- /tests/multimodal/assets/rgba.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/multimodal/assets/rgba.png -------------------------------------------------------------------------------- /tests/multimodal/utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import numpy as np 5 | from PIL import Image 6 | 7 | 8 | def random_image(rng: np.random.RandomState, min_wh: int, max_wh: int): 9 | w, h = rng.randint(min_wh, max_wh, size=(2, )) 10 | arr = rng.randint(0, 255, size=(w, h, 3), dtype=np.uint8) 11 | return Image.fromarray(arr) 12 | 13 | 14 | def random_video( 15 | rng: np.random.RandomState, 16 | min_frames: int, 17 | max_frames: int, 18 | min_wh: int, 19 | max_wh: int, 20 | ): 21 | num_frames = rng.randint(min_frames, max_frames) 22 | w, h = rng.randint(min_wh, max_wh, size=(2, )) 23 | return rng.randint(0, 255, size=(num_frames, w, h, 3), dtype=np.uint8) 24 | 25 | 26 | def random_audio( 27 | rng: np.random.RandomState, 28 | min_len: int, 29 | max_len: int, 30 | sr: int, 31 | ): 32 | audio_len = rng.randint(min_len, max_len) 33 | return rng.rand(audio_len), sr 34 | -------------------------------------------------------------------------------- /tests/neuron/1_core/test_neuron_quant.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | from vllm.model_executor.layers.quantization.neuron_quant import ( 4 | NeuronQuantConfig) 5 | 6 | 7 | def test_get_supported_act_dtypes(): 8 | neuron_quant_config = NeuronQuantConfig() 9 | supported_act_dtypes = neuron_quant_config.get_supported_act_dtypes() 10 | target_list = ["any_dtype1", "any_dtype2"] 11 | for dtype in target_list: 12 | assert dtype in supported_act_dtypes 13 | -------------------------------------------------------------------------------- 
/tests/plugins/lora_resolvers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/plugins/lora_resolvers/__init__.py -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_model/setup.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from setuptools import setup 5 | 6 | setup(name='vllm_add_dummy_model', 7 | version='0.1', 8 | packages=['vllm_add_dummy_model'], 9 | entry_points={ 10 | 'vllm.general_plugins': 11 | ["register_dummy_model = vllm_add_dummy_model:register"] 12 | }) 13 | -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm import ModelRegistry 5 | 6 | 7 | def register(): 8 | # Test directly passing the model 9 | from .my_opt import MyOPTForCausalLM 10 | 11 | if "MyOPTForCausalLM" not in ModelRegistry.get_supported_archs(): 12 | ModelRegistry.register_model("MyOPTForCausalLM", MyOPTForCausalLM) 13 | 14 | # Test passing lazy model 15 | if "MyGemma2Embedding" not in ModelRegistry.get_supported_archs(): 16 | ModelRegistry.register_model( 17 | "MyGemma2Embedding", 18 | "vllm_add_dummy_model.my_gemma_embedding:MyGemma2Embedding", 19 | ) 20 | 21 | if "MyLlava" not in ModelRegistry.get_supported_archs(): 22 | ModelRegistry.register_model("MyLlava", 23 | "vllm_add_dummy_model.my_llava:MyLlava") 24 | -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from typing import Optional 5 | 6 | import torch 7 | 8 | from vllm.model_executor.models.opt import OPTForCausalLM 9 | from vllm.model_executor.sampling_metadata import SamplingMetadata 10 | 11 | 12 | class MyOPTForCausalLM(OPTForCausalLM): 13 | 14 | def compute_logits( 15 | self, hidden_states: torch.Tensor, 16 | sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]: 17 | # this dummy model always predicts the first token 18 | logits = super().compute_logits(hidden_states, sampling_metadata) 19 | if logits is not None: 20 | logits.zero_() 21 | logits[:, 0] += 1.0 22 | return logits 23 | -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_platform/setup.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from setuptools import setup 5 | 6 | setup( 7 | name='vllm_add_dummy_platform', 8 | version='0.1', 9 | packages=['vllm_add_dummy_platform'], 10 | entry_points={ 11 | 'vllm.platform_plugins': [ 12 | "dummy_platform_plugin = vllm_add_dummy_platform:dummy_platform_plugin" # noqa 13 | ] 14 | }) 15 | -------------------------------------------------------------------------------- 
/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from typing import Optional 5 | 6 | 7 | def dummy_platform_plugin() -> Optional[str]: 8 | return "vllm_add_dummy_platform.dummy_platform.DummyPlatform" 9 | -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.attention.backends.flash_attn import FlashAttentionBackend 5 | 6 | 7 | class DummyAttentionBackend(FlashAttentionBackend): 8 | 9 | @staticmethod 10 | def get_name() -> str: 11 | return "Dummy_Backend" 12 | -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.platforms.cuda import CudaPlatform 5 | 6 | 7 | class DummyPlatform(CudaPlatform): 8 | device_name = "DummyDevice" 9 | 10 | def get_attn_backend_cls(self, backend_name, head_size, dtype, 11 | kv_cache_dtype, block_size, use_v1, use_mla): 12 | return "vllm_add_dummy_platform.dummy_attention_backend.DummyAttentionBackend" # noqa E501 13 | -------------------------------------------------------------------------------- /tests/plugins_tests/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import pytest 4 | 5 | 6 | @pytest.fixture(scope="function", autouse=True) 7 | def use_v0_only(monkeypatch): 8 | """ 9 | Since this module is V0 only, set VLLM_USE_V1=0 for 10 | all tests in the module. 11 | """ 12 | monkeypatch.setenv('VLLM_USE_V1', '0') -------------------------------------------------------------------------------- /tests/prefix_caching/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/prefix_caching/__init__.py -------------------------------------------------------------------------------- /tests/prompts/example.txt: -------------------------------------------------------------------------------- 1 | vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. 2 | Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020. 3 | Compare and contrast artificial intelligence with human intelligence in terms of processing information. 4 | Describe the basic components of a neural network and how it can be trained. 5 | Write a short story about a robot that dreams for the first time. 6 | Analyze the impact of the COVID-19 pandemic on global economic structures and future business models. 7 | Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies. 
8 | Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.' 9 | -------------------------------------------------------------------------------- /tests/quantization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/quantization/__init__.py -------------------------------------------------------------------------------- /tests/quantization/utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.model_executor.layers.quantization import get_quantization_config 5 | from vllm.platforms import current_platform 6 | 7 | 8 | def is_quant_method_supported(quant_method: str) -> bool: 9 | # Currently, all quantization methods require Nvidia or AMD GPUs 10 | if not (current_platform.is_cuda() or current_platform.is_rocm()): 11 | return False 12 | 13 | capability = current_platform.get_device_capability() 14 | assert capability is not None 15 | 16 | min_capability = get_quantization_config(quant_method).get_min_capability() 17 | 18 | return capability.to_int() >= min_capability 19 | -------------------------------------------------------------------------------- /tests/reasoning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/reasoning/__init__.py -------------------------------------------------------------------------------- /tests/runai_model_streamer_test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/runai_model_streamer_test/__init__.py -------------------------------------------------------------------------------- /tests/samplers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/samplers/__init__.py -------------------------------------------------------------------------------- /tests/spec_decode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/spec_decode/__init__.py -------------------------------------------------------------------------------- /tests/spec_decode/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import pytest 4 | 5 | 6 | @pytest.fixture(scope="function", autouse=True) 7 | def use_v0_only(monkeypatch): 8 | """ 9 | Since this module is V0 only, set VLLM_USE_V1=0 for 10 | all tests in the module. 
11 | """ 12 | monkeypatch.setenv('VLLM_USE_V1', '0') 13 | -------------------------------------------------------------------------------- /tests/spec_decode/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/spec_decode/e2e/__init__.py -------------------------------------------------------------------------------- /tests/standalone_tests/python_only_compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script tests if the python only compilation works correctly 3 | # for users who do not have any compilers installed on their system 4 | 5 | set -e 6 | set -x 7 | 8 | cd /vllm-workspace/ 9 | 10 | # uninstall vllm 11 | pip3 uninstall -y vllm 12 | # restore the original files 13 | mv test_docs/vllm ./vllm 14 | 15 | # remove all compilers 16 | apt remove --purge build-essential -y 17 | apt autoremove -y 18 | 19 | echo 'import os; os.system("touch /tmp/changed.file")' >> vllm/__init__.py 20 | 21 | VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL=1 VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e . 22 | 23 | # Run the script 24 | python3 -c 'import vllm' 25 | 26 | # Check if the clangd log file was created 27 | if [ ! -f /tmp/changed.file ]; then 28 | echo "changed.file was not created, python only compilation failed" 29 | exit 1 30 | fi 31 | -------------------------------------------------------------------------------- /tests/tensorizer_loader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/tensorizer_loader/__init__.py -------------------------------------------------------------------------------- /tests/tensorizer_loader/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import pytest 4 | 5 | from vllm.distributed import cleanup_dist_env_and_memory 6 | from vllm.model_executor.model_loader.tensorizer import TensorizerConfig 7 | 8 | 9 | @pytest.fixture(autouse=True) 10 | def cleanup(): 11 | cleanup_dist_env_and_memory(shutdown_ray=True) 12 | 13 | 14 | @pytest.fixture(autouse=True) 15 | def tensorizer_config(): 16 | config = TensorizerConfig(tensorizer_uri="vllm") 17 | return config 18 | -------------------------------------------------------------------------------- /tests/test_embedded_commit.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import vllm 5 | 6 | 7 | def test_embedded_commit_defined(): 8 | assert hasattr(vllm, "__version__") 9 | assert hasattr(vllm, "__version_tuple__") 10 | assert vllm.__version__ != "dev" 11 | assert vllm.__version_tuple__ != (0, 0, "dev") 12 | -------------------------------------------------------------------------------- /tests/test_outputs.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.outputs import RequestOutput 5 | 6 | 7 | def test_request_output_forward_compatible(): 8 | output = RequestOutput(request_id="test_request_id", 9 | prompt="test 
prompt", 10 | prompt_token_ids=[1, 2, 3], 11 | prompt_logprobs=None, 12 | outputs=[], 13 | finished=False, 14 | example_arg_added_in_new_version="some_value") 15 | assert output is not None 16 | -------------------------------------------------------------------------------- /tests/test_seed_behavior.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import random 4 | 5 | import numpy as np 6 | import torch 7 | 8 | from vllm.platforms.interface import Platform 9 | 10 | 11 | def test_seed_behavior(): 12 | # Test with a specific seed 13 | Platform.seed_everything(42) 14 | random_value_1 = random.randint(0, 100) 15 | np_random_value_1 = np.random.randint(0, 100) 16 | torch_random_value_1 = torch.randint(0, 100, (1, )).item() 17 | 18 | Platform.seed_everything(42) 19 | random_value_2 = random.randint(0, 100) 20 | np_random_value_2 = np.random.randint(0, 100) 21 | torch_random_value_2 = torch.randint(0, 100, (1, )).item() 22 | 23 | assert random_value_1 == random_value_2 24 | assert np_random_value_1 == np_random_value_2 25 | assert torch_random_value_1 == torch_random_value_2 26 | -------------------------------------------------------------------------------- /tests/tokenization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/tokenization/__init__.py -------------------------------------------------------------------------------- /tests/tokenization/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import pytest 5 | from transformers import PreTrainedTokenizerBase 6 | 7 | from vllm.transformers_utils.tokenizer import get_tokenizer 8 | 9 | TOKENIZER_NAMES = [ 10 | "facebook/opt-125m", 11 | "gpt2", 12 | ] 13 | 14 | 15 | @pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES) 16 | def test_tokenizer_revision(tokenizer_name: str): 17 | # Assume that "main" branch always exists 18 | tokenizer = get_tokenizer(tokenizer_name, revision="main") 19 | assert isinstance(tokenizer, PreTrainedTokenizerBase) 20 | 21 | # Assume that "never" branch always does not exist 22 | with pytest.raises(OSError, match='not a valid git identifier'): 23 | get_tokenizer(tokenizer_name, revision="never") 24 | -------------------------------------------------------------------------------- /tests/tool_use/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/tool_use/__init__.py -------------------------------------------------------------------------------- /tests/tpu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/tpu/__init__.py -------------------------------------------------------------------------------- /tests/tpu/lora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/tpu/lora/__init__.py 
-------------------------------------------------------------------------------- /tests/tracing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/tracing/__init__.py -------------------------------------------------------------------------------- /tests/v1/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/__init__.py -------------------------------------------------------------------------------- /tests/v1/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/e2e/__init__.py -------------------------------------------------------------------------------- /tests/v1/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/engine/__init__.py -------------------------------------------------------------------------------- /tests/v1/entrypoints/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/entrypoints/__init__.py -------------------------------------------------------------------------------- /tests/v1/entrypoints/llm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/entrypoints/llm/__init__.py -------------------------------------------------------------------------------- /tests/v1/kv_connector/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/kv_connector/unit/__init__.py -------------------------------------------------------------------------------- /tests/v1/sample/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/sample/__init__.py -------------------------------------------------------------------------------- /tests/v1/shutdown/utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | """Shutdown test utils""" 4 | 5 | SHUTDOWN_TEST_TIMEOUT_SEC = 120 6 | SHUTDOWN_TEST_THRESHOLD_BYTES = 2 * 2**30 7 | -------------------------------------------------------------------------------- /tests/v1/structured_output/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/structured_output/__init__.py -------------------------------------------------------------------------------- /tests/v1/tpu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/tpu/__init__.py 
-------------------------------------------------------------------------------- /tests/v1/tpu/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/tpu/worker/__init__.py -------------------------------------------------------------------------------- /tests/v1/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/worker/__init__.py -------------------------------------------------------------------------------- /tests/vllm_test_utils/setup.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from setuptools import setup 5 | 6 | setup( 7 | name='vllm_test_utils', 8 | version='0.1', 9 | packages=['vllm_test_utils'], 10 | ) 11 | -------------------------------------------------------------------------------- /tests/vllm_test_utils/vllm_test_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | """ 4 | vllm_utils is a package for vLLM testing utilities. 5 | It does not import any vLLM modules. 6 | """ 7 | 8 | from .blame import BlameResult, blame 9 | from .monitor import MonitoredValues, monitor 10 | 11 | __all__ = ["blame", "BlameResult", "monitor", "MonitoredValues"] 12 | -------------------------------------------------------------------------------- /tests/weight_loading/models-large.txt: -------------------------------------------------------------------------------- 1 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main 2 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main 3 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W8A16-quantized, main 4 | compressed-tensors, nm-testing/test-w4a16-mixtral-actorder-group, main 5 | gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, main 6 | gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, gptq-8bit-128g-actorder_True 7 | awq_marlin, casperhansen/deepseek-coder-v2-instruct-awq, main 8 | compressed-tensors, RedHatAI/Llama-4-Scout-17B-16E-Instruct-quantized.w4a16, main -------------------------------------------------------------------------------- /tests/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/worker/__init__.py -------------------------------------------------------------------------------- /tests/worker/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import pytest 4 | 5 | 6 | @pytest.fixture(scope="function", autouse=True) 7 | def use_v0_only(monkeypatch): 8 | """ 9 | This module tests V0 internals, so set VLLM_USE_V1=0. 
10 | """ 11 | monkeypatch.setenv('VLLM_USE_V1', '0') -------------------------------------------------------------------------------- /tools/check_repo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Checks whether the repo is clean and whether tags are available (necessary to correctly produce vllm version at build time) 3 | 4 | if ! git diff --quiet; then 5 | echo "Repo is dirty" >&2 6 | 7 | exit 1 8 | fi 9 | 10 | if ! git describe --tags; then 11 | echo "No tags are present. Is this a shallow clone? git fetch --unshallow --tags" >&2 12 | 13 | exit 1 14 | fi 15 | -------------------------------------------------------------------------------- /tools/ep_kernels/install_system_drivers.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | # prepare workspace directory 4 | WORKSPACE=$1 5 | if [ -z "$WORKSPACE" ]; then 6 | export WORKSPACE=$(pwd)/ep_kernels_workspace 7 | fi 8 | 9 | if [ ! -d "$WORKSPACE" ]; then 10 | mkdir -p $WORKSPACE 11 | fi 12 | 13 | # build and install gdrcopy driver 14 | pushd $WORKSPACE 15 | cd gdrcopy_src 16 | ./insmod.sh 17 | # run gdrcopy_copybw to test the installation 18 | $WORKSPACE/gdrcopy_install/bin/gdrcopy_copybw 19 | 20 | # turn on IBGDA 21 | echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;"' | tee -a /etc/modprobe.d/nvidia.conf 22 | update-initramfs -u 23 | 24 | echo "Please reboot the system to apply the changes" 25 | -------------------------------------------------------------------------------- /tools/ep_kernels/install_system_libraries.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | # prepare workspace directory 4 | WORKSPACE=$1 5 | if [ -z "$WORKSPACE" ]; then 6 | export WORKSPACE=$(pwd)/ep_kernels_workspace 7 | fi 8 | 9 | if [ ! 
-d "$WORKSPACE" ]; then 10 | mkdir -p $WORKSPACE 11 | fi 12 | 13 | # build and install gdrcopy system packages 14 | pushd $WORKSPACE 15 | cd gdrcopy_src/packages 16 | apt install devscripts -y 17 | CUDA=${CUDA_HOME:-/usr/local/cuda} ./build-deb-packages.sh 18 | dpkg -i *.deb 19 | -------------------------------------------------------------------------------- /tools/mypy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CI=${1:-0} 4 | PYTHON_VERSION=${2:-local} 5 | 6 | if [ "$CI" -eq 1 ]; then 7 | set -e 8 | fi 9 | 10 | if [ $PYTHON_VERSION == "local" ]; then 11 | PYTHON_VERSION=$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') 12 | fi 13 | 14 | run_mypy() { 15 | echo "Running mypy on $1" 16 | if [ "$CI" -eq 1 ] && [ -z "$1" ]; then 17 | mypy --python-version "${PYTHON_VERSION}" "$@" 18 | return 19 | fi 20 | mypy --follow-imports skip --python-version "${PYTHON_VERSION}" "$@" 21 | } 22 | 23 | run_mypy # Note that this is less strict than CI 24 | run_mypy tests 25 | run_mypy vllm/attention 26 | run_mypy vllm/compilation 27 | run_mypy vllm/distributed 28 | run_mypy vllm/engine 29 | run_mypy vllm/executor 30 | run_mypy vllm/inputs 31 | run_mypy vllm/lora 32 | run_mypy vllm/model_executor 33 | run_mypy vllm/plugins 34 | run_mypy vllm/prompt_adapter 35 | run_mypy vllm/spec_decode 36 | run_mypy vllm/worker 37 | run_mypy vllm/v1 38 | -------------------------------------------------------------------------------- /tools/png-lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Ensure that *.excalidraw.png files have the excalidraw metadata 4 | # embedded in them. This ensures they can be loaded back into 5 | # the tool and edited in the future. 6 | 7 | find . -iname '*.excalidraw.png' | while read -r file; do 8 | if git check-ignore -q "$file"; then 9 | continue 10 | fi 11 | if ! grep -q "excalidraw+json" "$file"; then 12 | echo "$file was not exported from excalidraw with 'Embed Scene' enabled." 13 | exit 1 14 | fi 15 | done 16 | -------------------------------------------------------------------------------- /tools/shellcheck.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | scversion="stable" 5 | 6 | if [ -d "shellcheck-${scversion}" ]; then 7 | export PATH="$PATH:$(pwd)/shellcheck-${scversion}" 8 | fi 9 | 10 | if ! [ -x "$(command -v shellcheck)" ]; then 11 | if [ "$(uname -s)" != "Linux" ] || [ "$(uname -m)" != "x86_64" ]; then 12 | echo "Please install shellcheck: https://github.com/koalaman/shellcheck?tab=readme-ov-file#installing" 13 | exit 1 14 | fi 15 | 16 | # automatic local install if linux x86_64 17 | wget -qO- "https://github.com/koalaman/shellcheck/releases/download/${scversion?}/shellcheck-${scversion?}.linux.x86_64.tar.xz" | tar -xJv 18 | export PATH="$PATH:$(pwd)/shellcheck-${scversion}" 19 | fi 20 | 21 | # TODO - fix warnings in .buildkite/scripts/hardware_ci/run-amd-test.sh 22 | find . 
-name "*.sh" ".git" -prune -not -path "./.buildkite/scripts/hardware_ci/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck -s bash "{}"' 23 | -------------------------------------------------------------------------------- /use_existing_torch.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import glob 5 | 6 | requires_files = glob.glob('requirements/*.txt') 7 | requires_files += ["pyproject.toml"] 8 | for file in requires_files: 9 | print(f">>> cleaning {file}") 10 | with open(file) as f: 11 | lines = f.readlines() 12 | if "torch" in "".join(lines).lower(): 13 | print("removed:") 14 | with open(file, 'w') as f: 15 | for line in lines: 16 | if 'torch' not in line.lower(): 17 | f.write(line) 18 | else: 19 | print(line.strip()) 20 | print(f"<<< done cleaning {file}") 21 | print() 22 | -------------------------------------------------------------------------------- /vllm/adapter_commons/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/adapter_commons/__init__.py -------------------------------------------------------------------------------- /vllm/adapter_commons/layers.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from dataclasses import dataclass 5 | 6 | 7 | @dataclass 8 | class AdapterMapping: 9 | # Per every token in input_ids: 10 | index_mapping: tuple[int, ...] 11 | # Per sampled token: 12 | prompt_mapping: tuple[int, ...] 13 | 14 | def __post_init__(self): 15 | self.index_mapping = tuple(self.index_mapping) 16 | self.prompt_mapping = tuple(self.prompt_mapping) -------------------------------------------------------------------------------- /vllm/adapter_commons/request.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from abc import ABC, abstractmethod 5 | 6 | 7 | class AdapterRequest(ABC): 8 | """ 9 | Base class for adapter requests. 
10 | """ 11 | 12 | @property 13 | @abstractmethod 14 | def adapter_id(self) -> int: 15 | raise NotImplementedError 16 | 17 | def __post_init__(self) -> None: 18 | if self.adapter_id < 1: 19 | raise ValueError(f"id must be > 0, got {self.adapter_id}") 20 | 21 | def __eq__(self, value: object) -> bool: 22 | return isinstance( 23 | value, self.__class__) and self.adapter_id == value.adapter_id 24 | 25 | def __hash__(self) -> int: 26 | return hash(self.adapter_id) 27 | -------------------------------------------------------------------------------- /vllm/assets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/assets/__init__.py -------------------------------------------------------------------------------- /vllm/attention/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.attention.backends.abstract import (AttentionBackend, 5 | AttentionMetadata, 6 | AttentionMetadataBuilder, 7 | AttentionState, AttentionType) 8 | from vllm.attention.layer import Attention 9 | from vllm.attention.selector import get_attn_backend 10 | 11 | __all__ = [ 12 | "Attention", 13 | "AttentionBackend", 14 | "AttentionMetadata", 15 | "AttentionType", 16 | "AttentionMetadataBuilder", 17 | "Attention", 18 | "AttentionState", 19 | "get_attn_backend", 20 | ] 21 | -------------------------------------------------------------------------------- /vllm/attention/backends/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/attention/backends/__init__.py -------------------------------------------------------------------------------- /vllm/attention/backends/mla/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/attention/backends/mla/__init__.py -------------------------------------------------------------------------------- /vllm/attention/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/attention/ops/__init__.py -------------------------------------------------------------------------------- /vllm/attention/ops/blocksparse_attention/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/attention/ops/blocksparse_attention/__init__.py -------------------------------------------------------------------------------- /vllm/benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/benchmarks/__init__.py -------------------------------------------------------------------------------- /vllm/compilation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/compilation/__init__.py 
-------------------------------------------------------------------------------- /vllm/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/core/__init__.py -------------------------------------------------------------------------------- /vllm/core/block/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/core/block/__init__.py -------------------------------------------------------------------------------- /vllm/device_allocator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/device_allocator/__init__.py -------------------------------------------------------------------------------- /vllm/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from .communication_op import * 5 | from .parallel_state import * 6 | from .utils import * 7 | -------------------------------------------------------------------------------- /vllm/distributed/device_communicators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/distributed/device_communicators/__init__.py -------------------------------------------------------------------------------- /vllm/distributed/device_communicators/neuron_communicator.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import torch 4 | 5 | from vllm.distributed.device_communicators.base_device_communicator import ( 6 | DeviceCommunicatorBase) 7 | from vllm.platforms import current_platform 8 | 9 | if current_platform.is_neuron(): 10 | import torch_xla.core.xla_model as xm 11 | 12 | 13 | class NeuronCommunicator(DeviceCommunicatorBase): 14 | 15 | def all_reduce(self, x: torch.Tensor) -> torch.Tensor: 16 | return xm.all_reduce(xm.REDUCE_SUM, x) 17 | 18 | def all_gather(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor: 19 | assert dim == -1, "Neuron only supports dim=-1 for all-gather." 
20 | return xm.all_gather(x, dim=dim) 21 | -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.distributed.kv_transfer.kv_transfer_state import ( 5 | KVConnectorBaseType, ensure_kv_transfer_initialized, get_kv_transfer_group, 6 | has_kv_transfer_group, is_v1_kv_transfer_group) 7 | 8 | __all__ = [ 9 | "get_kv_transfer_group", "has_kv_transfer_group", 10 | "is_v1_kv_transfer_group", "ensure_kv_transfer_initialized", 11 | "KVConnectorBaseType" 12 | ] 13 | -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/kv_connector/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/distributed/kv_transfer/kv_connector/__init__.py -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/kv_connector/v1/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | from vllm.distributed.kv_transfer.kv_connector.v1.base import ( 4 | KVConnectorBase_V1, KVConnectorRole) 5 | 6 | __all__ = ["KVConnectorRole", "KVConnectorBase_V1"] 7 | -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/kv_pipe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/distributed/kv_transfer/kv_pipe/__init__.py -------------------------------------------------------------------------------- /vllm/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/engine/__init__.py -------------------------------------------------------------------------------- /vllm/engine/output_processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/engine/output_processor/__init__.py -------------------------------------------------------------------------------- /vllm/entrypoints/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/entrypoints/__init__.py -------------------------------------------------------------------------------- /vllm/entrypoints/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/entrypoints/cli/__init__.py -------------------------------------------------------------------------------- /vllm/entrypoints/cli/benchmark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/entrypoints/cli/benchmark/__init__.py -------------------------------------------------------------------------------- /vllm/entrypoints/cli/benchmark/serve.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import argparse 4 | 5 | from vllm.benchmarks.serve import add_cli_args, main 6 | from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase 7 | from vllm.entrypoints.cli.types import CLISubcommand 8 | 9 | 10 | class BenchmarkServingSubcommand(BenchmarkSubcommandBase): 11 | """ The `serve` subcommand for vllm bench. """ 12 | 13 | def __init__(self): 14 | self.name = "serve" 15 | super().__init__() 16 | 17 | @property 18 | def help(self) -> str: 19 | return "Benchmark the online serving throughput." 20 | 21 | def add_cli_args(self, parser: argparse.ArgumentParser) -> None: 22 | add_cli_args(parser) 23 | 24 | @staticmethod 25 | def cmd(args: argparse.Namespace) -> None: 26 | main(args) 27 | 28 | 29 | def cmd_init() -> list[CLISubcommand]: 30 | return [BenchmarkServingSubcommand()] 31 | -------------------------------------------------------------------------------- /vllm/entrypoints/cli/types.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import argparse 5 | 6 | from vllm.utils import FlexibleArgumentParser 7 | 8 | 9 | class CLISubcommand: 10 | """Base class for CLI argument handlers.""" 11 | 12 | name: str 13 | 14 | @staticmethod 15 | def cmd(args: argparse.Namespace) -> None: 16 | raise NotImplementedError("Subclasses should implement this method") 17 | 18 | def validate(self, args: argparse.Namespace) -> None: 19 | # No validation by default 20 | pass 21 | 22 | def subparser_init( 23 | self, 24 | subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser: 25 | raise NotImplementedError("Subclasses should implement this method") 26 | -------------------------------------------------------------------------------- /vllm/entrypoints/openai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/entrypoints/openai/__init__.py -------------------------------------------------------------------------------- /vllm/executor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/executor/__init__.py -------------------------------------------------------------------------------- 
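Referring back to `vllm/entrypoints/cli/types.py` above: the base class deliberately leaves `cmd` and `subparser_init` unimplemented, and concrete subcommands such as `BenchmarkServingSubcommand` fill them in. A minimal, hypothetical sketch of that pattern follows; the `EchoSubcommand` name and its argument are invented for illustration and are not part of the repository.

```python
# Hypothetical example (not part of the repo) of subclassing CLISubcommand,
# mirroring the pattern used by BenchmarkServingSubcommand above.
import argparse

from vllm.entrypoints.cli.types import CLISubcommand
from vllm.utils import FlexibleArgumentParser


class EchoSubcommand(CLISubcommand):
    """Toy subcommand that prints its argument back to stdout."""

    def __init__(self):
        self.name = "echo"
        super().__init__()

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        print(args.message)

    def subparser_init(
            self,
            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
        parser = subparsers.add_parser(self.name, help="Echo a message.")
        parser.add_argument("message", type=str, help="Text to print back.")
        return parser
```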
/vllm/logging_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.logging_utils.formatter import NewLineFormatter 5 | 6 | __all__ = [ 7 | "NewLineFormatter", 8 | ] 9 | -------------------------------------------------------------------------------- /vllm/logging_utils/formatter.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import logging 5 | 6 | 7 | class NewLineFormatter(logging.Formatter): 8 | """Adds logging prefix to newlines to align multi-line messages.""" 9 | 10 | def __init__(self, fmt, datefmt=None, style="%"): 11 | logging.Formatter.__init__(self, fmt, datefmt, style) 12 | 13 | def format(self, record): 14 | msg = logging.Formatter.format(self, record) 15 | if record.message != "": 16 | parts = msg.split(record.message) 17 | msg = msg.replace("\n", "\r\n" + parts[0]) 18 | return msg 19 | -------------------------------------------------------------------------------- /vllm/lora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/lora/__init__.py -------------------------------------------------------------------------------- /vllm/lora/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/lora/ops/__init__.py -------------------------------------------------------------------------------- /vllm/lora/ops/torch_ops/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.lora.ops.torch_ops.lora_ops import bgmv_expand # noqa: F401 5 | from vllm.lora.ops.torch_ops.lora_ops import (bgmv_expand_slice, bgmv_shrink, 6 | sgmv_expand, sgmv_expand_slice, 7 | sgmv_shrink) 8 | 9 | __all__ = [ 10 | "bgmv_expand", 11 | "bgmv_expand_slice", 12 | "bgmv_shrink", 13 | "sgmv_expand", 14 | "sgmv_expand_slice", 15 | "sgmv_shrink", 16 | ] 17 | -------------------------------------------------------------------------------- /vllm/lora/ops/triton_ops/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.lora.ops.triton_ops.lora_expand_op import lora_expand 5 | from vllm.lora.ops.triton_ops.lora_kernel_metadata import LoRAKernelMeta 6 | from vllm.lora.ops.triton_ops.lora_shrink_op import lora_shrink 7 | 8 | __all__ = [ 9 | "lora_expand", 10 | "lora_shrink", 11 | "LoRAKernelMeta", 12 | ] 13 | -------------------------------------------------------------------------------- /vllm/lora/ops/xla_ops/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.lora.ops.xla_ops.lora_ops import (bgmv_expand, bgmv_expand_slice, 5 | bgmv_shrink) 6 | 7 | __all__ = ["bgmv_expand", "bgmv_expand_slice", "bgmv_shrink"] 8 | 
-------------------------------------------------------------------------------- /vllm/lora/punica_wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase 5 | from vllm.lora.punica_wrapper.punica_selector import get_punica_wrapper 6 | 7 | __all__ = [ 8 | "PunicaWrapperBase", 9 | "get_punica_wrapper", 10 | ] 11 | -------------------------------------------------------------------------------- /vllm/lora/punica_wrapper/punica_selector.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.logger import init_logger 5 | from vllm.platforms import current_platform 6 | from vllm.utils import resolve_obj_by_qualname 7 | 8 | from .punica_base import PunicaWrapperBase 9 | 10 | logger = init_logger(__name__) 11 | 12 | 13 | def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase: 14 | punica_wrapper_qualname = current_platform.get_punica_wrapper() 15 | punica_wrapper_cls = resolve_obj_by_qualname(punica_wrapper_qualname) 16 | punica_wrapper = punica_wrapper_cls(*args, **kwargs) 17 | assert punica_wrapper is not None, \ 18 | "the punica_wrapper_qualname(" + punica_wrapper_qualname + ") is wrong." 19 | logger.info_once("Using %s.", punica_wrapper_qualname.rsplit(".", 1)[1]) 20 | return punica_wrapper 21 | -------------------------------------------------------------------------------- /vllm/model_executor/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.model_executor.parameter import (BasevLLMParameter, 5 | PackedvLLMParameter) 6 | from vllm.model_executor.sampling_metadata import (SamplingMetadata, 7 | SamplingMetadataCache) 8 | from vllm.model_executor.utils import set_random_seed 9 | 10 | __all__ = [ 11 | "SamplingMetadata", 12 | "SamplingMetadataCache", 13 | "set_random_seed", 14 | "BasevLLMParameter", 15 | "PackedvLLMParameter", 16 | ] 17 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/model_executor/layers/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/fused_moe/configs/README: -------------------------------------------------------------------------------- 1 | This directory contains tuned configurations for different settings of the fused_moe kernel. 2 | For different settings of 3 | - E (number of experts) 4 | - N (intermediate size) 5 | - device_name (torch.cuda.get_device_name()) 6 | the JSON file contains a mapping from M (batch size) to the chosen configuration. 7 | 8 | The example configurations provided are for the Mixtral model for TP2 on H100 9 | and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have 10 | N = 7168 and for TP4 we have N = 3584. 11 | 12 | See `benchmark/kernels/benchmark_moe.py` on how to generate these config files. 
13 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/mamba/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/model_executor/layers/mamba/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/mamba/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/model_executor/layers/mamba/ops/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/compressed_tensors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/kernels/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/model_executor/layers/quantization/kernels/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/quark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/model_executor/layers/quantization/quark/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/quark/schemes/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from .quark_scheme import QuarkScheme 5 | from .quark_w4a4_mxfp4 import QuarkW4A4MXFP4 6 | from .quark_w8a8_fp8 import QuarkW8A8Fp8 7 | from .quark_w8a8_int8 import QuarkW8A8Int8 8 | 9 | __all__ = ["QuarkScheme", "QuarkW8A8Fp8", "QuarkW8A8Int8", "QuarkW4A4MXFP4"] 10 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from .layer_utils import replace_parameter, update_tensor_inplace 5 | 6 | __all__ = ['update_tensor_inplace', 'replace_parameter'] 7 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 64, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 32, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | 
"4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 32, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 1, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 64, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 64, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 32, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 64, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 1, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 64, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 32, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 64, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 16, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- 
/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 16, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 64, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 32, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 32, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 32, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 4 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 64, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 8, 24 | "num_stages": 5 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 1, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 1, 7 | 
"num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 1, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 1, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 32, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 64, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 1, 23 | 
"num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 32, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 64, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 16, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 64, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "4": { 3 | "BLOCK_SIZE_M": 16, 4 | "BLOCK_SIZE_N": 32, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "8": { 11 | "BLOCK_SIZE_M": 16, 12 | "BLOCK_SIZE_N": 32, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 32, 15 | "num_warps": 4, 16 | "num_stages": 4 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 64, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 16, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 1, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 
| "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 16, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 64, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 16, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 64, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 64, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 16, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 32, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 32, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 64, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 
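The JSON files above are per-device tuning tables for the block-quantized fp8 GEMM kernels: each file name encodes the GEMM shape (N, K), the GPU, the dtype, and the block shape; the top-level keys are candidate M (token-count) buckets; and each entry holds Triton tile sizes, group size, warp count, and pipeline stages. As an illustration only, here is a minimal sketch of loading such a table and selecting an entry by nearest M bucket; the nearest-key heuristic and the helper names are assumptions for this sketch, not vLLM's actual lookup code.

# Illustrative sketch only: shows how a per-M tuning table like the JSON files
# above could be consulted. The nearest-key selection heuristic is an
# assumption, not necessarily vLLM's exact lookup logic.
import json


def load_block_fp8_config(path: str) -> dict[int, dict]:
    """Load a {M-bucket: kernel-params} table from one of the JSON files above."""
    with open(path) as f:
        raw = json.load(f)
    # JSON keys are strings; convert them to integer M buckets.
    return {int(m): params for m, params in raw.items()}


def pick_kernel_config(table: dict[int, dict], m: int) -> dict:
    """Pick the entry whose M bucket is closest to the actual token count m."""
    best_bucket = min(table, key=lambda bucket: abs(bucket - m))
    return table[best_bucket]


# Example (the path is one of the config files above; the token count is made up):
# table = load_block_fp8_config(
#     "N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json")
# cfg = pick_kernel_config(table, m=3500)
# cfg["BLOCK_SIZE_M"], cfg["num_warps"], cfg["num_stages"]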
| -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 16, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 32, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 64, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 32, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/models/phi3.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | # Adapted 
from llama.py 5 | """Inference-only Phi3 model; the code inherits from llama.py.""" 6 | 7 | from vllm.model_executor.models.llama import LlamaForCausalLM 8 | 9 | 10 | class Phi3ForCausalLM(LlamaForCausalLM): 11 | 12 | packed_modules_mapping = { 13 | "qkv_proj": [ 14 | "qkv_proj", 15 | ], 16 | "gate_up_proj": [ 17 | "gate_up_proj", 18 | ], 19 | } 20 | -------------------------------------------------------------------------------- /vllm/plugins/lora_resolvers/README.md: -------------------------------------------------------------------------------- 1 | # LoRA Resolver Plugins 2 | 3 | This directory contains vLLM general plugins for dynamically discovering and loading LoRA adapters 4 | via the LoRAResolver plugin framework. 5 | 6 | Note that `VLLM_ALLOW_RUNTIME_LORA_UPDATING` must be set to true to allow LoRA resolver plugins 7 | to work, and `VLLM_PLUGINS` must be set to include the desired resolver plugins. 8 | 9 | ## lora_filesystem_resolver 10 | This LoRA resolver is installed with vLLM by default. 11 | To use it, set `VLLM_PLUGIN_LORA_CACHE_DIR` to a local directory. When vLLM receives a request 12 | for a LoRA adapter `foobar` that it does not currently recognize, it looks in that local directory 13 | for a subdirectory named `foobar` containing a LoRA adapter. If such an adapter exists, it 14 | loads the adapter and services the request as normal. The adapter then remains available 15 | for future requests. 16 | -------------------------------------------------------------------------------- /vllm/plugins/lora_resolvers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/plugins/lora_resolvers/__init__.py -------------------------------------------------------------------------------- /vllm/profiler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/profiler/__init__.py -------------------------------------------------------------------------------- /vllm/prompt_adapter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/prompt_adapter/__init__.py -------------------------------------------------------------------------------- /vllm/py.typed: -------------------------------------------------------------------------------- 1 | # Marker file for PEP 561. 2 | # The vllm package uses inline types.
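The lora_filesystem_resolver described in /vllm/plugins/lora_resolvers/README.md above is configured purely through environment variables and a cache-directory layout. Below is a minimal usage sketch under stated assumptions: the adapter name `foobar`, the cache path, the server URL, and the plugin name passed to `VLLM_PLUGINS` ("lora_filesystem_resolver") are illustrative choices for this sketch, not values taken from the repository.

# Minimal sketch of exercising the lora_filesystem_resolver from a client.
# Assumed cache layout (illustrative):
#   /srv/lora-cache/
#     foobar/            <- adapter directory (adapter_config.json, weights, ...)
# Assumed environment for the vLLM server process, exported before it starts:
#   VLLM_ALLOW_RUNTIME_LORA_UPDATING=true
#   VLLM_PLUGINS=lora_filesystem_resolver
#   VLLM_PLUGIN_LORA_CACHE_DIR=/srv/lora-cache
# On the first request naming an unknown adapter, the resolver looks for
# /srv/lora-cache/foobar, loads it, and serves it like any registered LoRA.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
completion = client.completions.create(
    model="foobar",  # an adapter name the server does not yet know triggers the resolver
    prompt="Hello from a dynamically resolved LoRA adapter!",
    max_tokens=32,
)
print(completion.choices[0].text)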
3 | -------------------------------------------------------------------------------- /vllm/reasoning/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager 5 | from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser 6 | from .granite_reasoning_parser import GraniteReasoningParser 7 | from .qwen3_reasoning_parser import Qwen3ReasoningParser 8 | 9 | __all__ = [ 10 | "ReasoningParser", 11 | "ReasoningParserManager", 12 | "DeepSeekR1ReasoningParser", 13 | "GraniteReasoningParser", 14 | "Qwen3ReasoningParser", 15 | ] 16 | -------------------------------------------------------------------------------- /vllm/scripts.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.entrypoints.cli.main import main as vllm_main 5 | from vllm.logger import init_logger 6 | 7 | logger = init_logger(__name__) 8 | 9 | 10 | # Backwards compatibility for the move from vllm.scripts to 11 | # vllm.entrypoints.cli.main 12 | def main(): 13 | logger.warning("vllm.scripts.main() is deprecated. Please re-install " 14 | "vllm or use vllm.entrypoints.cli.main.main() instead.") 15 | vllm_main() 16 | -------------------------------------------------------------------------------- /vllm/spec_decode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/spec_decode/__init__.py -------------------------------------------------------------------------------- /vllm/third_party/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/third_party/__init__.py -------------------------------------------------------------------------------- /vllm/transformers_utils/chat_templates/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | from .registry import get_chat_template_fallback_path 4 | 5 | __all__ = ["get_chat_template_fallback_path"] 6 | -------------------------------------------------------------------------------- /vllm/transformers_utils/chat_templates/template_basic.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {{- message['content'] -}} 3 | {%- endfor -%} 4 | -------------------------------------------------------------------------------- /vllm/transformers_utils/chat_templates/template_blip2.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {%- if message['role'] == 'user' -%} 3 | {{- 'Question: ' + message['content'] + ' ' -}} 4 | {%- elif message['role'] == 'assistant' -%} 5 | {{- 'Answer: ' + message['content'] + ' ' -}} 6 | {%- endif -%} 7 | {%- endfor -%} 8 | 9 | {%- if add_generation_prompt -%} 10 | {{- 'Answer:' -}} 11 | {% endif %} 12 | -------------------------------------------------------------------------------- 
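The chat templates above (template_basic.jinja, template_blip2.jinja, and those that follow) are ordinary Jinja templates rendered against a `messages` list of role/content dicts plus an `add_generation_prompt` flag. As a standalone illustration, the sketch below renders the BLIP-2 template with the jinja2 package; the example messages are invented, and rendering the template directly like this bypasses vLLM's own chat-template machinery.

# Render template_blip2.jinja outside vLLM to see the prompt string it produces.
# This is an illustrative use of the jinja2 library, not vLLM's own code path.
import jinja2

BLIP2_TEMPLATE = (
    "{%- for message in messages -%}"
    "{%- if message['role'] == 'user' -%}"
    "{{- 'Question: ' + message['content'] + ' ' -}}"
    "{%- elif message['role'] == 'assistant' -%}"
    "{{- 'Answer: ' + message['content'] + ' ' -}}"
    "{%- endif -%}"
    "{%- endfor -%}"
    "{%- if add_generation_prompt -%}"
    "{{- 'Answer:' -}}"
    "{% endif %}"
)

env = jinja2.Environment()
template = env.from_string(BLIP2_TEMPLATE)
prompt = template.render(
    messages=[
        {"role": "user", "content": "What is shown in the image?"},
        {"role": "assistant", "content": "A cat on a sofa."},
        {"role": "user", "content": "What color is it?"},
    ],
    add_generation_prompt=True,
)
print(prompt)
# -> Question: What is shown in the image? Answer: A cat on a sofa. Question: What color is it? Answer: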
/vllm/transformers_utils/chat_templates/template_chatml.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {{- '<|im_start|>' + message['role'] + '\n' + message['content'] -}} 3 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 4 | {{- '<|im_end|>' + '\n' -}} 5 | {%- endif -%} 6 | {%- endfor -%} 7 | 8 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 9 | {{- '<|im_start|>assistant\n' -}} 10 | {%- endif -%} 11 | -------------------------------------------------------------------------------- /vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja: -------------------------------------------------------------------------------- 1 | {%- if messages[0]['role'] == 'system' -%} 2 | {%- set system_message = messages[0]['content'] -%} 3 | {%- set messages = messages[1:] -%} 4 | {%- else -%} 5 | {% set system_message = '' -%} 6 | {%- endif -%} 7 | 8 | {{ bos_token + system_message }} 9 | {%- for message in messages -%} 10 | {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%} 11 | {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} 12 | {%- endif -%} 13 | 14 | {%- if message['role'] == 'user' -%} 15 | {{ '<|User|>: ' + message['content'] + '\n\n' }} 16 | {%- elif message['role'] == 'assistant' -%} 17 | {{ '<|Assistant|>: ' + message['content'] + eos_token + '\n\n' }} 18 | {%- endif -%} 19 | {%- endfor -%} 20 | 21 | {%- if add_generation_prompt -%} 22 | {{ '<|Assistant|>: ' }} 23 | {%- endif -%} 24 | -------------------------------------------------------------------------------- /vllm/transformers_utils/chat_templates/template_fuyu.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {{- message['content'] + '\n' -}} 3 | {%- endfor -%} 4 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/h2ovl.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | # Adapted from 5 | # https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/configuration_h2ovl_chat.py 6 | # -------------------------------------------------------- 7 | # H2OVL-Mississippi 8 | # Copyright (c) 2024 H2O.AI 9 | # Licensed under Apache 2.0 License [see LICENSE for details] 10 | # -------------------------------------------------------- 11 | 12 | from .internvl import InternVLChatConfig 13 | 14 | 15 | class H2OVLChatConfig(InternVLChatConfig): 16 | model_type = "h2ovl_chat" 17 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/nvlm_d.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | # Adapted from 5 | # https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py 6 | # -------------------------------------------------------- 7 | # NVLM-D 8 | # Copyright (c) 2024 NVIDIA 9 | # Licensed under Apache 2.0 License [see LICENSE for details] 10 | # -------------------------------------------------------- 11 | from .internvl import InternVLChatConfig 12 | 13 | 14 | class NVLM_D_Config(InternVLChatConfig): 15 | model_type = 'NVLM_D' 16 | 
-------------------------------------------------------------------------------- /vllm/transformers_utils/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.transformers_utils.processors.deepseek_vl2 import ( 5 | DeepseekVLV2Processor) 6 | from vllm.transformers_utils.processors.ovis import OvisProcessor 7 | 8 | __all__ = ["DeepseekVLV2Processor", "OvisProcessor"] 9 | -------------------------------------------------------------------------------- /vllm/transformers_utils/tokenizers/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from .mistral import (MistralTokenizer, maybe_serialize_tool_calls, 5 | truncate_tool_call_ids, validate_request_params) 6 | 7 | __all__ = [ 8 | "MistralTokenizer", "maybe_serialize_tool_calls", "truncate_tool_call_ids", 9 | "validate_request_params" 10 | ] 11 | -------------------------------------------------------------------------------- /vllm/triton_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.triton_utils.importing import (HAS_TRITON, TritonLanguagePlaceholder, 5 | TritonPlaceholder) 6 | 7 | if HAS_TRITON: 8 | import triton 9 | import triton.language as tl 10 | else: 11 | triton = TritonPlaceholder() 12 | tl = TritonLanguagePlaceholder() 13 | 14 | __all__ = ["HAS_TRITON", "triton", "tl"] 15 | -------------------------------------------------------------------------------- /vllm/usage/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/usage/__init__.py -------------------------------------------------------------------------------- /vllm/v1/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/__init__.py -------------------------------------------------------------------------------- /vllm/v1/attention/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/attention/__init__.py -------------------------------------------------------------------------------- /vllm/v1/attention/backends/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/attention/backends/__init__.py -------------------------------------------------------------------------------- /vllm/v1/attention/backends/mla/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/attention/backends/mla/__init__.py -------------------------------------------------------------------------------- /vllm/v1/core/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/core/__init__.py -------------------------------------------------------------------------------- /vllm/v1/core/sched/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/core/sched/__init__.py -------------------------------------------------------------------------------- /vllm/v1/core/sched/utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | from vllm.v1.request import Request, RequestStatus 4 | 5 | 6 | def check_stop(request: Request, max_model_len: int) -> bool: 7 | if (request.num_tokens >= max_model_len 8 | or request.num_output_tokens >= request.max_tokens): 9 | request.status = RequestStatus.FINISHED_LENGTH_CAPPED 10 | return True 11 | 12 | sampling_params = request.sampling_params 13 | last_token_id = request.output_token_ids[-1] 14 | if (not sampling_params.ignore_eos 15 | and last_token_id == request.eos_token_id): 16 | request.status = RequestStatus.FINISHED_STOPPED 17 | return True 18 | 19 | if last_token_id in (sampling_params.stop_token_ids or ()): 20 | request.status = RequestStatus.FINISHED_STOPPED 21 | request.stop_reason = last_token_id 22 | return True 23 | return False 24 | -------------------------------------------------------------------------------- /vllm/v1/engine/exceptions.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | class EngineGenerateError(Exception): 4 | """Raised when an AsyncLLM.generate() call fails. Recoverable.""" 5 | pass 6 | 7 | 8 | class EngineDeadError(Exception): 9 | """Raised when the EngineCore dies. Unrecoverable.""" 10 | 11 | def __init__(self, *args, suppress_context: bool = False, **kwargs): 12 | ENGINE_DEAD_MESSAGE = "EngineCore encountered an issue. See stack trace (above) for the root cause." # noqa: E501 13 | 14 | super().__init__(ENGINE_DEAD_MESSAGE, *args, **kwargs) 15 | # Make the stack trace clearer when used with LLMEngine by 16 | # silencing irrelevant ZMQError.
17 | self.__suppress_context__ = suppress_context 18 | -------------------------------------------------------------------------------- /vllm/v1/executor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/executor/__init__.py -------------------------------------------------------------------------------- /vllm/v1/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/metrics/__init__.py -------------------------------------------------------------------------------- /vllm/v1/sample/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/sample/__init__.py -------------------------------------------------------------------------------- /vllm/v1/sample/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/sample/ops/__init__.py -------------------------------------------------------------------------------- /vllm/v1/sample/tpu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/sample/tpu/__init__.py -------------------------------------------------------------------------------- /vllm/v1/spec_decode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/spec_decode/__init__.py -------------------------------------------------------------------------------- /vllm/v1/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/worker/__init__.py -------------------------------------------------------------------------------- /vllm/vllm_flash_attn/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/vllm_flash_attn/.gitkeep -------------------------------------------------------------------------------- /vllm/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/worker/__init__.py --------------------------------------------------------------------------------