├── .buildkite
├── check-wheel-size.py
├── generate_index.py
├── lm-eval-harness
│ ├── configs
│ │ ├── DeepSeek-V2-Lite-Chat.yaml
│ │ ├── Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
│ │ ├── Meta-Llama-3-70B-Instruct.yaml
│ │ ├── Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
│ │ ├── Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml
│ │ ├── Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
│ │ ├── Meta-Llama-3-8B-Instruct-FP8.yaml
│ │ ├── Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
│ │ ├── Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
│ │ ├── Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
│ │ ├── Meta-Llama-3-8B-Instruct.yaml
│ │ ├── Meta-Llama-3-8B-QQQ.yaml
│ │ ├── Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml
│ │ ├── Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
│ │ ├── Minitron-4B-Base-FP8.yaml
│ │ ├── Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml
│ │ ├── Mixtral-8x7B-Instruct-v0.1-FP8.yaml
│ │ ├── Mixtral-8x7B-Instruct-v0.1.yaml
│ │ ├── Qwen1.5-MoE-W4A16-compressed-tensors.yaml
│ │ ├── Qwen2-1.5B-Instruct-FP8W8.yaml
│ │ ├── Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
│ │ ├── Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
│ │ ├── Qwen2-57B-A14-Instruct.yaml
│ │ ├── Qwen2.5-1.5B-Instruct.yaml
│ │ ├── Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
│ │ ├── SparseLlama3.1_2of4_fp8_compressed.yaml
│ │ ├── models-large.txt
│ │ └── models-small.txt
│ ├── conftest.py
│ ├── run-lm-eval-gsm-hf-baseline.sh
│ ├── run-lm-eval-gsm-vllm-baseline.sh
│ └── test_lm_eval_correctness.py
├── nightly-benchmarks
│ ├── README.md
│ ├── benchmark-pipeline.yaml
│ ├── nightly-annotation.md
│ ├── nightly-descriptions.md
│ ├── nightly-pipeline.yaml
│ ├── performance-benchmarks-descriptions.md
│ ├── scripts
│ │ ├── convert-results-json-to-markdown.py
│ │ ├── download-tokenizer.py
│ │ ├── generate-nightly-markdown.py
│ │ ├── get-lmdeploy-modelname.py
│ │ ├── launch-server.sh
│ │ ├── nightly-annotate.sh
│ │ ├── run-nightly-benchmarks.sh
│ │ ├── run-performance-benchmarks.sh
│ │ ├── summary-nightly-results.py
│ │ └── wait-for-image.sh
│ └── tests
│ │ ├── genai-perf-tests.json
│ │ ├── latency-tests.json
│ │ ├── nightly-tests.json
│ │ ├── serving-tests.json
│ │ └── throughput-tests.json
├── pyproject.toml
├── release-pipeline.yaml
├── scripts
│ ├── annotate-release.sh
│ ├── hardware_ci
│ │ ├── run-amd-test.sh
│ │ ├── run-cpu-test-ppc64le.sh
│ │ ├── run-cpu-test-s390x.sh
│ │ ├── run-cpu-test.sh
│ │ ├── run-gh200-test.sh
│ │ ├── run-hpu-test.sh
│ │ ├── run-neuron-test.sh
│ │ ├── run-tpu-v1-test.sh
│ │ └── run-xpu-test.sh
│ ├── run-benchmarks.sh
│ ├── run-multi-node-test.sh
│ └── upload-wheels.sh
├── test-pipeline.yaml
└── test-template.j2
├── .clang-format
├── .dockerignore
├── .github
├── CODEOWNERS
├── FUNDING.yml
├── ISSUE_TEMPLATE
│ ├── 100-documentation.yml
│ ├── 200-installation.yml
│ ├── 300-usage.yml
│ ├── 400-bug-report.yml
│ ├── 450-ci-failure.yml
│ ├── 500-feature-request.yml
│ ├── 600-new-model.yml
│ ├── 700-performance-discussion.yml
│ ├── 750-RFC.yml
│ └── config.yml
├── PULL_REQUEST_TEMPLATE.md
├── dependabot.yml
├── mergify.yml
├── scripts
│ └── cleanup_pr_body.sh
└── workflows
│ ├── add_label_automerge.yml
│ ├── cleanup_pr_body.yml
│ ├── matchers
│ │ ├── actionlint.json
│ │ └── mypy.json
│ ├── pre-commit.yml
│ ├── publish.yml
│ ├── scripts
│ │ ├── build.sh
│ │ ├── create_release.js
│ │ ├── cuda-install.sh
│ │ ├── env.sh
│ │ └── pytorch-install.sh
│ └── stale.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yaml
├── .shellcheckrc
├── .yapfignore
├── CMakeLists.txt
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── DCO
├── LICENSE
├── MANIFEST.in
├── README.md
├── RELEASE.md
├── ROCm_performance.md
├── SECURITY.md
├── benchmarks
├── P3L.py
├── P3L_mling.py
├── README.md
├── auto_tune.sh
├── backend_request_func.py
├── benchmark_dataset.py
├── benchmark_latency.py
├── benchmark_long_document_qa_throughput.py
├── benchmark_prefix_caching.py
├── benchmark_prioritization.py
├── benchmark_serving.py
├── benchmark_serving_structured_output.py
├── benchmark_throughput.py
├── benchmark_utils.py
├── cutlass_benchmarks
│ ├── sparse_benchmarks.py
│ ├── utils.py
│ ├── w8a8_benchmarks.py
│ └── weight_shapes.py
├── disagg_benchmarks
│ ├── disagg_overhead_benchmark.sh
│ ├── disagg_performance_benchmark.sh
│ ├── disagg_prefill_proxy_server.py
│ ├── round_robin_proxy.py
│ └── visualize_benchmark_results.py
├── fused_kernels
│ └── layernorm_rms_benchmarks.py
├── kernels
│ ├── bench_fp8_gemm.py
│ ├── benchmark_aqlm.py
│ ├── benchmark_bitblas.py
│ ├── benchmark_cutlass_fp4_moe.py
│ ├── benchmark_grouped_gemm_cutlass.py
│ ├── benchmark_layernorm.py
│ ├── benchmark_lora.py
│ ├── benchmark_machete.py
│ ├── benchmark_marlin.py
│ ├── benchmark_moe.py
│ ├── benchmark_moe_permute_unpermute.py
│ ├── benchmark_paged_attention.py
│ ├── benchmark_quant.py
│ ├── benchmark_rmsnorm.py
│ ├── benchmark_rope.py
│ ├── benchmark_shapes.py
│ ├── benchmark_w8a8_block_fp8.py
│ ├── deepgemm
│ │ ├── README.md
│ │ └── benchmark_fp8_block_dense_gemm.py
│ ├── graph_machete_bench.py
│ ├── moe_tune_script.sh
│ ├── requirements.txt
│ ├── utils.py
│ └── weight_shapes.py
├── overheads
│ └── benchmark_hashing.py
├── profiling
│ ├── README.md
│ ├── benchmark_latency.py
│ └── benchmark_throughput.py
├── pyproject.toml
├── run_structured_output_benchmark.sh
├── sonnet.txt
└── structured_schemas
│ └── structured_schema_1.json
├── cmake
├── cpu_extension.cmake
├── external_projects
│ ├── flashmla.cmake
│ └── vllm_flash_attn.cmake
├── hipify.py
└── utils.cmake
├── csrc
├── activation_kernels.cu
├── attention
│ ├── attention_dtypes.h
│ ├── attention_generic.cuh
│ ├── attention_kernels.cuh
│ ├── attention_utils.cuh
│ ├── dtype_bfloat16.cuh
│ ├── dtype_float16.cuh
│ ├── dtype_float32.cuh
│ ├── dtype_fp8.cuh
│ ├── merge_attn_states.cu
│ ├── mla
│ │ ├── cutlass_mla_entry.cu
│ │ └── cutlass_mla_kernels.cu
│ ├── paged_attention_v1.cu
│ ├── paged_attention_v2.cu
│ └── vertical_slash_index.cu
├── cache.h
├── cache_kernels.cu
├── core
│ ├── exception.hpp
│ ├── math.hpp
│ ├── registration.h
│ └── scalar_type.hpp
├── cpu
│ ├── activation.cpp
│ ├── attention.cpp
│ ├── cache.cpp
│ ├── cpu_types.hpp
│ ├── cpu_types_arm.hpp
│ ├── cpu_types_vsx.hpp
│ ├── cpu_types_vxe.hpp
│ ├── cpu_types_x86.hpp
│ ├── dnnl_helper.hpp
│ ├── layernorm.cpp
│ ├── mla_decode.cpp
│ ├── pos_encoding.cpp
│ ├── quant.cpp
│ ├── shm.cpp
│ ├── torch_bindings.cpp
│ └── utils.cpp
├── cuda_compat.h
├── cuda_utils.h
├── cuda_utils_kernels.cu
├── cuda_view.cu
├── cumem_allocator.cpp
├── custom_all_reduce.cu
├── custom_all_reduce.cuh
├── custom_all_reduce_test.cu
├── cutlass_extensions
│ ├── common.cpp
│ ├── common.hpp
│ ├── cute_utils.cuh
│ ├── epilogue
│ │ ├── broadcast_load_epilogue_array_c3x.hpp
│ │ ├── broadcast_load_epilogue_c2x.hpp
│ │ ├── broadcast_load_epilogue_c3x.hpp
│ │ ├── scaled_mm_epilogues_c2x.hpp
│ │ └── scaled_mm_epilogues_c3x.hpp
│ ├── gemm
│ │ ├── collective
│ │ │ ├── collective_builder.hpp
│ │ │ ├── fp8_accumulation.hpp
│ │ │ └── sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
│ │ └── dispatch_policy.hpp
│ ├── torch_utils.hpp
│ ├── vllm_collective_builder.cuh
│ ├── vllm_custom_types.cuh
│ ├── vllm_cutlass_library_extension.py
│ ├── vllm_numeric_conversion.cuh
│ └── vllm_type_utils.cuh
├── dispatch_utils.h
├── layernorm_kernels.cu
├── layernorm_quant_kernels.cu
├── mamba
│ ├── causal_conv1d
│ │ ├── causal_conv1d.cu
│ │ ├── causal_conv1d.h
│ │ └── static_switch.h
│ └── mamba_ssm
│ │ ├── selective_scan.h
│ │ ├── selective_scan_fwd.cu
│ │ └── static_switch.h
├── moe
│ ├── marlin_moe_wna16
│ │ ├── .gitignore
│ │ ├── generate_kernels.py
│ │ ├── kernel.h
│ │ ├── marlin_template.h
│ │ └── ops.cu
│ ├── moe_align_sum_kernels.cu
│ ├── moe_ops.h
│ ├── moe_permute_unpermute_op.cu
│ ├── moe_wna16.cu
│ ├── moe_wna16_utils.h
│ ├── permute_unpermute_kernels
│ │ ├── dispatch.h
│ │ ├── moe_permute_unpermute_kernel.cu
│ │ ├── moe_permute_unpermute_kernel.h
│ │ └── moe_permute_unpermute_kernel.inl
│ ├── topk_softmax_kernels.cu
│ └── torch_bindings.cpp
├── ops.h
├── permute_cols.cu
├── pos_encoding_kernels.cu
├── prepare_inputs
│ ├── advance_step.cu
│ └── advance_step.cuh
├── quantization
│ ├── activation_kernels.cu
│ ├── aqlm
│ │ └── gemm_kernels.cu
│ ├── awq
│ │ ├── dequantize.cuh
│ │ └── gemm_kernels.cu
│ ├── compressed_tensors
│ │ └── int8_quant_kernels.cu
│ ├── cutlass_w8a8
│ │ ├── Epilogues.md
│ │ ├── c3x
│ │ │ ├── cutlass_gemm_caller.cuh
│ │ │ ├── scaled_mm.cuh
│ │ │ ├── scaled_mm_azp_sm90_int8.cu
│ │ │ ├── scaled_mm_blockwise_sm100_fp8.cu
│ │ │ ├── scaled_mm_blockwise_sm100_fp8_dispatch.cuh
│ │ │ ├── scaled_mm_blockwise_sm90_fp8.cu
│ │ │ ├── scaled_mm_blockwise_sm90_fp8_dispatch.cuh
│ │ │ ├── scaled_mm_helper.hpp
│ │ │ ├── scaled_mm_kernels.hpp
│ │ │ ├── scaled_mm_sm100_fp8.cu
│ │ │ ├── scaled_mm_sm100_fp8_dispatch.cuh
│ │ │ ├── scaled_mm_sm90_fp8.cu
│ │ │ ├── scaled_mm_sm90_fp8_dispatch.cuh
│ │ │ ├── scaled_mm_sm90_int8.cu
│ │ │ └── scaled_mm_sm90_int8_dispatch.cuh
│ │ ├── moe
│ │ │ ├── get_group_starts.cuh
│ │ │ ├── grouped_mm_c3x.cu
│ │ │ ├── grouped_mm_c3x.cuh
│ │ │ └── moe_data.cu
│ │ ├── scaled_mm_c2x.cu
│ │ ├── scaled_mm_c2x.cuh
│ │ ├── scaled_mm_c2x_sm75_dispatch.cuh
│ │ ├── scaled_mm_c2x_sm80_dispatch.cuh
│ │ ├── scaled_mm_c2x_sm89_fp8_dispatch.cuh
│ │ ├── scaled_mm_c2x_sm89_int8_dispatch.cuh
│ │ ├── scaled_mm_c3x_sm100.cu
│ │ ├── scaled_mm_c3x_sm90.cu
│ │ └── scaled_mm_entry.cu
│ ├── fp4
│ │ ├── nvfp4_blockwise_moe_kernel.cu
│ │ ├── nvfp4_experts_quant.cu
│ │ ├── nvfp4_quant_entry.cu
│ │ ├── nvfp4_quant_kernels.cu
│ │ ├── nvfp4_scaled_mm_entry.cu
│ │ └── nvfp4_scaled_mm_kernels.cu
│ ├── fp8
│ │ ├── amd
│ │ │ └── quant_utils.cuh
│ │ ├── common.cu
│ │ ├── common.cuh
│ │ └── nvidia
│ │ │ └── quant_utils.cuh
│ ├── fused_kernels
│ │ ├── fused_layernorm_dynamic_per_token_quant.cu
│ │ ├── layernorm_utils.cuh
│ │ └── quant_conversions.cuh
│ ├── gguf
│ │ ├── dequantize.cuh
│ │ ├── ggml-common.h
│ │ ├── gguf_kernel.cu
│ │ ├── mmq.cuh
│ │ ├── mmvq.cuh
│ │ ├── moe.cuh
│ │ ├── moe_vec.cuh
│ │ └── vecdotq.cuh
│ ├── gptq
│ │ ├── compat.cuh
│ │ ├── matrix_view.cuh
│ │ ├── q_gemm.cu
│ │ ├── qdq_2.cuh
│ │ ├── qdq_3.cuh
│ │ ├── qdq_4.cuh
│ │ ├── qdq_8.cuh
│ │ └── qdq_util.cuh
│ ├── gptq_allspark
│ │ ├── allspark_qgemm_w8a16.cu
│ │ ├── allspark_repack.cu
│ │ └── allspark_utils.cuh
│ ├── gptq_marlin
│ │ ├── .gitignore
│ │ ├── awq_marlin_repack.cu
│ │ ├── dequant.h
│ │ ├── generate_kernels.py
│ │ ├── gptq_marlin.cu
│ │ ├── gptq_marlin_repack.cu
│ │ ├── kernel.h
│ │ ├── marlin.cuh
│ │ ├── marlin_dtypes.cuh
│ │ └── marlin_template.h
│ ├── machete
│ │ ├── Readme.md
│ │ ├── generate.py
│ │ ├── machete_collective_builder.cuh
│ │ ├── machete_interleaving_utils.cuh
│ │ ├── machete_mainloop.cuh
│ │ ├── machete_mm_kernel.cuh
│ │ ├── machete_mm_launcher.cuh
│ │ ├── machete_prepack_kernel.cuh
│ │ ├── machete_prepack_launcher.cuh
│ │ ├── machete_prepacked_layout.cuh
│ │ └── machete_pytorch.cu
│ ├── marlin
│ │ ├── dense
│ │ │ ├── LICENSE
│ │ │ ├── common
│ │ │ │ ├── base.h
│ │ │ │ └── mem.h
│ │ │ └── marlin_cuda_kernel.cu
│ │ ├── qqq
│ │ │ └── marlin_qqq_gemm_kernel.cu
│ │ └── sparse
│ │ │ ├── LICENSE
│ │ │ ├── common
│ │ │ │ ├── base.h
│ │ │ │ ├── mem.h
│ │ │ │ └── mma.h
│ │ │ └── marlin_24_cuda_kernel.cu
│ ├── utils.cuh
│ └── vectorization.cuh
├── rocm
│ ├── attention.cu
│ ├── custom.cu
│ ├── fused_kernels.cu
│ ├── ops.h
│ ├── skinny_gemms.cu
│ └── torch_bindings.cpp
├── sampler.cu
├── sparse
│ └── cutlass
│ │ ├── sparse_compressor_c3x.cuh
│ │ ├── sparse_scaled_mm_c3x.cu
│ │ ├── sparse_scaled_mm_c3x.cuh
│ │ └── sparse_scaled_mm_entry.cu
├── torch_bindings.cpp
└── type_convert.cuh
├── docker
├── Dockerfile
├── Dockerfile.arm
├── Dockerfile.cpu
├── Dockerfile.hpu
├── Dockerfile.neuron
├── Dockerfile.nightly_torch
├── Dockerfile.ppc64le
├── Dockerfile.rocm
├── Dockerfile.rocm_base
├── Dockerfile.s390x
├── Dockerfile.tpu
└── Dockerfile.xpu
├── docs
├── .nav.yml
├── README.md
├── api
│ ├── README.md
│ └── vllm
│ │ └── .meta.yml
├── assets
│ ├── contributing
│ │ └── dockerfile-stages-dependency.png
│ ├── deployment
│ │ ├── anything-llm-chat-with-doc.png
│ │ ├── anything-llm-chat-without-doc.png
│ │ ├── anything-llm-provider.png
│ │ ├── anything-llm-upload-doc.png
│ │ ├── architecture_helm_deployment.png
│ │ ├── chatbox-chat.png
│ │ ├── chatbox-settings.png
│ │ ├── dify-chat.png
│ │ ├── dify-create-chatbot.png
│ │ ├── dify-settings.png
│ │ ├── open_webui.png
│ │ └── streamlit-chat.png
│ ├── design
│ │ ├── arch_overview
│ │ │ ├── entrypoints.excalidraw.png
│ │ │ └── llm_engine.excalidraw.png
│ │ ├── hierarchy.png
│ │ └── v1
│ │ │ ├── metrics
│ │ │ │ ├── intervals-1.png
│ │ │ │ ├── intervals-2.png
│ │ │ │ └── intervals-3.png
│ │ │ └── prefix_caching
│ │ │ │ ├── example-time-1.png
│ │ │ │ ├── example-time-3.png
│ │ │ │ ├── example-time-4.png
│ │ │ │ ├── example-time-5.png
│ │ │ │ ├── example-time-6.png
│ │ │ │ ├── example-time-7.png
│ │ │ │ ├── free.png
│ │ │ │ └── overview.png
│ ├── features
│ │ └── disagg_prefill
│ │ │ ├── abstraction.jpg
│ │ │ └── overview.jpg
│ ├── kernel
│ │ ├── k_vecs.png
│ │ ├── key.png
│ │ ├── logits_vec.png
│ │ ├── q_vecs.png
│ │ ├── query.png
│ │ ├── v_vec.png
│ │ └── value.png
│ └── logos
│ │ ├── vllm-logo-only-light.ico
│ │ ├── vllm-logo-only-light.png
│ │ ├── vllm-logo-text-dark.png
│ │ └── vllm-logo-text-light.png
├── cli
│ └── README.md
├── community
│ ├── meetups.md
│ └── sponsors.md
├── configuration
│ ├── README.md
│ ├── conserving_memory.md
│ ├── engine_args.md
│ ├── env_vars.md
│ ├── model_resolution.md
│ ├── optimization.md
│ └── serve_args.md
├── contributing
│ ├── README.md
│ ├── benchmarks.md
│ ├── ci-failures.md
│ ├── deprecation_policy.md
│ ├── dockerfile
│ │ └── dockerfile.md
│ ├── model
│ │ ├── README.md
│ │ ├── basic.md
│ │ ├── multimodal.md
│ │ ├── registration.md
│ │ └── tests.md
│ ├── profiling.md
│ └── vulnerability_management.md
├── deployment
│ ├── docker.md
│ ├── frameworks
│ │ ├── anything-llm.md
│ │ ├── autogen.md
│ │ ├── bentoml.md
│ │ ├── cerebrium.md
│ │ ├── chatbox.md
│ │ ├── dify.md
│ │ ├── dstack.md
│ │ ├── haystack.md
│ │ ├── helm.md
│ │ ├── litellm.md
│ │ ├── lobe-chat.md
│ │ ├── lws.md
│ │ ├── modal.md
│ │ ├── open-webui.md
│ │ ├── retrieval_augmented_generation.md
│ │ ├── skypilot.md
│ │ ├── streamlit.md
│ │ └── triton.md
│ ├── integrations
│ │ ├── kserve.md
│ │ ├── kubeai.md
│ │ ├── llamastack.md
│ │ ├── llmaz.md
│ │ └── production-stack.md
│ ├── k8s.md
│ └── nginx.md
├── design
│ ├── arch_overview.md
│ ├── automatic_prefix_caching.md
│ ├── huggingface_integration.md
│ ├── kernel
│ │ └── paged_attention.md
│ ├── mm_processing.md
│ ├── multiprocessing.md
│ ├── plugin_system.md
│ └── v1
│ │ ├── metrics.md
│ │ ├── prefix_caching.md
│ │ └── torch_compile.md
├── dev-docker
│ └── README.md
├── features
│ ├── automatic_prefix_caching.md
│ ├── compatibility_matrix.md
│ ├── disagg_prefill.md
│ ├── lora.md
│ ├── multimodal_inputs.md
│ ├── prompt_embeds.md
│ ├── quantization
│ │ ├── README.md
│ │ ├── auto_awq.md
│ │ ├── bitblas.md
│ │ ├── bnb.md
│ │ ├── fp8.md
│ │ ├── gguf.md
│ │ ├── gptqmodel.md
│ │ ├── int4.md
│ │ ├── int8.md
│ │ ├── modelopt.md
│ │ ├── quantized_kvcache.md
│ │ ├── quark.md
│ │ ├── supported_hardware.md
│ │ └── torchao.md
│ ├── reasoning_outputs.md
│ ├── spec_decode.md
│ ├── structured_outputs.md
│ └── tool_calling.md
├── getting_started
│ ├── installation
│ │ ├── .nav.yml
│ │ ├── README.md
│ │ ├── ai_accelerator.md
│ │ ├── ai_accelerator
│ │ │ ├── hpu-gaudi.inc.md
│ │ │ ├── neuron.inc.md
│ │ │ └── tpu.inc.md
│ │ ├── cpu.md
│ │ ├── cpu
│ │ │ ├── apple.inc.md
│ │ │ ├── arm.inc.md
│ │ │ ├── build.inc.md
│ │ │ ├── s390x.inc.md
│ │ │ └── x86.inc.md
│ │ ├── device.template.md
│ │ ├── gpu.md
│ │ ├── gpu
│ │ │ ├── cuda.inc.md
│ │ │ ├── rocm.inc.md
│ │ │ └── xpu.inc.md
│ │ └── python_env_setup.inc.md
│ └── quickstart.md
├── mkdocs
│ ├── hooks
│ │ ├── generate_examples.py
│ │ ├── remove_announcement.py
│ │ └── url_schemes.py
│ ├── javascript
│ │ └── run_llm_widget.js
│ ├── overrides
│ │ └── main.html
│ └── stylesheets
│ │ └── extra.css
├── models
│ ├── extensions
│ │ ├── fastsafetensor.md
│ │ ├── runai_model_streamer.md
│ │ └── tensorizer.md
│ ├── generative_models.md
│ ├── pooling_models.md
│ └── supported_models.md
├── serving
│ ├── distributed_serving.md
│ ├── integrations
│ │ ├── langchain.md
│ │ └── llamaindex.md
│ ├── offline_inference.md
│ └── openai_compatible_server.md
├── training
│ ├── rlhf.md
│ └── trl.md
└── usage
│ ├── README.md
│ ├── faq.md
│ ├── metrics.md
│ ├── reproducibility.md
│ ├── security.md
│ ├── troubleshooting.md
│ ├── usage_stats.md
│ └── v1_guide.md
├── examples
├── offline_inference
│ ├── audio_language.py
│ ├── automatic_prefix_caching.py
│ ├── basic
│ │ ├── README.md
│ │ ├── basic.py
│ │ ├── chat.py
│ │ ├── classify.py
│ │ ├── embed.py
│ │ ├── generate.py
│ │ └── score.py
│ ├── batch_llm_inference.py
│ ├── chat_with_tools.py
│ ├── context_extension.py
│ ├── data_parallel.py
│ ├── disaggregated-prefill-v1
│ │ ├── README.md
│ │ ├── decode_example.py
│ │ ├── prefill_example.py
│ │ └── run.sh
│ ├── disaggregated_prefill.py
│ ├── eagle.py
│ ├── embed_jina_embeddings_v3.py
│ ├── embed_matryoshka_fy.py
│ ├── encoder_decoder.py
│ ├── encoder_decoder_multimodal.py
│ ├── llm_engine_example.py
│ ├── load_sharded_state.py
│ ├── lora_with_quantization_inference.py
│ ├── metrics.py
│ ├── mistral-small.py
│ ├── mlpspeculator.py
│ ├── multilora_inference.py
│ ├── neuron.py
│ ├── neuron_eagle.py
│ ├── neuron_int8_quantization.py
│ ├── neuron_multimodal.py
│ ├── neuron_speculation.py
│ ├── openai_batch
│ │ ├── README.md
│ │ └── openai_example_batch.jsonl
│ ├── prefix_caching.py
│ ├── prithvi_geospatial_mae.py
│ ├── profiling.py
│ ├── profiling_tpu
│ │ ├── README.md
│ │ └── profiling.py
│ ├── prompt_embed_inference.py
│ ├── qwen2_5_omni
│ │ ├── README.md
│ │ └── only_thinker.py
│ ├── qwen_1m.py
│ ├── reproducibility.py
│ ├── rlhf.py
│ ├── rlhf_colocate.py
│ ├── rlhf_utils.py
│ ├── save_sharded_state.py
│ ├── simple_profiling.py
│ ├── structured_outputs.py
│ ├── torchrun_example.py
│ ├── tpu.py
│ ├── vision_language.py
│ ├── vision_language_embedding.py
│ └── vision_language_multi_image.py
├── online_serving
│ ├── api_client.py
│ ├── chart-helm
│ │ ├── .helmignore
│ │ ├── Chart.yaml
│ │ ├── README.md
│ │ ├── ct.yaml
│ │ ├── lintconf.yaml
│ │ ├── templates
│ │ │ ├── _helpers.tpl
│ │ │ ├── configmap.yaml
│ │ │ ├── custom-objects.yaml
│ │ │ ├── deployment.yaml
│ │ │ ├── hpa.yaml
│ │ │ ├── job.yaml
│ │ │ ├── poddisruptionbudget.yaml
│ │ │ ├── pvc.yaml
│ │ │ ├── secrets.yaml
│ │ │ └── service.yaml
│ │ ├── values.schema.json
│ │ └── values.yaml
│ ├── cohere_rerank_client.py
│ ├── disaggregated_prefill.sh
│ ├── disaggregated_serving
│ │ ├── README.md
│ │ ├── disagg_proxy_demo.py
│ │ └── kv_events.sh
│ ├── gradio_openai_chatbot_webserver.py
│ ├── gradio_webserver.py
│ ├── jinaai_rerank_client.py
│ ├── kv_events_subscriber.py
│ ├── multi-node-serving.sh
│ ├── multi_instance_data_parallel.py
│ ├── openai_chat_completion_client.py
│ ├── openai_chat_completion_client_for_multimodal.py
│ ├── openai_chat_completion_client_with_tools.py
│ ├── openai_chat_completion_client_with_tools_required.py
│ ├── openai_chat_completion_structured_outputs.py
│ ├── openai_chat_completion_structured_outputs_structural_tag.py
│ ├── openai_chat_completion_structured_outputs_with_reasoning.py
│ ├── openai_chat_completion_tool_calls_with_reasoning.py
│ ├── openai_chat_completion_with_reasoning.py
│ ├── openai_chat_completion_with_reasoning_streaming.py
│ ├── openai_chat_embedding_client_for_multimodal.py
│ ├── openai_classification_client.py
│ ├── openai_completion_client.py
│ ├── openai_cross_encoder_score.py
│ ├── openai_embedding_client.py
│ ├── openai_embedding_matryoshka_fy.py
│ ├── openai_pooling_client.py
│ ├── openai_transcription_client.py
│ ├── opentelemetry
│ │ ├── README.md
│ │ └── dummy_client.py
│ ├── prometheus_grafana
│ │ ├── README.md
│ │ ├── docker-compose.yaml
│ │ ├── grafana.json
│ │ └── prometheus.yaml
│ ├── prompt_embed_inference_with_openai_client.py
│ ├── ray_serve_deepseek.py
│ ├── retrieval_augmented_generation_with_langchain.py
│ ├── retrieval_augmented_generation_with_llamaindex.py
│ ├── run_cluster.sh
│ ├── sagemaker-entrypoint.sh
│ ├── streamlit_openai_chatbot_webserver.py
│ └── utils.py
├── others
│ ├── lmcache
│ │ ├── README.md
│ │ ├── cpu_offload_lmcache.py
│ │ ├── disagg_prefill_lmcache_v0.py
│ │ ├── disagg_prefill_lmcache_v1
│ │ │ ├── configs
│ │ │ │ ├── lmcache-decoder-config.yaml
│ │ │ │ └── lmcache-prefiller-config.yaml
│ │ │ ├── disagg_example_nixl.sh
│ │ │ ├── disagg_proxy_server.py
│ │ │ └── disagg_vllm_launcher.sh
│ │ └── kv_cache_sharing_lmcache_v1.py
│ ├── logging_configuration.md
│ └── tensorize_vllm_model.py
├── pyproject.toml
├── template_alpaca.jinja
├── template_baichuan.jinja
├── template_chatglm.jinja
├── template_chatglm2.jinja
├── template_chatml.jinja
├── template_dse_qwen2_vl.jinja
├── template_falcon.jinja
├── template_falcon_180b.jinja
├── template_inkbot.jinja
├── template_teleflm.jinja
├── template_vlm2vec.jinja
├── tool_chat_template_deepseekr1.jinja
├── tool_chat_template_deepseekv3.jinja
├── tool_chat_template_granite.jinja
├── tool_chat_template_granite_20b_fc.jinja
├── tool_chat_template_hermes.jinja
├── tool_chat_template_internlm2_tool.jinja
├── tool_chat_template_llama3.1_json.jinja
├── tool_chat_template_llama3.2_json.jinja
├── tool_chat_template_llama3.2_pythonic.jinja
├── tool_chat_template_llama4_json.jinja
├── tool_chat_template_llama4_pythonic.jinja
├── tool_chat_template_mistral.jinja
├── tool_chat_template_mistral3.jinja
├── tool_chat_template_mistral_parallel.jinja
├── tool_chat_template_phi4_mini.jinja
└── tool_chat_template_toolace.jinja
├── find_cuda_init.py
├── format.sh
├── mkdocs.yaml
├── pyproject.toml
├── requirements
├── build.txt
├── common.txt
├── cpu.txt
├── cuda.txt
├── dev.txt
├── docs.txt
├── hpu.txt
├── lint.txt
├── neuron.txt
├── nightly_torch_test.txt
├── rocm-build.txt
├── rocm-test.txt
├── rocm.txt
├── test.in
├── test.txt
├── tpu.txt
└── xpu.txt
├── setup.py
├── tests
├── __init__.py
├── async_engine
│ ├── __init__.py
│ ├── api_server_async_engine.py
│ ├── conftest.py
│ ├── test_api_server.py
│ ├── test_async_llm_engine.py
│ └── test_request_tracker.py
├── basic_correctness
│ ├── __init__.py
│ ├── test_basic_correctness.py
│ ├── test_chunked_prefill.py
│ ├── test_cpu_offload.py
│ ├── test_cumem.py
│ └── test_preemption.py
├── benchmarks
│ ├── __init__.py
│ ├── test_latency_cli.py
│ ├── test_serve_cli.py
│ └── test_throughput_cli.py
├── build_cython.py
├── compile
│ ├── __init__.py
│ ├── backend.py
│ ├── conftest.py
│ ├── piecewise
│ │ ├── __init__.py
│ │ ├── test_full_cudagraph.py
│ │ ├── test_simple.py
│ │ └── test_toy_llama.py
│ ├── test_async_tp.py
│ ├── test_basic_correctness.py
│ ├── test_full_graph.py
│ ├── test_functionalization.py
│ ├── test_fusion.py
│ ├── test_pass_manager.py
│ ├── test_sequence_parallelism.py
│ ├── test_silu_mul_quant_fusion.py
│ └── test_wrapper.py
├── config
│ ├── test_config.yaml
│ └── test_config_with_model.yaml
├── conftest.py
├── core
│ ├── __init__.py
│ ├── block
│ │ ├── __init__.py
│ │ ├── conftest.py
│ │ ├── e2e
│ │ │ ├── __init__.py
│ │ │ ├── conftest.py
│ │ │ ├── test_correctness.py
│ │ │ └── test_correctness_sliding_window.py
│ │ ├── test_block_manager.py
│ │ ├── test_block_table.py
│ │ ├── test_common.py
│ │ ├── test_cpu_gpu_block_allocator.py
│ │ ├── test_naive_block.py
│ │ └── test_prefix_caching_block.py
│ ├── conftest.py
│ ├── test_chunked_prefill_scheduler.py
│ ├── test_num_computed_tokens_update.py
│ ├── test_scheduler.py
│ ├── test_scheduler_encoder_decoder.py
│ ├── test_serialization.py
│ └── utils.py
├── detokenizer
│ ├── __init__.py
│ ├── conftest.py
│ ├── test_disable_detokenization.py
│ ├── test_stop_checker.py
│ ├── test_stop_reason.py
│ └── test_stop_strings.py
├── distributed
│ ├── __init__.py
│ ├── conftest.py
│ ├── test_ca_buffer_sharing.py
│ ├── test_comm_ops.py
│ ├── test_custom_all_reduce.py
│ ├── test_distributed_oot.py
│ ├── test_events.py
│ ├── test_expert_parallel.py
│ ├── test_multi_node_assignment.py
│ ├── test_pipeline_parallel.py
│ ├── test_pipeline_partition.py
│ ├── test_pp_cudagraph.py
│ ├── test_pynccl.py
│ ├── test_same_node.py
│ ├── test_sequence_parallel.py
│ ├── test_shm_broadcast.py
│ ├── test_torchrun_example.py
│ └── test_utils.py
├── encoder_decoder
│ ├── __init__.py
│ └── test_e2e_correctness.py
├── engine
│ ├── __init__.py
│ ├── conftest.py
│ ├── test_arg_utils.py
│ ├── test_computed_prefix_blocks.py
│ ├── test_executor.py
│ ├── test_multi_step_output_processor.py
│ ├── test_multiproc_workers.py
│ ├── test_options.py
│ └── test_short_mm_context.py
├── entrypoints
│ ├── __init__.py
│ ├── conftest.py
│ ├── llm
│ │ ├── __init__.py
│ │ ├── test_accuracy.py
│ │ ├── test_chat.py
│ │ ├── test_collective_rpc.py
│ │ ├── test_encode.py
│ │ ├── test_generate.py
│ │ ├── test_generate_multiple_loras.py
│ │ ├── test_gpu_utilization.py
│ │ ├── test_guided_generate.py
│ │ ├── test_lazy_outlines.py
│ │ └── test_prompt_validation.py
│ ├── offline_mode
│ │ ├── __init__.py
│ │ └── test_offline_mode.py
│ ├── openai
│ │ ├── __init__.py
│ │ ├── correctness
│ │ │ ├── __init__.py
│ │ │ ├── test_lmeval.py
│ │ │ ├── test_mteb.py
│ │ │ └── test_transcription_api_correctness.py
│ │ ├── test_async_tokenization.py
│ │ ├── test_audio.py
│ │ ├── test_basic.py
│ │ ├── test_chat.py
│ │ ├── test_chat_echo.py
│ │ ├── test_chat_logit_bias_validation.py
│ │ ├── test_chat_template.py
│ │ ├── test_chat_with_tool_reasoning.py
│ │ ├── test_chunked_prompt.py
│ │ ├── test_classification.py
│ │ ├── test_cli_args.py
│ │ ├── test_completion.py
│ │ ├── test_completion_with_function_calling.py
│ │ ├── test_completion_with_prompt_embeds.py
│ │ ├── test_embedding.py
│ │ ├── test_embedding_dimensions.py
│ │ ├── test_encoder_decoder.py
│ │ ├── test_lora_adapters.py
│ │ ├── test_lora_resolvers.py
│ │ ├── test_metrics.py
│ │ ├── test_models.py
│ │ ├── test_oot_registration.py
│ │ ├── test_openai_schema.py
│ │ ├── test_pooling.py
│ │ ├── test_prompt_validation.py
│ │ ├── test_rerank.py
│ │ ├── test_return_tokens_as_ids.py
│ │ ├── test_root_path.py
│ │ ├── test_run_batch.py
│ │ ├── test_score.py
│ │ ├── test_serving_chat.py
│ │ ├── test_serving_models.py
│ │ ├── test_shutdown.py
│ │ ├── test_sleep.py
│ │ ├── test_tensorizer_entrypoint.py
│ │ ├── test_tokenization.py
│ │ ├── test_transcription_validation.py
│ │ ├── test_truncation.py
│ │ ├── test_video.py
│ │ ├── test_vision.py
│ │ ├── test_vision_embedding.py
│ │ └── tool_parsers
│ │ │ ├── __init__.py
│ │ │ ├── test_llama4_pythonic_tool_parser.py
│ │ │ ├── test_pythonic_tool_parser.py
│ │ │ └── utils.py
│ ├── test_api_server_process_manager.py
│ ├── test_chat_utils.py
│ └── test_ssl_cert_refresher.py
├── fastsafetensors_loader
│ ├── __init__.py
│ ├── test_fastsafetensors_loader.py
│ └── test_weight_utils.py
├── kernels
│ ├── __init__.py
│ ├── allclose_default.py
│ ├── attention
│ │ ├── conftest.py
│ │ ├── test_attention.py
│ │ ├── test_attention_selector.py
│ │ ├── test_blocksparse_attention.py
│ │ ├── test_cache.py
│ │ ├── test_cascade_flash_attn.py
│ │ ├── test_encoder_decoder_attn.py
│ │ ├── test_flash_attn.py
│ │ ├── test_flashinfer.py
│ │ ├── test_flashmla.py
│ │ ├── test_lightning_attn.py
│ │ ├── test_merge_attn_states.py
│ │ ├── test_mha_attn.py
│ │ ├── test_mla_decode_cpu.py
│ │ ├── test_prefix_prefill.py
│ │ ├── test_rocm_attention_selector.py
│ │ ├── test_triton_decode_attention.py
│ │ └── test_triton_unified_attention.py
│ ├── core
│ │ ├── test_activation.py
│ │ ├── test_fused_quant_layernorm.py
│ │ ├── test_layernorm.py
│ │ ├── test_opcheck.py
│ │ ├── test_permute_cols.py
│ │ ├── test_pos_encoding.py
│ │ ├── test_rotary_embedding.py
│ │ └── test_uva.py
│ ├── mamba
│ │ ├── test_causal_conv1d.py
│ │ ├── test_mamba_mixer2.py
│ │ ├── test_mamba_ssm.py
│ │ └── test_mamba_ssm_ssd.py
│ ├── moe
│ │ ├── __init__.py
│ │ ├── deepep_utils.py
│ │ ├── test_batched_moe.py
│ │ ├── test_cutlass_moe.py
│ │ ├── test_deepep_deepgemm_moe.py
│ │ ├── test_deepep_moe.py
│ │ ├── test_moe.py
│ │ ├── test_moe_permute_unpermute.py
│ │ ├── test_nvfp4_moe.py
│ │ ├── test_pplx_moe.py
│ │ ├── test_rocm_aiter_topk.py
│ │ └── test_triton_moe_ptpc_fp8.py
│ ├── quant_utils.py
│ ├── quantization
│ │ ├── nvfp4_utils.py
│ │ ├── test_allspark_gemm.py
│ │ ├── test_aqlm.py
│ │ ├── test_awq.py
│ │ ├── test_awq_triton.py
│ │ ├── test_block_fp8.py
│ │ ├── test_block_int8.py
│ │ ├── test_cutlass_2of4_sparse.py
│ │ ├── test_cutlass_scaled_mm.py
│ │ ├── test_fp8_quant.py
│ │ ├── test_ggml.py
│ │ ├── test_gguf.py
│ │ ├── test_gptq.py
│ │ ├── test_int8_kernel.py
│ │ ├── test_int8_quant.py
│ │ ├── test_machete_mm.py
│ │ ├── test_marlin_gemm.py
│ │ ├── test_nvfp4_quant.py
│ │ ├── test_nvfp4_scaled_mm.py
│ │ ├── test_rocm_skinny_gemms.py
│ │ └── test_triton_scaled_mm.py
│ ├── test_apply_repetition_penalties.py
│ ├── test_cutlass_mla_decode.py
│ ├── test_fused_quant_activation.py
│ ├── test_triton_flash_attention.py
│ └── utils.py
├── kv_transfer
│ ├── test_disagg.py
│ ├── test_lookup_buffer.py
│ ├── test_lookup_buffer.sh
│ ├── test_module.py
│ ├── test_send_recv.py
│ └── test_send_recv.sh
├── lora
│ ├── __init__.py
│ ├── conftest.py
│ ├── test_add_lora.py
│ ├── test_baichuan.py
│ ├── test_chatglm3_tp.py
│ ├── test_layers.py
│ ├── test_llama_tp.py
│ ├── test_lora_allowed_token_ids.py
│ ├── test_lora_checkpoints.py
│ ├── test_lora_functions.py
│ ├── test_lora_huggingface.py
│ ├── test_lora_manager.py
│ ├── test_minicpmv_tp.py
│ ├── test_mixtral.py
│ ├── test_peft_helper.py
│ ├── test_phi.py
│ ├── test_punica_ops.py
│ ├── test_quant_model.py
│ ├── test_qwen2vl.py
│ ├── test_resolver.py
│ ├── test_tokenizer_group.py
│ ├── test_transfomers_model.py
│ ├── test_utils.py
│ ├── test_worker.py
│ └── utils.py
├── metrics
│ ├── __init__.py
│ └── test_metrics.py
├── mistral_tool_use
│ ├── __init__.py
│ ├── conftest.py
│ ├── test_mistral_tool_calls.py
│ └── utils.py
├── model_executor
│ ├── __init__.py
│ ├── conftest.py
│ ├── test_enabled_custom_ops.py
│ ├── test_guided_processors.py
│ ├── test_logits_processor.py
│ ├── test_model_load_with_params.py
│ └── test_weight_utils.py
├── models
│ ├── __init__.py
│ ├── fixtures
│ │ ├── mistral_small_3_chat.json
│ │ └── pixtral_chat.json
│ ├── language
│ │ ├── __init__.py
│ │ ├── generation
│ │ │ ├── __init__.py
│ │ │ ├── test_bart.py
│ │ │ ├── test_common.py
│ │ │ ├── test_granite.py
│ │ │ ├── test_granitemoehybrid.py
│ │ │ ├── test_hybrid.py
│ │ │ ├── test_mistral.py
│ │ │ └── test_phimoe.py
│ │ └── pooling
│ │ │ ├── __init__.py
│ │ │ ├── embed_utils.py
│ │ │ ├── mteb_utils.py
│ │ │ ├── test_baai.py
│ │ │ ├── test_classification.py
│ │ │ ├── test_embedding.py
│ │ │ ├── test_gritlm.py
│ │ │ ├── test_gte.py
│ │ │ ├── test_intfloat.py
│ │ │ ├── test_jina.py
│ │ │ ├── test_nomic.py
│ │ │ ├── test_nomic_max_model_len.py
│ │ │ ├── test_scoring.py
│ │ │ ├── test_snowflake_arctic_embed.py
│ │ │ └── test_truncation_control.py
│ ├── multimodal
│ │ ├── __init__.py
│ │ ├── generation
│ │ │ ├── __init__.py
│ │ │ ├── test_common.py
│ │ │ ├── test_florence2.py
│ │ │ ├── test_granite_speech.py
│ │ │ ├── test_interleaved.py
│ │ │ ├── test_mllama.py
│ │ │ ├── test_phi4mm.py
│ │ │ ├── test_pixtral.py
│ │ │ ├── test_qwen2_vl.py
│ │ │ ├── test_ultravox.py
│ │ │ ├── test_whisper.py
│ │ │ └── vlm_utils
│ │ │ │ ├── __init__.py
│ │ │ │ ├── builders.py
│ │ │ │ ├── case_filtering.py
│ │ │ │ ├── core.py
│ │ │ │ ├── custom_inputs.py
│ │ │ │ ├── model_utils.py
│ │ │ │ ├── runners.py
│ │ │ │ └── types.py
│ │ ├── pooling
│ │ │ ├── __init__.py
│ │ │ ├── test_dse_qwen2_vl.py
│ │ │ ├── test_intern_vit.py
│ │ │ ├── test_llava_next.py
│ │ │ └── test_phi3v.py
│ │ └── processing
│ │ │ ├── __init__.py
│ │ │ ├── test_common.py
│ │ │ ├── test_h2ovl.py
│ │ │ ├── test_idefics3.py
│ │ │ ├── test_internvl.py
│ │ │ ├── test_llama4.py
│ │ │ ├── test_llava_next.py
│ │ │ ├── test_llava_onevision.py
│ │ │ ├── test_minimax_vl_01.py
│ │ │ ├── test_mllama.py
│ │ │ ├── test_phi3v.py
│ │ │ ├── test_phi4mm.py
│ │ │ ├── test_qwen2_vl.py
│ │ │ └── test_smolvlm.py
│ ├── quantization
│ │ ├── __init__.py
│ │ ├── test_aqlm.py
│ │ ├── test_awq.py
│ │ ├── test_bitblas.py
│ │ ├── test_fp8.py
│ │ ├── test_gguf.py
│ │ ├── test_gptq_bitblas.py
│ │ ├── test_gptq_marlin.py
│ │ ├── test_gptq_marlin_24.py
│ │ ├── test_modelopt.py
│ │ ├── test_mxfp4.py
│ │ └── test_nvfp4.py
│ ├── registry.py
│ ├── test_initialization.py
│ ├── test_oot_registration.py
│ ├── test_registry.py
│ ├── test_transformers.py
│ ├── test_utils.py
│ ├── test_vision.py
│ └── utils.py
├── mq_llm_engine
│ ├── __init__.py
│ ├── conftest.py
│ ├── test_abort.py
│ ├── test_error_handling.py
│ ├── test_load.py
│ └── utils.py
├── multi_step
│ ├── __init__.py
│ ├── test_correctness_async_llm.py
│ └── test_correctness_llm.py
├── multimodal
│ ├── __init__.py
│ ├── assets
│ │ ├── image1.png
│ │ ├── image2.png
│ │ └── rgba.png
│ ├── test_hasher.py
│ ├── test_image.py
│ ├── test_inputs.py
│ ├── test_processing.py
│ ├── test_utils.py
│ ├── test_video.py
│ └── utils.py
├── neuron
│ ├── 1_core
│ │ ├── test_activation.py
│ │ ├── test_block_table.py
│ │ ├── test_cache.py
│ │ ├── test_layernorm.py
│ │ ├── test_logits_processor.py
│ │ ├── test_neuron_model_runner.py
│ │ ├── test_neuron_quant.py
│ │ ├── test_prefix_prefill.py
│ │ └── test_rotary_embedding.py
│ └── 2_core
│ │ ├── test_comm_ops.py
│ │ ├── test_eagle.py
│ │ ├── test_mistral.py
│ │ └── test_multi_lora.py
├── plugins
│ ├── lora_resolvers
│ │ ├── __init__.py
│ │ └── test_filesystem_resolver.py
│ ├── vllm_add_dummy_model
│ │ ├── setup.py
│ │ └── vllm_add_dummy_model
│ │ │ ├── __init__.py
│ │ │ ├── my_gemma_embedding.py
│ │ │ ├── my_llava.py
│ │ │ └── my_opt.py
│ └── vllm_add_dummy_platform
│ │ ├── setup.py
│ │ └── vllm_add_dummy_platform
│ │ │ ├── __init__.py
│ │ │ ├── dummy_attention_backend.py
│ │ │ └── dummy_platform.py
├── plugins_tests
│ ├── conftest.py
│ ├── test_platform_plugins.py
│ └── test_scheduler_plugins.py
├── prefix_caching
│ ├── __init__.py
│ ├── test_disable_sliding_window.py
│ └── test_prefix_caching.py
├── prompt_adapter
│ ├── test_bloom.py
│ ├── test_multi_adapter_inference.py
│ └── test_pa_lora.py
├── prompts
│ ├── example.txt
│ └── summary.txt
├── quantization
│ ├── __init__.py
│ ├── test_auto_round.py
│ ├── test_bitsandbytes.py
│ ├── test_compressed_tensors.py
│ ├── test_configs.py
│ ├── test_cpu_offload.py
│ ├── test_experts_int8.py
│ ├── test_fp8.py
│ ├── test_gptq_dynamic.py
│ ├── test_ipex_quant.py
│ ├── test_lm_head.py
│ ├── test_ptpc_fp8.py
│ ├── test_quark.py
│ ├── test_register_quantization_config.py
│ ├── test_torchao.py
│ └── utils.py
├── reasoning
│ ├── __init__.py
│ ├── test_deepseekr1_reasoning_parser.py
│ ├── test_granite_reasoning_parser.py
│ ├── test_qwen3_reasoning_parser.py
│ └── utils.py
├── runai_model_streamer_test
│ ├── __init__.py
│ ├── test_runai_model_streamer_loader.py
│ └── test_weight_utils.py
├── samplers
│ ├── __init__.py
│ ├── test_beam_search.py
│ ├── test_ignore_eos.py
│ ├── test_logits_processor.py
│ ├── test_logprobs.py
│ ├── test_no_bad_words.py
│ ├── test_ranks.py
│ ├── test_rejection_sampler.py
│ ├── test_sampler.py
│ ├── test_seeded_generate.py
│ └── test_typical_acceptance_sampler.py
├── spec_decode
│ ├── __init__.py
│ ├── conftest.py
│ ├── e2e
│ │ ├── __init__.py
│ │ ├── conftest.py
│ │ ├── test_compatibility.py
│ │ ├── test_eagle_correctness.py
│ │ ├── test_integration.py
│ │ ├── test_integration_dist_tp2.py
│ │ ├── test_integration_dist_tp4.py
│ │ ├── test_logprobs.py
│ │ ├── test_medusa_correctness.py
│ │ ├── test_mlp_correctness.py
│ │ ├── test_mtp_correctness.py
│ │ ├── test_multistep_correctness.py
│ │ ├── test_ngram_correctness.py
│ │ └── test_seed.py
│ ├── test_batch_expansion.py
│ ├── test_dynamic_spec_decode.py
│ ├── test_memory_usage.py
│ ├── test_metrics.py
│ ├── test_multi_step_worker.py
│ ├── test_ngram_worker.py
│ ├── test_scorer.py
│ ├── test_spec_decode_worker.py
│ ├── test_utils.py
│ └── utils.py
├── standalone_tests
│ ├── lazy_imports.py
│ └── python_only_compile.sh
├── system_messages
│ └── sonnet3.5_nov2024.txt
├── tensorizer_loader
│ ├── __init__.py
│ ├── conftest.py
│ └── test_tensorizer.py
├── test_cache_block_hashing.py
├── test_config.py
├── test_embedded_commit.py
├── test_inputs.py
├── test_logger.py
├── test_outputs.py
├── test_regression.py
├── test_sampling_params.py
├── test_scalartype.py
├── test_seed_behavior.py
├── test_sequence.py
├── test_sharded_state_loader.py
├── test_triton_utils.py
├── test_utils.py
├── test_version.py
├── test_vllm_port.py
├── tokenization
│ ├── __init__.py
│ ├── test_cached_tokenizer.py
│ ├── test_detokenize.py
│ ├── test_get_eos.py
│ ├── test_mistral_tokenizer.py
│ ├── test_tokenizer.py
│ ├── test_tokenizer_group.py
│ └── test_tokenizer_registry.py
├── tool_use
│ ├── __init__.py
│ ├── conftest.py
│ ├── test_chat_completion_request_validations.py
│ ├── test_chat_completions.py
│ ├── test_jamba_tool_parser.py
│ ├── test_parallel_tool_calls.py
│ ├── test_tool_calls.py
│ ├── test_tool_choice_required.py
│ └── utils.py
├── tpu
│ ├── __init__.py
│ ├── lora
│ │ ├── __init__.py
│ │ └── test_lora.py
│ ├── test_compilation.py
│ ├── test_custom_dispatcher.py
│ ├── test_moe_pallas.py
│ └── test_quantization_accuracy.py
├── tracing
│ ├── __init__.py
│ └── test_tracing.py
├── utils.py
├── v1
│ ├── __init__.py
│ ├── core
│ │ ├── test_kv_cache_utils.py
│ │ ├── test_prefix_caching.py
│ │ ├── test_scheduler.py
│ │ ├── test_scheduler_e2e.py
│ │ └── test_specialized_manager.py
│ ├── e2e
│ │ ├── __init__.py
│ │ ├── test_cascade_attention.py
│ │ ├── test_correctness_sliding_window.py
│ │ └── test_spec_decode.py
│ ├── engine
│ │ ├── __init__.py
│ │ ├── conftest.py
│ │ ├── test_async_llm.py
│ │ ├── test_engine_args.py
│ │ ├── test_engine_core.py
│ │ ├── test_engine_core_client.py
│ │ ├── test_llm_engine.py
│ │ ├── test_output_processor.py
│ │ └── utils.py
│ ├── entrypoints
│ │ ├── __init__.py
│ │ ├── conftest.py
│ │ ├── llm
│ │ │ ├── __init__.py
│ │ │ └── test_struct_output_generate.py
│ │ └── openai
│ │ │ ├── test_chat_completion.py
│ │ │ ├── test_completion.py
│ │ │ └── test_multi_api_servers.py
│ ├── kv_connector
│ │ ├── nixl_integration
│ │ │ ├── run_accuracy_test.sh
│ │ │ ├── run_edge_case_test.sh
│ │ │ ├── test_accuracy.py
│ │ │ ├── test_edge_cases.py
│ │ │ └── toy_proxy_server.py
│ │ └── unit
│ │ │ ├── __init__.py
│ │ │ ├── test_multi_connector.py
│ │ │ ├── test_nixl_connector.py
│ │ │ ├── test_remote_decode_lifecycle.py
│ │ │ ├── test_remote_prefill_lifecycle.py
│ │ │ └── utils.py
│ ├── metrics
│ │ └── test_ray_metrics.py
│ ├── sample
│ │ ├── __init__.py
│ │ ├── test_logprobs.py
│ │ ├── test_logprobs_e2e.py
│ │ ├── test_rejection_sampler.py
│ │ ├── test_sampler.py
│ │ ├── test_sampling_params_e2e.py
│ │ ├── test_topk_topp_sampler.py
│ │ └── utils.py
│ ├── shutdown
│ │ ├── test_delete.py
│ │ ├── test_forward_error.py
│ │ ├── test_processor_error.py
│ │ ├── test_startup_error.py
│ │ └── utils.py
│ ├── spec_decode
│ │ ├── test_eagle.py
│ │ ├── test_max_len.py
│ │ └── test_ngram.py
│ ├── structured_output
│ │ ├── __init__.py
│ │ └── test_utils.py
│ ├── test_async_llm_dp.py
│ ├── test_metrics_reader.py
│ ├── test_oracle.py
│ ├── test_serial_utils.py
│ ├── test_utils.py
│ ├── tpu
│ │ ├── __init__.py
│ │ ├── test_basic.py
│ │ ├── test_mha_attn.py
│ │ ├── test_multimodal.py
│ │ ├── test_pallas.py
│ │ ├── test_perf.py
│ │ ├── test_sampler.py
│ │ ├── test_spmd_model_weight_loading.py
│ │ ├── test_topk_topp_sampler.py
│ │ ├── test_tpu_qkv_linear.py
│ │ └── worker
│ │ │ ├── __init__.py
│ │ │ └── test_tpu_model_runner.py
│ └── worker
│ │ ├── __init__.py
│ │ ├── test_gpu_input_batch.py
│ │ └── test_gpu_model_runner.py
├── vllm_test_utils
│ ├── setup.py
│ └── vllm_test_utils
│ │ ├── __init__.py
│ │ ├── blame.py
│ │ └── monitor.py
├── weight_loading
│ ├── models-large.txt
│ ├── models.txt
│ ├── run_model_weight_loading_test.sh
│ └── test_weight_loading.py
└── worker
│ ├── __init__.py
│ ├── conftest.py
│ ├── test_encoder_decoder_model_runner.py
│ ├── test_model_input.py
│ ├── test_model_runner.py
│ ├── test_profile.py
│ └── test_swap.py
├── tools
├── check_repo.sh
├── check_spdx_header.py
├── check_triton_import.py
├── enforce_regex_import.py
├── ep_kernels
│ ├── README.md
│ ├── install_python_libraries.sh
│ ├── install_system_drivers.sh
│ └── install_system_libraries.sh
├── install_nixl.sh
├── mypy.sh
├── png-lint.sh
├── profiler
│ ├── print_layerwise_table.py
│ └── visualize_layerwise_profile.py
├── report_build_time_ninja.py
├── shellcheck.sh
└── update-dockerfile-graph.sh
├── use_existing_torch.py
└── vllm
├── __init__.py
├── _custom_ops.py
├── _ipex_ops.py
├── adapter_commons
├── __init__.py
├── layers.py
├── models.py
├── request.py
├── utils.py
└── worker_manager.py
├── assets
├── __init__.py
├── audio.py
├── base.py
├── image.py
└── video.py
├── attention
├── __init__.py
├── backends
│ ├── __init__.py
│ ├── abstract.py
│ ├── blocksparse_attn.py
│ ├── cpu_mla.py
│ ├── dual_chunk_flash_attn.py
│ ├── flash_attn.py
│ ├── flashinfer.py
│ ├── flashmla.py
│ ├── hpu_attn.py
│ ├── ipex_attn.py
│ ├── mla
│ │ ├── __init__.py
│ │ └── common.py
│ ├── pallas.py
│ ├── placeholder_attn.py
│ ├── rocm_aiter_mla.py
│ ├── rocm_flash_attn.py
│ ├── torch_sdpa.py
│ ├── triton_mla.py
│ ├── utils.py
│ └── xformers.py
├── layer.py
├── ops
│ ├── __init__.py
│ ├── blocksparse_attention
│ │ ├── __init__.py
│ │ ├── blocksparse_attention_kernel.py
│ │ ├── interface.py
│ │ └── utils.py
│ ├── chunked_prefill_paged_decode.py
│ ├── flashmla.py
│ ├── hpu_paged_attn.py
│ ├── ipex_attn.py
│ ├── merge_attn_states.py
│ ├── nki_flash_attn.py
│ ├── paged_attn.py
│ ├── prefix_prefill.py
│ ├── rocm_aiter_mla.py
│ ├── rocm_aiter_paged_attn.py
│ ├── triton_decode_attention.py
│ ├── triton_flash_attention.py
│ ├── triton_merge_attn_states.py
│ └── triton_unified_attention.py
├── selector.py
└── utils
│ └── fa_utils.py
├── beam_search.py
├── benchmarks
├── __init__.py
├── datasets.py
├── endpoint_request_func.py
├── latency.py
├── serve.py
├── throughput.py
└── utils.py
├── collect_env.py
├── compilation
├── __init__.py
├── activation_quant_fusion.py
├── backends.py
├── base_piecewise_backend.py
├── collective_fusion.py
├── compiler_interface.py
├── counter.py
├── cuda_piecewise_backend.py
├── decorators.py
├── fix_functionalization.py
├── fusion.py
├── fx_utils.py
├── inductor_pass.py
├── monitor.py
├── multi_output_match.py
├── noop_elimination.py
├── pass_manager.py
├── sequence_parallelism.py
├── torch25_custom_graph_pass.py
├── vllm_inductor_pass.py
└── wrapper.py
├── config.py
├── connections.py
├── core
├── __init__.py
├── block
│ ├── __init__.py
│ ├── block_table.py
│ ├── common.py
│ ├── cpu_gpu_block_allocator.py
│ ├── interfaces.py
│ ├── naive_block.py
│ ├── prefix_caching_block.py
│ └── utils.py
├── block_manager.py
├── evictor.py
├── interfaces.py
├── placeholder_block_space_manager.py
└── scheduler.py
├── device_allocator
├── __init__.py
└── cumem.py
├── distributed
├── __init__.py
├── communication_op.py
├── device_communicators
│ ├── __init__.py
│ ├── all2all.py
│ ├── base_device_communicator.py
│ ├── cpu_communicator.py
│ ├── cuda_communicator.py
│ ├── cuda_wrapper.py
│ ├── custom_all_reduce.py
│ ├── custom_all_reduce_utils.py
│ ├── hpu_communicator.py
│ ├── neuron_communicator.py
│ ├── pynccl.py
│ ├── pynccl_wrapper.py
│ ├── shm_broadcast.py
│ ├── tpu_communicator.py
│ └── xpu_communicator.py
├── kv_events.py
├── kv_transfer
│ ├── README.md
│ ├── __init__.py
│ ├── disagg_prefill_workflow.jpg
│ ├── kv_connector
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── factory.py
│ │ ├── lmcache_connector.py
│ │ ├── mooncake_store_connector.py
│ │ ├── simple_connector.py
│ │ ├── utils.py
│ │ └── v1
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── lmcache_connector.py
│ │ │ ├── multi_connector.py
│ │ │ ├── nixl_connector.py
│ │ │ └── shared_storage_connector.py
│ ├── kv_connector_agent.py
│ ├── kv_lookup_buffer
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── mooncake_store.py
│ │ └── simple_buffer.py
│ ├── kv_pipe
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── mooncake_pipe.py
│ │ └── pynccl_pipe.py
│ └── kv_transfer_state.py
├── parallel_state.py
├── tpu_distributed_utils.py
└── utils.py
├── engine
├── __init__.py
├── arg_utils.py
├── async_llm_engine.py
├── async_timeout.py
├── llm_engine.py
├── metrics.py
├── metrics_types.py
├── multiprocessing
│ ├── __init__.py
│ ├── client.py
│ └── engine.py
├── output_processor
│ ├── __init__.py
│ ├── interfaces.py
│ ├── multi_step.py
│ ├── single_step.py
│ ├── stop_checker.py
│ └── util.py
└── protocol.py
├── entrypoints
├── __init__.py
├── api_server.py
├── chat_utils.py
├── cli
│ ├── __init__.py
│ ├── benchmark
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── latency.py
│ │ ├── main.py
│ │ ├── serve.py
│ │ └── throughput.py
│ ├── collect_env.py
│ ├── main.py
│ ├── openai.py
│ ├── run_batch.py
│ ├── serve.py
│ └── types.py
├── launcher.py
├── llm.py
├── logger.py
├── openai
│ ├── __init__.py
│ ├── api_server.py
│ ├── cli_args.py
│ ├── logits_processors.py
│ ├── protocol.py
│ ├── run_batch.py
│ ├── serving_chat.py
│ ├── serving_classification.py
│ ├── serving_completion.py
│ ├── serving_embedding.py
│ ├── serving_engine.py
│ ├── serving_models.py
│ ├── serving_pooling.py
│ ├── serving_score.py
│ ├── serving_tokenization.py
│ ├── serving_transcription.py
│ └── tool_parsers
│ │ ├── __init__.py
│ │ ├── abstract_tool_parser.py
│ │ ├── deepseekv3_tool_parser.py
│ │ ├── granite_20b_fc_tool_parser.py
│ │ ├── granite_tool_parser.py
│ │ ├── hermes_tool_parser.py
│ │ ├── internlm2_tool_parser.py
│ │ ├── jamba_tool_parser.py
│ │ ├── llama4_pythonic_tool_parser.py
│ │ ├── llama_tool_parser.py
│ │ ├── mistral_tool_parser.py
│ │ ├── phi4mini_tool_parser.py
│ │ ├── pythonic_tool_parser.py
│ │ └── utils.py
├── score_utils.py
├── ssl.py
└── utils.py
├── env_override.py
├── envs.py
├── executor
├── __init__.py
├── executor_base.py
├── mp_distributed_executor.py
├── msgspec_utils.py
├── multiproc_worker_utils.py
├── ray_distributed_executor.py
├── ray_utils.py
└── uniproc_executor.py
├── forward_context.py
├── inputs
├── __init__.py
├── data.py
├── parse.py
├── preprocess.py
└── registry.py
├── jsontree.py
├── logger.py
├── logging_utils
├── __init__.py
├── dump_input.py
└── formatter.py
├── logits_process.py
├── lora
├── __init__.py
├── fully_sharded_layers.py
├── layers.py
├── lora.py
├── models.py
├── ops
│ ├── __init__.py
│ ├── torch_ops
│ │ ├── __init__.py
│ │ └── lora_ops.py
│ ├── triton_ops
│ │ ├── __init__.py
│ │ ├── kernel_utils.py
│ │ ├── lora_expand_op.py
│ │ ├── lora_kernel_metadata.py
│ │ ├── lora_shrink_op.py
│ │ └── utils.py
│ └── xla_ops
│ │ ├── __init__.py
│ │ └── lora_ops.py
├── peft_helper.py
├── punica_wrapper
│ ├── __init__.py
│ ├── punica_base.py
│ ├── punica_cpu.py
│ ├── punica_gpu.py
│ ├── punica_hpu.py
│ ├── punica_selector.py
│ ├── punica_tpu.py
│ └── utils.py
├── request.py
├── resolver.py
├── utils.py
└── worker_manager.py
├── model_executor
├── __init__.py
├── custom_op.py
├── guided_decoding
│ ├── __init__.py
│ ├── guidance_decoding.py
│ ├── guidance_logits_processors.py
│ ├── guided_fields.py
│ ├── lm_format_enforcer_decoding.py
│ ├── outlines_decoding.py
│ ├── outlines_logits_processors.py
│ ├── utils.py
│ └── xgrammar_decoding.py
├── layers
│ ├── __init__.py
│ ├── activation.py
│ ├── fused_moe
│ │ ├── __init__.py
│ │ ├── batched_deep_gemm_moe.py
│ │ ├── batched_triton_or_deep_gemm_moe.py
│ │ ├── configs
│ │ │ ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ ├── E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
│ │ │ ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ ├── E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
│ │ │ ├── E=128,N=1024,device_name=AMD_Instinct_MI300X.json
│ │ │ ├── E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ ├── E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── E=128,N=192,device_name=NVIDIA_H20.json
│ │ │ ├── E=128,N=192,device_name=NVIDIA_H200.json
│ │ │ ├── E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── E=128,N=384,device_name=NVIDIA_H20.json
│ │ │ ├── E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── E=128,N=384,device_name=NVIDIA_H200.json
│ │ │ ├── E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── E=128,N=768,device_name=NVIDIA_H20.json
│ │ │ ├── E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── E=128,N=768,device_name=NVIDIA_H200.json
│ │ │ ├── E=128,N=96,device_name=NVIDIA_H20.json
│ │ │ ├── E=16,N=1024,device_name=AMD_Instinct_MI300X.json
│ │ │ ├── E=16,N=1024,device_name=NVIDIA_H100.json
│ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json
│ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ ├── E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ ├── E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ ├── E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ ├── E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ ├── E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
│ │ │ ├── E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ ├── E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ ├── E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ ├── E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ ├── E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
│ │ │ ├── E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ ├── E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json
│ │ │ ├── E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json
│ │ │ ├── E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json
│ │ │ ├── E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
│ │ │ ├── E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json
│ │ │ ├── E=60,N=1408,device_name=AMD_Instinct_MI300X.json
│ │ │ ├── E=60,N=176,device_name=AMD_Instinct_MI300X.json
│ │ │ ├── E=60,N=352,device_name=AMD_Instinct_MI300X.json
│ │ │ ├── E=60,N=704,device_name=AMD_Instinct_MI300X.json
│ │ │ ├── E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ ├── E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json
│ │ │ ├── E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ ├── E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ ├── E=64,N=1280,device_name=NVIDIA_H200.json
│ │ │ ├── E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ ├── E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ ├── E=64,N=2560,device_name=NVIDIA_H200.json
│ │ │ ├── E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ ├── E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ ├── E=64,N=320,device_name=NVIDIA_H200.json
│ │ │ ├── E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ ├── E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json
│ │ │ ├── E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json
│ │ │ ├── E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ ├── E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ ├── E=64,N=640,device_name=NVIDIA_H200.json
│ │ │ ├── E=64,N=896,device_name=NVIDIA_H20.json
│ │ │ ├── E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=14336,device_name=AMD_Instinct_MI300X.json
│ │ │ ├── E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=14336,device_name=AMD_Instinct_MI325X.json
│ │ │ ├── E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=14336,device_name=NVIDIA_H200.json
│ │ │ ├── E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=16384,device_name=AMD_Instinct_MI300X.json
│ │ │ ├── E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=16384,device_name=AMD_Instinct_MI325X.json
│ │ │ ├── E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=1792,device_name=AMD_Instinct_MI300X.json
│ │ │ ├── E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=1792,device_name=AMD_Instinct_MI325X.json
│ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json
│ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ ├── E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=1792,device_name=NVIDIA_H200.json
│ │ │ ├── E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=2048,device_name=AMD_Instinct_MI300X.json
│ │ │ ├── E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=2048,device_name=AMD_Instinct_MI325X.json
│ │ │ ├── E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=2048,device_name=NVIDIA_H200.json
│ │ │ ├── E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=3584,device_name=AMD_Instinct_MI300X.json
│ │ │ ├── E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=3584,device_name=AMD_Instinct_MI325X.json
│ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json
│ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ ├── E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=3584,device_name=NVIDIA_H200.json
│ │ │ ├── E=8,N=3584,device_name=NVIDIA_L40S.json
│ │ │ ├── E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=4096,device_name=AMD_Instinct_MI300X.json
│ │ │ ├── E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=4096,device_name=AMD_Instinct_MI325X.json
│ │ │ ├── E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=4096,device_name=NVIDIA_H200.json
│ │ │ ├── E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=7168,device_name=AMD_Instinct_MI300X.json
│ │ │ ├── E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=7168,device_name=AMD_Instinct_MI325X.json
│ │ │ ├── E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=7168,device_name=NVIDIA_H200.json
│ │ │ ├── E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=8192,device_name=AMD_Instinct_MI300X.json
│ │ │ ├── E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=8192,device_name=AMD_Instinct_MI325X.json
│ │ │ ├── E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ ├── E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ └── README
│ │ ├── cutlass_moe.py
│ │ ├── deep_gemm_moe.py
│ │ ├── deepep_ht_prepare_finalize.py
│ │ ├── deepep_ll_prepare_finalize.py
│ │ ├── fused_batched_moe.py
│ │ ├── fused_marlin_moe.py
│ │ ├── fused_moe.py
│ │ ├── layer.py
│ │ ├── modular_kernel.py
│ │ ├── moe_align_block_size.py
│ │ ├── moe_pallas.py
│ │ ├── moe_permute_unpermute.py
│ │ ├── moe_torch_iterative.py
│ │ ├── pplx_prepare_finalize.py
│ │ ├── prepare_finalize.py
│ │ ├── rocm_aiter_fused_moe.py
│ │ ├── triton_deep_gemm_moe.py
│ │ └── utils.py
│ ├── layernorm.py
│ ├── lightning_attn.py
│ ├── linear.py
│ ├── logits_processor.py
│ ├── mamba
│ │ ├── __init__.py
│ │ ├── mamba2_metadata.py
│ │ ├── mamba_mixer.py
│ │ ├── mamba_mixer2.py
│ │ └── ops
│ │ │ ├── __init__.py
│ │ │ ├── causal_conv1d.py
│ │ │ ├── mamba_ssm.py
│ │ │ ├── ssd_bmm.py
│ │ │ ├── ssd_chunk_scan.py
│ │ │ ├── ssd_chunk_state.py
│ │ │ ├── ssd_combined.py
│ │ │ └── ssd_state_passing.py
│ ├── pooler.py
│ ├── quantization
│ │ ├── __init__.py
│ │ ├── aqlm.py
│ │ ├── auto_round.py
│ │ ├── awq.py
│ │ ├── awq_marlin.py
│ │ ├── awq_triton.py
│ │ ├── base_config.py
│ │ ├── bitblas.py
│ │ ├── bitsandbytes.py
│ │ ├── compressed_tensors
│ │ │ ├── __init__.py
│ │ │ ├── compressed_tensors.py
│ │ │ ├── compressed_tensors_moe.py
│ │ │ ├── schemes
│ │ │ │ ├── __init__.py
│ │ │ │ ├── compressed_tensors_24.py
│ │ │ │ ├── compressed_tensors_scheme.py
│ │ │ │ ├── compressed_tensors_w4a16_24.py
│ │ │ │ ├── compressed_tensors_w4a16_nvfp4.py
│ │ │ │ ├── compressed_tensors_w8a16_fp8.py
│ │ │ │ ├── compressed_tensors_w8a8_fp8.py
│ │ │ │ ├── compressed_tensors_w8a8_int8.py
│ │ │ │ └── compressed_tensors_wNa16.py
│ │ │ ├── triton_scaled_mm.py
│ │ │ └── utils.py
│ │ ├── deepspeedfp.py
│ │ ├── experts_int8.py
│ │ ├── fbgemm_fp8.py
│ │ ├── fp8.py
│ │ ├── gguf.py
│ │ ├── gptq.py
│ │ ├── gptq_bitblas.py
│ │ ├── gptq_marlin.py
│ │ ├── gptq_marlin_24.py
│ │ ├── hqq_marlin.py
│ │ ├── ipex_quant.py
│ │ ├── kernels
│ │ │ ├── __init__.py
│ │ │ ├── mixed_precision
│ │ │ │ ├── MPLinearKernel.py
│ │ │ │ ├── __init__.py
│ │ │ │ ├── allspark.py
│ │ │ │ ├── bitblas.py
│ │ │ │ ├── exllama.py
│ │ │ │ ├── machete.py
│ │ │ │ └── marlin.py
│ │ │ └── scaled_mm
│ │ │ │ ├── ScaledMMLinearKernel.py
│ │ │ │ ├── __init__.py
│ │ │ │ ├── aiter.py
│ │ │ │ ├── cutlass.py
│ │ │ │ ├── triton.py
│ │ │ │ └── xla.py
│ │ ├── kv_cache.py
│ │ ├── marlin.py
│ │ ├── modelopt.py
│ │ ├── moe_wna16.py
│ │ ├── neuron_quant.py
│ │ ├── ptpc_fp8.py
│ │ ├── qqq.py
│ │ ├── quark
│ │ │ ├── __init__.py
│ │ │ ├── quark.py
│ │ │ ├── quark_moe.py
│ │ │ ├── schemes
│ │ │ │ ├── __init__.py
│ │ │ │ ├── quark_scheme.py
│ │ │ │ ├── quark_w4a4_mxfp4.py
│ │ │ │ ├── quark_w8a8_fp8.py
│ │ │ │ └── quark_w8a8_int8.py
│ │ │ └── utils.py
│ │ ├── schema.py
│ │ ├── torchao.py
│ │ ├── tpu_int8.py
│ │ └── utils
│ │ │ ├── __init__.py
│ │ │ ├── allspark_utils.py
│ │ │ ├── bitblas_utils.py
│ │ │ ├── configs
│ │ │ ├── N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ └── N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ ├── fp8_utils.py
│ │ │ ├── gptq_utils.py
│ │ │ ├── int8_utils.py
│ │ │ ├── layer_utils.py
│ │ │ ├── machete_utils.py
│ │ │ ├── marlin_utils.py
│ │ │ ├── marlin_utils_fp4.py
│ │ │ ├── marlin_utils_fp8.py
│ │ │ ├── marlin_utils_test.py
│ │ │ ├── marlin_utils_test_24.py
│ │ │ ├── marlin_utils_test_qqq.py
│ │ │ ├── mxfp4_utils.py
│ │ │ ├── nvfp4_emulation_utils.py
│ │ │ ├── quant_utils.py
│ │ │ └── w8a8_utils.py
│ ├── rejection_sampler.py
│ ├── resampler.py
│ ├── rotary_embedding.py
│ ├── sampler.py
│ ├── spec_decode_base_sampler.py
│ ├── typical_acceptance_sampler.py
│ ├── utils.py
│ └── vocab_parallel_embedding.py
├── model_loader
│ ├── __init__.py
│ ├── base_loader.py
│ ├── bitsandbytes_loader.py
│ ├── default_loader.py
│ ├── dummy_loader.py
│ ├── gguf_loader.py
│ ├── neuron.py
│ ├── neuronx_distributed.py
│ ├── runai_streamer_loader.py
│ ├── sharded_state_loader.py
│ ├── tensorizer.py
│ ├── tensorizer_loader.py
│ ├── tpu.py
│ ├── utils.py
│ └── weight_utils.py
├── models
│ ├── __init__.py
│ ├── adapters.py
│ ├── aimv2.py
│ ├── arctic.py
│ ├── aria.py
│ ├── aya_vision.py
│ ├── baichuan.py
│ ├── bamba.py
│ ├── bart.py
│ ├── bert.py
│ ├── bert_with_rope.py
│ ├── blip.py
│ ├── blip2.py
│ ├── bloom.py
│ ├── chameleon.py
│ ├── chatglm.py
│ ├── clip.py
│ ├── commandr.py
│ ├── constant_size_cache.py
│ ├── dbrx.py
│ ├── deepseek.py
│ ├── deepseek_mtp.py
│ ├── deepseek_v2.py
│ ├── deepseek_vl2.py
│ ├── eagle.py
│ ├── exaone.py
│ ├── fairseq2_llama.py
│ ├── falcon.py
│ ├── falcon_h1.py
│ ├── florence2.py
│ ├── fuyu.py
│ ├── gemma.py
│ ├── gemma2.py
│ ├── gemma3.py
│ ├── gemma3_mm.py
│ ├── glm.py
│ ├── glm4.py
│ ├── glm4v.py
│ ├── gpt2.py
│ ├── gpt_bigcode.py
│ ├── gpt_j.py
│ ├── gpt_neox.py
│ ├── granite.py
│ ├── granite_speech.py
│ ├── granitemoe.py
│ ├── granitemoehybrid.py
│ ├── granitemoeshared.py
│ ├── gritlm.py
│ ├── grok1.py
│ ├── h2ovl.py
│ ├── idefics2_vision_model.py
│ ├── idefics3.py
│ ├── interfaces.py
│ ├── interfaces_base.py
│ ├── intern_vit.py
│ ├── internlm2.py
│ ├── internlm2_ve.py
│ ├── internvl.py
│ ├── jais.py
│ ├── jamba.py
│ ├── kimi_vl.py
│ ├── llama.py
│ ├── llama4.py
│ ├── llama_eagle.py
│ ├── llama_eagle3.py
│ ├── llava.py
│ ├── llava_next.py
│ ├── llava_next_video.py
│ ├── llava_onevision.py
│ ├── mamba.py
│ ├── mamba2.py
│ ├── mamba_cache.py
│ ├── medusa.py
│ ├── mimo.py
│ ├── mimo_mtp.py
│ ├── minicpm.py
│ ├── minicpm3.py
│ ├── minicpm_eagle.py
│ ├── minicpmo.py
│ ├── minicpmv.py
│ ├── minimax_cache.py
│ ├── minimax_text_01.py
│ ├── minimax_vl_01.py
│ ├── mistral3.py
│ ├── mixtral.py
│ ├── mixtral_quant.py
│ ├── mllama.py
│ ├── mllama4.py
│ ├── mlp_speculator.py
│ ├── modernbert.py
│ ├── module_mapping.py
│ ├── molmo.py
│ ├── moonvit.py
│ ├── mpt.py
│ ├── nemotron.py
│ ├── nemotron_nas.py
│ ├── nvlm_d.py
│ ├── olmo.py
│ ├── olmo2.py
│ ├── olmoe.py
│ ├── opt.py
│ ├── orion.py
│ ├── ovis.py
│ ├── paligemma.py
│ ├── persimmon.py
│ ├── phi.py
│ ├── phi3.py
│ ├── phi3_small.py
│ ├── phi3v.py
│ ├── phi4mm.py
│ ├── phi4mm_audio.py
│ ├── phi4mm_utils.py
│ ├── phimoe.py
│ ├── pixtral.py
│ ├── plamo2.py
│ ├── prithvi_geospatial_mae.py
│ ├── qwen.py
│ ├── qwen2.py
│ ├── qwen2_5_omni_thinker.py
│ ├── qwen2_5_vl.py
│ ├── qwen2_audio.py
│ ├── qwen2_moe.py
│ ├── qwen2_rm.py
│ ├── qwen2_vl.py
│ ├── qwen3.py
│ ├── qwen3_moe.py
│ ├── qwen_vl.py
│ ├── registry.py
│ ├── roberta.py
│ ├── siglip.py
│ ├── skyworkr1v.py
│ ├── smolvlm.py
│ ├── solar.py
│ ├── stablelm.py
│ ├── starcoder2.py
│ ├── tarsier.py
│ ├── telechat2.py
│ ├── teleflm.py
│ ├── transformers.py
│ ├── ultravox.py
│ ├── utils.py
│ ├── vision.py
│ ├── whisper.py
│ └── zamba2.py
├── parameter.py
├── pooling_metadata.py
├── sampling_metadata.py
└── utils.py
├── multimodal
├── __init__.py
├── audio.py
├── base.py
├── hasher.py
├── image.py
├── inputs.py
├── parse.py
├── processing.py
├── profiling.py
├── registry.py
├── utils.py
└── video.py
├── outputs.py
├── platforms
├── __init__.py
├── cpu.py
├── cuda.py
├── hpu.py
├── interface.py
├── neuron.py
├── rocm.py
├── tpu.py
└── xpu.py
├── plugins
├── __init__.py
└── lora_resolvers
│ ├── README.md
│ ├── __init__.py
│ └── filesystem_resolver.py
├── pooling_params.py
├── profiler
├── __init__.py
├── layerwise_profile.py
└── utils.py
├── prompt_adapter
├── __init__.py
├── layers.py
├── models.py
├── request.py
├── utils.py
└── worker_manager.py
├── py.typed
├── reasoning
├── __init__.py
├── abs_reasoning_parsers.py
├── deepseek_r1_reasoning_parser.py
├── granite_reasoning_parser.py
└── qwen3_reasoning_parser.py
├── sampling_params.py
├── scalar_type.py
├── scripts.py
├── sequence.py
├── spec_decode
├── __init__.py
├── batch_expansion.py
├── draft_model_runner.py
├── interfaces.py
├── medusa_worker.py
├── metrics.py
├── mlp_speculator_worker.py
├── mqa_scorer.py
├── multi_step_worker.py
├── ngram_worker.py
├── proposer_worker_base.py
├── smaller_tp_proposer_worker.py
├── spec_decode_worker.py
├── target_model_runner.py
├── top1_proposer.py
└── util.py
├── test_utils.py
├── third_party
├── __init__.py
└── pynvml.py
├── tracing.py
├── transformers_utils
├── __init__.py
├── chat_templates
│ ├── __init__.py
│ ├── registry.py
│ ├── template_basic.jinja
│ ├── template_blip2.jinja
│ ├── template_chatml.jinja
│ ├── template_deepseek_vl2.jinja
│ └── template_fuyu.jinja
├── config.py
├── configs
│ ├── __init__.py
│ ├── arctic.py
│ ├── chatglm.py
│ ├── cohere2.py
│ ├── dbrx.py
│ ├── deepseek_vl2.py
│ ├── eagle.py
│ ├── exaone.py
│ ├── falcon.py
│ ├── h2ovl.py
│ ├── internvl.py
│ ├── jais.py
│ ├── kimi_vl.py
│ ├── medusa.py
│ ├── minimax_text_01.py
│ ├── minimax_vl_01.py
│ ├── mllama.py
│ ├── mlp_speculator.py
│ ├── moonvit.py
│ ├── mpt.py
│ ├── nemotron.py
│ ├── nvlm_d.py
│ ├── ovis.py
│ ├── skyworkr1v.py
│ ├── solar.py
│ ├── telechat2.py
│ └── ultravox.py
├── detokenizer.py
├── detokenizer_utils.py
├── processor.py
├── processors
│ ├── __init__.py
│ ├── deepseek_vl2.py
│ └── ovis.py
├── s3_utils.py
├── tokenizer.py
├── tokenizer_base.py
├── tokenizer_group.py
├── tokenizers
│ ├── __init__.py
│ └── mistral.py
└── utils.py
├── triton_utils
├── __init__.py
└── importing.py
├── usage
├── __init__.py
└── usage_lib.py
├── utils.py
├── v1
├── __init__.py
├── attention
│ ├── __init__.py
│ └── backends
│ │ ├── __init__.py
│ │ ├── cpu_attn.py
│ │ ├── flash_attn.py
│ │ ├── flashinfer.py
│ │ ├── mla
│ │ ├── __init__.py
│ │ ├── common.py
│ │ ├── cutlass_mla.py
│ │ ├── flashmla.py
│ │ ├── rocm_aiter_mla.py
│ │ └── triton_mla.py
│ │ ├── pallas.py
│ │ ├── triton_attn.py
│ │ └── utils.py
├── core
│ ├── __init__.py
│ ├── block_pool.py
│ ├── encoder_cache_manager.py
│ ├── kv_cache_manager.py
│ ├── kv_cache_utils.py
│ ├── sched
│ │ ├── __init__.py
│ │ ├── interface.py
│ │ ├── output.py
│ │ ├── scheduler.py
│ │ └── utils.py
│ └── single_type_kv_cache_manager.py
├── engine
│ ├── __init__.py
│ ├── async_llm.py
│ ├── coordinator.py
│ ├── core.py
│ ├── core_client.py
│ ├── detokenizer.py
│ ├── exceptions.py
│ ├── llm_engine.py
│ ├── logprobs.py
│ ├── mm_input_cache.py
│ ├── output_processor.py
│ ├── parallel_sampling.py
│ └── processor.py
├── executor
│ ├── __init__.py
│ ├── abstract.py
│ ├── multiproc_executor.py
│ └── ray_distributed_executor.py
├── kv_cache_interface.py
├── metrics
│ ├── __init__.py
│ ├── loggers.py
│ ├── prometheus.py
│ ├── ray_wrappers.py
│ ├── reader.py
│ └── stats.py
├── outputs.py
├── request.py
├── sample
│ ├── __init__.py
│ ├── metadata.py
│ ├── ops
│ │ ├── __init__.py
│ │ ├── bad_words.py
│ │ ├── penalties.py
│ │ └── topk_topp_sampler.py
│ ├── rejection_sampler.py
│ ├── sampler.py
│ └── tpu
│ │ ├── __init__.py
│ │ ├── metadata.py
│ │ └── sampler.py
├── serial_utils.py
├── spec_decode
│ ├── __init__.py
│ ├── eagle.py
│ ├── medusa.py
│ ├── metadata.py
│ ├── metrics.py
│ ├── ngram_proposer.py
│ └── utils.py
├── structured_output
│ ├── __init__.py
│ ├── backend_guidance.py
│ ├── backend_types.py
│ ├── backend_xgrammar.py
│ ├── request.py
│ └── utils.py
├── utils.py
└── worker
│ ├── __init__.py
│ ├── block_table.py
│ ├── cpu_model_runner.py
│ ├── cpu_worker.py
│ ├── gpu_input_batch.py
│ ├── gpu_model_runner.py
│ ├── gpu_worker.py
│ ├── lora_model_runner_mixin.py
│ ├── tpu_model_runner.py
│ ├── tpu_worker.py
│ ├── utils.py
│ └── worker_base.py
├── version.py
├── vllm_flash_attn
└── .gitkeep
└── worker
├── __init__.py
├── cache_engine.py
├── cpu_enc_dec_model_runner.py
├── cpu_model_runner.py
├── cpu_pooling_model_runner.py
├── cpu_worker.py
├── enc_dec_model_runner.py
├── hpu_model_runner.py
├── hpu_worker.py
├── model_runner.py
├── model_runner_base.py
├── multi_step_hpu_worker.py
├── multi_step_model_runner.py
├── multi_step_neuron_model_runner.py
├── multi_step_neuronx_distributed_model_runner.py
├── multi_step_tpu_worker.py
├── multi_step_worker.py
├── neuron_model_runner.py
├── neuron_worker.py
├── neuronx_distributed_model_runner.py
├── pooling_model_runner.py
├── tpu_model_runner.py
├── tpu_worker.py
├── utils.py
├── worker.py
├── worker_base.py
├── xpu_model_runner.py
└── xpu_worker.py
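
The long runs of `E=...,N=...,device_name=...` and `N=...,K=...,block_shape=...` JSON files listed under the fused_moe and quantization `configs` directories above are pre-tuned Triton kernel configurations; the filename itself is the lookup key (expert count or GEMM shape, GPU name, and optional dtype/block shape). A rough sketch of that lookup, using illustrative helper names rather than vLLM's actual API:

```python
# Illustrative only: the tuned-kernel JSON files are keyed by filename, which
# encodes the MoE/GEMM shape and the accelerator. The real lookup lives in
# vLLM's fused_moe and quantization utilities; this sketch shows the convention.
import json
import os
from typing import Optional


def config_filename(E: int, N: int, device_name: str, dtype: Optional[str] = None) -> str:
    dtype_part = f",dtype={dtype}" if dtype else ""
    return f"E={E},N={N},device_name={device_name}{dtype_part}.json"


def load_tuned_config(configs_dir: str, E: int, N: int, device_name: str,
                      dtype: Optional[str] = None) -> Optional[dict]:
    path = os.path.join(configs_dir, config_filename(E, N, device_name, dtype))
    if not os.path.exists(path):
        return None  # caller falls back to default kernel parameters
    with open(path) as f:
        # Each file maps a token-count bucket to Triton launch parameters
        # (block sizes, warp counts, pipeline stages, and so on).
        return json.load(f)


# Example: the H100 fp8 config for an 8-expert, N=7168 MoE layer.
print(config_filename(8, 7168, "NVIDIA_H100_80GB_HBM3", "fp8_w8a8"))
# -> E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
```

When no matching file exists for a shape/GPU pair, the kernels fall back to default launch parameters, which is why the directory only covers the most common model shapes.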
/.buildkite/generate_index.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | import argparse
5 | import os
6 |
7 | template = """<!DOCTYPE html>
8 | <html>
9 |   <body>
10 |     <h1>Links for vLLM</h1>
11 |     <a href="{wheel_html_escaped}">{wheel}</a><br/>
12 |   </body>
13 | </html>
14 | """
15 |
16 | parser = argparse.ArgumentParser()
17 | parser.add_argument("--wheel", help="The wheel path.", required=True)
18 | args = parser.parse_args()
19 |
20 | filename = os.path.basename(args.wheel)
21 |
22 | with open("index.html", "w") as f:
23 |     print(f"Generated index.html for {args.wheel}")
24 |     # cloudfront requires escaping the '+' character
25 |     f.write(
26 |         template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
27 |     )
28 |
--------------------------------------------------------------------------------
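
The one subtlety in `generate_index.py` is the link target: the `+` that introduces a wheel's local version segment has to be percent-encoded, since CloudFront only serves the escaped form. A small illustration of the substitution (the wheel filename below is made up):

```python
# Illustration only; the wheel filename is hypothetical.
filename = "vllm-0.9.0+cu128-cp38-abi3-manylinux1_x86_64.whl"
escaped = filename.replace("+", "%2B")  # CloudFront requires the '+' to be escaped
print(f'<a href="{escaped}">{filename}</a><br/>')
# -> <a href="vllm-0.9.0%2Bcu128-cp38-abi3-manylinux1_x86_64.whl">vllm-0.9.0+cu128-cp38-abi3-manylinux1_x86_64.whl</a><br/>
```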
/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
3 | model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.671
9 | - name: "exact_match,flexible-extract"
10 | value: 0.664
11 | limit: 1000
12 | num_fewshot: 5
13 | trust_remote_code: True
--------------------------------------------------------------------------------
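
Every config in this directory follows the same shape: a model name plus the GSM8K scores CI expects a fresh lm-eval run to reproduce. A minimal sketch of such a check, assuming the measured scores arrive as a flat `{"task:metric": value}` dict and a 5% relative tolerance (both are assumptions; the actual logic lives in `test_lm_eval_correctness.py`):

```python
# Hedged sketch, not the repo's test code.
import yaml
import numpy as np

RTOL = 0.05  # assumed relative tolerance


def check_config(config_path: str, measured: dict[str, float]) -> None:
    """Compare measured lm-eval metrics against the expected values in the YAML."""
    with open(config_path) as f:
        config = yaml.safe_load(f)
    for task in config["tasks"]:
        for metric in task["metrics"]:
            key = f'{task["name"]}:{metric["name"]}'  # e.g. "gsm8k:exact_match,strict-match"
            assert np.isclose(measured[key], metric["value"], rtol=RTOL), (
                f'{key}: measured {measured[key]}, expected {metric["value"]}'
            )
```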
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml:
--------------------------------------------------------------------------------
1 | # For hf script, without -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
3 | model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.905
9 | - name: "exact_match,flexible-extract"
10 | value: 0.905
11 | limit: 1000
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml:
--------------------------------------------------------------------------------
1 | # For hf script, without -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
3 | model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.892
9 | - name: "exact_match,flexible-extract"
10 | value: 0.892
11 | limit: 250
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
3 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.752
9 | - name: "exact_match,flexible-extract"
10 | value: 0.754
11 | limit: 1000
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
3 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.753
9 | - name: "exact_match,flexible-extract"
10 | value: 0.753
11 | limit: 1000
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
3 | model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.755
9 | - name: "exact_match,flexible-extract"
10 | value: 0.755
11 | limit: 1000
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
3 | model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.753
9 | - name: "exact_match,flexible-extract"
10 | value: 0.753
11 | limit: 1000
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
3 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.764
9 | - name: "exact_match,flexible-extract"
10 | value: 0.764
11 | limit: 250
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
3 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.728
9 | - name: "exact_match,flexible-extract"
10 | value: 0.728
11 | limit: 250
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
3 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.758
9 | - name: "exact_match,flexible-extract"
10 | value: 0.759
11 | limit: 1000
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml:
--------------------------------------------------------------------------------
1 | # For hf script, without -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
3 | model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.756
9 | - name: "exact_match,flexible-extract"
10 | value: 0.752
11 | limit: 250
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
3 | model_name: "HandH1998/QQQ-Llama-3-8b-g128"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.419
9 | - name: "exact_match,flexible-extract"
10 | value: 0.416
11 | limit: 1000
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml:
--------------------------------------------------------------------------------
1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
2 | model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
3 | tasks:
4 | - name: "gsm8k"
5 | metrics:
6 | - name: "exact_match,strict-match"
7 | value: 0.335
8 | - name: "exact_match,flexible-extract"
9 | value: 0.323
10 | limit: 1319
11 | num_fewshot: 5
12 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
3 | model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.356
9 | - name: "exact_match,flexible-extract"
10 | value: 0.358
11 | limit: 1000
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
3 | model_name: "mgoin/Minitron-4B-Base-FP8"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.231
9 | - name: "exact_match,flexible-extract"
10 | value: 0.22
11 | limit: 1000
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
3 | model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.86
9 | - name: "exact_match,flexible-extract"
10 | value: 0.86
11 | limit: 250
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
3 | model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.624
9 | - name: "exact_match,flexible-extract"
10 | value: 0.624
11 | limit: 250
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml:
--------------------------------------------------------------------------------
1 | # For hf script, without -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m mistralai/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
3 | model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.616
9 | - name: "exact_match,flexible-extract"
10 | value: 0.632
11 | limit: 250
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
3 | model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.30
9 | - name: "exact_match,flexible-extract"
10 | value: 0.465
11 | limit: 1319
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
3 | model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.578
9 | - name: "exact_match,flexible-extract"
10 | value: 0.585
11 | limit: 1000
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
3 | model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.593
9 | - name: "exact_match,flexible-extract"
10 | value: 0.588
11 | limit: 1000
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
3 | model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.595
9 | - name: "exact_match,flexible-extract"
10 | value: 0.582
11 | limit: 1000
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
3 | model_name: "Qwen/Qwen2-57B-A14B-Instruct"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.792
9 | - name: "exact_match,flexible-extract"
10 | value: 0.824
11 | limit: 250
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml:
--------------------------------------------------------------------------------
1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
2 | model_name: "Qwen/Qwen2.5-1.5B-Instruct"
3 | tasks:
4 | - name: "gsm8k"
5 | metrics:
6 | - name: "exact_match,strict-match"
7 | value: 0.54
8 | - name: "exact_match,flexible-extract"
9 | value: 0.59
10 | limit: 1319
11 | num_fewshot: 5
12 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml:
--------------------------------------------------------------------------------
1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
2 | model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
3 | tasks:
4 | - name: "gsm8k"
5 | metrics:
6 | - name: "exact_match,strict-match"
7 | value: 0.47
8 | - name: "exact_match,flexible-extract"
9 | value: 0.64
10 | limit: 1319
11 | num_fewshot: 5
12 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
3 | model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.6353
9 | - name: "exact_match,flexible-extract"
10 | value: 0.637
11 | limit: null
12 | num_fewshot: null
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/models-large.txt:
--------------------------------------------------------------------------------
1 | Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
2 | Meta-Llama-3-70B-Instruct.yaml
3 | Mixtral-8x7B-Instruct-v0.1.yaml
4 | Qwen2-57B-A14-Instruct.yaml
5 | DeepSeek-V2-Lite-Chat.yaml
6 | Meta-Llama-3-8B-QQQ.yaml
7 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/models-small.txt:
--------------------------------------------------------------------------------
1 | Qwen2.5-1.5B-Instruct.yaml
2 | Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
3 | Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
4 | Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
5 | Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
6 | Qwen1.5-MoE-W4A16-compressed-tensors.yaml
7 |
--------------------------------------------------------------------------------
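
`models-large.txt` and `models-small.txt` select which of the YAML configs above a given CI job runs (roughly, the heavier multi-GPU models versus the quick single-GPU ones). A sketch of how such a list could be consumed (illustrative only; the real dispatch happens in the Buildkite pipeline and the harness scripts):

```python
# Illustrative only: expand a models-*.txt list into the config files it names.
from pathlib import Path

configs_dir = Path(".buildkite/lm-eval-harness/configs")
selected = [
    line.strip()
    for line in (configs_dir / "models-small.txt").read_text().splitlines()
    if line.strip()
]
for name in selected:
    # A real job would hand each config to the lm-eval correctness test.
    print(configs_dir / name)
```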
/.buildkite/nightly-benchmarks/nightly-annotation.md:
--------------------------------------------------------------------------------
1 |
2 | ## Description
3 |
4 | This file contains the download links for the benchmarking results.
5 |
6 | - [benchmarking pipeline](artifact://nightly-pipeline.yaml)
7 | - [benchmarking results](artifact://results.zip)
8 | - [benchmarking code](artifact://nightly-benchmarks.zip)
9 |
10 | Please download the visualization scripts in the post.
11 |
12 | ## Results reproduction
13 |
14 | - Find the Docker image we use in the `benchmarking pipeline`.
15 | - Deploy the Docker image, and inside the container:
16 | - Download `nightly-benchmarks.zip`.
17 | - In the same folder, run the following code:
18 |
19 | ```console
20 | export HF_TOKEN=
21 | apt update
22 | apt install -y git
23 | unzip nightly-benchmarks.zip
24 | VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
25 | ```
26 |
27 | The results will be in `./benchmarks/results`.
28 |
--------------------------------------------------------------------------------
/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | import argparse
5 |
6 | from transformers import AutoTokenizer
7 |
8 |
9 | def main(model, cachedir):
10 |     # Load the tokenizer and save it to the specified directory
11 |     tokenizer = AutoTokenizer.from_pretrained(model)
12 |     tokenizer.save_pretrained(cachedir)
13 |     print(f"Tokenizer saved to {cachedir}")
14 |
15 |
16 | if __name__ == "__main__":
17 |     parser = argparse.ArgumentParser(
18 |         description="Download and save Hugging Face tokenizer"
19 |     )
20 |     parser.add_argument("--model", type=str, required=True, help="Name of the model")
21 |     parser.add_argument(
22 |         "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
23 |     )
24 |
25 |     args = parser.parse_args()
26 |     main(args.model, args.cachedir)
27 |
--------------------------------------------------------------------------------
/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from lmdeploy.serve.openai.api_client import APIClient
5 |
6 | api_client = APIClient("http://localhost:8000")
7 | model_name = api_client.available_models[0]
8 |
9 | print(model_name)
10 |
--------------------------------------------------------------------------------
/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
3 | if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
4 | URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
5 | else
6 | URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
7 | fi
8 |
9 | TIMEOUT_SECONDS=10
10 |
11 | retries=0
12 | while [ $retries -lt 1000 ]; do
13 | if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
14 | exit 0
15 | fi
16 |
17 | echo "Waiting for image to be available..."
18 |
19 | retries=$((retries + 1))
20 | sleep 5
21 | done
22 |
23 | exit 1
24 |
--------------------------------------------------------------------------------
/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "test_name": "llama8B_tp1_genai_perf",
4 | "qps_list": [4,8,16,32],
5 | "common_parameters": {
6 | "model": "meta-llama/Meta-Llama-3-8B-Instruct",
7 | "tp": 1,
8 | "port": 8000,
9 | "num_prompts": 500,
10 | "reuse_server": false
11 | },
12 | "vllm_server_parameters": {
13 | "disable_log_stats": "",
14 | "disable_log_requests": "",
15 | "gpu_memory_utilization": 0.9,
16 | "num_scheduler_steps": 10,
17 | "max_num_seqs": 512,
18 | "dtype": "bfloat16"
19 | },
20 | "genai_perf_input_parameters": {
21 | }
22 | }
23 | ]
--------------------------------------------------------------------------------
/.buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # This script builds the CPU docker image and runs the offline inference inside the container.
4 | # It serves as a sanity check for compilation and basic model usage.
5 | set -ex
6 |
7 | # Setup cleanup
8 | remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; }
9 | trap remove_docker_container EXIT
10 | remove_docker_container
11 |
12 | # Try building the docker image
13 | docker build -t cpu-test -f docker/Dockerfile.s390x .
14 |
--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
1 | BasedOnStyle: Google
2 | UseTab: Never
3 | IndentWidth: 2
4 | ColumnLimit: 80
5 |
6 | # Force pointers to the type for C++.
7 | DerivePointerAlignment: false
8 | PointerAlignment: Left
9 |
10 | # Reordering #include statements can (and currently will) introduce errors
11 | SortIncludes: false
12 |
13 | # Style choices
14 | AlignConsecutiveAssignments: false
15 | AlignConsecutiveDeclarations: false
16 | IndentPPDirectives: BeforeHash
17 |
18 | IncludeCategories:
19 | - Regex: '^<'
20 | Priority: 4
21 | - Regex: '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/'
22 | Priority: 3
23 | - Regex: '^"(qoda|\.\.)/'
24 | Priority: 2
25 | - Regex: '.*'
26 | Priority: 1
27 |
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | /.venv
2 | /build
3 | dist
4 | vllm/*.so
5 |
6 | # Byte-compiled / optimized / DLL files
7 | __pycache__/
8 | *.py[cod]
9 | *$py.class
10 |
11 | .mypy_cache
12 |
13 | # Distribution / packaging
14 | .Python
15 | /build/
16 | cmake-build-*/
17 | CMakeUserPresets.json
18 | develop-eggs/
19 | /dist/
20 | downloads/
21 | eggs/
22 | .eggs/
23 | lib/
24 | lib64/
25 | parts/
26 | sdist/
27 | var/
28 | wheels/
29 | share/python-wheels/
30 | *.egg-info/
31 | .installed.cfg
32 | *.egg
33 | MANIFEST
34 |
--------------------------------------------------------------------------------
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | # See https://help.github.com/articles/about-codeowners/
2 | # for more info about CODEOWNERS file
3 |
4 | * @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang
5 |
6 | /csrc/ @charlifu @mawong-amd @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang
7 | /vllm/ @charlifu @mawong-amd @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang
8 |
9 | fused_moe @divakar-amd @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang
10 |
11 | /tests/ @Alexei-V-Ivanov-AMD @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang
12 | /.buildkite/ @Alexei-V-Ivanov-AMD @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang
13 |
14 | /benchmarks/profiling @AdrianAbeyta @dllehr-amd @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang
15 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: [vllm-project]
2 | open_collective: vllm
3 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 | contact_links:
3 | - name: Questions
4 | url: https://discuss.vllm.ai
5 | about: Ask questions and discuss with other vLLM community members
6 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | Please direct your PRs to the upstream vLLM repository (https://github.com/vllm-project/vllm.git).
2 |
3 | Accepting PRs into the ROCm fork (https://github.com/ROCm/vllm) requires a clear, previously communicated exception.
4 |
--------------------------------------------------------------------------------
/.github/workflows/add_label_automerge.yml:
--------------------------------------------------------------------------------
1 | name: Add label on auto-merge enabled
2 | permissions:
3 | pull-requests: write
4 | on:
5 | pull_request_target:
6 | types:
7 | - auto_merge_enabled
8 | jobs:
9 | add-label-on-auto-merge:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - name: Add label
13 | uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
14 | with:
15 | script: |
16 | github.rest.issues.addLabels({
17 | owner: context.repo.owner,
18 | repo: context.repo.repo,
19 | issue_number: context.issue.number,
20 | labels: ['ready']
21 | })
22 | env:
23 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
24 |
--------------------------------------------------------------------------------
/.github/workflows/cleanup_pr_body.yml:
--------------------------------------------------------------------------------
1 | name: Cleanup PR Body
2 |
3 | on:
4 | pull_request_target:
5 | types: [opened, reopened, edited]
6 |
7 | permissions:
8 | pull-requests: write
9 |
10 | jobs:
11 | update-description:
12 | runs-on: ubuntu-latest
13 |
14 | steps:
15 | - name: Checkout repository
16 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
17 |
18 | - name: Set up Python
19 | uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
20 | with:
21 | python-version: '3.12'
22 |
23 | - name: Install Python dependencies
24 | run: |
25 | python3 -m pip install --upgrade pip
26 | python3 -m pip install regex
27 |
28 | - name: Update PR description
29 | env:
30 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
31 | run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
32 |
--------------------------------------------------------------------------------
/.github/workflows/matchers/actionlint.json:
--------------------------------------------------------------------------------
1 | {
2 | "problemMatcher": [
3 | {
4 | "owner": "actionlint",
5 | "pattern": [
6 | {
7 | "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$",
8 | "file": 1,
9 | "line": 2,
10 | "column": 3,
11 | "message": 4,
12 | "code": 5
13 | }
14 | ]
15 | }
16 | ]
17 | }
18 |
--------------------------------------------------------------------------------
/.github/workflows/matchers/mypy.json:
--------------------------------------------------------------------------------
1 | {
2 | "problemMatcher": [
3 | {
4 | "owner": "mypy",
5 | "pattern": [
6 | {
7 | "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$",
8 | "file": 1,
9 | "line": 2,
10 | "severity": 3,
11 | "message": 4
12 | }
13 | ]
14 | }
15 | ]
16 | }
17 |
--------------------------------------------------------------------------------
/.github/workflows/pre-commit.yml:
--------------------------------------------------------------------------------
1 | name: pre-commit
2 |
3 | on:
4 | pull_request:
5 | push:
6 | branches: [main]
7 |
8 | permissions:
9 | contents: read
10 |
11 | jobs:
12 | pre-commit:
13 | runs-on: ubuntu-latest
14 | steps:
15 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
16 | - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
17 | with:
18 | python-version: "3.12"
19 | - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
20 | - run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
21 | - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
22 | with:
23 | extra_args: --all-files --hook-stage manual
24 |
--------------------------------------------------------------------------------
/.github/workflows/scripts/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -eux
3 |
4 | python_executable=python3
5 |
6 | # Update paths
7 | # Install requirements
8 | $python_executable -m pip install -r requirements/rocm.txt
9 |
10 | # Limit the number of parallel jobs to avoid OOM
11 | export MAX_JOBS=1
12 | # Make sure release wheels are built for the following architectures
13 | export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
14 |
15 | rm -f "$(which sccache)"
16 |
17 | export MAX_JOBS=32  # overrides the earlier limit; lower this if builds run out of memory
18 |
19 | # Build
20 | $python_executable setup.py bdist_wheel --dist-dir=dist
21 |
--------------------------------------------------------------------------------
/.github/workflows/scripts/create_release.js:
--------------------------------------------------------------------------------
1 | // Uses GitHub's API to create the release and wait for result.
2 | // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately.
3 |
4 | module.exports = async (github, context, core) => {
5 | try {
6 | const response = await github.rest.repos.createRelease({
7 | draft: false,
8 | generate_release_notes: true,
9 | name: process.env.RELEASE_TAG,
10 | owner: context.repo.owner,
11 | prerelease: true,
12 | repo: context.repo.repo,
13 | tag_name: process.env.RELEASE_TAG,
14 | });
15 |
16 | core.setOutput('upload_url', response.data.upload_url);
17 | } catch (error) {
18 | core.setFailed(error.message);
19 | }
20 | }
--------------------------------------------------------------------------------
/.github/workflows/scripts/cuda-install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Replace '.' with '-' ex: 11.8 -> 11-8
4 | cuda_version=$(echo "$1" | tr "." "-")
5 | # Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004
6 | OS=$(echo "$2" | tr -d ".\-")
7 |
8 | # Installs CUDA
9 | wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb"
10 | sudo dpkg -i cuda-keyring_1.1-1_all.deb
11 | rm cuda-keyring_1.1-1_all.deb
12 | sudo apt -qq update
13 | sudo apt -y install "cuda-${cuda_version}" "cuda-nvcc-${cuda_version}" "cuda-libraries-dev-${cuda_version}"
14 | sudo apt clean
15 |
16 | # Test nvcc
17 | PATH=/usr/local/cuda-$1/bin:${PATH}
18 | nvcc --version
19 |
20 | # Log gcc, g++, c++ versions
21 | gcc --version
22 | g++ --version
23 | c++ --version
24 |
--------------------------------------------------------------------------------
/.github/workflows/scripts/pytorch-install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python_executable=python$1
4 | pytorch_version=$2
5 | cuda_version=$3
6 |
7 | # Install torch
8 | $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya
9 | $python_executable -m pip install torch=="${pytorch_version}+cu${cuda_version//./}" --extra-index-url "https://download.pytorch.org/whl/cu${cuda_version//./}"
10 |
11 | # Print version information
12 | $python_executable --version
13 | $python_executable -c "import torch; print('PyTorch:', torch.__version__)"
14 | $python_executable -c "import torch; print('CUDA:', torch.version.cuda)"
15 | $python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)"
16 |
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | # Read the Docs configuration file
2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
3 |
4 | version: 2
5 |
6 | build:
7 | os: ubuntu-22.04
8 | tools:
9 | python: "3.12"
10 |
11 | mkdocs:
12 | configuration: mkdocs.yaml
13 |
14 | # Optionally declare the Python requirements required to build your docs
15 | python:
16 | install:
17 | - requirements: requirements/docs.txt
18 |
--------------------------------------------------------------------------------
/.shellcheckrc:
--------------------------------------------------------------------------------
1 | # rules currently disabled:
2 | #
3 | # SC1091 (info): Not following: was not specified as input (see shellcheck -x)
4 | # SC2004 (style): $/${} is unnecessary on arithmetic variables.
5 | # SC2129 (style): Consider using { cmd1; cmd2; } >> file instead of individual redirects.
6 | # SC2155 (warning): Declare and assign separately to avoid masking return values.
7 | # SC2164 (warning): Use 'cd ... || exit' or 'cd ... || return' in case cd fails.
8 | #
9 | disable=SC1091,SC2004,SC2129,SC2155,SC2164
10 |
--------------------------------------------------------------------------------
/.yapfignore:
--------------------------------------------------------------------------------
1 | collect_env.py
2 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to vLLM
2 |
3 | You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing).
4 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include requirements/common.txt
3 | include requirements/cuda.txt
4 | include requirements/rocm.txt
5 | include requirements/neuron.txt
6 | include requirements/cpu.txt
7 | include CMakeLists.txt
8 |
9 | recursive-include cmake *
10 | recursive-include csrc *
11 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Security Policy
2 |
3 | ## Reporting a Vulnerability
4 |
5 | If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.
6 |
7 | Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html).
8 |
9 | ---
10 |
11 | Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations.
12 |
13 | Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
14 |
--------------------------------------------------------------------------------
/benchmarks/kernels/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas
--------------------------------------------------------------------------------
/benchmarks/structured_schemas/structured_schema_1.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "object",
3 | "properties": {
4 | "name": { "type": "string" },
5 | "email": { "type": "string" },
6 | "street": { "type": "string" },
7 | "city": { "type": "string" },
8 | "state": { "type": "string" },
9 | "zip": { "type": "string" },
10 | "phone": { "type": "string" },
11 | "website": { "type": "string" },
12 | "company": { "type": "string" },
13 | "age": { "type": "integer" }
14 | },
15 | "required": [
16 | "name",
17 | "email"
18 | ]
19 | }
20 |
--------------------------------------------------------------------------------
/csrc/attention/attention_dtypes.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "attention_generic.cuh"
4 | #include "dtype_float16.cuh"
5 | #include "dtype_float32.cuh"
6 | #include "dtype_bfloat16.cuh"
7 | #include "dtype_fp8.cuh"
8 |
--------------------------------------------------------------------------------
/csrc/attention/dtype_fp8.cuh:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "attention_generic.cuh"
4 |
5 | #include <stdint.h>
6 | #ifdef ENABLE_FP8
7 | #ifndef USE_ROCM
8 | #include <c10/util/Float8_e4m3fn.h>
9 | #endif // USE_ROCM
10 | #endif // ENABLE_FP8
11 |
12 | namespace vllm {
13 |
14 | enum class Fp8KVCacheDataType {
15 | kAuto = 0,
16 | kFp8E4M3 = 1,
17 | kFp8E5M2 = 2,
18 | };
19 |
20 | // fp8 vector types for quantization of kv cache
21 | template <>
22 | struct Vec<uint8_t, 1> {
23 | using Type = uint8_t;
24 | };
25 |
26 | template <>
27 | struct Vec<uint8_t, 2> {
28 | using Type = uint16_t;
29 | };
30 |
31 | template <>
32 | struct Vec<uint8_t, 4> {
33 | using Type = uint32_t;
34 | };
35 |
36 | template <>
37 | struct Vec<uint8_t, 8> {
38 | using Type = uint2;
39 | };
40 |
41 | } // namespace vllm
42 |
--------------------------------------------------------------------------------
/csrc/core/exception.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #define VLLM_IMPLIES(p, q) (!(p) || (q))
4 |
--------------------------------------------------------------------------------
/csrc/core/math.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include <climits>
4 | #include <cstdint>
5 |
6 | inline constexpr uint32_t next_pow_2(uint32_t const num) {
7 | if (num <= 1) return num;
8 | return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
9 | }
10 |
11 | template <typename A, typename B>
12 | static inline constexpr auto div_ceil(A a, B b) {
13 | return (a + b - 1) / b;
14 | }
15 |
16 | // Round a down to the next multiple of b. The caller is responsible for making
17 | // sure that b is non-zero
18 | template <typename T>
19 | inline constexpr T round_to_previous_multiple_of(T a, T b) {
20 | return a % b == 0 ? a : (a / b) * b;
21 | }
22 |
23 | // Round a up to the next multiple of b. The caller is responsible for making
24 | // sure that b is non-zero
25 | template <typename T>
26 | inline constexpr T round_to_next_multiple_of(T a, T b) {
27 | return a % b == 0 ? a : ((a / b) + 1) * b;
28 | }
29 |
--------------------------------------------------------------------------------
/csrc/cpu/cpu_types.hpp:
--------------------------------------------------------------------------------
1 | #ifndef CPU_TYPES_HPP
2 | #define CPU_TYPES_HPP
3 |
4 | #if defined(__x86_64__)
5 | // x86 implementation
6 | #include "cpu_types_x86.hpp"
7 | #elif defined(__POWER9_VECTOR__)
8 | // ppc implementation
9 | #include "cpu_types_vsx.hpp"
10 | #elif defined(__s390x__)
11 | // s390 implementation
12 | #include "cpu_types_vxe.hpp"
13 | #elif defined(__aarch64__)
14 | // arm implementation
15 | #include "cpu_types_arm.hpp"
16 | #else
17 | #warning "unsupported vLLM cpu implementation"
18 | #endif
19 |
20 | #endif
--------------------------------------------------------------------------------
/csrc/cutlass_extensions/common.cpp:
--------------------------------------------------------------------------------
1 | #include "cutlass_extensions/common.hpp"
2 |
3 | int32_t get_sm_version_num() {
4 | int32_t major_capability, minor_capability;
5 | cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor,
6 | 0);
7 | cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor,
8 | 0);
9 | int32_t version_num = major_capability * 10 + minor_capability;
10 | return version_num;
11 | }
--------------------------------------------------------------------------------
/csrc/moe/marlin_moe_wna16/.gitignore:
--------------------------------------------------------------------------------
1 | kernel_*.cu
--------------------------------------------------------------------------------
/csrc/prepare_inputs/advance_step.cuh:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 |
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | namespace prepare_inputs {
13 |
14 | static constexpr int max_threads = 256;
15 | static constexpr bool logging = false;
16 |
17 | constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }
18 |
19 | } // namespace prepare_inputs
20 |
--------------------------------------------------------------------------------
/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu:
--------------------------------------------------------------------------------
1 | #include "scaled_mm_kernels.hpp"
2 | #include "scaled_mm_blockwise_sm100_fp8_dispatch.cuh"
3 | #include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
4 |
5 | namespace vllm {
6 |
7 | void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out,
8 | torch::Tensor const& a,
9 | torch::Tensor const& b,
10 | torch::Tensor const& a_scales,
11 | torch::Tensor const& b_scales) {
12 | if (out.dtype() == torch::kBFloat16) {
13 | cutlass_gemm_blockwise_sm100_fp8_dispatch<cutlass::bfloat16_t>(
14 | out, a, b, a_scales, b_scales);
15 |
16 | } else {
17 | TORCH_CHECK(out.dtype() == torch::kFloat16);
18 | cutlass_gemm_blockwise_sm100_fp8_dispatch<cutlass::half_t>(
19 | out, a, b, a_scales, b_scales);
20 | }
21 | }
22 |
23 | } // namespace vllm
24 |
--------------------------------------------------------------------------------
/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu:
--------------------------------------------------------------------------------
1 |
2 | #include "scaled_mm_kernels.hpp"
3 | #include "scaled_mm_blockwise_sm90_fp8_dispatch.cuh"
4 | #include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
5 |
6 | namespace vllm {
7 |
8 | void cutlass_scaled_mm_blockwise_sm90_fp8(torch::Tensor& out,
9 | torch::Tensor const& a,
10 | torch::Tensor const& b,
11 | torch::Tensor const& a_scales,
12 | torch::Tensor const& b_scales) {
13 | if (out.dtype() == torch::kBFloat16) {
14 | cutlass_gemm_blockwise_sm90_fp8_dispatch<cutlass::bfloat16_t>(
15 | out, a, b, a_scales, b_scales);
16 |
17 | } else {
18 | TORCH_CHECK(out.dtype() == torch::kFloat16);
19 | cutlass_gemm_blockwise_sm90_fp8_dispatch<cutlass::half_t>(
20 | out, a, b, a_scales, b_scales);
21 | }
22 | }
23 |
24 | } // namespace vllm
--------------------------------------------------------------------------------
/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu:
--------------------------------------------------------------------------------
1 | #include "c3x/scaled_mm_helper.hpp"
2 | #include "c3x/scaled_mm_kernels.hpp"
3 |
4 | /*
5 | This file defines quantized GEMM operations using the CUTLASS 3.x API, for
6 | NVIDIA GPUs with sm100 (Blackwell).
7 | */
8 |
9 | #if defined ENABLE_SCALED_MM_SM100 && ENABLE_SCALED_MM_SM100
10 |
11 | void cutlass_scaled_mm_sm100(torch::Tensor& c, torch::Tensor const& a,
12 | torch::Tensor const& b,
13 | torch::Tensor const& a_scales,
14 | torch::Tensor const& b_scales,
15 | std::optional<torch::Tensor> const& bias) {
16 | dispatch_scaled_mm(c, a, b, a_scales, b_scales, bias,
17 | vllm::cutlass_scaled_mm_sm100_fp8,
18 | nullptr, // int8 not supported on SM100
19 | vllm::cutlass_scaled_mm_blockwise_sm100_fp8);
20 | }
21 |
22 | #endif
23 |
--------------------------------------------------------------------------------
/csrc/quantization/gptq/qdq_8.cuh:
--------------------------------------------------------------------------------
1 | /*
2 | Copied from https://github.com/turboderp/exllamav2
3 | */
4 |
5 | #ifndef _qdq_8_cuh
6 | #define _qdq_8_cuh
7 |
8 | #include "qdq_util.cuh"
9 |
10 | namespace vllm {
11 | namespace gptq {
12 |
13 | __forceinline__ __device__ void shuffle_8bit_4(uint32_t* q, int stride) {}
14 |
15 | __forceinline__ __device__ void dequant_8bit_8(const uint32_t q_0,
16 | const uint32_t q_1,
17 | half2 (&dq)[4], int stride,
18 | const uint32_t zero) {
19 | half dqh[8];
20 | for (int i = 0; i < 4; i++) dqh[i] = dq_ns(exb(q_0, i * 8, 0xff), zero);
21 | for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), zero);
22 |
23 | for (int i = 0; i < 4; i++)
24 | dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]);
25 | }
26 |
27 | } // namespace gptq
28 | } // namespace vllm
29 |
30 | #endif
31 |
--------------------------------------------------------------------------------
/csrc/quantization/gptq_marlin/.gitignore:
--------------------------------------------------------------------------------
1 | kernel_*.cu
--------------------------------------------------------------------------------
/csrc/rocm/custom.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 |
5 | // declare templates for front (cpp) and back (cuda) sides of function:
6 | // template
7 |
8 | void LLGemm_Silu(void* in_a, void* in_b, void* out_c, const int M, const int K,
9 | cudaStream_t stream, const int rows_per_block);
10 | void LLMM_Silu(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
11 | const int64_t rows_per_block) {
12 | auto M = in_a.size(0);
13 | auto K = in_a.size(1);
14 | LLGemm_Silu(in_a.data_ptr(), in_b.data_ptr(), out_c.data_ptr(), M, K,
15 | at::cuda::getCurrentCUDAStream(), rows_per_block);
16 | }
17 |
--------------------------------------------------------------------------------
/docker/Dockerfile.hpu:
--------------------------------------------------------------------------------
1 | FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
2 |
3 | COPY ./ /workspace/vllm
4 |
5 | WORKDIR /workspace/vllm
6 |
7 | RUN pip install -v -r requirements/hpu.txt
8 |
9 | ENV no_proxy=localhost,127.0.0.1
10 | ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
11 |
12 | RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
13 |
14 | # install development dependencies (for testing)
15 | RUN python3 -m pip install -e tests/vllm_test_utils
16 |
17 | WORKDIR /workspace/
18 |
19 | RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
20 |
21 | ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
22 |
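23 | # Example build and run commands (flags are illustrative; adjust for your Gaudi setup):
24 | #   docker build -f docker/Dockerfile.hpu -t vllm-hpu .
25 | #   docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all vllm-hpu --model <model>
26 |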
--------------------------------------------------------------------------------
/docs/api/vllm/.meta.yml:
--------------------------------------------------------------------------------
1 | search:
2 | boost: 0.5
3 |
--------------------------------------------------------------------------------
/docs/assets/contributing/dockerfile-stages-dependency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/contributing/dockerfile-stages-dependency.png
--------------------------------------------------------------------------------
/docs/assets/deployment/anything-llm-chat-with-doc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/anything-llm-chat-with-doc.png
--------------------------------------------------------------------------------
/docs/assets/deployment/anything-llm-chat-without-doc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/anything-llm-chat-without-doc.png
--------------------------------------------------------------------------------
/docs/assets/deployment/anything-llm-provider.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/anything-llm-provider.png
--------------------------------------------------------------------------------
/docs/assets/deployment/anything-llm-upload-doc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/anything-llm-upload-doc.png
--------------------------------------------------------------------------------
/docs/assets/deployment/architecture_helm_deployment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/architecture_helm_deployment.png
--------------------------------------------------------------------------------
/docs/assets/deployment/chatbox-chat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/chatbox-chat.png
--------------------------------------------------------------------------------
/docs/assets/deployment/chatbox-settings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/chatbox-settings.png
--------------------------------------------------------------------------------
/docs/assets/deployment/dify-chat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/dify-chat.png
--------------------------------------------------------------------------------
/docs/assets/deployment/dify-create-chatbot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/dify-create-chatbot.png
--------------------------------------------------------------------------------
/docs/assets/deployment/dify-settings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/dify-settings.png
--------------------------------------------------------------------------------
/docs/assets/deployment/open_webui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/open_webui.png
--------------------------------------------------------------------------------
/docs/assets/deployment/streamlit-chat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/streamlit-chat.png
--------------------------------------------------------------------------------
/docs/assets/design/arch_overview/entrypoints.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/arch_overview/entrypoints.excalidraw.png
--------------------------------------------------------------------------------
/docs/assets/design/arch_overview/llm_engine.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/arch_overview/llm_engine.excalidraw.png
--------------------------------------------------------------------------------
/docs/assets/design/hierarchy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/hierarchy.png
--------------------------------------------------------------------------------
/docs/assets/design/v1/metrics/intervals-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/metrics/intervals-1.png
--------------------------------------------------------------------------------
/docs/assets/design/v1/metrics/intervals-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/metrics/intervals-2.png
--------------------------------------------------------------------------------
/docs/assets/design/v1/metrics/intervals-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/metrics/intervals-3.png
--------------------------------------------------------------------------------
/docs/assets/design/v1/prefix_caching/example-time-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/prefix_caching/example-time-1.png
--------------------------------------------------------------------------------
/docs/assets/design/v1/prefix_caching/example-time-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/prefix_caching/example-time-3.png
--------------------------------------------------------------------------------
/docs/assets/design/v1/prefix_caching/example-time-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/prefix_caching/example-time-4.png
--------------------------------------------------------------------------------
/docs/assets/design/v1/prefix_caching/example-time-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/prefix_caching/example-time-5.png
--------------------------------------------------------------------------------
/docs/assets/design/v1/prefix_caching/example-time-6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/prefix_caching/example-time-6.png
--------------------------------------------------------------------------------
/docs/assets/design/v1/prefix_caching/example-time-7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/prefix_caching/example-time-7.png
--------------------------------------------------------------------------------
/docs/assets/design/v1/prefix_caching/free.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/prefix_caching/free.png
--------------------------------------------------------------------------------
/docs/assets/design/v1/prefix_caching/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/prefix_caching/overview.png
--------------------------------------------------------------------------------
/docs/assets/features/disagg_prefill/abstraction.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/features/disagg_prefill/abstraction.jpg
--------------------------------------------------------------------------------
/docs/assets/features/disagg_prefill/overview.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/features/disagg_prefill/overview.jpg
--------------------------------------------------------------------------------
/docs/assets/kernel/k_vecs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/kernel/k_vecs.png
--------------------------------------------------------------------------------
/docs/assets/kernel/key.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/kernel/key.png
--------------------------------------------------------------------------------
/docs/assets/kernel/logits_vec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/kernel/logits_vec.png
--------------------------------------------------------------------------------
/docs/assets/kernel/q_vecs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/kernel/q_vecs.png
--------------------------------------------------------------------------------
/docs/assets/kernel/query.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/kernel/query.png
--------------------------------------------------------------------------------
/docs/assets/kernel/v_vec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/kernel/v_vec.png
--------------------------------------------------------------------------------
/docs/assets/kernel/value.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/kernel/value.png
--------------------------------------------------------------------------------
/docs/assets/logos/vllm-logo-only-light.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/logos/vllm-logo-only-light.ico
--------------------------------------------------------------------------------
/docs/assets/logos/vllm-logo-only-light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/logos/vllm-logo-only-light.png
--------------------------------------------------------------------------------
/docs/assets/logos/vllm-logo-text-dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/logos/vllm-logo-text-dark.png
--------------------------------------------------------------------------------
/docs/assets/logos/vllm-logo-text-light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/logos/vllm-logo-text-light.png
--------------------------------------------------------------------------------
/docs/community/sponsors.md:
--------------------------------------------------------------------------------
1 | # Sponsors
2 |
3 | vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
4 |
5 |
6 |
7 |
8 | Cash Donations:
9 |
10 | - a16z
11 | - Dropbox
12 | - Sequoia Capital
13 | - Skywork AI
14 | - ZhenFund
15 |
16 | Compute Resources:
17 |
18 | - AMD
19 | - Anyscale
20 | - AWS
21 | - Crusoe Cloud
22 | - Databricks
23 | - DeepInfra
24 | - Google Cloud
25 | - Intel
26 | - Lambda Lab
27 | - Nebius
28 | - Novita AI
29 | - NVIDIA
30 | - Replicate
31 | - Roblox
32 | - RunPod
33 | - Trainy
34 | - UC Berkeley
35 | - UC San Diego
36 |
37 | Slack Sponsor: Anyscale
38 |
39 | We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
40 |
--------------------------------------------------------------------------------
/docs/configuration/README.md:
--------------------------------------------------------------------------------
1 | # Configuration Options
2 |
3 | This section lists the most common options for running vLLM.
4 |
5 | There are three main levels of configuration, from highest priority to lowest priority:
6 |
7 | - [Request parameters][completions-api] and [input arguments][sampling-params]
8 | - [Engine arguments](./engine_args.md)
9 | - [Environment variables](./env_vars.md)
10 |
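11 | For example, a single deployment might combine all three levels (the model name and values below are illustrative):
12 |
13 | ```console
14 | # Environment variable (lowest priority)
15 | export VLLM_LOGGING_LEVEL=DEBUG
16 |
17 | # Engine arguments, fixed when the server starts
18 | vllm serve meta-llama/Meta-Llama-3-8B-Instruct --max-model-len 8192
19 |
20 | # Request parameters, chosen per request (highest priority)
21 | curl http://localhost:8000/v1/completions \
22 | -H "Content-Type: application/json" \
23 | -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "prompt": "Hello", "max_tokens": 32}'
24 | ```
25 |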
--------------------------------------------------------------------------------
/docs/configuration/env_vars.md:
--------------------------------------------------------------------------------
1 | # Environment Variables
2 |
3 | vLLM uses the following environment variables to configure the system:
4 |
5 | !!! warning
6 | Please note that `VLLM_PORT` and `VLLM_HOST_IP` set the port and IP for vLLM's **internal usage**. They are not the port and IP of the API server. If you use `--host $VLLM_HOST_IP` and `--port $VLLM_PORT` to start the API server, it will not work.
7 |
8 | All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables).
9 |
10 | ```python
11 | --8<-- "vllm/envs.py:env-vars-definition"
12 | ```
13 |
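14 | For example, to raise vLLM's log verbosity before starting a server (the model name is illustrative):
15 |
16 | ```console
17 | export VLLM_LOGGING_LEVEL=DEBUG
18 | vllm serve meta-llama/Meta-Llama-3-8B-Instruct
19 | ```
20 |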
--------------------------------------------------------------------------------
/docs/contributing/model/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Adding a New Model
3 | ---
4 | [](){ #new-model }
5 |
6 | This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM.
7 |
8 | Contents:
9 |
10 | - [Basic](basic.md)
11 | - [Registration](registration.md)
12 | - [Tests](tests.md)
13 | - [Multimodal](multimodal.md)
14 |
15 | !!! note
16 | The complexity of adding a new model depends heavily on the model's architecture.
17 | The process is considerably more straightforward if the model shares a similar architecture with an existing model in vLLM.
18 | However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
19 |
20 | !!! tip
21 | If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues)
22 | or ask on our [developer slack](https://slack.vllm.ai).
23 | We will be happy to help you out!
24 |
--------------------------------------------------------------------------------
/docs/deployment/frameworks/bentoml.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: BentoML
3 | ---
4 | [](){ #deployment-bentoml }
5 |
6 | [BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes.
7 |
8 | For details, see the tutorial [vLLM inference in the BentoML documentation](https://docs.bentoml.com/en/latest/use-cases/large-language-models/vllm.html).
9 |
--------------------------------------------------------------------------------
/docs/deployment/frameworks/lobe-chat.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Lobe Chat
3 | ---
4 | [](){ #deployment-lobe-chat }
5 |
6 | [Lobe Chat](https://github.com/lobehub/lobe-chat) is an open-source, modern-design ChatGPT/LLM UI and framework.
7 |
8 | It supports speech synthesis, multi-modal input, and an extensible (function call) plugin system.
9 |
10 | It offers one-click FREE deployment of your private OpenAI ChatGPT/Claude/Gemini/Groq/Ollama chat application.
11 |
12 | It supports vLLM as an AI model provider to efficiently serve large language models.
13 |
14 | For details, see the tutorial [Using vLLM in LobeChat](https://lobehub.com/docs/usage/providers/vllm).
15 |
--------------------------------------------------------------------------------
/docs/deployment/frameworks/modal.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Modal
3 | ---
4 | [](){ #deployment-modal }
5 |
6 | vLLM can be run on cloud GPUs with [Modal](https://modal.com), a serverless computing platform designed for fast auto-scaling.
7 |
8 | For details on how to deploy vLLM on Modal, see [this tutorial in the Modal documentation](https://modal.com/docs/examples/vllm_inference).
9 |
--------------------------------------------------------------------------------
/docs/deployment/frameworks/open-webui.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Open WebUI
3 | ---
4 | [](){ #deployment-open-webui }
5 |
6 | 1. Install [Docker](https://docs.docker.com/engine/install/).
7 |
8 | 2. Start the vLLM server with the supported chat completion model, e.g.
9 |
10 | ```console
11 | vllm serve qwen/Qwen1.5-0.5B-Chat
12 | ```
13 |
14 | 1. Start the [Open WebUI](https://github.com/open-webui/open-webui) docker container (replace the vllm serve host and vllm serve port):
15 |
16 | ```console
17 | docker run -d -p 3000:8080 \
18 | --name open-webui \
19 | -v open-webui:/app/backend/data \
20 | -e OPENAI_API_BASE_URL=http://<vllm-serve-host>:<vllm-serve-port>/v1 \
21 | --restart always \
22 | ghcr.io/open-webui/open-webui:main
23 | ```
24 |
25 | 1. Open it in the browser:
26 |
27 | At the top of the web page, you can see the model `qwen/Qwen1.5-0.5B-Chat`.
28 |
29 | 
30 |
--------------------------------------------------------------------------------
/docs/deployment/frameworks/triton.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: NVIDIA Triton
3 | ---
4 | [](){ #deployment-triton }
5 |
6 | The [Triton Inference Server](https://github.com/triton-inference-server) hosts a tutorial demonstrating how to quickly deploy a simple [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) model using vLLM. Please see [Deploying a vLLM model in Triton](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton) for more details.
7 |
--------------------------------------------------------------------------------
/docs/deployment/integrations/kserve.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: KServe
3 | ---
4 | [](){ #deployment-kserve }
5 |
6 | vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.
7 |
8 | Please see [this guide](https://kserve.github.io/website/latest/modelserving/v1beta1/llm/huggingface/) for more details on using vLLM with KServe.
9 |
--------------------------------------------------------------------------------
/docs/deployment/integrations/kubeai.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: KubeAI
3 | ---
4 | [](){ #deployment-kubeai }
5 |
6 | [KubeAI](https://github.com/substratusai/kubeai) is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load-based autoscaling, model caching, and much more is provided out of the box with zero external dependencies.
7 |
8 | Please see the Installation Guides for environment specific instructions:
9 |
10 | - [Any Kubernetes Cluster](https://www.kubeai.org/installation/any/)
11 | - [EKS](https://www.kubeai.org/installation/eks/)
12 | - [GKE](https://www.kubeai.org/installation/gke/)
13 |
14 | Once you have KubeAI installed, you can
15 | [configure text generation models](https://www.kubeai.org/how-to/configure-text-generation-models/)
16 | using vLLM.
17 |
--------------------------------------------------------------------------------
/docs/deployment/integrations/llmaz.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: llmaz
3 | ---
4 | [](){ #deployment-llmaz }
5 |
6 | [llmaz](https://github.com/InftyAI/llmaz) is an easy-to-use and advanced inference platform for large language models on Kubernetes, aimed at production use. It uses vLLM as the default model serving backend.
7 |
8 | Please refer to the [Quick Start](https://github.com/InftyAI/llmaz?tab=readme-ov-file#quick-start) for more details.
9 |
--------------------------------------------------------------------------------
/docs/features/quantization/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Quantization
3 | ---
4 | [](){ #quantization-index }
5 |
6 | Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices.
7 |
8 | Contents:
9 |
10 | - [Supported_Hardware](supported_hardware.md)
11 | - [Auto_Awq](auto_awq.md)
12 | - [Bnb](bnb.md)
13 | - [Bitblas](bitblas.md)
14 | - [Gguf](gguf.md)
15 | - [Gptqmodel](gptqmodel.md)
16 | - [Int4](int4.md)
17 | - [Int8](int8.md)
18 | - [Fp8](fp8.md)
19 | - [Modelopt](modelopt.md)
20 | - [Quark](quark.md)
21 | - [Quantized_Kvcache](quantized_kvcache.md)
22 | - [Torchao](torchao.md)
23 |
--------------------------------------------------------------------------------
/docs/getting_started/installation/.nav.yml:
--------------------------------------------------------------------------------
1 | nav:
2 | - README.md
3 | - gpu.md
4 | - cpu.md
5 | - ai_accelerator.md
--------------------------------------------------------------------------------
/docs/getting_started/installation/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Installation
3 | ---
4 | [](){ #installation-index }
5 |
6 | vLLM supports the following hardware platforms:
7 |
8 | - [GPU](gpu.md)
9 | - [NVIDIA CUDA](gpu.md#nvidia-cuda)
10 | - [AMD ROCm](gpu.md#amd-rocm)
11 | - [Intel XPU](gpu.md#intel-xpu)
12 | - [CPU](cpu.md)
13 | - [Intel/AMD x86](cpu.md#intelamd-x86)
14 | - [ARM AArch64](cpu.md#arm-aarch64)
15 | - [Apple silicon](cpu.md#apple-silicon)
16 | - [IBM Z (S390X)](cpu.md#ibm-z-s390x)
17 | - [Other AI accelerators](ai_accelerator.md)
18 | - [Google TPU](ai_accelerator.md#google-tpu)
19 | - [Intel Gaudi](ai_accelerator.md#intel-gaudi)
20 | - [AWS Neuron](ai_accelerator.md#aws-neuron)
21 |
--------------------------------------------------------------------------------
/docs/getting_started/installation/device.template.md:
--------------------------------------------------------------------------------
1 | # Installation
2 |
3 | ## Requirements
4 |
5 | ## Set up using Python
6 |
7 | ### Pre-built wheels
8 |
9 | ### Build wheel from source
10 |
11 | ## Set up using Docker
12 |
13 | ### Pre-built images
14 |
15 | ### Build image from source
16 |
17 | ## Extra information
18 |
--------------------------------------------------------------------------------
/docs/getting_started/installation/python_env_setup.inc.md:
--------------------------------------------------------------------------------
1 | It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands:
2 |
3 | ```console
4 | uv venv --python 3.12 --seed
5 | source .venv/bin/activate
6 | ```
7 |
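8 | With the environment activated, vLLM can then be installed into it. A plain install from PyPI looks like the following (platform-specific instructions may differ):
9 |
10 | ```console
11 | uv pip install vllm
12 | ```
13 |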
--------------------------------------------------------------------------------
/docs/mkdocs/hooks/remove_announcement.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import os
4 | from typing import Literal
5 |
6 |
7 | def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
8 | # see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa
9 | if os.getenv('READTHEDOCS_VERSION_TYPE') == "tag":
10 | # remove the warning banner if the version is a tagged release
11 | docs_dir = os.path.dirname(__file__)
12 | announcement_path = os.path.join(docs_dir,
13 | "mkdocs/overrides/main.html")
14 | # The file might be removed already if the build is triggered multiple
15 | # times (readthedocs build both HTML and PDF versions separately)
16 | if os.path.exists(announcement_path):
17 | os.remove(announcement_path)
18 |
--------------------------------------------------------------------------------
/docs/mkdocs/javascript/run_llm_widget.js:
--------------------------------------------------------------------------------
1 | // Add RunLLM widget
2 | document.addEventListener("DOMContentLoaded", function () {
3 | var script = document.createElement("script");
4 | script.type = "module";
5 | script.id = "runllm-widget-script";
6 |
7 | script.src = "https://widget.runllm.com";
8 |
9 | script.setAttribute("version", "stable");
10 | script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget.
11 | script.setAttribute("runllm-name", "vLLM");
12 | script.setAttribute("runllm-position", "BOTTOM_RIGHT");
13 | script.setAttribute("runllm-position-y", "120px");
14 | script.setAttribute("runllm-position-x", "20px");
15 | script.setAttribute("runllm-assistant-id", "207");
16 |
17 | script.async = true;
18 | document.head.appendChild(script);
19 | });
20 |
--------------------------------------------------------------------------------
/docs/mkdocs/overrides/main.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 |
3 | {% block announce %}
4 | You are viewing the latest developer preview docs. Click here to view docs for the latest stable release.
5 | {% endblock %}
6 |
--------------------------------------------------------------------------------
/docs/models/extensions/fastsafetensor.md:
--------------------------------------------------------------------------------
1 | Loading Model weights with fastsafetensors
2 | ===================================================================
3 |
4 | Using the fastsafetensors library enables loading model weights to GPU memory by leveraging GPU Direct Storage. See [their GitHub repository](https://github.com/foundation-model-stack/fastsafetensors) for more details.
5 | To enable this feature, set the environment variable ``USE_FASTSAFETENSOR`` to ``true``.
6 |
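7 | For example (the model name is illustrative):
8 |
9 | ```console
10 | export USE_FASTSAFETENSOR=true
11 | vllm serve meta-llama/Meta-Llama-3-8B-Instruct
12 | ```
13 |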
--------------------------------------------------------------------------------
/docs/serving/integrations/langchain.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: LangChain
3 | ---
4 | [](){ #serving-langchain }
5 |
6 | vLLM is also available via [LangChain](https://github.com/langchain-ai/langchain).
7 |
8 | To install LangChain, run
9 |
10 | ```console
11 | pip install langchain langchain_community -q
12 | ```
13 |
14 | To run inference on a single or multiple GPUs, use the `VLLM` class from `langchain`.
15 |
16 | ```python
17 | from langchain_community.llms import VLLM
18 |
19 | llm = VLLM(model="mosaicml/mpt-7b",
20 | trust_remote_code=True, # mandatory for hf models
21 | max_new_tokens=128,
22 | top_k=10,
23 | top_p=0.95,
24 | temperature=0.8,
25 | # tensor_parallel_size=... # for distributed inference
26 | )
27 |
28 | print(llm("What is the capital of France?"))
29 | ```
30 |
31 | Please refer to this [Tutorial](https://python.langchain.com/docs/integrations/llms/vllm) for more details.
32 |
--------------------------------------------------------------------------------
/docs/serving/integrations/llamaindex.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: LlamaIndex
3 | ---
4 | [](){ #serving-llamaindex }
5 |
6 | vLLM is also available via [LlamaIndex](https://github.com/run-llama/llama_index).
7 |
8 | To install LlamaIndex, run
9 |
10 | ```console
11 | pip install llama-index-llms-vllm -q
12 | ```
13 |
14 | To run inference on a single GPU or on multiple GPUs, use the `Vllm` class from `llama_index`.
15 |
16 | ```python
17 | from llama_index.llms.vllm import Vllm
18 |
19 | llm = Vllm(
20 | model="microsoft/Orca-2-7b",
21 | tensor_parallel_size=4,
22 | max_new_tokens=100,
23 | vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5},
24 | )
25 | ```
26 |
27 | Please refer to this [Tutorial](https://docs.llamaindex.ai/en/latest/examples/llm/vllm/) for more details.
28 |
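29 | Once constructed, the wrapper follows the standard LlamaIndex LLM interface, so a single completion can be requested with `complete` (a minimal sketch; the prompt is only illustrative):
30 | 
31 | ```python
32 | # Request one completion from the vLLM-backed LLM and print its text.
33 | response = llm.complete("What is the capital of France?")
34 | print(response.text)
35 | ```
36 | 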
--------------------------------------------------------------------------------
/docs/training/rlhf.md:
--------------------------------------------------------------------------------
1 | # Reinforcement Learning from Human Feedback
2 |
3 | Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviors.
4 |
5 | vLLM can be used to generate completions for RLHF. The best way to do this is with libraries such as [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF), and [verl](https://github.com/volcengine/verl).
6 |
7 | See the following basic examples to get started if you don't want to use an existing library:
8 |
9 | - [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](../examples/offline_inference/rlhf.md)
10 | - [Training and inference processes are colocated on the same GPUs using Ray](../examples/offline_inference/rlhf_colocate.md)
11 | - [Utilities for performing RLHF with vLLM](../examples/offline_inference/rlhf_utils.md)
12 |
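13 | Whichever setup you choose, the inference side reduces to ordinary vLLM generation. The sketch below illustrates only that rollout step, not a full RLHF loop; the model name, prompt, and sampling settings are placeholders:
14 | 
15 | ```python
16 | from vllm import LLM, SamplingParams
17 | 
18 | # Policy model used to generate rollouts (placeholder checkpoint).
19 | llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
20 | params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=128)
21 | 
22 | prompts = ["Explain RLHF in one sentence."]
23 | outputs = llm.generate(prompts, params)
24 | 
25 | # Each completion would then be scored by a reward model and fed into the policy update.
26 | completions = [out.outputs[0].text for out in outputs]
27 | print(completions)
28 | ```
29 | 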
--------------------------------------------------------------------------------
/docs/usage/README.md:
--------------------------------------------------------------------------------
1 | # Using vLLM
2 |
3 | vLLM supports the following usage patterns:
4 |
5 | - [Inference and Serving](../serving/offline_inference.md): Run a single instance of a model.
6 | - [Deployment](../deployment/docker.md): Scale up model instances for production.
7 | - [Training](../training/rlhf.md): Train or fine-tune a model.
8 |
--------------------------------------------------------------------------------
/examples/offline_inference/disaggregated-prefill-v1/README.md:
--------------------------------------------------------------------------------
1 | # Disaggregated Prefill V1
2 |
3 | This example contains scripts that demonstrate disaggregated prefill in the offline setting of vLLM.
4 |
5 | ## Files
6 |
7 | - `run.sh` - A helper script that will run `prefill_example.py` and `decode_example.py` sequentially.
8 | - Make sure you are in the `examples/offline_inference/disaggregated-prefill-v1` directory before running `run.sh`.
9 | - `prefill_example.py` - A script which performs prefill only, saving the KV state to the `local_storage` directory and the prompts to `output.txt`.
10 | - `decode_example.py` - A script which performs decode only, loading the KV state from the `local_storage` directory and the prompts from `output.txt`.
11 |
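12 | For example, to run the demo end to end (prefill, then decode):
13 | 
14 | ```console
15 | cd examples/offline_inference/disaggregated-prefill-v1
16 | bash run.sh
17 | ```
18 | 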
--------------------------------------------------------------------------------
/examples/offline_inference/disaggregated-prefill-v1/run.sh:
--------------------------------------------------------------------------------
1 | rm -rf local_storage/
2 |
3 | if [ -f "output.txt" ]; then
4 | rm output.txt
5 | fi
6 |
7 | # The directory of current script
8 | SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
9 |
10 | VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/prefill_example.py"
11 | VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/decode_example.py"
12 |
--------------------------------------------------------------------------------
/examples/offline_inference/openai_batch/openai_example_batch.jsonl:
--------------------------------------------------------------------------------
1 | {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
2 | {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
3 |
--------------------------------------------------------------------------------
/examples/online_serving/chart-helm/.helmignore:
--------------------------------------------------------------------------------
1 | *.png
2 | .git/
3 | ct.yaml
4 | lintconf.yaml
5 | values.schema.json
6 | /workflows
--------------------------------------------------------------------------------
/examples/online_serving/chart-helm/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: chart-vllm
3 | description: Chart vllm
4 |
5 | # A chart can be either an 'application' or a 'library' chart.
6 | #
7 | # Application charts are a collection of templates that can be packaged into versioned archives
8 | # to be deployed.
9 | #
10 | # Library charts provide useful utilities or functions for the chart developer. They're included as
11 | # a dependency of application charts to inject those utilities and functions into the rendering
12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed.
13 | type: application
14 |
15 | # This is the chart version. This version number should be incremented each time you make changes
16 | # to the chart and its templates, including the app version.
17 | # Versions are expected to follow Semantic Versioning (https://semver.org/)
18 | version: 0.0.1
19 |
20 | maintainers:
21 | - name: mfournioux
22 |
--------------------------------------------------------------------------------
/examples/online_serving/chart-helm/ct.yaml:
--------------------------------------------------------------------------------
1 | chart-dirs:
2 | - charts
3 | validate-maintainers: false
--------------------------------------------------------------------------------
/examples/online_serving/chart-helm/templates/configmap.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.configs -}}
2 | apiVersion: v1
3 | kind: ConfigMap
4 | metadata:
5 | name: "{{ .Release.Name }}-configs"
6 | namespace: {{ .Release.Namespace }}
7 | data:
8 | {{- with .Values.configs }}
9 | {{- toYaml . | nindent 2 }}
10 | {{- end }}
11 | {{- end -}}
--------------------------------------------------------------------------------
/examples/online_serving/chart-helm/templates/custom-objects.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.customObjects }}
2 | {{- range .Values.customObjects }}
3 | {{- tpl (. | toYaml) $ }}
4 | ---
5 | {{- end }}
6 | {{- end }}
--------------------------------------------------------------------------------
/examples/online_serving/chart-helm/templates/poddisruptionbudget.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: policy/v1
2 | kind: PodDisruptionBudget
3 | metadata:
4 | name: "{{ .Release.Name }}-pdb"
5 | namespace: {{ .Release.Namespace }}
6 | spec:
7 | maxUnavailable: {{ default 1 .Values.maxUnavailablePodDisruptionBudget }}
--------------------------------------------------------------------------------
/examples/online_serving/chart-helm/templates/pvc.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.extraInit }}
2 | apiVersion: v1
3 | kind: PersistentVolumeClaim
4 | metadata:
5 | name: "{{ .Release.Name }}-storage-claim"
6 | namespace: {{ .Release.Namespace }}
7 | spec:
8 | accessModes:
9 | - ReadWriteOnce
10 | resources:
11 | requests:
12 | storage: {{ .Values.extraInit.pvcStorage }}
13 | {{- end }}
--------------------------------------------------------------------------------
/examples/online_serving/chart-helm/templates/secrets.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Secret
3 | metadata:
4 | name: "{{ .Release.Name }}-secrets"
5 | namespace: {{ .Release.Namespace }}
6 | type: Opaque
7 | data:
8 | {{- range $key, $val := .Values.secrets }}
9 | {{ $key }}: {{ $val | b64enc | quote }}
10 | {{- end }}
--------------------------------------------------------------------------------
/examples/online_serving/chart-helm/templates/service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: "{{ .Release.Name }}-service"
5 | namespace: {{ .Release.Namespace }}
6 | spec:
7 | type: ClusterIP
8 | ports:
9 | - name: {{ include "chart.service-port-name" . }}
10 | port: {{ include "chart.service-port" . }}
11 | targetPort: {{ include "chart.container-port-name" . }}
12 | protocol: TCP
13 | selector:
14 | {{- include "chart.labels" . | nindent 4 }}
--------------------------------------------------------------------------------
/examples/online_serving/disaggregated_serving/README.md:
--------------------------------------------------------------------------------
1 | # Disaggregated Serving
2 |
3 | This example contains scripts that demonstrate the disaggregated serving features of vLLM.
4 |
5 | ## Files
6 |
7 | - `disagg_proxy_demo.py` - Demonstrates XpYd (X prefill instances, Y decode instances).
8 | - `kv_events.sh` - Demonstrates KV cache event publishing.
9 |
--------------------------------------------------------------------------------
/examples/online_serving/prometheus_grafana/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | # docker-compose.yaml
2 | version: "3"
3 |
4 | services:
5 | prometheus:
6 | image: prom/prometheus:latest
7 | extra_hosts:
8 | - "host.docker.internal:host-gateway" # allow a direct connection from container to the local machine
9 | ports:
10 | - "9090:9090" # the default port used by Prometheus
11 | volumes:
12 | - ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml # mount Prometheus config file
13 |
14 | grafana:
15 | image: grafana/grafana:latest
16 | depends_on:
17 | - prometheus
18 | ports:
19 | - "3000:3000" # the default port used by Grafana
20 |
--------------------------------------------------------------------------------
/examples/online_serving/prometheus_grafana/prometheus.yaml:
--------------------------------------------------------------------------------
1 | # prometheus.yaml
2 | global:
3 | scrape_interval: 5s
4 | evaluation_interval: 30s
5 |
6 | scrape_configs:
7 | - job_name: vllm
8 | static_configs:
9 | - targets:
10 | - 'host.docker.internal:8000'
11 |
--------------------------------------------------------------------------------
/examples/online_serving/utils.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | from openai import APIConnectionError, OpenAI
4 | from openai.pagination import SyncPage
5 | from openai.types.model import Model
6 |
7 |
8 | def get_first_model(client: OpenAI) -> str:
9 | """
10 | Get the first model from the vLLM server.
11 | """
12 | try:
13 | models: SyncPage[Model] = client.models.list()
14 | except APIConnectionError as e:
15 | raise RuntimeError(
16 | "Failed to get the list of models from the vLLM server at "
17 | f"{client.base_url} with API key {client.api_key}. Check\n"
18 | "1. the server is running\n"
19 | "2. the server URL is correct\n"
20 | "3. the API key is correct"
21 | ) from e
22 |
23 | if len(models.data) == 0:
24 | raise RuntimeError(f"No models found on the vLLM server at {client.base_url}")
25 |
26 | return models.data[0].id
27 |
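28 | 
29 | if __name__ == "__main__":
30 |     # Minimal usage sketch: point the client at a locally running
31 |     # OpenAI-compatible vLLM server. The URL and the "EMPTY" placeholder
32 |     # API key are illustrative assumptions, not values this helper requires.
33 |     client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
34 |     print(get_first_model(client))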
--------------------------------------------------------------------------------
/examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml:
--------------------------------------------------------------------------------
1 | local_cpu: False
2 | max_local_cpu_size: 0
3 | #local_disk:
4 | max_local_disk_size: 0
5 | remote_serde: NULL
6 |
7 | enable_nixl: True
8 | nixl_role: "receiver"
9 | nixl_peer_host: "localhost"
10 | nixl_peer_port: 55555
11 | nixl_buffer_size: 1073741824 # 1GB
12 | nixl_buffer_device: "cuda"
13 | nixl_enable_gc: True
14 |
--------------------------------------------------------------------------------
/examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml:
--------------------------------------------------------------------------------
1 | local_cpu: False
2 | max_local_cpu_size: 0
3 | #local_disk:
4 | max_local_disk_size: 0
5 | remote_serde: NULL
6 |
7 | enable_nixl: True
8 | nixl_role: "sender"
9 | nixl_peer_host: "localhost"
10 | nixl_peer_port: 55555
11 | nixl_buffer_size: 1073741824 # 1GB
12 | nixl_buffer_device: "cuda"
13 | nixl_enable_gc: True
14 |
--------------------------------------------------------------------------------
/examples/template_alpaca.jinja:
--------------------------------------------------------------------------------
1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
2 |
3 | {% for message in messages %}
4 | {% if message['role'] == 'user' %}
5 | ### Instruction:
6 | {{ message['content']|trim -}}
7 | {% if not loop.last %}
8 |
9 |
10 | {% endif %}
11 | {% elif message['role'] == 'assistant' %}
12 | ### Response:
13 | {{ message['content']|trim -}}
14 | {% if not loop.last %}
15 |
16 |
17 | {% endif %}
18 | {% elif message['role'] == 'user_context' %}
19 | ### Input:
20 | {{ message['content']|trim -}}
21 | {% if not loop.last %}
22 |
23 |
24 | {% endif %}
25 | {% endif %}
26 | {% endfor %}
27 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
28 | ### Response:
29 | {% endif %}
--------------------------------------------------------------------------------
/examples/template_baichuan.jinja:
--------------------------------------------------------------------------------
1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
2 |
3 | {%- for message in messages -%}
4 | {%- if message['role'] == 'user' -%}
5 | {{- '' + message['content'] -}}
6 | {%- elif message['role'] == 'assistant' -%}
7 | {{- '' + message['content'] -}}
8 | {%- endif -%}
9 | {%- endfor -%}
10 |
11 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
12 | {{- '' -}}
13 | {% endif %}
--------------------------------------------------------------------------------
/examples/template_chatglm.jinja:
--------------------------------------------------------------------------------
1 | {%- set counter = namespace(index=0) -%}
2 | {%- for message in messages -%}
3 | {%- if message['role'] == 'user' -%}
4 | {{- '[Round ' + counter.index|string + ']\n问:' + message['content'] -}}
5 | {%- set counter.index = counter.index + 1 -%}
6 | {%- endif -%}
7 | {%- if message['role'] == 'assistant' -%}
8 | {{- '\n答:' + message['content'] -}}
9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%}
10 | {{- '\n' -}}
11 | {%- endif -%}
12 | {%- endif -%}
13 | {%- endfor -%}
14 |
15 |
16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
17 | {{- '\n答:' -}}
18 | {%- endif -%}
--------------------------------------------------------------------------------
/examples/template_chatglm2.jinja:
--------------------------------------------------------------------------------
1 | {%- set counter = namespace(index=1) -%}
2 | {%- for message in messages -%}
3 | {%- if message['role'] == 'user' -%}
4 | {{- '[Round ' + counter.index|string + ']\n\n问:' + message['content'] -}}
5 | {%- set counter.index = counter.index + 1 -%}
6 | {%- endif -%}
7 | {%- if message['role'] == 'assistant' -%}
8 | {{- '\n\n答:' + message['content'] -}}
9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%}
10 | {{- '\n\n' -}}
11 | {%- endif -%}
12 | {%- endif -%}
13 | {%- endfor -%}
14 |
15 |
16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
17 | {{- '\n\n答:' -}}
18 | {%- endif -%}
--------------------------------------------------------------------------------
/examples/template_chatml.jinja:
--------------------------------------------------------------------------------
1 | {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
2 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
--------------------------------------------------------------------------------
/examples/template_falcon.jinja:
--------------------------------------------------------------------------------
1 | {%- for message in messages -%}
2 | {%- if message['role'] == 'user' -%}
3 | {{- 'User: ' + message['content'] -}}
4 | {%- elif message['role'] == 'assistant' -%}
5 | {{- 'Assistant: ' + message['content'] -}}
6 | {%- endif -%}
7 | {%- if (loop.last and add_generation_prompt) or not loop.last -%}
8 | {{- '\n' -}}
9 | {%- endif -%}
10 | {%- endfor -%}
11 |
12 |
13 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
14 | {{- 'Assistant:' -}}
15 | {% endif %}
--------------------------------------------------------------------------------
/examples/template_falcon_180b.jinja:
--------------------------------------------------------------------------------
1 | {%- for message in messages -%}
2 | {%- if message['role'] == 'system' -%}
3 | {{- 'System: ' + message['content'] -}}
4 | {%- elif message['role'] == 'user' -%}
5 | {{- 'User: ' + message['content'] -}}
6 | {%- elif message['role'] == 'assistant' -%}
7 | {{- 'Falcon: ' + message['content'] -}}
8 | {%- endif -%}
9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%}
10 | {{- '\n' -}}
11 | {%- endif -%}
12 | {%- endfor -%}
13 |
14 |
15 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
16 | {{- 'Falcon:' -}}
17 | {% endif %}
--------------------------------------------------------------------------------
/examples/template_teleflm.jinja:
--------------------------------------------------------------------------------
1 | {%- for message in messages %}
2 | {%- if message['role'] == 'user' %}
3 | {{- '<_user>' + message['content']|trim }}
4 | {%- elif message['role'] == 'system' %}
5 | {{- '<_system>' + message['content']|trim }}
6 | {%- elif message['role'] == 'assistant' %}
7 | {{- '<_bot>' + message['content'] }}
8 | {%- endif %}
9 | {%- endfor %}
10 | {%- if add_generation_prompt %}
11 | {{- '<_bot>' }}
12 | {%- endif %}
13 |
--------------------------------------------------------------------------------
/examples/template_vlm2vec.jinja:
--------------------------------------------------------------------------------
1 | {%- if messages | length > 1 -%}
2 | {{ raise_exception('Embedding models should only embed one message at a time') }}
3 | {%- endif -%}
4 |
5 | {% set vars = namespace(parts=[], next_image_id=1) %}
6 | {%- for message in messages -%}
7 | {%- for content in message['content'] -%}
8 | {%- if content['type'] == 'text' -%}
9 | {%- set vars.parts = vars.parts + [content['text']] %}
10 | {%- elif content['type'] == 'image' -%}
11 | {%- set vars.parts = vars.parts + ['<|image_{i:d}|>'.format(i=vars.next_image_id)] %}
12 | {%- set vars.next_image_id = vars.next_image_id + 1 %}
13 | {%- endif -%}
14 | {%- endfor -%}
15 | {%- endfor -%}
16 | {{ vars.parts | join(' ') }}
17 |
--------------------------------------------------------------------------------
/format.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | echo "vLLM linting system has been moved from format.sh to pre-commit hooks."
4 | echo "Please run 'pip install -r requirements/lint.txt', followed by"
5 | echo "'pre-commit install' to install the pre-commit hooks."
6 | echo "Then linters will run automatically before each commit."
--------------------------------------------------------------------------------
/requirements/build.txt:
--------------------------------------------------------------------------------
1 | # Should be mirrored in pyproject.toml
2 | cmake>=3.26.1
3 | ninja
4 | packaging>=24.2
5 | setuptools>=77.0.3,<80.0.0
6 | setuptools-scm>=8
7 | torch==2.7.0
8 | wheel
9 | jinja2>=3.1.6
10 | regex
11 |
--------------------------------------------------------------------------------
/requirements/cuda.txt:
--------------------------------------------------------------------------------
1 | # Common dependencies
2 | -r common.txt
3 |
4 | numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
5 | numba == 0.61.2; python_version > '3.9'
6 |
7 | # Dependencies for NVIDIA GPUs
8 | ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1.
9 | torch==2.7.0
10 | torchaudio==2.7.0
11 | # These must be updated alongside torch
12 | torchvision==0.22.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
13 | # https://github.com/facebookresearch/xformers/releases/tag/v0.0.30
14 | xformers==0.0.30; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7
15 |
--------------------------------------------------------------------------------
/requirements/dev.txt:
--------------------------------------------------------------------------------
1 | -r lint.txt
2 | -r test.txt
3 |
4 | # Avoid adding requirements directly to this file.
5 | # Instead, modify the two files referenced above.
6 |
--------------------------------------------------------------------------------
/requirements/docs.txt:
--------------------------------------------------------------------------------
1 | mkdocs
2 | mkdocs-api-autonav
3 | mkdocs-material
4 | mkdocstrings-python
5 | mkdocs-gen-files
6 | mkdocs-awesome-nav
7 | python-markdown-math
8 | regex
9 | ruff
10 |
--------------------------------------------------------------------------------
/requirements/hpu.txt:
--------------------------------------------------------------------------------
1 | # Common dependencies
2 | -r common.txt
3 |
4 | # Dependencies for HPU code
5 | ray
6 | triton==3.1.0
7 | pandas
8 | numpy==1.26.4
9 | tabulate
10 | setuptools>=77.0.3,<80.0.0
11 | setuptools-scm>=8
12 | vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@f1f6624
13 |
--------------------------------------------------------------------------------
/requirements/lint.txt:
--------------------------------------------------------------------------------
1 | # formatting
2 | pre-commit==4.0.1
3 |
--------------------------------------------------------------------------------
/requirements/neuron.txt:
--------------------------------------------------------------------------------
1 | # Common dependencies
2 | -r common.txt
3 |
4 | # Dependencies for Neuron devices
5 | packaging>=24.2
6 | setuptools>=77.0.3,<80.0.0
7 | torch-neuronx >= 2.5.0
8 | neuronx-cc>=2.0.0a0
9 | torchvision # Required for Llama3.2 multimodal image preprocessing
10 |
--------------------------------------------------------------------------------
/requirements/rocm-build.txt:
--------------------------------------------------------------------------------
1 | # Common dependencies
2 | -r common.txt
3 |
4 | --extra-index-url https://download.pytorch.org/whl/rocm6.2.4
5 | torch==2.7.0
6 | torchvision==0.22.0
7 | torchaudio==2.7.0
8 |
9 | triton==3.2
10 | cmake>=3.26.1,<4
11 | packaging>=24.2
12 | setuptools>=77.0.3,<80.0.0
13 | setuptools-scm>=8
14 | wheel
15 | jinja2>=3.1.6
16 | amdsmi==6.2.4
17 |
--------------------------------------------------------------------------------
/requirements/rocm-test.txt:
--------------------------------------------------------------------------------
1 | # Common dependencies
2 | -r common.txt
3 |
4 | # entrypoints test
5 | # librosa==0.10.2.post1 # required by audio tests in entrypoints/openai
6 | audioread==3.0.1
7 | cffi==1.17.1
8 | decorator==5.2.1
9 | lazy-loader==0.4
10 | platformdirs==4.3.6
11 | pooch==1.8.2
12 | #pycparse==2.22
13 | soundfile==0.13.1
14 | soxr==0.5.0.post1
15 | librosa==0.10.2.post1
16 |
17 | # entrypoints test
18 | #vllm[video] # required by entrypoints/openai/test_video.py
19 | decord==0.6.0
20 |
21 | # entrypoints test
22 | #sentence-transformers # required by entrypoints/openai/test_score.py
23 | sentence-transformers==3.4.1
24 |
25 | # Basic Models Test
26 | matplotlib==3.10.3
27 |
28 | # Multi-Modal Models Test (Extended) 3
29 | blobfile==3.0.0
30 |
31 |
32 |
--------------------------------------------------------------------------------
/requirements/rocm.txt:
--------------------------------------------------------------------------------
1 | # Common dependencies
2 | -r common.txt
3 |
4 | numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
5 | numba == 0.61.2; python_version > '3.9'
6 |
7 | # Dependencies for AMD GPUs
8 | boto3
9 | botocore
10 | datasets
11 | ray>=2.10.0,<2.45.0
12 | peft
13 | pytest-asyncio
14 | tensorizer>=2.9.0
15 | setuptools-scm>=8
16 | setuptools>=77.0.3,<80.0.0
17 | runai-model-streamer==0.11.0
18 | runai-model-streamer-s3==0.11.0
19 |
--------------------------------------------------------------------------------
/requirements/xpu.txt:
--------------------------------------------------------------------------------
1 | # Common dependencies
2 | -r common.txt
3 |
4 | ray>=2.9
5 | cmake>=3.26.1
6 | packaging>=24.2
7 | setuptools-scm>=8
8 | setuptools>=77.0.3,<80.0.0
9 | wheel
10 | jinja2>=3.1.6
11 | datasets # for benchmark scripts
12 |
13 | torch==2.7.0+xpu
14 | torchaudio
15 | torchvision
16 | pytorch-triton-xpu
17 | --extra-index-url=https://download.pytorch.org/whl/xpu
18 |
19 | # Please refer to the XPU doc: we need to manually install intel-extension-for-pytorch 2.6.10+xpu because it has some conflicting dependencies with torch 2.6.0+xpu
20 | # FIXME: This will be fixed in ipex 2.7. Just leave this here for awareness.
21 | intel-extension-for-pytorch==2.7.10+xpu
22 | oneccl_bind_pt==2.7.0+xpu
23 | --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
24 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/__init__.py
--------------------------------------------------------------------------------
/tests/async_engine/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/async_engine/__init__.py
--------------------------------------------------------------------------------
/tests/async_engine/conftest.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import pytest
4 |
5 |
6 | @pytest.fixture(scope="function", autouse=True)
7 | def use_v0_only(monkeypatch):
8 | """
9 | Since this module is V0 only, set VLLM_USE_V1=0 for
10 | all tests in the module.
11 | """
12 | monkeypatch.setenv('VLLM_USE_V1', '0')
13 |
--------------------------------------------------------------------------------
/tests/basic_correctness/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/basic_correctness/__init__.py
--------------------------------------------------------------------------------
/tests/basic_correctness/test_cpu_offload.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from ..utils import compare_two_settings
5 |
6 |
7 | def test_cpu_offload():
8 | compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [],
9 | ["--cpu-offload-gb", "1"])
10 |
--------------------------------------------------------------------------------
/tests/benchmarks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/benchmarks/__init__.py
--------------------------------------------------------------------------------
/tests/benchmarks/test_latency_cli.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import subprocess
4 |
5 | import pytest
6 |
7 | MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
8 |
9 |
10 | @pytest.mark.benchmark
11 | def test_bench_latency():
12 | command = [
13 | "vllm", "bench", "latency", "--model", MODEL_NAME, "--input-len", "32",
14 | "--output-len", "1", "--enforce-eager", "--load-format", "dummy"
15 | ]
16 | result = subprocess.run(command, capture_output=True, text=True)
17 | print(result.stdout)
18 | print(result.stderr)
19 |
20 | assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
21 |
--------------------------------------------------------------------------------
/tests/benchmarks/test_throughput_cli.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import subprocess
4 |
5 | import pytest
6 |
7 | MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
8 |
9 |
10 | @pytest.mark.benchmark
11 | def test_bench_throughput():
12 | command = [
13 | "vllm", "bench", "throughput", "--model", MODEL_NAME, "--input-len",
14 | "32", "--output-len", "1", "--enforce-eager", "--load-format", "dummy"
15 | ]
16 | result = subprocess.run(command, capture_output=True, text=True)
17 | print(result.stdout)
18 | print(result.stderr)
19 |
20 | assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
21 |
--------------------------------------------------------------------------------
/tests/compile/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/compile/__init__.py
--------------------------------------------------------------------------------
/tests/compile/conftest.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import pytest
4 |
5 |
6 | # TEST V1: this should be removed. Right now V1 overrides
7 | # all the torch compile logic. We should re-enable this
8 | # as we add torch compile support back to V1.
9 | @pytest.fixture(scope="function", autouse=True)
10 | def use_v0_only(monkeypatch):
11 | """
12 | Since this module is V0 only, set VLLM_USE_V1=0 for
13 | all tests in the module.
14 | """
15 | monkeypatch.setenv('VLLM_USE_V1', '0')
16 |
--------------------------------------------------------------------------------
/tests/compile/piecewise/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/compile/piecewise/__init__.py
--------------------------------------------------------------------------------
/tests/config/test_config.yaml:
--------------------------------------------------------------------------------
1 | port: 12312
2 | served_model_name: mymodel
3 | tensor_parallel_size: 2
4 | trust_remote_code: true
5 | multi_step_stream_outputs: false
6 |
--------------------------------------------------------------------------------
/tests/config/test_config_with_model.yaml:
--------------------------------------------------------------------------------
1 | # Same as test_config.yaml but with model specified
2 | model: config-model
3 | port: 12312
4 | served_model_name: mymodel
5 | tensor_parallel_size: 2
6 | trust_remote_code: true
7 | multi_step_stream_outputs: false
8 |
--------------------------------------------------------------------------------
/tests/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/core/__init__.py
--------------------------------------------------------------------------------
/tests/core/block/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/core/block/__init__.py
--------------------------------------------------------------------------------
/tests/core/block/conftest.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | import pytest
5 |
6 |
7 | @pytest.fixture()
8 | def should_do_global_cleanup_after_test() -> bool:
9 | """Disable the global cleanup fixture for tests in this directory. This
10 | provides a ~10x speedup for unit tests that don't load a model to GPU.
11 |
12 | This requires that tests in this directory clean up after themselves if they
13 | use the GPU.
14 | """
15 | return False
16 |
--------------------------------------------------------------------------------
/tests/core/block/e2e/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/core/block/e2e/__init__.py
--------------------------------------------------------------------------------
/tests/core/conftest.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import pytest
4 |
5 |
6 | @pytest.fixture(scope="function", autouse=True)
7 | def use_v0_only(monkeypatch):
8 | """
9 | Since this module is V0 only, set VLLM_USE_V1=0 for
10 | all tests in the module.
11 | """
12 | monkeypatch.setenv('VLLM_USE_V1', '0')
13 |
--------------------------------------------------------------------------------
/tests/detokenizer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/detokenizer/__init__.py
--------------------------------------------------------------------------------
/tests/detokenizer/conftest.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import pytest
4 |
5 |
6 | @pytest.fixture(autouse=True)
7 | def v1(run_with_both_engines):
8 | # Simple autouse wrapper to run both engines for each test
9 | # This can be promoted up to conftest.py to run for every
10 | # test in a package
11 | pass
12 |
--------------------------------------------------------------------------------
/tests/distributed/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/distributed/__init__.py
--------------------------------------------------------------------------------
/tests/distributed/test_distributed_oot.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from ..entrypoints.openai.test_oot_registration import (
5 | run_and_test_dummy_opt_api_server)
6 |
7 |
8 | def test_distributed_oot(dummy_opt_path: str):
9 | run_and_test_dummy_opt_api_server(dummy_opt_path, tp=2)
10 |
--------------------------------------------------------------------------------
/tests/encoder_decoder/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/encoder_decoder/__init__.py
--------------------------------------------------------------------------------
/tests/engine/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/engine/__init__.py
--------------------------------------------------------------------------------
/tests/engine/conftest.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import pytest
4 |
5 |
6 | @pytest.fixture(scope="function", autouse=True)
7 | def use_v0_only(monkeypatch):
8 | """
9 | Since this module is V0 only, set VLLM_USE_V1=0 for
10 | all tests in the module.
11 | """
12 | monkeypatch.setenv('VLLM_USE_V1', '0')
13 |
--------------------------------------------------------------------------------
/tests/entrypoints/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/entrypoints/__init__.py
--------------------------------------------------------------------------------
/tests/entrypoints/llm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/entrypoints/llm/__init__.py
--------------------------------------------------------------------------------
/tests/entrypoints/llm/test_prompt_validation.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | import pytest
5 |
6 | from vllm import LLM
7 |
8 |
9 | @pytest.fixture(autouse=True)
10 | def v1(run_with_both_engines):
11 | # Simple autouse wrapper to run both engines for each test
12 | # This can be promoted up to conftest.py to run for every
13 | # test in a package
14 | pass
15 |
16 |
17 | def test_empty_prompt():
18 | llm = LLM(model="openai-community/gpt2", enforce_eager=True)
19 | with pytest.raises(ValueError, match='decoder prompt cannot be empty'):
20 | llm.generate([""])
21 |
22 |
23 | @pytest.mark.skip_v1
24 | def test_out_of_vocab_token():
25 | llm = LLM(model="openai-community/gpt2", enforce_eager=True)
26 | with pytest.raises(ValueError, match='out of vocabulary'):
27 | llm.generate({"prompt_token_ids": [999999]})
28 |
--------------------------------------------------------------------------------
/tests/entrypoints/offline_mode/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/entrypoints/offline_mode/__init__.py
--------------------------------------------------------------------------------
/tests/entrypoints/openai/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/entrypoints/openai/__init__.py
--------------------------------------------------------------------------------
/tests/entrypoints/openai/correctness/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/entrypoints/openai/correctness/__init__.py
--------------------------------------------------------------------------------
/tests/entrypoints/openai/tool_parsers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/entrypoints/openai/tool_parsers/__init__.py
--------------------------------------------------------------------------------
/tests/fastsafetensors_loader/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/fastsafetensors_loader/__init__.py
--------------------------------------------------------------------------------
/tests/fastsafetensors_loader/test_fastsafetensors_loader.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm import SamplingParams
5 | from vllm.config import LoadFormat
6 |
7 | test_model = "openai-community/gpt2"
8 |
9 | prompts = [
10 | "Hello, my name is",
11 | "The president of the United States is",
12 | "The capital of France is",
13 | "The future of AI is",
14 | ]
15 | # Create a sampling params object.
16 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0)
17 |
18 |
19 | def test_model_loader_download_files(vllm_runner):
20 | with vllm_runner(test_model,
21 | load_format=LoadFormat.FASTSAFETENSORS) as llm:
22 | deserialized_outputs = llm.generate(prompts, sampling_params)
23 | assert deserialized_outputs
24 |
--------------------------------------------------------------------------------
/tests/kernels/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/kernels/__init__.py
--------------------------------------------------------------------------------
/tests/kernels/allclose_default.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | import torch
5 |
6 | # Reference default values of atol and rtol are from
7 | # https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67
8 | default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5}
9 | default_rtol = {
10 | torch.float16: 1e-3,
11 | torch.bfloat16: 1.6e-2,
12 | torch.float: 1.3e-6
13 | }
14 |
15 |
16 | def get_default_atol(output) -> float:
17 | return default_atol[output.dtype]
18 |
19 |
20 | def get_default_rtol(output) -> float:
21 | return default_rtol[output.dtype]
22 |
--------------------------------------------------------------------------------
/tests/kernels/attention/conftest.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | import pytest
5 |
6 | from vllm.utils import (create_kv_caches_with_random,
7 | create_kv_caches_with_random_flash)
8 |
9 |
10 | @pytest.fixture()
11 | def kv_cache_factory():
12 | return create_kv_caches_with_random
13 |
14 |
15 | @pytest.fixture()
16 | def kv_cache_factory_flashinfer():
17 | return create_kv_caches_with_random_flash
18 |
--------------------------------------------------------------------------------
/tests/kernels/core/test_opcheck.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | """
4 | Tests for miscellaneous utilities
5 | """
6 |
7 | import torch
8 |
9 | from tests.kernels.utils import opcheck
10 |
11 |
12 | def test_convert_fp8_opcheck():
13 | data = torch.randn((256, 256), dtype=torch.float32, device="cuda")
14 | result = torch.empty_like(data, dtype=torch.float8_e4m3fn)
15 | opcheck(torch.ops._C_cache_ops.convert_fp8, (result, data, 1.0, "fp8"))
16 |
17 |
18 | # TODO: Add this back, currently fails with
19 | # csrc/cuda_utils_kernels.cu:15 'invalid argument'
20 | # @pytest.mark.skipif(not current_platform.is_cuda(),
21 | # reason="Only supported for CUDA")
22 | # def test_cuda_utils_opcheck():
23 | # opcheck(torch.ops._C_cuda_utils.get_device_attribute, (0, 0))
24 | # opcheck(
25 | # torch.ops._C_cuda_utils.
26 | # get_max_shared_memory_per_block_device_attribute, (0, ))
27 |
--------------------------------------------------------------------------------
/tests/kernels/core/test_permute_cols.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | import pytest
5 | import torch
6 |
7 | from tests.kernels.utils import opcheck
8 | from vllm._custom_ops import permute_cols
9 |
10 |
11 | @pytest.mark.parametrize('shape', [(1, 512), (544, 4096), (67, 8192)])
12 | @pytest.mark.parametrize('dtype', [torch.bfloat16, torch.float16])
13 | def test_permute_cols(shape, dtype):
14 | x = torch.randn(shape, dtype=dtype).cuda()
15 | perm = torch.randperm(x.shape[1]).to(torch.int).cuda()
16 | opcheck(torch.ops._C.permute_cols, (x, perm))
17 | y = permute_cols(x, perm)
18 | torch.testing.assert_close(y, x[:, perm])
--------------------------------------------------------------------------------
/tests/kernels/moe/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/kernels/moe/__init__.py
--------------------------------------------------------------------------------
/tests/kv_transfer/test_lookup_buffer.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | RANK=0 python3 test_lookup_buffer.py &
3 | PID0=$!
4 | RANK=1 python3 test_lookup_buffer.py &
5 | PID1=$!
6 |
7 | wait $PID0
8 | wait $PID1
9 |
--------------------------------------------------------------------------------
/tests/kv_transfer/test_send_recv.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | RANK=0 python3 test_send_recv.py &
4 | PID0=$!
5 | RANK=1 python3 test_send_recv.py &
6 | PID1=$!
7 |
8 | wait $PID0
9 | wait $PID1
10 |
--------------------------------------------------------------------------------
/tests/lora/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/lora/__init__.py
--------------------------------------------------------------------------------
/tests/metrics/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/metrics/__init__.py
--------------------------------------------------------------------------------
/tests/mistral_tool_use/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/mistral_tool_use/__init__.py
--------------------------------------------------------------------------------
/tests/model_executor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/model_executor/__init__.py
--------------------------------------------------------------------------------
/tests/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/__init__.py
--------------------------------------------------------------------------------
/tests/models/language/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/language/__init__.py
--------------------------------------------------------------------------------
/tests/models/language/generation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/language/generation/__init__.py
--------------------------------------------------------------------------------
/tests/models/language/pooling/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/language/pooling/__init__.py
--------------------------------------------------------------------------------
/tests/models/multimodal/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/multimodal/__init__.py
--------------------------------------------------------------------------------
/tests/models/multimodal/generation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/multimodal/generation/__init__.py
--------------------------------------------------------------------------------
/tests/models/multimodal/generation/vlm_utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/multimodal/generation/vlm_utils/__init__.py
--------------------------------------------------------------------------------
/tests/models/multimodal/pooling/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/multimodal/pooling/__init__.py
--------------------------------------------------------------------------------
/tests/models/multimodal/processing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/multimodal/processing/__init__.py
--------------------------------------------------------------------------------
/tests/models/quantization/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/quantization/__init__.py
--------------------------------------------------------------------------------
/tests/mq_llm_engine/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/mq_llm_engine/__init__.py
--------------------------------------------------------------------------------
/tests/mq_llm_engine/conftest.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import pytest
4 |
5 |
6 | @pytest.fixture(scope="function", autouse=True)
7 | def use_v0_only(monkeypatch):
8 | """
9 | Since this module is V0 only, set VLLM_USE_V1=0 for
10 | all tests in the module.
11 | """
12 | monkeypatch.setenv('VLLM_USE_V1', '0')
13 |
--------------------------------------------------------------------------------
/tests/multi_step/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/multi_step/__init__.py
--------------------------------------------------------------------------------
/tests/multimodal/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/multimodal/__init__.py
--------------------------------------------------------------------------------
/tests/multimodal/assets/image1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/multimodal/assets/image1.png
--------------------------------------------------------------------------------
/tests/multimodal/assets/image2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/multimodal/assets/image2.png
--------------------------------------------------------------------------------
/tests/multimodal/assets/rgba.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/multimodal/assets/rgba.png
--------------------------------------------------------------------------------
/tests/multimodal/utils.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | import numpy as np
5 | from PIL import Image
6 |
7 |
8 | def random_image(rng: np.random.RandomState, min_wh: int, max_wh: int):
9 | w, h = rng.randint(min_wh, max_wh, size=(2, ))
10 | arr = rng.randint(0, 255, size=(w, h, 3), dtype=np.uint8)
11 | return Image.fromarray(arr)
12 |
13 |
14 | def random_video(
15 | rng: np.random.RandomState,
16 | min_frames: int,
17 | max_frames: int,
18 | min_wh: int,
19 | max_wh: int,
20 | ):
21 | num_frames = rng.randint(min_frames, max_frames)
22 | w, h = rng.randint(min_wh, max_wh, size=(2, ))
23 | return rng.randint(0, 255, size=(num_frames, w, h, 3), dtype=np.uint8)
24 |
25 |
26 | def random_audio(
27 | rng: np.random.RandomState,
28 | min_len: int,
29 | max_len: int,
30 | sr: int,
31 | ):
32 | audio_len = rng.randint(min_len, max_len)
33 | return rng.rand(audio_len), sr
34 |
--------------------------------------------------------------------------------
/tests/neuron/1_core/test_neuron_quant.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | from vllm.model_executor.layers.quantization.neuron_quant import (
4 | NeuronQuantConfig)
5 |
6 |
7 | def test_get_supported_act_dtypes():
8 | neuron_quant_config = NeuronQuantConfig()
9 | supported_act_dtypes = neuron_quant_config.get_supported_act_dtypes()
10 | target_list = ["any_dtype1", "any_dtype2"]
11 | for dtype in target_list:
12 | assert dtype in supported_act_dtypes
13 |
--------------------------------------------------------------------------------
/tests/plugins/lora_resolvers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/plugins/lora_resolvers/__init__.py
--------------------------------------------------------------------------------
/tests/plugins/vllm_add_dummy_model/setup.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from setuptools import setup
5 |
6 | setup(name='vllm_add_dummy_model',
7 | version='0.1',
8 | packages=['vllm_add_dummy_model'],
9 | entry_points={
10 | 'vllm.general_plugins':
11 | ["register_dummy_model = vllm_add_dummy_model:register"]
12 | })
13 |
--------------------------------------------------------------------------------
/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm import ModelRegistry
5 |
6 |
7 | def register():
8 | # Test directly passing the model
9 | from .my_opt import MyOPTForCausalLM
10 |
11 | if "MyOPTForCausalLM" not in ModelRegistry.get_supported_archs():
12 | ModelRegistry.register_model("MyOPTForCausalLM", MyOPTForCausalLM)
13 |
14 | # Test passing lazy model
15 | if "MyGemma2Embedding" not in ModelRegistry.get_supported_archs():
16 | ModelRegistry.register_model(
17 | "MyGemma2Embedding",
18 | "vllm_add_dummy_model.my_gemma_embedding:MyGemma2Embedding",
19 | )
20 |
21 | if "MyLlava" not in ModelRegistry.get_supported_archs():
22 | ModelRegistry.register_model("MyLlava",
23 | "vllm_add_dummy_model.my_llava:MyLlava")
24 |
--------------------------------------------------------------------------------
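When the package above is installed (for example with pip install -e), vLLM is expected to discover the 'vllm.general_plugins' entry point and call register() itself; the sketch below simply calls it directly to show the effect on the model registry.

# Illustrative check of what the entry point does; assumes the dummy-model
# package has been installed so that `vllm_add_dummy_model` is importable.
from vllm import ModelRegistry
from vllm_add_dummy_model import register

register()  # normally invoked via the 'vllm.general_plugins' entry point
for arch in ("MyOPTForCausalLM", "MyGemma2Embedding", "MyLlava"):
    assert arch in ModelRegistry.get_supported_archs()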
/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from typing import Optional
5 |
6 | import torch
7 |
8 | from vllm.model_executor.models.opt import OPTForCausalLM
9 | from vllm.model_executor.sampling_metadata import SamplingMetadata
10 |
11 |
12 | class MyOPTForCausalLM(OPTForCausalLM):
13 |
14 | def compute_logits(
15 | self, hidden_states: torch.Tensor,
16 | sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
17 | # this dummy model always predicts the first token
18 | logits = super().compute_logits(hidden_states, sampling_metadata)
19 | if logits is not None:
20 | logits.zero_()
21 | logits[:, 0] += 1.0
22 | return logits
23 |
--------------------------------------------------------------------------------
/tests/plugins/vllm_add_dummy_platform/setup.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from setuptools import setup
5 |
6 | setup(
7 | name='vllm_add_dummy_platform',
8 | version='0.1',
9 | packages=['vllm_add_dummy_platform'],
10 | entry_points={
11 | 'vllm.platform_plugins': [
12 | "dummy_platform_plugin = vllm_add_dummy_platform:dummy_platform_plugin" # noqa
13 | ]
14 | })
15 |
--------------------------------------------------------------------------------
/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from typing import Optional
5 |
6 |
7 | def dummy_platform_plugin() -> Optional[str]:
8 | return "vllm_add_dummy_platform.dummy_platform.DummyPlatform"
9 |
--------------------------------------------------------------------------------
/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.attention.backends.flash_attn import FlashAttentionBackend
5 |
6 |
7 | class DummyAttentionBackend(FlashAttentionBackend):
8 |
9 | @staticmethod
10 | def get_name() -> str:
11 | return "Dummy_Backend"
12 |
--------------------------------------------------------------------------------
/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.platforms.cuda import CudaPlatform
5 |
6 |
7 | class DummyPlatform(CudaPlatform):
8 | device_name = "DummyDevice"
9 |
10 | def get_attn_backend_cls(self, backend_name, head_size, dtype,
11 | kv_cache_dtype, block_size, use_v1, use_mla):
12 | return "vllm_add_dummy_platform.dummy_attention_backend.DummyAttentionBackend" # noqa E501
13 |
--------------------------------------------------------------------------------
/tests/plugins_tests/conftest.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import pytest
4 |
5 |
6 | @pytest.fixture(scope="function", autouse=True)
7 | def use_v0_only(monkeypatch):
8 | """
9 | Since this module is V0 only, set VLLM_USE_V1=0 for
10 | all tests in the module.
11 | """
12 | monkeypatch.setenv('VLLM_USE_V1', '0')
--------------------------------------------------------------------------------
/tests/prefix_caching/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/prefix_caching/__init__.py
--------------------------------------------------------------------------------
/tests/prompts/example.txt:
--------------------------------------------------------------------------------
1 | vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.
2 | Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.
3 | Compare and contrast artificial intelligence with human intelligence in terms of processing information.
4 | Describe the basic components of a neural network and how it can be trained.
5 | Write a short story about a robot that dreams for the first time.
6 | Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.
7 | Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.
8 | Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.'
9 |
--------------------------------------------------------------------------------
/tests/quantization/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/quantization/__init__.py
--------------------------------------------------------------------------------
/tests/quantization/utils.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.model_executor.layers.quantization import get_quantization_config
5 | from vllm.platforms import current_platform
6 |
7 |
8 | def is_quant_method_supported(quant_method: str) -> bool:
9 | # Currently, all quantization methods require Nvidia or AMD GPUs
10 | if not (current_platform.is_cuda() or current_platform.is_rocm()):
11 | return False
12 |
13 | capability = current_platform.get_device_capability()
14 | assert capability is not None
15 |
16 | min_capability = get_quantization_config(quant_method).get_min_capability()
17 |
18 | return capability.to_int() >= min_capability
19 |
--------------------------------------------------------------------------------
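A typical way to use the helper above is as a pytest skip guard, sketched below; the test name and the tests.quantization.utils import path are assumptions.

# Illustrative only: skip a GPU-only quantization test when the current
# device cannot run the method under test.
import pytest

from tests.quantization.utils import is_quant_method_supported


@pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="FP8 is not supported on this device")
def test_fp8_model_loads():
    ...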
/tests/reasoning/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/reasoning/__init__.py
--------------------------------------------------------------------------------
/tests/runai_model_streamer_test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/runai_model_streamer_test/__init__.py
--------------------------------------------------------------------------------
/tests/samplers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/samplers/__init__.py
--------------------------------------------------------------------------------
/tests/spec_decode/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/spec_decode/__init__.py
--------------------------------------------------------------------------------
/tests/spec_decode/conftest.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import pytest
4 |
5 |
6 | @pytest.fixture(scope="function", autouse=True)
7 | def use_v0_only(monkeypatch):
8 | """
9 | Since this module is V0 only, set VLLM_USE_V1=0 for
10 | all tests in the module.
11 | """
12 | monkeypatch.setenv('VLLM_USE_V1', '0')
13 |
--------------------------------------------------------------------------------
/tests/spec_decode/e2e/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/spec_decode/e2e/__init__.py
--------------------------------------------------------------------------------
/tests/standalone_tests/python_only_compile.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # This script tests if the python only compilation works correctly
3 | # for users who do not have any compilers installed on their system
4 |
5 | set -e
6 | set -x
7 |
8 | cd /vllm-workspace/
9 |
10 | # uninstall vllm
11 | pip3 uninstall -y vllm
12 | # restore the original files
13 | mv test_docs/vllm ./vllm
14 |
15 | # remove all compilers
16 | apt remove --purge build-essential -y
17 | apt autoremove -y
18 |
19 | echo 'import os; os.system("touch /tmp/changed.file")' >> vllm/__init__.py
20 |
21 | VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL=1 VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e .
22 |
23 | # Import vllm so the code appended to vllm/__init__.py runs
24 | python3 -c 'import vllm'
25 |
26 | # Check that the marker file created by the appended code exists
27 | if [ ! -f /tmp/changed.file ]; then
28 | echo "changed.file was not created, python only compilation failed"
29 | exit 1
30 | fi
31 |
--------------------------------------------------------------------------------
/tests/tensorizer_loader/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/tensorizer_loader/__init__.py
--------------------------------------------------------------------------------
/tests/tensorizer_loader/conftest.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import pytest
4 |
5 | from vllm.distributed import cleanup_dist_env_and_memory
6 | from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
7 |
8 |
9 | @pytest.fixture(autouse=True)
10 | def cleanup():
11 | cleanup_dist_env_and_memory(shutdown_ray=True)
12 |
13 |
14 | @pytest.fixture(autouse=True)
15 | def tensorizer_config():
16 | config = TensorizerConfig(tensorizer_uri="vllm")
17 | return config
18 |
--------------------------------------------------------------------------------
/tests/test_embedded_commit.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | import vllm
5 |
6 |
7 | def test_embedded_commit_defined():
8 | assert hasattr(vllm, "__version__")
9 | assert hasattr(vllm, "__version_tuple__")
10 | assert vllm.__version__ != "dev"
11 | assert vllm.__version_tuple__ != (0, 0, "dev")
12 |
--------------------------------------------------------------------------------
/tests/test_outputs.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.outputs import RequestOutput
5 |
6 |
7 | def test_request_output_forward_compatible():
8 | output = RequestOutput(request_id="test_request_id",
9 | prompt="test prompt",
10 | prompt_token_ids=[1, 2, 3],
11 | prompt_logprobs=None,
12 | outputs=[],
13 | finished=False,
14 | example_arg_added_in_new_version="some_value")
15 | assert output is not None
16 |
--------------------------------------------------------------------------------
/tests/test_seed_behavior.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import random
4 |
5 | import numpy as np
6 | import torch
7 |
8 | from vllm.platforms.interface import Platform
9 |
10 |
11 | def test_seed_behavior():
12 | # Test with a specific seed
13 | Platform.seed_everything(42)
14 | random_value_1 = random.randint(0, 100)
15 | np_random_value_1 = np.random.randint(0, 100)
16 | torch_random_value_1 = torch.randint(0, 100, (1, )).item()
17 |
18 | Platform.seed_everything(42)
19 | random_value_2 = random.randint(0, 100)
20 | np_random_value_2 = np.random.randint(0, 100)
21 | torch_random_value_2 = torch.randint(0, 100, (1, )).item()
22 |
23 | assert random_value_1 == random_value_2
24 | assert np_random_value_1 == np_random_value_2
25 | assert torch_random_value_1 == torch_random_value_2
26 |
--------------------------------------------------------------------------------
/tests/tokenization/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/tokenization/__init__.py
--------------------------------------------------------------------------------
/tests/tokenization/test_tokenizer.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | import pytest
5 | from transformers import PreTrainedTokenizerBase
6 |
7 | from vllm.transformers_utils.tokenizer import get_tokenizer
8 |
9 | TOKENIZER_NAMES = [
10 | "facebook/opt-125m",
11 | "gpt2",
12 | ]
13 |
14 |
15 | @pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES)
16 | def test_tokenizer_revision(tokenizer_name: str):
17 | # Assume that "main" branch always exists
18 | tokenizer = get_tokenizer(tokenizer_name, revision="main")
19 | assert isinstance(tokenizer, PreTrainedTokenizerBase)
20 |
21 | # Assume that "never" branch always does not exist
22 | with pytest.raises(OSError, match='not a valid git identifier'):
23 | get_tokenizer(tokenizer_name, revision="never")
24 |
--------------------------------------------------------------------------------
/tests/tool_use/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/tool_use/__init__.py
--------------------------------------------------------------------------------
/tests/tpu/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/tpu/__init__.py
--------------------------------------------------------------------------------
/tests/tpu/lora/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/tpu/lora/__init__.py
--------------------------------------------------------------------------------
/tests/tracing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/tracing/__init__.py
--------------------------------------------------------------------------------
/tests/v1/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/__init__.py
--------------------------------------------------------------------------------
/tests/v1/e2e/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/e2e/__init__.py
--------------------------------------------------------------------------------
/tests/v1/engine/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/engine/__init__.py
--------------------------------------------------------------------------------
/tests/v1/entrypoints/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/entrypoints/__init__.py
--------------------------------------------------------------------------------
/tests/v1/entrypoints/llm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/entrypoints/llm/__init__.py
--------------------------------------------------------------------------------
/tests/v1/kv_connector/unit/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/kv_connector/unit/__init__.py
--------------------------------------------------------------------------------
/tests/v1/sample/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/sample/__init__.py
--------------------------------------------------------------------------------
/tests/v1/shutdown/utils.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | """Shutdown test utils"""
4 |
5 | SHUTDOWN_TEST_TIMEOUT_SEC = 120
6 | SHUTDOWN_TEST_THRESHOLD_BYTES = 2 * 2**30
7 |
--------------------------------------------------------------------------------
/tests/v1/structured_output/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/structured_output/__init__.py
--------------------------------------------------------------------------------
/tests/v1/tpu/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/tpu/__init__.py
--------------------------------------------------------------------------------
/tests/v1/tpu/worker/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/tpu/worker/__init__.py
--------------------------------------------------------------------------------
/tests/v1/worker/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/worker/__init__.py
--------------------------------------------------------------------------------
/tests/vllm_test_utils/setup.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from setuptools import setup
5 |
6 | setup(
7 | name='vllm_test_utils',
8 | version='0.1',
9 | packages=['vllm_test_utils'],
10 | )
11 |
--------------------------------------------------------------------------------
/tests/vllm_test_utils/vllm_test_utils/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | """
4 | vllm_test_utils is a package for vLLM testing utilities.
5 | It does not import any vLLM modules.
6 | """
7 |
8 | from .blame import BlameResult, blame
9 | from .monitor import MonitoredValues, monitor
10 |
11 | __all__ = ["blame", "BlameResult", "monitor", "MonitoredValues"]
12 |
--------------------------------------------------------------------------------
/tests/weight_loading/models-large.txt:
--------------------------------------------------------------------------------
1 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main
2 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main
3 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W8A16-quantized, main
4 | compressed-tensors, nm-testing/test-w4a16-mixtral-actorder-group, main
5 | gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, main
6 | gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, gptq-8bit-128g-actorder_True
7 | awq_marlin, casperhansen/deepseek-coder-v2-instruct-awq, main
8 | compressed-tensors, RedHatAI/Llama-4-Scout-17B-16E-Instruct-quantized.w4a16, main
--------------------------------------------------------------------------------
/tests/worker/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/worker/__init__.py
--------------------------------------------------------------------------------
/tests/worker/conftest.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import pytest
4 |
5 |
6 | @pytest.fixture(scope="function", autouse=True)
7 | def use_v0_only(monkeypatch):
8 | """
9 | This module tests V0 internals, so set VLLM_USE_V1=0.
10 | """
11 | monkeypatch.setenv('VLLM_USE_V1', '0')
--------------------------------------------------------------------------------
/tools/check_repo.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Checks whether the repo is clean and whether tags are available (necessary to correctly produce vllm version at build time)
3 |
4 | if ! git diff --quiet; then
5 | echo "Repo is dirty" >&2
6 |
7 | exit 1
8 | fi
9 |
10 | if ! git describe --tags; then
11 | echo "No tags are present. Is this a shallow clone? git fetch --unshallow --tags" >&2
12 |
13 | exit 1
14 | fi
15 |
--------------------------------------------------------------------------------
/tools/ep_kernels/install_system_drivers.sh:
--------------------------------------------------------------------------------
1 | set -ex
2 |
3 | # prepare workspace directory
4 | WORKSPACE=$1
5 | if [ -z "$WORKSPACE" ]; then
6 | export WORKSPACE=$(pwd)/ep_kernels_workspace
7 | fi
8 |
9 | if [ ! -d "$WORKSPACE" ]; then
10 | mkdir -p $WORKSPACE
11 | fi
12 |
13 | # build and install gdrcopy driver
14 | pushd $WORKSPACE
15 | cd gdrcopy_src
16 | ./insmod.sh
17 | # run gdrcopy_copybw to test the installation
18 | $WORKSPACE/gdrcopy_install/bin/gdrcopy_copybw
19 |
20 | # turn on IBGDA
21 | echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;"' | tee -a /etc/modprobe.d/nvidia.conf
22 | update-initramfs -u
23 |
24 | echo "Please reboot the system to apply the changes"
25 |
--------------------------------------------------------------------------------
/tools/ep_kernels/install_system_libraries.sh:
--------------------------------------------------------------------------------
1 | set -ex
2 |
3 | # prepare workspace directory
4 | WORKSPACE=$1
5 | if [ -z "$WORKSPACE" ]; then
6 | export WORKSPACE=$(pwd)/ep_kernels_workspace
7 | fi
8 |
9 | if [ ! -d "$WORKSPACE" ]; then
10 | mkdir -p $WORKSPACE
11 | fi
12 |
13 | # build and install gdrcopy system packages
14 | pushd $WORKSPACE
15 | cd gdrcopy_src/packages
16 | apt install devscripts -y
17 | CUDA=${CUDA_HOME:-/usr/local/cuda} ./build-deb-packages.sh
18 | dpkg -i *.deb
19 |
--------------------------------------------------------------------------------
/tools/mypy.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | CI=${1:-0}
4 | PYTHON_VERSION=${2:-local}
5 |
6 | if [ "$CI" -eq 1 ]; then
7 | set -e
8 | fi
9 |
10 | if [ "$PYTHON_VERSION" == "local" ]; then
11 | PYTHON_VERSION=$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
12 | fi
13 |
14 | run_mypy() {
15 | echo "Running mypy on $1"
16 | if [ "$CI" -eq 1 ] && [ -z "$1" ]; then
17 | mypy --python-version "${PYTHON_VERSION}" "$@"
18 | return
19 | fi
20 | mypy --follow-imports skip --python-version "${PYTHON_VERSION}" "$@"
21 | }
22 |
23 | run_mypy # Note that this is less strict than CI
24 | run_mypy tests
25 | run_mypy vllm/attention
26 | run_mypy vllm/compilation
27 | run_mypy vllm/distributed
28 | run_mypy vllm/engine
29 | run_mypy vllm/executor
30 | run_mypy vllm/inputs
31 | run_mypy vllm/lora
32 | run_mypy vllm/model_executor
33 | run_mypy vllm/plugins
34 | run_mypy vllm/prompt_adapter
35 | run_mypy vllm/spec_decode
36 | run_mypy vllm/worker
37 | run_mypy vllm/v1
38 |
--------------------------------------------------------------------------------
/tools/png-lint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Ensure that *.excalidraw.png files have the excalidraw metadata
4 | # embedded in them. This ensures they can be loaded back into
5 | # the tool and edited in the future.
6 |
7 | find . -iname '*.excalidraw.png' | while read -r file; do
8 | if git check-ignore -q "$file"; then
9 | continue
10 | fi
11 | if ! grep -q "excalidraw+json" "$file"; then
12 | echo "$file was not exported from excalidraw with 'Embed Scene' enabled."
13 | exit 1
14 | fi
15 | done
16 |
--------------------------------------------------------------------------------
/tools/shellcheck.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | scversion="stable"
5 |
6 | if [ -d "shellcheck-${scversion}" ]; then
7 | export PATH="$PATH:$(pwd)/shellcheck-${scversion}"
8 | fi
9 |
10 | if ! [ -x "$(command -v shellcheck)" ]; then
11 | if [ "$(uname -s)" != "Linux" ] || [ "$(uname -m)" != "x86_64" ]; then
12 | echo "Please install shellcheck: https://github.com/koalaman/shellcheck?tab=readme-ov-file#installing"
13 | exit 1
14 | fi
15 |
16 | # automatic local install if linux x86_64
17 | wget -qO- "https://github.com/koalaman/shellcheck/releases/download/${scversion?}/shellcheck-${scversion?}.linux.x86_64.tar.xz" | tar -xJv
18 | export PATH="$PATH:$(pwd)/shellcheck-${scversion}"
19 | fi
20 |
21 | # TODO - fix warnings in .buildkite/scripts/hardware_ci/run-amd-test.sh
22 | find . -name "*.sh" ".git" -prune -not -path "./.buildkite/scripts/hardware_ci/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck -s bash "{}"'
23 |
--------------------------------------------------------------------------------
/use_existing_torch.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | import glob
5 |
6 | requires_files = glob.glob('requirements/*.txt')
7 | requires_files += ["pyproject.toml"]
8 | for file in requires_files:
9 | print(f">>> cleaning {file}")
10 | with open(file) as f:
11 | lines = f.readlines()
12 | if "torch" in "".join(lines).lower():
13 | print("removed:")
14 | with open(file, 'w') as f:
15 | for line in lines:
16 | if 'torch' not in line.lower():
17 | f.write(line)
18 | else:
19 | print(line.strip())
20 | print(f"<<< done cleaning {file}")
21 | print()
22 |
--------------------------------------------------------------------------------
/vllm/adapter_commons/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/adapter_commons/__init__.py
--------------------------------------------------------------------------------
/vllm/adapter_commons/layers.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from dataclasses import dataclass
5 |
6 |
7 | @dataclass
8 | class AdapterMapping:
9 | # Per every token in input_ids:
10 | index_mapping: tuple[int, ...]
11 | # Per sampled token:
12 | prompt_mapping: tuple[int, ...]
13 |
14 | def __post_init__(self):
15 | self.index_mapping = tuple(self.index_mapping)
16 | self.prompt_mapping = tuple(self.prompt_mapping)
--------------------------------------------------------------------------------
/vllm/adapter_commons/request.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from abc import ABC, abstractmethod
5 |
6 |
7 | class AdapterRequest(ABC):
8 | """
9 | Base class for adapter requests.
10 | """
11 |
12 | @property
13 | @abstractmethod
14 | def adapter_id(self) -> int:
15 | raise NotImplementedError
16 |
17 | def __post_init__(self) -> None:
18 | if self.adapter_id < 1:
19 | raise ValueError(f"id must be > 0, got {self.adapter_id}")
20 |
21 | def __eq__(self, value: object) -> bool:
22 | return isinstance(
23 | value, self.__class__) and self.adapter_id == value.adapter_id
24 |
25 | def __hash__(self) -> int:
26 | return hash(self.adapter_id)
27 |
--------------------------------------------------------------------------------
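A hypothetical concrete subclass, to show how the abstract adapter_id property and the inherited __post_init__ validation fit together; the field names are invented and this is not one of vLLM's real request types.

# Illustrative only: a made-up dataclass request built on AdapterRequest.
from dataclasses import dataclass

from vllm.adapter_commons.request import AdapterRequest


@dataclass
class DemoAdapterRequest(AdapterRequest):
    demo_name: str
    demo_id: int

    @property
    def adapter_id(self) -> int:
        return self.demo_id


DemoAdapterRequest("demo", 1)   # ok
# DemoAdapterRequest("demo", 0) would raise ValueError via __post_init__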
/vllm/assets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/assets/__init__.py
--------------------------------------------------------------------------------
/vllm/attention/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.attention.backends.abstract import (AttentionBackend,
5 | AttentionMetadata,
6 | AttentionMetadataBuilder,
7 | AttentionState, AttentionType)
8 | from vllm.attention.layer import Attention
9 | from vllm.attention.selector import get_attn_backend
10 |
11 | __all__ = [
12 | "Attention",
13 | "AttentionBackend",
14 | "AttentionMetadata",
15 | "AttentionType",
16 | "AttentionMetadataBuilder",
17 |     "AttentionState",
18 |     "get_attn_backend",
19 | ]
20 |
--------------------------------------------------------------------------------
/vllm/attention/backends/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/attention/backends/__init__.py
--------------------------------------------------------------------------------
/vllm/attention/backends/mla/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/attention/backends/mla/__init__.py
--------------------------------------------------------------------------------
/vllm/attention/ops/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/attention/ops/__init__.py
--------------------------------------------------------------------------------
/vllm/attention/ops/blocksparse_attention/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/attention/ops/blocksparse_attention/__init__.py
--------------------------------------------------------------------------------
/vllm/benchmarks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/benchmarks/__init__.py
--------------------------------------------------------------------------------
/vllm/compilation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/compilation/__init__.py
--------------------------------------------------------------------------------
/vllm/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/core/__init__.py
--------------------------------------------------------------------------------
/vllm/core/block/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/core/block/__init__.py
--------------------------------------------------------------------------------
/vllm/device_allocator/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/device_allocator/__init__.py
--------------------------------------------------------------------------------
/vllm/distributed/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from .communication_op import *
5 | from .parallel_state import *
6 | from .utils import *
7 |
--------------------------------------------------------------------------------
/vllm/distributed/device_communicators/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/distributed/device_communicators/__init__.py
--------------------------------------------------------------------------------
/vllm/distributed/device_communicators/neuron_communicator.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import torch
4 |
5 | from vllm.distributed.device_communicators.base_device_communicator import (
6 | DeviceCommunicatorBase)
7 | from vllm.platforms import current_platform
8 |
9 | if current_platform.is_neuron():
10 | import torch_xla.core.xla_model as xm
11 |
12 |
13 | class NeuronCommunicator(DeviceCommunicatorBase):
14 |
15 | def all_reduce(self, x: torch.Tensor) -> torch.Tensor:
16 | return xm.all_reduce(xm.REDUCE_SUM, x)
17 |
18 | def all_gather(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor:
19 | assert dim == -1, "Neuron only supports dim=-1 for all-gather."
20 | return xm.all_gather(x, dim=dim)
21 |
--------------------------------------------------------------------------------
/vllm/distributed/kv_transfer/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.distributed.kv_transfer.kv_transfer_state import (
5 | KVConnectorBaseType, ensure_kv_transfer_initialized, get_kv_transfer_group,
6 | has_kv_transfer_group, is_v1_kv_transfer_group)
7 |
8 | __all__ = [
9 | "get_kv_transfer_group", "has_kv_transfer_group",
10 | "is_v1_kv_transfer_group", "ensure_kv_transfer_initialized",
11 | "KVConnectorBaseType"
12 | ]
13 |
--------------------------------------------------------------------------------
/vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg
--------------------------------------------------------------------------------
/vllm/distributed/kv_transfer/kv_connector/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/distributed/kv_transfer/kv_connector/__init__.py
--------------------------------------------------------------------------------
/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | from vllm.distributed.kv_transfer.kv_connector.v1.base import (
4 | KVConnectorBase_V1, KVConnectorRole)
5 |
6 | __all__ = ["KVConnectorRole", "KVConnectorBase_V1"]
7 |
--------------------------------------------------------------------------------
/vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py
--------------------------------------------------------------------------------
/vllm/distributed/kv_transfer/kv_pipe/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/distributed/kv_transfer/kv_pipe/__init__.py
--------------------------------------------------------------------------------
/vllm/engine/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/engine/__init__.py
--------------------------------------------------------------------------------
/vllm/engine/output_processor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/engine/output_processor/__init__.py
--------------------------------------------------------------------------------
/vllm/entrypoints/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/entrypoints/__init__.py
--------------------------------------------------------------------------------
/vllm/entrypoints/cli/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/entrypoints/cli/__init__.py
--------------------------------------------------------------------------------
/vllm/entrypoints/cli/benchmark/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/entrypoints/cli/benchmark/__init__.py
--------------------------------------------------------------------------------
/vllm/entrypoints/cli/benchmark/serve.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import argparse
4 |
5 | from vllm.benchmarks.serve import add_cli_args, main
6 | from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
7 | from vllm.entrypoints.cli.types import CLISubcommand
8 |
9 |
10 | class BenchmarkServingSubcommand(BenchmarkSubcommandBase):
11 | """ The `serve` subcommand for vllm bench. """
12 |
13 | def __init__(self):
14 | self.name = "serve"
15 | super().__init__()
16 |
17 | @property
18 | def help(self) -> str:
19 | return "Benchmark the online serving throughput."
20 |
21 | def add_cli_args(self, parser: argparse.ArgumentParser) -> None:
22 | add_cli_args(parser)
23 |
24 | @staticmethod
25 | def cmd(args: argparse.Namespace) -> None:
26 | main(args)
27 |
28 |
29 | def cmd_init() -> list[CLISubcommand]:
30 | return [BenchmarkServingSubcommand()]
31 |
--------------------------------------------------------------------------------
/vllm/entrypoints/cli/types.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | import argparse
5 |
6 | from vllm.utils import FlexibleArgumentParser
7 |
8 |
9 | class CLISubcommand:
10 | """Base class for CLI argument handlers."""
11 |
12 | name: str
13 |
14 | @staticmethod
15 | def cmd(args: argparse.Namespace) -> None:
16 | raise NotImplementedError("Subclasses should implement this method")
17 |
18 | def validate(self, args: argparse.Namespace) -> None:
19 | # No validation by default
20 | pass
21 |
22 | def subparser_init(
23 | self,
24 | subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
25 | raise NotImplementedError("Subclasses should implement this method")
26 |
--------------------------------------------------------------------------------
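A minimal, hypothetical subcommand built on this base class; the name, flag, and help text are invented, and real subcommands such as the benchmark ones above also register themselves through a cmd_init()-style factory.

# Illustrative only: a made-up "hello" subcommand; assumes the root parser
# is a FlexibleArgumentParser so add_parser() returns one as annotated.
import argparse

from vllm.entrypoints.cli.types import CLISubcommand
from vllm.utils import FlexibleArgumentParser


class HelloSubcommand(CLISubcommand):
    name = "hello"

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        print(f"Hello, {args.target}!")

    def subparser_init(
            self,
            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
        parser = subparsers.add_parser(self.name, help="Print a greeting.")
        parser.add_argument("--target", default="world")
        return parser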
/vllm/entrypoints/openai/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/entrypoints/openai/__init__.py
--------------------------------------------------------------------------------
/vllm/executor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/executor/__init__.py
--------------------------------------------------------------------------------
/vllm/logging_utils/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.logging_utils.formatter import NewLineFormatter
5 |
6 | __all__ = [
7 | "NewLineFormatter",
8 | ]
9 |
--------------------------------------------------------------------------------
/vllm/logging_utils/formatter.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | import logging
5 |
6 |
7 | class NewLineFormatter(logging.Formatter):
8 | """Adds logging prefix to newlines to align multi-line messages."""
9 |
10 | def __init__(self, fmt, datefmt=None, style="%"):
11 | logging.Formatter.__init__(self, fmt, datefmt, style)
12 |
13 | def format(self, record):
14 | msg = logging.Formatter.format(self, record)
15 | if record.message != "":
16 | parts = msg.split(record.message)
17 | msg = msg.replace("\n", "\r\n" + parts[0])
18 | return msg
19 |
--------------------------------------------------------------------------------
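A short sketch of wiring the formatter above into a standard logging handler; the format string is arbitrary.

# Illustrative only: multi-line messages get the prefix repeated on each line.
import logging

from vllm.logging_utils import NewLineFormatter

handler = logging.StreamHandler()
handler.setFormatter(NewLineFormatter("%(levelname)s %(name)s: %(message)s"))

logger = logging.getLogger("demo")
logger.addHandler(handler)
logger.setLevel(logging.INFO)

logger.info("first line\nsecond line")
# -> "INFO demo: first line", then "INFO demo: second line" on the next line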
/vllm/lora/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/lora/__init__.py
--------------------------------------------------------------------------------
/vllm/lora/ops/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/lora/ops/__init__.py
--------------------------------------------------------------------------------
/vllm/lora/ops/torch_ops/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.lora.ops.torch_ops.lora_ops import bgmv_expand # noqa: F401
5 | from vllm.lora.ops.torch_ops.lora_ops import (bgmv_expand_slice, bgmv_shrink,
6 | sgmv_expand, sgmv_expand_slice,
7 | sgmv_shrink)
8 |
9 | __all__ = [
10 | "bgmv_expand",
11 | "bgmv_expand_slice",
12 | "bgmv_shrink",
13 | "sgmv_expand",
14 | "sgmv_expand_slice",
15 | "sgmv_shrink",
16 | ]
17 |
--------------------------------------------------------------------------------
/vllm/lora/ops/triton_ops/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.lora.ops.triton_ops.lora_expand_op import lora_expand
5 | from vllm.lora.ops.triton_ops.lora_kernel_metadata import LoRAKernelMeta
6 | from vllm.lora.ops.triton_ops.lora_shrink_op import lora_shrink
7 |
8 | __all__ = [
9 | "lora_expand",
10 | "lora_shrink",
11 | "LoRAKernelMeta",
12 | ]
13 |
--------------------------------------------------------------------------------
/vllm/lora/ops/xla_ops/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.lora.ops.xla_ops.lora_ops import (bgmv_expand, bgmv_expand_slice,
5 | bgmv_shrink)
6 |
7 | __all__ = ["bgmv_expand", "bgmv_expand_slice", "bgmv_shrink"]
8 |
--------------------------------------------------------------------------------
/vllm/lora/punica_wrapper/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase
5 | from vllm.lora.punica_wrapper.punica_selector import get_punica_wrapper
6 |
7 | __all__ = [
8 | "PunicaWrapperBase",
9 | "get_punica_wrapper",
10 | ]
11 |
--------------------------------------------------------------------------------
/vllm/lora/punica_wrapper/punica_selector.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.logger import init_logger
5 | from vllm.platforms import current_platform
6 | from vllm.utils import resolve_obj_by_qualname
7 |
8 | from .punica_base import PunicaWrapperBase
9 |
10 | logger = init_logger(__name__)
11 |
12 |
13 | def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase:
14 | punica_wrapper_qualname = current_platform.get_punica_wrapper()
15 | punica_wrapper_cls = resolve_obj_by_qualname(punica_wrapper_qualname)
16 | punica_wrapper = punica_wrapper_cls(*args, **kwargs)
17 | assert punica_wrapper is not None, \
18 | "the punica_wrapper_qualname(" + punica_wrapper_qualname + ") is wrong."
19 | logger.info_once("Using %s.", punica_wrapper_qualname.rsplit(".", 1)[1])
20 | return punica_wrapper
21 |
--------------------------------------------------------------------------------
/vllm/model_executor/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.model_executor.parameter import (BasevLLMParameter,
5 | PackedvLLMParameter)
6 | from vllm.model_executor.sampling_metadata import (SamplingMetadata,
7 | SamplingMetadataCache)
8 | from vllm.model_executor.utils import set_random_seed
9 |
10 | __all__ = [
11 | "SamplingMetadata",
12 | "SamplingMetadataCache",
13 | "set_random_seed",
14 | "BasevLLMParameter",
15 | "PackedvLLMParameter",
16 | ]
17 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/model_executor/layers/__init__.py
--------------------------------------------------------------------------------
/vllm/model_executor/layers/fused_moe/configs/README:
--------------------------------------------------------------------------------
1 | This directory contains tuned configurations for different settings of the fused_moe kernel.
2 | For different settings of
3 | - E (number of experts)
4 | - N (intermediate size)
5 | - device_name (torch.cuda.get_device_name())
6 | the JSON file contains a mapping from M (batch size) to the chosen configuration.
7 |
8 | The example configurations provided are for the Mixtral model for TP2 on H100
9 | and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have
10 | N = 7168 and for TP4 we have N = 3584.
11 |
12 | See `benchmark/kernels/benchmark_moe.py` on how to generate these config files.
13 |
--------------------------------------------------------------------------------
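A rough sketch of how one of these JSON files could be read and the entry for the nearest batch size picked; the file name below is hypothetical, and the real fused_moe kernel applies its own selection logic.

# Illustrative only: load a tuned-config JSON (keys are batch sizes M as
# strings) and pick the entry closest to the current batch size.
import json

# hypothetical file name following the E/N/device_name convention above
with open("E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json") as f:
    configs = {int(m): cfg for m, cfg in json.load(f).items()}

batch_size = 3000
best_m = min(configs, key=lambda m: abs(m - batch_size))
print(best_m, configs[best_m])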
/vllm/model_executor/layers/mamba/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/model_executor/layers/mamba/__init__.py
--------------------------------------------------------------------------------
/vllm/model_executor/layers/mamba/ops/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/model_executor/layers/mamba/ops/__init__.py
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/kernels/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/model_executor/layers/quantization/kernels/__init__.py
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/quark/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/model_executor/layers/quantization/quark/__init__.py
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/quark/schemes/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from .quark_scheme import QuarkScheme
5 | from .quark_w4a4_mxfp4 import QuarkW4A4MXFP4
6 | from .quark_w8a8_fp8 import QuarkW8A8Fp8
7 | from .quark_w8a8_int8 import QuarkW8A8Int8
8 |
9 | __all__ = ["QuarkScheme", "QuarkW8A8Fp8", "QuarkW8A8Int8", "QuarkW4A4MXFP4"]
10 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from .layer_utils import replace_parameter, update_tensor_inplace
5 |
6 | __all__ = ['update_tensor_inplace', 'replace_parameter']
7 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 64,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 64,
7 | "num_warps": 4,
8 | "num_stages": 3
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 64,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 32,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 32,
23 | "num_warps": 4,
24 | "num_stages": 3
25 | }
26 | }
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 64,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 1,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 64,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 1,
15 | "num_warps": 4,
16 | "num_stages": 2
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 64,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 1,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 64,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 32,
7 | "num_warps": 4,
8 | "num_stages": 3
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 64,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 1,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 64,
23 | "num_warps": 4,
24 | "num_stages": 3
25 | }
26 | }
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 128,
4 | "BLOCK_SIZE_N": 64,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 1,
7 | "num_warps": 4,
8 | "num_stages": 3
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 64,
15 | "num_warps": 4,
16 | "num_stages": 2
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 64,
23 | "num_warps": 4,
24 | "num_stages": 3
25 | }
26 | }
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 64,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 32,
7 | "num_warps": 4,
8 | "num_stages": 3
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 64,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 1,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 16,
23 | "num_warps": 4,
24 | "num_stages": 3
25 | }
26 | }
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 128,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 64,
6 | "GROUP_SIZE_M": 16,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 128,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 64,
14 | "GROUP_SIZE_M": 64,
15 | "num_warps": 4,
16 | "num_stages": 2
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 128,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 64,
22 | "GROUP_SIZE_M": 64,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 32,
7 | "num_warps": 4,
8 | "num_stages": 3
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 32,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 32,
23 | "num_warps": 4,
24 | "num_stages": 3
25 | }
26 | }
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 64,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 64,
7 | "num_warps": 4,
8 | "num_stages": 4
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 64,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 1,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 1,
23 | "num_warps": 8,
24 | "num_stages": 5
25 | }
26 | }
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 64,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 1,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 128,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 64,
14 | "GROUP_SIZE_M": 1,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 128,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 64,
22 | "GROUP_SIZE_M": 1,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 64,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 1,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 128,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 64,
14 | "GROUP_SIZE_M": 1,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 128,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 64,
22 | "GROUP_SIZE_M": 1,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 128,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 64,
6 | "GROUP_SIZE_M": 1,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 128,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 64,
14 | "GROUP_SIZE_M": 1,
15 | "num_warps": 4,
16 | "num_stages": 2
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 128,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 64,
22 | "GROUP_SIZE_M": 1,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 1,
7 | "num_warps": 4,
8 | "num_stages": 3
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 1,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 1,
23 | "num_warps": 4,
24 | "num_stages": 3
25 | }
26 | }
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 128,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 64,
6 | "GROUP_SIZE_M": 32,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 128,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 64,
14 | "GROUP_SIZE_M": 64,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 128,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 64,
22 | "GROUP_SIZE_M": 64,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 128,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 64,
6 | "GROUP_SIZE_M": 64,
7 | "num_warps": 4,
8 | "num_stages": 3
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 128,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 64,
14 | "GROUP_SIZE_M": 1,
15 | "num_warps": 4,
16 | "num_stages": 2
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 128,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 64,
22 | "GROUP_SIZE_M": 1,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 64,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 32,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 64,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 16,
15 | "num_warps": 4,
16 | "num_stages": 2
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 64,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 1,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "4": {
3 | "BLOCK_SIZE_M": 16,
4 | "BLOCK_SIZE_N": 32,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 64,
7 | "num_warps": 4,
8 | "num_stages": 3
9 | },
10 | "8": {
11 | "BLOCK_SIZE_M": 16,
12 | "BLOCK_SIZE_N": 32,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 32,
15 | "num_warps": 4,
16 | "num_stages": 4
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 64,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 64,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 1,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 64,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 64,
23 | "num_warps": 4,
24 | "num_stages": 3
25 | }
26 | }
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 16,
7 | "num_warps": 4,
8 | "num_stages": 3
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 1,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 1,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 1,
7 | "num_warps": 4,
8 | "num_stages": 3
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 1,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 64,
23 | "num_warps": 4,
24 | "num_stages": 3
25 | }
26 | }
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 16,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 64,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 16,
15 | "num_warps": 4,
16 | "num_stages": 2
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 128,
20 | "BLOCK_SIZE_N": 64,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 1,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 128,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 64,
6 | "GROUP_SIZE_M": 64,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 128,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 64,
14 | "GROUP_SIZE_M": 64,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 128,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 64,
22 | "GROUP_SIZE_M": 64,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 16,
7 | "num_warps": 4,
8 | "num_stages": 3
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 32,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 32,
23 | "num_warps": 4,
24 | "num_stages": 3
25 | }
26 | }
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 128,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 64,
6 | "GROUP_SIZE_M": 64,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 128,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 64,
14 | "GROUP_SIZE_M": 64,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 128,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 64,
22 | "GROUP_SIZE_M": 64,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 16,
7 | "num_warps": 4,
8 | "num_stages": 3
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 32,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 64,
23 | "num_warps": 4,
24 | "num_stages": 3
25 | }
26 | }
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 128,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 64,
6 | "GROUP_SIZE_M": 64,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 128,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 64,
14 | "GROUP_SIZE_M": 64,
15 | "num_warps": 4,
16 | "num_stages": 2
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 128,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 64,
22 | "GROUP_SIZE_M": 1,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 128,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 64,
6 | "GROUP_SIZE_M": 64,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 128,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 64,
14 | "GROUP_SIZE_M": 1,
15 | "num_warps": 4,
16 | "num_stages": 2
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 128,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 64,
22 | "GROUP_SIZE_M": 1,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 128,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 64,
6 | "GROUP_SIZE_M": 64,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 128,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 64,
14 | "GROUP_SIZE_M": 32,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 128,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 64,
22 | "GROUP_SIZE_M": 64,
23 | "num_warps": 4,
24 | "num_stages": 3
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/vllm/model_executor/models/phi3.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | # Adapted from llama.py
5 | """Inference-only Phi3 model code inherit from Llama.py"""
6 |
7 | from vllm.model_executor.models.llama import LlamaForCausalLM
8 |
9 |
10 | class Phi3ForCausalLM(LlamaForCausalLM):
11 |
12 | packed_modules_mapping = {
13 | "qkv_proj": [
14 | "qkv_proj",
15 | ],
16 | "gate_up_proj": [
17 | "gate_up_proj",
18 | ],
19 | }
20 |
--------------------------------------------------------------------------------
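`packed_modules_mapping` tells the weight loader (and the quantization/LoRA code built on it) which checkpoint parameters make up each fused module. Phi-3 checkpoints already ship fused `qkv_proj` and `gate_up_proj` tensors, so each entry maps to itself. For contrast, a model whose checkpoint stores the projections separately would map the fused name to its shards; the subclass below is a hypothetical illustration of that case, not code from the repository.

```python
# Illustrative only: how a Llama-derived model whose checkpoint keeps the
# projections separate would typically declare its fused-parameter mapping.
from vllm.model_executor.models.llama import LlamaForCausalLM


class SplitProjForCausalLM(LlamaForCausalLM):  # hypothetical model class

    packed_modules_mapping = {
        # fused module name -> checkpoint shards packed into it
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
    }
```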
/vllm/plugins/lora_resolvers/README.md:
--------------------------------------------------------------------------------
1 | # LoRA Resolver Plugins
2 |
3 | This directory contains vLLM general plugins for dynamically discovering and loading LoRA adapters
4 | via the LoRAResolver plugin framework.
5 |
6 | Note that `VLLM_ALLOW_RUNTIME_LORA_UPDATING` must be set to true to allow LoRA resolver plugins
7 | to work, and `VLLM_PLUGINS` must be set to include the desired resolver plugins.
8 |
9 | ## lora_filesystem_resolver
10 | This LoRA resolver is installed with vLLM by default.
11 | To use it, set `VLLM_PLUGIN_LORA_CACHE_DIR` to a local directory. When vLLM receives a request
12 | for a LoRA adapter `foobar` that it does not currently recognize, it looks in that directory
13 | for a subdirectory `foobar` containing a LoRA adapter. If such an adapter exists, vLLM loads
14 | it and then services the request as normal. The adapter remains available for subsequent
15 | requests.
16 |
--------------------------------------------------------------------------------
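As a concrete sketch of the workflow the README describes, the snippet below assumes an OpenAI-compatible vLLM server launched with the environment variables above and an adapter directory `/srv/lora-cache/foobar`; the paths, port, and adapter name are placeholders.

```python
# Hedged example: query a vLLM OpenAI-compatible server that has the
# filesystem LoRA resolver enabled. Paths, port and adapter name ("foobar")
# are placeholders; the server is assumed to have been started with
#   VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
#   VLLM_PLUGINS=lora_filesystem_resolver
#   VLLM_PLUGIN_LORA_CACHE_DIR=/srv/lora-cache
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Requesting model "foobar" should make the resolver look for
# /srv/lora-cache/foobar and load it on demand.
completion = client.completions.create(
    model="foobar",
    prompt="Hello from a dynamically resolved LoRA adapter: ",
    max_tokens=32,
)
print(completion.choices[0].text)
```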
/vllm/plugins/lora_resolvers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/plugins/lora_resolvers/__init__.py
--------------------------------------------------------------------------------
/vllm/profiler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/profiler/__init__.py
--------------------------------------------------------------------------------
/vllm/prompt_adapter/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/prompt_adapter/__init__.py
--------------------------------------------------------------------------------
/vllm/py.typed:
--------------------------------------------------------------------------------
1 | # Marker file for PEP 561.
2 | # The vllm package uses inline types.
3 |
--------------------------------------------------------------------------------
/vllm/reasoning/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
5 | from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
6 | from .granite_reasoning_parser import GraniteReasoningParser
7 | from .qwen3_reasoning_parser import Qwen3ReasoningParser
8 |
9 | __all__ = [
10 | "ReasoningParser",
11 | "ReasoningParserManager",
12 | "DeepSeekR1ReasoningParser",
13 | "GraniteReasoningParser",
14 | "Qwen3ReasoningParser",
15 | ]
16 |
--------------------------------------------------------------------------------
/vllm/scripts.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.entrypoints.cli.main import main as vllm_main
5 | from vllm.logger import init_logger
6 |
7 | logger = init_logger(__name__)
8 |
9 |
10 | # Backwards compatibility for the move from vllm.scripts to
11 | # vllm.entrypoints.cli.main
12 | def main():
13 | logger.warning("vllm.scripts.main() is deprecated. Please re-install "
14 | "vllm or use vllm.entrypoints.cli.main.main() instead.")
15 | vllm_main()
16 |
--------------------------------------------------------------------------------
/vllm/spec_decode/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/spec_decode/__init__.py
--------------------------------------------------------------------------------
/vllm/third_party/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/third_party/__init__.py
--------------------------------------------------------------------------------
/vllm/transformers_utils/chat_templates/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | from .registry import get_chat_template_fallback_path
4 |
5 | __all__ = ["get_chat_template_fallback_path"]
6 |
--------------------------------------------------------------------------------
/vllm/transformers_utils/chat_templates/template_basic.jinja:
--------------------------------------------------------------------------------
1 | {%- for message in messages -%}
2 | {{- message['content'] -}}
3 | {%- endfor -%}
4 |
--------------------------------------------------------------------------------
/vllm/transformers_utils/chat_templates/template_blip2.jinja:
--------------------------------------------------------------------------------
1 | {%- for message in messages -%}
2 | {%- if message['role'] == 'user' -%}
3 | {{- 'Question: ' + message['content'] + ' ' -}}
4 | {%- elif message['role'] == 'assistant' -%}
5 | {{- 'Answer: ' + message['content'] + ' ' -}}
6 | {%- endif -%}
7 | {%- endfor -%}
8 |
9 | {%- if add_generation_prompt -%}
10 | {{- 'Answer:' -}}
11 | {% endif %}
12 |
--------------------------------------------------------------------------------
/vllm/transformers_utils/chat_templates/template_chatml.jinja:
--------------------------------------------------------------------------------
1 | {%- for message in messages -%}
2 | {{- '<|im_start|>' + message['role'] + '\n' + message['content'] -}}
3 | {%- if (loop.last and add_generation_prompt) or not loop.last -%}
4 | {{- '<|im_end|>' + '\n' -}}
5 | {%- endif -%}
6 | {%- endfor -%}
7 |
8 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
9 | {{- '<|im_start|>assistant\n' -}}
10 | {%- endif -%}
11 |
--------------------------------------------------------------------------------
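The template above produces standard ChatML framing. A quick way to inspect its output is to render it directly with the stock `jinja2` package; in practice chat-template rendering goes through the tokenizer, so this is only an illustration.

```python
# Render the ChatML template above directly with jinja2 to inspect its output.
# This bypasses tokenizer-side chat templating and is for illustration only.
from jinja2 import Template

CHATML = r"""{%- for message in messages -%}
    {{- '<|im_start|>' + message['role'] + '\n' + message['content'] -}}
    {%- if (loop.last and add_generation_prompt) or not loop.last -%}
        {{- '<|im_end|>' + '\n' -}}
    {%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
    {{- '<|im_start|>assistant\n' -}}
{%- endif -%}"""

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is vLLM?"},
]
print(Template(CHATML).render(messages=messages, add_generation_prompt=True))
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# What is vLLM?<|im_end|>
# <|im_start|>assistant
```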
/vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja:
--------------------------------------------------------------------------------
1 | {%- if messages[0]['role'] == 'system' -%}
2 | {%- set system_message = messages[0]['content'] -%}
3 | {%- set messages = messages[1:] -%}
4 | {%- else -%}
5 | {% set system_message = '' -%}
6 | {%- endif -%}
7 |
8 | {{ bos_token + system_message }}
9 | {%- for message in messages -%}
10 | {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
11 | {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
12 | {%- endif -%}
13 |
14 | {%- if message['role'] == 'user' -%}
15 | {{ '<|User|>: ' + message['content'] + '\n\n' }}
16 | {%- elif message['role'] == 'assistant' -%}
17 | {{ '<|Assistant|>: ' + message['content'] + eos_token + '\n\n' }}
18 | {%- endif -%}
19 | {%- endfor -%}
20 |
21 | {%- if add_generation_prompt -%}
22 | {{ '<|Assistant|>: ' }}
23 | {%- endif -%}
24 |
--------------------------------------------------------------------------------
/vllm/transformers_utils/chat_templates/template_fuyu.jinja:
--------------------------------------------------------------------------------
1 | {%- for message in messages -%}
2 | {{- message['content'] + '\n' -}}
3 | {%- endfor -%}
4 |
--------------------------------------------------------------------------------
/vllm/transformers_utils/configs/h2ovl.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | # Adapted from
5 | # https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/configuration_h2ovl_chat.py
6 | # --------------------------------------------------------
7 | # H2OVL-Mississippi
8 | # Copyright (c) 2024 H2O.AI
9 | # Licensed under Apache 2.0 License [see LICENSE for details]
10 | # --------------------------------------------------------
11 |
12 | from .internvl import InternVLChatConfig
13 |
14 |
15 | class H2OVLChatConfig(InternVLChatConfig):
16 | model_type = "h2ovl_chat"
17 |
--------------------------------------------------------------------------------
/vllm/transformers_utils/configs/nvlm_d.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | # Adapted from
5 | # https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py
6 | # --------------------------------------------------------
7 | # NVLM-D
8 | # Copyright (c) 2024 NVIDIA
9 | # Licensed under Apache 2.0 License [see LICENSE for details]
10 | # --------------------------------------------------------
11 | from .internvl import InternVLChatConfig
12 |
13 |
14 | class NVLM_D_Config(InternVLChatConfig):
15 | model_type = 'NVLM_D'
16 |
--------------------------------------------------------------------------------
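Both `H2OVLChatConfig` and `NVLM_D_Config` reuse `InternVLChatConfig` and change only `model_type`, which is the key Hugging Face uses to route a checkpoint's `config.json` to a config class. Outside vLLM's own config lookup, the same pattern could be wired up roughly as below; this is a sketch under the assumption that you want `AutoConfig.from_pretrained` to resolve the custom type yourself.

```python
# Sketch only: registering a custom config class with transformers' AutoConfig
# so that checkpoints declaring model_type == "NVLM_D" resolve to it.
# vLLM has its own config handling; this just illustrates the model_type idea.
from transformers import AutoConfig

from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config

AutoConfig.register("NVLM_D", NVLM_D_Config)

# After registration, AutoConfig.from_pretrained(...) on a checkpoint whose
# config.json contains "model_type": "NVLM_D" would return an NVLM_D_Config.
```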
/vllm/transformers_utils/processors/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.transformers_utils.processors.deepseek_vl2 import (
5 | DeepseekVLV2Processor)
6 | from vllm.transformers_utils.processors.ovis import OvisProcessor
7 |
8 | __all__ = ["DeepseekVLV2Processor", "OvisProcessor"]
9 |
--------------------------------------------------------------------------------
/vllm/transformers_utils/tokenizers/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from .mistral import (MistralTokenizer, maybe_serialize_tool_calls,
5 | truncate_tool_call_ids, validate_request_params)
6 |
7 | __all__ = [
8 | "MistralTokenizer", "maybe_serialize_tool_calls", "truncate_tool_call_ids",
9 | "validate_request_params"
10 | ]
11 |
--------------------------------------------------------------------------------
/vllm/triton_utils/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.triton_utils.importing import (HAS_TRITON, TritonLanguagePlaceholder,
5 | TritonPlaceholder)
6 |
7 | if HAS_TRITON:
8 | import triton
9 | import triton.language as tl
10 | else:
11 | triton = TritonPlaceholder()
12 | tl = TritonLanguagePlaceholder()
13 |
14 | __all__ = ["HAS_TRITON", "triton", "tl"]
15 |
--------------------------------------------------------------------------------
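The conditional import above lets modules that reference `triton` and `tl` at import time (for example, `@triton.jit`-decorated kernels) still be imported on installations without Triton, because the placeholder objects stand in for the real modules. A generic version of that pattern might look like the sketch below; the class here is illustrative and is not vLLM's `TritonPlaceholder`.

```python
# Illustrative stand-in pattern (not vLLM's actual TritonPlaceholder): an
# object that absorbs attribute access and decorator use so modules which
# decorate functions with @triton.jit can still be imported without Triton.
class _ModulePlaceholder:

    def __init__(self, name: str):
        self._name = name

    def __getattr__(self, attr: str):
        # Return another placeholder so chained access (triton.language.dot,
        # triton.Config(...), etc.) does not raise at import time.
        return _ModulePlaceholder(f"{self._name}.{attr}")

    def __call__(self, *args, **kwargs):
        # Used as a decorator (e.g. @triton.jit): return the function
        # unchanged; any other call returns yet another placeholder.
        if len(args) == 1 and callable(args[0]) and not kwargs:
            return args[0]
        return _ModulePlaceholder(f"{self._name}(...)")


triton = _ModulePlaceholder("triton")
tl = _ModulePlaceholder("triton.language")


@triton.jit  # harmless no-op when Triton is unavailable
def _dummy_kernel(x):
    return x
```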
/vllm/usage/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/usage/__init__.py
--------------------------------------------------------------------------------
/vllm/v1/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/__init__.py
--------------------------------------------------------------------------------
/vllm/v1/attention/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/attention/__init__.py
--------------------------------------------------------------------------------
/vllm/v1/attention/backends/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/attention/backends/__init__.py
--------------------------------------------------------------------------------
/vllm/v1/attention/backends/mla/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/attention/backends/mla/__init__.py
--------------------------------------------------------------------------------
/vllm/v1/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/core/__init__.py
--------------------------------------------------------------------------------
/vllm/v1/core/sched/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/core/sched/__init__.py
--------------------------------------------------------------------------------
/vllm/v1/core/sched/utils.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | from vllm.v1.request import Request, RequestStatus
4 |
5 |
6 | def check_stop(request: Request, max_model_len: int) -> bool:
7 | if (request.num_tokens >= max_model_len
8 | or request.num_output_tokens >= request.max_tokens):
9 | request.status = RequestStatus.FINISHED_LENGTH_CAPPED
10 | return True
11 |
12 | sampling_params = request.sampling_params
13 | last_token_id = request.output_token_ids[-1]
14 | if (not sampling_params.ignore_eos
15 | and last_token_id == request.eos_token_id):
16 | request.status = RequestStatus.FINISHED_STOPPED
17 | return True
18 |
19 | if last_token_id in (sampling_params.stop_token_ids or ()):
20 | request.status = RequestStatus.FINISHED_STOPPED
21 | request.stop_reason = last_token_id
22 | return True
23 | return False
24 |
--------------------------------------------------------------------------------
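`check_stop` finishes a request for one of three reasons: the length cap (context length or `max_tokens`), the EOS token (unless `ignore_eos` is set), or a user-provided stop token id. The self-contained sketch below mirrors that logic with plain dataclasses so the branches can be exercised without an engine; the classes here are toy stand-ins, not vLLM's `Request` or `SamplingParams`.

```python
# Stand-alone mirror of the stop logic above, using toy stand-in classes so
# the three finish conditions can be exercised without an engine.
from dataclasses import dataclass, field
from enum import Enum, auto


class Status(Enum):
    RUNNING = auto()
    FINISHED_LENGTH_CAPPED = auto()
    FINISHED_STOPPED = auto()


@dataclass
class ToySamplingParams:
    ignore_eos: bool = False
    stop_token_ids: tuple[int, ...] = ()


@dataclass
class ToyRequest:
    num_tokens: int
    num_output_tokens: int
    max_tokens: int
    eos_token_id: int
    output_token_ids: list[int]
    sampling_params: ToySamplingParams = field(default_factory=ToySamplingParams)
    status: Status = Status.RUNNING
    stop_reason: int | None = None


def check_stop(request: ToyRequest, max_model_len: int) -> bool:
    # 1) Length cap: total context or generated-token budget exhausted.
    if (request.num_tokens >= max_model_len
            or request.num_output_tokens >= request.max_tokens):
        request.status = Status.FINISHED_LENGTH_CAPPED
        return True

    # 2) EOS token, unless the request asked to ignore it.
    last_token_id = request.output_token_ids[-1]
    if (not request.sampling_params.ignore_eos
            and last_token_id == request.eos_token_id):
        request.status = Status.FINISHED_STOPPED
        return True

    # 3) User-provided stop token ids.
    if last_token_id in (request.sampling_params.stop_token_ids or ()):
        request.status = Status.FINISHED_STOPPED
        request.stop_reason = last_token_id
        return True
    return False


# EOS emitted before hitting any length cap -> FINISHED_STOPPED
req = ToyRequest(num_tokens=10, num_output_tokens=3, max_tokens=16,
                 eos_token_id=2, output_token_ids=[5, 7, 2])
assert check_stop(req, max_model_len=4096)
assert req.status is Status.FINISHED_STOPPED
```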
/vllm/v1/engine/exceptions.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | class EngineGenerateError(Exception):
4 | """Raised when a AsyncLLM.generate() fails. Recoverable."""
5 | pass
6 |
7 |
8 | class EngineDeadError(Exception):
9 | """Raised when the EngineCore dies. Unrecoverable."""
10 |
11 | def __init__(self, *args, suppress_context: bool = False, **kwargs):
12 | ENGINE_DEAD_MESSAGE = "EngineCore encountered an issue. See stack trace (above) for the root cause." # noqa: E501
13 |
14 | super().__init__(ENGINE_DEAD_MESSAGE, *args, **kwargs)
15 | # Make stack trace clearer when using with LLMEngine by
16 | # silencing irrelevant ZMQError.
17 | self.__suppress_context__ = suppress_context
18 |
--------------------------------------------------------------------------------
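The `suppress_context` flag leans on standard Python behaviour: setting an exception's `__suppress_context__` attribute to True stops the interpreter (and the `traceback` module) from printing the "During handling of the above exception, another exception occurred" chain. A minimal, vLLM-independent demonstration of that mechanism:

```python
# Plain-Python demonstration of __suppress_context__, the mechanism
# EngineDeadError uses to hide an irrelevant inner exception (e.g. a ZMQError)
# from the printed traceback. DeadError below is a toy class, not vLLM's.
import traceback


class DeadError(Exception):

    def __init__(self, *args, suppress_context: bool = False):
        super().__init__("engine died", *args)
        self.__suppress_context__ = suppress_context


def fail(suppress: bool):
    try:
        raise ConnectionError("socket closed")  # the noisy inner error
    except ConnectionError:
        raise DeadError(suppress_context=suppress)


for suppress in (False, True):
    try:
        fail(suppress)
    except DeadError:
        print(f"--- suppress_context={suppress} ---")
        traceback.print_exc()
# With suppress_context=True the ConnectionError and the
# "During handling of the above exception..." banner are omitted.
```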
/vllm/v1/executor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/executor/__init__.py
--------------------------------------------------------------------------------
/vllm/v1/metrics/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/metrics/__init__.py
--------------------------------------------------------------------------------
/vllm/v1/sample/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/sample/__init__.py
--------------------------------------------------------------------------------
/vllm/v1/sample/ops/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/sample/ops/__init__.py
--------------------------------------------------------------------------------
/vllm/v1/sample/tpu/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/sample/tpu/__init__.py
--------------------------------------------------------------------------------
/vllm/v1/spec_decode/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/spec_decode/__init__.py
--------------------------------------------------------------------------------
/vllm/v1/worker/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/worker/__init__.py
--------------------------------------------------------------------------------
/vllm/vllm_flash_attn/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/vllm_flash_attn/.gitkeep
--------------------------------------------------------------------------------
/vllm/worker/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/worker/__init__.py
--------------------------------------------------------------------------------