├── .buildkite ├── check-wheel-size.py ├── generate_index.py ├── lm-eval-harness │ ├── configs │ │ ├── DeepSeek-V2-Lite-Chat.yaml │ │ ├── Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml │ │ ├── Meta-Llama-3-70B-Instruct.yaml │ │ ├── Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml │ │ ├── Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct-FP8.yaml │ │ ├── Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml │ │ ├── Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct.yaml │ │ ├── Meta-Llama-3-8B-QQQ.yaml │ │ ├── Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml │ │ ├── Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml │ │ ├── Minitron-4B-Base-FP8.yaml │ │ ├── Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml │ │ ├── Mixtral-8x7B-Instruct-v0.1-FP8.yaml │ │ ├── Mixtral-8x7B-Instruct-v0.1.yaml │ │ ├── Qwen1.5-MoE-W4A16-compressed-tensors.yaml │ │ ├── Qwen2-1.5B-Instruct-FP8W8.yaml │ │ ├── Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml │ │ ├── Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml │ │ ├── Qwen2-57B-A14-Instruct.yaml │ │ ├── Qwen2.5-1.5B-Instruct.yaml │ │ ├── Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml │ │ ├── SparseLlama3.1_2of4_fp8_compressed.yaml │ │ ├── models-large.txt │ │ └── models-small.txt │ ├── conftest.py │ ├── run-lm-eval-gsm-hf-baseline.sh │ ├── run-lm-eval-gsm-vllm-baseline.sh │ └── test_lm_eval_correctness.py ├── nightly-benchmarks │ ├── README.md │ ├── benchmark-pipeline.yaml │ ├── nightly-annotation.md │ ├── nightly-descriptions.md │ ├── nightly-pipeline.yaml │ ├── performance-benchmarks-descriptions.md │ ├── scripts │ │ ├── convert-results-json-to-markdown.py │ │ ├── download-tokenizer.py │ │ ├── generate-nightly-markdown.py │ │ ├── get-lmdeploy-modelname.py │ │ ├── launch-server.sh │ │ ├── nightly-annotate.sh │ │ ├── run-nightly-benchmarks.sh │ │ ├── run-performance-benchmarks.sh │ │ ├── summary-nightly-results.py │ │ └── wait-for-image.sh │ └── tests │ │ ├── genai-perf-tests.json │ │ ├── latency-tests.json │ │ ├── nightly-tests.json │ │ ├── serving-tests.json │ │ └── throughput-tests.json ├── pyproject.toml ├── release-pipeline.yaml ├── scripts │ ├── annotate-release.sh │ ├── hardware_ci │ │ ├── run-amd-test.sh │ │ ├── run-cpu-test-ppc64le.sh │ │ ├── run-cpu-test-s390x.sh │ │ ├── run-cpu-test.sh │ │ ├── run-gh200-test.sh │ │ ├── run-hpu-test.sh │ │ ├── run-neuron-test.sh │ │ ├── run-tpu-v1-test.sh │ │ └── run-xpu-test.sh │ ├── run-benchmarks.sh │ ├── run-multi-node-test.sh │ └── upload-wheels.sh ├── test-pipeline.yaml └── test-template.j2 ├── .clang-format ├── .dockerignore ├── .github ├── CODEOWNERS ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── 100-documentation.yml │ ├── 200-installation.yml │ ├── 300-usage.yml │ ├── 400-bug-report.yml │ ├── 450-ci-failure.yml │ ├── 500-feature-request.yml │ ├── 600-new-model.yml │ ├── 700-performance-discussion.yml │ ├── 750-RFC.yml │ └── config.yml ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml ├── mergify.yml ├── scripts │ └── cleanup_pr_body.sh └── workflows │ ├── add_label_automerge.yml │ ├── cleanup_pr_body.yml │ ├── matchers │ ├── actionlint.json │ └── mypy.json │ ├── pre-commit.yml │ ├── publish.yml │ ├── scripts │ ├── build.sh │ ├── create_release.js │ ├── cuda-install.sh │ ├── env.sh │ └── pytorch-install.sh │ └── stale.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── .shellcheckrc ├── 
.yapfignore ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── DCO ├── LICENSE ├── MANIFEST.in ├── README.md ├── RELEASE.md ├── ROCm_performance.md ├── SECURITY.md ├── benchmarks ├── P3L.py ├── P3L_mling.py ├── README.md ├── auto_tune.sh ├── backend_request_func.py ├── benchmark_dataset.py ├── benchmark_latency.py ├── benchmark_long_document_qa_throughput.py ├── benchmark_prefix_caching.py ├── benchmark_prioritization.py ├── benchmark_serving.py ├── benchmark_serving_structured_output.py ├── benchmark_throughput.py ├── benchmark_utils.py ├── cutlass_benchmarks │ ├── sparse_benchmarks.py │ ├── utils.py │ ├── w8a8_benchmarks.py │ └── weight_shapes.py ├── disagg_benchmarks │ ├── disagg_overhead_benchmark.sh │ ├── disagg_performance_benchmark.sh │ ├── disagg_prefill_proxy_server.py │ ├── round_robin_proxy.py │ └── visualize_benchmark_results.py ├── fused_kernels │ └── layernorm_rms_benchmarks.py ├── kernels │ ├── bench_fp8_gemm.py │ ├── benchmark_aqlm.py │ ├── benchmark_bitblas.py │ ├── benchmark_cutlass_fp4_moe.py │ ├── benchmark_grouped_gemm_cutlass.py │ ├── benchmark_layernorm.py │ ├── benchmark_lora.py │ ├── benchmark_machete.py │ ├── benchmark_marlin.py │ ├── benchmark_moe.py │ ├── benchmark_moe_permute_unpermute.py │ ├── benchmark_paged_attention.py │ ├── benchmark_quant.py │ ├── benchmark_rmsnorm.py │ ├── benchmark_rope.py │ ├── benchmark_shapes.py │ ├── benchmark_w8a8_block_fp8.py │ ├── deepgemm │ │ ├── README.md │ │ └── benchmark_fp8_block_dense_gemm.py │ ├── graph_machete_bench.py │ ├── moe_tune_script.sh │ ├── requirements.txt │ ├── utils.py │ └── weight_shapes.py ├── overheads │ └── benchmark_hashing.py ├── profiling │ ├── README.md │ ├── benchmark_latency.py │ └── benchmark_throughput.py ├── pyproject.toml ├── run_structured_output_benchmark.sh ├── sonnet.txt └── structured_schemas │ └── structured_schema_1.json ├── cmake ├── cpu_extension.cmake ├── external_projects │ ├── flashmla.cmake │ └── vllm_flash_attn.cmake ├── hipify.py └── utils.cmake ├── csrc ├── activation_kernels.cu ├── attention │ ├── attention_dtypes.h │ ├── attention_generic.cuh │ ├── attention_kernels.cuh │ ├── attention_utils.cuh │ ├── dtype_bfloat16.cuh │ ├── dtype_float16.cuh │ ├── dtype_float32.cuh │ ├── dtype_fp8.cuh │ ├── merge_attn_states.cu │ ├── mla │ │ ├── cutlass_mla_entry.cu │ │ └── cutlass_mla_kernels.cu │ ├── paged_attention_v1.cu │ ├── paged_attention_v2.cu │ └── vertical_slash_index.cu ├── cache.h ├── cache_kernels.cu ├── core │ ├── exception.hpp │ ├── math.hpp │ ├── registration.h │ └── scalar_type.hpp ├── cpu │ ├── activation.cpp │ ├── attention.cpp │ ├── cache.cpp │ ├── cpu_types.hpp │ ├── cpu_types_arm.hpp │ ├── cpu_types_vsx.hpp │ ├── cpu_types_vxe.hpp │ ├── cpu_types_x86.hpp │ ├── dnnl_helper.hpp │ ├── layernorm.cpp │ ├── mla_decode.cpp │ ├── pos_encoding.cpp │ ├── quant.cpp │ ├── shm.cpp │ ├── torch_bindings.cpp │ └── utils.cpp ├── cuda_compat.h ├── cuda_utils.h ├── cuda_utils_kernels.cu ├── cuda_view.cu ├── cumem_allocator.cpp ├── custom_all_reduce.cu ├── custom_all_reduce.cuh ├── custom_all_reduce_test.cu ├── cutlass_extensions │ ├── common.cpp │ ├── common.hpp │ ├── cute_utils.cuh │ ├── epilogue │ │ ├── broadcast_load_epilogue_array_c3x.hpp │ │ ├── broadcast_load_epilogue_c2x.hpp │ │ ├── broadcast_load_epilogue_c3x.hpp │ │ ├── scaled_mm_epilogues_c2x.hpp │ │ └── scaled_mm_epilogues_c3x.hpp │ ├── gemm │ │ ├── collective │ │ │ ├── collective_builder.hpp │ │ │ ├── fp8_accumulation.hpp │ │ │ └── sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp │ │ └── 
dispatch_policy.hpp │ ├── torch_utils.hpp │ ├── vllm_collective_builder.cuh │ ├── vllm_custom_types.cuh │ ├── vllm_cutlass_library_extension.py │ ├── vllm_numeric_conversion.cuh │ └── vllm_type_utils.cuh ├── dispatch_utils.h ├── layernorm_kernels.cu ├── layernorm_quant_kernels.cu ├── mamba │ ├── causal_conv1d │ │ ├── causal_conv1d.cu │ │ ├── causal_conv1d.h │ │ └── static_switch.h │ └── mamba_ssm │ │ ├── selective_scan.h │ │ ├── selective_scan_fwd.cu │ │ └── static_switch.h ├── moe │ ├── marlin_moe_wna16 │ │ ├── .gitignore │ │ ├── generate_kernels.py │ │ ├── kernel.h │ │ ├── marlin_template.h │ │ └── ops.cu │ ├── moe_align_sum_kernels.cu │ ├── moe_ops.h │ ├── moe_permute_unpermute_op.cu │ ├── moe_wna16.cu │ ├── moe_wna16_utils.h │ ├── permute_unpermute_kernels │ │ ├── dispatch.h │ │ ├── moe_permute_unpermute_kernel.cu │ │ ├── moe_permute_unpermute_kernel.h │ │ └── moe_permute_unpermute_kernel.inl │ ├── topk_softmax_kernels.cu │ └── torch_bindings.cpp ├── ops.h ├── permute_cols.cu ├── pos_encoding_kernels.cu ├── prepare_inputs │ ├── advance_step.cu │ └── advance_step.cuh ├── quantization │ ├── activation_kernels.cu │ ├── aqlm │ │ └── gemm_kernels.cu │ ├── awq │ │ ├── dequantize.cuh │ │ └── gemm_kernels.cu │ ├── compressed_tensors │ │ └── int8_quant_kernels.cu │ ├── cutlass_w8a8 │ │ ├── Epilogues.md │ │ ├── c3x │ │ │ ├── cutlass_gemm_caller.cuh │ │ │ ├── scaled_mm.cuh │ │ │ ├── scaled_mm_azp_sm90_int8.cu │ │ │ ├── scaled_mm_blockwise_sm100_fp8.cu │ │ │ ├── scaled_mm_blockwise_sm100_fp8_dispatch.cuh │ │ │ ├── scaled_mm_blockwise_sm90_fp8.cu │ │ │ ├── scaled_mm_blockwise_sm90_fp8_dispatch.cuh │ │ │ ├── scaled_mm_helper.hpp │ │ │ ├── scaled_mm_kernels.hpp │ │ │ ├── scaled_mm_sm100_fp8.cu │ │ │ ├── scaled_mm_sm100_fp8_dispatch.cuh │ │ │ ├── scaled_mm_sm90_fp8.cu │ │ │ ├── scaled_mm_sm90_fp8_dispatch.cuh │ │ │ ├── scaled_mm_sm90_int8.cu │ │ │ └── scaled_mm_sm90_int8_dispatch.cuh │ │ ├── moe │ │ │ ├── get_group_starts.cuh │ │ │ ├── grouped_mm_c3x.cu │ │ │ ├── grouped_mm_c3x.cuh │ │ │ └── moe_data.cu │ │ ├── scaled_mm_c2x.cu │ │ ├── scaled_mm_c2x.cuh │ │ ├── scaled_mm_c2x_sm75_dispatch.cuh │ │ ├── scaled_mm_c2x_sm80_dispatch.cuh │ │ ├── scaled_mm_c2x_sm89_fp8_dispatch.cuh │ │ ├── scaled_mm_c2x_sm89_int8_dispatch.cuh │ │ ├── scaled_mm_c3x_sm100.cu │ │ ├── scaled_mm_c3x_sm90.cu │ │ └── scaled_mm_entry.cu │ ├── fp4 │ │ ├── nvfp4_blockwise_moe_kernel.cu │ │ ├── nvfp4_experts_quant.cu │ │ ├── nvfp4_quant_entry.cu │ │ ├── nvfp4_quant_kernels.cu │ │ ├── nvfp4_scaled_mm_entry.cu │ │ └── nvfp4_scaled_mm_kernels.cu │ ├── fp8 │ │ ├── amd │ │ │ └── quant_utils.cuh │ │ ├── common.cu │ │ ├── common.cuh │ │ └── nvidia │ │ │ └── quant_utils.cuh │ ├── fused_kernels │ │ ├── fused_layernorm_dynamic_per_token_quant.cu │ │ ├── layernorm_utils.cuh │ │ └── quant_conversions.cuh │ ├── gguf │ │ ├── dequantize.cuh │ │ ├── ggml-common.h │ │ ├── gguf_kernel.cu │ │ ├── mmq.cuh │ │ ├── mmvq.cuh │ │ ├── moe.cuh │ │ ├── moe_vec.cuh │ │ └── vecdotq.cuh │ ├── gptq │ │ ├── compat.cuh │ │ ├── matrix_view.cuh │ │ ├── q_gemm.cu │ │ ├── qdq_2.cuh │ │ ├── qdq_3.cuh │ │ ├── qdq_4.cuh │ │ ├── qdq_8.cuh │ │ └── qdq_util.cuh │ ├── gptq_allspark │ │ ├── allspark_qgemm_w8a16.cu │ │ ├── allspark_repack.cu │ │ └── allspark_utils.cuh │ ├── gptq_marlin │ │ ├── .gitignore │ │ ├── awq_marlin_repack.cu │ │ ├── dequant.h │ │ ├── generate_kernels.py │ │ ├── gptq_marlin.cu │ │ ├── gptq_marlin_repack.cu │ │ ├── kernel.h │ │ ├── marlin.cuh │ │ ├── marlin_dtypes.cuh │ │ └── marlin_template.h │ ├── machete │ │ ├── Readme.md │ │ ├── generate.py │ │ ├── 
machete_collective_builder.cuh │ │ ├── machete_interleaving_utils.cuh │ │ ├── machete_mainloop.cuh │ │ ├── machete_mm_kernel.cuh │ │ ├── machete_mm_launcher.cuh │ │ ├── machete_prepack_kernel.cuh │ │ ├── machete_prepack_launcher.cuh │ │ ├── machete_prepacked_layout.cuh │ │ └── machete_pytorch.cu │ ├── marlin │ │ ├── dense │ │ │ ├── LICENSE │ │ │ ├── common │ │ │ │ ├── base.h │ │ │ │ └── mem.h │ │ │ └── marlin_cuda_kernel.cu │ │ ├── qqq │ │ │ └── marlin_qqq_gemm_kernel.cu │ │ └── sparse │ │ │ ├── LICENSE │ │ │ ├── common │ │ │ ├── base.h │ │ │ ├── mem.h │ │ │ └── mma.h │ │ │ └── marlin_24_cuda_kernel.cu │ ├── utils.cuh │ └── vectorization.cuh ├── rocm │ ├── attention.cu │ ├── custom.cu │ ├── fused_kernels.cu │ ├── ops.h │ ├── skinny_gemms.cu │ └── torch_bindings.cpp ├── sampler.cu ├── sparse │ └── cutlass │ │ ├── sparse_compressor_c3x.cuh │ │ ├── sparse_scaled_mm_c3x.cu │ │ ├── sparse_scaled_mm_c3x.cuh │ │ └── sparse_scaled_mm_entry.cu ├── torch_bindings.cpp └── type_convert.cuh ├── docker ├── Dockerfile ├── Dockerfile.arm ├── Dockerfile.cpu ├── Dockerfile.hpu ├── Dockerfile.neuron ├── Dockerfile.nightly_torch ├── Dockerfile.ppc64le ├── Dockerfile.rocm ├── Dockerfile.rocm_base ├── Dockerfile.s390x ├── Dockerfile.tpu └── Dockerfile.xpu ├── docs ├── .nav.yml ├── README.md ├── api │ ├── README.md │ └── vllm │ │ └── .meta.yml ├── assets │ ├── contributing │ │ └── dockerfile-stages-dependency.png │ ├── deployment │ │ ├── anything-llm-chat-with-doc.png │ │ ├── anything-llm-chat-without-doc.png │ │ ├── anything-llm-provider.png │ │ ├── anything-llm-upload-doc.png │ │ ├── architecture_helm_deployment.png │ │ ├── chatbox-chat.png │ │ ├── chatbox-settings.png │ │ ├── dify-chat.png │ │ ├── dify-create-chatbot.png │ │ ├── dify-settings.png │ │ ├── open_webui.png │ │ └── streamlit-chat.png │ ├── design │ │ ├── arch_overview │ │ │ ├── entrypoints.excalidraw.png │ │ │ └── llm_engine.excalidraw.png │ │ ├── hierarchy.png │ │ └── v1 │ │ │ ├── metrics │ │ │ ├── intervals-1.png │ │ │ ├── intervals-2.png │ │ │ └── intervals-3.png │ │ │ └── prefix_caching │ │ │ ├── example-time-1.png │ │ │ ├── example-time-3.png │ │ │ ├── example-time-4.png │ │ │ ├── example-time-5.png │ │ │ ├── example-time-6.png │ │ │ ├── example-time-7.png │ │ │ ├── free.png │ │ │ └── overview.png │ ├── features │ │ └── disagg_prefill │ │ │ ├── abstraction.jpg │ │ │ └── overview.jpg │ ├── kernel │ │ ├── k_vecs.png │ │ ├── key.png │ │ ├── logits_vec.png │ │ ├── q_vecs.png │ │ ├── query.png │ │ ├── v_vec.png │ │ └── value.png │ └── logos │ │ ├── vllm-logo-only-light.ico │ │ ├── vllm-logo-only-light.png │ │ ├── vllm-logo-text-dark.png │ │ └── vllm-logo-text-light.png ├── cli │ └── README.md ├── community │ ├── meetups.md │ └── sponsors.md ├── configuration │ ├── README.md │ ├── conserving_memory.md │ ├── engine_args.md │ ├── env_vars.md │ ├── model_resolution.md │ ├── optimization.md │ └── serve_args.md ├── contributing │ ├── README.md │ ├── benchmarks.md │ ├── ci-failures.md │ ├── deprecation_policy.md │ ├── dockerfile │ │ └── dockerfile.md │ ├── model │ │ ├── README.md │ │ ├── basic.md │ │ ├── multimodal.md │ │ ├── registration.md │ │ └── tests.md │ ├── profiling.md │ └── vulnerability_management.md ├── deployment │ ├── docker.md │ ├── frameworks │ │ ├── anything-llm.md │ │ ├── autogen.md │ │ ├── bentoml.md │ │ ├── cerebrium.md │ │ ├── chatbox.md │ │ ├── dify.md │ │ ├── dstack.md │ │ ├── haystack.md │ │ ├── helm.md │ │ ├── litellm.md │ │ ├── lobe-chat.md │ │ ├── lws.md │ │ ├── modal.md │ │ ├── open-webui.md │ │ ├── 
retrieval_augmented_generation.md │ │ ├── skypilot.md │ │ ├── streamlit.md │ │ └── triton.md │ ├── integrations │ │ ├── kserve.md │ │ ├── kubeai.md │ │ ├── llamastack.md │ │ ├── llmaz.md │ │ └── production-stack.md │ ├── k8s.md │ └── nginx.md ├── design │ ├── arch_overview.md │ ├── automatic_prefix_caching.md │ ├── huggingface_integration.md │ ├── kernel │ │ └── paged_attention.md │ ├── mm_processing.md │ ├── multiprocessing.md │ ├── plugin_system.md │ └── v1 │ │ ├── metrics.md │ │ ├── prefix_caching.md │ │ └── torch_compile.md ├── dev-docker │ └── README.md ├── features │ ├── automatic_prefix_caching.md │ ├── compatibility_matrix.md │ ├── disagg_prefill.md │ ├── lora.md │ ├── multimodal_inputs.md │ ├── prompt_embeds.md │ ├── quantization │ │ ├── README.md │ │ ├── auto_awq.md │ │ ├── bitblas.md │ │ ├── bnb.md │ │ ├── fp8.md │ │ ├── gguf.md │ │ ├── gptqmodel.md │ │ ├── int4.md │ │ ├── int8.md │ │ ├── modelopt.md │ │ ├── quantized_kvcache.md │ │ ├── quark.md │ │ ├── supported_hardware.md │ │ └── torchao.md │ ├── reasoning_outputs.md │ ├── spec_decode.md │ ├── structured_outputs.md │ └── tool_calling.md ├── getting_started │ ├── installation │ │ ├── .nav.yml │ │ ├── README.md │ │ ├── ai_accelerator.md │ │ ├── ai_accelerator │ │ │ ├── hpu-gaudi.inc.md │ │ │ ├── neuron.inc.md │ │ │ └── tpu.inc.md │ │ ├── cpu.md │ │ ├── cpu │ │ │ ├── apple.inc.md │ │ │ ├── arm.inc.md │ │ │ ├── build.inc.md │ │ │ ├── s390x.inc.md │ │ │ └── x86.inc.md │ │ ├── device.template.md │ │ ├── gpu.md │ │ ├── gpu │ │ │ ├── cuda.inc.md │ │ │ ├── rocm.inc.md │ │ │ └── xpu.inc.md │ │ └── python_env_setup.inc.md │ └── quickstart.md ├── mkdocs │ ├── hooks │ │ ├── generate_examples.py │ │ ├── remove_announcement.py │ │ └── url_schemes.py │ ├── javascript │ │ └── run_llm_widget.js │ ├── overrides │ │ └── main.html │ └── stylesheets │ │ └── extra.css ├── models │ ├── extensions │ │ ├── fastsafetensor.md │ │ ├── runai_model_streamer.md │ │ └── tensorizer.md │ ├── generative_models.md │ ├── pooling_models.md │ └── supported_models.md ├── serving │ ├── distributed_serving.md │ ├── integrations │ │ ├── langchain.md │ │ └── llamaindex.md │ ├── offline_inference.md │ └── openai_compatible_server.md ├── training │ ├── rlhf.md │ └── trl.md └── usage │ ├── README.md │ ├── faq.md │ ├── metrics.md │ ├── reproducibility.md │ ├── security.md │ ├── troubleshooting.md │ ├── usage_stats.md │ └── v1_guide.md ├── examples ├── offline_inference │ ├── audio_language.py │ ├── automatic_prefix_caching.py │ ├── basic │ │ ├── README.md │ │ ├── basic.py │ │ ├── chat.py │ │ ├── classify.py │ │ ├── embed.py │ │ ├── generate.py │ │ └── score.py │ ├── batch_llm_inference.py │ ├── chat_with_tools.py │ ├── context_extension.py │ ├── data_parallel.py │ ├── disaggregated-prefill-v1 │ │ ├── README.md │ │ ├── decode_example.py │ │ ├── prefill_example.py │ │ └── run.sh │ ├── disaggregated_prefill.py │ ├── eagle.py │ ├── embed_jina_embeddings_v3.py │ ├── embed_matryoshka_fy.py │ ├── encoder_decoder.py │ ├── encoder_decoder_multimodal.py │ ├── llm_engine_example.py │ ├── load_sharded_state.py │ ├── lora_with_quantization_inference.py │ ├── metrics.py │ ├── mistral-small.py │ ├── mlpspeculator.py │ ├── multilora_inference.py │ ├── neuron.py │ ├── neuron_eagle.py │ ├── neuron_int8_quantization.py │ ├── neuron_multimodal.py │ ├── neuron_speculation.py │ ├── openai_batch │ │ ├── README.md │ │ └── openai_example_batch.jsonl │ ├── prefix_caching.py │ ├── prithvi_geospatial_mae.py │ ├── profiling.py │ ├── profiling_tpu │ │ ├── README.md │ │ └── profiling.py │ ├── 
prompt_embed_inference.py │ ├── qwen2_5_omni │ │ ├── README.md │ │ └── only_thinker.py │ ├── qwen_1m.py │ ├── reproducibility.py │ ├── rlhf.py │ ├── rlhf_colocate.py │ ├── rlhf_utils.py │ ├── save_sharded_state.py │ ├── simple_profiling.py │ ├── structured_outputs.py │ ├── torchrun_example.py │ ├── tpu.py │ ├── vision_language.py │ ├── vision_language_embedding.py │ └── vision_language_multi_image.py ├── online_serving │ ├── api_client.py │ ├── chart-helm │ │ ├── .helmignore │ │ ├── Chart.yaml │ │ ├── README.md │ │ ├── ct.yaml │ │ ├── lintconf.yaml │ │ ├── templates │ │ │ ├── _helpers.tpl │ │ │ ├── configmap.yaml │ │ │ ├── custom-objects.yaml │ │ │ ├── deployment.yaml │ │ │ ├── hpa.yaml │ │ │ ├── job.yaml │ │ │ ├── poddisruptionbudget.yaml │ │ │ ├── pvc.yaml │ │ │ ├── secrets.yaml │ │ │ └── service.yaml │ │ ├── values.schema.json │ │ └── values.yaml │ ├── cohere_rerank_client.py │ ├── disaggregated_prefill.sh │ ├── disaggregated_serving │ │ ├── README.md │ │ ├── disagg_proxy_demo.py │ │ └── kv_events.sh │ ├── gradio_openai_chatbot_webserver.py │ ├── gradio_webserver.py │ ├── jinaai_rerank_client.py │ ├── kv_events_subscriber.py │ ├── multi-node-serving.sh │ ├── multi_instance_data_parallel.py │ ├── openai_chat_completion_client.py │ ├── openai_chat_completion_client_for_multimodal.py │ ├── openai_chat_completion_client_with_tools.py │ ├── openai_chat_completion_client_with_tools_required.py │ ├── openai_chat_completion_structured_outputs.py │ ├── openai_chat_completion_structured_outputs_structural_tag.py │ ├── openai_chat_completion_structured_outputs_with_reasoning.py │ ├── openai_chat_completion_tool_calls_with_reasoning.py │ ├── openai_chat_completion_with_reasoning.py │ ├── openai_chat_completion_with_reasoning_streaming.py │ ├── openai_chat_embedding_client_for_multimodal.py │ ├── openai_classification_client.py │ ├── openai_completion_client.py │ ├── openai_cross_encoder_score.py │ ├── openai_embedding_client.py │ ├── openai_embedding_matryoshka_fy.py │ ├── openai_pooling_client.py │ ├── openai_transcription_client.py │ ├── opentelemetry │ │ ├── README.md │ │ └── dummy_client.py │ ├── prometheus_grafana │ │ ├── README.md │ │ ├── docker-compose.yaml │ │ ├── grafana.json │ │ └── prometheus.yaml │ ├── prompt_embed_inference_with_openai_client.py │ ├── ray_serve_deepseek.py │ ├── retrieval_augmented_generation_with_langchain.py │ ├── retrieval_augmented_generation_with_llamaindex.py │ ├── run_cluster.sh │ ├── sagemaker-entrypoint.sh │ ├── streamlit_openai_chatbot_webserver.py │ └── utils.py ├── others │ ├── lmcache │ │ ├── README.md │ │ ├── cpu_offload_lmcache.py │ │ ├── disagg_prefill_lmcache_v0.py │ │ ├── disagg_prefill_lmcache_v1 │ │ │ ├── configs │ │ │ │ ├── lmcache-decoder-config.yaml │ │ │ │ └── lmcache-prefiller-config.yaml │ │ │ ├── disagg_example_nixl.sh │ │ │ ├── disagg_proxy_server.py │ │ │ └── disagg_vllm_launcher.sh │ │ └── kv_cache_sharing_lmcache_v1.py │ ├── logging_configuration.md │ └── tensorize_vllm_model.py ├── pyproject.toml ├── template_alpaca.jinja ├── template_baichuan.jinja ├── template_chatglm.jinja ├── template_chatglm2.jinja ├── template_chatml.jinja ├── template_dse_qwen2_vl.jinja ├── template_falcon.jinja ├── template_falcon_180b.jinja ├── template_inkbot.jinja ├── template_teleflm.jinja ├── template_vlm2vec.jinja ├── tool_chat_template_deepseekr1.jinja ├── tool_chat_template_deepseekv3.jinja ├── tool_chat_template_granite.jinja ├── tool_chat_template_granite_20b_fc.jinja ├── tool_chat_template_hermes.jinja ├── tool_chat_template_internlm2_tool.jinja ├── 
tool_chat_template_llama3.1_json.jinja ├── tool_chat_template_llama3.2_json.jinja ├── tool_chat_template_llama3.2_pythonic.jinja ├── tool_chat_template_llama4_json.jinja ├── tool_chat_template_llama4_pythonic.jinja ├── tool_chat_template_mistral.jinja ├── tool_chat_template_mistral3.jinja ├── tool_chat_template_mistral_parallel.jinja ├── tool_chat_template_phi4_mini.jinja └── tool_chat_template_toolace.jinja ├── find_cuda_init.py ├── format.sh ├── mkdocs.yaml ├── pyproject.toml ├── requirements ├── build.txt ├── common.txt ├── cpu.txt ├── cuda.txt ├── dev.txt ├── docs.txt ├── hpu.txt ├── lint.txt ├── neuron.txt ├── nightly_torch_test.txt ├── rocm-build.txt ├── rocm-test.txt ├── rocm.txt ├── test.in ├── test.txt ├── tpu.txt └── xpu.txt ├── setup.py ├── tests ├── __init__.py ├── async_engine │ ├── __init__.py │ ├── api_server_async_engine.py │ ├── conftest.py │ ├── test_api_server.py │ ├── test_async_llm_engine.py │ └── test_request_tracker.py ├── basic_correctness │ ├── __init__.py │ ├── test_basic_correctness.py │ ├── test_chunked_prefill.py │ ├── test_cpu_offload.py │ ├── test_cumem.py │ └── test_preemption.py ├── benchmarks │ ├── __init__.py │ ├── test_latency_cli.py │ ├── test_serve_cli.py │ └── test_throughput_cli.py ├── build_cython.py ├── compile │ ├── __init__.py │ ├── backend.py │ ├── conftest.py │ ├── piecewise │ │ ├── __init__.py │ │ ├── test_full_cudagraph.py │ │ ├── test_simple.py │ │ └── test_toy_llama.py │ ├── test_async_tp.py │ ├── test_basic_correctness.py │ ├── test_full_graph.py │ ├── test_functionalization.py │ ├── test_fusion.py │ ├── test_pass_manager.py │ ├── test_sequence_parallelism.py │ ├── test_silu_mul_quant_fusion.py │ └── test_wrapper.py ├── config │ ├── test_config.yaml │ └── test_config_with_model.yaml ├── conftest.py ├── core │ ├── __init__.py │ ├── block │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── e2e │ │ │ ├── __init__.py │ │ │ ├── conftest.py │ │ │ ├── test_correctness.py │ │ │ └── test_correctness_sliding_window.py │ │ ├── test_block_manager.py │ │ ├── test_block_table.py │ │ ├── test_common.py │ │ ├── test_cpu_gpu_block_allocator.py │ │ ├── test_naive_block.py │ │ └── test_prefix_caching_block.py │ ├── conftest.py │ ├── test_chunked_prefill_scheduler.py │ ├── test_num_computed_tokens_update.py │ ├── test_scheduler.py │ ├── test_scheduler_encoder_decoder.py │ ├── test_serialization.py │ └── utils.py ├── detokenizer │ ├── __init__.py │ ├── conftest.py │ ├── test_disable_detokenization.py │ ├── test_stop_checker.py │ ├── test_stop_reason.py │ └── test_stop_strings.py ├── distributed │ ├── __init__.py │ ├── conftest.py │ ├── test_ca_buffer_sharing.py │ ├── test_comm_ops.py │ ├── test_custom_all_reduce.py │ ├── test_distributed_oot.py │ ├── test_events.py │ ├── test_expert_parallel.py │ ├── test_multi_node_assignment.py │ ├── test_pipeline_parallel.py │ ├── test_pipeline_partition.py │ ├── test_pp_cudagraph.py │ ├── test_pynccl.py │ ├── test_same_node.py │ ├── test_sequence_parallel.py │ ├── test_shm_broadcast.py │ ├── test_torchrun_example.py │ └── test_utils.py ├── encoder_decoder │ ├── __init__.py │ └── test_e2e_correctness.py ├── engine │ ├── __init__.py │ ├── conftest.py │ ├── test_arg_utils.py │ ├── test_computed_prefix_blocks.py │ ├── test_executor.py │ ├── test_multi_step_output_processor.py │ ├── test_multiproc_workers.py │ ├── test_options.py │ └── test_short_mm_context.py ├── entrypoints │ ├── __init__.py │ ├── conftest.py │ ├── llm │ │ ├── __init__.py │ │ ├── test_accuracy.py │ │ ├── test_chat.py │ │ ├── test_collective_rpc.py │ │ ├── 
test_encode.py │ │ ├── test_generate.py │ │ ├── test_generate_multiple_loras.py │ │ ├── test_gpu_utilization.py │ │ ├── test_guided_generate.py │ │ ├── test_lazy_outlines.py │ │ └── test_prompt_validation.py │ ├── offline_mode │ │ ├── __init__.py │ │ └── test_offline_mode.py │ ├── openai │ │ ├── __init__.py │ │ ├── correctness │ │ │ ├── __init__.py │ │ │ ├── test_lmeval.py │ │ │ ├── test_mteb.py │ │ │ └── test_transcription_api_correctness.py │ │ ├── test_async_tokenization.py │ │ ├── test_audio.py │ │ ├── test_basic.py │ │ ├── test_chat.py │ │ ├── test_chat_echo.py │ │ ├── test_chat_logit_bias_validation.py │ │ ├── test_chat_template.py │ │ ├── test_chat_with_tool_reasoning.py │ │ ├── test_chunked_prompt.py │ │ ├── test_classification.py │ │ ├── test_cli_args.py │ │ ├── test_completion.py │ │ ├── test_completion_with_function_calling.py │ │ ├── test_completion_with_prompt_embeds.py │ │ ├── test_embedding.py │ │ ├── test_embedding_dimensions.py │ │ ├── test_encoder_decoder.py │ │ ├── test_lora_adapters.py │ │ ├── test_lora_resolvers.py │ │ ├── test_metrics.py │ │ ├── test_models.py │ │ ├── test_oot_registration.py │ │ ├── test_openai_schema.py │ │ ├── test_pooling.py │ │ ├── test_prompt_validation.py │ │ ├── test_rerank.py │ │ ├── test_return_tokens_as_ids.py │ │ ├── test_root_path.py │ │ ├── test_run_batch.py │ │ ├── test_score.py │ │ ├── test_serving_chat.py │ │ ├── test_serving_models.py │ │ ├── test_shutdown.py │ │ ├── test_sleep.py │ │ ├── test_tensorizer_entrypoint.py │ │ ├── test_tokenization.py │ │ ├── test_transcription_validation.py │ │ ├── test_truncation.py │ │ ├── test_video.py │ │ ├── test_vision.py │ │ ├── test_vision_embedding.py │ │ └── tool_parsers │ │ │ ├── __init__.py │ │ │ ├── test_llama4_pythonic_tool_parser.py │ │ │ ├── test_pythonic_tool_parser.py │ │ │ └── utils.py │ ├── test_api_server_process_manager.py │ ├── test_chat_utils.py │ └── test_ssl_cert_refresher.py ├── fastsafetensors_loader │ ├── __init__.py │ ├── test_fastsafetensors_loader.py │ └── test_weight_utils.py ├── kernels │ ├── __init__.py │ ├── allclose_default.py │ ├── attention │ │ ├── conftest.py │ │ ├── test_attention.py │ │ ├── test_attention_selector.py │ │ ├── test_blocksparse_attention.py │ │ ├── test_cache.py │ │ ├── test_cascade_flash_attn.py │ │ ├── test_encoder_decoder_attn.py │ │ ├── test_flash_attn.py │ │ ├── test_flashinfer.py │ │ ├── test_flashmla.py │ │ ├── test_lightning_attn.py │ │ ├── test_merge_attn_states.py │ │ ├── test_mha_attn.py │ │ ├── test_mla_decode_cpu.py │ │ ├── test_prefix_prefill.py │ │ ├── test_rocm_attention_selector.py │ │ ├── test_triton_decode_attention.py │ │ └── test_triton_unified_attention.py │ ├── core │ │ ├── test_activation.py │ │ ├── test_fused_quant_layernorm.py │ │ ├── test_layernorm.py │ │ ├── test_opcheck.py │ │ ├── test_permute_cols.py │ │ ├── test_pos_encoding.py │ │ ├── test_rotary_embedding.py │ │ └── test_uva.py │ ├── mamba │ │ ├── test_causal_conv1d.py │ │ ├── test_mamba_mixer2.py │ │ ├── test_mamba_ssm.py │ │ └── test_mamba_ssm_ssd.py │ ├── moe │ │ ├── __init__.py │ │ ├── deepep_utils.py │ │ ├── test_batched_moe.py │ │ ├── test_cutlass_moe.py │ │ ├── test_deepep_deepgemm_moe.py │ │ ├── test_deepep_moe.py │ │ ├── test_moe.py │ │ ├── test_moe_permute_unpermute.py │ │ ├── test_nvfp4_moe.py │ │ ├── test_pplx_moe.py │ │ ├── test_rocm_aiter_topk.py │ │ └── test_triton_moe_ptpc_fp8.py │ ├── quant_utils.py │ ├── quantization │ │ ├── nvfp4_utils.py │ │ ├── test_allspark_gemm.py │ │ ├── test_aqlm.py │ │ ├── test_awq.py │ │ ├── test_awq_triton.py │ │ ├── 
test_block_fp8.py │ │ ├── test_block_int8.py │ │ ├── test_cutlass_2of4_sparse.py │ │ ├── test_cutlass_scaled_mm.py │ │ ├── test_fp8_quant.py │ │ ├── test_ggml.py │ │ ├── test_gguf.py │ │ ├── test_gptq.py │ │ ├── test_int8_kernel.py │ │ ├── test_int8_quant.py │ │ ├── test_machete_mm.py │ │ ├── test_marlin_gemm.py │ │ ├── test_nvfp4_quant.py │ │ ├── test_nvfp4_scaled_mm.py │ │ ├── test_rocm_skinny_gemms.py │ │ └── test_triton_scaled_mm.py │ ├── test_apply_repetition_penalties.py │ ├── test_cutlass_mla_decode.py │ ├── test_fused_quant_activation.py │ ├── test_triton_flash_attention.py │ └── utils.py ├── kv_transfer │ ├── test_disagg.py │ ├── test_lookup_buffer.py │ ├── test_lookup_buffer.sh │ ├── test_module.py │ ├── test_send_recv.py │ └── test_send_recv.sh ├── lora │ ├── __init__.py │ ├── conftest.py │ ├── test_add_lora.py │ ├── test_baichuan.py │ ├── test_chatglm3_tp.py │ ├── test_layers.py │ ├── test_llama_tp.py │ ├── test_lora_allowed_token_ids.py │ ├── test_lora_checkpoints.py │ ├── test_lora_functions.py │ ├── test_lora_huggingface.py │ ├── test_lora_manager.py │ ├── test_minicpmv_tp.py │ ├── test_mixtral.py │ ├── test_peft_helper.py │ ├── test_phi.py │ ├── test_punica_ops.py │ ├── test_quant_model.py │ ├── test_qwen2vl.py │ ├── test_resolver.py │ ├── test_tokenizer_group.py │ ├── test_transfomers_model.py │ ├── test_utils.py │ ├── test_worker.py │ └── utils.py ├── metrics │ ├── __init__.py │ └── test_metrics.py ├── mistral_tool_use │ ├── __init__.py │ ├── conftest.py │ ├── test_mistral_tool_calls.py │ └── utils.py ├── model_executor │ ├── __init__.py │ ├── conftest.py │ ├── test_enabled_custom_ops.py │ ├── test_guided_processors.py │ ├── test_logits_processor.py │ ├── test_model_load_with_params.py │ └── test_weight_utils.py ├── models │ ├── __init__.py │ ├── fixtures │ │ ├── mistral_small_3_chat.json │ │ └── pixtral_chat.json │ ├── language │ │ ├── __init__.py │ │ ├── generation │ │ │ ├── __init__.py │ │ │ ├── test_bart.py │ │ │ ├── test_common.py │ │ │ ├── test_granite.py │ │ │ ├── test_granitemoehybrid.py │ │ │ ├── test_hybrid.py │ │ │ ├── test_mistral.py │ │ │ └── test_phimoe.py │ │ └── pooling │ │ │ ├── __init__.py │ │ │ ├── embed_utils.py │ │ │ ├── mteb_utils.py │ │ │ ├── test_baai.py │ │ │ ├── test_classification.py │ │ │ ├── test_embedding.py │ │ │ ├── test_gritlm.py │ │ │ ├── test_gte.py │ │ │ ├── test_intfloat.py │ │ │ ├── test_jina.py │ │ │ ├── test_nomic.py │ │ │ ├── test_nomic_max_model_len.py │ │ │ ├── test_scoring.py │ │ │ ├── test_snowflake_arctic_embed.py │ │ │ └── test_truncation_control.py │ ├── multimodal │ │ ├── __init__.py │ │ ├── generation │ │ │ ├── __init__.py │ │ │ ├── test_common.py │ │ │ ├── test_florence2.py │ │ │ ├── test_granite_speech.py │ │ │ ├── test_interleaved.py │ │ │ ├── test_mllama.py │ │ │ ├── test_phi4mm.py │ │ │ ├── test_pixtral.py │ │ │ ├── test_qwen2_vl.py │ │ │ ├── test_ultravox.py │ │ │ ├── test_whisper.py │ │ │ └── vlm_utils │ │ │ │ ├── __init__.py │ │ │ │ ├── builders.py │ │ │ │ ├── case_filtering.py │ │ │ │ ├── core.py │ │ │ │ ├── custom_inputs.py │ │ │ │ ├── model_utils.py │ │ │ │ ├── runners.py │ │ │ │ └── types.py │ │ ├── pooling │ │ │ ├── __init__.py │ │ │ ├── test_dse_qwen2_vl.py │ │ │ ├── test_intern_vit.py │ │ │ ├── test_llava_next.py │ │ │ └── test_phi3v.py │ │ └── processing │ │ │ ├── __init__.py │ │ │ ├── test_common.py │ │ │ ├── test_h2ovl.py │ │ │ ├── test_idefics3.py │ │ │ ├── test_internvl.py │ │ │ ├── test_llama4.py │ │ │ ├── test_llava_next.py │ │ │ ├── test_llava_onevision.py │ │ │ ├── test_minimax_vl_01.py │ │ │ ├── 
test_mllama.py │ │ │ ├── test_phi3v.py │ │ │ ├── test_phi4mm.py │ │ │ ├── test_qwen2_vl.py │ │ │ └── test_smolvlm.py │ ├── quantization │ │ ├── __init__.py │ │ ├── test_aqlm.py │ │ ├── test_awq.py │ │ ├── test_bitblas.py │ │ ├── test_fp8.py │ │ ├── test_gguf.py │ │ ├── test_gptq_bitblas.py │ │ ├── test_gptq_marlin.py │ │ ├── test_gptq_marlin_24.py │ │ ├── test_modelopt.py │ │ ├── test_mxfp4.py │ │ └── test_nvfp4.py │ ├── registry.py │ ├── test_initialization.py │ ├── test_oot_registration.py │ ├── test_registry.py │ ├── test_transformers.py │ ├── test_utils.py │ ├── test_vision.py │ └── utils.py ├── mq_llm_engine │ ├── __init__.py │ ├── conftest.py │ ├── test_abort.py │ ├── test_error_handling.py │ ├── test_load.py │ └── utils.py ├── multi_step │ ├── __init__.py │ ├── test_correctness_async_llm.py │ └── test_correctness_llm.py ├── multimodal │ ├── __init__.py │ ├── assets │ │ ├── image1.png │ │ ├── image2.png │ │ └── rgba.png │ ├── test_hasher.py │ ├── test_image.py │ ├── test_inputs.py │ ├── test_processing.py │ ├── test_utils.py │ ├── test_video.py │ └── utils.py ├── neuron │ ├── 1_core │ │ ├── test_activation.py │ │ ├── test_block_table.py │ │ ├── test_cache.py │ │ ├── test_layernorm.py │ │ ├── test_logits_processor.py │ │ ├── test_neuron_model_runner.py │ │ ├── test_neuron_quant.py │ │ ├── test_prefix_prefill.py │ │ └── test_rotary_embedding.py │ └── 2_core │ │ ├── test_comm_ops.py │ │ ├── test_eagle.py │ │ ├── test_mistral.py │ │ └── test_multi_lora.py ├── plugins │ ├── lora_resolvers │ │ ├── __init__.py │ │ └── test_filesystem_resolver.py │ ├── vllm_add_dummy_model │ │ ├── setup.py │ │ └── vllm_add_dummy_model │ │ │ ├── __init__.py │ │ │ ├── my_gemma_embedding.py │ │ │ ├── my_llava.py │ │ │ └── my_opt.py │ └── vllm_add_dummy_platform │ │ ├── setup.py │ │ └── vllm_add_dummy_platform │ │ ├── __init__.py │ │ ├── dummy_attention_backend.py │ │ └── dummy_platform.py ├── plugins_tests │ ├── conftest.py │ ├── test_platform_plugins.py │ └── test_scheduler_plugins.py ├── prefix_caching │ ├── __init__.py │ ├── test_disable_sliding_window.py │ └── test_prefix_caching.py ├── prompt_adapter │ ├── test_bloom.py │ ├── test_multi_adapter_inference.py │ └── test_pa_lora.py ├── prompts │ ├── example.txt │ └── summary.txt ├── quantization │ ├── __init__.py │ ├── test_auto_round.py │ ├── test_bitsandbytes.py │ ├── test_compressed_tensors.py │ ├── test_configs.py │ ├── test_cpu_offload.py │ ├── test_experts_int8.py │ ├── test_fp8.py │ ├── test_gptq_dynamic.py │ ├── test_ipex_quant.py │ ├── test_lm_head.py │ ├── test_ptpc_fp8.py │ ├── test_quark.py │ ├── test_register_quantization_config.py │ ├── test_torchao.py │ └── utils.py ├── reasoning │ ├── __init__.py │ ├── test_deepseekr1_reasoning_parser.py │ ├── test_granite_reasoning_parser.py │ ├── test_qwen3_reasoning_parser.py │ └── utils.py ├── runai_model_streamer_test │ ├── __init__.py │ ├── test_runai_model_streamer_loader.py │ └── test_weight_utils.py ├── samplers │ ├── __init__.py │ ├── test_beam_search.py │ ├── test_ignore_eos.py │ ├── test_logits_processor.py │ ├── test_logprobs.py │ ├── test_no_bad_words.py │ ├── test_ranks.py │ ├── test_rejection_sampler.py │ ├── test_sampler.py │ ├── test_seeded_generate.py │ └── test_typical_acceptance_sampler.py ├── spec_decode │ ├── __init__.py │ ├── conftest.py │ ├── e2e │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── test_compatibility.py │ │ ├── test_eagle_correctness.py │ │ ├── test_integration.py │ │ ├── test_integration_dist_tp2.py │ │ ├── test_integration_dist_tp4.py │ │ ├── test_logprobs.py │ │ ├── 
test_medusa_correctness.py │ │ ├── test_mlp_correctness.py │ │ ├── test_mtp_correctness.py │ │ ├── test_multistep_correctness.py │ │ ├── test_ngram_correctness.py │ │ └── test_seed.py │ ├── test_batch_expansion.py │ ├── test_dynamic_spec_decode.py │ ├── test_memory_usage.py │ ├── test_metrics.py │ ├── test_multi_step_worker.py │ ├── test_ngram_worker.py │ ├── test_scorer.py │ ├── test_spec_decode_worker.py │ ├── test_utils.py │ └── utils.py ├── standalone_tests │ ├── lazy_imports.py │ └── python_only_compile.sh ├── system_messages │ └── sonnet3.5_nov2024.txt ├── tensorizer_loader │ ├── __init__.py │ ├── conftest.py │ └── test_tensorizer.py ├── test_cache_block_hashing.py ├── test_config.py ├── test_embedded_commit.py ├── test_inputs.py ├── test_logger.py ├── test_outputs.py ├── test_regression.py ├── test_sampling_params.py ├── test_scalartype.py ├── test_seed_behavior.py ├── test_sequence.py ├── test_sharded_state_loader.py ├── test_triton_utils.py ├── test_utils.py ├── test_version.py ├── test_vllm_port.py ├── tokenization │ ├── __init__.py │ ├── test_cached_tokenizer.py │ ├── test_detokenize.py │ ├── test_get_eos.py │ ├── test_mistral_tokenizer.py │ ├── test_tokenizer.py │ ├── test_tokenizer_group.py │ └── test_tokenizer_registry.py ├── tool_use │ ├── __init__.py │ ├── conftest.py │ ├── test_chat_completion_request_validations.py │ ├── test_chat_completions.py │ ├── test_jamba_tool_parser.py │ ├── test_parallel_tool_calls.py │ ├── test_tool_calls.py │ ├── test_tool_choice_required.py │ └── utils.py ├── tpu │ ├── __init__.py │ ├── lora │ │ ├── __init__.py │ │ └── test_lora.py │ ├── test_compilation.py │ ├── test_custom_dispatcher.py │ ├── test_moe_pallas.py │ └── test_quantization_accuracy.py ├── tracing │ ├── __init__.py │ └── test_tracing.py ├── utils.py ├── v1 │ ├── __init__.py │ ├── core │ │ ├── test_kv_cache_utils.py │ │ ├── test_prefix_caching.py │ │ ├── test_scheduler.py │ │ ├── test_scheduler_e2e.py │ │ └── test_specialized_manager.py │ ├── e2e │ │ ├── __init__.py │ │ ├── test_cascade_attention.py │ │ ├── test_correctness_sliding_window.py │ │ └── test_spec_decode.py │ ├── engine │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── test_async_llm.py │ │ ├── test_engine_args.py │ │ ├── test_engine_core.py │ │ ├── test_engine_core_client.py │ │ ├── test_llm_engine.py │ │ ├── test_output_processor.py │ │ └── utils.py │ ├── entrypoints │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── llm │ │ │ ├── __init__.py │ │ │ └── test_struct_output_generate.py │ │ └── openai │ │ │ ├── test_chat_completion.py │ │ │ ├── test_completion.py │ │ │ └── test_multi_api_servers.py │ ├── kv_connector │ │ ├── nixl_integration │ │ │ ├── run_accuracy_test.sh │ │ │ ├── run_edge_case_test.sh │ │ │ ├── test_accuracy.py │ │ │ ├── test_edge_cases.py │ │ │ └── toy_proxy_server.py │ │ └── unit │ │ │ ├── __init__.py │ │ │ ├── test_multi_connector.py │ │ │ ├── test_nixl_connector.py │ │ │ ├── test_remote_decode_lifecycle.py │ │ │ ├── test_remote_prefill_lifecycle.py │ │ │ └── utils.py │ ├── metrics │ │ └── test_ray_metrics.py │ ├── sample │ │ ├── __init__.py │ │ ├── test_logprobs.py │ │ ├── test_logprobs_e2e.py │ │ ├── test_rejection_sampler.py │ │ ├── test_sampler.py │ │ ├── test_sampling_params_e2e.py │ │ ├── test_topk_topp_sampler.py │ │ └── utils.py │ ├── shutdown │ │ ├── test_delete.py │ │ ├── test_forward_error.py │ │ ├── test_processor_error.py │ │ ├── test_startup_error.py │ │ └── utils.py │ ├── spec_decode │ │ ├── test_eagle.py │ │ ├── test_max_len.py │ │ └── test_ngram.py │ ├── structured_output │ │ ├── 
__init__.py │ │ └── test_utils.py │ ├── test_async_llm_dp.py │ ├── test_metrics_reader.py │ ├── test_oracle.py │ ├── test_serial_utils.py │ ├── test_utils.py │ ├── tpu │ │ ├── __init__.py │ │ ├── test_basic.py │ │ ├── test_mha_attn.py │ │ ├── test_multimodal.py │ │ ├── test_pallas.py │ │ ├── test_perf.py │ │ ├── test_sampler.py │ │ ├── test_spmd_model_weight_loading.py │ │ ├── test_topk_topp_sampler.py │ │ ├── test_tpu_qkv_linear.py │ │ └── worker │ │ │ ├── __init__.py │ │ │ └── test_tpu_model_runner.py │ └── worker │ │ ├── __init__.py │ │ ├── test_gpu_input_batch.py │ │ └── test_gpu_model_runner.py ├── vllm_test_utils │ ├── setup.py │ └── vllm_test_utils │ │ ├── __init__.py │ │ ├── blame.py │ │ └── monitor.py ├── weight_loading │ ├── models-large.txt │ ├── models.txt │ ├── run_model_weight_loading_test.sh │ └── test_weight_loading.py └── worker │ ├── __init__.py │ ├── conftest.py │ ├── test_encoder_decoder_model_runner.py │ ├── test_model_input.py │ ├── test_model_runner.py │ ├── test_profile.py │ └── test_swap.py ├── tools ├── check_repo.sh ├── check_spdx_header.py ├── check_triton_import.py ├── enforce_regex_import.py ├── ep_kernels │ ├── README.md │ ├── install_python_libraries.sh │ ├── install_system_drivers.sh │ └── install_system_libraries.sh ├── install_nixl.sh ├── mypy.sh ├── png-lint.sh ├── profiler │ ├── print_layerwise_table.py │ └── visualize_layerwise_profile.py ├── report_build_time_ninja.py ├── shellcheck.sh └── update-dockerfile-graph.sh ├── use_existing_torch.py └── vllm ├── __init__.py ├── _custom_ops.py ├── _ipex_ops.py ├── adapter_commons ├── __init__.py ├── layers.py ├── models.py ├── request.py ├── utils.py └── worker_manager.py ├── assets ├── __init__.py ├── audio.py ├── base.py ├── image.py └── video.py ├── attention ├── __init__.py ├── backends │ ├── __init__.py │ ├── abstract.py │ ├── blocksparse_attn.py │ ├── cpu_mla.py │ ├── dual_chunk_flash_attn.py │ ├── flash_attn.py │ ├── flashinfer.py │ ├── flashmla.py │ ├── hpu_attn.py │ ├── ipex_attn.py │ ├── mla │ │ ├── __init__.py │ │ └── common.py │ ├── pallas.py │ ├── placeholder_attn.py │ ├── rocm_aiter_mla.py │ ├── rocm_flash_attn.py │ ├── torch_sdpa.py │ ├── triton_mla.py │ ├── utils.py │ └── xformers.py ├── layer.py ├── ops │ ├── __init__.py │ ├── blocksparse_attention │ │ ├── __init__.py │ │ ├── blocksparse_attention_kernel.py │ │ ├── interface.py │ │ └── utils.py │ ├── chunked_prefill_paged_decode.py │ ├── flashmla.py │ ├── hpu_paged_attn.py │ ├── ipex_attn.py │ ├── merge_attn_states.py │ ├── nki_flash_attn.py │ ├── paged_attn.py │ ├── prefix_prefill.py │ ├── rocm_aiter_mla.py │ ├── rocm_aiter_paged_attn.py │ ├── triton_decode_attention.py │ ├── triton_flash_attention.py │ ├── triton_merge_attn_states.py │ └── triton_unified_attention.py ├── selector.py └── utils │ └── fa_utils.py ├── beam_search.py ├── benchmarks ├── __init__.py ├── datasets.py ├── endpoint_request_func.py ├── latency.py ├── serve.py ├── throughput.py └── utils.py ├── collect_env.py ├── compilation ├── __init__.py ├── activation_quant_fusion.py ├── backends.py ├── base_piecewise_backend.py ├── collective_fusion.py ├── compiler_interface.py ├── counter.py ├── cuda_piecewise_backend.py ├── decorators.py ├── fix_functionalization.py ├── fusion.py ├── fx_utils.py ├── inductor_pass.py ├── monitor.py ├── multi_output_match.py ├── noop_elimination.py ├── pass_manager.py ├── sequence_parallelism.py ├── torch25_custom_graph_pass.py ├── vllm_inductor_pass.py └── wrapper.py ├── config.py ├── connections.py ├── core ├── __init__.py ├── block │ ├── 
__init__.py │ ├── block_table.py │ ├── common.py │ ├── cpu_gpu_block_allocator.py │ ├── interfaces.py │ ├── naive_block.py │ ├── prefix_caching_block.py │ └── utils.py ├── block_manager.py ├── evictor.py ├── interfaces.py ├── placeholder_block_space_manager.py └── scheduler.py ├── device_allocator ├── __init__.py └── cumem.py ├── distributed ├── __init__.py ├── communication_op.py ├── device_communicators │ ├── __init__.py │ ├── all2all.py │ ├── base_device_communicator.py │ ├── cpu_communicator.py │ ├── cuda_communicator.py │ ├── cuda_wrapper.py │ ├── custom_all_reduce.py │ ├── custom_all_reduce_utils.py │ ├── hpu_communicator.py │ ├── neuron_communicator.py │ ├── pynccl.py │ ├── pynccl_wrapper.py │ ├── shm_broadcast.py │ ├── tpu_communicator.py │ └── xpu_communicator.py ├── kv_events.py ├── kv_transfer │ ├── README.md │ ├── __init__.py │ ├── disagg_prefill_workflow.jpg │ ├── kv_connector │ │ ├── __init__.py │ │ ├── base.py │ │ ├── factory.py │ │ ├── lmcache_connector.py │ │ ├── mooncake_store_connector.py │ │ ├── simple_connector.py │ │ ├── utils.py │ │ └── v1 │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── lmcache_connector.py │ │ │ ├── multi_connector.py │ │ │ ├── nixl_connector.py │ │ │ └── shared_storage_connector.py │ ├── kv_connector_agent.py │ ├── kv_lookup_buffer │ │ ├── __init__.py │ │ ├── base.py │ │ ├── mooncake_store.py │ │ └── simple_buffer.py │ ├── kv_pipe │ │ ├── __init__.py │ │ ├── base.py │ │ ├── mooncake_pipe.py │ │ └── pynccl_pipe.py │ └── kv_transfer_state.py ├── parallel_state.py ├── tpu_distributed_utils.py └── utils.py ├── engine ├── __init__.py ├── arg_utils.py ├── async_llm_engine.py ├── async_timeout.py ├── llm_engine.py ├── metrics.py ├── metrics_types.py ├── multiprocessing │ ├── __init__.py │ ├── client.py │ └── engine.py ├── output_processor │ ├── __init__.py │ ├── interfaces.py │ ├── multi_step.py │ ├── single_step.py │ ├── stop_checker.py │ └── util.py └── protocol.py ├── entrypoints ├── __init__.py ├── api_server.py ├── chat_utils.py ├── cli │ ├── __init__.py │ ├── benchmark │ │ ├── __init__.py │ │ ├── base.py │ │ ├── latency.py │ │ ├── main.py │ │ ├── serve.py │ │ └── throughput.py │ ├── collect_env.py │ ├── main.py │ ├── openai.py │ ├── run_batch.py │ ├── serve.py │ └── types.py ├── launcher.py ├── llm.py ├── logger.py ├── openai │ ├── __init__.py │ ├── api_server.py │ ├── cli_args.py │ ├── logits_processors.py │ ├── protocol.py │ ├── run_batch.py │ ├── serving_chat.py │ ├── serving_classification.py │ ├── serving_completion.py │ ├── serving_embedding.py │ ├── serving_engine.py │ ├── serving_models.py │ ├── serving_pooling.py │ ├── serving_score.py │ ├── serving_tokenization.py │ ├── serving_transcription.py │ └── tool_parsers │ │ ├── __init__.py │ │ ├── abstract_tool_parser.py │ │ ├── deepseekv3_tool_parser.py │ │ ├── granite_20b_fc_tool_parser.py │ │ ├── granite_tool_parser.py │ │ ├── hermes_tool_parser.py │ │ ├── internlm2_tool_parser.py │ │ ├── jamba_tool_parser.py │ │ ├── llama4_pythonic_tool_parser.py │ │ ├── llama_tool_parser.py │ │ ├── mistral_tool_parser.py │ │ ├── phi4mini_tool_parser.py │ │ ├── pythonic_tool_parser.py │ │ └── utils.py ├── score_utils.py ├── ssl.py └── utils.py ├── env_override.py ├── envs.py ├── executor ├── __init__.py ├── executor_base.py ├── mp_distributed_executor.py ├── msgspec_utils.py ├── multiproc_worker_utils.py ├── ray_distributed_executor.py ├── ray_utils.py └── uniproc_executor.py ├── forward_context.py ├── inputs ├── __init__.py ├── data.py ├── parse.py ├── preprocess.py └── registry.py ├── jsontree.py ├── 
logger.py ├── logging_utils ├── __init__.py ├── dump_input.py └── formatter.py ├── logits_process.py ├── lora ├── __init__.py ├── fully_sharded_layers.py ├── layers.py ├── lora.py ├── models.py ├── ops │ ├── __init__.py │ ├── torch_ops │ │ ├── __init__.py │ │ └── lora_ops.py │ ├── triton_ops │ │ ├── __init__.py │ │ ├── kernel_utils.py │ │ ├── lora_expand_op.py │ │ ├── lora_kernel_metadata.py │ │ ├── lora_shrink_op.py │ │ └── utils.py │ └── xla_ops │ │ ├── __init__.py │ │ └── lora_ops.py ├── peft_helper.py ├── punica_wrapper │ ├── __init__.py │ ├── punica_base.py │ ├── punica_cpu.py │ ├── punica_gpu.py │ ├── punica_hpu.py │ ├── punica_selector.py │ ├── punica_tpu.py │ └── utils.py ├── request.py ├── resolver.py ├── utils.py └── worker_manager.py ├── model_executor ├── __init__.py ├── custom_op.py ├── guided_decoding │ ├── __init__.py │ ├── guidance_decoding.py │ ├── guidance_logits_processors.py │ ├── guided_fields.py │ ├── lm_format_enforcer_decoding.py │ ├── outlines_decoding.py │ ├── outlines_logits_processors.py │ ├── utils.py │ └── xgrammar_decoding.py ├── layers │ ├── __init__.py │ ├── activation.py │ ├── fused_moe │ │ ├── __init__.py │ │ ├── batched_deep_gemm_moe.py │ │ ├── batched_triton_or_deep_gemm_moe.py │ │ ├── configs │ │ │ ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json │ │ │ ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=128,N=1024,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=128,N=192,device_name=NVIDIA_H20.json │ │ │ ├── E=128,N=192,device_name=NVIDIA_H200.json │ │ │ ├── E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=128,N=384,device_name=NVIDIA_H20.json │ │ │ ├── E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=128,N=384,device_name=NVIDIA_H200.json │ │ │ ├── E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=128,N=768,device_name=NVIDIA_H20.json │ │ │ ├── E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=128,N=768,device_name=NVIDIA_H200.json │ │ │ ├── E=128,N=96,device_name=NVIDIA_H20.json │ │ │ ├── E=16,N=1024,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=16,N=1024,device_name=NVIDIA_H100.json │ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── 
E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json │ │ │ ├── E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json │ │ │ ├── E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json │ │ │ ├── E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json │ │ │ ├── E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json │ │ │ ├── E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json │ │ │ ├── E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json │ │ │ ├── E=60,N=1408,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=60,N=176,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=60,N=352,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=60,N=704,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json │ │ │ ├── E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=1280,device_name=NVIDIA_H200.json │ │ │ ├── 
E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=2560,device_name=NVIDIA_H200.json │ │ │ ├── E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=320,device_name=NVIDIA_H200.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_H200.json │ │ │ ├── E=64,N=896,device_name=NVIDIA_H20.json │ │ │ ├── E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=14336,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=14336,device_name=AMD_Instinct_MI325X.json │ │ │ ├── E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=14336,device_name=NVIDIA_H200.json │ │ │ ├── E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=16384,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=16384,device_name=AMD_Instinct_MI325X.json │ │ │ ├── E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=1792,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=1792,device_name=AMD_Instinct_MI325X.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_H200.json │ │ │ ├── E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=2048,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=2048,device_name=AMD_Instinct_MI325X.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H200.json │ │ │ ├── E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=3584,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=3584,device_name=AMD_Instinct_MI325X.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H200.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_L40S.json │ │ │ ├── 
E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=4096,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=4096,device_name=AMD_Instinct_MI325X.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H200.json │ │ │ ├── E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=7168,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=7168,device_name=AMD_Instinct_MI325X.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H200.json │ │ │ ├── E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=8192,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=8192,device_name=AMD_Instinct_MI325X.json │ │ │ ├── E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ └── README │ │ ├── cutlass_moe.py │ │ ├── deep_gemm_moe.py │ │ ├── deepep_ht_prepare_finalize.py │ │ ├── deepep_ll_prepare_finalize.py │ │ ├── fused_batched_moe.py │ │ ├── fused_marlin_moe.py │ │ ├── fused_moe.py │ │ ├── layer.py │ │ ├── modular_kernel.py │ │ ├── moe_align_block_size.py │ │ ├── moe_pallas.py │ │ ├── moe_permute_unpermute.py │ │ ├── moe_torch_iterative.py │ │ ├── pplx_prepare_finalize.py │ │ ├── prepare_finalize.py │ │ ├── rocm_aiter_fused_moe.py │ │ ├── triton_deep_gemm_moe.py │ │ └── utils.py │ ├── layernorm.py │ ├── lightning_attn.py │ ├── linear.py │ ├── logits_processor.py │ ├── mamba │ │ ├── __init__.py │ │ ├── mamba2_metadata.py │ │ ├── mamba_mixer.py │ │ ├── mamba_mixer2.py │ │ └── ops │ │ │ ├── __init__.py │ │ │ ├── causal_conv1d.py │ │ │ ├── mamba_ssm.py │ │ │ ├── ssd_bmm.py │ │ │ ├── ssd_chunk_scan.py │ │ │ ├── ssd_chunk_state.py │ │ │ ├── ssd_combined.py │ │ │ └── ssd_state_passing.py │ ├── pooler.py │ ├── quantization │ │ ├── __init__.py │ │ ├── aqlm.py │ │ ├── auto_round.py │ │ ├── awq.py │ │ ├── awq_marlin.py │ │ ├── awq_triton.py │ │ ├── base_config.py │ │ ├── bitblas.py │ │ ├── bitsandbytes.py │ │ ├── compressed_tensors │ │ │ ├── __init__.py │ │ │ ├── compressed_tensors.py │ │ │ ├── compressed_tensors_moe.py │ │ │ ├── schemes │ │ │ │ ├── __init__.py │ │ │ │ ├── compressed_tensors_24.py │ │ │ │ ├── compressed_tensors_scheme.py │ │ │ │ ├── compressed_tensors_w4a16_24.py │ │ │ │ ├── compressed_tensors_w4a16_nvfp4.py │ │ │ │ ├── compressed_tensors_w8a16_fp8.py │ │ │ │ ├── compressed_tensors_w8a8_fp8.py │ │ │ │ ├── compressed_tensors_w8a8_int8.py │ │ │ │ └── compressed_tensors_wNa16.py │ │ │ ├── triton_scaled_mm.py │ │ │ └── utils.py │ │ ├── deepspeedfp.py │ │ ├── experts_int8.py │ │ ├── fbgemm_fp8.py │ │ ├── fp8.py │ │ ├── gguf.py │ │ ├── gptq.py │ │ ├── gptq_bitblas.py │ │ ├── gptq_marlin.py │ │ ├── gptq_marlin_24.py │ │ ├── hqq_marlin.py │ │ ├── ipex_quant.py │ │ ├── kernels │ │ │ ├── __init__.py │ │ │ ├── mixed_precision │ │ │ │ ├── 
MPLinearKernel.py │ │ │ │ ├── __init__.py │ │ │ │ ├── allspark.py │ │ │ │ ├── bitblas.py │ │ │ │ ├── exllama.py │ │ │ │ ├── machete.py │ │ │ │ └── marlin.py │ │ │ └── scaled_mm │ │ │ │ ├── ScaledMMLinearKernel.py │ │ │ │ ├── __init__.py │ │ │ │ ├── aiter.py │ │ │ │ ├── cutlass.py │ │ │ │ ├── triton.py │ │ │ │ └── xla.py │ │ ├── kv_cache.py │ │ ├── marlin.py │ │ ├── modelopt.py │ │ ├── moe_wna16.py │ │ ├── neuron_quant.py │ │ ├── ptpc_fp8.py │ │ ├── qqq.py │ │ ├── quark │ │ │ ├── __init__.py │ │ │ ├── quark.py │ │ │ ├── quark_moe.py │ │ │ ├── schemes │ │ │ │ ├── __init__.py │ │ │ │ ├── quark_scheme.py │ │ │ │ ├── quark_w4a4_mxfp4.py │ │ │ │ ├── quark_w8a8_fp8.py │ │ │ │ └── quark_w8a8_int8.py │ │ │ └── utils.py │ │ ├── schema.py │ │ ├── torchao.py │ │ ├── tpu_int8.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── allspark_utils.py │ │ │ ├── bitblas_utils.py │ │ │ ├── configs │ │ │ ├── N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── 
N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── 
N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── 
N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── 
N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── 
N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ └── N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── fp8_utils.py │ │ │ ├── gptq_utils.py │ │ │ ├── int8_utils.py │ │ │ ├── layer_utils.py │ │ │ ├── machete_utils.py │ │ │ ├── marlin_utils.py │ │ │ ├── marlin_utils_fp4.py │ │ │ ├── marlin_utils_fp8.py │ │ │ ├── marlin_utils_test.py │ │ │ ├── marlin_utils_test_24.py │ │ │ ├── marlin_utils_test_qqq.py │ │ │ ├── mxfp4_utils.py │ │ │ ├── nvfp4_emulation_utils.py │ │ │ ├── quant_utils.py │ │ │ └── w8a8_utils.py │ ├── rejection_sampler.py │ ├── resampler.py │ ├── rotary_embedding.py │ ├── sampler.py │ ├── spec_decode_base_sampler.py │ ├── typical_acceptance_sampler.py │ ├── utils.py │ └── vocab_parallel_embedding.py ├── model_loader │ ├── __init__.py │ ├── base_loader.py │ ├── bitsandbytes_loader.py │ ├── default_loader.py │ ├── dummy_loader.py │ ├── gguf_loader.py │ ├── neuron.py │ ├── neuronx_distributed.py │ ├── runai_streamer_loader.py │ ├── sharded_state_loader.py │ ├── tensorizer.py │ ├── tensorizer_loader.py │ ├── tpu.py │ ├── utils.py │ └── weight_utils.py ├── models │ ├── 
__init__.py │ ├── adapters.py │ ├── aimv2.py │ ├── arctic.py │ ├── aria.py │ ├── aya_vision.py │ ├── baichuan.py │ ├── bamba.py │ ├── bart.py │ ├── bert.py │ ├── bert_with_rope.py │ ├── blip.py │ ├── blip2.py │ ├── bloom.py │ ├── chameleon.py │ ├── chatglm.py │ ├── clip.py │ ├── commandr.py │ ├── constant_size_cache.py │ ├── dbrx.py │ ├── deepseek.py │ ├── deepseek_mtp.py │ ├── deepseek_v2.py │ ├── deepseek_vl2.py │ ├── eagle.py │ ├── exaone.py │ ├── fairseq2_llama.py │ ├── falcon.py │ ├── falcon_h1.py │ ├── florence2.py │ ├── fuyu.py │ ├── gemma.py │ ├── gemma2.py │ ├── gemma3.py │ ├── gemma3_mm.py │ ├── glm.py │ ├── glm4.py │ ├── glm4v.py │ ├── gpt2.py │ ├── gpt_bigcode.py │ ├── gpt_j.py │ ├── gpt_neox.py │ ├── granite.py │ ├── granite_speech.py │ ├── granitemoe.py │ ├── granitemoehybrid.py │ ├── granitemoeshared.py │ ├── gritlm.py │ ├── grok1.py │ ├── h2ovl.py │ ├── idefics2_vision_model.py │ ├── idefics3.py │ ├── interfaces.py │ ├── interfaces_base.py │ ├── intern_vit.py │ ├── internlm2.py │ ├── internlm2_ve.py │ ├── internvl.py │ ├── jais.py │ ├── jamba.py │ ├── kimi_vl.py │ ├── llama.py │ ├── llama4.py │ ├── llama_eagle.py │ ├── llama_eagle3.py │ ├── llava.py │ ├── llava_next.py │ ├── llava_next_video.py │ ├── llava_onevision.py │ ├── mamba.py │ ├── mamba2.py │ ├── mamba_cache.py │ ├── medusa.py │ ├── mimo.py │ ├── mimo_mtp.py │ ├── minicpm.py │ ├── minicpm3.py │ ├── minicpm_eagle.py │ ├── minicpmo.py │ ├── minicpmv.py │ ├── minimax_cache.py │ ├── minimax_text_01.py │ ├── minimax_vl_01.py │ ├── mistral3.py │ ├── mixtral.py │ ├── mixtral_quant.py │ ├── mllama.py │ ├── mllama4.py │ ├── mlp_speculator.py │ ├── modernbert.py │ ├── module_mapping.py │ ├── molmo.py │ ├── moonvit.py │ ├── mpt.py │ ├── nemotron.py │ ├── nemotron_nas.py │ ├── nvlm_d.py │ ├── olmo.py │ ├── olmo2.py │ ├── olmoe.py │ ├── opt.py │ ├── orion.py │ ├── ovis.py │ ├── paligemma.py │ ├── persimmon.py │ ├── phi.py │ ├── phi3.py │ ├── phi3_small.py │ ├── phi3v.py │ ├── phi4mm.py │ ├── phi4mm_audio.py │ ├── phi4mm_utils.py │ ├── phimoe.py │ ├── pixtral.py │ ├── plamo2.py │ ├── prithvi_geospatial_mae.py │ ├── qwen.py │ ├── qwen2.py │ ├── qwen2_5_omni_thinker.py │ ├── qwen2_5_vl.py │ ├── qwen2_audio.py │ ├── qwen2_moe.py │ ├── qwen2_rm.py │ ├── qwen2_vl.py │ ├── qwen3.py │ ├── qwen3_moe.py │ ├── qwen_vl.py │ ├── registry.py │ ├── roberta.py │ ├── siglip.py │ ├── skyworkr1v.py │ ├── smolvlm.py │ ├── solar.py │ ├── stablelm.py │ ├── starcoder2.py │ ├── tarsier.py │ ├── telechat2.py │ ├── teleflm.py │ ├── transformers.py │ ├── ultravox.py │ ├── utils.py │ ├── vision.py │ ├── whisper.py │ └── zamba2.py ├── parameter.py ├── pooling_metadata.py ├── sampling_metadata.py └── utils.py ├── multimodal ├── __init__.py ├── audio.py ├── base.py ├── hasher.py ├── image.py ├── inputs.py ├── parse.py ├── processing.py ├── profiling.py ├── registry.py ├── utils.py └── video.py ├── outputs.py ├── platforms ├── __init__.py ├── cpu.py ├── cuda.py ├── hpu.py ├── interface.py ├── neuron.py ├── rocm.py ├── tpu.py └── xpu.py ├── plugins ├── __init__.py └── lora_resolvers │ ├── README.md │ ├── __init__.py │ └── filesystem_resolver.py ├── pooling_params.py ├── profiler ├── __init__.py ├── layerwise_profile.py └── utils.py ├── prompt_adapter ├── __init__.py ├── layers.py ├── models.py ├── request.py ├── utils.py └── worker_manager.py ├── py.typed ├── reasoning ├── __init__.py ├── abs_reasoning_parsers.py ├── deepseek_r1_reasoning_parser.py ├── granite_reasoning_parser.py └── qwen3_reasoning_parser.py ├── sampling_params.py ├── scalar_type.py ├── 
scripts.py ├── sequence.py ├── spec_decode ├── __init__.py ├── batch_expansion.py ├── draft_model_runner.py ├── interfaces.py ├── medusa_worker.py ├── metrics.py ├── mlp_speculator_worker.py ├── mqa_scorer.py ├── multi_step_worker.py ├── ngram_worker.py ├── proposer_worker_base.py ├── smaller_tp_proposer_worker.py ├── spec_decode_worker.py ├── target_model_runner.py ├── top1_proposer.py └── util.py ├── test_utils.py ├── third_party ├── __init__.py └── pynvml.py ├── tracing.py ├── transformers_utils ├── __init__.py ├── chat_templates │ ├── __init__.py │ ├── registry.py │ ├── template_basic.jinja │ ├── template_blip2.jinja │ ├── template_chatml.jinja │ ├── template_deepseek_vl2.jinja │ └── template_fuyu.jinja ├── config.py ├── configs │ ├── __init__.py │ ├── arctic.py │ ├── chatglm.py │ ├── cohere2.py │ ├── dbrx.py │ ├── deepseek_vl2.py │ ├── eagle.py │ ├── exaone.py │ ├── falcon.py │ ├── h2ovl.py │ ├── internvl.py │ ├── jais.py │ ├── kimi_vl.py │ ├── medusa.py │ ├── minimax_text_01.py │ ├── minimax_vl_01.py │ ├── mllama.py │ ├── mlp_speculator.py │ ├── moonvit.py │ ├── mpt.py │ ├── nemotron.py │ ├── nvlm_d.py │ ├── ovis.py │ ├── skyworkr1v.py │ ├── solar.py │ ├── telechat2.py │ └── ultravox.py ├── detokenizer.py ├── detokenizer_utils.py ├── processor.py ├── processors │ ├── __init__.py │ ├── deepseek_vl2.py │ └── ovis.py ├── s3_utils.py ├── tokenizer.py ├── tokenizer_base.py ├── tokenizer_group.py ├── tokenizers │ ├── __init__.py │ └── mistral.py └── utils.py ├── triton_utils ├── __init__.py └── importing.py ├── usage ├── __init__.py └── usage_lib.py ├── utils.py ├── v1 ├── __init__.py ├── attention │ ├── __init__.py │ └── backends │ │ ├── __init__.py │ │ ├── cpu_attn.py │ │ ├── flash_attn.py │ │ ├── flashinfer.py │ │ ├── mla │ │ ├── __init__.py │ │ ├── common.py │ │ ├── cutlass_mla.py │ │ ├── flashmla.py │ │ ├── rocm_aiter_mla.py │ │ └── triton_mla.py │ │ ├── pallas.py │ │ ├── triton_attn.py │ │ └── utils.py ├── core │ ├── __init__.py │ ├── block_pool.py │ ├── encoder_cache_manager.py │ ├── kv_cache_manager.py │ ├── kv_cache_utils.py │ ├── sched │ │ ├── __init__.py │ │ ├── interface.py │ │ ├── output.py │ │ ├── scheduler.py │ │ └── utils.py │ └── single_type_kv_cache_manager.py ├── engine │ ├── __init__.py │ ├── async_llm.py │ ├── coordinator.py │ ├── core.py │ ├── core_client.py │ ├── detokenizer.py │ ├── exceptions.py │ ├── llm_engine.py │ ├── logprobs.py │ ├── mm_input_cache.py │ ├── output_processor.py │ ├── parallel_sampling.py │ └── processor.py ├── executor │ ├── __init__.py │ ├── abstract.py │ ├── multiproc_executor.py │ └── ray_distributed_executor.py ├── kv_cache_interface.py ├── metrics │ ├── __init__.py │ ├── loggers.py │ ├── prometheus.py │ ├── ray_wrappers.py │ ├── reader.py │ └── stats.py ├── outputs.py ├── request.py ├── sample │ ├── __init__.py │ ├── metadata.py │ ├── ops │ │ ├── __init__.py │ │ ├── bad_words.py │ │ ├── penalties.py │ │ └── topk_topp_sampler.py │ ├── rejection_sampler.py │ ├── sampler.py │ └── tpu │ │ ├── __init__.py │ │ ├── metadata.py │ │ └── sampler.py ├── serial_utils.py ├── spec_decode │ ├── __init__.py │ ├── eagle.py │ ├── medusa.py │ ├── metadata.py │ ├── metrics.py │ ├── ngram_proposer.py │ └── utils.py ├── structured_output │ ├── __init__.py │ ├── backend_guidance.py │ ├── backend_types.py │ ├── backend_xgrammar.py │ ├── request.py │ └── utils.py ├── utils.py └── worker │ ├── __init__.py │ ├── block_table.py │ ├── cpu_model_runner.py │ ├── cpu_worker.py │ ├── gpu_input_batch.py │ ├── gpu_model_runner.py │ ├── gpu_worker.py │ ├── 
lora_model_runner_mixin.py │ ├── tpu_model_runner.py │ ├── tpu_worker.py │ ├── utils.py │ └── worker_base.py ├── version.py ├── vllm_flash_attn └── .gitkeep └── worker ├── __init__.py ├── cache_engine.py ├── cpu_enc_dec_model_runner.py ├── cpu_model_runner.py ├── cpu_pooling_model_runner.py ├── cpu_worker.py ├── enc_dec_model_runner.py ├── hpu_model_runner.py ├── hpu_worker.py ├── model_runner.py ├── model_runner_base.py ├── multi_step_hpu_worker.py ├── multi_step_model_runner.py ├── multi_step_neuron_model_runner.py ├── multi_step_neuronx_distributed_model_runner.py ├── multi_step_tpu_worker.py ├── multi_step_worker.py ├── neuron_model_runner.py ├── neuron_worker.py ├── neuronx_distributed_model_runner.py ├── pooling_model_runner.py ├── tpu_model_runner.py ├── tpu_worker.py ├── utils.py ├── worker.py ├── worker_base.py ├── xpu_model_runner.py └── xpu_worker.py /.buildkite/generate_index.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import argparse 5 | import os 6 | 7 | template = """ 8 | 9 | 10 |

<h1>Links for vLLM</h1>
11 | <a href="../{wheel_html_escaped}">{wheel}</a><br/>
12 | 13 | 14 | """ 15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("--wheel", help="The wheel path.", required=True) 18 | args = parser.parse_args() 19 | 20 | filename = os.path.basename(args.wheel) 21 | 22 | with open("index.html", "w") as f: 23 | print(f"Generated index.html for {args.wheel}") 24 | # cloudfront requires escaping the '+' character 25 | f.write( 26 | template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B")) 27 | ) 28 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2 3 | model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.671 9 | - name: "exact_match,flexible-extract" 10 | value: 0.664 11 | limit: 1000 12 | num_fewshot: 5 13 | trust_remote_code: True -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml: -------------------------------------------------------------------------------- 1 | # For hf script, without -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 3 | model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.905 9 | - name: "exact_match,flexible-extract" 10 | value: 0.905 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml: -------------------------------------------------------------------------------- 1 | # For hf script, without -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5 3 | model_name: "meta-llama/Meta-Llama-3-70B-Instruct" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.892 9 | - name: "exact_match,flexible-extract" 10 | value: 0.892 11 | limit: 250 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1 3 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.752 9 | - name: "exact_match,flexible-extract" 10 | value: 0.754 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1 3 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.753 9 | - name: "exact_match,flexible-extract" 10 | value: 0.753 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1 3 | model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.755 9 | - name: "exact_match,flexible-extract" 10 | value: 0.755 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1 3 | model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.753 9 | - name: "exact_match,flexible-extract" 10 | value: 0.753 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1 3 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.764 9 | - name: "exact_match,flexible-extract" 10 | value: 0.764 11 | limit: 250 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1 3 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.728 9 | - name: "exact_match,flexible-extract" 10 | value: 0.728 11 | limit: 250 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1 3 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.758 9 | - name: "exact_match,flexible-extract" 10 | value: 0.759 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml: -------------------------------------------------------------------------------- 1 | # For hf script, without -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 3 | model_name: "meta-llama/Meta-Llama-3-8B-Instruct" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.756 9 | - name: "exact_match,flexible-extract" 10 | value: 0.752 11 | limit: 250 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1 3 | model_name: "HandH1998/QQQ-Llama-3-8b-g128" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.419 9 | - name: "exact_match,flexible-extract" 10 | value: 0.416 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1 2 | model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.335 8 | - name: "exact_match,flexible-extract" 9 | value: 0.323 10 | limit: 1319 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 3 | model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.356 9 | - name: "exact_match,flexible-extract" 10 | value: 0.358 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1 3 | model_name: "mgoin/Minitron-4B-Base-FP8" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.231 9 | - name: "exact_match,flexible-extract" 10 | value: 0.22 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8 3 | model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.86 9 | - name: "exact_match,flexible-extract" 10 | value: 0.86 11 | limit: 250 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 
2 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4 3 | model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.624 9 | - name: "exact_match,flexible-extract" 10 | value: 0.624 11 | limit: 250 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml: -------------------------------------------------------------------------------- 1 | # For hf script, without -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 3 | model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.616 9 | - name: "exact_match,flexible-extract" 10 | value: 0.632 11 | limit: 250 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1 3 | model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.30 9 | - name: "exact_match,flexible-extract" 10 | value: 0.465 11 | limit: 1319 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1 3 | model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.578 9 | - name: "exact_match,flexible-extract" 10 | value: 0.585 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 3 | model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.593 9 | - name: "exact_match,flexible-extract" 10 | value: 0.588 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1 3 | model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.595 9 | - name: "exact_match,flexible-extract" 10 | value: 0.582 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4 3 | model_name: "Qwen/Qwen2-57B-A14B-Instruct" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.792 9 | - name: "exact_match,flexible-extract" 10 | value: 0.824 11 | limit: 250 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml: -------------------------------------------------------------------------------- 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1 2 | model_name: "Qwen/Qwen2.5-1.5B-Instruct" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.54 8 | - name: "exact_match,flexible-extract" 9 | value: 0.59 10 | limit: 1319 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml: -------------------------------------------------------------------------------- 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1 2 | model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.47 8 | - name: "exact_match,flexible-extract" 9 | value: 0.64 10 | limit: 1319 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 
2 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2 3 | model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.6353 9 | - name: "exact_match,flexible-extract" 10 | value: 0.637 11 | limit: null 12 | num_fewshot: null 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/models-large.txt: -------------------------------------------------------------------------------- 1 | Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml 2 | Meta-Llama-3-70B-Instruct.yaml 3 | Mixtral-8x7B-Instruct-v0.1.yaml 4 | Qwen2-57B-A14-Instruct.yaml 5 | DeepSeek-V2-Lite-Chat.yaml 6 | Meta-Llama-3-8B-QQQ.yaml 7 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/models-small.txt: -------------------------------------------------------------------------------- 1 | Qwen2.5-1.5B-Instruct.yaml 2 | Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml 3 | Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml 4 | Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml 5 | Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml 6 | Qwen1.5-MoE-W4A16-compressed-tensors.yaml 7 | -------------------------------------------------------------------------------- /.buildkite/nightly-benchmarks/nightly-annotation.md: -------------------------------------------------------------------------------- 1 | 2 | ## Description 3 | 4 | This file contains the downloading link for benchmarking results. 5 | 6 | - [benchmarking pipeline](artifact://nightly-pipeline.yaml) 7 | - [benchmarking results](artifact://results.zip) 8 | - [benchmarking code](artifact://nightly-benchmarks.zip) 9 | 10 | Please download the visualization scripts in the post 11 | 12 | ## Results reproduction 13 | 14 | - Find the docker we use in `benchmarking pipeline` 15 | - Deploy the docker, and inside the docker: 16 | - Download `nightly-benchmarks.zip`. 17 | - In the same folder, run the following code: 18 | 19 | ```console 20 | export HF_TOKEN= 21 | apt update 22 | apt install -y git 23 | unzip nightly-benchmarks.zip 24 | VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh 25 | ``` 26 | 27 | And the results will be inside `./benchmarks/results`. 
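
The reproduction steps above can be collected into a single wrapper script. The sketch below only re-packages the documented flow: the `nightly-benchmarks.zip` artifact, the `VLLM_SOURCE_CODE_LOC` variable, the `run-nightly-benchmarks.sh` path, and the results location all come from the annotation above; the `set -euo pipefail` guard, the `HF_TOKEN` pre-check, and the assumption that the zip sits in the current working directory inside the benchmarking docker image are additions.

```bash
#!/bin/bash
# Sketch: reproduce the nightly benchmarks inside the benchmarking docker image,
# following the steps documented above. Assumes nightly-benchmarks.zip has already
# been downloaded into the current directory.
set -euo pipefail

# The benchmarks need a Hugging Face token; fail early if it is not set.
: "${HF_TOKEN:?Set HF_TOKEN to your Hugging Face token before running}"
export HF_TOKEN

apt update
apt install -y git

# Unpack the benchmarking code (assumes the unzip tool is available in the image).
unzip -o nightly-benchmarks.zip

# Point the harness at the unpacked source tree and run the full nightly suite.
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh

echo "Results written to ./benchmarks/results"
```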
28 | -------------------------------------------------------------------------------- /.buildkite/nightly-benchmarks/scripts/download-tokenizer.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import argparse 5 | 6 | from transformers import AutoTokenizer 7 | 8 | 9 | def main(model, cachedir): 10 | # Load the tokenizer and save it to the specified directory 11 | tokenizer = AutoTokenizer.from_pretrained(model) 12 | tokenizer.save_pretrained(cachedir) 13 | print(f"Tokenizer saved to {cachedir}") 14 | 15 | 16 | if __name__ == "__main__": 17 | parser = argparse.ArgumentParser( 18 | description="Download and save Hugging Face tokenizer" 19 | ) 20 | parser.add_argument("--model", type=str, required=True, help="Name of the model") 21 | parser.add_argument( 22 | "--cachedir", type=str, required=True, help="Directory to save the tokenizer" 23 | ) 24 | 25 | args = parser.parse_args() 26 | main(args.model, args.cachedir) 27 | -------------------------------------------------------------------------------- /.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from lmdeploy.serve.openai.api_client import APIClient 5 | 6 | api_client = APIClient("http://localhost:8000") 7 | model_name = api_client.available_models[0] 8 | 9 | print(model_name) 10 | -------------------------------------------------------------------------------- /.buildkite/nightly-benchmarks/scripts/wait-for-image.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token) 3 | if [[ "$BUILDKITE_BRANCH" == "main" ]]; then 4 | URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT" 5 | else 6 | URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT" 7 | fi 8 | 9 | TIMEOUT_SECONDS=10 10 | 11 | retries=0 12 | while [ $retries -lt 1000 ]; do 13 | if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then 14 | exit 0 15 | fi 16 | 17 | echo "Waiting for image to be available..." 
18 | 19 | retries=$((retries + 1)) 20 | sleep 5 21 | done 22 | 23 | exit 1 24 | -------------------------------------------------------------------------------- /.buildkite/nightly-benchmarks/tests/genai-perf-tests.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "test_name": "llama8B_tp1_genai_perf", 4 | "qps_list": [4,8,16,32], 5 | "common_parameters": { 6 | "model": "meta-llama/Meta-Llama-3-8B-Instruct", 7 | "tp": 1, 8 | "port": 8000, 9 | "num_prompts": 500, 10 | "reuse_server": false 11 | }, 12 | "vllm_server_parameters": { 13 | "disable_log_stats": "", 14 | "disable_log_requests": "", 15 | "gpu_memory_utilization": 0.9, 16 | "num_scheduler_steps": 10, 17 | "max_num_seqs": 512, 18 | "dtype": "bfloat16" 19 | }, 20 | "genai_perf_input_parameters": { 21 | } 22 | } 23 | ] -------------------------------------------------------------------------------- /.buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script build the CPU docker image and run the offline inference inside the container. 4 | # It serves a sanity check for compilation and basic model usage. 5 | set -ex 6 | 7 | # Setup cleanup 8 | remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; } 9 | trap remove_docker_container EXIT 10 | remove_docker_container 11 | 12 | # Try building the docker image 13 | docker build -t cpu-test -f docker/Dockerfile.s390x . 14 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | UseTab: Never 3 | IndentWidth: 2 4 | ColumnLimit: 80 5 | 6 | # Force pointers to the type for C++. 
7 | DerivePointerAlignment: false 8 | PointerAlignment: Left 9 | 10 | # Reordering #include statements can (and currently will) introduce errors 11 | SortIncludes: false 12 | 13 | # Style choices 14 | AlignConsecutiveAssignments: false 15 | AlignConsecutiveDeclarations: false 16 | IndentPPDirectives: BeforeHash 17 | 18 | IncludeCategories: 19 | - Regex: '^<' 20 | Priority: 4 21 | - Regex: '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/' 22 | Priority: 3 23 | - Regex: '^"(qoda|\.\.)/' 24 | Priority: 2 25 | - Regex: '.*' 26 | Priority: 1 27 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | /.venv 2 | /build 3 | dist 4 | vllm/*.so 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | .mypy_cache 12 | 13 | # Distribution / packaging 14 | .Python 15 | /build/ 16 | cmake-build-*/ 17 | CMakeUserPresets.json 18 | develop-eggs/ 19 | /dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/about-codeowners/ 2 | # for more info about CODEOWNERS file 3 | 4 | * @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang 5 | 6 | /csrc/ @charlifu @mawong-amd @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang 7 | /vllm/ @charlifu @mawong-amd @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang 8 | 9 | fused_moe @divakar-amd @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang 10 | 11 | /tests/ @Alexei-V-Ivanov-AMD @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang 12 | /.buildkite/ @Alexei-V-Ivanov-AMD @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang 13 | 14 | /benchmarks/profiling @AdrianAbeyta @dllehr-amd @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang 15 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [vllm-project] 2 | open_collective: vllm 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Questions 4 | url: https://discuss.vllm.ai 5 | about: Ask questions and discuss with other vLLM community members 6 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Please direct your PRs to the upstream vllm (https://github.com/vllm-project/vllm.git) 2 | 3 | Accepting PRs into the ROCm fork (https://github.com/ROCm/vllm) will require a clear previously communicated exception 4 | -------------------------------------------------------------------------------- /.github/workflows/add_label_automerge.yml: -------------------------------------------------------------------------------- 1 | name: Add label on auto-merge enabled 2 | permissions: 3 | pull-requests: write 4 | on: 5 | pull_request_target: 6 | types: 7 | - auto_merge_enabled 8 | jobs: 
9 | add-label-on-auto-merge: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Add label 13 | uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 14 | with: 15 | script: | 16 | github.rest.issues.addLabels({ 17 | owner: context.repo.owner, 18 | repo: context.repo.repo, 19 | issue_number: context.issue.number, 20 | labels: ['ready'] 21 | }) 22 | env: 23 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 24 | -------------------------------------------------------------------------------- /.github/workflows/cleanup_pr_body.yml: -------------------------------------------------------------------------------- 1 | name: Cleanup PR Body 2 | 3 | on: 4 | pull_request_target: 5 | types: [opened, reopened, edited] 6 | 7 | permissions: 8 | pull-requests: write 9 | 10 | jobs: 11 | update-description: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - name: Checkout repository 16 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 17 | 18 | - name: Set up Python 19 | uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 20 | with: 21 | python-version: '3.12' 22 | 23 | - name: Install Python dependencies 24 | run: | 25 | python3 -m pip install --upgrade pip 26 | python3 -m pip install regex 27 | 28 | - name: Update PR description 29 | env: 30 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 31 | run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}" 32 | -------------------------------------------------------------------------------- /.github/workflows/matchers/actionlint.json: -------------------------------------------------------------------------------- 1 | { 2 | "problemMatcher": [ 3 | { 4 | "owner": "actionlint", 5 | "pattern": [ 6 | { 7 | "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$", 8 | "file": 1, 9 | "line": 2, 10 | "column": 3, 11 | "message": 4, 12 | "code": 5 13 | } 14 | ] 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /.github/workflows/matchers/mypy.json: -------------------------------------------------------------------------------- 1 | { 2 | "problemMatcher": [ 3 | { 4 | "owner": "mypy", 5 | "pattern": [ 6 | { 7 | "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$", 8 | "file": 1, 9 | "line": 2, 10 | "severity": 3, 11 | "message": 4 12 | } 13 | ] 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: pre-commit 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: [main] 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | pre-commit: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 16 | - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 17 | with: 18 | python-version: "3.12" 19 | - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" 20 | - run: echo "::add-matcher::.github/workflows/matchers/mypy.json" 21 | - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 22 | with: 23 | extra_args: --all-files --hook-stage manual 24 | -------------------------------------------------------------------------------- /.github/workflows/scripts/build.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | 4 | python_executable=python3 5 | 6 | # Update paths 7 | # Install requirements 8 | $python_executable -m pip install -r requirements/rocm.txt 9 | 10 | # Limit the number of parallel jobs to avoid OOM 11 | export MAX_JOBS=1 12 | # Make sure release wheels are built for the following architectures 13 | export PYTORCH_ROCM_ARCH="gfx90a;gfx942" 14 | 15 | rm -f "$(which sccache)" 16 | 17 | export MAX_JOBS=32 18 | 19 | # Build 20 | $python_executable setup.py bdist_wheel --dist-dir=dist 21 | -------------------------------------------------------------------------------- /.github/workflows/scripts/create_release.js: -------------------------------------------------------------------------------- 1 | // Uses GitHub's API to create the release and wait for result. 2 | // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately. 3 | 4 | module.exports = async (github, context, core) => { 5 | try { 6 | const response = await github.rest.repos.createRelease({ 7 | draft: false, 8 | generate_release_notes: true, 9 | name: process.env.RELEASE_TAG, 10 | owner: context.repo.owner, 11 | prerelease: true, 12 | repo: context.repo.repo, 13 | tag_name: process.env.RELEASE_TAG, 14 | }); 15 | 16 | core.setOutput('upload_url', response.data.upload_url); 17 | } catch (error) { 18 | core.setFailed(error.message); 19 | } 20 | } -------------------------------------------------------------------------------- /.github/workflows/scripts/cuda-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Replace '.' with '-' ex: 11.8 -> 11-8 4 | cuda_version=$(echo "$1" | tr "." "-") 5 | # Removes '-' and '.' 
ex: ubuntu-20.04 -> ubuntu2004 6 | OS=$(echo "$2" | tr -d ".\-") 7 | 8 | # Installs CUDA 9 | wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb" 10 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 11 | rm cuda-keyring_1.1-1_all.deb 12 | sudo apt -qq update 13 | sudo apt -y install "cuda-${cuda_version}" "cuda-nvcc-${cuda_version}" "cuda-libraries-dev-${cuda_version}" 14 | sudo apt clean 15 | 16 | # Test nvcc 17 | PATH=/usr/local/cuda-$1/bin:${PATH} 18 | nvcc --version 19 | 20 | # Log gcc, g++, c++ versions 21 | gcc --version 22 | g++ --version 23 | c++ --version 24 | -------------------------------------------------------------------------------- /.github/workflows/scripts/pytorch-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python_executable=python$1 4 | pytorch_version=$2 5 | cuda_version=$3 6 | 7 | # Install torch 8 | $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya 9 | $python_executable -m pip install torch=="${pytorch_version}+cu${cuda_version//./}" --extra-index-url "https://download.pytorch.org/whl/cu${cuda_version//./}" 10 | 11 | # Print version information 12 | $python_executable --version 13 | $python_executable -c "import torch; print('PyTorch:', torch.__version__)" 14 | $python_executable -c "import torch; print('CUDA:', torch.version.cuda)" 15 | $python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)" 16 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | 6 | build: 7 | os: ubuntu-22.04 8 | tools: 9 | python: "3.12" 10 | 11 | mkdocs: 12 | configuration: mkdocs.yaml 13 | 14 | # Optionally declare the Python requirements required to build your docs 15 | python: 16 | install: 17 | - requirements: requirements/docs.txt 18 | -------------------------------------------------------------------------------- /.shellcheckrc: -------------------------------------------------------------------------------- 1 | # rules currently disabled: 2 | # 3 | # SC1091 (info): Not following: was not specified as input (see shellcheck -x) 4 | # SC2004 (style): $/${} is unnecessary on arithmetic variables. 5 | # SC2129 (style): Consider using { cmd1; cmd2; } >> file instead of individual redirects. 6 | # SC2155 (warning): Declare and assign separately to avoid masking return values. 7 | # SC2164 (warning): Use 'cd ... || exit' or 'cd ... || return' in case cd fails. 8 | # 9 | disable=SC1091,SC2004,SC2129,SC2155,SC2164 10 | -------------------------------------------------------------------------------- /.yapfignore: -------------------------------------------------------------------------------- 1 | collect_env.py 2 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to vLLM 2 | 3 | You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing). 
4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include requirements/common.txt 3 | include requirements/cuda.txt 4 | include requirements/rocm.txt 5 | include requirements/neuron.txt 6 | include requirements/cpu.txt 7 | include CMakeLists.txt 8 | 9 | recursive-include cmake * 10 | recursive-include csrc * 11 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Reporting a Vulnerability 4 | 5 | If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. 6 | 7 | Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). 8 | 9 | --- 10 | 11 | Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations. 12 | 13 | Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models. 14 | -------------------------------------------------------------------------------- /benchmarks/kernels/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas -------------------------------------------------------------------------------- /benchmarks/structured_schemas/structured_schema_1.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "name": { "type": "string" }, 5 | "email": { "type": "string" }, 6 | "street": { "type": "string" }, 7 | "city": { "type": "string" }, 8 | "state": { "type": "string" }, 9 | "zip": { "type": "string" }, 10 | "phone": { "type": "string" }, 11 | "website": { "type": "string" }, 12 | "company": { "type": "string" }, 13 | "age": { "type": "integer" } 14 | }, 15 | "required": [ 16 | "name", 17 | "email" 18 | ] 19 | } 20 | -------------------------------------------------------------------------------- /csrc/attention/attention_dtypes.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.cuh" 4 | #include "dtype_float16.cuh" 5 | #include "dtype_float32.cuh" 6 | #include "dtype_bfloat16.cuh" 7 | #include "dtype_fp8.cuh" 8 | -------------------------------------------------------------------------------- /csrc/attention/dtype_fp8.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.cuh" 4 | 5 | #include 6 | #ifdef ENABLE_FP8 7 | #ifndef USE_ROCM 8 | #include 9 | #endif // USE_ROCM 10 | #endif // ENABLE_FP8 11 | 12 | namespace vllm { 13 | 14 | enum class Fp8KVCacheDataType { 15 | kAuto = 0, 16 | kFp8E4M3 = 1, 17 | kFp8E5M2 = 2, 18 | }; 19 | 20 | // fp8 vector types for quantization of kv cache 21 | template <> 22 | struct Vec { 23 | using Type = uint8_t; 24 | }; 25 | 26 | template <> 27 | 
struct Vec { 28 | using Type = uint16_t; 29 | }; 30 | 31 | template <> 32 | struct Vec { 33 | using Type = uint32_t; 34 | }; 35 | 36 | template <> 37 | struct Vec { 38 | using Type = uint2; 39 | }; 40 | 41 | } // namespace vllm 42 | -------------------------------------------------------------------------------- /csrc/core/exception.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define VLLM_IMPLIES(p, q) (!(p) || (q)) 4 | -------------------------------------------------------------------------------- /csrc/core/math.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | inline constexpr uint32_t next_pow_2(uint32_t const num) { 7 | if (num <= 1) return num; 8 | return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); 9 | } 10 | 11 | template 12 | static inline constexpr auto div_ceil(A a, B b) { 13 | return (a + b - 1) / b; 14 | } 15 | 16 | // Round a down to the next multiple of b. The caller is responsible for making 17 | // sure that b is non-zero 18 | template 19 | inline constexpr T round_to_previous_multiple_of(T a, T b) { 20 | return a % b == 0 ? a : (a / b) * b; 21 | } 22 | 23 | // Round a up to the next multiple of b. The caller is responsible for making 24 | // sure that b is non-zero 25 | template 26 | inline constexpr T round_to_next_multiple_of(T a, T b) { 27 | return a % b == 0 ? a : ((a / b) + 1) * b; 28 | } 29 | -------------------------------------------------------------------------------- /csrc/cpu/cpu_types.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPU_TYPES_HPP 2 | #define CPU_TYPES_HPP 3 | 4 | #if defined(__x86_64__) 5 | // x86 implementation 6 | #include "cpu_types_x86.hpp" 7 | #elif defined(__POWER9_VECTOR__) 8 | // ppc implementation 9 | #include "cpu_types_vsx.hpp" 10 | #elif defined(__s390x__) 11 | // s390 implementation 12 | #include "cpu_types_vxe.hpp" 13 | #elif defined(__aarch64__) 14 | // arm implementation 15 | #include "cpu_types_arm.hpp" 16 | #else 17 | #warning "unsupported vLLM cpu implementation" 18 | #endif 19 | 20 | #endif -------------------------------------------------------------------------------- /csrc/cutlass_extensions/common.cpp: -------------------------------------------------------------------------------- 1 | #include "cutlass_extensions/common.hpp" 2 | 3 | int32_t get_sm_version_num() { 4 | int32_t major_capability, minor_capability; 5 | cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor, 6 | 0); 7 | cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor, 8 | 0); 9 | int32_t version_num = major_capability * 10 + minor_capability; 10 | return version_num; 11 | } -------------------------------------------------------------------------------- /csrc/moe/marlin_moe_wna16/.gitignore: -------------------------------------------------------------------------------- 1 | kernel_*.cu -------------------------------------------------------------------------------- /csrc/prepare_inputs/advance_step.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace prepare_inputs { 13 | 14 | static constexpr int max_threads = 256; 15 | static constexpr bool logging = false; 16 | 17 | constexpr int div_ceil(int a, int b) { return (a + b - 1) 
/ b; } 18 | 19 | } // namespace prepare_inputs 20 | -------------------------------------------------------------------------------- /csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu: -------------------------------------------------------------------------------- 1 | #include "scaled_mm_kernels.hpp" 2 | #include "scaled_mm_blockwise_sm100_fp8_dispatch.cuh" 3 | #include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" 4 | 5 | namespace vllm { 6 | 7 | void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out, 8 | torch::Tensor const& a, 9 | torch::Tensor const& b, 10 | torch::Tensor const& a_scales, 11 | torch::Tensor const& b_scales) { 12 | if (out.dtype() == torch::kBFloat16) { 13 | cutlass_gemm_blockwise_sm100_fp8_dispatch( 14 | out, a, b, a_scales, b_scales); 15 | 16 | } else { 17 | TORCH_CHECK(out.dtype() == torch::kFloat16); 18 | cutlass_gemm_blockwise_sm100_fp8_dispatch( 19 | out, a, b, a_scales, b_scales); 20 | } 21 | } 22 | 23 | } // namespace vllm 24 | -------------------------------------------------------------------------------- /csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu: -------------------------------------------------------------------------------- 1 | 2 | #include "scaled_mm_kernels.hpp" 3 | #include "scaled_mm_blockwise_sm90_fp8_dispatch.cuh" 4 | #include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" 5 | 6 | namespace vllm { 7 | 8 | void cutlass_scaled_mm_blockwise_sm90_fp8(torch::Tensor& out, 9 | torch::Tensor const& a, 10 | torch::Tensor const& b, 11 | torch::Tensor const& a_scales, 12 | torch::Tensor const& b_scales) { 13 | if (out.dtype() == torch::kBFloat16) { 14 | cutlass_gemm_blockwise_sm90_fp8_dispatch( 15 | out, a, b, a_scales, b_scales); 16 | 17 | } else { 18 | TORCH_CHECK(out.dtype() == torch::kFloat16); 19 | cutlass_gemm_blockwise_sm90_fp8_dispatch( 20 | out, a, b, a_scales, b_scales); 21 | } 22 | } 23 | 24 | } // namespace vllm -------------------------------------------------------------------------------- /csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu: -------------------------------------------------------------------------------- 1 | #include "c3x/scaled_mm_helper.hpp" 2 | #include "c3x/scaled_mm_kernels.hpp" 3 | 4 | /* 5 | This file defines quantized GEMM operations using the CUTLASS 3.x API, for 6 | NVIDIA GPUs with sm100 (Blackwell). 
7 | */ 8 | 9 | #if defined ENABLE_SCALED_MM_SM100 && ENABLE_SCALED_MM_SM100 10 | 11 | void cutlass_scaled_mm_sm100(torch::Tensor& c, torch::Tensor const& a, 12 | torch::Tensor const& b, 13 | torch::Tensor const& a_scales, 14 | torch::Tensor const& b_scales, 15 | std::optional const& bias) { 16 | dispatch_scaled_mm(c, a, b, a_scales, b_scales, bias, 17 | vllm::cutlass_scaled_mm_sm100_fp8, 18 | nullptr, // int8 not supported on SM100 19 | vllm::cutlass_scaled_mm_blockwise_sm100_fp8); 20 | } 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /csrc/quantization/gptq/qdq_8.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Copied from https://github.com/turboderp/exllamav2 3 | */ 4 | 5 | #ifndef _qdq_8_cuh 6 | #define _qdq_8_cuh 7 | 8 | #include "qdq_util.cuh" 9 | 10 | namespace vllm { 11 | namespace gptq { 12 | 13 | __forceinline__ __device__ void shuffle_8bit_4(uint32_t* q, int stride) {} 14 | 15 | __forceinline__ __device__ void dequant_8bit_8(const uint32_t q_0, 16 | const uint32_t q_1, 17 | half2 (&dq)[4], int stride, 18 | const uint32_t zero) { 19 | half dqh[8]; 20 | for (int i = 0; i < 4; i++) dqh[i] = dq_ns(exb(q_0, i * 8, 0xff), zero); 21 | for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), zero); 22 | 23 | for (int i = 0; i < 4; i++) 24 | dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); 25 | } 26 | 27 | } // namespace gptq 28 | } // namespace vllm 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /csrc/quantization/gptq_marlin/.gitignore: -------------------------------------------------------------------------------- 1 | kernel_*.cu -------------------------------------------------------------------------------- /csrc/rocm/custom.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | // declare templates for front (cpp) and back (cuda) sides of function: 6 | // template 7 | 8 | void LLGemm_Silu(void* in_a, void* in_b, void* out_c, const int M, const int K, 9 | cudaStream_t stream, const int rows_per_block); 10 | void LLMM_Silu(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c, 11 | const int64_t rows_per_block) { 12 | auto M = in_a.size(0); 13 | auto K = in_a.size(1); 14 | LLGemm_Silu(in_a.data_ptr(), in_b.data_ptr(), out_c.data_ptr(), M, K, 15 | at::cuda::getCurrentCUDAStream(), rows_per_block); 16 | } 17 | -------------------------------------------------------------------------------- /docker/Dockerfile.hpu: -------------------------------------------------------------------------------- 1 | FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest 2 | 3 | COPY ./ /workspace/vllm 4 | 5 | WORKDIR /workspace/vllm 6 | 7 | RUN pip install -v -r requirements/hpu.txt 8 | 9 | ENV no_proxy=localhost,127.0.0.1 10 | ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true 11 | 12 | RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install 13 | 14 | # install development dependencies (for testing) 15 | RUN python3 -m pip install -e tests/vllm_test_utils 16 | 17 | WORKDIR /workspace/ 18 | 19 | RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks 20 | 21 | ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] 22 | -------------------------------------------------------------------------------- /docs/api/vllm/.meta.yml: 
-------------------------------------------------------------------------------- 1 | search: 2 | boost: 0.5 3 | -------------------------------------------------------------------------------- /docs/assets/contributing/dockerfile-stages-dependency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/contributing/dockerfile-stages-dependency.png -------------------------------------------------------------------------------- /docs/assets/deployment/anything-llm-chat-with-doc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/anything-llm-chat-with-doc.png -------------------------------------------------------------------------------- /docs/assets/deployment/anything-llm-chat-without-doc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/anything-llm-chat-without-doc.png -------------------------------------------------------------------------------- /docs/assets/deployment/anything-llm-provider.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/anything-llm-provider.png -------------------------------------------------------------------------------- /docs/assets/deployment/anything-llm-upload-doc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/anything-llm-upload-doc.png -------------------------------------------------------------------------------- /docs/assets/deployment/architecture_helm_deployment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/architecture_helm_deployment.png -------------------------------------------------------------------------------- /docs/assets/deployment/chatbox-chat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/chatbox-chat.png -------------------------------------------------------------------------------- /docs/assets/deployment/chatbox-settings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/chatbox-settings.png -------------------------------------------------------------------------------- /docs/assets/deployment/dify-chat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/dify-chat.png -------------------------------------------------------------------------------- /docs/assets/deployment/dify-create-chatbot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/dify-create-chatbot.png -------------------------------------------------------------------------------- /docs/assets/deployment/dify-settings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/dify-settings.png -------------------------------------------------------------------------------- /docs/assets/deployment/open_webui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/open_webui.png -------------------------------------------------------------------------------- /docs/assets/deployment/streamlit-chat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/deployment/streamlit-chat.png -------------------------------------------------------------------------------- /docs/assets/design/arch_overview/entrypoints.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/arch_overview/entrypoints.excalidraw.png -------------------------------------------------------------------------------- /docs/assets/design/arch_overview/llm_engine.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/arch_overview/llm_engine.excalidraw.png -------------------------------------------------------------------------------- /docs/assets/design/hierarchy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/hierarchy.png -------------------------------------------------------------------------------- /docs/assets/design/v1/metrics/intervals-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/metrics/intervals-1.png -------------------------------------------------------------------------------- /docs/assets/design/v1/metrics/intervals-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/metrics/intervals-2.png -------------------------------------------------------------------------------- /docs/assets/design/v1/metrics/intervals-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/metrics/intervals-3.png -------------------------------------------------------------------------------- /docs/assets/design/v1/prefix_caching/example-time-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/prefix_caching/example-time-1.png -------------------------------------------------------------------------------- /docs/assets/design/v1/prefix_caching/example-time-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/prefix_caching/example-time-3.png -------------------------------------------------------------------------------- /docs/assets/design/v1/prefix_caching/example-time-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/prefix_caching/example-time-4.png -------------------------------------------------------------------------------- /docs/assets/design/v1/prefix_caching/example-time-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/prefix_caching/example-time-5.png -------------------------------------------------------------------------------- /docs/assets/design/v1/prefix_caching/example-time-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/prefix_caching/example-time-6.png -------------------------------------------------------------------------------- /docs/assets/design/v1/prefix_caching/example-time-7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/prefix_caching/example-time-7.png -------------------------------------------------------------------------------- /docs/assets/design/v1/prefix_caching/free.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/prefix_caching/free.png -------------------------------------------------------------------------------- /docs/assets/design/v1/prefix_caching/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/design/v1/prefix_caching/overview.png -------------------------------------------------------------------------------- /docs/assets/features/disagg_prefill/abstraction.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/features/disagg_prefill/abstraction.jpg -------------------------------------------------------------------------------- /docs/assets/features/disagg_prefill/overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/features/disagg_prefill/overview.jpg -------------------------------------------------------------------------------- /docs/assets/kernel/k_vecs.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/kernel/k_vecs.png -------------------------------------------------------------------------------- /docs/assets/kernel/key.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/kernel/key.png -------------------------------------------------------------------------------- /docs/assets/kernel/logits_vec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/kernel/logits_vec.png -------------------------------------------------------------------------------- /docs/assets/kernel/q_vecs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/kernel/q_vecs.png -------------------------------------------------------------------------------- /docs/assets/kernel/query.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/kernel/query.png -------------------------------------------------------------------------------- /docs/assets/kernel/v_vec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/kernel/v_vec.png -------------------------------------------------------------------------------- /docs/assets/kernel/value.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/kernel/value.png -------------------------------------------------------------------------------- /docs/assets/logos/vllm-logo-only-light.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/logos/vllm-logo-only-light.ico -------------------------------------------------------------------------------- /docs/assets/logos/vllm-logo-only-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/logos/vllm-logo-only-light.png -------------------------------------------------------------------------------- /docs/assets/logos/vllm-logo-text-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/logos/vllm-logo-text-dark.png -------------------------------------------------------------------------------- /docs/assets/logos/vllm-logo-text-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/docs/assets/logos/vllm-logo-text-light.png -------------------------------------------------------------------------------- /docs/community/sponsors.md: -------------------------------------------------------------------------------- 1 | # Sponsors 2 | 3 | vLLM is a community project. 
Our compute resources for development and testing are supported by the following organizations. Thank you for your support! 4 | 5 | 6 | 7 | 8 | Cash Donations: 9 | 10 | - a16z 11 | - Dropbox 12 | - Sequoia Capital 13 | - Skywork AI 14 | - ZhenFund 15 | 16 | Compute Resources: 17 | 18 | - AMD 19 | - Anyscale 20 | - AWS 21 | - Crusoe Cloud 22 | - Databricks 23 | - DeepInfra 24 | - Google Cloud 25 | - Intel 26 | - Lambda Lab 27 | - Nebius 28 | - Novita AI 29 | - NVIDIA 30 | - Replicate 31 | - Roblox 32 | - RunPod 33 | - Trainy 34 | - UC Berkeley 35 | - UC San Diego 36 | 37 | Slack Sponsor: Anyscale 38 | 39 | We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM. 40 | -------------------------------------------------------------------------------- /docs/configuration/README.md: -------------------------------------------------------------------------------- 1 | # Configuration Options 2 | 3 | This section lists the most common options for running vLLM. 4 | 5 | There are three main levels of configuration, from highest priority to lowest priority: 6 | 7 | - [Request parameters][completions-api] and [input arguments][sampling-params] 8 | - [Engine arguments](./engine_args.md) 9 | - [Environment variables](./env_vars.md) 10 | -------------------------------------------------------------------------------- /docs/configuration/env_vars.md: -------------------------------------------------------------------------------- 1 | # Environment Variables 2 | 3 | vLLM uses the following environment variables to configure the system: 4 | 5 | !!! warning 6 | Please note that `VLLM_PORT` and `VLLM_HOST_IP` set the port and ip for vLLM's **internal usage**. It is not the port and ip for the API server. If you use `--host $VLLM_HOST_IP` and `--port $VLLM_PORT` to start the API server, it will not work. 7 | 8 | All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables). 9 | 10 | ```python 11 | --8<-- "vllm/envs.py:env-vars-definition" 12 | ``` 13 | -------------------------------------------------------------------------------- /docs/contributing/model/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Adding a New Model 3 | --- 4 | [](){ #new-model } 5 | 6 | This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM. 7 | 8 | Contents: 9 | 10 | - [Basic](basic.md) 11 | - [Registration](registration.md) 12 | - [Tests](tests.md) 13 | - [Multimodal](multimodal.md) 14 | 15 | !!! note 16 | The complexity of adding a new model depends heavily on the model's architecture. 17 | The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. 18 | However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. 19 | 20 | !!! 
tip 21 | If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues) 22 | or ask on our [developer slack](https://slack.vllm.ai). 23 | We will be happy to help you out! 24 | -------------------------------------------------------------------------------- /docs/deployment/frameworks/bentoml.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: BentoML 3 | --- 4 | [](){ #deployment-bentoml } 5 | 6 | [BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes. 7 | 8 | For details, see the tutorial [vLLM inference in the BentoML documentation](https://docs.bentoml.com/en/latest/use-cases/large-language-models/vllm.html). 9 | -------------------------------------------------------------------------------- /docs/deployment/frameworks/lobe-chat.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Lobe Chat 3 | --- 4 | [](){ #deployment-lobe-chat } 5 | 6 | [Lobe Chat](https://github.com/lobehub/lobe-chat) is an open-source, modern-design ChatGPT/LLMs UI/Framework. 7 | 8 | It supports speech synthesis, multi-modal input, and an extensible (function call) plugin system. 9 | 10 | One-click FREE deployment of your private OpenAI ChatGPT/Claude/Gemini/Groq/Ollama chat application. 11 | 12 | It supports vLLM as an AI model provider to efficiently serve large language models. 13 | 14 | For details, see the tutorial [Using vLLM in LobeChat](https://lobehub.com/docs/usage/providers/vllm). 15 | -------------------------------------------------------------------------------- /docs/deployment/frameworks/modal.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Modal 3 | --- 4 | [](){ #deployment-modal } 5 | 6 | vLLM can be run on cloud GPUs with [Modal](https://modal.com), a serverless computing platform designed for fast auto-scaling. 7 | 8 | For details on how to deploy vLLM on Modal, see [this tutorial in the Modal documentation](https://modal.com/docs/examples/vllm_inference). 9 | -------------------------------------------------------------------------------- /docs/deployment/frameworks/open-webui.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Open WebUI 3 | --- 4 | [](){ #deployment-open-webui } 5 | 6 | 1. Install [Docker](https://docs.docker.com/engine/install/) 7 | 8 | 2. Start the vLLM server with a supported chat completion model, e.g. 9 | 10 | ```console 11 | vllm serve qwen/Qwen1.5-0.5B-Chat 12 | ``` 13 | 14 | 1. Start the [Open WebUI](https://github.com/open-webui/open-webui) docker container (replace the vllm serve host and vllm serve port): 15 | 16 | ```console 17 | docker run -d -p 3000:8080 \ 18 | --name open-webui \ 19 | -v open-webui:/app/backend/data \ 20 | -e OPENAI_API_BASE_URL=http://<vllm serve host>:<vllm serve port>/v1 \ 21 | --restart always \ 22 | ghcr.io/open-webui/open-webui:main 23 | ``` 24 | 25 | 1. Open it in the browser (port 3000 by default, per the `-p 3000:8080` mapping above): 26 | 27 | At the top of the web page, you can see the model `qwen/Qwen1.5-0.5B-Chat`.
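If the model does not show up, it can help to confirm that the vLLM endpoint itself is reachable before debugging Open WebUI. A minimal sanity check with the OpenAI Python client, assuming vLLM is listening on `localhost:8000` (use whatever host and port you passed to `vllm serve`), looks roughly like this:

```python
# Query the vLLM OpenAI-compatible endpoint directly.
# Assumption: vLLM is listening on localhost:8000; adjust base_url as needed.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# The served model should be listed here, e.g. qwen/Qwen1.5-0.5B-Chat.
print([model.id for model in client.models.list()])

response = client.chat.completions.create(
    model="qwen/Qwen1.5-0.5B-Chat",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(response.choices[0].message.content)
```

If the model list comes back empty or the request fails, fix the vLLM server or the `OPENAI_API_BASE_URL` value before looking at Open WebUI itself.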
28 | 29 | ![](../../assets/deployment/open_webui.png) 30 | -------------------------------------------------------------------------------- /docs/deployment/frameworks/triton.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: NVIDIA Triton 3 | --- 4 | [](){ #deployment-triton } 5 | 6 | The [Triton Inference Server](https://github.com/triton-inference-server) hosts a tutorial demonstrating how to quickly deploy a simple [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) model using vLLM. Please see [Deploying a vLLM model in Triton](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton) for more details. 7 | -------------------------------------------------------------------------------- /docs/deployment/integrations/kserve.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: KServe 3 | --- 4 | [](){ #deployment-kserve } 5 | 6 | vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving. 7 | 8 | Please see [this guide](https://kserve.github.io/website/latest/modelserving/v1beta1/llm/huggingface/) for more details on using vLLM with KServe. 9 | -------------------------------------------------------------------------------- /docs/deployment/integrations/kubeai.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: KubeAI 3 | --- 4 | [](){ #deployment-kubeai } 5 | 6 | [KubeAI](https://github.com/substratusai/kubeai) is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies. 7 | 8 | Please see the Installation Guides for environment specific instructions: 9 | 10 | - [Any Kubernetes Cluster](https://www.kubeai.org/installation/any/) 11 | - [EKS](https://www.kubeai.org/installation/eks/) 12 | - [GKE](https://www.kubeai.org/installation/gke/) 13 | 14 | Once you have KubeAI installed, you can 15 | [configure text generation models](https://www.kubeai.org/how-to/configure-text-generation-models/) 16 | using vLLM. 17 | -------------------------------------------------------------------------------- /docs/deployment/integrations/llmaz.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: llmaz 3 | --- 4 | [](){ #deployment-llmaz } 5 | 6 | [llmaz](https://github.com/InftyAI/llmaz) is an easy-to-use and advanced inference platform for large language models on Kubernetes, aimed for production use. It uses vLLM as the default model serving backend. 7 | 8 | Please refer to the [Quick Start](https://github.com/InftyAI/llmaz?tab=readme-ov-file#quick-start) for more details. 9 | -------------------------------------------------------------------------------- /docs/features/quantization/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Quantization 3 | --- 4 | [](){ #quantization-index } 5 | 6 | Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices. 
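The pages listed below cover each method in detail. As a rough sketch of the common pattern, a pre-quantized checkpoint can be loaded with the offline `LLM` API as follows; the checkpoint name is only an illustrative example, and the explicit `quantization` argument is often optional because vLLM can usually detect the method from the model config:

```python
# Load a pre-quantized checkpoint (AWQ in this example) with the offline API.
from vllm import LLM, SamplingParams

llm = LLM(
    model="TheBloke/Llama-2-7b-Chat-AWQ",  # illustrative AWQ checkpoint
    quantization="awq",  # usually optional: detected from the model config
)

outputs = llm.generate(
    ["Quantization lets you"],
    SamplingParams(temperature=0.8, max_tokens=32),
)
print(outputs[0].outputs[0].text)
```

The same argument is available as `--quantization` when launching `vllm serve`.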
7 | 8 | Contents: 9 | 10 | - [Supported_Hardware](supported_hardware.md) 11 | - [Auto_Awq](auto_awq.md) 12 | - [Bnb](bnb.md) 13 | - [Bitblas](bitblas.md) 14 | - [Gguf](gguf.md) 15 | - [Gptqmodel](gptqmodel.md) 16 | - [Int4](int4.md) 17 | - [Int8](int8.md) 18 | - [Fp8](fp8.md) 19 | - [Modelopt](modelopt.md) 20 | - [Quark](quark.md) 21 | - [Quantized_Kvcache](quantized_kvcache.md) 22 | - [Torchao](torchao.md) 23 | -------------------------------------------------------------------------------- /docs/getting_started/installation/.nav.yml: -------------------------------------------------------------------------------- 1 | nav: 2 | - README.md 3 | - gpu.md 4 | - cpu.md 5 | - ai_accelerator.md -------------------------------------------------------------------------------- /docs/getting_started/installation/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Installation 3 | --- 4 | [](){ #installation-index } 5 | 6 | vLLM supports the following hardware platforms: 7 | 8 | - [GPU](gpu.md) 9 | - [NVIDIA CUDA](gpu.md#nvidia-cuda) 10 | - [AMD ROCm](gpu.md#amd-rocm) 11 | - [Intel XPU](gpu.md#intel-xpu) 12 | - [CPU](cpu.md) 13 | - [Intel/AMD x86](cpu.md#intelamd-x86) 14 | - [ARM AArch64](cpu.md#arm-aarch64) 15 | - [Apple silicon](cpu.md#apple-silicon) 16 | - [IBM Z (S390X)](cpu.md#ibm-z-s390x) 17 | - [Other AI accelerators](ai_accelerator.md) 18 | - [Google TPU](ai_accelerator.md#google-tpu) 19 | - [Intel Gaudi](ai_accelerator.md#intel-gaudi) 20 | - [AWS Neuron](ai_accelerator.md#aws-neuron) 21 | -------------------------------------------------------------------------------- /docs/getting_started/installation/device.template.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ## Requirements 4 | 5 | ## Set up using Python 6 | 7 | ### Pre-built wheels 8 | 9 | ### Build wheel from source 10 | 11 | ## Set up using Docker 12 | 13 | ### Pre-built images 14 | 15 | ### Build image from source 16 | 17 | ## Extra information 18 | -------------------------------------------------------------------------------- /docs/getting_started/installation/python_env_setup.inc.md: -------------------------------------------------------------------------------- 1 | It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. 
After installing `uv`, you can create a new Python environment and install vLLM using the following commands: 2 | 3 | ```console 4 | uv venv --python 3.12 --seed 5 | source .venv/bin/activate 6 | ``` 7 | -------------------------------------------------------------------------------- /docs/mkdocs/hooks/remove_announcement.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import os 4 | from typing import Literal 5 | 6 | 7 | def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): 8 | # see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa 9 | if os.getenv('READTHEDOCS_VERSION_TYPE') == "tag": 10 | # remove the warning banner if the version is a tagged release 11 | docs_dir = os.path.dirname(__file__) 12 | announcement_path = os.path.join(docs_dir, 13 | "mkdocs/overrides/main.html") 14 | # The file might be removed already if the build is triggered multiple 15 | # times (readthedocs build both HTML and PDF versions separately) 16 | if os.path.exists(announcement_path): 17 | os.remove(announcement_path) 18 | -------------------------------------------------------------------------------- /docs/mkdocs/javascript/run_llm_widget.js: -------------------------------------------------------------------------------- 1 | // Add RunLLM widget 2 | document.addEventListener("DOMContentLoaded", function () { 3 | var script = document.createElement("script"); 4 | script.type = "module"; 5 | script.id = "runllm-widget-script" 6 | 7 | script.src = "https://widget.runllm.com"; 8 | 9 | script.setAttribute("version", "stable"); 10 | script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget. 11 | script.setAttribute("runllm-name", "vLLM"); 12 | script.setAttribute("runllm-position", "BOTTOM_RIGHT"); 13 | script.setAttribute("runllm-position-y", "120px"); 14 | script.setAttribute("runllm-position-x", "20px"); 15 | script.setAttribute("runllm-assistant-id", "207"); 16 | 17 | script.async = true; 18 | document.head.appendChild(script); 19 | }); 20 | -------------------------------------------------------------------------------- /docs/mkdocs/overrides/main.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block announce %} 4 |

You are viewing the latest developer preview docs. Click here to view docs for the latest stable release.

 5 | {% endblock %} 6 | -------------------------------------------------------------------------------- /docs/models/extensions/fastsafetensor.md: -------------------------------------------------------------------------------- 1 | Loading model weights with fastsafetensors 2 | =================================================================== 3 | 4 | Using the fastsafetensors library enables loading model weights directly into GPU memory by leveraging GPU Direct Storage. See [their GitHub repository](https://github.com/foundation-model-stack/fastsafetensors) for more details. 5 | To enable this feature, set the environment variable ``USE_FASTSAFETENSOR`` to ``true``. 6 | -------------------------------------------------------------------------------- /docs/serving/integrations/langchain.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: LangChain 3 | --- 4 | [](){ #serving-langchain } 5 | 6 | vLLM is also available via [LangChain](https://github.com/langchain-ai/langchain). 7 | 8 | To install LangChain, run: 9 | 10 | ```console 11 | pip install langchain langchain_community -q 12 | ``` 13 | 14 | To run inference on a single GPU or multiple GPUs, use the `VLLM` class from `langchain`. 15 | 16 | ```python 17 | from langchain_community.llms import VLLM 18 | 19 | llm = VLLM(model="mosaicml/mpt-7b", 20 | trust_remote_code=True, # mandatory for hf models 21 | max_new_tokens=128, 22 | top_k=10, 23 | top_p=0.95, 24 | temperature=0.8, 25 | # tensor_parallel_size=... # for distributed inference 26 | ) 27 | 28 | print(llm("What is the capital of France ?")) 29 | ``` 30 | 31 | Please refer to this [Tutorial](https://python.langchain.com/docs/integrations/llms/vllm) for more details. 32 | -------------------------------------------------------------------------------- /docs/serving/integrations/llamaindex.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: LlamaIndex 3 | --- 4 | [](){ #serving-llamaindex } 5 | 6 | vLLM is also available via [LlamaIndex](https://github.com/run-llama/llama_index). 7 | 8 | To install LlamaIndex, run: 9 | 10 | ```console 11 | pip install llama-index-llms-vllm -q 12 | ``` 13 | 14 | To run inference on a single GPU or multiple GPUs, use the `Vllm` class from `llamaindex`. 15 | 16 | ```python 17 | from llama_index.llms.vllm import Vllm 18 | 19 | llm = Vllm( 20 | model="microsoft/Orca-2-7b", 21 | tensor_parallel_size=4, 22 | max_new_tokens=100, 23 | vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5}, 24 | ) 25 | ``` 26 | 27 | Please refer to this [Tutorial](https://docs.llamaindex.ai/en/latest/examples/llm/vllm/) for more details. 28 | -------------------------------------------------------------------------------- /docs/training/rlhf.md: -------------------------------------------------------------------------------- 1 | # Reinforcement Learning from Human Feedback 2 | 3 | Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviors. 4 | 5 | vLLM can be used to generate completions for RLHF. The best way to do this is with libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF) and [verl](https://github.com/volcengine/verl).
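For a feel of the generation side on its own, here is a minimal sketch using nothing but vLLM; the model name and the reward function are placeholders (a real setup would score candidates with a trained reward model and feed the scores back to the trainer):

```python
# Generation side of an RLHF-style loop: sample several candidate
# completions per prompt with vLLM, then score them externally.
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # small placeholder model
params = SamplingParams(n=4, temperature=1.0, max_tokens=64)

prompts = ["Explain why the sky is blue.", "Write a haiku about GPUs."]
outputs = llm.generate(prompts, params)

def reward(text: str) -> float:
    # Placeholder: a real setup would call a trained reward model here.
    return float(len(text.split()))

for request_output in outputs:
    best = max(request_output.outputs, key=lambda c: reward(c.text))
    print(f"{request_output.prompt!r} -> reward {reward(best.text):.1f}: {best.text[:60]!r}")
```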
6 | 7 | See the following basic examples to get started if you don't want to use an existing library: 8 | 9 | - [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](../examples/offline_inference/rlhf.md) 10 | - [Training and inference processes are colocated on the same GPUs using Ray](../examples/offline_inference/rlhf_colocate.md) 11 | - [Utilities for performing RLHF with vLLM](../examples/offline_inference/rlhf_utils.md) 12 | -------------------------------------------------------------------------------- /docs/usage/README.md: -------------------------------------------------------------------------------- 1 | # Using vLLM 2 | 3 | vLLM supports the following usage patterns: 4 | 5 | - [Inference and Serving](../serving/offline_inference.md): Run a single instance of a model. 6 | - [Deployment](../deployment/docker.md): Scale up model instances for production. 7 | - [Training](../training/rlhf.md): Train or fine-tune a model. 8 | -------------------------------------------------------------------------------- /examples/offline_inference/disaggregated-prefill-v1/README.md: -------------------------------------------------------------------------------- 1 | # Disaggregated Prefill V1 2 | 3 | This example contains scripts that demonstrate disaggregated prefill in the offline setting of vLLM. 4 | 5 | ## Files 6 | 7 | - `run.sh` - A helper script that will run `prefill_example.py` and `decode_example.py` sequentially. 8 | - Make sure you are in the `examples/offline_inference/disaggregated-prefill-v1` directory before running `run.sh`. 9 | - `prefill_example.py` - A script which performs prefill only, saving the KV state to the `local_storage` directory and the prompts to `output.txt`. 10 | - `decode_example.py` - A script which performs decode only, loading the KV state from the `local_storage` directory and the prompts from `output.txt`. 
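For readers who want a feel for the prefill side without opening the scripts, the sketch below condenses what this README describes: run only the prefill pass and let a KV-transfer connector persist the cache under `local_storage`. The `KVTransferConfig` fields, connector name, and model choice here are assumptions based on this description rather than a copy of `prefill_example.py`, so refer to the script itself for the exact configuration:

```python
# Prefill-only pass: compute the KV cache and let the KV connector write it
# to local_storage for the decode script to reuse.
# NOTE: the KVTransferConfig fields and connector name below are assumptions
# based on this README, not copied from prefill_example.py.
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

prompts = ["Hello, my name is", "The capital of France is"]

llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",  # illustrative model choice
    enforce_eager=True,
    kv_transfer_config=KVTransferConfig(
        kv_connector="SharedStorageConnector",
        kv_role="kv_both",
        kv_connector_extra_config={"shared_storage_path": "local_storage"},
    ),
)

# max_tokens=1: only the prefill pass matters here; decoding happens in the
# companion decode script once it has loaded the saved KV state.
llm.generate(prompts, SamplingParams(temperature=0, max_tokens=1))
```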
11 | -------------------------------------------------------------------------------- /examples/offline_inference/disaggregated-prefill-v1/run.sh: -------------------------------------------------------------------------------- 1 | rm -rf local_storage/ 2 | 3 | if [ -f "output.txt" ]; then 4 | rm output.txt 5 | fi 6 | 7 | # The directory of current script 8 | SCRIPT_DIR=$(dirname "$(readlink -f "$0")") 9 | 10 | VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/prefill_example.py" 11 | VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/decode_example.py" 12 | -------------------------------------------------------------------------------- /examples/offline_inference/openai_batch/openai_example_batch.jsonl: -------------------------------------------------------------------------------- 1 | {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} 2 | {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} 3 | -------------------------------------------------------------------------------- /examples/online_serving/chart-helm/.helmignore: -------------------------------------------------------------------------------- 1 | *.png 2 | .git/ 3 | ct.yaml 4 | lintconf.yaml 5 | values.schema.json 6 | /workflows -------------------------------------------------------------------------------- /examples/online_serving/chart-helm/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: chart-vllm 3 | description: Chart vllm 4 | 5 | # A chart can be either an 'application' or a 'library' chart. 6 | # 7 | # Application charts are a collection of templates that can be packaged into versioned archives 8 | # to be deployed. 9 | # 10 | # Library charts provide useful utilities or functions for the chart developer. They're included as 11 | # a dependency of application charts to inject those utilities and functions into the rendering 12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 13 | type: application 14 | 15 | # This is the chart version. This version number should be incremented each time you make changes 16 | # to the chart and its templates, including the app version. 
17 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 18 | version: 0.0.1 19 | 20 | maintainers: 21 | - name: mfournioux 22 | -------------------------------------------------------------------------------- /examples/online_serving/chart-helm/ct.yaml: -------------------------------------------------------------------------------- 1 | chart-dirs: 2 | - charts 3 | validate-maintainers: false -------------------------------------------------------------------------------- /examples/online_serving/chart-helm/templates/configmap.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.configs -}} 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: "{{ .Release.Name }}-configs" 6 | namespace: {{ .Release.Namespace }} 7 | data: 8 | {{- with .Values.configs }} 9 | {{- toYaml . | nindent 2 }} 10 | {{- end }} 11 | {{- end -}} -------------------------------------------------------------------------------- /examples/online_serving/chart-helm/templates/custom-objects.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.customObjects }} 2 | {{- range .Values.customObjects }} 3 | {{- tpl (. | toYaml) $ }} 4 | --- 5 | {{- end }} 6 | {{- end }} -------------------------------------------------------------------------------- /examples/online_serving/chart-helm/templates/poddisruptionbudget.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: policy/v1 2 | kind: PodDisruptionBudget 3 | metadata: 4 | name: "{{ .Release.Name }}-pdb" 5 | namespace: {{ .Release.Namespace }} 6 | spec: 7 | maxUnavailable: {{ default 1 .Values.maxUnavailablePodDisruptionBudget }} -------------------------------------------------------------------------------- /examples/online_serving/chart-helm/templates/pvc.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.extraInit }} 2 | apiVersion: v1 3 | kind: PersistentVolumeClaim 4 | metadata: 5 | name: "{{ .Release.Name }}-storage-claim" 6 | namespace: {{ .Release.Namespace }} 7 | spec: 8 | accessModes: 9 | - ReadWriteOnce 10 | resources: 11 | requests: 12 | storage: {{ .Values.extraInit.pvcStorage }} 13 | {{- end }} -------------------------------------------------------------------------------- /examples/online_serving/chart-helm/templates/secrets.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Secret 3 | metadata: 4 | name: "{{ .Release.Name }}-secrets" 5 | namespace: {{ .Release.Namespace }} 6 | type: Opaque 7 | data: 8 | {{- range $key, $val := .Values.secrets }} 9 | {{ $key }}: {{ $val | b64enc | quote }} 10 | {{- end }} -------------------------------------------------------------------------------- /examples/online_serving/chart-helm/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: "{{ .Release.Name }}-service" 5 | namespace: {{ .Release.Namespace }} 6 | spec: 7 | type: ClusterIP 8 | ports: 9 | - name: {{ include "chart.service-port-name" . }} 10 | port: {{ include "chart.service-port" . }} 11 | targetPort: {{ include "chart.container-port-name" . }} 12 | protocol: TCP 13 | selector: 14 | {{- include "chart.labels" . 
| nindent 4 }} -------------------------------------------------------------------------------- /examples/online_serving/disaggregated_serving/README.md: -------------------------------------------------------------------------------- 1 | # Disaggregated Serving 2 | 3 | This example contains scripts that demonstrate the disaggregated serving features of vLLM. 4 | 5 | ## Files 6 | 7 | - `disagg_proxy_demo.py` - Demonstrates XpYd (X prefill instances, Y decode instances). 8 | - `kv_events.sh` - Demonstrates KV cache event publishing. 9 | -------------------------------------------------------------------------------- /examples/online_serving/prometheus_grafana/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # docker-compose.yaml 2 | version: "3" 3 | 4 | services: 5 | prometheus: 6 | image: prom/prometheus:latest 7 | extra_hosts: 8 | - "host.docker.internal:host-gateway" # allow a direct connection from container to the local machine 9 | ports: 10 | - "9090:9090" # the default port used by Prometheus 11 | volumes: 12 | - ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml # mount Prometheus config file 13 | 14 | grafana: 15 | image: grafana/grafana:latest 16 | depends_on: 17 | - prometheus 18 | ports: 19 | - "3000:3000" # the default port used by Grafana 20 | -------------------------------------------------------------------------------- /examples/online_serving/prometheus_grafana/prometheus.yaml: -------------------------------------------------------------------------------- 1 | # prometheus.yaml 2 | global: 3 | scrape_interval: 5s 4 | evaluation_interval: 30s 5 | 6 | scrape_configs: 7 | - job_name: vllm 8 | static_configs: 9 | - targets: 10 | - 'host.docker.internal:8000' 11 | -------------------------------------------------------------------------------- /examples/online_serving/utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | from openai import APIConnectionError, OpenAI 4 | from openai.pagination import SyncPage 5 | from openai.types.model import Model 6 | 7 | 8 | def get_first_model(client: OpenAI) -> str: 9 | """ 10 | Get the first model from the vLLM server. 11 | """ 12 | try: 13 | models: SyncPage[Model] = client.models.list() 14 | except APIConnectionError as e: 15 | raise RuntimeError( 16 | "Failed to get the list of models from the vLLM server at " 17 | f"{client.base_url} with API key {client.api_key}. Check\n" 18 | "1. the server is running\n" 19 | "2. the server URL is correct\n" 20 | "3. 
the API key is correct" 21 | ) from e 22 | 23 | if len(models.data) == 0: 24 | raise RuntimeError(f"No models found on the vLLM server at {client.base_url}") 25 | 26 | return models.data[0].id 27 | -------------------------------------------------------------------------------- /examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml: -------------------------------------------------------------------------------- 1 | local_cpu: False 2 | max_local_cpu_size: 0 3 | #local_disk: 4 | max_local_disk_size: 0 5 | remote_serde: NULL 6 | 7 | enable_nixl: True 8 | nixl_role: "receiver" 9 | nixl_peer_host: "localhost" 10 | nixl_peer_port: 55555 11 | nixl_buffer_size: 1073741824 # 1GB 12 | nixl_buffer_device: "cuda" 13 | nixl_enable_gc: True 14 | -------------------------------------------------------------------------------- /examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml: -------------------------------------------------------------------------------- 1 | local_cpu: False 2 | max_local_cpu_size: 0 3 | #local_disk: 4 | max_local_disk_size: 0 5 | remote_serde: NULL 6 | 7 | enable_nixl: True 8 | nixl_role: "sender" 9 | nixl_peer_host: "localhost" 10 | nixl_peer_port: 55555 11 | nixl_buffer_size: 1073741824 # 1GB 12 | nixl_buffer_device: "cuda" 13 | nixl_enable_gc: True 14 | -------------------------------------------------------------------------------- /examples/template_alpaca.jinja: -------------------------------------------------------------------------------- 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 2 | 3 | {% for message in messages %} 4 | {% if message['role'] == 'user' %} 5 | ### Instruction: 6 | {{ message['content']|trim -}} 7 | {% if not loop.last %} 8 | 9 | 10 | {% endif %} 11 | {% elif message['role'] == 'assistant' %} 12 | ### Response: 13 | {{ message['content']|trim -}} 14 | {% if not loop.last %} 15 | 16 | 17 | {% endif %} 18 | {% elif message['role'] == 'user_context' %} 19 | ### Input: 20 | {{ message['content']|trim -}} 21 | {% if not loop.last %} 22 | 23 | 24 | {% endif %} 25 | {% endif %} 26 | {% endfor %} 27 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} 28 | ### Response: 29 | {% endif %} -------------------------------------------------------------------------------- /examples/template_baichuan.jinja: -------------------------------------------------------------------------------- 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 2 | 3 | {%- for message in messages -%} 4 | {%- if message['role'] == 'user' -%} 5 | {{- '' + message['content'] -}} 6 | {%- elif message['role'] == 'assistant' -%} 7 | {{- '' + message['content'] -}} 8 | {%- endif -%} 9 | {%- endfor -%} 10 | 11 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 12 | {{- '' -}} 13 | {% endif %} -------------------------------------------------------------------------------- /examples/template_chatglm.jinja: -------------------------------------------------------------------------------- 1 | {%- set counter = namespace(index=0) -%} 2 | {%- for message in messages -%} 3 | {%- if message['role'] == 'user' -%} 4 | {{- '[Round ' + counter.index|string + ']\n问:' + message['content'] -}} 5 | {%- set counter.index = counter.index + 1 -%} 6 | {%- endif -%} 7 | {%- if message['role'] == 'assistant' 
-%} 8 | {{- '\n答:' + message['content'] -}} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n' -}} 11 | {%- endif -%} 12 | {%- endif -%} 13 | {%- endfor -%} 14 | 15 | 16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 17 | {{- '\n答:' -}} 18 | {%- endif -%} -------------------------------------------------------------------------------- /examples/template_chatglm2.jinja: -------------------------------------------------------------------------------- 1 | {%- set counter = namespace(index=1) -%} 2 | {%- for message in messages -%} 3 | {%- if message['role'] == 'user' -%} 4 | {{- '[Round ' + counter.index|string + ']\n\n问:' + message['content'] -}} 5 | {%- set counter.index = counter.index + 1 -%} 6 | {%- endif -%} 7 | {%- if message['role'] == 'assistant' -%} 8 | {{- '\n\n答:' + message['content'] -}} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n\n' -}} 11 | {%- endif -%} 12 | {%- endif -%} 13 | {%- endfor -%} 14 | 15 | 16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 17 | {{- '\n\n答:' -}} 18 | {%- endif -%} -------------------------------------------------------------------------------- /examples/template_chatml.jinja: -------------------------------------------------------------------------------- 1 | {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} 2 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} -------------------------------------------------------------------------------- /examples/template_falcon.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {%- if message['role'] == 'user' -%} 3 | {{- 'User: ' + message['content'] -}} 4 | {%- elif message['role'] == 'assistant' -%} 5 | {{- 'Assistant: ' + message['content'] -}} 6 | {%- endif -%} 7 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 8 | {{- '\n' -}} 9 | {%- endif -%} 10 | {%- endfor -%} 11 | 12 | 13 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 14 | {{- 'Assistant:' -}} 15 | {% endif %} -------------------------------------------------------------------------------- /examples/template_falcon_180b.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {%- if message['role'] == 'system' -%} 3 | {{- 'System: ' + message['content'] -}} 4 | {%- elif message['role'] == 'user' -%} 5 | {{- 'User: ' + message['content'] -}} 6 | {%- elif message['role'] == 'assistant' -%} 7 | {{- 'Falcon: ' + message['content'] -}} 8 | {%- endif -%} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n' -}} 11 | {%- endif -%} 12 | {%- endfor -%} 13 | 14 | 15 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 16 | {{- 'Falcon:' -}} 17 | {% endif %} -------------------------------------------------------------------------------- /examples/template_teleflm.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages %} 2 | {%- if message['role'] == 'user' %} 3 | {{- '<_user>' + message['content']|trim }} 4 | {%- elif message['role'] == 'system' %} 5 | {{- '<_system>' + message['content']|trim }} 6 | {%- elif 
message['role'] == 'assistant' %} 7 | {{- '<_bot>' + message['content'] }} 8 | {%- endif %} 9 | {%- endfor %} 10 | {%- if add_generation_prompt %} 11 | {{- '<_bot>' }} 12 | {%- endif %} 13 | -------------------------------------------------------------------------------- /examples/template_vlm2vec.jinja: -------------------------------------------------------------------------------- 1 | {%- if messages | length > 1 -%} 2 | {{ raise_exception('Embedding models should only embed one message at a time') }} 3 | {%- endif -%} 4 | 5 | {% set vars = namespace(parts=[], next_image_id=1) %} 6 | {%- for message in messages -%} 7 | {%- for content in message['content'] -%} 8 | {%- if content['type'] == 'text' -%} 9 | {%- set vars.parts = vars.parts + [content['text']] %} 10 | {%- elif content['type'] == 'image' -%} 11 | {%- set vars.parts = vars.parts + ['<|image_{i:d}|>'.format(i=vars.next_image_id)] %} 12 | {%- set vars.next_image_id = vars.next_image_id + 1 %} 13 | {%- endif -%} 14 | {%- endfor -%} 15 | {%- endfor -%} 16 | {{ vars.parts | join(' ') }} 17 | -------------------------------------------------------------------------------- /format.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "vLLM linting system has been moved from format.sh to pre-commit hooks." 4 | echo "Please run 'pip install -r requirements/lint.txt', followed by" 5 | echo "'pre-commit install' to install the pre-commit hooks." 6 | echo "Then linters will run automatically before each commit." -------------------------------------------------------------------------------- /requirements/build.txt: -------------------------------------------------------------------------------- 1 | # Should be mirrored in pyproject.toml 2 | cmake>=3.26.1 3 | ninja 4 | packaging>=24.2 5 | setuptools>=77.0.3,<80.0.0 6 | setuptools-scm>=8 7 | torch==2.7.0 8 | wheel 9 | jinja2>=3.1.6 10 | regex 11 | -------------------------------------------------------------------------------- /requirements/cuda.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r common.txt 3 | 4 | numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding 5 | numba == 0.61.2; python_version > '3.9' 6 | 7 | # Dependencies for NVIDIA GPUs 8 | ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1. 9 | torch==2.7.0 10 | torchaudio==2.7.0 11 | # These must be updated alongside torch 12 | torchvision==0.22.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version 13 | # https://github.com/facebookresearch/xformers/releases/tag/v0.0.30 14 | xformers==0.0.30; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7 15 | -------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- 1 | -r lint.txt 2 | -r test.txt 3 | 4 | # Avoid adding requirements directly to this file. 5 | # Instead, modify the two files referenced above. 
6 | -------------------------------------------------------------------------------- /requirements/docs.txt: -------------------------------------------------------------------------------- 1 | mkdocs 2 | mkdocs-api-autonav 3 | mkdocs-material 4 | mkdocstrings-python 5 | mkdocs-gen-files 6 | mkdocs-awesome-nav 7 | python-markdown-math 8 | regex 9 | ruff 10 | -------------------------------------------------------------------------------- /requirements/hpu.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r common.txt 3 | 4 | # Dependencies for HPU code 5 | ray 6 | triton==3.1.0 7 | pandas 8 | numpy==1.26.4 9 | tabulate 10 | setuptools>=77.0.3,<80.0.0 11 | setuptools-scm>=8 12 | vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@f1f6624 13 | -------------------------------------------------------------------------------- /requirements/lint.txt: -------------------------------------------------------------------------------- 1 | # formatting 2 | pre-commit==4.0.1 3 | -------------------------------------------------------------------------------- /requirements/neuron.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r common.txt 3 | 4 | # Dependencies for Neuron devices 5 | packaging>=24.2 6 | setuptools>=77.0.3,<80.0.0 7 | torch-neuronx >= 2.5.0 8 | neuronx-cc>=2.0.0a0 9 | torchvision # Required for Llama3.2 multimodal image preprocessing 10 | -------------------------------------------------------------------------------- /requirements/rocm-build.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r common.txt 3 | 4 | --extra-index-url https://download.pytorch.org/whl/rocm6.2.4 5 | torch==2.7.0 6 | torchvision==0.22.0 7 | torchaudio==2.7.0 8 | 9 | triton==3.2 10 | cmake>=3.26.1,<4 11 | packaging>=24.2 12 | setuptools>=77.0.3,<80.0.0 13 | setuptools-scm>=8 14 | wheel 15 | jinja2>=3.1.6 16 | amdsmi==6.2.4 17 | -------------------------------------------------------------------------------- /requirements/rocm-test.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r common.txt 3 | 4 | # entrypoints test 5 | # librosa==0.10.2.post1 # required by audio tests in entrypoints/openai 6 | audioread==3.0.1 7 | cffi==1.17.1 8 | decorator==5.2.1 9 | lazy-loader==0.4 10 | platformdirs==4.3.6 11 | pooch==1.8.2 12 | #pycparse==2.22 13 | soundfile==0.13.1 14 | soxr==0.5.0.post1 15 | librosa==0.10.2.post1 16 | 17 | # entrypoints test 18 | #vllm[video] # required by entrypoints/openai/test_video.py 19 | decord==0.6.0 20 | 21 | # entrypoints test 22 | #sentence-transformers # required by entrypoints/openai/test_score.py 23 | sentence-transformers==3.4.1 24 | 25 | # Basic Models Test 26 | matplotlib==3.10.3 27 | 28 | # Multi-Modal Models Test (Extended) 3 29 | blobfile==3.0.0 30 | 31 | 32 | -------------------------------------------------------------------------------- /requirements/rocm.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r common.txt 3 | 4 | numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. 
Required for N-gram speculative decoding 5 | numba == 0.61.2; python_version > '3.9' 6 | 7 | # Dependencies for AMD GPUs 8 | boto3 9 | botocore 10 | datasets 11 | ray>=2.10.0,<2.45.0 12 | peft 13 | pytest-asyncio 14 | tensorizer>=2.9.0 15 | setuptools-scm>=8 16 | setuptools>=77.0.3,<80.0.0 17 | runai-model-streamer==0.11.0 18 | runai-model-streamer-s3==0.11.0 19 | -------------------------------------------------------------------------------- /requirements/xpu.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r common.txt 3 | 4 | ray>=2.9 5 | cmake>=3.26.1 6 | packaging>=24.2 7 | setuptools-scm>=8 8 | setuptools>=77.0.3,<80.0.0 9 | wheel 10 | jinja2>=3.1.6 11 | datasets # for benchmark scripts 12 | 13 | torch==2.7.0+xpu 14 | torchaudio 15 | torchvision 16 | pytorch-triton-xpu 17 | --extra-index-url=https://download.pytorch.org/whl/xpu 18 | 19 | # Please refer to the XPU doc: intel-extension-for-pytorch 2.6.10+xpu must be installed manually because some of its dependencies conflict with torch 2.6.0+xpu. 20 | # FIXME: This will be fixed in ipex 2.7; the note is left here for awareness. 21 | intel-extension-for-pytorch==2.7.10+xpu 22 | oneccl_bind_pt==2.7.0+xpu 23 | --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ 24 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/__init__.py -------------------------------------------------------------------------------- /tests/async_engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/async_engine/__init__.py -------------------------------------------------------------------------------- /tests/async_engine/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import pytest 4 | 5 | 6 | @pytest.fixture(scope="function", autouse=True) 7 | def use_v0_only(monkeypatch): 8 | """ 9 | Since this module is V0 only, set VLLM_USE_V1=0 for 10 | all tests in the module. 
11 | """ 12 | monkeypatch.setenv('VLLM_USE_V1', '0') 13 | -------------------------------------------------------------------------------- /tests/basic_correctness/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/basic_correctness/__init__.py -------------------------------------------------------------------------------- /tests/basic_correctness/test_cpu_offload.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from ..utils import compare_two_settings 5 | 6 | 7 | def test_cpu_offload(): 8 | compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [], 9 | ["--cpu-offload-gb", "1"]) 10 | -------------------------------------------------------------------------------- /tests/benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/benchmarks/__init__.py -------------------------------------------------------------------------------- /tests/benchmarks/test_latency_cli.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import subprocess 4 | 5 | import pytest 6 | 7 | MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" 8 | 9 | 10 | @pytest.mark.benchmark 11 | def test_bench_latency(): 12 | command = [ 13 | "vllm", "bench", "latency", "--model", MODEL_NAME, "--input-len", "32", 14 | "--output-len", "1", "--enforce-eager", "--load-format", "dummy" 15 | ] 16 | result = subprocess.run(command, capture_output=True, text=True) 17 | print(result.stdout) 18 | print(result.stderr) 19 | 20 | assert result.returncode == 0, f"Benchmark failed: {result.stderr}" 21 | -------------------------------------------------------------------------------- /tests/benchmarks/test_throughput_cli.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import subprocess 4 | 5 | import pytest 6 | 7 | MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" 8 | 9 | 10 | @pytest.mark.benchmark 11 | def test_bench_throughput(): 12 | command = [ 13 | "vllm", "bench", "throughput", "--model", MODEL_NAME, "--input-len", 14 | "32", "--output-len", "1", "--enforce-eager", "--load-format", "dummy" 15 | ] 16 | result = subprocess.run(command, capture_output=True, text=True) 17 | print(result.stdout) 18 | print(result.stderr) 19 | 20 | assert result.returncode == 0, f"Benchmark failed: {result.stderr}" 21 | -------------------------------------------------------------------------------- /tests/compile/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/compile/__init__.py -------------------------------------------------------------------------------- /tests/compile/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import pytest 4 | 5 | 
6 | # TEST V1: this should be removed. Right now V1 overrides 7 | # all the torch compile logic. We should re-enable this 8 | # as we add torch compile support back to V1. 9 | @pytest.fixture(scope="function", autouse=True) 10 | def use_v0_only(monkeypatch): 11 | """ 12 | Since this module is V0 only, set VLLM_USE_V1=0 for 13 | all tests in the module. 14 | """ 15 | monkeypatch.setenv('VLLM_USE_V1', '0') 16 | -------------------------------------------------------------------------------- /tests/compile/piecewise/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/compile/piecewise/__init__.py -------------------------------------------------------------------------------- /tests/config/test_config.yaml: -------------------------------------------------------------------------------- 1 | port: 12312 2 | served_model_name: mymodel 3 | tensor_parallel_size: 2 4 | trust_remote_code: true 5 | multi_step_stream_outputs: false 6 | -------------------------------------------------------------------------------- /tests/config/test_config_with_model.yaml: -------------------------------------------------------------------------------- 1 | # Same as test_config.yaml but with model specified 2 | model: config-model 3 | port: 12312 4 | served_model_name: mymodel 5 | tensor_parallel_size: 2 6 | trust_remote_code: true 7 | multi_step_stream_outputs: false 8 | -------------------------------------------------------------------------------- /tests/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/core/__init__.py -------------------------------------------------------------------------------- /tests/core/block/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/core/block/__init__.py -------------------------------------------------------------------------------- /tests/core/block/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import pytest 5 | 6 | 7 | @pytest.fixture() 8 | def should_do_global_cleanup_after_test() -> bool: 9 | """Disable the global cleanup fixture for tests in this directory. This 10 | provides a ~10x speedup for unit tests that don't load a model to GPU. 11 | 12 | This requires that tests in this directory clean up after themselves if they 13 | use the GPU. 
14 | """ 15 | return False 16 | -------------------------------------------------------------------------------- /tests/core/block/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/core/block/e2e/__init__.py -------------------------------------------------------------------------------- /tests/core/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import pytest 4 | 5 | 6 | @pytest.fixture(scope="function", autouse=True) 7 | def use_v0_only(monkeypatch): 8 | """ 9 | Since this module is V0 only, set VLLM_USE_V1=0 for 10 | all tests in the module. 11 | """ 12 | monkeypatch.setenv('VLLM_USE_V1', '0') 13 | -------------------------------------------------------------------------------- /tests/detokenizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/detokenizer/__init__.py -------------------------------------------------------------------------------- /tests/detokenizer/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import pytest 4 | 5 | 6 | @pytest.fixture(autouse=True) 7 | def v1(run_with_both_engines): 8 | # Simple autouse wrapper to run both engines for each test 9 | # This can be promoted up to conftest.py to run for every 10 | # test in a package 11 | pass 12 | -------------------------------------------------------------------------------- /tests/distributed/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/distributed/__init__.py -------------------------------------------------------------------------------- /tests/distributed/test_distributed_oot.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from ..entrypoints.openai.test_oot_registration import ( 5 | run_and_test_dummy_opt_api_server) 6 | 7 | 8 | def test_distributed_oot(dummy_opt_path: str): 9 | run_and_test_dummy_opt_api_server(dummy_opt_path, tp=2) 10 | -------------------------------------------------------------------------------- /tests/encoder_decoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/encoder_decoder/__init__.py -------------------------------------------------------------------------------- /tests/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/engine/__init__.py -------------------------------------------------------------------------------- /tests/engine/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to 
the vLLM project 3 | import pytest 4 | 5 | 6 | @pytest.fixture(scope="function", autouse=True) 7 | def use_v0_only(monkeypatch): 8 | """ 9 | Since this module is V0 only, set VLLM_USE_V1=0 for 10 | all tests in the module. 11 | """ 12 | monkeypatch.setenv('VLLM_USE_V1', '0') 13 | -------------------------------------------------------------------------------- /tests/entrypoints/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/entrypoints/__init__.py -------------------------------------------------------------------------------- /tests/entrypoints/llm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/entrypoints/llm/__init__.py -------------------------------------------------------------------------------- /tests/entrypoints/llm/test_prompt_validation.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import pytest 5 | 6 | from vllm import LLM 7 | 8 | 9 | @pytest.fixture(autouse=True) 10 | def v1(run_with_both_engines): 11 | # Simple autouse wrapper to run both engines for each test 12 | # This can be promoted up to conftest.py to run for every 13 | # test in a package 14 | pass 15 | 16 | 17 | def test_empty_prompt(): 18 | llm = LLM(model="openai-community/gpt2", enforce_eager=True) 19 | with pytest.raises(ValueError, match='decoder prompt cannot be empty'): 20 | llm.generate([""]) 21 | 22 | 23 | @pytest.mark.skip_v1 24 | def test_out_of_vocab_token(): 25 | llm = LLM(model="openai-community/gpt2", enforce_eager=True) 26 | with pytest.raises(ValueError, match='out of vocabulary'): 27 | llm.generate({"prompt_token_ids": [999999]}) 28 | -------------------------------------------------------------------------------- /tests/entrypoints/offline_mode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/entrypoints/offline_mode/__init__.py -------------------------------------------------------------------------------- /tests/entrypoints/openai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/entrypoints/openai/__init__.py -------------------------------------------------------------------------------- /tests/entrypoints/openai/correctness/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/entrypoints/openai/correctness/__init__.py -------------------------------------------------------------------------------- /tests/entrypoints/openai/tool_parsers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/entrypoints/openai/tool_parsers/__init__.py -------------------------------------------------------------------------------- /tests/fastsafetensors_loader/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/fastsafetensors_loader/__init__.py -------------------------------------------------------------------------------- /tests/fastsafetensors_loader/test_fastsafetensors_loader.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm import SamplingParams 5 | from vllm.config import LoadFormat 6 | 7 | test_model = "openai-community/gpt2" 8 | 9 | prompts = [ 10 | "Hello, my name is", 11 | "The president of the United States is", 12 | "The capital of France is", 13 | "The future of AI is", 14 | ] 15 | # Create a sampling params object. 16 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0) 17 | 18 | 19 | def test_model_loader_download_files(vllm_runner): 20 | with vllm_runner(test_model, 21 | load_format=LoadFormat.FASTSAFETENSORS) as llm: 22 | deserialized_outputs = llm.generate(prompts, sampling_params) 23 | assert deserialized_outputs 24 | -------------------------------------------------------------------------------- /tests/kernels/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/kernels/__init__.py -------------------------------------------------------------------------------- /tests/kernels/allclose_default.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import torch 5 | 6 | # Reference default values of atol and rtol are from 7 | # https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67 8 | default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5} 9 | default_rtol = { 10 | torch.float16: 1e-3, 11 | torch.bfloat16: 1.6e-2, 12 | torch.float: 1.3e-6 13 | } 14 | 15 | 16 | def get_default_atol(output) -> float: 17 | return default_atol[output.dtype] 18 | 19 | 20 | def get_default_rtol(output) -> float: 21 | return default_rtol[output.dtype] 22 | -------------------------------------------------------------------------------- /tests/kernels/attention/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import pytest 5 | 6 | from vllm.utils import (create_kv_caches_with_random, 7 | create_kv_caches_with_random_flash) 8 | 9 | 10 | @pytest.fixture() 11 | def kv_cache_factory(): 12 | return create_kv_caches_with_random 13 | 14 | 15 | @pytest.fixture() 16 | def kv_cache_factory_flashinfer(): 17 | return create_kv_caches_with_random_flash 18 | -------------------------------------------------------------------------------- /tests/kernels/core/test_opcheck.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | """ 4 | Tests for miscellaneous utilities 5 | """ 6 | 7 | import torch 8 | 9 | from tests.kernels.utils import opcheck 10 | 11 | 12 | def test_convert_fp8_opcheck(): 13 | data = torch.randn((256, 256), dtype=torch.float32, device="cuda") 14 | result = 
torch.empty_like(data, dtype=torch.float8_e4m3fn) 15 | opcheck(torch.ops._C_cache_ops.convert_fp8, (result, data, 1.0, "fp8")) 16 | 17 | 18 | # TODO: Add this back, currently fails with 19 | # csrc/cuda_utils_kernels.cu:15 'invalid argument' 20 | # @pytest.mark.skipif(not current_platform.is_cuda(), 21 | # reason="Only supported for CUDA") 22 | # def test_cuda_utils_opcheck(): 23 | # opcheck(torch.ops._C_cuda_utils.get_device_attribute, (0, 0)) 24 | # opcheck( 25 | # torch.ops._C_cuda_utils. 26 | # get_max_shared_memory_per_block_device_attribute, (0, )) 27 | -------------------------------------------------------------------------------- /tests/kernels/core/test_permute_cols.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import pytest 5 | import torch 6 | 7 | from tests.kernels.utils import opcheck 8 | from vllm._custom_ops import permute_cols 9 | 10 | 11 | @pytest.mark.parametrize('shape', [(1, 512), (544, 4096), (67, 8192)]) 12 | @pytest.mark.parametrize('dtype', [torch.bfloat16, torch.float16]) 13 | def test_permute_cols(shape, dtype): 14 | x = torch.randn(shape, dtype=dtype).cuda() 15 | perm = torch.randperm(x.shape[1]).to(torch.int).cuda() 16 | opcheck(torch.ops._C.permute_cols, (x, perm)) 17 | y = permute_cols(x, perm) 18 | torch.testing.assert_close(y, x[:, perm]) -------------------------------------------------------------------------------- /tests/kernels/moe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/kernels/moe/__init__.py -------------------------------------------------------------------------------- /tests/kv_transfer/test_lookup_buffer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | RANK=0 python3 test_lookup_buffer.py & 3 | PID0=$! 4 | RANK=1 python3 test_lookup_buffer.py & 5 | PID1=$! 6 | 7 | wait $PID0 8 | wait $PID1 9 | -------------------------------------------------------------------------------- /tests/kv_transfer/test_send_recv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RANK=0 python3 test_send_recv.py & 4 | PID0=$! 5 | RANK=1 python3 test_send_recv.py & 6 | PID1=$! 
7 | 8 | wait $PID0 9 | wait $PID1 10 | -------------------------------------------------------------------------------- /tests/lora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/lora/__init__.py -------------------------------------------------------------------------------- /tests/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/metrics/__init__.py -------------------------------------------------------------------------------- /tests/mistral_tool_use/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/mistral_tool_use/__init__.py -------------------------------------------------------------------------------- /tests/model_executor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/model_executor/__init__.py -------------------------------------------------------------------------------- /tests/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/__init__.py -------------------------------------------------------------------------------- /tests/models/language/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/language/__init__.py -------------------------------------------------------------------------------- /tests/models/language/generation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/language/generation/__init__.py -------------------------------------------------------------------------------- /tests/models/language/pooling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/language/pooling/__init__.py -------------------------------------------------------------------------------- /tests/models/multimodal/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/multimodal/__init__.py -------------------------------------------------------------------------------- /tests/models/multimodal/generation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/multimodal/generation/__init__.py -------------------------------------------------------------------------------- /tests/models/multimodal/generation/vlm_utils/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/multimodal/generation/vlm_utils/__init__.py -------------------------------------------------------------------------------- /tests/models/multimodal/pooling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/multimodal/pooling/__init__.py -------------------------------------------------------------------------------- /tests/models/multimodal/processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/multimodal/processing/__init__.py -------------------------------------------------------------------------------- /tests/models/quantization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/models/quantization/__init__.py -------------------------------------------------------------------------------- /tests/mq_llm_engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/mq_llm_engine/__init__.py -------------------------------------------------------------------------------- /tests/mq_llm_engine/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import pytest 4 | 5 | 6 | @pytest.fixture(scope="function", autouse=True) 7 | def use_v0_only(monkeypatch): 8 | """ 9 | Since this module is V0 only, set VLLM_USE_V1=0 for 10 | all tests in the module. 
11 | """ 12 | monkeypatch.setenv('VLLM_USE_V1', '0') 13 | -------------------------------------------------------------------------------- /tests/multi_step/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/multi_step/__init__.py -------------------------------------------------------------------------------- /tests/multimodal/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/multimodal/__init__.py -------------------------------------------------------------------------------- /tests/multimodal/assets/image1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/multimodal/assets/image1.png -------------------------------------------------------------------------------- /tests/multimodal/assets/image2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/multimodal/assets/image2.png -------------------------------------------------------------------------------- /tests/multimodal/assets/rgba.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/multimodal/assets/rgba.png -------------------------------------------------------------------------------- /tests/multimodal/utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import numpy as np 5 | from PIL import Image 6 | 7 | 8 | def random_image(rng: np.random.RandomState, min_wh: int, max_wh: int): 9 | w, h = rng.randint(min_wh, max_wh, size=(2, )) 10 | arr = rng.randint(0, 255, size=(w, h, 3), dtype=np.uint8) 11 | return Image.fromarray(arr) 12 | 13 | 14 | def random_video( 15 | rng: np.random.RandomState, 16 | min_frames: int, 17 | max_frames: int, 18 | min_wh: int, 19 | max_wh: int, 20 | ): 21 | num_frames = rng.randint(min_frames, max_frames) 22 | w, h = rng.randint(min_wh, max_wh, size=(2, )) 23 | return rng.randint(0, 255, size=(num_frames, w, h, 3), dtype=np.uint8) 24 | 25 | 26 | def random_audio( 27 | rng: np.random.RandomState, 28 | min_len: int, 29 | max_len: int, 30 | sr: int, 31 | ): 32 | audio_len = rng.randint(min_len, max_len) 33 | return rng.rand(audio_len), sr 34 | -------------------------------------------------------------------------------- /tests/neuron/1_core/test_neuron_quant.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | from vllm.model_executor.layers.quantization.neuron_quant import ( 4 | NeuronQuantConfig) 5 | 6 | 7 | def test_get_supported_act_dtypes(): 8 | neuron_quant_config = NeuronQuantConfig() 9 | supported_act_dtypes = neuron_quant_config.get_supported_act_dtypes() 10 | target_list = ["any_dtype1", "any_dtype2"] 11 | for dtype in target_list: 12 | assert dtype in supported_act_dtypes 13 | -------------------------------------------------------------------------------- 
/tests/plugins/lora_resolvers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/plugins/lora_resolvers/__init__.py -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_model/setup.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from setuptools import setup 5 | 6 | setup(name='vllm_add_dummy_model', 7 | version='0.1', 8 | packages=['vllm_add_dummy_model'], 9 | entry_points={ 10 | 'vllm.general_plugins': 11 | ["register_dummy_model = vllm_add_dummy_model:register"] 12 | }) 13 | -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm import ModelRegistry 5 | 6 | 7 | def register(): 8 | # Test directly passing the model 9 | from .my_opt import MyOPTForCausalLM 10 | 11 | if "MyOPTForCausalLM" not in ModelRegistry.get_supported_archs(): 12 | ModelRegistry.register_model("MyOPTForCausalLM", MyOPTForCausalLM) 13 | 14 | # Test passing lazy model 15 | if "MyGemma2Embedding" not in ModelRegistry.get_supported_archs(): 16 | ModelRegistry.register_model( 17 | "MyGemma2Embedding", 18 | "vllm_add_dummy_model.my_gemma_embedding:MyGemma2Embedding", 19 | ) 20 | 21 | if "MyLlava" not in ModelRegistry.get_supported_archs(): 22 | ModelRegistry.register_model("MyLlava", 23 | "vllm_add_dummy_model.my_llava:MyLlava") 24 | -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from typing import Optional 5 | 6 | import torch 7 | 8 | from vllm.model_executor.models.opt import OPTForCausalLM 9 | from vllm.model_executor.sampling_metadata import SamplingMetadata 10 | 11 | 12 | class MyOPTForCausalLM(OPTForCausalLM): 13 | 14 | def compute_logits( 15 | self, hidden_states: torch.Tensor, 16 | sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]: 17 | # this dummy model always predicts the first token 18 | logits = super().compute_logits(hidden_states, sampling_metadata) 19 | if logits is not None: 20 | logits.zero_() 21 | logits[:, 0] += 1.0 22 | return logits 23 | -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_platform/setup.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from setuptools import setup 5 | 6 | setup( 7 | name='vllm_add_dummy_platform', 8 | version='0.1', 9 | packages=['vllm_add_dummy_platform'], 10 | entry_points={ 11 | 'vllm.platform_plugins': [ 12 | "dummy_platform_plugin = vllm_add_dummy_platform:dummy_platform_plugin" # noqa 13 | ] 14 | }) 15 | -------------------------------------------------------------------------------- 
/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from typing import Optional 5 | 6 | 7 | def dummy_platform_plugin() -> Optional[str]: 8 | return "vllm_add_dummy_platform.dummy_platform.DummyPlatform" 9 | -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.attention.backends.flash_attn import FlashAttentionBackend 5 | 6 | 7 | class DummyAttentionBackend(FlashAttentionBackend): 8 | 9 | @staticmethod 10 | def get_name() -> str: 11 | return "Dummy_Backend" 12 | -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.platforms.cuda import CudaPlatform 5 | 6 | 7 | class DummyPlatform(CudaPlatform): 8 | device_name = "DummyDevice" 9 | 10 | def get_attn_backend_cls(self, backend_name, head_size, dtype, 11 | kv_cache_dtype, block_size, use_v1, use_mla): 12 | return "vllm_add_dummy_platform.dummy_attention_backend.DummyAttentionBackend" # noqa E501 13 | -------------------------------------------------------------------------------- /tests/plugins_tests/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import pytest 4 | 5 | 6 | @pytest.fixture(scope="function", autouse=True) 7 | def use_v0_only(monkeypatch): 8 | """ 9 | Since this module is V0 only, set VLLM_USE_V1=0 for 10 | all tests in the module. 11 | """ 12 | monkeypatch.setenv('VLLM_USE_V1', '0') -------------------------------------------------------------------------------- /tests/prefix_caching/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/prefix_caching/__init__.py -------------------------------------------------------------------------------- /tests/prompts/example.txt: -------------------------------------------------------------------------------- 1 | vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. 2 | Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020. 3 | Compare and contrast artificial intelligence with human intelligence in terms of processing information. 4 | Describe the basic components of a neural network and how it can be trained. 5 | Write a short story about a robot that dreams for the first time. 6 | Analyze the impact of the COVID-19 pandemic on global economic structures and future business models. 7 | Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies. 
8 | Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.' 9 | -------------------------------------------------------------------------------- /tests/quantization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/quantization/__init__.py -------------------------------------------------------------------------------- /tests/quantization/utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.model_executor.layers.quantization import get_quantization_config 5 | from vllm.platforms import current_platform 6 | 7 | 8 | def is_quant_method_supported(quant_method: str) -> bool: 9 | # Currently, all quantization methods require Nvidia or AMD GPUs 10 | if not (current_platform.is_cuda() or current_platform.is_rocm()): 11 | return False 12 | 13 | capability = current_platform.get_device_capability() 14 | assert capability is not None 15 | 16 | min_capability = get_quantization_config(quant_method).get_min_capability() 17 | 18 | return capability.to_int() >= min_capability 19 | -------------------------------------------------------------------------------- /tests/reasoning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/reasoning/__init__.py -------------------------------------------------------------------------------- /tests/runai_model_streamer_test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/runai_model_streamer_test/__init__.py -------------------------------------------------------------------------------- /tests/samplers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/samplers/__init__.py -------------------------------------------------------------------------------- /tests/spec_decode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/spec_decode/__init__.py -------------------------------------------------------------------------------- /tests/spec_decode/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import pytest 4 | 5 | 6 | @pytest.fixture(scope="function", autouse=True) 7 | def use_v0_only(monkeypatch): 8 | """ 9 | Since this module is V0 only, set VLLM_USE_V1=0 for 10 | all tests in the module. 
11 | """ 12 | monkeypatch.setenv('VLLM_USE_V1', '0') 13 | -------------------------------------------------------------------------------- /tests/spec_decode/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/spec_decode/e2e/__init__.py -------------------------------------------------------------------------------- /tests/standalone_tests/python_only_compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script tests if the python only compilation works correctly 3 | # for users who do not have any compilers installed on their system 4 | 5 | set -e 6 | set -x 7 | 8 | cd /vllm-workspace/ 9 | 10 | # uninstall vllm 11 | pip3 uninstall -y vllm 12 | # restore the original files 13 | mv test_docs/vllm ./vllm 14 | 15 | # remove all compilers 16 | apt remove --purge build-essential -y 17 | apt autoremove -y 18 | 19 | echo 'import os; os.system("touch /tmp/changed.file")' >> vllm/__init__.py 20 | 21 | VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL=1 VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e . 22 | 23 | # Run the script 24 | python3 -c 'import vllm' 25 | 26 | # Check if the clangd log file was created 27 | if [ ! -f /tmp/changed.file ]; then 28 | echo "changed.file was not created, python only compilation failed" 29 | exit 1 30 | fi 31 | -------------------------------------------------------------------------------- /tests/tensorizer_loader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/tensorizer_loader/__init__.py -------------------------------------------------------------------------------- /tests/tensorizer_loader/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import pytest 4 | 5 | from vllm.distributed import cleanup_dist_env_and_memory 6 | from vllm.model_executor.model_loader.tensorizer import TensorizerConfig 7 | 8 | 9 | @pytest.fixture(autouse=True) 10 | def cleanup(): 11 | cleanup_dist_env_and_memory(shutdown_ray=True) 12 | 13 | 14 | @pytest.fixture(autouse=True) 15 | def tensorizer_config(): 16 | config = TensorizerConfig(tensorizer_uri="vllm") 17 | return config 18 | -------------------------------------------------------------------------------- /tests/test_embedded_commit.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import vllm 5 | 6 | 7 | def test_embedded_commit_defined(): 8 | assert hasattr(vllm, "__version__") 9 | assert hasattr(vllm, "__version_tuple__") 10 | assert vllm.__version__ != "dev" 11 | assert vllm.__version_tuple__ != (0, 0, "dev") 12 | -------------------------------------------------------------------------------- /tests/test_outputs.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.outputs import RequestOutput 5 | 6 | 7 | def test_request_output_forward_compatible(): 8 | output = RequestOutput(request_id="test_request_id", 9 | prompt="test 
prompt", 10 | prompt_token_ids=[1, 2, 3], 11 | prompt_logprobs=None, 12 | outputs=[], 13 | finished=False, 14 | example_arg_added_in_new_version="some_value") 15 | assert output is not None 16 | -------------------------------------------------------------------------------- /tests/test_seed_behavior.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import random 4 | 5 | import numpy as np 6 | import torch 7 | 8 | from vllm.platforms.interface import Platform 9 | 10 | 11 | def test_seed_behavior(): 12 | # Test with a specific seed 13 | Platform.seed_everything(42) 14 | random_value_1 = random.randint(0, 100) 15 | np_random_value_1 = np.random.randint(0, 100) 16 | torch_random_value_1 = torch.randint(0, 100, (1, )).item() 17 | 18 | Platform.seed_everything(42) 19 | random_value_2 = random.randint(0, 100) 20 | np_random_value_2 = np.random.randint(0, 100) 21 | torch_random_value_2 = torch.randint(0, 100, (1, )).item() 22 | 23 | assert random_value_1 == random_value_2 24 | assert np_random_value_1 == np_random_value_2 25 | assert torch_random_value_1 == torch_random_value_2 26 | -------------------------------------------------------------------------------- /tests/tokenization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/tokenization/__init__.py -------------------------------------------------------------------------------- /tests/tokenization/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import pytest 5 | from transformers import PreTrainedTokenizerBase 6 | 7 | from vllm.transformers_utils.tokenizer import get_tokenizer 8 | 9 | TOKENIZER_NAMES = [ 10 | "facebook/opt-125m", 11 | "gpt2", 12 | ] 13 | 14 | 15 | @pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES) 16 | def test_tokenizer_revision(tokenizer_name: str): 17 | # Assume that "main" branch always exists 18 | tokenizer = get_tokenizer(tokenizer_name, revision="main") 19 | assert isinstance(tokenizer, PreTrainedTokenizerBase) 20 | 21 | # Assume that "never" branch always does not exist 22 | with pytest.raises(OSError, match='not a valid git identifier'): 23 | get_tokenizer(tokenizer_name, revision="never") 24 | -------------------------------------------------------------------------------- /tests/tool_use/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/tool_use/__init__.py -------------------------------------------------------------------------------- /tests/tpu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/tpu/__init__.py -------------------------------------------------------------------------------- /tests/tpu/lora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/tpu/lora/__init__.py 
-------------------------------------------------------------------------------- /tests/tracing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/tracing/__init__.py -------------------------------------------------------------------------------- /tests/v1/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/__init__.py -------------------------------------------------------------------------------- /tests/v1/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/e2e/__init__.py -------------------------------------------------------------------------------- /tests/v1/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/engine/__init__.py -------------------------------------------------------------------------------- /tests/v1/entrypoints/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/entrypoints/__init__.py -------------------------------------------------------------------------------- /tests/v1/entrypoints/llm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/entrypoints/llm/__init__.py -------------------------------------------------------------------------------- /tests/v1/kv_connector/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/kv_connector/unit/__init__.py -------------------------------------------------------------------------------- /tests/v1/sample/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/sample/__init__.py -------------------------------------------------------------------------------- /tests/v1/shutdown/utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | """Shutdown test utils""" 4 | 5 | SHUTDOWN_TEST_TIMEOUT_SEC = 120 6 | SHUTDOWN_TEST_THRESHOLD_BYTES = 2 * 2**30 7 | -------------------------------------------------------------------------------- /tests/v1/structured_output/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/structured_output/__init__.py -------------------------------------------------------------------------------- /tests/v1/tpu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/tpu/__init__.py 
-------------------------------------------------------------------------------- /tests/v1/tpu/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/tpu/worker/__init__.py -------------------------------------------------------------------------------- /tests/v1/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/v1/worker/__init__.py -------------------------------------------------------------------------------- /tests/vllm_test_utils/setup.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from setuptools import setup 5 | 6 | setup( 7 | name='vllm_test_utils', 8 | version='0.1', 9 | packages=['vllm_test_utils'], 10 | ) 11 | -------------------------------------------------------------------------------- /tests/vllm_test_utils/vllm_test_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | """ 4 | vllm_utils is a package for vLLM testing utilities. 5 | It does not import any vLLM modules. 6 | """ 7 | 8 | from .blame import BlameResult, blame 9 | from .monitor import MonitoredValues, monitor 10 | 11 | __all__ = ["blame", "BlameResult", "monitor", "MonitoredValues"] 12 | -------------------------------------------------------------------------------- /tests/weight_loading/models-large.txt: -------------------------------------------------------------------------------- 1 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main 2 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main 3 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W8A16-quantized, main 4 | compressed-tensors, nm-testing/test-w4a16-mixtral-actorder-group, main 5 | gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, main 6 | gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, gptq-8bit-128g-actorder_True 7 | awq_marlin, casperhansen/deepseek-coder-v2-instruct-awq, main 8 | compressed-tensors, RedHatAI/Llama-4-Scout-17B-16E-Instruct-quantized.w4a16, main -------------------------------------------------------------------------------- /tests/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/tests/worker/__init__.py -------------------------------------------------------------------------------- /tests/worker/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import pytest 4 | 5 | 6 | @pytest.fixture(scope="function", autouse=True) 7 | def use_v0_only(monkeypatch): 8 | """ 9 | This module tests V0 internals, so set VLLM_USE_V1=0. 
10 | """ 11 | monkeypatch.setenv('VLLM_USE_V1', '0') -------------------------------------------------------------------------------- /tools/check_repo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Checks whether the repo is clean and whether tags are available (necessary to correctly produce vllm version at build time) 3 | 4 | if ! git diff --quiet; then 5 | echo "Repo is dirty" >&2 6 | 7 | exit 1 8 | fi 9 | 10 | if ! git describe --tags; then 11 | echo "No tags are present. Is this a shallow clone? git fetch --unshallow --tags" >&2 12 | 13 | exit 1 14 | fi 15 | -------------------------------------------------------------------------------- /tools/ep_kernels/install_system_drivers.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | # prepare workspace directory 4 | WORKSPACE=$1 5 | if [ -z "$WORKSPACE" ]; then 6 | export WORKSPACE=$(pwd)/ep_kernels_workspace 7 | fi 8 | 9 | if [ ! -d "$WORKSPACE" ]; then 10 | mkdir -p $WORKSPACE 11 | fi 12 | 13 | # build and install gdrcopy driver 14 | pushd $WORKSPACE 15 | cd gdrcopy_src 16 | ./insmod.sh 17 | # run gdrcopy_copybw to test the installation 18 | $WORKSPACE/gdrcopy_install/bin/gdrcopy_copybw 19 | 20 | # turn on IBGDA 21 | echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;"' | tee -a /etc/modprobe.d/nvidia.conf 22 | update-initramfs -u 23 | 24 | echo "Please reboot the system to apply the changes" 25 | -------------------------------------------------------------------------------- /tools/ep_kernels/install_system_libraries.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | # prepare workspace directory 4 | WORKSPACE=$1 5 | if [ -z "$WORKSPACE" ]; then 6 | export WORKSPACE=$(pwd)/ep_kernels_workspace 7 | fi 8 | 9 | if [ ! 
-d "$WORKSPACE" ]; then 10 | mkdir -p $WORKSPACE 11 | fi 12 | 13 | # build and install gdrcopy system packages 14 | pushd $WORKSPACE 15 | cd gdrcopy_src/packages 16 | apt install devscripts -y 17 | CUDA=${CUDA_HOME:-/usr/local/cuda} ./build-deb-packages.sh 18 | dpkg -i *.deb 19 | -------------------------------------------------------------------------------- /tools/mypy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CI=${1:-0} 4 | PYTHON_VERSION=${2:-local} 5 | 6 | if [ "$CI" -eq 1 ]; then 7 | set -e 8 | fi 9 | 10 | if [ $PYTHON_VERSION == "local" ]; then 11 | PYTHON_VERSION=$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') 12 | fi 13 | 14 | run_mypy() { 15 | echo "Running mypy on $1" 16 | if [ "$CI" -eq 1 ] && [ -z "$1" ]; then 17 | mypy --python-version "${PYTHON_VERSION}" "$@" 18 | return 19 | fi 20 | mypy --follow-imports skip --python-version "${PYTHON_VERSION}" "$@" 21 | } 22 | 23 | run_mypy # Note that this is less strict than CI 24 | run_mypy tests 25 | run_mypy vllm/attention 26 | run_mypy vllm/compilation 27 | run_mypy vllm/distributed 28 | run_mypy vllm/engine 29 | run_mypy vllm/executor 30 | run_mypy vllm/inputs 31 | run_mypy vllm/lora 32 | run_mypy vllm/model_executor 33 | run_mypy vllm/plugins 34 | run_mypy vllm/prompt_adapter 35 | run_mypy vllm/spec_decode 36 | run_mypy vllm/worker 37 | run_mypy vllm/v1 38 | -------------------------------------------------------------------------------- /tools/png-lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Ensure that *.excalidraw.png files have the excalidraw metadata 4 | # embedded in them. This ensures they can be loaded back into 5 | # the tool and edited in the future. 6 | 7 | find . -iname '*.excalidraw.png' | while read -r file; do 8 | if git check-ignore -q "$file"; then 9 | continue 10 | fi 11 | if ! grep -q "excalidraw+json" "$file"; then 12 | echo "$file was not exported from excalidraw with 'Embed Scene' enabled." 13 | exit 1 14 | fi 15 | done 16 | -------------------------------------------------------------------------------- /tools/shellcheck.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | scversion="stable" 5 | 6 | if [ -d "shellcheck-${scversion}" ]; then 7 | export PATH="$PATH:$(pwd)/shellcheck-${scversion}" 8 | fi 9 | 10 | if ! [ -x "$(command -v shellcheck)" ]; then 11 | if [ "$(uname -s)" != "Linux" ] || [ "$(uname -m)" != "x86_64" ]; then 12 | echo "Please install shellcheck: https://github.com/koalaman/shellcheck?tab=readme-ov-file#installing" 13 | exit 1 14 | fi 15 | 16 | # automatic local install if linux x86_64 17 | wget -qO- "https://github.com/koalaman/shellcheck/releases/download/${scversion?}/shellcheck-${scversion?}.linux.x86_64.tar.xz" | tar -xJv 18 | export PATH="$PATH:$(pwd)/shellcheck-${scversion}" 19 | fi 20 | 21 | # TODO - fix warnings in .buildkite/scripts/hardware_ci/run-amd-test.sh 22 | find . 
-name "*.sh" ".git" -prune -not -path "./.buildkite/scripts/hardware_ci/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck -s bash "{}"' 23 | -------------------------------------------------------------------------------- /use_existing_torch.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import glob 5 | 6 | requires_files = glob.glob('requirements/*.txt') 7 | requires_files += ["pyproject.toml"] 8 | for file in requires_files: 9 | print(f">>> cleaning {file}") 10 | with open(file) as f: 11 | lines = f.readlines() 12 | if "torch" in "".join(lines).lower(): 13 | print("removed:") 14 | with open(file, 'w') as f: 15 | for line in lines: 16 | if 'torch' not in line.lower(): 17 | f.write(line) 18 | else: 19 | print(line.strip()) 20 | print(f"<<< done cleaning {file}") 21 | print() 22 | -------------------------------------------------------------------------------- /vllm/adapter_commons/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/adapter_commons/__init__.py -------------------------------------------------------------------------------- /vllm/adapter_commons/layers.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from dataclasses import dataclass 5 | 6 | 7 | @dataclass 8 | class AdapterMapping: 9 | # Per every token in input_ids: 10 | index_mapping: tuple[int, ...] 11 | # Per sampled token: 12 | prompt_mapping: tuple[int, ...] 13 | 14 | def __post_init__(self): 15 | self.index_mapping = tuple(self.index_mapping) 16 | self.prompt_mapping = tuple(self.prompt_mapping) -------------------------------------------------------------------------------- /vllm/adapter_commons/request.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from abc import ABC, abstractmethod 5 | 6 | 7 | class AdapterRequest(ABC): 8 | """ 9 | Base class for adapter requests. 
10 | """ 11 | 12 | @property 13 | @abstractmethod 14 | def adapter_id(self) -> int: 15 | raise NotImplementedError 16 | 17 | def __post_init__(self) -> None: 18 | if self.adapter_id < 1: 19 | raise ValueError(f"id must be > 0, got {self.adapter_id}") 20 | 21 | def __eq__(self, value: object) -> bool: 22 | return isinstance( 23 | value, self.__class__) and self.adapter_id == value.adapter_id 24 | 25 | def __hash__(self) -> int: 26 | return hash(self.adapter_id) 27 | -------------------------------------------------------------------------------- /vllm/assets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/assets/__init__.py -------------------------------------------------------------------------------- /vllm/attention/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.attention.backends.abstract import (AttentionBackend, 5 | AttentionMetadata, 6 | AttentionMetadataBuilder, 7 | AttentionState, AttentionType) 8 | from vllm.attention.layer import Attention 9 | from vllm.attention.selector import get_attn_backend 10 | 11 | __all__ = [ 12 | "Attention", 13 | "AttentionBackend", 14 | "AttentionMetadata", 15 | "AttentionType", 16 | "AttentionMetadataBuilder", 17 | "Attention", 18 | "AttentionState", 19 | "get_attn_backend", 20 | ] 21 | -------------------------------------------------------------------------------- /vllm/attention/backends/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/attention/backends/__init__.py -------------------------------------------------------------------------------- /vllm/attention/backends/mla/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/attention/backends/mla/__init__.py -------------------------------------------------------------------------------- /vllm/attention/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/attention/ops/__init__.py -------------------------------------------------------------------------------- /vllm/attention/ops/blocksparse_attention/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/attention/ops/blocksparse_attention/__init__.py -------------------------------------------------------------------------------- /vllm/benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/benchmarks/__init__.py -------------------------------------------------------------------------------- /vllm/compilation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/compilation/__init__.py 
-------------------------------------------------------------------------------- /vllm/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/core/__init__.py -------------------------------------------------------------------------------- /vllm/core/block/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/core/block/__init__.py -------------------------------------------------------------------------------- /vllm/device_allocator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/device_allocator/__init__.py -------------------------------------------------------------------------------- /vllm/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from .communication_op import * 5 | from .parallel_state import * 6 | from .utils import * 7 | -------------------------------------------------------------------------------- /vllm/distributed/device_communicators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/distributed/device_communicators/__init__.py -------------------------------------------------------------------------------- /vllm/distributed/device_communicators/neuron_communicator.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import torch 4 | 5 | from vllm.distributed.device_communicators.base_device_communicator import ( 6 | DeviceCommunicatorBase) 7 | from vllm.platforms import current_platform 8 | 9 | if current_platform.is_neuron(): 10 | import torch_xla.core.xla_model as xm 11 | 12 | 13 | class NeuronCommunicator(DeviceCommunicatorBase): 14 | 15 | def all_reduce(self, x: torch.Tensor) -> torch.Tensor: 16 | return xm.all_reduce(xm.REDUCE_SUM, x) 17 | 18 | def all_gather(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor: 19 | assert dim == -1, "Neuron only supports dim=-1 for all-gather." 
20 | return xm.all_gather(x, dim=dim) 21 | -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.distributed.kv_transfer.kv_transfer_state import ( 5 | KVConnectorBaseType, ensure_kv_transfer_initialized, get_kv_transfer_group, 6 | has_kv_transfer_group, is_v1_kv_transfer_group) 7 | 8 | __all__ = [ 9 | "get_kv_transfer_group", "has_kv_transfer_group", 10 | "is_v1_kv_transfer_group", "ensure_kv_transfer_initialized", 11 | "KVConnectorBaseType" 12 | ] 13 | -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/kv_connector/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/distributed/kv_transfer/kv_connector/__init__.py -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/kv_connector/v1/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | from vllm.distributed.kv_transfer.kv_connector.v1.base import ( 4 | KVConnectorBase_V1, KVConnectorRole) 5 | 6 | __all__ = ["KVConnectorRole", "KVConnectorBase_V1"] 7 | -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/kv_pipe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/distributed/kv_transfer/kv_pipe/__init__.py -------------------------------------------------------------------------------- /vllm/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/engine/__init__.py -------------------------------------------------------------------------------- /vllm/engine/output_processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/engine/output_processor/__init__.py -------------------------------------------------------------------------------- /vllm/entrypoints/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/entrypoints/__init__.py -------------------------------------------------------------------------------- /vllm/entrypoints/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/entrypoints/cli/__init__.py -------------------------------------------------------------------------------- /vllm/entrypoints/cli/benchmark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/entrypoints/cli/benchmark/__init__.py -------------------------------------------------------------------------------- /vllm/entrypoints/cli/benchmark/serve.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import argparse 4 | 5 | from vllm.benchmarks.serve import add_cli_args, main 6 | from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase 7 | from vllm.entrypoints.cli.types import CLISubcommand 8 | 9 | 10 | class BenchmarkServingSubcommand(BenchmarkSubcommandBase): 11 | """ The `serve` subcommand for vllm bench. """ 12 | 13 | def __init__(self): 14 | self.name = "serve" 15 | super().__init__() 16 | 17 | @property 18 | def help(self) -> str: 19 | return "Benchmark the online serving throughput." 20 | 21 | def add_cli_args(self, parser: argparse.ArgumentParser) -> None: 22 | add_cli_args(parser) 23 | 24 | @staticmethod 25 | def cmd(args: argparse.Namespace) -> None: 26 | main(args) 27 | 28 | 29 | def cmd_init() -> list[CLISubcommand]: 30 | return [BenchmarkServingSubcommand()] 31 | -------------------------------------------------------------------------------- /vllm/entrypoints/cli/types.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import argparse 5 | 6 | from vllm.utils import FlexibleArgumentParser 7 | 8 | 9 | class CLISubcommand: 10 | """Base class for CLI argument handlers.""" 11 | 12 | name: str 13 | 14 | @staticmethod 15 | def cmd(args: argparse.Namespace) -> None: 16 | raise NotImplementedError("Subclasses should implement this method") 17 | 18 | def validate(self, args: argparse.Namespace) -> None: 19 | # No validation by default 20 | pass 21 | 22 | def subparser_init( 23 | self, 24 | subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser: 25 | raise NotImplementedError("Subclasses should implement this method") 26 | -------------------------------------------------------------------------------- /vllm/entrypoints/openai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/entrypoints/openai/__init__.py -------------------------------------------------------------------------------- /vllm/executor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/executor/__init__.py -------------------------------------------------------------------------------- 
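Referring back to `vllm/entrypoints/cli/types.py` above: the base class deliberately leaves `cmd` and `subparser_init` unimplemented, and concrete subcommands such as `BenchmarkServingSubcommand` fill them in. A minimal, hypothetical sketch of that pattern follows; the `EchoSubcommand` name and its argument are invented for illustration and are not part of the repository.

```python
# Hypothetical example (not part of the repo) of subclassing CLISubcommand,
# mirroring the pattern used by BenchmarkServingSubcommand above.
import argparse

from vllm.entrypoints.cli.types import CLISubcommand
from vllm.utils import FlexibleArgumentParser


class EchoSubcommand(CLISubcommand):
    """Toy subcommand that prints its argument back to stdout."""

    def __init__(self):
        self.name = "echo"
        super().__init__()

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        print(args.message)

    def subparser_init(
            self,
            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
        parser = subparsers.add_parser(self.name, help="Echo a message.")
        parser.add_argument("message", type=str, help="Text to print back.")
        return parser
```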
/vllm/logging_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.logging_utils.formatter import NewLineFormatter 5 | 6 | __all__ = [ 7 | "NewLineFormatter", 8 | ] 9 | -------------------------------------------------------------------------------- /vllm/logging_utils/formatter.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import logging 5 | 6 | 7 | class NewLineFormatter(logging.Formatter): 8 | """Adds logging prefix to newlines to align multi-line messages.""" 9 | 10 | def __init__(self, fmt, datefmt=None, style="%"): 11 | logging.Formatter.__init__(self, fmt, datefmt, style) 12 | 13 | def format(self, record): 14 | msg = logging.Formatter.format(self, record) 15 | if record.message != "": 16 | parts = msg.split(record.message) 17 | msg = msg.replace("\n", "\r\n" + parts[0]) 18 | return msg 19 | -------------------------------------------------------------------------------- /vllm/lora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/lora/__init__.py -------------------------------------------------------------------------------- /vllm/lora/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/lora/ops/__init__.py -------------------------------------------------------------------------------- /vllm/lora/ops/torch_ops/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.lora.ops.torch_ops.lora_ops import bgmv_expand # noqa: F401 5 | from vllm.lora.ops.torch_ops.lora_ops import (bgmv_expand_slice, bgmv_shrink, 6 | sgmv_expand, sgmv_expand_slice, 7 | sgmv_shrink) 8 | 9 | __all__ = [ 10 | "bgmv_expand", 11 | "bgmv_expand_slice", 12 | "bgmv_shrink", 13 | "sgmv_expand", 14 | "sgmv_expand_slice", 15 | "sgmv_shrink", 16 | ] 17 | -------------------------------------------------------------------------------- /vllm/lora/ops/triton_ops/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.lora.ops.triton_ops.lora_expand_op import lora_expand 5 | from vllm.lora.ops.triton_ops.lora_kernel_metadata import LoRAKernelMeta 6 | from vllm.lora.ops.triton_ops.lora_shrink_op import lora_shrink 7 | 8 | __all__ = [ 9 | "lora_expand", 10 | "lora_shrink", 11 | "LoRAKernelMeta", 12 | ] 13 | -------------------------------------------------------------------------------- /vllm/lora/ops/xla_ops/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.lora.ops.xla_ops.lora_ops import (bgmv_expand, bgmv_expand_slice, 5 | bgmv_shrink) 6 | 7 | __all__ = ["bgmv_expand", "bgmv_expand_slice", "bgmv_shrink"] 8 | 
-------------------------------------------------------------------------------- /vllm/lora/punica_wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase 5 | from vllm.lora.punica_wrapper.punica_selector import get_punica_wrapper 6 | 7 | __all__ = [ 8 | "PunicaWrapperBase", 9 | "get_punica_wrapper", 10 | ] 11 | -------------------------------------------------------------------------------- /vllm/lora/punica_wrapper/punica_selector.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.logger import init_logger 5 | from vllm.platforms import current_platform 6 | from vllm.utils import resolve_obj_by_qualname 7 | 8 | from .punica_base import PunicaWrapperBase 9 | 10 | logger = init_logger(__name__) 11 | 12 | 13 | def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase: 14 | punica_wrapper_qualname = current_platform.get_punica_wrapper() 15 | punica_wrapper_cls = resolve_obj_by_qualname(punica_wrapper_qualname) 16 | punica_wrapper = punica_wrapper_cls(*args, **kwargs) 17 | assert punica_wrapper is not None, \ 18 | "the punica_wrapper_qualname(" + punica_wrapper_qualname + ") is wrong." 19 | logger.info_once("Using %s.", punica_wrapper_qualname.rsplit(".", 1)[1]) 20 | return punica_wrapper 21 | -------------------------------------------------------------------------------- /vllm/model_executor/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.model_executor.parameter import (BasevLLMParameter, 5 | PackedvLLMParameter) 6 | from vllm.model_executor.sampling_metadata import (SamplingMetadata, 7 | SamplingMetadataCache) 8 | from vllm.model_executor.utils import set_random_seed 9 | 10 | __all__ = [ 11 | "SamplingMetadata", 12 | "SamplingMetadataCache", 13 | "set_random_seed", 14 | "BasevLLMParameter", 15 | "PackedvLLMParameter", 16 | ] 17 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/model_executor/layers/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/fused_moe/configs/README: -------------------------------------------------------------------------------- 1 | This directory contains tuned configurations for different settings of the fused_moe kernel. 2 | For different settings of 3 | - E (number of experts) 4 | - N (intermediate size) 5 | - device_name (torch.cuda.get_device_name()) 6 | the JSON file contains a mapping from M (batch size) to the chosen configuration. 7 | 8 | The example configurations provided are for the Mixtral model for TP2 on H100 9 | and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have 10 | N = 7168 and for TP4 we have N = 3584. 11 | 12 | See `benchmark/kernels/benchmark_moe.py` on how to generate these config files. 
13 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/mamba/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/model_executor/layers/mamba/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/mamba/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/model_executor/layers/mamba/ops/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/compressed_tensors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/kernels/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/model_executor/layers/quantization/kernels/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/quark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/model_executor/layers/quantization/quark/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/quark/schemes/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from .quark_scheme import QuarkScheme 5 | from .quark_w4a4_mxfp4 import QuarkW4A4MXFP4 6 | from .quark_w8a8_fp8 import QuarkW8A8Fp8 7 | from .quark_w8a8_int8 import QuarkW8A8Int8 8 | 9 | __all__ = ["QuarkScheme", "QuarkW8A8Fp8", "QuarkW8A8Int8", "QuarkW4A4MXFP4"] 10 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from .layer_utils import replace_parameter, update_tensor_inplace 5 | 6 | __all__ = ['update_tensor_inplace', 'replace_parameter'] 7 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 64, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 32, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | 
"4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 32, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 1, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 64, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 64, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 32, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 64, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 1, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 64, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 32, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 64, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 16, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- 
/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 16, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 64, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 32, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 32, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 32, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 4 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 64, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 8, 24 | "num_stages": 5 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 1, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 1, 7 | 
"num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 1, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 1, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 32, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 64, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 1, 23 | 
"num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 32, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 64, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 16, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 64, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "4": { 3 | "BLOCK_SIZE_M": 16, 4 | "BLOCK_SIZE_N": 32, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "8": { 11 | "BLOCK_SIZE_M": 16, 12 | "BLOCK_SIZE_N": 32, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 32, 15 | "num_warps": 4, 16 | "num_stages": 4 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 64, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 16, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 1, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 
| "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 16, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 64, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 16, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 64, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 64, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 16, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 32, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 32, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 64, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 
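The JSON files above are per-device tuning tables for the block-quantized fp8 GEMM kernels: each file name encodes the GEMM shape (N, K), the GPU, the dtype, and the block shape; the top-level keys are candidate M (token-count) buckets; and each entry holds Triton tile sizes, group size, warp count, and pipeline stages. As an illustration only, here is a minimal sketch of loading such a table and selecting an entry by nearest M bucket; the nearest-key heuristic and the helper names are assumptions for this sketch, not vLLM's actual lookup code.

# Illustrative sketch only: shows how a per-M tuning table like the JSON files
# above could be consulted. The nearest-key selection heuristic is an
# assumption, not necessarily vLLM's exact lookup logic.
import json


def load_block_fp8_config(path: str) -> dict[int, dict]:
    """Load a {M-bucket: kernel-params} table from one of the JSON files above."""
    with open(path) as f:
        raw = json.load(f)
    # JSON keys are strings; convert them to integer M buckets.
    return {int(m): params for m, params in raw.items()}


def pick_kernel_config(table: dict[int, dict], m: int) -> dict:
    """Pick the entry whose M bucket is closest to the actual token count m."""
    best_bucket = min(table, key=lambda bucket: abs(bucket - m))
    return table[best_bucket]


# Example (the path is one of the config files above; the token count is made up):
# table = load_block_fp8_config(
#     "N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json")
# cfg = pick_kernel_config(table, m=3500)
# cfg["BLOCK_SIZE_M"], cfg["num_warps"], cfg["num_stages"]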
| -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 16, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 32, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 64, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 32, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/models/phi3.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | # Adapted 
from llama.py 5 | """Inference-only Phi3 model; the code inherits from llama.py.""" 6 | 7 | from vllm.model_executor.models.llama import LlamaForCausalLM 8 | 9 | 10 | class Phi3ForCausalLM(LlamaForCausalLM): 11 | 12 | packed_modules_mapping = { 13 | "qkv_proj": [ 14 | "qkv_proj", 15 | ], 16 | "gate_up_proj": [ 17 | "gate_up_proj", 18 | ], 19 | } 20 | -------------------------------------------------------------------------------- /vllm/plugins/lora_resolvers/README.md: -------------------------------------------------------------------------------- 1 | # LoRA Resolver Plugins 2 | 3 | This directory contains vLLM general plugins for dynamically discovering and loading LoRA adapters 4 | via the LoRAResolver plugin framework. 5 | 6 | Note that `VLLM_ALLOW_RUNTIME_LORA_UPDATING` must be set to true to allow LoRA resolver plugins 7 | to work, and `VLLM_PLUGINS` must be set to include the desired resolver plugins. 8 | 9 | ## lora_filesystem_resolver 10 | This LoRA resolver is installed with vLLM by default. 11 | To use it, set `VLLM_PLUGIN_LORA_CACHE_DIR` to a local directory. When vLLM receives a request 12 | for a LoRA adapter `foobar` that it does not currently recognize, it looks in that local directory 13 | for a subdirectory named `foobar` containing a LoRA adapter. If such an adapter exists, it 14 | loads the adapter and services the request as normal. The adapter then remains available 15 | for future requests. 16 | -------------------------------------------------------------------------------- /vllm/plugins/lora_resolvers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/plugins/lora_resolvers/__init__.py -------------------------------------------------------------------------------- /vllm/profiler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/profiler/__init__.py -------------------------------------------------------------------------------- /vllm/prompt_adapter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/prompt_adapter/__init__.py -------------------------------------------------------------------------------- /vllm/py.typed: -------------------------------------------------------------------------------- 1 | # Marker file for PEP 561. 2 | # The vllm package uses inline types.
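The lora_filesystem_resolver described in /vllm/plugins/lora_resolvers/README.md above is configured purely through environment variables and a cache-directory layout. Below is a minimal usage sketch under stated assumptions: the adapter name `foobar`, the cache path, the server URL, and the plugin name passed to `VLLM_PLUGINS` ("lora_filesystem_resolver") are illustrative choices for this sketch, not values taken from the repository.

# Minimal sketch of exercising the lora_filesystem_resolver from a client.
# Assumed cache layout (illustrative):
#   /srv/lora-cache/
#     foobar/            <- adapter directory (adapter_config.json, weights, ...)
# Assumed environment for the vLLM server process, exported before it starts:
#   VLLM_ALLOW_RUNTIME_LORA_UPDATING=true
#   VLLM_PLUGINS=lora_filesystem_resolver
#   VLLM_PLUGIN_LORA_CACHE_DIR=/srv/lora-cache
# On the first request naming an unknown adapter, the resolver looks for
# /srv/lora-cache/foobar, loads it, and serves it like any registered LoRA.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
completion = client.completions.create(
    model="foobar",  # an adapter name the server does not yet know triggers the resolver
    prompt="Hello from a dynamically resolved LoRA adapter!",
    max_tokens=32,
)
print(completion.choices[0].text)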
3 | -------------------------------------------------------------------------------- /vllm/reasoning/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager 5 | from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser 6 | from .granite_reasoning_parser import GraniteReasoningParser 7 | from .qwen3_reasoning_parser import Qwen3ReasoningParser 8 | 9 | __all__ = [ 10 | "ReasoningParser", 11 | "ReasoningParserManager", 12 | "DeepSeekR1ReasoningParser", 13 | "GraniteReasoningParser", 14 | "Qwen3ReasoningParser", 15 | ] 16 | -------------------------------------------------------------------------------- /vllm/scripts.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.entrypoints.cli.main import main as vllm_main 5 | from vllm.logger import init_logger 6 | 7 | logger = init_logger(__name__) 8 | 9 | 10 | # Backwards compatibility for the move from vllm.scripts to 11 | # vllm.entrypoints.cli.main 12 | def main(): 13 | logger.warning("vllm.scripts.main() is deprecated. Please re-install " 14 | "vllm or use vllm.entrypoints.cli.main.main() instead.") 15 | vllm_main() 16 | -------------------------------------------------------------------------------- /vllm/spec_decode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/spec_decode/__init__.py -------------------------------------------------------------------------------- /vllm/third_party/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/third_party/__init__.py -------------------------------------------------------------------------------- /vllm/transformers_utils/chat_templates/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | from .registry import get_chat_template_fallback_path 4 | 5 | __all__ = ["get_chat_template_fallback_path"] 6 | -------------------------------------------------------------------------------- /vllm/transformers_utils/chat_templates/template_basic.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {{- message['content'] -}} 3 | {%- endfor -%} 4 | -------------------------------------------------------------------------------- /vllm/transformers_utils/chat_templates/template_blip2.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {%- if message['role'] == 'user' -%} 3 | {{- 'Question: ' + message['content'] + ' ' -}} 4 | {%- elif message['role'] == 'assistant' -%} 5 | {{- 'Answer: ' + message['content'] + ' ' -}} 6 | {%- endif -%} 7 | {%- endfor -%} 8 | 9 | {%- if add_generation_prompt -%} 10 | {{- 'Answer:' -}} 11 | {% endif %} 12 | -------------------------------------------------------------------------------- 
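The chat templates above (template_basic.jinja, template_blip2.jinja, and those that follow) are ordinary Jinja templates rendered against a `messages` list of role/content dicts plus an `add_generation_prompt` flag. As a standalone illustration, the sketch below renders the BLIP-2 template with the jinja2 package; the example messages are invented, and rendering the template directly like this bypasses vLLM's own chat-template machinery.

# Render template_blip2.jinja outside vLLM to see the prompt string it produces.
# This is an illustrative use of the jinja2 library, not vLLM's own code path.
import jinja2

BLIP2_TEMPLATE = (
    "{%- for message in messages -%}"
    "{%- if message['role'] == 'user' -%}"
    "{{- 'Question: ' + message['content'] + ' ' -}}"
    "{%- elif message['role'] == 'assistant' -%}"
    "{{- 'Answer: ' + message['content'] + ' ' -}}"
    "{%- endif -%}"
    "{%- endfor -%}"
    "{%- if add_generation_prompt -%}"
    "{{- 'Answer:' -}}"
    "{% endif %}"
)

env = jinja2.Environment()
template = env.from_string(BLIP2_TEMPLATE)
prompt = template.render(
    messages=[
        {"role": "user", "content": "What is shown in the image?"},
        {"role": "assistant", "content": "A cat on a sofa."},
        {"role": "user", "content": "What color is it?"},
    ],
    add_generation_prompt=True,
)
print(prompt)
# -> Question: What is shown in the image? Answer: A cat on a sofa. Question: What color is it? Answer: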
/vllm/transformers_utils/chat_templates/template_chatml.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {{- '<|im_start|>' + message['role'] + '\n' + message['content'] -}} 3 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 4 | {{- '<|im_end|>' + '\n' -}} 5 | {%- endif -%} 6 | {%- endfor -%} 7 | 8 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 9 | {{- '<|im_start|>assistant\n' -}} 10 | {%- endif -%} 11 | -------------------------------------------------------------------------------- /vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja: -------------------------------------------------------------------------------- 1 | {%- if messages[0]['role'] == 'system' -%} 2 | {%- set system_message = messages[0]['content'] -%} 3 | {%- set messages = messages[1:] -%} 4 | {%- else -%} 5 | {% set system_message = '' -%} 6 | {%- endif -%} 7 | 8 | {{ bos_token + system_message }} 9 | {%- for message in messages -%} 10 | {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%} 11 | {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} 12 | {%- endif -%} 13 | 14 | {%- if message['role'] == 'user' -%} 15 | {{ '<|User|>: ' + message['content'] + '\n\n' }} 16 | {%- elif message['role'] == 'assistant' -%} 17 | {{ '<|Assistant|>: ' + message['content'] + eos_token + '\n\n' }} 18 | {%- endif -%} 19 | {%- endfor -%} 20 | 21 | {%- if add_generation_prompt -%} 22 | {{ '<|Assistant|>: ' }} 23 | {%- endif -%} 24 | -------------------------------------------------------------------------------- /vllm/transformers_utils/chat_templates/template_fuyu.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {{- message['content'] + '\n' -}} 3 | {%- endfor -%} 4 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/h2ovl.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | # Adapted from 5 | # https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/configuration_h2ovl_chat.py 6 | # -------------------------------------------------------- 7 | # H2OVL-Mississippi 8 | # Copyright (c) 2024 H2O.AI 9 | # Licensed under Apache 2.0 License [see LICENSE for details] 10 | # -------------------------------------------------------- 11 | 12 | from .internvl import InternVLChatConfig 13 | 14 | 15 | class H2OVLChatConfig(InternVLChatConfig): 16 | model_type = "h2ovl_chat" 17 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/nvlm_d.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | # Adapted from 5 | # https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py 6 | # -------------------------------------------------------- 7 | # NVLM-D 8 | # Copyright (c) 2024 NVIDIA 9 | # Licensed under Apache 2.0 License [see LICENSE for details] 10 | # -------------------------------------------------------- 11 | from .internvl import InternVLChatConfig 12 | 13 | 14 | class NVLM_D_Config(InternVLChatConfig): 15 | model_type = 'NVLM_D' 16 | 
-------------------------------------------------------------------------------- /vllm/transformers_utils/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.transformers_utils.processors.deepseek_vl2 import ( 5 | DeepseekVLV2Processor) 6 | from vllm.transformers_utils.processors.ovis import OvisProcessor 7 | 8 | __all__ = ["DeepseekVLV2Processor", "OvisProcessor"] 9 | -------------------------------------------------------------------------------- /vllm/transformers_utils/tokenizers/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from .mistral import (MistralTokenizer, maybe_serialize_tool_calls, 5 | truncate_tool_call_ids, validate_request_params) 6 | 7 | __all__ = [ 8 | "MistralTokenizer", "maybe_serialize_tool_calls", "truncate_tool_call_ids", 9 | "validate_request_params" 10 | ] 11 | -------------------------------------------------------------------------------- /vllm/triton_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.triton_utils.importing import (HAS_TRITON, TritonLanguagePlaceholder, 5 | TritonPlaceholder) 6 | 7 | if HAS_TRITON: 8 | import triton 9 | import triton.language as tl 10 | else: 11 | triton = TritonPlaceholder() 12 | tl = TritonLanguagePlaceholder() 13 | 14 | __all__ = ["HAS_TRITON", "triton", "tl"] 15 | -------------------------------------------------------------------------------- /vllm/usage/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/usage/__init__.py -------------------------------------------------------------------------------- /vllm/v1/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/__init__.py -------------------------------------------------------------------------------- /vllm/v1/attention/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/attention/__init__.py -------------------------------------------------------------------------------- /vllm/v1/attention/backends/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/attention/backends/__init__.py -------------------------------------------------------------------------------- /vllm/v1/attention/backends/mla/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/attention/backends/mla/__init__.py -------------------------------------------------------------------------------- /vllm/v1/core/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/core/__init__.py -------------------------------------------------------------------------------- /vllm/v1/core/sched/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/core/sched/__init__.py -------------------------------------------------------------------------------- /vllm/v1/core/sched/utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | from vllm.v1.request import Request, RequestStatus 4 | 5 | 6 | def check_stop(request: Request, max_model_len: int) -> bool: 7 | if (request.num_tokens >= max_model_len 8 | or request.num_output_tokens >= request.max_tokens): 9 | request.status = RequestStatus.FINISHED_LENGTH_CAPPED 10 | return True 11 | 12 | sampling_params = request.sampling_params 13 | last_token_id = request.output_token_ids[-1] 14 | if (not sampling_params.ignore_eos 15 | and last_token_id == request.eos_token_id): 16 | request.status = RequestStatus.FINISHED_STOPPED 17 | return True 18 | 19 | if last_token_id in (sampling_params.stop_token_ids or ()): 20 | request.status = RequestStatus.FINISHED_STOPPED 21 | request.stop_reason = last_token_id 22 | return True 23 | return False 24 | -------------------------------------------------------------------------------- /vllm/v1/engine/exceptions.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | class EngineGenerateError(Exception): 4 | """Raised when an AsyncLLM.generate() call fails. Recoverable.""" 5 | pass 6 | 7 | 8 | class EngineDeadError(Exception): 9 | """Raised when the EngineCore dies. Unrecoverable.""" 10 | 11 | def __init__(self, *args, suppress_context: bool = False, **kwargs): 12 | ENGINE_DEAD_MESSAGE = "EngineCore encountered an issue. See stack trace (above) for the root cause." # noqa: E501 13 | 14 | super().__init__(ENGINE_DEAD_MESSAGE, *args, **kwargs) 15 | # Make the stack trace clearer when used with LLMEngine by 16 | # silencing irrelevant ZMQError.
17 | self.__suppress_context__ = suppress_context 18 | -------------------------------------------------------------------------------- /vllm/v1/executor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/executor/__init__.py -------------------------------------------------------------------------------- /vllm/v1/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/metrics/__init__.py -------------------------------------------------------------------------------- /vllm/v1/sample/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/sample/__init__.py -------------------------------------------------------------------------------- /vllm/v1/sample/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/sample/ops/__init__.py -------------------------------------------------------------------------------- /vllm/v1/sample/tpu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/sample/tpu/__init__.py -------------------------------------------------------------------------------- /vllm/v1/spec_decode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/spec_decode/__init__.py -------------------------------------------------------------------------------- /vllm/v1/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/v1/worker/__init__.py -------------------------------------------------------------------------------- /vllm/vllm_flash_attn/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/vllm_flash_attn/.gitkeep -------------------------------------------------------------------------------- /vllm/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ROCm/vllm/cdfe72b0140fbe2f569b3cbc69c95de1c4d6988a/vllm/worker/__init__.py --------------------------------------------------------------------------------