├── tests ├── __init__.py ├── lora │ └── __init__.py ├── tools │ └── __init__.py ├── tpu │ ├── __init__.py │ └── lora │ │ └── __init__.py ├── v1 │ ├── __init__.py │ ├── core │ │ └── __init__.py │ ├── e2e │ │ └── __init__.py │ ├── engine │ │ └── __init__.py │ ├── sample │ │ └── __init__.py │ ├── tpu │ │ ├── __init__.py │ │ └── worker │ │ │ └── __init__.py │ ├── tracing │ │ └── __init__.py │ ├── worker │ │ └── __init__.py │ ├── cudagraph │ │ └── __init__.py │ ├── entrypoints │ │ ├── __init__.py │ │ ├── llm │ │ │ └── __init__.py │ │ └── openai │ │ │ └── responses │ │ │ └── __init__.py │ ├── executor │ │ └── __init__.py │ ├── kv_connector │ │ ├── __init__.py │ │ └── unit │ │ │ └── __init__.py │ ├── logits_processors │ │ └── __init__.py │ ├── structured_output │ │ └── __init__.py │ ├── shutdown │ │ └── utils.py │ └── test_request.py ├── benchmarks │ ├── __init__.py │ ├── test_latency_cli.py │ └── test_throughput_cli.py ├── compile │ ├── __init__.py │ └── piecewise │ │ └── __init__.py ├── engine │ └── __init__.py ├── kernels │ ├── __init__.py │ ├── moe │ │ ├── __init__.py │ │ └── modular_kernel_tools │ │ │ └── __init__.py │ ├── attention │ │ └── conftest.py │ ├── core │ │ └── test_permute_cols.py │ └── allclose_default.py ├── models │ ├── __init__.py │ ├── language │ │ ├── __init__.py │ │ ├── pooling │ │ │ └── __init__.py │ │ ├── generation │ │ │ └── __init__.py │ │ ├── generation_ppl_test │ │ │ ├── __init__.py │ │ │ ├── test_gpt.py │ │ │ ├── test_gemma.py │ │ │ └── test_qwen.py │ │ └── pooling_mteb_test │ │ │ └── __init__.py │ ├── multimodal │ │ ├── __init__.py │ │ ├── pooling │ │ │ └── __init__.py │ │ ├── generation │ │ │ ├── __init__.py │ │ │ └── vlm_utils │ │ │ │ └── __init__.py │ │ └── processing │ │ │ └── __init__.py │ └── quantization │ │ └── __init__.py ├── multimodal │ ├── __init__.py │ └── assets │ │ ├── rgba.png │ │ ├── image1.png │ │ └── image2.png ├── reasoning │ └── __init__.py ├── samplers │ └── __init__.py ├── tool_use │ └── __init__.py ├── detokenizer │ └── __init__.py ├── distributed │ ├── __init__.py │ └── test_distributed_oot.py ├── entrypoints │ ├── __init__.py │ ├── llm │ │ ├── __init__.py │ │ └── test_prompt_validation.py │ ├── openai │ │ ├── __init__.py │ │ ├── correctness │ │ │ └── __init__.py │ │ ├── tool_parsers │ │ │ └── __init__.py │ │ └── conftest.py │ ├── pooling │ │ ├── __init__.py │ │ ├── llm │ │ │ └── __init__.py │ │ ├── openai │ │ │ └── __init__.py │ │ └── correctness │ │ │ └── __init__.py │ └── offline_mode │ │ └── __init__.py ├── model_executor │ ├── __init__.py │ └── model_loader │ │ └── __init__.py ├── quantization │ ├── __init__.py │ └── utils.py ├── tokenization │ ├── __init__.py │ ├── test_do_lower_case.py │ └── test_tokenizer.py ├── basic_correctness │ ├── __init__.py │ └── test_cpu_offload.py ├── mistral_tool_use │ └── __init__.py ├── tensorizer_loader │ └── __init__.py ├── transformers_utils │ └── __init__.py ├── fastsafetensors_loader │ ├── __init__.py │ └── test_fastsafetensors_loader.py ├── plugins │ ├── lora_resolvers │ │ └── __init__.py │ ├── prithvi_io_processor_plugin │ │ ├── prithvi_io_processor │ │ │ └── __init__.py │ │ └── setup.py │ ├── vllm_add_dummy_platform │ │ ├── vllm_add_dummy_platform │ │ │ ├── __init__.py │ │ │ ├── dummy_attention_backend.py │ │ │ └── dummy_custom_ops.py │ │ └── setup.py │ └── vllm_add_dummy_model │ │ ├── setup.py │ │ └── vllm_add_dummy_model │ │ └── my_opt.py ├── runai_model_streamer_test │ └── __init__.py ├── config │ ├── test_config.yaml │ └── test_config_with_model.yaml ├── evals │ ├── gpt_oss │ │ ├── 
__init__.py │ │ └── conftest.py │ └── gsm8k │ │ ├── __init__.py │ │ └── configs │ │ ├── Qwen3-0.6B-FP8.yaml │ │ ├── Qwen1.5-MoE-W4A16-CT.yaml │ │ ├── Llama-3.2-1B-Instruct-INT8-CT.yaml │ │ ├── Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml │ │ ├── DeepSeek-V2-Lite-Instruct-FP8.yaml │ │ ├── Llama-3-8B-Instruct-nonuniform-CT.yaml │ │ └── models-small.txt ├── kv_transfer │ ├── test_lookup_buffer.sh │ └── test_send_recv.sh ├── utils_ │ └── __init__.py ├── vllm_test_utils │ ├── setup.py │ └── vllm_test_utils │ │ └── __init__.py ├── test_embedded_commit.py ├── weight_loading │ └── models-large.txt ├── test_outputs.py ├── prompts │ └── example.txt ├── standalone_tests │ └── python_only_compile.sh └── test_seed_behavior.py ├── vllm ├── assets │ └── __init__.py ├── core │ ├── __init__.py │ └── block │ │ └── __init__.py ├── engine │ ├── __init__.py │ └── output_processor │ │ └── __init__.py ├── lora │ ├── __init__.py │ ├── ops │ │ ├── __init__.py │ │ ├── ipex_ops │ │ │ └── __init__.py │ │ ├── xla_ops │ │ │ └── __init__.py │ │ ├── triton_ops │ │ │ └── __init__.py │ │ └── torch_ops │ │ │ └── __init__.py │ ├── layers │ │ └── qkv_x_parallel_linear.py │ └── punica_wrapper │ │ ├── __init__.py │ │ └── punica_selector.py ├── ray │ ├── __init__.py │ └── lazy_utils.py ├── usage │ └── __init__.py ├── v1 │ ├── __init__.py │ ├── core │ │ ├── __init__.py │ │ └── sched │ │ │ └── __init__.py │ ├── executor │ │ └── __init__.py │ ├── metrics │ │ └── __init__.py │ ├── pool │ │ └── __init__.py │ ├── sample │ │ ├── __init__.py │ │ ├── ops │ │ │ └── __init__.py │ │ └── tpu │ │ │ └── __init__.py │ ├── worker │ │ ├── __init__.py │ │ └── ubatch_utils.py │ ├── attention │ │ ├── __init__.py │ │ └── backends │ │ │ ├── __init__.py │ │ │ └── mla │ │ │ └── __init__.py │ ├── spec_decode │ │ ├── __init__.py │ │ └── utils.py │ └── engine │ │ └── exceptions.py ├── worker │ └── __init__.py ├── benchmarks │ ├── __init__.py │ └── lib │ │ └── __init__.py ├── compilation │ └── __init__.py ├── entrypoints │ ├── __init__.py │ ├── openai │ │ └── __init__.py │ ├── cli │ │ ├── benchmark │ │ │ ├── __init__.py │ │ │ ├── serve.py │ │ │ ├── latency.py │ │ │ ├── throughput.py │ │ │ └── base.py │ │ ├── __init__.py │ │ └── types.py │ └── constants.py ├── executor │ └── __init__.py ├── profiler │ └── __init__.py ├── third_party │ └── __init__.py ├── attention │ ├── layers │ │ └── __init__.py │ ├── ops │ │ └── __init__.py │ ├── utils │ │ └── __init__.py │ ├── backends │ │ ├── __init__.py │ │ └── mla │ │ │ └── __init__.py │ └── __init__.py ├── device_allocator │ └── __init__.py ├── vllm_flash_attn │ └── .gitkeep ├── model_executor │ ├── layers │ │ ├── __init__.py │ │ ├── mamba │ │ │ ├── __init__.py │ │ │ └── ops │ │ │ │ └── __init__.py │ │ ├── quantization │ │ │ ├── quark │ │ │ │ ├── __init__.py │ │ │ │ └── schemes │ │ │ │ │ └── __init__.py │ │ │ ├── kernels │ │ │ │ └── __init__.py │ │ │ ├── compressed_tensors │ │ │ │ ├── __init__.py │ │ │ │ └── transform │ │ │ │ │ └── utils.py │ │ │ └── utils │ │ │ │ ├── configs │ │ │ │ ├── README.md │ │ │ │ ├── N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ ├── N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ ├── 
N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ ├── N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ ├── N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ ├── N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ ├── N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ ├── N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ ├── N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ ├── N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ ├── N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ ├── N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ ├── N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ ├── N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ └── N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ │ ├── __init__.py │ │ │ │ └── mxfp8_utils.py │ │ ├── shared_fused_moe │ │ │ └── __init__.py │ │ ├── fla │ │ │ ├── __init__.py │ │ │ └── ops │ │ │ │ └── __init__.py │ │ ├── fused_moe │ │ │ └── configs │ │ │ │ └── README │ │ └── attention_layer_base.py │ ├── warmup │ │ └── __init__.py │ ├── models │ │ └── phi3.py │ └── __init__.py ├── plugins │ └── lora_resolvers │ │ ├── __init__.py │ │ └── README.md ├── distributed │ ├── kv_transfer │ │ ├── kv_pipe │ │ │ └── __init__.py │ │ ├── kv_connector │ │ │ ├── __init__.py │ │ │ ├── v1 │ │ │ │ ├── p2p │ │ │ │ │ └── __init__.py │ │ │ │ └── __init__.py │ │ │ └── base.py │ │ ├── kv_lookup_buffer │ │ │ └── __init__.py │ │ ├── disagg_prefill_workflow.jpg │ │ └── __init__.py │ ├── device_communicators │ │ └── __init__.py │ ├── __init__.py │ └── eplb │ │ └── __init__.py ├── py.typed ├── transformers_utils │ ├── chat_templates │ │ ├── template_basic.jinja │ │ ├── template_fuyu.jinja │ │ ├── __init__.py │ │ ├── template_blip2.jinja │ │ ├── template_chatml.jinja │ │ └── template_deepseek_vl2.jinja │ ├── configs │ │ └── speculators │ │ │ └── __init__.py │ ├── tokenizers │ │ └── __init__.py │ ├── config_parser_base.py │ └── processors │ │ └── __init__.py ├── logging_utils │ └── __init__.py ├── tasks.py ├── scripts.py ├── triton_utils │ └── __init__.py └── env_override.py ├── docs ├── cli │ ├── .meta.yml │ ├── chat.md │ ├── complete.md │ ├── .nav.yml │ ├── serve.md │ ├── run-batch.md │ ├── bench │ │ ├── serve.md │ │ ├── latency.md │ │ └── throughput.md │ └── json_tip.inc.md ├── api │ └── vllm │ │ 
└── .meta.yml ├── community │ └── contact_us.md ├── assets │ ├── design │ │ ├── hierarchy.png │ │ ├── tpu │ │ │ └── most_model_len.png │ │ ├── metrics │ │ │ ├── intervals-1.png │ │ │ ├── intervals-2.png │ │ │ └── intervals-3.png │ │ ├── paged_attention │ │ │ ├── key.png │ │ │ ├── k_vecs.png │ │ │ ├── q_vecs.png │ │ │ ├── query.png │ │ │ ├── v_vec.png │ │ │ ├── value.png │ │ │ └── logits_vec.png │ │ ├── prefix_caching │ │ │ ├── free.png │ │ │ ├── overview.png │ │ │ ├── example-time-1.png │ │ │ ├── example-time-3.png │ │ │ ├── example-time-4.png │ │ │ ├── example-time-5.png │ │ │ ├── example-time-6.png │ │ │ └── example-time-7.png │ │ ├── hybrid_kv_cache_manager │ │ │ ├── full_attn.png │ │ │ ├── overview.png │ │ │ ├── sw_attn.png │ │ │ ├── memory_layout.png │ │ │ └── basic_grouping_example.png │ │ ├── arch_overview │ │ │ ├── llm_engine.excalidraw.png │ │ │ └── entrypoints.excalidraw.png │ │ └── fused_moe_modular_kernel │ │ │ ├── fused_moe_batched.png │ │ │ ├── fused_experts_blocks.png │ │ │ ├── fused_moe_non_batched.png │ │ │ └── prepare_and_finalize_blocks.png │ ├── deployment │ │ ├── dify-chat.png │ │ ├── open_webui.png │ │ ├── chatbox-chat.png │ │ ├── dify-settings.png │ │ ├── dp_external_lb.png │ │ ├── dp_internal_lb.png │ │ ├── streamlit-chat.png │ │ ├── chatbox-settings.png │ │ ├── dify-create-chatbot.png │ │ ├── anything-llm-provider.png │ │ ├── anything-llm-upload-doc.png │ │ ├── anything-llm-chat-with-doc.png │ │ ├── anything-llm-chat-without-doc.png │ │ └── architecture_helm_deployment.png │ ├── logos │ │ ├── vllm-logo-text-dark.png │ │ ├── vllm-logo-only-light.ico │ │ ├── vllm-logo-only-light.png │ │ └── vllm-logo-text-light.png │ ├── features │ │ └── disagg_prefill │ │ │ ├── overview.jpg │ │ │ ├── workflow.png │ │ │ ├── abstraction.jpg │ │ │ └── high_level_design.png │ └── contributing │ │ └── dockerfile-stages-dependency.png ├── getting_started │ └── installation │ │ ├── .nav.yml │ │ ├── device.template.md │ │ └── python_env_setup.inc.md ├── mkdocs │ ├── overrides │ │ ├── main.html │ │ └── partials │ │ │ └── toc-item.html │ ├── javascript │ │ ├── mathjax.js │ │ └── run_llm_widget.js │ └── hooks │ │ └── remove_announcement.py ├── deployment │ ├── frameworks │ │ ├── modal.md │ │ ├── triton.md │ │ ├── bentoml.md │ │ └── lobe-chat.md │ └── integrations │ │ ├── kserve.md │ │ ├── llmaz.md │ │ └── kubeai.md ├── configuration │ └── README.md ├── examples │ └── README.md ├── models │ └── extensions │ │ └── fastsafetensor.md ├── usage │ └── README.md └── serving │ └── integrations │ ├── llamaindex.md │ └── langchain.md ├── benchmarks ├── kernels │ └── requirements.txt ├── multi_turn │ ├── requirements.txt │ ├── bench_utils.py │ └── generate_multi_turn.json ├── benchmark_serving.py ├── benchmark_latency.py ├── benchmark_throughput.py └── structured_schemas │ └── structured_schema_1.json ├── csrc ├── moe │ └── marlin_moe_wna16 │ │ └── .gitignore ├── quantization │ ├── gptq_marlin │ │ └── .gitignore │ ├── per_token_group_quant_8bit.h │ ├── cutlass_w8a8 │ │ ├── scaled_mm_c3x_sm100.cu │ │ └── scaled_mm_c3x_sm120.cu │ └── gptq │ │ └── qdq_8.cuh ├── core │ ├── exception.hpp │ └── math.hpp ├── attention │ ├── attention_dtypes.h │ └── dtype_fp8.cuh ├── cutlass_extensions │ └── common.cpp ├── cub_helpers.h └── cpu │ └── cpu_types.hpp ├── requirements ├── lint.txt ├── kv_connectors.txt ├── dev.txt ├── build.txt ├── rocm-build.txt ├── cpu-build.txt ├── docs.txt ├── rocm.txt ├── xpu.txt ├── cuda.txt └── rocm-test.txt ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ └── config.yml ├── workflows │ ├── 
matchers │ │ ├── mypy.json │ │ ├── markdownlint.json │ │ └── actionlint.json │ ├── scripts │ │ ├── build.sh │ │ ├── create_release.js │ │ ├── cuda-install.sh │ │ └── pytorch-install.sh │ ├── add_label_automerge.yml │ └── cleanup_pr_body.yml └── scale-config.yml ├── .yapfignore ├── examples ├── online_serving │ ├── chart-helm │ │ ├── ct.yaml │ │ ├── .helmignore │ │ ├── templates │ │ │ ├── custom-objects.yaml │ │ │ ├── poddisruptionbudget.yaml │ │ │ ├── secrets.yaml │ │ │ ├── configmap.yaml │ │ │ ├── pvc.yaml │ │ │ └── service.yaml │ │ └── Chart.yaml │ ├── prometheus_grafana │ │ ├── prometheus.yaml │ │ └── docker-compose.yaml │ ├── structured_outputs │ │ └── pyproject.toml │ └── disaggregated_serving │ │ └── README.md ├── template_chatml.jinja ├── others │ └── lmcache │ │ └── disagg_prefill_lmcache_v1 │ │ └── configs │ │ ├── lmcache-decoder-config.yaml │ │ └── lmcache-prefiller-config.yaml ├── offline_inference │ ├── disaggregated-prefill-v1 │ │ ├── run.sh │ │ └── README.md │ └── openai_batch │ │ └── openai_example_batch.jsonl ├── template_teleflm.jinja ├── template_falcon.jinja ├── template_baichuan.jinja ├── template_falcon_180b.jinja ├── template_chatglm.jinja ├── template_chatglm2.jinja ├── template_vlm2vec.jinja └── template_alpaca.jinja ├── tools ├── profiler │ └── nsys_profile_tools │ │ └── images │ │ ├── csv1.png │ │ ├── html.png │ │ └── html_tbl.png ├── check_repo.sh ├── png-lint.sh └── ep_kernels │ └── configure_system_drivers.sh ├── CONTRIBUTING.md ├── .buildkite ├── lm-eval-harness │ └── configs │ │ ├── models-large.txt │ │ ├── models-small.txt │ │ ├── Qwen2.5-1.5B-Instruct.yaml │ │ ├── Qwen2-57B-A14-Instruct.yaml │ │ ├── Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml │ │ ├── Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml │ │ ├── Minitron-4B-Base-FP8.yaml │ │ ├── Meta-Llama-3-8B-Instruct.yaml │ │ ├── DeepSeek-V2-Lite-Chat.yaml │ │ ├── Meta-Llama-3-70B-Instruct.yaml │ │ ├── Mixtral-8x7B-Instruct-v0.1-FP8.yaml │ │ ├── Mixtral-8x7B-Instruct-v0.1.yaml │ │ ├── Qwen2-1.5B-Instruct-FP8W8.yaml │ │ ├── Meta-Llama-3-8B-Instruct-FP8.yaml │ │ ├── Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml │ │ ├── Qwen1.5-MoE-W4A16-compressed-tensors.yaml │ │ ├── Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml │ │ ├── Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml │ │ ├── Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml │ │ ├── Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml │ │ ├── Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml │ │ ├── SparseLlama3.1_2of4_fp8_compressed.yaml │ │ ├── Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml │ │ └── Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml ├── nightly-benchmarks │ ├── scripts │ │ ├── get-lmdeploy-modelname.py │ │ ├── wait-for-image.sh │ │ └── download-tokenizer.py │ └── tests │ │ └── genai-perf-tests.json └── scripts │ ├── tpu │ ├── config_v6e_1.env │ └── quantized_v6e_1.env │ ├── rerun-test.sh │ ├── ci-clean-log.sh │ └── hardware_ci │ └── run-cpu-test-s390x.sh ├── .markdownlint.yaml ├── MANIFEST.in ├── format.sh ├── .gemini └── config.yaml ├── .dockerignore ├── .readthedocs.yaml ├── .shellcheckrc ├── .coveragerc ├── use_existing_torch.py └── .clang-format /tests/__init__.py: -------------------------------------------------------------------------------- 1 | 
--------------------------------------------------------------------------------

Like /tests/__init__.py above, the files below are empty placeholder modules and markers (each contains only a blank line):

/tests/lora/__init__.py
/tests/tools/__init__.py
/tests/tpu/__init__.py
/tests/v1/__init__.py
/vllm/assets/__init__.py
/vllm/core/__init__.py
/vllm/engine/__init__.py
/vllm/lora/__init__.py
/vllm/ray/__init__.py
/vllm/usage/__init__.py
/vllm/v1/__init__.py
/vllm/worker/__init__.py
/tests/benchmarks/__init__.py
/tests/compile/__init__.py
/tests/engine/__init__.py
/tests/kernels/__init__.py
/tests/models/__init__.py
/tests/multimodal/__init__.py
/tests/reasoning/__init__.py
/tests/samplers/__init__.py
/tests/tool_use/__init__.py
/tests/tpu/lora/__init__.py
/tests/v1/core/__init__.py
/tests/v1/e2e/__init__.py
/tests/v1/engine/__init__.py
/tests/v1/sample/__init__.py
/tests/v1/tpu/__init__.py
/tests/v1/tracing/__init__.py
/tests/v1/worker/__init__.py
/vllm/benchmarks/__init__.py
/vllm/compilation/__init__.py
/vllm/core/block/__init__.py
/vllm/entrypoints/__init__.py
/vllm/executor/__init__.py
/vllm/lora/ops/__init__.py
/vllm/profiler/__init__.py
/vllm/third_party/__init__.py
/vllm/v1/core/__init__.py
/vllm/v1/executor/__init__.py
/vllm/v1/metrics/__init__.py
/vllm/v1/pool/__init__.py
/vllm/v1/sample/__init__.py
/vllm/v1/worker/__init__.py
/tests/detokenizer/__init__.py
/tests/distributed/__init__.py
/tests/entrypoints/__init__.py
/tests/entrypoints/llm/__init__.py
/tests/kernels/moe/__init__.py
/tests/model_executor/__init__.py
/tests/models/language/__init__.py
/tests/quantization/__init__.py
/tests/tokenization/__init__.py
/tests/v1/cudagraph/__init__.py
/tests/v1/entrypoints/__init__.py
/tests/v1/executor/__init__.py
/tests/v1/kv_connector/__init__.py
/tests/v1/tpu/worker/__init__.py
/vllm/attention/layers/__init__.py
/vllm/attention/ops/__init__.py
/vllm/attention/utils/__init__.py
/vllm/device_allocator/__init__.py
/vllm/v1/attention/__init__.py
/vllm/v1/core/sched/__init__.py
/vllm/v1/sample/ops/__init__.py
/vllm/v1/sample/tpu/__init__.py
/vllm/v1/spec_decode/__init__.py
/vllm/vllm_flash_attn/.gitkeep
/tests/basic_correctness/__init__.py
/tests/compile/piecewise/__init__.py
/tests/entrypoints/openai/__init__.py
/tests/entrypoints/pooling/__init__.py
/tests/mistral_tool_use/__init__.py
/tests/models/multimodal/__init__.py
/tests/models/quantization/__init__.py
/tests/tensorizer_loader/__init__.py
/tests/transformers_utils/__init__.py
/tests/v1/entrypoints/llm/__init__.py
/tests/v1/kv_connector/unit/__init__.py
/tests/v1/logits_processors/__init__.py
/tests/v1/structured_output/__init__.py
/vllm/attention/backends/__init__.py
/vllm/entrypoints/openai/__init__.py
/vllm/model_executor/layers/__init__.py
/vllm/model_executor/warmup/__init__.py
/vllm/v1/attention/backends/__init__.py
/tests/entrypoints/offline_mode/__init__.py
/tests/entrypoints/pooling/llm/__init__.py
/tests/fastsafetensors_loader/__init__.py
/tests/models/language/pooling/__init__.py
/tests/models/multimodal/pooling/__init__.py
/tests/plugins/lora_resolvers/__init__.py
/tests/runai_model_streamer_test/__init__.py
/vllm/attention/backends/mla/__init__.py
/vllm/engine/output_processor/__init__.py
/vllm/entrypoints/cli/benchmark/__init__.py
/vllm/plugins/lora_resolvers/__init__.py
/vllm/v1/attention/backends/mla/__init__.py
/tests/entrypoints/openai/correctness/__init__.py
/tests/entrypoints/pooling/openai/__init__.py
/tests/model_executor/model_loader/__init__.py
/tests/models/language/generation/__init__.py
/tests/models/multimodal/generation/__init__.py
/tests/models/multimodal/processing/__init__.py
/vllm/distributed/kv_transfer/kv_pipe/__init__.py
/vllm/model_executor/layers/mamba/__init__.py
/vllm/model_executor/layers/mamba/ops/__init__.py
/tests/entrypoints/openai/tool_parsers/__init__.py
/tests/entrypoints/pooling/correctness/__init__.py
/tests/kernels/moe/modular_kernel_tools/__init__.py
/tests/models/language/generation_ppl_test/__init__.py
/tests/models/language/pooling_mteb_test/__init__.py
/tests/v1/entrypoints/openai/responses/__init__.py
/vllm/distributed/device_communicators/__init__.py
/vllm/distributed/kv_transfer/kv_connector/__init__.py
/tests/models/multimodal/generation/vlm_utils/__init__.py
/vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py
/vllm/model_executor/layers/quantization/quark/__init__.py

The remaining files in this range do have content:

/docs/cli/.meta.yml:
toc_depth: 3

/benchmarks/kernels/requirements.txt:
pandas

/csrc/moe/marlin_moe_wna16/.gitignore:
kernel_*.cu

/csrc/quantization/gptq_marlin/.gitignore:
kernel_*.cu

/docs/api/vllm/.meta.yml:
search:
  boost: 0.5
/requirements/lint.txt:
# formatting
pre-commit==4.0.1

/.github/FUNDING.yml:
github: [vllm-project]
open_collective: vllm

/.yapfignore:
collect_env.py
vllm/model_executor/layers/fla/ops/*.py

/docs/community/contact_us.md:
# Contact Us

--8<-- "README.md:contact-us"

/vllm/py.typed:
# Marker file for PEP 561.
# The vllm package uses inline types.

/csrc/core/exception.hpp:
#pragma once

#define VLLM_IMPLIES(p, q) (!(p) || (q))

/docs/cli/chat.md:
# vllm chat

## Options

--8<-- "docs/argparse/chat.md"

/requirements/kv_connectors.txt:
lmcache
nixl >= 0.5.1 # Required for disaggregated prefill

/examples/online_serving/chart-helm/ct.yaml:
chart-dirs:
  - charts
validate-maintainers: false

/docs/cli/complete.md:
# vllm complete

## Options

--8<-- "docs/argparse/complete.md"

/docs/getting_started/installation/.nav.yml:
nav:
  - README.md
  - gpu.md
  - cpu.md
  - google_tpu.md

Empty placeholder modules in this range:

/vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py
/vllm/model_executor/layers/quantization/kernels/__init__.py
/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py

Binary assets in this range; their content is not embedded, and each entry instead points to https://raw.githubusercontent.com/fyabc/vllm/HEAD/<path>:

/docs/assets/design/hierarchy.png
/tests/multimodal/assets/rgba.png
/docs/assets/deployment/dify-chat.png
/tests/multimodal/assets/image1.png
/tests/multimodal/assets/image2.png
/docs/assets/deployment/open_webui.png
/docs/assets/deployment/chatbox-chat.png
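The VLLM_IMPLIES(p, q) macro in csrc/core/exception.hpp above encodes material implication as !(p) || (q): the condition only fails when p holds but q does not. A minimal Python sketch of the same identity, for illustration only (not part of the repository):

def implies(p: bool, q: bool) -> bool:
    # Same shape as the C macro: "p implies q" is "not p, or q".
    return (not p) or q

# The only failing combination is p true with q false.
assert implies(True, True)
assert not implies(True, False)
assert implies(False, True)
assert implies(False, False)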
Binary assets in this range (same raw.githubusercontent.com mirror pattern as above):

/docs/assets/deployment/dify-settings.png
/docs/assets/deployment/dp_external_lb.png
/docs/assets/deployment/dp_internal_lb.png
/docs/assets/deployment/streamlit-chat.png
/docs/assets/design/tpu/most_model_len.png
/docs/assets/logos/vllm-logo-text-dark.png
/docs/assets/deployment/chatbox-settings.png
/docs/assets/design/metrics/intervals-1.png
/docs/assets/design/metrics/intervals-2.png
/docs/assets/design/metrics/intervals-3.png
/docs/assets/design/paged_attention/key.png
/docs/assets/design/prefix_caching/free.png
/docs/assets/logos/vllm-logo-only-light.ico
/docs/assets/logos/vllm-logo-only-light.png
/docs/assets/logos/vllm-logo-text-light.png
/docs/assets/deployment/dify-create-chatbot.png
/docs/assets/design/paged_attention/k_vecs.png
/docs/assets/design/paged_attention/q_vecs.png
/docs/assets/design/paged_attention/query.png
/docs/assets/design/paged_attention/v_vec.png

/benchmarks/multi_turn/requirements.txt:
numpy>=1.24
pandas>=2.0.0
aiohttp>=3.10
transformers>=4.46
xlsxwriter>=3.2.1

/examples/online_serving/chart-helm/.helmignore:
*.png
.git/
ct.yaml
lintconf.yaml
values.schema.json
/workflows

/tests/config/test_config.yaml:
port: 12312
served_model_name: mymodel
tensor_parallel_size: 2
trust_remote_code: true
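tests/config/test_config.yaml above is a flat key/value file whose entries mirror CLI options. As a rough sketch, such a file could be expanded into command-line flags as below; this assumes PyYAML is installed and that every key maps one-to-one to a --kebab-case flag, which is an illustrative simplification rather than vLLM's actual config loader:

import yaml  # PyYAML, assumed to be installed

def config_to_cli_args(path: str) -> list[str]:
    # Load the flat YAML mapping and turn each entry into a CLI flag.
    with open(path) as f:
        cfg = yaml.safe_load(f)
    args: list[str] = []
    for key, value in cfg.items():
        flag = "--" + key.replace("_", "-")
        if isinstance(value, bool):
            # Booleans become bare switches when true and are dropped when false.
            if value:
                args.append(flag)
        else:
            args.extend([flag, str(value)])
    return args

print(config_to_cli_args("tests/config/test_config.yaml"))
# ['--port', '12312', '--served-model-name', 'mymodel',
#  '--tensor-parallel-size', '2', '--trust-remote-code']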
Binary assets in this range (same raw.githubusercontent.com mirror pattern as above):

/docs/assets/design/paged_attention/value.png
/docs/assets/design/prefix_caching/overview.png
/docs/assets/deployment/anything-llm-provider.png
/docs/assets/features/disagg_prefill/overview.jpg
/docs/assets/features/disagg_prefill/workflow.png
/docs/assets/deployment/anything-llm-upload-doc.png
/docs/assets/design/paged_attention/logits_vec.png
/docs/assets/features/disagg_prefill/abstraction.jpg
/tools/profiler/nsys_profile_tools/images/csv1.png
/tools/profiler/nsys_profile_tools/images/html.png
/docs/assets/deployment/anything-llm-chat-with-doc.png
/docs/assets/design/prefix_caching/example-time-1.png
/docs/assets/design/prefix_caching/example-time-3.png
/docs/assets/design/prefix_caching/example-time-4.png
/docs/assets/design/prefix_caching/example-time-5.png
/docs/assets/design/prefix_caching/example-time-6.png
/docs/assets/design/prefix_caching/example-time-7.png
/tools/profiler/nsys_profile_tools/images/html_tbl.png
/docs/assets/deployment/anything-llm-chat-without-doc.png
/docs/assets/deployment/architecture_helm_deployment.png
/docs/assets/design/hybrid_kv_cache_manager/full_attn.png
/docs/assets/design/hybrid_kv_cache_manager/overview.png
/docs/assets/design/hybrid_kv_cache_manager/sw_attn.png
/vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg
/docs/assets/contributing/dockerfile-stages-dependency.png
/docs/assets/design/arch_overview/llm_engine.excalidraw.png
/docs/assets/features/disagg_prefill/high_level_design.png

/tests/evals/gpt_oss/__init__.py and /tests/evals/gsm8k/__init__.py (license header only):
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

/vllm/transformers_utils/chat_templates/template_basic.jinja:
{%- for message in messages -%}
{{- message['content'] -}}
{%- endfor -%}

/vllm/transformers_utils/chat_templates/template_fuyu.jinja:
{%- for message in messages -%}
{{- message['content'] + '\n' -}}
{%- endfor -%}

/docs/cli/.nav.yml:
nav:
  - README.md
  - serve.md
  - chat.md
  - complete.md
  - run-batch.md
  - vllm bench:
      - bench/*.md

/CONTRIBUTING.md:
# Contributing to vLLM

You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing).
4 | -------------------------------------------------------------------------------- /docs/assets/design/arch_overview/entrypoints.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/arch_overview/entrypoints.excalidraw.png -------------------------------------------------------------------------------- /docs/assets/design/hybrid_kv_cache_manager/memory_layout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/hybrid_kv_cache_manager/memory_layout.png -------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- 1 | -r lint.txt 2 | -r test.txt 3 | 4 | # Avoid adding requirements directly to this file. 5 | # Instead, modify the two files referenced above. 6 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/speculators/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | -------------------------------------------------------------------------------- /docs/assets/design/fused_moe_modular_kernel/fused_moe_batched.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/fused_moe_modular_kernel/fused_moe_batched.png -------------------------------------------------------------------------------- /tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml: -------------------------------------------------------------------------------- 1 | model_name: "Qwen/Qwen3-0.6B-FP8" 2 | accuracy_threshold: 0.375 3 | num_questions: 1319 4 | num_fewshot: 5 5 | max_model_len: 4096 -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/README.md: -------------------------------------------------------------------------------- 1 | # Quantization Kernel Config 2 | 3 | Use scripts under `benchmarks/kernels/` to generate these config files. 
4 | -------------------------------------------------------------------------------- /docs/assets/design/fused_moe_modular_kernel/fused_experts_blocks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/fused_moe_modular_kernel/fused_experts_blocks.png -------------------------------------------------------------------------------- /docs/cli/serve.md: -------------------------------------------------------------------------------- 1 | # vllm serve 2 | 3 | ## JSON CLI Arguments 4 | 5 | --8<-- "docs/cli/json_tip.inc.md" 6 | 7 | ## Options 8 | 9 | --8<-- "docs/argparse/serve.md" 10 | -------------------------------------------------------------------------------- /docs/assets/design/fused_moe_modular_kernel/fused_moe_non_batched.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/fused_moe_modular_kernel/fused_moe_non_batched.png -------------------------------------------------------------------------------- /docs/assets/design/hybrid_kv_cache_manager/basic_grouping_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/hybrid_kv_cache_manager/basic_grouping_example.png -------------------------------------------------------------------------------- /vllm/benchmarks/lib/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | """Benchmark library utilities.""" 4 | -------------------------------------------------------------------------------- /docs/cli/run-batch.md: -------------------------------------------------------------------------------- 1 | # vllm run-batch 2 | 3 | ## JSON CLI Arguments 4 | 5 | --8<-- "docs/cli/json_tip.inc.md" 6 | 7 | ## Options 8 | 9 | --8<-- "docs/argparse/run-batch.md" 10 | -------------------------------------------------------------------------------- /docs/assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png -------------------------------------------------------------------------------- /docs/cli/bench/serve.md: -------------------------------------------------------------------------------- 1 | # vllm bench serve 2 | 3 | ## JSON CLI Arguments 4 | 5 | --8<-- "docs/cli/json_tip.inc.md" 6 | 7 | ## Options 8 | 9 | --8<-- "docs/argparse/bench_serve.md" 10 | -------------------------------------------------------------------------------- /docs/cli/bench/latency.md: -------------------------------------------------------------------------------- 1 | # vllm bench latency 2 | 3 | ## JSON CLI Arguments 4 | 5 | --8<-- "docs/cli/json_tip.inc.md" 6 | 7 | ## Options 8 | 9 | --8<-- "docs/argparse/bench_latency.md" 10 | -------------------------------------------------------------------------------- /examples/online_serving/chart-helm/templates/custom-objects.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.customObjects }} 2 | {{- range .Values.customObjects }} 3 | {{- tpl (. 
| toYaml) $ }} 4 | --- 5 | {{- end }} 6 | {{- end }} -------------------------------------------------------------------------------- /tests/kv_transfer/test_lookup_buffer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | RANK=0 python3 test_lookup_buffer.py & 3 | PID0=$! 4 | RANK=1 python3 test_lookup_buffer.py & 5 | PID1=$! 6 | 7 | wait $PID0 8 | wait $PID1 9 | -------------------------------------------------------------------------------- /tests/kv_transfer/test_send_recv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RANK=0 python3 test_send_recv.py & 4 | PID0=$! 5 | RANK=1 python3 test_send_recv.py & 6 | PID1=$! 7 | 8 | wait $PID0 9 | wait $PID1 10 | -------------------------------------------------------------------------------- /docs/cli/bench/throughput.md: -------------------------------------------------------------------------------- 1 | # vllm bench throughput 2 | 3 | ## JSON CLI Arguments 4 | 5 | --8<-- "docs/cli/json_tip.inc.md" 6 | 7 | ## Options 8 | 9 | --8<-- "docs/argparse/bench_throughput.md" 10 | -------------------------------------------------------------------------------- /tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml: -------------------------------------------------------------------------------- 1 | model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16" 2 | accuracy_threshold: 0.45 3 | num_questions: 1319 4 | num_fewshot: 5 5 | max_model_len: 4096 -------------------------------------------------------------------------------- /tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml: -------------------------------------------------------------------------------- 1 | model_name: "RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8" 2 | accuracy_threshold: 0.31 3 | num_questions: 1319 4 | num_fewshot: 5 5 | max_model_len: 4096 -------------------------------------------------------------------------------- /tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml: -------------------------------------------------------------------------------- 1 | model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic" 2 | accuracy_threshold: 0.60 3 | num_questions: 1319 4 | num_fewshot: 5 5 | max_model_len: 4096 -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Questions 4 | url: https://discuss.vllm.ai 5 | about: Ask questions and discuss with other vLLM community members 6 | -------------------------------------------------------------------------------- /csrc/attention/attention_dtypes.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.cuh" 4 | #include "dtype_float16.cuh" 5 | #include "dtype_float32.cuh" 6 | #include "dtype_bfloat16.cuh" 7 | #include "dtype_fp8.cuh" 8 | -------------------------------------------------------------------------------- /tests/config/test_config_with_model.yaml: -------------------------------------------------------------------------------- 1 | # Same as test_config.yaml but with model specified 2 | model: config-model 3 | port: 12312 4 | served_model_name: mymodel 5 | tensor_parallel_size: 2 6 | trust_remote_code: true 7 | -------------------------------------------------------------------------------- 
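Note: the two config files above (`tests/config/test_config.yaml` and `tests/config/test_config_with_model.yaml`) exercise vLLM's file-based CLI configuration. A minimal usage sketch follows, assuming the `--config` flag of `vllm serve`; the file path and model name are illustrative and not taken from the tests:

```bash
# Sketch only: feed a YAML config shaped like the test files above to `vllm serve`.
# The config path and model below are hypothetical, not part of the repository.
cat > my_config.yaml <<'EOF'
port: 12312
served_model_name: mymodel
tensor_parallel_size: 2
trust_remote_code: true
EOF

# The model may come from the config itself (as in test_config_with_model.yaml)
# or be passed on the command line, as here; explicit CLI flags are expected to
# take precedence over values read from the file.
vllm serve facebook/opt-125m --config my_config.yaml
```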
/tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml: -------------------------------------------------------------------------------- 1 | model_name: "RedHatAI/DeepSeek-Coder-V2-Lite-Instruct-FP8" 2 | accuracy_threshold: 0.72 3 | num_questions: 1319 4 | num_fewshot: 5 5 | max_model_len: 4096 6 | 7 | -------------------------------------------------------------------------------- /tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml: -------------------------------------------------------------------------------- 1 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test" 2 | accuracy_threshold: 0.74 3 | num_questions: 1319 4 | num_fewshot: 5 5 | max_model_len: 4096 -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/models-large.txt: -------------------------------------------------------------------------------- 1 | Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml 2 | Meta-Llama-3-70B-Instruct.yaml 3 | Mixtral-8x7B-Instruct-v0.1.yaml 4 | Qwen2-57B-A14-Instruct.yaml 5 | DeepSeek-V2-Lite-Chat.yaml 6 | -------------------------------------------------------------------------------- /requirements/build.txt: -------------------------------------------------------------------------------- 1 | # Should be mirrored in pyproject.toml 2 | cmake>=3.26.1 3 | ninja 4 | packaging>=24.2 5 | setuptools>=77.0.3,<80.0.0 6 | setuptools-scm>=8 7 | torch==2.8.0 8 | wheel 9 | jinja2>=3.1.6 10 | regex 11 | build 12 | -------------------------------------------------------------------------------- /vllm/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from .communication_op import * 5 | from .parallel_state import * 6 | from .utils import * 7 | -------------------------------------------------------------------------------- /.markdownlint.yaml: -------------------------------------------------------------------------------- 1 | MD007: 2 | indent: 4 3 | MD013: false 4 | MD024: 5 | siblings_only: true 6 | MD033: false 7 | MD042: false 8 | MD045: false 9 | MD046: false 10 | MD051: false 11 | MD052: false 12 | MD053: false 13 | MD059: false 14 | -------------------------------------------------------------------------------- /tests/utils_/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | """ 4 | This module is named `utils_` instead of `utils` to avoid obscuring 5 | `tests/utils.py`. 
6 | """ 7 | -------------------------------------------------------------------------------- /tests/v1/shutdown/utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | """Shutdown test utils""" 4 | 5 | SHUTDOWN_TEST_TIMEOUT_SEC = 120 6 | SHUTDOWN_TEST_THRESHOLD_BYTES = 2 * 2**30 7 | -------------------------------------------------------------------------------- /tests/evals/gsm8k/configs/models-small.txt: -------------------------------------------------------------------------------- 1 | Qwen3-0.6B-FP8.yaml 2 | Llama-3.2-1B-Instruct-INT8-CT.yaml 3 | Llama-3-8B-Instruct-nonuniform-CT.yaml 4 | Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml 5 | Qwen1.5-MoE-W4A16-CT.yaml 6 | DeepSeek-V2-Lite-Instruct-FP8.yaml 7 | -------------------------------------------------------------------------------- /vllm/logging_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.logging_utils.formatter import NewLineFormatter 5 | 6 | __all__ = [ 7 | "NewLineFormatter", 8 | ] 9 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include requirements/common.txt 3 | include requirements/cuda.txt 4 | include requirements/rocm.txt 5 | include requirements/cpu.txt 6 | include CMakeLists.txt 7 | 8 | recursive-include cmake * 9 | recursive-include csrc * 10 | -------------------------------------------------------------------------------- /docs/mkdocs/overrides/main.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block announce %} 4 |
You are viewing the latest developer preview docs. Click here to view docs for the latest stable release.
5 | {% endblock %} 6 | -------------------------------------------------------------------------------- /vllm/distributed/eplb/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | ''' 4 | Expert parallelism load balancer (EPLB). 5 | ''' 6 | 7 | from .eplb_state import * 8 | from .rebalance_algo import * 9 | -------------------------------------------------------------------------------- /vllm/transformers_utils/chat_templates/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | from .registry import get_chat_template_fallback_path 4 | 5 | __all__ = ["get_chat_template_fallback_path"] 6 | -------------------------------------------------------------------------------- /examples/online_serving/prometheus_grafana/prometheus.yaml: -------------------------------------------------------------------------------- 1 | # prometheus.yaml 2 | global: 3 | scrape_interval: 5s 4 | evaluation_interval: 30s 5 | 6 | scrape_configs: 7 | - job_name: vllm 8 | static_configs: 9 | - targets: 10 | - 'host.docker.internal:8000' 11 | -------------------------------------------------------------------------------- /examples/online_serving/chart-helm/templates/poddisruptionbudget.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: policy/v1 2 | kind: PodDisruptionBudget 3 | metadata: 4 | name: "{{ .Release.Name }}-pdb" 5 | namespace: {{ .Release.Namespace }} 6 | spec: 7 | maxUnavailable: {{ default 1 .Values.maxUnavailablePodDisruptionBudget }} -------------------------------------------------------------------------------- /vllm/lora/layers/qkv_x_parallel_linear.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | from .base import BaseLayerWithLoRA 4 | 5 | 6 | #TODO: Implement this 7 | class QKVCrossParallelLinearWithLoRA(BaseLayerWithLoRA): 8 | pass 9 | -------------------------------------------------------------------------------- /format.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "vLLM linting system has been moved from format.sh to pre-commit hooks." 4 | echo "Please run 'pip install -r requirements/lint.txt', followed by" 5 | echo "'pre-commit install' to install the pre-commit hooks." 6 | echo "Then linters will run automatically before each commit." 
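As a convenience, the setup that `format.sh` now only describes can be run directly; this sketch restates the printed instructions, with `pre-commit run --all-files` added as the standard way to lint the whole tree once (that last command is not part of the script itself):

```bash
# One-time setup, as printed by format.sh above.
pip install -r requirements/lint.txt
pre-commit install

# Optional: run every hook across the repository once, instead of waiting for a commit.
pre-commit run --all-files
```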
-------------------------------------------------------------------------------- /tests/vllm_test_utils/setup.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from setuptools import setup 5 | 6 | setup( 7 | name='vllm_test_utils', 8 | version='0.1', 9 | packages=['vllm_test_utils'], 10 | ) 11 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/shared_fused_moe/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | from vllm.model_executor.layers.shared_fused_moe.shared_fused_moe import ( 4 | SharedFusedMoE) 5 | 6 | __all__ = ["SharedFusedMoE"] 7 | -------------------------------------------------------------------------------- /examples/online_serving/structured_outputs/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "examples-online-structured-outputs" 3 | requires-python = ">=3.9, <3.13" 4 | dependencies = ["openai==1.78.1", "pydantic==2.11.4"] 5 | version = "0.0.0" 6 | 7 | [project.scripts] 8 | structured-outputs = "structured_outputs:main" 9 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from .layer_utils import replace_parameter, update_tensor_inplace 5 | 6 | __all__ = ['update_tensor_inplace', 'replace_parameter'] 7 | -------------------------------------------------------------------------------- /tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | 5 | def register_prithvi(): 6 | return "prithvi_io_processor.prithvi_processor.PrithviMultimodalDataProcessor" # noqa: E501 7 | -------------------------------------------------------------------------------- /docs/deployment/frameworks/modal.md: -------------------------------------------------------------------------------- 1 | # Modal 2 | 3 | vLLM can be run on cloud GPUs with [Modal](https://modal.com), a serverless computing platform designed for fast auto-scaling. 4 | 5 | For details on how to deploy vLLM on Modal, see [this tutorial in the Modal documentation](https://modal.com/docs/examples/vllm_inference). 
6 | -------------------------------------------------------------------------------- /examples/online_serving/chart-helm/templates/secrets.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Secret 3 | metadata: 4 | name: "{{ .Release.Name }}-secrets" 5 | namespace: {{ .Release.Namespace }} 6 | type: Opaque 7 | data: 8 | {{- range $key, $val := .Values.secrets }} 9 | {{ $key }}: {{ $val | b64enc | quote }} 10 | {{- end }} -------------------------------------------------------------------------------- /.gemini/config.yaml: -------------------------------------------------------------------------------- 1 | # https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github 2 | have_fun: false # Just review the code 3 | code_review: 4 | comment_severity_threshold: HIGH # Reduce quantity of comments 5 | pull_request_opened: 6 | summary: false # Don't summarize the PR in a separate comment 7 | -------------------------------------------------------------------------------- /docs/getting_started/installation/device.template.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ## Requirements 4 | 5 | ## Set up using Python 6 | 7 | ### Pre-built wheels 8 | 9 | ### Build wheel from source 10 | 11 | ## Set up using Docker 12 | 13 | ### Pre-built images 14 | 15 | ### Build image from source 16 | 17 | ## Extra information 18 | -------------------------------------------------------------------------------- /docs/deployment/integrations/kserve.md: -------------------------------------------------------------------------------- 1 | # KServe 2 | 3 | vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving. 4 | 5 | Please see [this guide](https://kserve.github.io/website/latest/modelserving/v1beta1/llm/huggingface/) for more details on using vLLM with KServe. 6 | -------------------------------------------------------------------------------- /examples/online_serving/chart-helm/templates/configmap.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.configs -}} 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: "{{ .Release.Name }}-configs" 6 | namespace: {{ .Release.Namespace }} 7 | data: 8 | {{- with .Values.configs }} 9 | {{- toYaml . 
| nindent 2 }} 10 | {{- end }} 11 | {{- end -}} -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/kv_connector/v1/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | from vllm.distributed.kv_transfer.kv_connector.v1.base import ( 4 | KVConnectorBase_V1, KVConnectorRole) 5 | 6 | __all__ = ["KVConnectorRole", "KVConnectorBase_V1"] 7 | -------------------------------------------------------------------------------- /examples/template_chatml.jinja: -------------------------------------------------------------------------------- 1 | {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} 2 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/models-small.txt: -------------------------------------------------------------------------------- 1 | Qwen2.5-1.5B-Instruct.yaml 2 | Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml 3 | Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml 4 | Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml 5 | Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml 6 | Qwen1.5-MoE-W4A16-compressed-tensors.yaml 7 | -------------------------------------------------------------------------------- /examples/online_serving/disaggregated_serving/README.md: -------------------------------------------------------------------------------- 1 | # Disaggregated Serving 2 | 3 | This example contains scripts that demonstrate the disaggregated serving features of vLLM. 4 | 5 | ## Files 6 | 7 | - `disagg_proxy_demo.py` - Demonstrates XpYd (X prefill instances, Y decode instances). 8 | - `kv_events.sh` - Demonstrates KV cache event publishing. 
9 | -------------------------------------------------------------------------------- /vllm/lora/ops/ipex_ops/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.lora.ops.ipex_ops.lora_ops import (bgmv_expand, bgmv_expand_slice, 5 | bgmv_shrink) 6 | 7 | __all__ = ["bgmv_expand", "bgmv_expand_slice", "bgmv_shrink"] 8 | -------------------------------------------------------------------------------- /vllm/lora/ops/xla_ops/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.lora.ops.xla_ops.lora_ops import (bgmv_expand, bgmv_expand_slice, 5 | bgmv_shrink) 6 | 7 | __all__ = ["bgmv_expand", "bgmv_expand_slice", "bgmv_shrink"] 8 | -------------------------------------------------------------------------------- /.github/workflows/matchers/mypy.json: -------------------------------------------------------------------------------- 1 | { 2 | "problemMatcher": [ 3 | { 4 | "owner": "mypy", 5 | "pattern": [ 6 | { 7 | "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$", 8 | "file": 1, 9 | "line": 2, 10 | "severity": 3, 11 | "message": 4 12 | } 13 | ] 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /tests/basic_correctness/test_cpu_offload.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from ..utils import compare_two_settings 5 | 6 | 7 | def test_cpu_offload(): 8 | compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [], 9 | ["--cpu-offload-gb", "1"]) 10 | -------------------------------------------------------------------------------- /.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from lmdeploy.serve.openai.api_client import APIClient 5 | 6 | api_client = APIClient("http://localhost:8000") 7 | model_name = api_client.available_models[0] 8 | 9 | print(model_name) 10 | -------------------------------------------------------------------------------- /docs/deployment/integrations/llmaz.md: -------------------------------------------------------------------------------- 1 | # llmaz 2 | 3 | [llmaz](https://github.com/InftyAI/llmaz) is an easy-to-use and advanced inference platform for large language models on Kubernetes, aimed for production use. It uses vLLM as the default model serving backend. 4 | 5 | Please refer to the [Quick Start](https://github.com/InftyAI/llmaz?tab=readme-ov-file#quick-start) for more details. 
6 | -------------------------------------------------------------------------------- /.buildkite/scripts/tpu/config_v6e_1.env: -------------------------------------------------------------------------------- 1 | # Environment config 2 | TEST_NAME=llama8b 3 | CONTAINER_NAME=tpu-test 4 | 5 | # vllm config 6 | MODEL=meta-llama/Llama-3.1-8B-Instruct 7 | MAX_NUM_SEQS=256 8 | MAX_NUM_BATCHED_TOKENS=1024 9 | TENSOR_PARALLEL_SIZE=1 10 | MAX_MODEL_LEN=2048 11 | DOWNLOAD_DIR=/mnt/disks/persist 12 | EXPECTED_THROUGHPUT=8.0 13 | INPUT_LEN=1800 14 | OUTPUT_LEN=128 15 | -------------------------------------------------------------------------------- /requirements/rocm-build.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r common.txt 3 | 4 | --extra-index-url https://download.pytorch.org/whl/rocm6.3 5 | torch==2.8.0 6 | torchvision==0.23.0 7 | torchaudio==2.8.0 8 | 9 | triton==3.3.0 10 | cmake>=3.26.1,<4 11 | packaging>=24.2 12 | setuptools>=77.0.3,<80.0.0 13 | setuptools-scm>=8 14 | wheel 15 | jinja2>=3.1.6 16 | amdsmi==6.2.4 17 | timm>=1.0.17 18 | -------------------------------------------------------------------------------- /tests/distributed/test_distributed_oot.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from ..entrypoints.openai.test_oot_registration import ( 5 | run_and_test_dummy_opt_api_server) 6 | 7 | 8 | def test_distributed_oot(dummy_opt_path: str): 9 | run_and_test_dummy_opt_api_server(dummy_opt_path, tp=2) 10 | -------------------------------------------------------------------------------- /vllm/lora/punica_wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase 5 | from vllm.lora.punica_wrapper.punica_selector import get_punica_wrapper 6 | 7 | __all__ = [ 8 | "PunicaWrapperBase", 9 | "get_punica_wrapper", 10 | ] 11 | -------------------------------------------------------------------------------- /examples/online_serving/chart-helm/templates/pvc.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.extraInit }} 2 | apiVersion: v1 3 | kind: PersistentVolumeClaim 4 | metadata: 5 | name: "{{ .Release.Name }}-storage-claim" 6 | namespace: {{ .Release.Namespace }} 7 | spec: 8 | accessModes: 9 | - ReadWriteOnce 10 | resources: 11 | requests: 12 | storage: {{ .Values.extraInit.pvcStorage }} 13 | {{- end }} -------------------------------------------------------------------------------- /docs/configuration/README.md: -------------------------------------------------------------------------------- 1 | # Configuration Options 2 | 3 | This section lists the most common options for running vLLM. 
4 | 5 | There are three main levels of configuration, from highest priority to lowest priority: 6 | 7 | - [Request parameters][completions-api] and [input arguments][sampling-params] 8 | - [Engine arguments](./engine_args.md) 9 | - [Environment variables](./env_vars.md) 10 | -------------------------------------------------------------------------------- /requirements/cpu-build.txt: -------------------------------------------------------------------------------- 1 | # Temporarily used for x86 CPU backend to avoid performance regression of torch>2.6.0+cpu, 2 | # see https://github.com/pytorch/pytorch/pull/151218 3 | cmake>=3.26.1 4 | ninja 5 | packaging>=24.2 6 | setuptools>=77.0.3,<80.0.0 7 | setuptools-scm>=8 8 | --extra-index-url https://download.pytorch.org/whl/cpu 9 | torch==2.6.0+cpu 10 | wheel 11 | jinja2>=3.1.6 12 | regex 13 | -------------------------------------------------------------------------------- /requirements/docs.txt: -------------------------------------------------------------------------------- 1 | mkdocs 2 | mkdocs-api-autonav 3 | mkdocs-material 4 | mkdocstrings-python 5 | mkdocs-gen-files 6 | mkdocs-awesome-nav 7 | mkdocs-glightbox 8 | mkdocs-git-revision-date-localized-plugin 9 | mkdocs-minify-plugin 10 | regex 11 | ruff 12 | 13 | # Required for argparse hook only 14 | -f https://download.pytorch.org/whl/cpu 15 | cachetools 16 | msgspec 17 | pydantic 18 | torch 19 | -------------------------------------------------------------------------------- /vllm/entrypoints/constants.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | """ 4 | Shared constants for vLLM entrypoints. 5 | """ 6 | 7 | # HTTP header limits for h11 parser 8 | # These constants help mitigate header abuse attacks 9 | H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT = 4194304 # 4 MB 10 | H11_MAX_HEADER_COUNT_DEFAULT = 256 11 | -------------------------------------------------------------------------------- /examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml: -------------------------------------------------------------------------------- 1 | local_cpu: False 2 | max_local_cpu_size: 0 3 | #local_disk: 4 | max_local_disk_size: 0 5 | remote_serde: NULL 6 | 7 | enable_nixl: True 8 | nixl_role: "receiver" 9 | nixl_peer_host: "localhost" 10 | nixl_peer_port: 55555 11 | nixl_buffer_size: 1073741824 # 1GB 12 | nixl_buffer_device: "cuda" 13 | nixl_enable_gc: True 14 | -------------------------------------------------------------------------------- /examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml: -------------------------------------------------------------------------------- 1 | local_cpu: False 2 | max_local_cpu_size: 0 3 | #local_disk: 4 | max_local_disk_size: 0 5 | remote_serde: NULL 6 | 7 | enable_nixl: True 8 | nixl_role: "sender" 9 | nixl_peer_host: "localhost" 10 | nixl_peer_port: 55555 11 | nixl_buffer_size: 1073741824 # 1GB 12 | nixl_buffer_device: "cuda" 13 | nixl_enable_gc: True 14 | -------------------------------------------------------------------------------- /.buildkite/scripts/tpu/quantized_v6e_1.env: -------------------------------------------------------------------------------- 1 | # Environment config 2 | TEST_NAME=llama8bw8a8 3 | CONTAINER_NAME=tpu-test 4 | 5 | # vllm config 6 | MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 7 | MAX_NUM_SEQS=128 8 | 
MAX_NUM_BATCHED_TOKENS=1024 9 | TENSOR_PARALLEL_SIZE=1 10 | MAX_MODEL_LEN=2048 11 | DOWNLOAD_DIR=/mnt/disks/persist 12 | EXPECTED_THROUGHPUT=10.0 13 | INPUT_LEN=1800 14 | OUTPUT_LEN=128 15 | -------------------------------------------------------------------------------- /.github/workflows/matchers/markdownlint.json: -------------------------------------------------------------------------------- 1 | { 2 | "problemMatcher": [ 3 | { 4 | "owner": "markdownlint", 5 | "pattern": [ 6 | { 7 | "regexp": "^([^:]*):(\\d+):?(\\d+)?\\s([\\w-\\/]*)\\s(.*)$", 8 | "file": 1, 9 | "line": 2, 10 | "column": 3, 11 | "code": 4, 12 | "message": 5 13 | } 14 | ] 15 | } 16 | ] 17 | } -------------------------------------------------------------------------------- /tests/test_embedded_commit.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import vllm 5 | 6 | 7 | def test_embedded_commit_defined(): 8 | assert hasattr(vllm, "__version__") 9 | assert hasattr(vllm, "__version_tuple__") 10 | assert vllm.__version__ != "dev" 11 | assert vllm.__version_tuple__ != (0, 0, "dev") 12 | -------------------------------------------------------------------------------- /tools/check_repo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Checks whether the repo is clean and whether tags are available (necessary to correctly produce vllm version at build time) 3 | 4 | if ! git diff --quiet; then 5 | echo "Repo is dirty" >&2 6 | 7 | exit 1 8 | fi 9 | 10 | if ! git describe --tags; then 11 | echo "No tags are present. Is this a shallow clone? git fetch --unshallow --tags" >&2 12 | 13 | exit 1 14 | fi 15 | -------------------------------------------------------------------------------- /docs/cli/json_tip.inc.md: -------------------------------------------------------------------------------- 1 | When passing JSON CLI arguments, the following sets of arguments are equivalent: 2 | 3 | - `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'` 4 | - `--json-arg.key1 value1 --json-arg.key2.key3 value2` 5 | 6 | Additionally, list elements can be passed individually using `+`: 7 | 8 | - `--json-arg '{"key4": ["value3", "value4", "value5"]}'` 9 | - `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'` -------------------------------------------------------------------------------- /vllm/transformers_utils/chat_templates/template_blip2.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {%- if message['role'] == 'user' -%} 3 | {{- 'Question: ' + message['content'] + ' ' -}} 4 | {%- elif message['role'] == 'assistant' -%} 5 | {{- 'Answer: ' + message['content'] + ' ' -}} 6 | {%- endif -%} 7 | {%- endfor -%} 8 | 9 | {%- if add_generation_prompt -%} 10 | {{- 'Answer:' -}} 11 | {% endif %} 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml: -------------------------------------------------------------------------------- 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1 2 | model_name: "Qwen/Qwen2.5-1.5B-Instruct" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.54 8 | - name: "exact_match,flexible-extract" 9 | value: 0.59 10 | 
limit: 1319 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /docs/examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | vLLM's examples are split into three categories: 4 | 5 | - If you are using vLLM from within Python code, see the *Offline Inference* section. 6 | - If you are using vLLM from an HTTP application or client, see the *Online Serving* section. 7 | - For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see the *Others* section. 8 | -------------------------------------------------------------------------------- /examples/offline_inference/disaggregated-prefill-v1/run.sh: -------------------------------------------------------------------------------- 1 | rm -rf local_storage/ 2 | 3 | if [ -f "output.txt" ]; then 4 | rm output.txt 5 | fi 6 | 7 | # The directory of current script 8 | SCRIPT_DIR=$(dirname "$(readlink -f "$0")") 9 | 10 | VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/prefill_example.py" 11 | VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/decode_example.py" 12 | -------------------------------------------------------------------------------- /tests/vllm_test_utils/vllm_test_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | """ 4 | vllm_utils is a package for vLLM testing utilities. 5 | It does not import any vLLM modules. 6 | """ 7 | 8 | from .blame import BlameResult, blame 9 | from .monitor import MonitoredValues, monitor 10 | 11 | __all__ = ["blame", "BlameResult", "monitor", "MonitoredValues"] 12 | -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from typing import Optional 5 | 6 | 7 | def dummy_platform_plugin() -> Optional[str]: 8 | return "vllm_add_dummy_platform.dummy_platform.DummyPlatform" 9 | 10 | 11 | def register_ops(): 12 | import vllm_add_dummy_platform.dummy_custom_ops # noqa 13 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/fla/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | # SPDX-FileCopyrightText: Songlin Yang, Yu Zhang 4 | # 5 | # This file contains code copied from the flash-linear-attention project. 
6 | # The original source code was licensed under the MIT license and included 7 | # the following copyright notice: 8 | # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang 9 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/quark/schemes/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from .quark_scheme import QuarkScheme 5 | from .quark_w4a4_mxfp4 import QuarkW4A4MXFP4 6 | from .quark_w8a8_fp8 import QuarkW8A8Fp8 7 | from .quark_w8a8_int8 import QuarkW8A8Int8 8 | 9 | __all__ = ["QuarkScheme", "QuarkW8A8Fp8", "QuarkW8A8Int8", "QuarkW4A4MXFP4"] 10 | -------------------------------------------------------------------------------- /vllm/tasks.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | from typing import Literal, get_args 4 | 5 | GenerationTask = Literal["generate", "transcription"] 6 | GENERATION_TASKS = get_args(GenerationTask) 7 | 8 | PoolingTask = Literal["encode", "embed", "classify", "score"] 9 | POOLING_TASKS = get_args(PoolingTask) 10 | 11 | SupportedTask = Literal[GenerationTask, PoolingTask] 12 | -------------------------------------------------------------------------------- /docs/deployment/frameworks/triton.md: -------------------------------------------------------------------------------- 1 | # NVIDIA Triton 2 | 3 | The [Triton Inference Server](https://github.com/triton-inference-server) hosts a tutorial demonstrating how to quickly deploy a simple [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) model using vLLM. Please see [Deploying a vLLM model in Triton](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton) for more details. 
4 | -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_model/setup.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from setuptools import setup 5 | 6 | setup(name='vllm_add_dummy_model', 7 | version='0.1', 8 | packages=['vllm_add_dummy_model'], 9 | entry_points={ 10 | 'vllm.general_plugins': 11 | ["register_dummy_model = vllm_add_dummy_model:register"] 12 | }) 13 | -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/kv_connector/base.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | """Defines the base type for KV cache connectors.""" 4 | 5 | from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1 6 | 7 | KVConnectorBase = KVConnectorBase_V1 8 | KVConnectorBaseType = KVConnectorBase_V1 9 | 10 | __all__ = ["KVConnectorBase", "KVConnectorBaseType"] 11 | -------------------------------------------------------------------------------- /vllm/transformers_utils/tokenizers/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from .mistral import (MistralTokenizer, maybe_serialize_tool_calls, 5 | truncate_tool_call_ids, validate_request_params) 6 | 7 | __all__ = [ 8 | "MistralTokenizer", "maybe_serialize_tool_calls", "truncate_tool_call_ids", 9 | "validate_request_params" 10 | ] 11 | -------------------------------------------------------------------------------- /docs/getting_started/installation/python_env_setup.inc.md: -------------------------------------------------------------------------------- 1 | It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following commands: 2 | 3 | ```bash 4 | uv venv --python 3.12 --seed 5 | source .venv/bin/activate 6 | ``` 7 | -------------------------------------------------------------------------------- /docs/models/extensions/fastsafetensor.md: -------------------------------------------------------------------------------- 1 | Loading Model weights with fastsafetensors 2 | =================================================================== 3 | 4 | Using fastsafetensors library enables loading model weights to GPU memory by leveraging GPU direct storage. See [their GitHub repository](https://github.com/foundation-model-stack/fastsafetensors) for more details. 
5 | 6 | To enable this feature, use the ``--load-format fastsafetensors`` command-line argument 7 | -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.attention.backends.placeholder_attn import ( 5 | PlaceholderAttentionBackend) 6 | 7 | 8 | class DummyAttentionBackend(PlaceholderAttentionBackend): 9 | 10 | @staticmethod 11 | def get_name() -> str: 12 | return "Dummy_Backend" 13 | -------------------------------------------------------------------------------- /vllm/lora/ops/triton_ops/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.lora.ops.triton_ops.lora_expand_op import lora_expand 5 | from vllm.lora.ops.triton_ops.lora_kernel_metadata import LoRAKernelMeta 6 | from vllm.lora.ops.triton_ops.lora_shrink_op import lora_shrink 7 | 8 | __all__ = [ 9 | "lora_expand", 10 | "lora_shrink", 11 | "LoRAKernelMeta", 12 | ] 13 | -------------------------------------------------------------------------------- /vllm/transformers_utils/chat_templates/template_chatml.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {{- '<|im_start|>' + message['role'] + '\n' + message['content'] -}} 3 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 4 | {{- '<|im_end|>' + '\n' -}} 5 | {%- endif -%} 6 | {%- endfor -%} 7 | 8 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 9 | {{- '<|im_start|>assistant\n' -}} 10 | {%- endif -%} 11 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | from typing import NamedTuple 4 | 5 | from compressed_tensors.transform import TransformArgs, TransformScheme 6 | 7 | __all__ = ["TransformTuple"] 8 | 9 | 10 | class TransformTuple(NamedTuple): 11 | scheme_name: str 12 | scheme: TransformScheme 13 | args: TransformArgs 14 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 
2 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4 3 | model_name: "Qwen/Qwen2-57B-A14B-Instruct" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.792 9 | - name: "exact_match,flexible-extract" 10 | value: 0.824 11 | limit: 250 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1 2 | model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.335 8 | - name: "exact_match,flexible-extract" 9 | value: 0.323 10 | limit: 1319 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml: -------------------------------------------------------------------------------- 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1 2 | model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.47 8 | - name: "exact_match,flexible-extract" 9 | value: 0.64 10 | limit: 1319 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /docs/deployment/frameworks/bentoml.md: -------------------------------------------------------------------------------- 1 | # BentoML 2 | 3 | [BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes. 4 | 5 | For details, see the tutorial [vLLM inference in the BentoML documentation](https://docs.bentoml.com/en/latest/use-cases/large-language-models/vllm.html). 6 | -------------------------------------------------------------------------------- /docs/usage/README.md: -------------------------------------------------------------------------------- 1 | # Using vLLM 2 | 3 | First, vLLM must be [installed](../getting_started/installation) for your chosen device in either a Python or Docker environment. 4 | 5 | Then, vLLM supports the following usage patterns: 6 | 7 | - [Inference and Serving](../serving/offline_inference.md): Run a single instance of a model. 8 | - [Deployment](../deployment/docker.md): Scale up model instances for production. 9 | - [Training](../training/rlhf.md): Train or fine-tune a model. 
10 | -------------------------------------------------------------------------------- /examples/template_teleflm.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages %} 2 | {%- if message['role'] == 'user' %} 3 | {{- '<_user>' + message['content']|trim }} 4 | {%- elif message['role'] == 'system' %} 5 | {{- '<_system>' + message['content']|trim }} 6 | {%- elif message['role'] == 'assistant' %} 7 | {{- '<_bot>' + message['content'] }} 8 | {%- endif %} 9 | {%- endfor %} 10 | {%- if add_generation_prompt %} 11 | {{- '<_bot>' }} 12 | {%- endif %} 13 | -------------------------------------------------------------------------------- /.buildkite/scripts/rerun-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Usage: ./rerun_test.sh path/to/test.py::test_name 4 | 5 | # Check if argument is given 6 | if [ $# -lt 1 ]; then 7 | echo "Usage: $0 path/to/test.py::test_name" 8 | echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]" 9 | exit 1 10 | fi 11 | 12 | TEST=$1 13 | COUNT=1 14 | 15 | while pytest -sv "$TEST"; do 16 | COUNT=$((COUNT + 1)) 17 | echo "RUN NUMBER ${COUNT}" 18 | done 19 | -------------------------------------------------------------------------------- /csrc/cutlass_extensions/common.cpp: -------------------------------------------------------------------------------- 1 | #include "cutlass_extensions/common.hpp" 2 | 3 | int32_t get_sm_version_num() { 4 | int32_t major_capability, minor_capability; 5 | cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor, 6 | 0); 7 | cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor, 8 | 0); 9 | int32_t version_num = major_capability * 10 + minor_capability; 10 | return version_num; 11 | } -------------------------------------------------------------------------------- /examples/online_serving/chart-helm/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: "{{ .Release.Name }}-service" 5 | namespace: {{ .Release.Namespace }} 6 | spec: 7 | type: ClusterIP 8 | ports: 9 | - name: {{ include "chart.service-port-name" . }} 10 | port: {{ include "chart.service-port" . }} 11 | targetPort: {{ include "chart.container-port-name" . }} 12 | protocol: TCP 13 | selector: 14 | {{- include "chart.labels" . | nindent 4 }} -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1 3 | model_name: "mgoin/Minitron-4B-Base-FP8" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.231 9 | - name: "exact_match,flexible-extract" 10 | value: 0.22 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /csrc/cub_helpers.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifndef USE_ROCM 4 | #include <cub/cub.cuh> 5 | #if CUB_VERSION >= 200800 6 | #include <cuda/functional> 7 | using CubAddOp = cuda::std::plus<>; 8 | using CubMaxOp = cuda::maximum<>; 9 | #else // if CUB_VERSION < 200800 10 | using CubAddOp = cub::Sum; 11 | using CubMaxOp = cub::Max; 12 | #endif // CUB_VERSION 13 | #else 14 | #include <hipcub/hipcub.hpp> 15 | using CubAddOp = cub::Sum; 16 | using CubMaxOp = cub::Max; 17 | #endif // USE_ROCM 18 | -------------------------------------------------------------------------------- /tools/png-lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Ensure that *.excalidraw.png files have the excalidraw metadata 4 | # embedded in them. This ensures they can be loaded back into 5 | # the tool and edited in the future. 6 | 7 | find . -iname '*.excalidraw.png' | while read -r file; do 8 | if git check-ignore -q "$file"; then 9 | continue 10 | fi 11 | if ! grep -q "excalidraw+json" "$file"; then 12 | echo "$file was not exported from excalidraw with 'Embed Scene' enabled." 13 | exit 1 14 | fi 15 | done 16 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml: -------------------------------------------------------------------------------- 1 | # For hf script, without -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 3 | model_name: "meta-llama/Meta-Llama-3-8B-Instruct" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.756 9 | - name: "exact_match,flexible-extract" 10 | value: 0.752 11 | limit: 250 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | /.venv 2 | /build 3 | dist 4 | vllm/*.so 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | .mypy_cache 12 | 13 | # Distribution / packaging 14 | .Python 15 | /build/ 16 | cmake-build-*/ 17 | CMakeUserPresets.json 18 | develop-eggs/ 19 | /dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 
2 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2 3 | model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.671 9 | - name: "exact_match,flexible-extract" 10 | value: 0.664 11 | limit: 1000 12 | num_fewshot: 5 13 | trust_remote_code: True -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml: -------------------------------------------------------------------------------- 1 | # For hf script, without -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5 3 | model_name: "meta-llama/Meta-Llama-3-70B-Instruct" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.892 9 | - name: "exact_match,flexible-extract" 10 | value: 0.892 11 | limit: 250 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4 3 | model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.624 9 | - name: "exact_match,flexible-extract" 10 | value: 0.624 11 | limit: 250 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /tests/plugins/prithvi_io_processor_plugin/setup.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from setuptools import setup 5 | 6 | setup( 7 | name="prithvi_io_processor_plugin", 8 | version="0.1", 9 | packages=["prithvi_io_processor"], 10 | entry_points={ 11 | "vllm.io_processor_plugins": [ 12 | "prithvi_to_tiff = prithvi_io_processor:register_prithvi", # noqa: E501 13 | ] 14 | }, 15 | ) 16 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml: -------------------------------------------------------------------------------- 1 | # For hf script, without -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 3 | model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.616 9 | - name: "exact_match,flexible-extract" 10 | value: 0.632 11 | limit: 250 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/scripts/ci-clean-log.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Usage: ./ci_clean_log.sh ci.log 3 | # This script strips timestamps and color codes from CI log files. 
4 | 5 | # Check if argument is given 6 | if [ $# -lt 1 ]; then 7 | echo "Usage: $0 ci.log" 8 | exit 1 9 | fi 10 | 11 | INPUT_FILE="$1" 12 | 13 | # Strip timestamps 14 | sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE" 15 | 16 | # Strip colorization 17 | sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE" 18 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | 6 | build: 7 | os: ubuntu-22.04 8 | tools: 9 | python: "3.12" 10 | jobs: 11 | post_checkout: 12 | - git fetch --unshallow || true 13 | 14 | mkdocs: 15 | configuration: mkdocs.yaml 16 | 17 | # Optionally declare the Python requirements required to build your docs 18 | python: 19 | install: 20 | - requirements: requirements/docs.txt 21 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1 3 | model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.578 9 | - name: "exact_match,flexible-extract" 10 | value: 0.585 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /tests/kernels/attention/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import pytest 5 | 6 | from vllm.utils import (create_kv_caches_with_random, 7 | create_kv_caches_with_random_flash) 8 | 9 | 10 | @pytest.fixture() 11 | def kv_cache_factory(): 12 | return create_kv_caches_with_random 13 | 14 | 15 | @pytest.fixture() 16 | def kv_cache_factory_flashinfer(): 17 | return create_kv_caches_with_random_flash 18 | -------------------------------------------------------------------------------- /.buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script build the CPU docker image and run the offline inference inside the container. 4 | # It serves a sanity check for compilation and basic model usage. 5 | set -ex 6 | 7 | # Setup cleanup 8 | remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; } 9 | trap remove_docker_container EXIT 10 | remove_docker_container 11 | 12 | # Try building the docker image 13 | docker build -t cpu-test -f docker/Dockerfile.s390x . 
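# Illustrative next step (a sketch, not the actual CI commands): once the image builds,
# a container can be started from it and a quick import check run inside as a sanity test, e.g.
#   docker run --rm --name cpu-test cpu-test python3 -c "import vllm; print(vllm.__version__)"
# The exact offline-inference command used in CI may differ.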
14 | -------------------------------------------------------------------------------- /docs/mkdocs/javascript/mathjax.js: -------------------------------------------------------------------------------- 1 | // Enables MathJax rendering 2 | window.MathJax = { 3 | tex: { 4 | inlineMath: [["\\(", "\\)"]], 5 | displayMath: [["\\[", "\\]"]], 6 | processEscapes: true, 7 | processEnvironments: true 8 | }, 9 | options: { 10 | ignoreHtmlClass: ".*|", 11 | processHtmlClass: "arithmatex" 12 | } 13 | }; 14 | 15 | document$.subscribe(() => { 16 | MathJax.startup.output.clearCache() 17 | MathJax.typesetClear() 18 | MathJax.texReset() 19 | MathJax.typesetPromise() 20 | }) 21 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1 3 | model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.753 9 | - name: "exact_match,flexible-extract" 10 | value: 0.753 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8 3 | model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.86 9 | - name: "exact_match,flexible-extract" 10 | value: 0.86 11 | limit: 250 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.shellcheckrc: -------------------------------------------------------------------------------- 1 | # rules currently disabled: 2 | # 3 | # SC1091 (info): Not following: was not specified as input (see shellcheck -x) 4 | # SC2004 (style): $/${} is unnecessary on arithmetic variables. 5 | # SC2129 (style): Consider using { cmd1; cmd2; } >> file instead of individual redirects. 6 | # SC2155 (warning): Declare and assign separately to avoid masking return values. 7 | # SC2164 (warning): Use 'cd ... || exit' or 'cd ... || return' in case cd fails. 
8 | # 9 | disable=SC1091,SC2004,SC2129,SC2155,SC2164 10 | -------------------------------------------------------------------------------- /.github/workflows/matchers/actionlint.json: -------------------------------------------------------------------------------- 1 | { 2 | "problemMatcher": [ 3 | { 4 | "owner": "actionlint", 5 | "pattern": [ 6 | { 7 | "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$", 8 | "file": 1, 9 | "line": 2, 10 | "column": 3, 11 | "message": 4, 12 | "code": 5 13 | } 14 | ] 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /csrc/cpu/cpu_types.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPU_TYPES_HPP 2 | #define CPU_TYPES_HPP 3 | 4 | #if defined(__x86_64__) 5 | // x86 implementation 6 | #include "cpu_types_x86.hpp" 7 | #elif defined(__POWER9_VECTOR__) 8 | // ppc implementation 9 | #include "cpu_types_vsx.hpp" 10 | #elif defined(__s390x__) 11 | // s390 implementation 12 | #include "cpu_types_vxe.hpp" 13 | #elif defined(__aarch64__) 14 | // arm implementation 15 | #include "cpu_types_arm.hpp" 16 | #else 17 | #warning "unsupported vLLM cpu implementation" 18 | #endif 19 | 20 | #endif -------------------------------------------------------------------------------- /examples/template_falcon.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {%- if message['role'] == 'user' -%} 3 | {{- 'User: ' + message['content'] -}} 4 | {%- elif message['role'] == 'assistant' -%} 5 | {{- 'Assistant: ' + message['content'] -}} 6 | {%- endif -%} 7 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 8 | {{- '\n' -}} 9 | {%- endif -%} 10 | {%- endfor -%} 11 | 12 | 13 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 14 | {{- 'Assistant:' -}} 15 | {% endif %} -------------------------------------------------------------------------------- /vllm/entrypoints/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | from vllm.entrypoints.cli.benchmark.latency import BenchmarkLatencySubcommand 4 | from vllm.entrypoints.cli.benchmark.serve import BenchmarkServingSubcommand 5 | from vllm.entrypoints.cli.benchmark.throughput import ( 6 | BenchmarkThroughputSubcommand) 7 | 8 | __all__: list[str] = [ 9 | "BenchmarkLatencySubcommand", 10 | "BenchmarkServingSubcommand", 11 | "BenchmarkThroughputSubcommand", 12 | ] -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1 3 | model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.30 9 | - name: "exact_match,flexible-extract" 10 | value: 0.465 11 | limit: 1319 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /tests/models/language/generation_ppl_test/test_gpt.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import pytest 4 | 5 | from tests.models.utils import GenerateModelInfo 6 | 7 | from .ppl_utils import wikitext_ppl_test 8 | 9 | MODELS = [GenerateModelInfo("openai-community/gpt2-large")] 10 | 11 | 12 | @pytest.mark.parametrize("model_info", MODELS) 13 | def test_ppl(hf_runner, vllm_runner, model_info: GenerateModelInfo): 14 | wikitext_ppl_test(hf_runner, vllm_runner, model_info) 15 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 3 | model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.593 9 | - name: "exact_match,flexible-extract" 10 | value: 0.588 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "4": { 3 | "BLOCK_SIZE_M": 16, 4 | "BLOCK_SIZE_N": 32, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "8": { 11 | "BLOCK_SIZE_M": 16, 12 | "BLOCK_SIZE_N": 32, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 32, 15 | "num_warps": 4, 16 | "num_stages": 4 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1 3 | model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.595 9 | - name: "exact_match,flexible-extract" 10 | value: 0.582 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /vllm/model_executor/models/phi3.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | # Adapted from llama.py 5 | """Inference-only Phi3 model code inherit from Llama.py""" 6 | 7 | from vllm.model_executor.models.llama import LlamaForCausalLM 8 | 9 | 10 | class Phi3ForCausalLM(LlamaForCausalLM): 11 | 12 | packed_modules_mapping = { 13 | "qkv_proj": [ 14 | "qkv_proj", 15 | ], 16 | "gate_up_proj": [ 17 | "gate_up_proj", 18 | ], 19 | } 20 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml: -------------------------------------------------------------------------------- 1 | # For hf script, without -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 3 | model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.905 9 | - name: "exact_match,flexible-extract" 10 | value: 0.905 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1 3 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.753 9 | - name: "exact_match,flexible-extract" 10 | value: 0.753 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 3 | model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.356 9 | - name: "exact_match,flexible-extract" 10 | value: 0.358 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /vllm/scripts.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.entrypoints.cli.main import main as vllm_main 5 | from vllm.logger import init_logger 6 | 7 | logger = init_logger(__name__) 8 | 9 | 10 | # Backwards compatibility for the move from vllm.scripts to 11 | # vllm.entrypoints.cli.main 12 | def main(): 13 | logger.warning("vllm.scripts.main() is deprecated. Please re-install " 14 | "vllm or use vllm.entrypoints.cli.main.main() instead.") 15 | vllm_main() 16 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1 3 | model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.755 9 | - name: "exact_match,flexible-extract" 10 | value: 0.755 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /docs/deployment/frameworks/lobe-chat.md: -------------------------------------------------------------------------------- 1 | # Lobe Chat 2 | 3 | [Lobe Chat](https://github.com/lobehub/lobe-chat) is an open-source, modern-design ChatGPT/LLMs UI/Framework. 4 | 5 | Supports speech-synthesis, multi-modal, and extensible (function call) plugin system. 6 | 7 | One-click FREE deployment of your private OpenAI ChatGPT/Claude/Gemini/Groq/Ollama chat application. 8 | 9 | It supports vLLM as an AI model provider to efficiently serve large language models. 10 | 11 | For details, see the tutorial [Using vLLM in LobeChat](https://lobehub.com/docs/usage/providers/vllm). 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1 3 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.758 9 | - name: "exact_match,flexible-extract" 10 | value: 0.759 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /benchmarks/benchmark_serving.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | print("""DEPRECATED: This script has been moved to the vLLM CLI. 7 | 8 | Please use the following command instead: 9 | vllm bench serve 10 | 11 | For help with the new command, run: 12 | vllm bench serve --help 13 | 14 | Alternatively, you can run the new command directly with: 15 | python -m vllm.entrypoints.cli.main bench serve --help 16 | """) 17 | sys.exit(1) 18 | -------------------------------------------------------------------------------- /benchmarks/benchmark_latency.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | print("""DEPRECATED: This script has been moved to the vLLM CLI. 7 | 8 | Please use the following command instead: 9 | vllm bench latency 10 | 11 | For help with the new command, run: 12 | vllm bench latency --help 13 | 14 | Alternatively, you can run the new command directly with: 15 | python -m vllm.entrypoints.cli.main bench latency --help 16 | """) 17 | sys.exit(1) 18 | -------------------------------------------------------------------------------- /tools/ep_kernels/configure_system_drivers.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | # turn on IBGDA 4 | echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;"' | tee -a /etc/modprobe.d/nvidia.conf 5 | 6 | if command -v update-initramfs &> /dev/null; then 7 | # for Debian/Ubuntu 8 | sudo update-initramfs -u 9 | elif command -v dracut &> /dev/null; then 10 | # for Fedora/CentOS 11 | sudo dracut --force 12 | else 13 | echo "No supported initramfs update tool found." 14 | exit 1 15 | fi 16 | 17 | echo "Please reboot the system to apply the changes" 18 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 
2 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2 3 | model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.6353 9 | - name: "exact_match,flexible-extract" 10 | value: 0.637 11 | limit: null 12 | num_fewshot: null 13 | -------------------------------------------------------------------------------- /csrc/quantization/per_token_group_quant_8bit.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | // TODO(wentao): refactor the folder to 8bit, then includes fp8 and int8 folders 5 | // 8-bit per-token-group quantization helper used by both FP8 and INT8 6 | void per_token_group_quant_8bit(const torch::Tensor& input, 7 | torch::Tensor& output_q, 8 | torch::Tensor& output_s, int64_t group_size, 9 | double eps, double min_8bit, double max_8bit, 10 | bool scale_ue8m0 = false); -------------------------------------------------------------------------------- /benchmarks/benchmark_throughput.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | print("""DEPRECATED: This script has been moved to the vLLM CLI. 7 | 8 | Please use the following command instead: 9 | vllm bench throughput 10 | 11 | For help with the new command, run: 12 | vllm bench throughput --help 13 | 14 | Alternatively, you can run the new command directly with: 15 | python -m vllm.entrypoints.cli.main bench throughput --help 16 | """) 17 | sys.exit(1) 18 | -------------------------------------------------------------------------------- /examples/template_baichuan.jinja: -------------------------------------------------------------------------------- 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 2 | 3 | {%- for message in messages -%} 4 | {%- if message['role'] == 'user' -%} 5 | {{- '' + message['content'] -}} 6 | {%- elif message['role'] == 'assistant' -%} 7 | {{- '' + message['content'] -}} 8 | {%- endif -%} 9 | {%- endfor -%} 10 | 11 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 12 | {{- '' -}} 13 | {% endif %} -------------------------------------------------------------------------------- /benchmarks/structured_schemas/structured_schema_1.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "name": { "type": "string" }, 5 | "email": { "type": "string" }, 6 | "street": { "type": "string" }, 7 | "city": { "type": "string" }, 8 | "state": { "type": "string" }, 9 | "zip": { "type": "string" }, 10 | "phone": { "type": "string" }, 11 | "website": { "type": "string" }, 12 | "company": { "type": "string" }, 13 | "age": { "type": "integer" } 14 | }, 15 | "required": [ 16 | "name", 17 | "email" 18 | ] 19 | } 20 | -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 
4 | from vllm.distributed.kv_transfer.kv_transfer_state import ( 5 | KVConnectorBaseType, ensure_kv_transfer_initialized, 6 | ensure_kv_transfer_shutdown, get_kv_transfer_group, has_kv_transfer_group, 7 | is_v1_kv_transfer_group) 8 | 9 | __all__ = [ 10 | "get_kv_transfer_group", "has_kv_transfer_group", 11 | "is_v1_kv_transfer_group", "ensure_kv_transfer_initialized", 12 | "ensure_kv_transfer_shutdown", "KVConnectorBaseType" 13 | ] 14 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1 3 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.728 9 | - name: "exact_match,flexible-extract" 10 | value: 0.728 11 | limit: 250 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_platform/setup.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from setuptools import setup 5 | 6 | setup( 7 | name='vllm_add_dummy_platform', 8 | version='0.1', 9 | packages=['vllm_add_dummy_platform'], 10 | entry_points={ 11 | 'vllm.platform_plugins': [ 12 | "dummy_platform_plugin = vllm_add_dummy_platform:dummy_platform_plugin" # noqa 13 | ], 14 | "vllm.general_plugins": 15 | ["dummy_custom_ops = vllm_add_dummy_platform:register_ops"], 16 | }) 17 | -------------------------------------------------------------------------------- /vllm/v1/worker/ubatch_utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | from dataclasses import dataclass 4 | 5 | from typing_extensions import TypeAlias 6 | 7 | 8 | @dataclass 9 | class UBatchSlice: 10 | request_slice: slice 11 | token_slice: slice 12 | 13 | 14 | UBatchSlices: TypeAlias = list[UBatchSlice] 15 | 16 | 17 | def is_second_ubatch_empty(orig_num_tokens_per_ubatch: int, 18 | padded_num_tokens_per_ubatch: int) -> bool: 19 | return padded_num_tokens_per_ubatch >= 2 * orig_num_tokens_per_ubatch 20 | -------------------------------------------------------------------------------- /examples/offline_inference/openai_batch/openai_example_batch.jsonl: -------------------------------------------------------------------------------- 1 | {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} 2 | {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} 3 | 
-------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1 3 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.752 9 | - name: "exact_match,flexible-extract" 10 | value: 0.754 11 | limit: 1000 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml: -------------------------------------------------------------------------------- 1 | # For vllm script, with -t option (tensor parallel size). 2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1 3 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test" 4 | tasks: 5 | - name: "gsm8k" 6 | metrics: 7 | - name: "exact_match,strict-match" 8 | value: 0.764 9 | - name: "exact_match,flexible-extract" 10 | value: 0.764 11 | limit: 250 12 | num_fewshot: 5 13 | -------------------------------------------------------------------------------- /tests/weight_loading/models-large.txt: -------------------------------------------------------------------------------- 1 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main 2 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main 3 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W8A16-quantized, main 4 | compressed-tensors, nm-testing/test-w4a16-mixtral-actorder-group, main 5 | gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, main 6 | gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, gptq-8bit-128g-actorder_True 7 | awq_marlin, casperhansen/deepseek-coder-v2-instruct-awq, main 8 | compressed-tensors, RedHatAI/Llama-4-Scout-17B-16E-Instruct-quantized.w4a16, main -------------------------------------------------------------------------------- /vllm/lora/ops/torch_ops/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.lora.ops.torch_ops.lora_ops import bgmv_expand # noqa: F401 5 | from vllm.lora.ops.torch_ops.lora_ops import (bgmv_expand_slice, bgmv_shrink, 6 | sgmv_expand, sgmv_expand_slice, 7 | sgmv_shrink) 8 | 9 | __all__ = [ 10 | "bgmv_expand", 11 | "bgmv_expand_slice", 12 | "bgmv_shrink", 13 | "sgmv_expand", 14 | "sgmv_expand_slice", 15 | "sgmv_shrink", 16 | ] 17 | -------------------------------------------------------------------------------- /vllm/triton_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.triton_utils.importing import (HAS_TRITON, TritonLanguagePlaceholder, 5 | TritonPlaceholder) 6 | 7 | if HAS_TRITON: 8 | 
import triton 9 | import triton.language as tl 10 | import triton.language.extra.libdevice as tldevice 11 | else: 12 | triton = TritonPlaceholder() 13 | tl = TritonLanguagePlaceholder() 14 | tldevice = TritonLanguagePlaceholder() 15 | 16 | __all__ = ["HAS_TRITON", "triton", "tl", "tldevice"] 17 | -------------------------------------------------------------------------------- /requirements/rocm.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r common.txt 3 | 4 | numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding 5 | numba == 0.61.2; python_version > '3.9' 6 | 7 | # Dependencies for AMD GPUs 8 | boto3 9 | botocore 10 | datasets 11 | ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1. 12 | peft 13 | pytest-asyncio 14 | tensorizer==2.10.1 15 | packaging>=24.2 16 | setuptools>=77.0.3,<80.0.0 17 | setuptools-scm>=8 18 | runai-model-streamer==0.11.0 19 | runai-model-streamer-s3==0.11.0 20 | conch-triton-kernels==1.2.1 21 | timm>=1.0.17 -------------------------------------------------------------------------------- /vllm/ray/lazy_utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | 5 | def is_ray_initialized(): 6 | """Check if Ray is initialized.""" 7 | try: 8 | import ray 9 | return ray.is_initialized() 10 | except ImportError: 11 | return False 12 | 13 | 14 | def is_in_ray_actor(): 15 | """Check if we are in a Ray actor.""" 16 | 17 | try: 18 | import ray 19 | return (ray.is_initialized() 20 | and ray.get_runtime_context().get_actor_id() is not None) 21 | except ImportError: 22 | return False 23 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = vllm 3 | omit = 4 | */tests/* 5 | */test_* 6 | */__pycache__/* 7 | */build/* 8 | */dist/* 9 | */vllm.egg-info/* 10 | */third_party/* 11 | */examples/* 12 | */benchmarks/* 13 | */docs/* 14 | 15 | [report] 16 | exclude_lines = 17 | pragma: no cover 18 | def __repr__ 19 | if self.debug: 20 | if settings.DEBUG 21 | raise AssertionError 22 | raise NotImplementedError 23 | if 0: 24 | if __name__ == .__main__.: 25 | class .*\bProtocol\): 26 | @(abc\.)?abstractmethod 27 | 28 | [html] 29 | directory = htmlcov 30 | 31 | [xml] 32 | output = coverage.xml 33 | -------------------------------------------------------------------------------- /examples/template_falcon_180b.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {%- if message['role'] == 'system' -%} 3 | {{- 'System: ' + message['content'] -}} 4 | {%- elif message['role'] == 'user' -%} 5 | {{- 'User: ' + message['content'] -}} 6 | {%- elif message['role'] == 'assistant' -%} 7 | {{- 'Falcon: ' + message['content'] -}} 8 | {%- endif -%} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n' -}} 11 | {%- endif -%} 12 | {%- endfor -%} 13 | 14 | 15 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 16 | {{- 'Falcon:' -}} 17 | {% endif %} -------------------------------------------------------------------------------- /tests/models/language/generation_ppl_test/test_gemma.py: 
-------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import pytest 4 | 5 | from tests.models.utils import GenerateModelInfo 6 | 7 | from .ppl_utils import wikitext_ppl_test 8 | 9 | MODELS = [ 10 | GenerateModelInfo("google/gemma-2b"), 11 | GenerateModelInfo("google/gemma-2-2b"), 12 | GenerateModelInfo("google/gemma-3-4b-it"), 13 | ] 14 | 15 | 16 | @pytest.mark.parametrize("model_info", MODELS) 17 | def test_ppl(hf_runner, vllm_runner, model_info: GenerateModelInfo): 18 | wikitext_ppl_test(hf_runner, vllm_runner, model_info) 19 | -------------------------------------------------------------------------------- /tests/test_outputs.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.outputs import RequestOutput 5 | 6 | 7 | def test_request_output_forward_compatible(): 8 | output = RequestOutput(request_id="test_request_id", 9 | prompt="test prompt", 10 | prompt_token_ids=[1, 2, 3], 11 | prompt_logprobs=None, 12 | outputs=[], 13 | finished=False, 14 | example_arg_added_in_new_version="some_value") 15 | assert output is not None 16 | -------------------------------------------------------------------------------- /vllm/model_executor/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.model_executor.parameter import (BasevLLMParameter, 5 | PackedvLLMParameter) 6 | from vllm.model_executor.sampling_metadata import (SamplingMetadata, 7 | SamplingMetadataCache) 8 | from vllm.model_executor.utils import set_random_seed 9 | 10 | __all__ = [ 11 | "SamplingMetadata", 12 | "SamplingMetadataCache", 13 | "set_random_seed", 14 | "BasevLLMParameter", 15 | "PackedvLLMParameter", 16 | ] 17 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/fused_moe/configs/README: -------------------------------------------------------------------------------- 1 | This directory contains tuned configurations for different settings of the fused_moe kernel. 2 | For different settings of 3 | - E (number of experts) 4 | - N (intermediate size) 5 | - device_name (torch.cuda.get_device_name()) 6 | the JSON file contains a mapping from M (batch size) to the chosen configuration. 7 | 8 | The example configurations provided are for the Mixtral model for TP2 on H100 9 | and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have 10 | N = 7168 and for TP4 we have N = 3584. 11 | 12 | See `benchmark/kernels/benchmark_moe.py` on how to generate these config files. 
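For illustration, each entry in such a file maps a batch size M to a Triton kernel configuration. A single entry looks roughly like the following (values mirror the block-quantized GEMM configs elsewhere in this repository and are illustrative only): "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}.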
13 | -------------------------------------------------------------------------------- /vllm/v1/spec_decode/utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | from vllm.sampling_params import SamplingParams 4 | 5 | _SAMPLING_EPS = 1e-5 6 | 7 | 8 | def is_spec_decode_unsupported(sampling_params: SamplingParams) -> bool: 9 | """True if request is incompatible with speculative decoding""" 10 | return (sampling_params.frequency_penalty != 0.0 11 | or sampling_params.presence_penalty != 0.0 12 | or sampling_params.repetition_penalty != 1.0 13 | or sampling_params.min_p > _SAMPLING_EPS 14 | or sampling_params.logprobs is not None) 15 | -------------------------------------------------------------------------------- /requirements/xpu.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r common.txt 3 | 4 | ray>=2.9 5 | cmake>=3.26.1 6 | packaging>=24.2 7 | setuptools-scm>=8 8 | setuptools>=77.0.3,<80.0.0 9 | wheel 10 | jinja2>=3.1.6 11 | datasets # for benchmark scripts 12 | numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding 13 | nixl==0.3.0 # for PD disaggregation 14 | torch==2.8.0+xpu 15 | torchaudio 16 | torchvision 17 | --extra-index-url=https://download.pytorch.org/whl/xpu 18 | 19 | intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post0%2Bxpu-cp312-cp312-linux_x86_64.whl 20 | -------------------------------------------------------------------------------- /examples/online_serving/prometheus_grafana/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # docker-compose.yaml 2 | version: "3" 3 | 4 | services: 5 | prometheus: 6 | image: prom/prometheus:latest 7 | extra_hosts: 8 | - "host.docker.internal:host-gateway" # allow a direct connection from container to the local machine 9 | ports: 10 | - "9090:9090" # the default port used by Prometheus 11 | volumes: 12 | - ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml # mount Prometheus config file 13 | 14 | grafana: 15 | image: grafana/grafana:latest 16 | depends_on: 17 | - prometheus 18 | ports: 19 | - "3000:3000" # the default port used by Grafana 20 | -------------------------------------------------------------------------------- /.buildkite/nightly-benchmarks/tests/genai-perf-tests.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "test_name": "llama8B_tp1_genai_perf", 4 | "qps_list": [4,8,16,32], 5 | "common_parameters": { 6 | "model": "meta-llama/Meta-Llama-3-8B-Instruct", 7 | "tp": 1, 8 | "port": 8000, 9 | "num_prompts": 500, 10 | "reuse_server": false 11 | }, 12 | "vllm_server_parameters": { 13 | "disable_log_stats": "", 14 | "gpu_memory_utilization": 0.9, 15 | "max_num_seqs": 512, 16 | "dtype": "bfloat16" 17 | }, 18 | "genai_perf_input_parameters": { 19 | } 20 | } 21 | ] -------------------------------------------------------------------------------- /.github/workflows/scripts/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | 4 | python_executable=python$1 5 | cuda_home=/usr/local/cuda-$2 6 | 7 | # Update paths 8 | PATH=${cuda_home}/bin:$PATH 9 | LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH 10 | 11 | 
# Install requirements 12 | $python_executable -m pip install -r requirements/build.txt -r requirements/cuda.txt 13 | 14 | # Limit the number of parallel jobs to avoid OOM 15 | export MAX_JOBS=1 16 | # Make sure release wheels are built for the following architectures 17 | export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" 18 | 19 | bash tools/check_repo.sh 20 | 21 | # Build 22 | $python_executable setup.py bdist_wheel --dist-dir=dist 23 | -------------------------------------------------------------------------------- /tests/tokenization/test_do_lower_case.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import pytest 5 | 6 | from vllm.transformers_utils.tokenizer import get_tokenizer 7 | 8 | TOKENIZER_NAMES = ["BAAI/bge-base-en"] 9 | 10 | 11 | @pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES) 12 | @pytest.mark.parametrize("n_tokens", [510]) 13 | def test_special_tokens(tokenizer_name: str, n_tokens: int): 14 | tokenizer = get_tokenizer(tokenizer_name, revision="main") 15 | 16 | prompts = '[UNK]' * n_tokens 17 | prompt_token_ids = tokenizer.encode(prompts) 18 | assert len(prompt_token_ids) == n_tokens + 2 19 | -------------------------------------------------------------------------------- /vllm/transformers_utils/config_parser_base.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from abc import ABC, abstractmethod 5 | from pathlib import Path 6 | from typing import Optional, Union 7 | 8 | from transformers import PretrainedConfig 9 | 10 | 11 | class ConfigParserBase(ABC): 12 | 13 | @abstractmethod 14 | def parse(self, 15 | model: Union[str, Path], 16 | trust_remote_code: bool, 17 | revision: Optional[str] = None, 18 | code_revision: Optional[str] = None, 19 | **kwargs) -> tuple[dict, PretrainedConfig]: 20 | raise NotImplementedError 21 | -------------------------------------------------------------------------------- /examples/template_chatglm.jinja: -------------------------------------------------------------------------------- 1 | {%- set counter = namespace(index=0) -%} 2 | {%- for message in messages -%} 3 | {%- if message['role'] == 'user' -%} 4 | {{- '[Round ' + counter.index|string + ']\n问:' + message['content'] -}} 5 | {%- set counter.index = counter.index + 1 -%} 6 | {%- endif -%} 7 | {%- if message['role'] == 'assistant' -%} 8 | {{- '\n答:' + message['content'] -}} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n' -}} 11 | {%- endif -%} 12 | {%- endif -%} 13 | {%- endfor -%} 14 | 15 | 16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 17 | {{- '\n答:' -}} 18 | {%- endif -%} -------------------------------------------------------------------------------- /examples/template_chatglm2.jinja: -------------------------------------------------------------------------------- 1 | {%- set counter = namespace(index=1) -%} 2 | {%- for message in messages -%} 3 | {%- if message['role'] == 'user' -%} 4 | {{- '[Round ' + counter.index|string + ']\n\n问:' + message['content'] -}} 5 | {%- set counter.index = counter.index + 1 -%} 6 | {%- endif -%} 7 | {%- if message['role'] == 'assistant' -%} 8 | {{- '\n\n答:' + message['content'] -}} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | 
{{- '\n\n' -}} 11 | {%- endif -%} 12 | {%- endif -%} 13 | {%- endfor -%} 14 | 15 | 16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 17 | {{- '\n\n答:' -}} 18 | {%- endif -%} -------------------------------------------------------------------------------- /docs/serving/integrations/llamaindex.md: -------------------------------------------------------------------------------- 1 | # LlamaIndex 2 | 3 | vLLM is also available via [LlamaIndex](https://github.com/run-llama/llama_index) . 4 | 5 | To install LlamaIndex, run 6 | 7 | ```bash 8 | pip install llama-index-llms-vllm -q 9 | ``` 10 | 11 | To run inference on a single or multiple GPUs, use `Vllm` class from `llamaindex`. 12 | 13 | ```python 14 | from llama_index.llms.vllm import Vllm 15 | 16 | llm = Vllm( 17 | model="microsoft/Orca-2-7b", 18 | tensor_parallel_size=4, 19 | max_new_tokens=100, 20 | vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5}, 21 | ) 22 | ``` 23 | 24 | Please refer to this [Tutorial](https://docs.llamaindex.ai/en/latest/examples/llm/vllm/) for more details. 25 | -------------------------------------------------------------------------------- /tests/benchmarks/test_latency_cli.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import subprocess 4 | 5 | import pytest 6 | 7 | MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" 8 | 9 | 10 | @pytest.mark.benchmark 11 | def test_bench_latency(): 12 | command = [ 13 | "vllm", "bench", "latency", "--model", MODEL_NAME, "--input-len", "32", 14 | "--output-len", "1", "--enforce-eager", "--load-format", "dummy" 15 | ] 16 | result = subprocess.run(command, capture_output=True, text=True) 17 | print(result.stdout) 18 | print(result.stderr) 19 | 20 | assert result.returncode == 0, f"Benchmark failed: {result.stderr}" 21 | -------------------------------------------------------------------------------- /tests/benchmarks/test_throughput_cli.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import subprocess 4 | 5 | import pytest 6 | 7 | MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" 8 | 9 | 10 | @pytest.mark.benchmark 11 | def test_bench_throughput(): 12 | command = [ 13 | "vllm", "bench", "throughput", "--model", MODEL_NAME, "--input-len", 14 | "32", "--output-len", "1", "--enforce-eager", "--load-format", "dummy" 15 | ] 16 | result = subprocess.run(command, capture_output=True, text=True) 17 | print(result.stdout) 18 | print(result.stderr) 19 | 20 | assert result.returncode == 0, f"Benchmark failed: {result.stderr}" 21 | -------------------------------------------------------------------------------- /tests/kernels/core/test_permute_cols.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import pytest 5 | import torch 6 | 7 | from tests.kernels.utils import opcheck 8 | from vllm._custom_ops import permute_cols 9 | 10 | 11 | @pytest.mark.parametrize('shape', [(1, 512), (544, 4096), (67, 8192)]) 12 | @pytest.mark.parametrize('dtype', [torch.bfloat16, torch.float16]) 13 | def test_permute_cols(shape, dtype): 14 | x = torch.randn(shape, dtype=dtype).cuda() 15 | perm = 
torch.randperm(x.shape[1]).to(torch.int).cuda() 16 | opcheck(torch.ops._C.permute_cols, (x, perm)) 17 | y = permute_cols(x, perm) 18 | torch.testing.assert_close(y, x[:, perm]) -------------------------------------------------------------------------------- /tests/kernels/allclose_default.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import torch 5 | 6 | # Reference default values of atol and rtol are from 7 | # https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67 8 | default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5} 9 | default_rtol = { 10 | torch.float16: 1e-3, 11 | torch.bfloat16: 1.6e-2, 12 | torch.float: 1.3e-6 13 | } 14 | 15 | 16 | def get_default_atol(output) -> float: 17 | return default_atol[output.dtype] 18 | 19 | 20 | def get_default_rtol(output) -> float: 21 | return default_rtol[output.dtype] 22 | -------------------------------------------------------------------------------- /.github/workflows/scripts/create_release.js: -------------------------------------------------------------------------------- 1 | // Uses GitHub's API to create the release and wait for result. 2 | // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately. 3 | 4 | module.exports = async (github, context, core) => { 5 | try { 6 | const response = await github.rest.repos.createRelease({ 7 | draft: false, 8 | generate_release_notes: true, 9 | name: process.env.RELEASE_TAG, 10 | owner: context.repo.owner, 11 | prerelease: true, 12 | repo: context.repo.repo, 13 | tag_name: process.env.RELEASE_TAG, 14 | }); 15 | 16 | core.setOutput('upload_url', response.data.upload_url); 17 | } catch (error) { 18 | core.setFailed(error.message); 19 | } 20 | } -------------------------------------------------------------------------------- /tests/evals/gpt_oss/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | """ 4 | Pytest configuration for GPT-OSS evaluation tests. 5 | """ 6 | 7 | 8 | def pytest_addoption(parser): 9 | """Add command line options for pytest.""" 10 | parser.addoption("--model", action="store", help="Model name to evaluate") 11 | parser.addoption("--metric", 12 | action="store", 13 | type=float, 14 | help="Expected metric threshold") 15 | parser.addoption("--server-args", 16 | action="store", 17 | default="", 18 | help="Additional server arguments") 19 | -------------------------------------------------------------------------------- /vllm/transformers_utils/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | """ 4 | Multi-modal processors may be defined in this directory for the following 5 | reasons: 6 | 7 | - There is no processing file defined by HF Hub or Transformers library. 8 | - There is a need to override the existing processor to support vLLM. 
9 | """ 10 | 11 | from vllm.transformers_utils.processors.deepseek_vl2 import ( 12 | DeepseekVLV2Processor) 13 | from vllm.transformers_utils.processors.ovis import OvisProcessor 14 | from vllm.transformers_utils.processors.ovis2_5 import Ovis2_5Processor 15 | 16 | __all__ = ["DeepseekVLV2Processor", "OvisProcessor", "Ovis2_5Processor"] 17 | -------------------------------------------------------------------------------- /benchmarks/multi_turn/bench_utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import logging 4 | from enum import Enum 5 | 6 | 7 | class Color(Enum): 8 | RED = "\033[91m" 9 | GREEN = "\033[92m" 10 | BLUE = "\033[94m" 11 | PURPLE = "\033[95m" 12 | CYAN = "\033[96m" 13 | YELLOW = "\033[93m" 14 | RESET = "\033[0m" 15 | 16 | def __str__(self): 17 | return self.value 18 | 19 | 20 | TEXT_SEPARATOR = "-" * 100 21 | 22 | # Configure the logger 23 | logging.basicConfig( 24 | level=logging.INFO, 25 | format="%(asctime)s [%(levelname)s] - %(message)s", 26 | datefmt="%d-%m-%Y %H:%M:%S", 27 | ) 28 | logger = logging.getLogger(__name__) 29 | -------------------------------------------------------------------------------- /examples/offline_inference/disaggregated-prefill-v1/README.md: -------------------------------------------------------------------------------- 1 | # Disaggregated Prefill V1 2 | 3 | This example contains scripts that demonstrate disaggregated prefill in the offline setting of vLLM. 4 | 5 | ## Files 6 | 7 | - `run.sh` - A helper script that will run `prefill_example.py` and `decode_example.py` sequentially. 8 | - Make sure you are in the `examples/offline_inference/disaggregated-prefill-v1` directory before running `run.sh`. 9 | - `prefill_example.py` - A script which performs prefill only, saving the KV state to the `local_storage` directory and the prompts to `output.txt`. 10 | - `decode_example.py` - A script which performs decode only, loading the KV state from the `local_storage` directory and the prompts from `output.txt`. 11 | -------------------------------------------------------------------------------- /use_existing_torch.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import glob 5 | 6 | requires_files = glob.glob('requirements/*.txt') 7 | requires_files += ["pyproject.toml"] 8 | for file in requires_files: 9 | print(f">>> cleaning {file}") 10 | with open(file) as f: 11 | lines = f.readlines() 12 | if "torch" in "".join(lines).lower(): 13 | print("removed:") 14 | with open(file, 'w') as f: 15 | for line in lines: 16 | if 'torch' not in line.lower(): 17 | f.write(line) 18 | else: 19 | print(line.strip()) 20 | print(f"<<< done cleaning {file}") 21 | print() -------------------------------------------------------------------------------- /vllm/model_executor/layers/fla/ops/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | # SPDX-FileCopyrightText: Songlin Yang, Yu Zhang 4 | # 5 | # This file contains code copied from the flash-linear-attention project. 
6 | # The original source code was licensed under the MIT license and included 7 | # the following copyright notice: 8 | # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang 9 | from .chunk import chunk_gated_delta_rule 10 | from .fused_recurrent import fused_recurrent_gated_delta_rule 11 | from .layernorm_guard import RMSNormGated 12 | 13 | __all__ = [ 14 | "RMSNormGated", 15 | "chunk_gated_delta_rule", 16 | "fused_recurrent_gated_delta_rule", 17 | ] 18 | -------------------------------------------------------------------------------- /vllm/entrypoints/cli/benchmark/serve.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import argparse 4 | 5 | from vllm.benchmarks.serve import add_cli_args, main 6 | from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase 7 | 8 | 9 | class BenchmarkServingSubcommand(BenchmarkSubcommandBase): 10 | """ The `serve` subcommand for vllm bench. """ 11 | 12 | name = "serve" 13 | help = "Benchmark the online serving throughput." 14 | 15 | @classmethod 16 | def add_cli_args(cls, parser: argparse.ArgumentParser) -> None: 17 | add_cli_args(parser) 18 | 19 | @staticmethod 20 | def cmd(args: argparse.Namespace) -> None: 21 | main(args) 22 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/mxfp8_utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import torch 5 | 6 | from vllm.logger import init_logger 7 | 8 | logger = init_logger(__name__) 9 | 10 | 11 | def mxfp8_quantize(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: 12 | 13 | try: 14 | from flashinfer import mxfp8_quantize 15 | except ImportError as err: 16 | raise ImportError("The package `flashinfer` is required to do " 17 | "MX-FP8 quantization. Please install it with" \ 18 | "`pip install flashinfer`") from err 19 | 20 | return mxfp8_quantize(x, is_sf_swizzled_layout=False) 21 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | UseTab: Never 3 | IndentWidth: 2 4 | ColumnLimit: 80 5 | 6 | # Force pointers to the type for C++. 
7 | DerivePointerAlignment: false 8 | PointerAlignment: Left 9 | 10 | # Reordering #include statements can (and currently will) introduce errors 11 | SortIncludes: false 12 | 13 | # Style choices 14 | AlignConsecutiveAssignments: false 15 | AlignConsecutiveDeclarations: false 16 | IndentPPDirectives: BeforeHash 17 | 18 | IncludeCategories: 19 | - Regex: '^<' 20 | Priority: 4 21 | - Regex: '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/' 22 | Priority: 3 23 | - Regex: '^"(qoda|\.\.)/' 24 | Priority: 2 25 | - Regex: '.*' 26 | Priority: 1 27 | -------------------------------------------------------------------------------- /examples/template_vlm2vec.jinja: -------------------------------------------------------------------------------- 1 | {%- if messages | length > 1 -%} 2 | {{ raise_exception('Embedding models should only embed one message at a time') }} 3 | {%- endif -%} 4 | 5 | {% set vars = namespace(parts=[], next_image_id=1) %} 6 | {%- for message in messages -%} 7 | {%- for content in message['content'] -%} 8 | {%- if content['type'] == 'text' -%} 9 | {%- set vars.parts = vars.parts + [content['text']] %} 10 | {%- elif content['type'] == 'image' -%} 11 | {%- set vars.parts = vars.parts + ['<|image_{i:d}|>'.format(i=vars.next_image_id)] %} 12 | {%- set vars.next_image_id = vars.next_image_id + 1 %} 13 | {%- endif -%} 14 | {%- endfor -%} 15 | {%- endfor -%} 16 | {{ vars.parts | join(' ') }} 17 | -------------------------------------------------------------------------------- /vllm/attention/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.attention.backends.abstract import (AttentionBackend, 5 | AttentionMetadata, 6 | AttentionMetadataBuilder, 7 | AttentionState, AttentionType) 8 | from vllm.attention.layer import Attention 9 | from vllm.attention.selector import get_attn_backend 10 | 11 | __all__ = [ 12 | "Attention", 13 | "AttentionBackend", 14 | "AttentionMetadata", 15 | "AttentionType", 16 | "AttentionMetadataBuilder", 17 | "AttentionState", 18 | "get_attn_backend", 19 | ] 20 | -------------------------------------------------------------------------------- /tests/models/language/generation_ppl_test/test_qwen.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import pytest 5 | 6 | from tests.models.utils import GenerateModelInfo 7 | 8 | from .ppl_utils import wikitext_ppl_test 9 | 10 | MODELS = [ 11 | GenerateModelInfo("Qwen/Qwen3-0.6B"), 12 | GenerateModelInfo("Qwen/Qwen3-0.6B-FP8"), 13 | # transformers: 14 | # Loading a GPTQ quantized model requires optimum, gptqmodel 15 | # GenerateModelInfo("Qwen/Qwen3-0.6B-GPTQ-Int8"), 16 | ] 17 | 18 | 19 | @pytest.mark.parametrize("model_info", MODELS) 20 | def test_ppl(hf_runner, vllm_runner, model_info: GenerateModelInfo): 21 | wikitext_ppl_test(hf_runner, vllm_runner, model_info) 22 | -------------------------------------------------------------------------------- /tests/quantization/utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.model_executor.layers.quantization import get_quantization_config 5 | from vllm.platforms import current_platform 
6 | 7 | 8 | def is_quant_method_supported(quant_method: str) -> bool: 9 | # Currently, all quantization methods require Nvidia or AMD GPUs 10 | if not (current_platform.is_cuda() or current_platform.is_rocm()): 11 | return False 12 | 13 | capability = current_platform.get_device_capability() 14 | assert capability is not None 15 | 16 | min_capability = get_quantization_config(quant_method).get_min_capability() 17 | 18 | return capability.to_int() >= min_capability 19 | -------------------------------------------------------------------------------- /.github/scale-config.yml: -------------------------------------------------------------------------------- 1 | # scale-config.yml: 2 | # Powers what instance types are available for GHA auto-scaled 3 | # runners. Runners listed here will be available as self hosted 4 | # runners, configuration is directly pulled from the main branch. 5 | # runner_types: 6 | # runner_label: 7 | # instance_type: m4.large 8 | # os: linux 9 | # # min_available defaults to the global cfg in the ALI Terraform 10 | # min_available: undefined 11 | # # when max_available value is not defined, no max runners is enforced 12 | # max_available: undefined 13 | # disk_size: 50 14 | # is_ephemeral: true 15 | 16 | runner_types: 17 | linux.2xlarge: 18 | disk_size: 150 19 | instance_type: c5.2xlarge 20 | is_ephemeral: true 21 | os: linux 22 | -------------------------------------------------------------------------------- /vllm/entrypoints/cli/benchmark/latency.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import argparse 4 | 5 | from vllm.benchmarks.latency import add_cli_args, main 6 | from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase 7 | 8 | 9 | class BenchmarkLatencySubcommand(BenchmarkSubcommandBase): 10 | """ The `latency` subcommand for vllm bench. """ 11 | 12 | name = "latency" 13 | help = "Benchmark the latency of a single batch of requests." 14 | 15 | @classmethod 16 | def add_cli_args(cls, parser: argparse.ArgumentParser) -> None: 17 | add_cli_args(parser) 18 | 19 | @staticmethod 20 | def cmd(args: argparse.Namespace) -> None: 21 | main(args) 22 | -------------------------------------------------------------------------------- /vllm/entrypoints/cli/benchmark/throughput.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import argparse 4 | 5 | from vllm.benchmarks.throughput import add_cli_args, main 6 | from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase 7 | 8 | 9 | class BenchmarkThroughputSubcommand(BenchmarkSubcommandBase): 10 | """ The `throughput` subcommand for vllm bench. """ 11 | 12 | name = "throughput" 13 | help = "Benchmark offline inference throughput." 14 | 15 | @classmethod 16 | def add_cli_args(cls, parser: argparse.ArgumentParser) -> None: 17 | add_cli_args(parser) 18 | 19 | @staticmethod 20 | def cmd(args: argparse.Namespace) -> None: 21 | main(args) 22 | -------------------------------------------------------------------------------- /.github/workflows/scripts/cuda-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Replace '.' with '-' ex: 11.8 -> 11-8 4 | cuda_version=$(echo "$1" | tr "." "-") 5 | # Removes '-' and '.' 
ex: ubuntu-20.04 -> ubuntu2004 6 | OS=$(echo "$2" | tr -d ".\-") 7 | 8 | # Installs CUDA 9 | wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb" 10 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 11 | rm cuda-keyring_1.1-1_all.deb 12 | sudo apt -qq update 13 | sudo apt -y install "cuda-${cuda_version}" "cuda-nvcc-${cuda_version}" "cuda-libraries-dev-${cuda_version}" 14 | sudo apt clean 15 | 16 | # Test nvcc 17 | PATH=/usr/local/cuda-$1/bin:${PATH} 18 | nvcc --version 19 | 20 | # Log gcc, g++, c++ versions 21 | gcc --version 22 | g++ --version 23 | c++ --version 24 | -------------------------------------------------------------------------------- /requirements/cuda.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r common.txt 3 | 4 | numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding 5 | numba == 0.61.2; python_version > '3.9' 6 | 7 | # Dependencies for NVIDIA GPUs 8 | ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1. 9 | torch==2.8.0 10 | torchaudio==2.8.0 11 | # These must be updated alongside torch 12 | torchvision==0.23.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version 13 | # https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1 14 | xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.8 15 | -------------------------------------------------------------------------------- /tests/entrypoints/openai/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import pytest 4 | 5 | from vllm.assets.audio import AudioAsset 6 | 7 | 8 | @pytest.fixture 9 | def mary_had_lamb(): 10 | path = AudioAsset('mary_had_lamb').get_local_path() 11 | with open(str(path), "rb") as f: 12 | yield f 13 | 14 | 15 | @pytest.fixture 16 | def winning_call(): 17 | path = AudioAsset('winning_call').get_local_path() 18 | with open(str(path), "rb") as f: 19 | yield f 20 | 21 | 22 | @pytest.fixture 23 | def foscolo(): 24 | # Test translation it->en 25 | path = AudioAsset('azacinto_foscolo').get_local_path() 26 | with open(str(path), "rb") as f: 27 | yield f 28 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 64, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 32, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 32, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: 
-------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 32, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 64, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 1, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 64, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 32, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 64, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 16, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 4 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 64, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 8, 24 | "num_stages": 5 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 1, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 128, 14 | 
"GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 64, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 16, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 1, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 16, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 64, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 16, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 64, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } -------------------------------------------------------------------------------- /requirements/rocm-test.txt: 
-------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r common.txt 3 | tblib==3.1.0 4 | 5 | # entrypoints test 6 | # librosa==0.10.2.post1 # required by audio tests in entrypoints/openai 7 | audioread==3.0.1 8 | cffi==1.17.1 9 | decorator==5.2.1 10 | lazy-loader==0.4 11 | platformdirs==4.3.6 12 | pooch==1.8.2 13 | #pycparse==2.22 14 | soundfile==0.13.1 15 | soxr==0.5.0.post1 16 | librosa==0.10.2.post1 17 | 18 | # entrypoints test 19 | #vllm[video] # required by entrypoints/openai/test_video.py 20 | decord==0.6.0 21 | 22 | # entrypoints test 23 | #sentence-transformers # required by entrypoints/openai/test_score.py 24 | sentence-transformers==3.4.1 25 | 26 | # Basic Models Test 27 | matplotlib==3.10.3 28 | 29 | # Multi-Modal Models Test (Extended) 3 30 | blobfile==3.0.0 31 | 32 | 33 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 1, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 64, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 64, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 32, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 32, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 32, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 1, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: 
-------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 1, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 1, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 64, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 32, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 64, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 16, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 64, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 16, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 128, 
14 | "GROUP_SIZE_M": 32, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 32, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 64, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 128, 6 | "GROUP_SIZE_M": 16, 7 | "num_warps": 4, 8 | "num_stages": 3 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 64, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 128, 14 | "GROUP_SIZE_M": 32, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 64, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 128, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 1, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 16, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 64, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 32, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 64, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- 
/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 64, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 64, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 64, 15 | "num_warps": 4, 16 | "num_stages": 2 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 1, 23 | "num_warps": 4, 24 | "num_stages": 2 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json: -------------------------------------------------------------------------------- 1 | { 2 | "2048": { 3 | "BLOCK_SIZE_M": 128, 4 | "BLOCK_SIZE_N": 128, 5 | "BLOCK_SIZE_K": 64, 6 | "GROUP_SIZE_M": 64, 7 | "num_warps": 4, 8 | "num_stages": 2 9 | }, 10 | "3072": { 11 | "BLOCK_SIZE_M": 128, 12 | "BLOCK_SIZE_N": 128, 13 | "BLOCK_SIZE_K": 64, 14 | "GROUP_SIZE_M": 32, 15 | "num_warps": 4, 16 | "num_stages": 3 17 | }, 18 | "4096": { 19 | "BLOCK_SIZE_M": 128, 20 | "BLOCK_SIZE_N": 128, 21 | "BLOCK_SIZE_K": 64, 22 | "GROUP_SIZE_M": 64, 23 | "num_warps": 4, 24 | "num_stages": 3 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /tests/fastsafetensors_loader/test_fastsafetensors_loader.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm import SamplingParams 5 | 6 | test_model = 
"openai-community/gpt2" 7 | 8 | prompts = [ 9 | "Hello, my name is", 10 | "The president of the United States is", 11 | "The capital of France is", 12 | "The future of AI is", 13 | ] 14 | # Create a sampling params object. 15 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0) 16 | 17 | 18 | def test_model_loader_download_files(vllm_runner): 19 | with vllm_runner(test_model, load_format="fastsafetensors") as llm: 20 | deserialized_outputs = llm.generate(prompts, sampling_params) 21 | assert deserialized_outputs 22 | -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_custom_ops.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import torch 5 | 6 | from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding 7 | 8 | 9 | # Register CustomRotaryEmbedding to CustomOP. 10 | @RotaryEmbedding.register_oot 11 | class DummyRotaryEmbedding(RotaryEmbedding): 12 | """Original rotary positional embedding.""" 13 | 14 | def __init__(self, *args, **kwargs): 15 | super().__init__(*args, **kwargs) 16 | self.addition_config = True 17 | 18 | def forward_oot(self, *args, 19 | **kwargs) -> tuple[torch.Tensor, torch.Tensor]: 20 | return super().forward_oot(*args, **kwargs) 21 | -------------------------------------------------------------------------------- /.github/workflows/scripts/pytorch-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python_executable=python$1 4 | pytorch_version=$2 5 | cuda_version=$3 6 | 7 | # Install torch 8 | $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya 9 | $python_executable -m pip install torch=="${pytorch_version}+cu${cuda_version//./}" --extra-index-url "https://download.pytorch.org/whl/cu${cuda_version//./}" 10 | 11 | # Print version information 12 | $python_executable --version 13 | $python_executable -c "import torch; print('PyTorch:', torch.__version__)" 14 | $python_executable -c "import torch; print('CUDA:', torch.version.cuda)" 15 | $python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)" 16 | -------------------------------------------------------------------------------- /examples/template_alpaca.jinja: -------------------------------------------------------------------------------- 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 2 | 3 | {% for message in messages %} 4 | {% if message['role'] == 'user' %} 5 | ### Instruction: 6 | {{ message['content']|trim -}} 7 | {% if not loop.last %} 8 | 9 | 10 | {% endif %} 11 | {% elif message['role'] == 'assistant' %} 12 | ### Response: 13 | {{ message['content']|trim -}} 14 | {% if not loop.last %} 15 | 16 | 17 | {% endif %} 18 | {% elif message['role'] == 'user_context' %} 19 | ### Input: 20 | {{ message['content']|trim -}} 21 | {% if not loop.last %} 22 | 23 | 24 | {% endif %} 25 | {% endif %} 26 | {% endfor %} 27 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} 28 | ### Response: 29 | {% endif %} -------------------------------------------------------------------------------- 
/vllm/entrypoints/cli/benchmark/base.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import argparse 4 | 5 | from vllm.entrypoints.cli.types import CLISubcommand 6 | 7 | 8 | class BenchmarkSubcommandBase(CLISubcommand): 9 | """ The base class of subcommands for vllm bench. """ 10 | 11 | help: str 12 | 13 | @classmethod 14 | def add_cli_args(cls, parser: argparse.ArgumentParser) -> None: 15 | """Add the CLI arguments to the parser.""" 16 | raise NotImplementedError 17 | 18 | @staticmethod 19 | def cmd(args: argparse.Namespace) -> None: 20 | """Run the benchmark. 21 | 22 | Args: 23 | args: The arguments to the command. 24 | """ 25 | raise NotImplementedError 26 | -------------------------------------------------------------------------------- /csrc/attention/dtype_fp8.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.cuh" 4 | 5 | #include 6 | #ifdef ENABLE_FP8 7 | #ifndef USE_ROCM 8 | #include 9 | #endif // USE_ROCM 10 | #endif // ENABLE_FP8 11 | 12 | namespace vllm { 13 | 14 | enum class Fp8KVCacheDataType { 15 | kAuto = 0, 16 | kFp8E4M3 = 1, 17 | kFp8E5M2 = 2, 18 | }; 19 | 20 | // fp8 vector types for quantization of kv cache 21 | template <> 22 | struct Vec { 23 | using Type = uint8_t; 24 | }; 25 | 26 | template <> 27 | struct Vec { 28 | using Type = uint16_t; 29 | }; 30 | 31 | template <> 32 | struct Vec { 33 | using Type = uint32_t; 34 | }; 35 | 36 | template <> 37 | struct Vec { 38 | using Type = uint2; 39 | }; 40 | 41 | } // namespace vllm 42 | -------------------------------------------------------------------------------- /tests/prompts/example.txt: -------------------------------------------------------------------------------- 1 | vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. 2 | Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020. 3 | Compare and contrast artificial intelligence with human intelligence in terms of processing information. 4 | Describe the basic components of a neural network and how it can be trained. 5 | Write a short story about a robot that dreams for the first time. 6 | Analyze the impact of the COVID-19 pandemic on global economic structures and future business models. 7 | Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies. 8 | Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.' 9 | -------------------------------------------------------------------------------- /vllm/v1/engine/exceptions.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | class EngineGenerateError(Exception): 4 | """Raised when a AsyncLLM.generate() fails. Recoverable.""" 5 | pass 6 | 7 | 8 | class EngineDeadError(Exception): 9 | """Raised when the EngineCore dies. Unrecoverable.""" 10 | 11 | def __init__(self, *args, suppress_context: bool = False, **kwargs): 12 | ENGINE_DEAD_MESSAGE = "EngineCore encountered an issue. See stack trace (above) for the root cause." 
# noqa: E501 13 | 14 | super().__init__(ENGINE_DEAD_MESSAGE, *args, **kwargs) 15 | # Make stack trace clearer when using with LLMEngine by 16 | # silencing irrelevant ZMQError. 17 | self.__suppress_context__ = suppress_context 18 | -------------------------------------------------------------------------------- /docs/deployment/integrations/kubeai.md: -------------------------------------------------------------------------------- 1 | # KubeAI 2 | 3 | [KubeAI](https://github.com/substratusai/kubeai) is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies. 4 | 5 | Please see the Installation Guides for environment specific instructions: 6 | 7 | - [Any Kubernetes Cluster](https://www.kubeai.org/installation/any/) 8 | - [EKS](https://www.kubeai.org/installation/eks/) 9 | - [GKE](https://www.kubeai.org/installation/gke/) 10 | 11 | Once you have KubeAI installed, you can 12 | [configure text generation models](https://www.kubeai.org/how-to/configure-text-generation-models/) 13 | using vLLM. 14 | -------------------------------------------------------------------------------- /docs/mkdocs/javascript/run_llm_widget.js: -------------------------------------------------------------------------------- 1 | // Add RunLLM widget 2 | document.addEventListener("DOMContentLoaded", function () { 3 | var script = document.createElement("script"); 4 | script.type = "module"; 5 | script.id = "runllm-widget-script" 6 | 7 | script.src = "https://widget.runllm.com"; 8 | 9 | script.setAttribute("version", "stable"); 10 | script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget. 11 | script.setAttribute("runllm-name", "vLLM"); 12 | script.setAttribute("runllm-position", "BOTTOM_RIGHT"); 13 | script.setAttribute("runllm-position-y", "120px"); 14 | script.setAttribute("runllm-position-x", "20px"); 15 | script.setAttribute("runllm-assistant-id", "207"); 16 | 17 | script.async = true; 18 | document.head.appendChild(script); 19 | }); 20 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/attention_layer_base.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | """Base class for attention-like layers.""" 4 | from abc import ABC, abstractmethod 5 | from typing import TYPE_CHECKING 6 | 7 | if TYPE_CHECKING: 8 | from vllm.attention.backends.abstract import AttentionBackend 9 | 10 | 11 | class AttentionLayerBase(ABC): 12 | """ 13 | Base class for attention-like layers (Attention, Mamba, etc.) 14 | that support the v1 engine. 15 | 16 | This provides a common interface for getting attention backends 17 | from different layer types. 
18 | """ 19 | 20 | @abstractmethod 21 | def get_attn_backend(self) -> type["AttentionBackend"]: 22 | """Get the attention backend class for this layer.""" 23 | pass 24 | -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from typing import Optional 5 | 6 | import torch 7 | 8 | from vllm.model_executor.models.opt import OPTForCausalLM 9 | from vllm.model_executor.sampling_metadata import SamplingMetadata 10 | 11 | 12 | class MyOPTForCausalLM(OPTForCausalLM): 13 | 14 | def compute_logits( 15 | self, hidden_states: torch.Tensor, 16 | sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]: 17 | # this dummy model always predicts the first token 18 | logits = super().compute_logits(hidden_states, sampling_metadata) 19 | if logits is not None: 20 | logits.zero_() 21 | logits[:, 0] += 1.0 22 | return logits 23 | -------------------------------------------------------------------------------- /.buildkite/nightly-benchmarks/scripts/wait-for-image.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token) 3 | if [[ "$BUILDKITE_BRANCH" == "main" ]]; then 4 | URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT" 5 | else 6 | URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT" 7 | fi 8 | 9 | TIMEOUT_SECONDS=10 10 | 11 | retries=0 12 | while [ $retries -lt 1000 ]; do 13 | if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then 14 | exit 0 15 | fi 16 | 17 | echo "Waiting for image to be available..." 18 | 19 | retries=$((retries + 1)) 20 | sleep 5 21 | done 22 | 23 | exit 1 24 | -------------------------------------------------------------------------------- /tests/tokenization/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import pytest 5 | from transformers import PreTrainedTokenizerBase 6 | 7 | from vllm.transformers_utils.tokenizer import get_tokenizer 8 | 9 | TOKENIZER_NAMES = [ 10 | "facebook/opt-125m", 11 | "gpt2", 12 | ] 13 | 14 | 15 | @pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES) 16 | def test_tokenizer_revision(tokenizer_name: str): 17 | # Assume that "main" branch always exists 18 | tokenizer = get_tokenizer(tokenizer_name, revision="main") 19 | assert isinstance(tokenizer, PreTrainedTokenizerBase) 20 | 21 | # Assume that "never" branch always does not exist 22 | with pytest.raises(OSError, match='not a valid git identifier'): 23 | get_tokenizer(tokenizer_name, revision="never") 24 | -------------------------------------------------------------------------------- /docs/mkdocs/overrides/partials/toc-item.html: -------------------------------------------------------------------------------- 1 | 2 |
  • 3 | 4 | 5 | {{ toc_item.title }} 6 | 7 | 8 | 9 | 10 | {% if toc_item.children %} 11 | 20 | {% endif %} 21 |
  • -------------------------------------------------------------------------------- /tests/standalone_tests/python_only_compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script tests if the python only compilation works correctly 3 | # for users who do not have any compilers installed on their system 4 | 5 | set -e 6 | set -x 7 | 8 | cd /vllm-workspace/ 9 | 10 | # uninstall vllm 11 | pip3 uninstall -y vllm 12 | # restore the original files 13 | mv src/vllm ./vllm 14 | 15 | # remove all compilers 16 | apt remove --purge build-essential -y 17 | apt autoremove -y 18 | 19 | echo 'import os; os.system("touch /tmp/changed.file")' >> vllm/__init__.py 20 | 21 | VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL=1 VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e . 22 | 23 | # Run the script 24 | python3 -c 'import vllm' 25 | 26 | # Check if the clangd log file was created 27 | if [ ! -f /tmp/changed.file ]; then 28 | echo "changed.file was not created, python only compilation failed" 29 | exit 1 30 | fi 31 | -------------------------------------------------------------------------------- /vllm/env_override.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import os 4 | 5 | import torch 6 | 7 | from vllm.logger import init_logger 8 | 9 | logger = init_logger(__name__) 10 | 11 | # set some common config/environment variables that should be set 12 | # for all processes created by vllm and all processes 13 | # that interact with vllm workers. 14 | # they are executed whenever `import vllm` is called. 15 | 16 | # see https://github.com/vllm-project/vllm/pull/15951 17 | # it avoids unintentional cuda initialization from torch.cuda.is_available() 18 | os.environ['PYTORCH_NVML_BASED_CUDA_CHECK'] = '1' 19 | 20 | # see https://github.com/vllm-project/vllm/issues/10480 21 | os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1' 22 | # see https://github.com/vllm-project/vllm/issues/10619 23 | torch._inductor.config.compile_threads = 1 24 | -------------------------------------------------------------------------------- /docs/mkdocs/hooks/remove_announcement.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import os 4 | from pathlib import Path 5 | from typing import Literal 6 | 7 | 8 | def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): 9 | # see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa 10 | if os.getenv('READTHEDOCS_VERSION_TYPE') == "tag": 11 | # remove the warning banner if the version is a tagged release 12 | mkdocs_dir = Path(__file__).parent.parent 13 | announcement_path = mkdocs_dir / "overrides/main.html" 14 | # The file might be removed already if the build is triggered multiple 15 | # times (readthedocs build both HTML and PDF versions separately) 16 | if announcement_path.exists(): 17 | os.remove(announcement_path) 18 | -------------------------------------------------------------------------------- /vllm/lora/punica_wrapper/punica_selector.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from vllm.logger import init_logger 
5 | from vllm.platforms import current_platform 6 | from vllm.utils import resolve_obj_by_qualname 7 | 8 | from .punica_base import PunicaWrapperBase 9 | 10 | logger = init_logger(__name__) 11 | 12 | 13 | def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase: 14 | punica_wrapper_qualname = current_platform.get_punica_wrapper() 15 | punica_wrapper_cls = resolve_obj_by_qualname(punica_wrapper_qualname) 16 | punica_wrapper = punica_wrapper_cls(*args, **kwargs) 17 | assert punica_wrapper is not None, \ 18 | "Invalid punica wrapper qualname: " + punica_wrapper_qualname 19 | logger.info_once("Using %s.", punica_wrapper_qualname.rsplit(".", 1)[1]) 20 | return punica_wrapper 21 | -------------------------------------------------------------------------------- /csrc/core/math.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <climits> 4 | #include <cstdint> 5 | 6 | inline constexpr uint32_t next_pow_2(uint32_t const num) { 7 | if (num <= 1) return num; 8 | return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); 9 | } 10 | 11 | template <typename A, typename B> 12 | static inline constexpr auto div_ceil(A a, B b) { 13 | return (a + b - 1) / b; 14 | } 15 | 16 | // Round a down to the previous multiple of b. The caller is responsible for 17 | // making sure that b is non-zero 18 | template <typename T> 19 | inline constexpr T round_to_previous_multiple_of(T a, T b) { 20 | return a % b == 0 ? a : (a / b) * b; 21 | } 22 | 23 | // Round a up to the next multiple of b. The caller is responsible for making 24 | // sure that b is non-zero 25 | template <typename T> 26 | inline constexpr T round_to_next_multiple_of(T a, T b) { 27 | return a % b == 0 ? a : ((a / b) + 1) * b; 28 | } 29 | -------------------------------------------------------------------------------- /vllm/plugins/lora_resolvers/README.md: -------------------------------------------------------------------------------- 1 | # LoRA Resolver Plugins 2 | 3 | This directory contains vLLM general plugins for dynamically discovering and loading LoRA adapters 4 | via the LoRAResolver plugin framework. 5 | 6 | Note that `VLLM_ALLOW_RUNTIME_LORA_UPDATING` must be set to true to allow LoRA resolver plugins 7 | to work, and `VLLM_PLUGINS` must be set to include the desired resolver plugins. 8 | 9 | ## lora_filesystem_resolver 10 | 11 | This LoRA Resolver is installed with vLLM by default. 12 | To use it, set `VLLM_PLUGIN_LORA_CACHE_DIR` to a local directory. When vLLM receives a request 13 | for a LoRA adapter `foobar` that it does not currently recognize, it will look in that local directory 14 | for a subdirectory `foobar` containing a LoRA adapter. If such an adapter exists, it will 15 | load that adapter and then service the request as normal. The adapter then remains available 16 | for future requests.
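For example, once an OpenAI-compatible vLLM server has been started with the environment variables above and the cache directory contains a `foobar/` subdirectory, a client can exercise the resolver simply by requesting the adapter by name. The sketch below is illustrative only: it assumes the server runs on `localhost:8000`, that the plugin name passed via `VLLM_PLUGINS` is `lora_filesystem_resolver`, and it uses the standard `openai` client rather than any resolver-specific API.

```python
# Illustrative client-side sketch (assumptions noted above): the resolver is
# exercised purely by naming the adapter in the "model" field of a request.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# The first request for "foobar" triggers the filesystem lookup described
# above; once loaded, the adapter is reused for subsequent requests.
completion = client.completions.create(
    model="foobar",
    prompt="Explain LoRA in one sentence.",
    max_tokens=32,
)
print(completion.choices[0].text)
```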
17 | -------------------------------------------------------------------------------- /.github/workflows/add_label_automerge.yml: -------------------------------------------------------------------------------- 1 | name: Add label on auto-merge enabled 2 | permissions: 3 | pull-requests: write 4 | on: 5 | pull_request_target: 6 | types: 7 | - auto_merge_enabled 8 | jobs: 9 | add-label-on-auto-merge: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Add label 13 | uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 14 | with: 15 | script: | 16 | github.rest.issues.addLabels({ 17 | owner: context.repo.owner, 18 | repo: context.repo.repo, 19 | issue_number: context.issue.number, 20 | labels: ['ready'] 21 | }) 22 | env: 23 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 24 | -------------------------------------------------------------------------------- /tests/v1/test_request.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | from vllm.v1.request import RequestStatus 4 | 5 | 6 | def test_request_status_fmt_str(): 7 | """Test that the string representation of RequestStatus is correct.""" 8 | assert f"{RequestStatus.WAITING}" == "WAITING" 9 | assert f"{RequestStatus.WAITING_FOR_FSM}" == "WAITING_FOR_FSM" 10 | assert f"{RequestStatus.WAITING_FOR_REMOTE_KVS}" == "WAITING_FOR_REMOTE_KVS" 11 | assert f"{RequestStatus.RUNNING}" == "RUNNING" 12 | assert f"{RequestStatus.PREEMPTED}" == "PREEMPTED" 13 | assert f"{RequestStatus.FINISHED_STOPPED}" == "FINISHED_STOPPED" 14 | assert f"{RequestStatus.FINISHED_LENGTH_CAPPED}" == "FINISHED_LENGTH_CAPPED" 15 | assert f"{RequestStatus.FINISHED_ABORTED}" == "FINISHED_ABORTED" 16 | assert f"{RequestStatus.FINISHED_IGNORED}" == "FINISHED_IGNORED" 17 | -------------------------------------------------------------------------------- /vllm/entrypoints/cli/types.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | from __future__ import annotations 5 | 6 | import argparse 7 | import typing 8 | 9 | if typing.TYPE_CHECKING: 10 | from vllm.utils import FlexibleArgumentParser 11 | 12 | 13 | class CLISubcommand: 14 | """Base class for CLI argument handlers.""" 15 | 16 | name: str 17 | 18 | @staticmethod 19 | def cmd(args: argparse.Namespace) -> None: 20 | raise NotImplementedError("Subclasses should implement this method") 21 | 22 | def validate(self, args: argparse.Namespace) -> None: 23 | # No validation by default 24 | pass 25 | 26 | def subparser_init( 27 | self, 28 | subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser: 29 | raise NotImplementedError("Subclasses should implement this method") 30 | -------------------------------------------------------------------------------- /vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja: -------------------------------------------------------------------------------- 1 | {%- if messages[0]['role'] == 'system' -%} 2 | {%- set system_message = messages[0]['content'] -%} 3 | {%- set messages = messages[1:] -%} 4 | {%- else -%} 5 | {% set system_message = '' -%} 6 | {%- endif -%} 7 | 8 | {{ bos_token + system_message }} 9 | {%- for message in messages -%} 10 | {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%} 11 | {{ raise_exception('Conversation roles must 
alternate user/assistant/user/assistant/...') }} 12 | {%- endif -%} 13 | 14 | {%- if message['role'] == 'user' -%} 15 | {{ '<|User|>: ' + message['content'] + '\n\n' }} 16 | {%- elif message['role'] == 'assistant' -%} 17 | {{ '<|Assistant|>: ' + message['content'] + eos_token + '\n\n' }} 18 | {%- endif -%} 19 | {%- endfor -%} 20 | 21 | {%- if add_generation_prompt -%} 22 | {{ '<|Assistant|>: ' }} 23 | {%- endif -%} 24 | -------------------------------------------------------------------------------- /tests/test_seed_behavior.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import random 4 | 5 | import numpy as np 6 | import torch 7 | 8 | from vllm.platforms.interface import Platform 9 | 10 | 11 | def test_seed_behavior(): 12 | # Test with a specific seed 13 | Platform.seed_everything(42) 14 | random_value_1 = random.randint(0, 100) 15 | np_random_value_1 = np.random.randint(0, 100) 16 | torch_random_value_1 = torch.randint(0, 100, (1, )).item() 17 | 18 | Platform.seed_everything(42) 19 | random_value_2 = random.randint(0, 100) 20 | np_random_value_2 = np.random.randint(0, 100) 21 | torch_random_value_2 = torch.randint(0, 100, (1, )).item() 22 | 23 | assert random_value_1 == random_value_2 24 | assert np_random_value_1 == np_random_value_2 25 | assert torch_random_value_1 == torch_random_value_2 26 | -------------------------------------------------------------------------------- /docs/serving/integrations/langchain.md: -------------------------------------------------------------------------------- 1 | # LangChain 2 | 3 | vLLM is also available via [LangChain](https://github.com/langchain-ai/langchain). 4 | 5 | To install LangChain, run 6 | 7 | ```bash 8 | pip install langchain langchain_community -q 9 | ``` 10 | 11 | To run inference on a single GPU or multiple GPUs, use the `VLLM` class from `langchain_community`. 12 | 13 | ??? code 14 | 15 | ```python 16 | from langchain_community.llms import VLLM 17 | 18 | llm = VLLM(model="mosaicml/mpt-7b", 19 | trust_remote_code=True, # mandatory for hf models 20 | max_new_tokens=128, 21 | top_k=10, 22 | top_p=0.95, 23 | temperature=0.8, 24 | # tensor_parallel_size=... # for distributed inference 25 | ) 26 | 27 | print(llm("What is the capital of France?")) 28 | ``` 29 | 30 | Please refer to this [Tutorial](https://python.langchain.com/docs/integrations/llms/vllm) for more details. 31 | -------------------------------------------------------------------------------- /examples/online_serving/chart-helm/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: chart-vllm 3 | description: Chart vllm 4 | 5 | # A chart can be either an 'application' or a 'library' chart. 6 | # 7 | # Application charts are a collection of templates that can be packaged into versioned archives 8 | # to be deployed. 9 | # 10 | # Library charts provide useful utilities or functions for the chart developer. They're included as 11 | # a dependency of application charts to inject those utilities and functions into the rendering 12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 13 | type: application 14 | 15 | # This is the chart version. This version number should be incremented each time you make changes 16 | # to the chart and its templates, including the app version.
17 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 18 | version: 0.0.1 19 | 20 | maintainers: 21 | - name: mfournioux 22 | -------------------------------------------------------------------------------- /tests/entrypoints/llm/test_prompt_validation.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import pytest 5 | 6 | from vllm import LLM 7 | 8 | 9 | @pytest.fixture(autouse=True) 10 | def v1(run_with_both_engines): 11 | # Simple autouse wrapper to run both engines for each test 12 | # This can be promoted up to conftest.py to run for every 13 | # test in a package 14 | pass 15 | 16 | 17 | def test_empty_prompt(): 18 | llm = LLM(model="openai-community/gpt2", enforce_eager=True) 19 | with pytest.raises(ValueError, match='decoder prompt cannot be empty'): 20 | llm.generate([""]) 21 | 22 | 23 | @pytest.mark.skip_v1 24 | def test_out_of_vocab_token(): 25 | llm = LLM(model="openai-community/gpt2", enforce_eager=True) 26 | with pytest.raises(ValueError, match='out of vocabulary'): 27 | llm.generate({"prompt_token_ids": [999999]}) 28 | -------------------------------------------------------------------------------- /.buildkite/nightly-benchmarks/scripts/download-tokenizer.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | 4 | import argparse 5 | 6 | from transformers import AutoTokenizer 7 | 8 | 9 | def main(model, cachedir): 10 | # Load the tokenizer and save it to the specified directory 11 | tokenizer = AutoTokenizer.from_pretrained(model) 12 | tokenizer.save_pretrained(cachedir) 13 | print(f"Tokenizer saved to {cachedir}") 14 | 15 | 16 | if __name__ == "__main__": 17 | parser = argparse.ArgumentParser( 18 | description="Download and save Hugging Face tokenizer" 19 | ) 20 | parser.add_argument("--model", type=str, required=True, help="Name of the model") 21 | parser.add_argument( 22 | "--cachedir", type=str, required=True, help="Directory to save the tokenizer" 23 | ) 24 | 25 | args = parser.parse_args() 26 | main(args.model, args.cachedir) 27 | -------------------------------------------------------------------------------- /.github/workflows/cleanup_pr_body.yml: -------------------------------------------------------------------------------- 1 | name: Cleanup PR Body 2 | 3 | on: 4 | pull_request_target: 5 | types: [opened, reopened, edited] 6 | 7 | permissions: 8 | pull-requests: write 9 | 10 | jobs: 11 | update-description: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - name: Checkout repository 16 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 17 | 18 | - name: Set up Python 19 | uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 20 | with: 21 | python-version: '3.12' 22 | 23 | - name: Install Python dependencies 24 | run: | 25 | python3 -m pip install --upgrade pip 26 | python3 -m pip install regex 27 | 28 | - name: Update PR description 29 | env: 30 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 31 | run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}" 32 | -------------------------------------------------------------------------------- /csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu: -------------------------------------------------------------------------------- 1 | 
#include "c3x/scaled_mm_helper.hpp" 2 | #include "c3x/scaled_mm_kernels.hpp" 3 | 4 | /* 5 | This file defines quantized GEMM operations using the CUTLASS 3.x API, for 6 | NVIDIA GPUs with sm100 (Blackwell). 7 | */ 8 | 9 | #if defined ENABLE_SCALED_MM_SM100 && ENABLE_SCALED_MM_SM100 10 | 11 | void cutlass_scaled_mm_sm100(torch::Tensor& c, torch::Tensor const& a, 12 | torch::Tensor const& b, 13 | torch::Tensor const& a_scales, 14 | torch::Tensor const& b_scales, 15 | std::optional const& bias) { 16 | dispatch_scaled_mm(c, a, b, a_scales, b_scales, bias, 17 | vllm::cutlass_scaled_mm_sm100_fp8, 18 | nullptr, // int8 not supported on SM100 19 | vllm::cutlass_scaled_mm_blockwise_sm100_fp8); 20 | } 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu: -------------------------------------------------------------------------------- 1 | #include "c3x/scaled_mm_helper.hpp" 2 | #include "c3x/scaled_mm_kernels.hpp" 3 | 4 | /* 5 | This file defines quantized GEMM operations using the CUTLASS 3.x API, for 6 | NVIDIA GPUs with sm120 (Blackwell). 7 | */ 8 | 9 | #if defined ENABLE_SCALED_MM_SM120 && ENABLE_SCALED_MM_SM120 10 | 11 | void cutlass_scaled_mm_sm120(torch::Tensor& c, torch::Tensor const& a, 12 | torch::Tensor const& b, 13 | torch::Tensor const& a_scales, 14 | torch::Tensor const& b_scales, 15 | std::optional const& bias) { 16 | dispatch_scaled_mm(c, a, b, a_scales, b_scales, bias, 17 | vllm::cutlass_scaled_mm_sm120_fp8, 18 | nullptr, // int8 not supported on SM120 19 | vllm::cutlass_scaled_mm_blockwise_sm120_fp8); 20 | } 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /csrc/quantization/gptq/qdq_8.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Copied from https://github.com/turboderp/exllamav2 3 | */ 4 | 5 | #ifndef _qdq_8_cuh 6 | #define _qdq_8_cuh 7 | 8 | #include "qdq_util.cuh" 9 | 10 | namespace vllm { 11 | namespace gptq { 12 | 13 | __forceinline__ __device__ void shuffle_8bit_4(uint32_t* q, int stride) {} 14 | 15 | __forceinline__ __device__ void dequant_8bit_8(const uint32_t q_0, 16 | const uint32_t q_1, 17 | half2 (&dq)[4], int stride, 18 | const uint32_t zero) { 19 | half dqh[8]; 20 | for (int i = 0; i < 4; i++) dqh[i] = dq_ns(exb(q_0, i * 8, 0xff), zero); 21 | for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), zero); 22 | 23 | for (int i = 0; i < 4; i++) 24 | dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); 25 | } 26 | 27 | } // namespace gptq 28 | } // namespace vllm 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /benchmarks/multi_turn/generate_multi_turn.json: -------------------------------------------------------------------------------- 1 | { 2 | "filetype": "generate_conversations", 3 | "num_conversations": 24, 4 | "text_files": ["pg1184.txt"], 5 | "print_stats": false, 6 | "prompt_input": { 7 | "num_turns": { 8 | "distribution": "uniform", 9 | "min": 12, 10 | "max": 18 11 | }, 12 | "common_prefix_num_tokens": { 13 | "distribution": "constant", 14 | "value": 500 15 | }, 16 | "prefix_num_tokens": { 17 | "distribution": "lognormal", 18 | "average": 1000, 19 | "max": 5000 20 | }, 21 | "num_tokens": { 22 | "distribution": "uniform", 23 | "min": 120, 24 | "max": 160 25 | } 26 | }, 27 | "prompt_output": { 28 | "num_tokens": { 29 | "distribution": "uniform", 30 | "min": 80, 31 | "max": 120 32 | } 33 | } 
34 | } --------------------------------------------------------------------------------
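To make the schema of `generate_multi_turn.json` concrete: each conversation property (`num_turns`, `prefix_num_tokens`, and so on) is described by a small distribution block (`uniform`, `constant`, or `lognormal`) from which one value is drawn per conversation or per turn. The snippet below is only a rough sketch of how such blocks could be sampled; the authoritative sampling logic lives in the multi-turn benchmark generator itself, and the helper name and the lognormal scaling used here are assumptions.

```python
# Rough, illustrative sampler for the distribution blocks shown above.
# This is NOT the benchmark's implementation; the function name and the
# lognormal interpretation are assumptions made for explanation only.
import math
import random


def sample_value(spec: dict) -> int:
    """Draw one integer from a {"distribution": ...} block."""
    kind = spec["distribution"]
    if kind == "constant":
        return int(spec["value"])
    if kind == "uniform":
        return random.randint(spec["min"], spec["max"])
    if kind == "lognormal":
        # Assumed reading: scale a standard lognormal draw so its mean is
        # roughly `average`, then clamp at `max`.
        draw = random.lognormvariate(0.0, 1.0) / math.exp(0.5)  # mean ~= 1
        return min(int(draw * spec["average"]), spec["max"])
    raise ValueError(f"unknown distribution: {kind}")


num_turns = sample_value({"distribution": "uniform", "min": 12, "max": 18})
prefix_tokens = sample_value(
    {"distribution": "lognormal", "average": 1000, "max": 5000}
)
print(num_turns, prefix_tokens)
```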