├── tests
├── __init__.py
├── lora
│ └── __init__.py
├── tools
│ └── __init__.py
├── tpu
│ ├── __init__.py
│ └── lora
│ │ └── __init__.py
├── v1
│ ├── __init__.py
│ ├── core
│ │ └── __init__.py
│ ├── e2e
│ │ └── __init__.py
│ ├── engine
│ │ └── __init__.py
│ ├── sample
│ │ └── __init__.py
│ ├── tpu
│ │ ├── __init__.py
│ │ └── worker
│ │ │ └── __init__.py
│ ├── tracing
│ │ └── __init__.py
│ ├── worker
│ │ └── __init__.py
│ ├── cudagraph
│ │ └── __init__.py
│ ├── entrypoints
│ │ ├── __init__.py
│ │ ├── llm
│ │ │ └── __init__.py
│ │ └── openai
│ │ │ └── responses
│ │ │ └── __init__.py
│ ├── executor
│ │ └── __init__.py
│ ├── kv_connector
│ │ ├── __init__.py
│ │ └── unit
│ │ │ └── __init__.py
│ ├── logits_processors
│ │ └── __init__.py
│ ├── structured_output
│ │ └── __init__.py
│ ├── shutdown
│ │ └── utils.py
│ └── test_request.py
├── benchmarks
│ ├── __init__.py
│ ├── test_latency_cli.py
│ └── test_throughput_cli.py
├── compile
│ ├── __init__.py
│ └── piecewise
│ │ └── __init__.py
├── engine
│ └── __init__.py
├── kernels
│ ├── __init__.py
│ ├── moe
│ │ ├── __init__.py
│ │ └── modular_kernel_tools
│ │ │ └── __init__.py
│ ├── attention
│ │ └── conftest.py
│ ├── core
│ │ └── test_permute_cols.py
│ └── allclose_default.py
├── models
│ ├── __init__.py
│ ├── language
│ │ ├── __init__.py
│ │ ├── pooling
│ │ │ └── __init__.py
│ │ ├── generation
│ │ │ └── __init__.py
│ │ ├── generation_ppl_test
│ │ │ ├── __init__.py
│ │ │ ├── test_gpt.py
│ │ │ ├── test_gemma.py
│ │ │ └── test_qwen.py
│ │ └── pooling_mteb_test
│ │ │ └── __init__.py
│ ├── multimodal
│ │ ├── __init__.py
│ │ ├── pooling
│ │ │ └── __init__.py
│ │ ├── generation
│ │ │ ├── __init__.py
│ │ │ └── vlm_utils
│ │ │ │ └── __init__.py
│ │ └── processing
│ │ │ └── __init__.py
│ └── quantization
│ │ └── __init__.py
├── multimodal
│ ├── __init__.py
│ └── assets
│ │ ├── rgba.png
│ │ ├── image1.png
│ │ └── image2.png
├── reasoning
│ └── __init__.py
├── samplers
│ └── __init__.py
├── tool_use
│ └── __init__.py
├── detokenizer
│ └── __init__.py
├── distributed
│ ├── __init__.py
│ └── test_distributed_oot.py
├── entrypoints
│ ├── __init__.py
│ ├── llm
│ │ ├── __init__.py
│ │ └── test_prompt_validation.py
│ ├── openai
│ │ ├── __init__.py
│ │ ├── correctness
│ │ │ └── __init__.py
│ │ ├── tool_parsers
│ │ │ └── __init__.py
│ │ └── conftest.py
│ ├── pooling
│ │ ├── __init__.py
│ │ ├── llm
│ │ │ └── __init__.py
│ │ ├── openai
│ │ │ └── __init__.py
│ │ └── correctness
│ │ │ └── __init__.py
│ └── offline_mode
│ │ └── __init__.py
├── model_executor
│ ├── __init__.py
│ └── model_loader
│ │ └── __init__.py
├── quantization
│ ├── __init__.py
│ └── utils.py
├── tokenization
│ ├── __init__.py
│ ├── test_do_lower_case.py
│ └── test_tokenizer.py
├── basic_correctness
│ ├── __init__.py
│ └── test_cpu_offload.py
├── mistral_tool_use
│ └── __init__.py
├── tensorizer_loader
│ └── __init__.py
├── transformers_utils
│ └── __init__.py
├── fastsafetensors_loader
│ ├── __init__.py
│ └── test_fastsafetensors_loader.py
├── plugins
│ ├── lora_resolvers
│ │ └── __init__.py
│ ├── prithvi_io_processor_plugin
│ │ ├── prithvi_io_processor
│ │ │ └── __init__.py
│ │ └── setup.py
│ ├── vllm_add_dummy_platform
│ │ ├── vllm_add_dummy_platform
│ │ │ ├── __init__.py
│ │ │ ├── dummy_attention_backend.py
│ │ │ └── dummy_custom_ops.py
│ │ └── setup.py
│ └── vllm_add_dummy_model
│ │ ├── setup.py
│ │ └── vllm_add_dummy_model
│ │ └── my_opt.py
├── runai_model_streamer_test
│ └── __init__.py
├── config
│ ├── test_config.yaml
│ └── test_config_with_model.yaml
├── evals
│ ├── gpt_oss
│ │ ├── __init__.py
│ │ └── conftest.py
│ └── gsm8k
│ │ ├── __init__.py
│ │ └── configs
│ │ ├── Qwen3-0.6B-FP8.yaml
│ │ ├── Qwen1.5-MoE-W4A16-CT.yaml
│ │ ├── Llama-3.2-1B-Instruct-INT8-CT.yaml
│ │ ├── Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
│ │ ├── DeepSeek-V2-Lite-Instruct-FP8.yaml
│ │ ├── Llama-3-8B-Instruct-nonuniform-CT.yaml
│ │ └── models-small.txt
├── kv_transfer
│ ├── test_lookup_buffer.sh
│ └── test_send_recv.sh
├── utils_
│ └── __init__.py
├── vllm_test_utils
│ ├── setup.py
│ └── vllm_test_utils
│ │ └── __init__.py
├── test_embedded_commit.py
├── weight_loading
│ └── models-large.txt
├── test_outputs.py
├── prompts
│ └── example.txt
├── standalone_tests
│ └── python_only_compile.sh
└── test_seed_behavior.py
├── vllm
├── assets
│ └── __init__.py
├── core
│ ├── __init__.py
│ └── block
│ │ └── __init__.py
├── engine
│ ├── __init__.py
│ └── output_processor
│ │ └── __init__.py
├── lora
│ ├── __init__.py
│ ├── ops
│ │ ├── __init__.py
│ │ ├── ipex_ops
│ │ │ └── __init__.py
│ │ ├── xla_ops
│ │ │ └── __init__.py
│ │ ├── triton_ops
│ │ │ └── __init__.py
│ │ └── torch_ops
│ │ │ └── __init__.py
│ ├── layers
│ │ └── qkv_x_parallel_linear.py
│ └── punica_wrapper
│ │ ├── __init__.py
│ │ └── punica_selector.py
├── ray
│ ├── __init__.py
│ └── lazy_utils.py
├── usage
│ └── __init__.py
├── v1
│ ├── __init__.py
│ ├── core
│ │ ├── __init__.py
│ │ └── sched
│ │ │ └── __init__.py
│ ├── executor
│ │ └── __init__.py
│ ├── metrics
│ │ └── __init__.py
│ ├── pool
│ │ └── __init__.py
│ ├── sample
│ │ ├── __init__.py
│ │ ├── ops
│ │ │ └── __init__.py
│ │ └── tpu
│ │ │ └── __init__.py
│ ├── worker
│ │ ├── __init__.py
│ │ └── ubatch_utils.py
│ ├── attention
│ │ ├── __init__.py
│ │ └── backends
│ │ │ ├── __init__.py
│ │ │ └── mla
│ │ │ └── __init__.py
│ ├── spec_decode
│ │ ├── __init__.py
│ │ └── utils.py
│ └── engine
│ │ └── exceptions.py
├── worker
│ └── __init__.py
├── benchmarks
│ ├── __init__.py
│ └── lib
│ │ └── __init__.py
├── compilation
│ └── __init__.py
├── entrypoints
│ ├── __init__.py
│ ├── openai
│ │ └── __init__.py
│ ├── cli
│ │ ├── benchmark
│ │ │ ├── __init__.py
│ │ │ ├── serve.py
│ │ │ ├── latency.py
│ │ │ ├── throughput.py
│ │ │ └── base.py
│ │ ├── __init__.py
│ │ └── types.py
│ └── constants.py
├── executor
│ └── __init__.py
├── profiler
│ └── __init__.py
├── third_party
│ └── __init__.py
├── attention
│ ├── layers
│ │ └── __init__.py
│ ├── ops
│ │ └── __init__.py
│ ├── utils
│ │ └── __init__.py
│ ├── backends
│ │ ├── __init__.py
│ │ └── mla
│ │ │ └── __init__.py
│ └── __init__.py
├── device_allocator
│ └── __init__.py
├── vllm_flash_attn
│ └── .gitkeep
├── model_executor
│ ├── layers
│ │ ├── __init__.py
│ │ ├── mamba
│ │ │ ├── __init__.py
│ │ │ └── ops
│ │ │ │ └── __init__.py
│ │ ├── quantization
│ │ │ ├── quark
│ │ │ │ ├── __init__.py
│ │ │ │ └── schemes
│ │ │ │ │ └── __init__.py
│ │ │ ├── kernels
│ │ │ │ └── __init__.py
│ │ │ ├── compressed_tensors
│ │ │ │ ├── __init__.py
│ │ │ │ └── transform
│ │ │ │ │ └── utils.py
│ │ │ └── utils
│ │ │ │ ├── configs
│ │ │ │ ├── README.md
│ │ │ │ ├── N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ ├── N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ ├── N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ ├── N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ ├── N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ ├── N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ ├── N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ ├── N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ ├── N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ ├── N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ ├── N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ ├── N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ ├── N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ ├── N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ └── N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
│ │ │ │ ├── __init__.py
│ │ │ │ └── mxfp8_utils.py
│ │ ├── shared_fused_moe
│ │ │ └── __init__.py
│ │ ├── fla
│ │ │ ├── __init__.py
│ │ │ └── ops
│ │ │ │ └── __init__.py
│ │ ├── fused_moe
│ │ │ └── configs
│ │ │ │ └── README
│ │ └── attention_layer_base.py
│ ├── warmup
│ │ └── __init__.py
│ ├── models
│ │ └── phi3.py
│ └── __init__.py
├── plugins
│ └── lora_resolvers
│ │ ├── __init__.py
│ │ └── README.md
├── distributed
│ ├── kv_transfer
│ │ ├── kv_pipe
│ │ │ └── __init__.py
│ │ ├── kv_connector
│ │ │ ├── __init__.py
│ │ │ ├── v1
│ │ │ │ ├── p2p
│ │ │ │ │ └── __init__.py
│ │ │ │ └── __init__.py
│ │ │ └── base.py
│ │ ├── kv_lookup_buffer
│ │ │ └── __init__.py
│ │ ├── disagg_prefill_workflow.jpg
│ │ └── __init__.py
│ ├── device_communicators
│ │ └── __init__.py
│ ├── __init__.py
│ └── eplb
│ │ └── __init__.py
├── py.typed
├── transformers_utils
│ ├── chat_templates
│ │ ├── template_basic.jinja
│ │ ├── template_fuyu.jinja
│ │ ├── __init__.py
│ │ ├── template_blip2.jinja
│ │ ├── template_chatml.jinja
│ │ └── template_deepseek_vl2.jinja
│ ├── configs
│ │ └── speculators
│ │ │ └── __init__.py
│ ├── tokenizers
│ │ └── __init__.py
│ ├── config_parser_base.py
│ └── processors
│ │ └── __init__.py
├── logging_utils
│ └── __init__.py
├── tasks.py
├── scripts.py
├── triton_utils
│ └── __init__.py
└── env_override.py
├── docs
├── cli
│ ├── .meta.yml
│ ├── chat.md
│ ├── complete.md
│ ├── .nav.yml
│ ├── serve.md
│ ├── run-batch.md
│ ├── bench
│ │ ├── serve.md
│ │ ├── latency.md
│ │ └── throughput.md
│ └── json_tip.inc.md
├── api
│ └── vllm
│ │ └── .meta.yml
├── community
│ └── contact_us.md
├── assets
│ ├── design
│ │ ├── hierarchy.png
│ │ ├── tpu
│ │ │ └── most_model_len.png
│ │ ├── metrics
│ │ │ ├── intervals-1.png
│ │ │ ├── intervals-2.png
│ │ │ └── intervals-3.png
│ │ ├── paged_attention
│ │ │ ├── key.png
│ │ │ ├── k_vecs.png
│ │ │ ├── q_vecs.png
│ │ │ ├── query.png
│ │ │ ├── v_vec.png
│ │ │ ├── value.png
│ │ │ └── logits_vec.png
│ │ ├── prefix_caching
│ │ │ ├── free.png
│ │ │ ├── overview.png
│ │ │ ├── example-time-1.png
│ │ │ ├── example-time-3.png
│ │ │ ├── example-time-4.png
│ │ │ ├── example-time-5.png
│ │ │ ├── example-time-6.png
│ │ │ └── example-time-7.png
│ │ ├── hybrid_kv_cache_manager
│ │ │ ├── full_attn.png
│ │ │ ├── overview.png
│ │ │ ├── sw_attn.png
│ │ │ ├── memory_layout.png
│ │ │ └── basic_grouping_example.png
│ │ ├── arch_overview
│ │ │ ├── llm_engine.excalidraw.png
│ │ │ └── entrypoints.excalidraw.png
│ │ └── fused_moe_modular_kernel
│ │ │ ├── fused_moe_batched.png
│ │ │ ├── fused_experts_blocks.png
│ │ │ ├── fused_moe_non_batched.png
│ │ │ └── prepare_and_finalize_blocks.png
│ ├── deployment
│ │ ├── dify-chat.png
│ │ ├── open_webui.png
│ │ ├── chatbox-chat.png
│ │ ├── dify-settings.png
│ │ ├── dp_external_lb.png
│ │ ├── dp_internal_lb.png
│ │ ├── streamlit-chat.png
│ │ ├── chatbox-settings.png
│ │ ├── dify-create-chatbot.png
│ │ ├── anything-llm-provider.png
│ │ ├── anything-llm-upload-doc.png
│ │ ├── anything-llm-chat-with-doc.png
│ │ ├── anything-llm-chat-without-doc.png
│ │ └── architecture_helm_deployment.png
│ ├── logos
│ │ ├── vllm-logo-text-dark.png
│ │ ├── vllm-logo-only-light.ico
│ │ ├── vllm-logo-only-light.png
│ │ └── vllm-logo-text-light.png
│ ├── features
│ │ └── disagg_prefill
│ │ │ ├── overview.jpg
│ │ │ ├── workflow.png
│ │ │ ├── abstraction.jpg
│ │ │ └── high_level_design.png
│ └── contributing
│ │ └── dockerfile-stages-dependency.png
├── getting_started
│ └── installation
│ │ ├── .nav.yml
│ │ ├── device.template.md
│ │ └── python_env_setup.inc.md
├── mkdocs
│ ├── overrides
│ │ ├── main.html
│ │ └── partials
│ │ │ └── toc-item.html
│ ├── javascript
│ │ ├── mathjax.js
│ │ └── run_llm_widget.js
│ └── hooks
│ │ └── remove_announcement.py
├── deployment
│ ├── frameworks
│ │ ├── modal.md
│ │ ├── triton.md
│ │ ├── bentoml.md
│ │ └── lobe-chat.md
│ └── integrations
│ │ ├── kserve.md
│ │ ├── llmaz.md
│ │ └── kubeai.md
├── configuration
│ └── README.md
├── examples
│ └── README.md
├── models
│ └── extensions
│ │ └── fastsafetensor.md
├── usage
│ └── README.md
└── serving
│ └── integrations
│ ├── llamaindex.md
│ └── langchain.md
├── benchmarks
├── kernels
│ └── requirements.txt
├── multi_turn
│ ├── requirements.txt
│ ├── bench_utils.py
│ └── generate_multi_turn.json
├── benchmark_serving.py
├── benchmark_latency.py
├── benchmark_throughput.py
└── structured_schemas
│ └── structured_schema_1.json
├── csrc
├── moe
│ └── marlin_moe_wna16
│ │ └── .gitignore
├── quantization
│ ├── gptq_marlin
│ │ └── .gitignore
│ ├── per_token_group_quant_8bit.h
│ ├── cutlass_w8a8
│ │ ├── scaled_mm_c3x_sm100.cu
│ │ └── scaled_mm_c3x_sm120.cu
│ └── gptq
│ │ └── qdq_8.cuh
├── core
│ ├── exception.hpp
│ └── math.hpp
├── attention
│ ├── attention_dtypes.h
│ └── dtype_fp8.cuh
├── cutlass_extensions
│ └── common.cpp
├── cub_helpers.h
└── cpu
│ └── cpu_types.hpp
├── requirements
├── lint.txt
├── kv_connectors.txt
├── dev.txt
├── build.txt
├── rocm-build.txt
├── cpu-build.txt
├── docs.txt
├── rocm.txt
├── xpu.txt
├── cuda.txt
└── rocm-test.txt
├── .github
├── FUNDING.yml
├── ISSUE_TEMPLATE
│ └── config.yml
├── workflows
│ ├── matchers
│ │ ├── mypy.json
│ │ ├── markdownlint.json
│ │ └── actionlint.json
│ ├── scripts
│ │ ├── build.sh
│ │ ├── create_release.js
│ │ ├── cuda-install.sh
│ │ └── pytorch-install.sh
│ ├── add_label_automerge.yml
│ └── cleanup_pr_body.yml
└── scale-config.yml
├── .yapfignore
├── examples
├── online_serving
│ ├── chart-helm
│ │ ├── ct.yaml
│ │ ├── .helmignore
│ │ ├── templates
│ │ │ ├── custom-objects.yaml
│ │ │ ├── poddisruptionbudget.yaml
│ │ │ ├── secrets.yaml
│ │ │ ├── configmap.yaml
│ │ │ ├── pvc.yaml
│ │ │ └── service.yaml
│ │ └── Chart.yaml
│ ├── prometheus_grafana
│ │ ├── prometheus.yaml
│ │ └── docker-compose.yaml
│ ├── structured_outputs
│ │ └── pyproject.toml
│ └── disaggregated_serving
│ │ └── README.md
├── template_chatml.jinja
├── others
│ └── lmcache
│ │ └── disagg_prefill_lmcache_v1
│ │ └── configs
│ │ ├── lmcache-decoder-config.yaml
│ │ └── lmcache-prefiller-config.yaml
├── offline_inference
│ ├── disaggregated-prefill-v1
│ │ ├── run.sh
│ │ └── README.md
│ └── openai_batch
│ │ └── openai_example_batch.jsonl
├── template_teleflm.jinja
├── template_falcon.jinja
├── template_baichuan.jinja
├── template_falcon_180b.jinja
├── template_chatglm.jinja
├── template_chatglm2.jinja
├── template_vlm2vec.jinja
└── template_alpaca.jinja
├── tools
├── profiler
│ └── nsys_profile_tools
│ │ └── images
│ │ ├── csv1.png
│ │ ├── html.png
│ │ └── html_tbl.png
├── check_repo.sh
├── png-lint.sh
└── ep_kernels
│ └── configure_system_drivers.sh
├── CONTRIBUTING.md
├── .buildkite
├── lm-eval-harness
│ └── configs
│ │ ├── models-large.txt
│ │ ├── models-small.txt
│ │ ├── Qwen2.5-1.5B-Instruct.yaml
│ │ ├── Qwen2-57B-A14-Instruct.yaml
│ │ ├── Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml
│ │ ├── Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
│ │ ├── Minitron-4B-Base-FP8.yaml
│ │ ├── Meta-Llama-3-8B-Instruct.yaml
│ │ ├── DeepSeek-V2-Lite-Chat.yaml
│ │ ├── Meta-Llama-3-70B-Instruct.yaml
│ │ ├── Mixtral-8x7B-Instruct-v0.1-FP8.yaml
│ │ ├── Mixtral-8x7B-Instruct-v0.1.yaml
│ │ ├── Qwen2-1.5B-Instruct-FP8W8.yaml
│ │ ├── Meta-Llama-3-8B-Instruct-FP8.yaml
│ │ ├── Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml
│ │ ├── Qwen1.5-MoE-W4A16-compressed-tensors.yaml
│ │ ├── Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
│ │ ├── Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
│ │ ├── Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
│ │ ├── Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml
│ │ ├── Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
│ │ ├── Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
│ │ ├── Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
│ │ ├── SparseLlama3.1_2of4_fp8_compressed.yaml
│ │ ├── Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
│ │ ├── Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
│ │ └── Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
├── nightly-benchmarks
│ ├── scripts
│ │ ├── get-lmdeploy-modelname.py
│ │ ├── wait-for-image.sh
│ │ └── download-tokenizer.py
│ └── tests
│ │ └── genai-perf-tests.json
└── scripts
│ ├── tpu
│ ├── config_v6e_1.env
│ └── quantized_v6e_1.env
│ ├── rerun-test.sh
│ ├── ci-clean-log.sh
│ └── hardware_ci
│ └── run-cpu-test-s390x.sh
├── .markdownlint.yaml
├── MANIFEST.in
├── format.sh
├── .gemini
└── config.yaml
├── .dockerignore
├── .readthedocs.yaml
├── .shellcheckrc
├── .coveragerc
├── use_existing_torch.py
└── .clang-format
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/lora/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/tools/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/tpu/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/v1/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/assets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/core/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/engine/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/lora/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/ray/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/usage/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/v1/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/worker/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/benchmarks/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/compile/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/engine/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/kernels/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/multimodal/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/reasoning/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/samplers/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/tool_use/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/tpu/lora/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/v1/core/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/v1/e2e/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/v1/engine/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/v1/sample/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/v1/tpu/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/v1/tracing/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/v1/worker/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/benchmarks/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/compilation/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/core/block/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/entrypoints/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/executor/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/lora/ops/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/profiler/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/third_party/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/v1/core/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/v1/executor/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/v1/metrics/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/v1/pool/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/v1/sample/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/v1/worker/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/cli/.meta.yml:
--------------------------------------------------------------------------------
1 | toc_depth: 3
--------------------------------------------------------------------------------
/tests/detokenizer/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/distributed/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/entrypoints/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/entrypoints/llm/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/kernels/moe/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/model_executor/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/models/language/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/quantization/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/tokenization/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/v1/cudagraph/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/v1/entrypoints/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/v1/executor/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/v1/kv_connector/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/v1/tpu/worker/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/attention/layers/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/attention/ops/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/attention/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/device_allocator/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/v1/attention/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/v1/core/sched/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/v1/sample/ops/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/v1/sample/tpu/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/v1/spec_decode/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/vllm_flash_attn/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/basic_correctness/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/compile/piecewise/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/entrypoints/openai/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/entrypoints/pooling/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/mistral_tool_use/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/models/multimodal/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/models/quantization/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/tensorizer_loader/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/transformers_utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/v1/entrypoints/llm/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/v1/kv_connector/unit/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/v1/logits_processors/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/v1/structured_output/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/attention/backends/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/entrypoints/openai/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/model_executor/warmup/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/v1/attention/backends/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/benchmarks/kernels/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas
--------------------------------------------------------------------------------
/tests/entrypoints/offline_mode/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/entrypoints/pooling/llm/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/fastsafetensors_loader/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/models/language/pooling/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/models/multimodal/pooling/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/plugins/lora_resolvers/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/runai_model_streamer_test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/attention/backends/mla/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/engine/output_processor/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/entrypoints/cli/benchmark/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/plugins/lora_resolvers/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/v1/attention/backends/mla/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/csrc/moe/marlin_moe_wna16/.gitignore:
--------------------------------------------------------------------------------
1 | kernel_*.cu
--------------------------------------------------------------------------------
/tests/entrypoints/openai/correctness/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/entrypoints/pooling/openai/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/model_executor/model_loader/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/models/language/generation/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/models/multimodal/generation/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/models/multimodal/processing/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/distributed/kv_transfer/kv_pipe/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/mamba/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/mamba/ops/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/csrc/quantization/gptq_marlin/.gitignore:
--------------------------------------------------------------------------------
1 | kernel_*.cu
--------------------------------------------------------------------------------
/docs/api/vllm/.meta.yml:
--------------------------------------------------------------------------------
1 | search:
2 |   boost: 0.5
3 |
--------------------------------------------------------------------------------
/tests/entrypoints/openai/tool_parsers/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/entrypoints/pooling/correctness/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/kernels/moe/modular_kernel_tools/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/models/language/generation_ppl_test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/models/language/pooling_mteb_test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/v1/entrypoints/openai/responses/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/distributed/device_communicators/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/distributed/kv_transfer/kv_connector/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/models/multimodal/generation/vlm_utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/quark/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements/lint.txt:
--------------------------------------------------------------------------------
1 | # formatting
2 | pre-commit==4.0.1
3 |
--------------------------------------------------------------------------------
/vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/kernels/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: [vllm-project]
2 | open_collective: vllm
3 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.yapfignore:
--------------------------------------------------------------------------------
1 | collect_env.py
2 | vllm/model_executor/layers/fla/ops/*.py
3 |
--------------------------------------------------------------------------------
/docs/community/contact_us.md:
--------------------------------------------------------------------------------
1 | # Contact Us
2 |
3 | --8<-- "README.md:contact-us"
4 |
--------------------------------------------------------------------------------
/vllm/py.typed:
--------------------------------------------------------------------------------
1 | # Marker file for PEP 561.
2 | # The vllm package uses inline types.
3 |
--------------------------------------------------------------------------------
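The py.typed file above is the PEP 561 marker that tells downstream type checkers to trust vllm's inline annotations rather than look for a separate stubs package. A minimal sketch of what that enables, assuming only that the installed vllm package exposes a string __version__ (the snippet is illustrative, not part of the repository):

    # Checked with e.g. `mypy downstream.py`: because vllm ships py.typed,
    # mypy reads the package's inline annotations directly and raises no
    # "missing library stubs" error for the import below.
    import vllm

    def installed_vllm_version() -> str:
        # the str type comes from vllm's own source, not from a stub file
        return vllm.__version__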
/csrc/core/exception.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #define VLLM_IMPLIES(p, q) (!(p) || (q))
4 |
--------------------------------------------------------------------------------
/docs/cli/chat.md:
--------------------------------------------------------------------------------
1 | # vllm chat
2 |
3 | ## Options
4 |
5 | --8<-- "docs/argparse/chat.md"
6 |
--------------------------------------------------------------------------------
/requirements/kv_connectors.txt:
--------------------------------------------------------------------------------
1 | lmcache
2 | nixl >= 0.5.1 # Required for disaggregated prefill
3 |
--------------------------------------------------------------------------------
/examples/online_serving/chart-helm/ct.yaml:
--------------------------------------------------------------------------------
1 | chart-dirs:
2 | - charts
3 | validate-maintainers: false
--------------------------------------------------------------------------------
/docs/cli/complete.md:
--------------------------------------------------------------------------------
1 | # vllm complete
2 |
3 | ## Options
4 |
5 | --8<-- "docs/argparse/complete.md"
6 |
--------------------------------------------------------------------------------
/docs/assets/design/hierarchy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/hierarchy.png
--------------------------------------------------------------------------------
/tests/multimodal/assets/rgba.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/multimodal/assets/rgba.png
--------------------------------------------------------------------------------
/docs/assets/deployment/dify-chat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/deployment/dify-chat.png
--------------------------------------------------------------------------------
/tests/multimodal/assets/image1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/multimodal/assets/image1.png
--------------------------------------------------------------------------------
/tests/multimodal/assets/image2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/multimodal/assets/image2.png
--------------------------------------------------------------------------------
/docs/assets/deployment/open_webui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/deployment/open_webui.png
--------------------------------------------------------------------------------
/docs/getting_started/installation/.nav.yml:
--------------------------------------------------------------------------------
1 | nav:
2 | - README.md
3 | - gpu.md
4 | - cpu.md
5 | - google_tpu.md
6 |
--------------------------------------------------------------------------------
/docs/assets/deployment/chatbox-chat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/deployment/chatbox-chat.png
--------------------------------------------------------------------------------
/docs/assets/deployment/dify-settings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/deployment/dify-settings.png
--------------------------------------------------------------------------------
/docs/assets/deployment/dp_external_lb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/deployment/dp_external_lb.png
--------------------------------------------------------------------------------
/docs/assets/deployment/dp_internal_lb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/deployment/dp_internal_lb.png
--------------------------------------------------------------------------------
/docs/assets/deployment/streamlit-chat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/deployment/streamlit-chat.png
--------------------------------------------------------------------------------
/docs/assets/design/tpu/most_model_len.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/tpu/most_model_len.png
--------------------------------------------------------------------------------
/docs/assets/logos/vllm-logo-text-dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/logos/vllm-logo-text-dark.png
--------------------------------------------------------------------------------
/benchmarks/multi_turn/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.24
2 | pandas>=2.0.0
3 | aiohttp>=3.10
4 | transformers>=4.46
5 | xlsxwriter>=3.2.1
--------------------------------------------------------------------------------
/docs/assets/deployment/chatbox-settings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/deployment/chatbox-settings.png
--------------------------------------------------------------------------------
/docs/assets/design/metrics/intervals-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/metrics/intervals-1.png
--------------------------------------------------------------------------------
/docs/assets/design/metrics/intervals-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/metrics/intervals-2.png
--------------------------------------------------------------------------------
/docs/assets/design/metrics/intervals-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/metrics/intervals-3.png
--------------------------------------------------------------------------------
/docs/assets/design/paged_attention/key.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/paged_attention/key.png
--------------------------------------------------------------------------------
/docs/assets/design/prefix_caching/free.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/prefix_caching/free.png
--------------------------------------------------------------------------------
/docs/assets/logos/vllm-logo-only-light.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/logos/vllm-logo-only-light.ico
--------------------------------------------------------------------------------
/docs/assets/logos/vllm-logo-only-light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/logos/vllm-logo-only-light.png
--------------------------------------------------------------------------------
/docs/assets/logos/vllm-logo-text-light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/logos/vllm-logo-text-light.png
--------------------------------------------------------------------------------
/examples/online_serving/chart-helm/.helmignore:
--------------------------------------------------------------------------------
1 | *.png
2 | .git/
3 | ct.yaml
4 | lintconf.yaml
5 | values.schema.json
6 | /workflows
--------------------------------------------------------------------------------
/tests/config/test_config.yaml:
--------------------------------------------------------------------------------
1 | port: 12312
2 | served_model_name: mymodel
3 | tensor_parallel_size: 2
4 | trust_remote_code: true
5 |
--------------------------------------------------------------------------------
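The keys in tests/config/test_config.yaml mirror vLLM's CLI-style options, with underscores in place of dashes. The sketch below shows that correspondence; it assumes only PyYAML, and the flag mapping is an illustration rather than vLLM's actual config loader:

    # Illustration: turn the flat YAML above into CLI-style flags.
    import yaml  # PyYAML

    with open("tests/config/test_config.yaml") as f:
        cfg = yaml.safe_load(f)

    flags = []
    for key, value in cfg.items():
        flag = "--" + key.replace("_", "-")
        if value is True:
            flags.append(flag)           # e.g. --trust-remote-code
        else:
            flags += [flag, str(value)]  # e.g. --tensor-parallel-size 2

    print(" ".join(flags))
    # --port 12312 --served-model-name mymodel --tensor-parallel-size 2 --trust-remote-code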
/docs/assets/deployment/dify-create-chatbot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/deployment/dify-create-chatbot.png
--------------------------------------------------------------------------------
/docs/assets/design/paged_attention/k_vecs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/paged_attention/k_vecs.png
--------------------------------------------------------------------------------
/docs/assets/design/paged_attention/q_vecs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/paged_attention/q_vecs.png
--------------------------------------------------------------------------------
/docs/assets/design/paged_attention/query.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/paged_attention/query.png
--------------------------------------------------------------------------------
/docs/assets/design/paged_attention/v_vec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/paged_attention/v_vec.png
--------------------------------------------------------------------------------
/docs/assets/design/paged_attention/value.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/paged_attention/value.png
--------------------------------------------------------------------------------
/docs/assets/design/prefix_caching/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/prefix_caching/overview.png
--------------------------------------------------------------------------------
/docs/assets/deployment/anything-llm-provider.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/deployment/anything-llm-provider.png
--------------------------------------------------------------------------------
/docs/assets/features/disagg_prefill/overview.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/features/disagg_prefill/overview.jpg
--------------------------------------------------------------------------------
/docs/assets/features/disagg_prefill/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/features/disagg_prefill/workflow.png
--------------------------------------------------------------------------------
/tests/evals/gpt_oss/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
--------------------------------------------------------------------------------
/tests/evals/gsm8k/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
--------------------------------------------------------------------------------
/docs/assets/deployment/anything-llm-upload-doc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/deployment/anything-llm-upload-doc.png
--------------------------------------------------------------------------------
/docs/assets/design/paged_attention/logits_vec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/paged_attention/logits_vec.png
--------------------------------------------------------------------------------
/docs/assets/features/disagg_prefill/abstraction.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/features/disagg_prefill/abstraction.jpg
--------------------------------------------------------------------------------
/tools/profiler/nsys_profile_tools/images/csv1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/tools/profiler/nsys_profile_tools/images/csv1.png
--------------------------------------------------------------------------------
/tools/profiler/nsys_profile_tools/images/html.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/tools/profiler/nsys_profile_tools/images/html.png
--------------------------------------------------------------------------------
/docs/assets/deployment/anything-llm-chat-with-doc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/deployment/anything-llm-chat-with-doc.png
--------------------------------------------------------------------------------
/docs/assets/design/prefix_caching/example-time-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/prefix_caching/example-time-1.png
--------------------------------------------------------------------------------
/docs/assets/design/prefix_caching/example-time-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/prefix_caching/example-time-3.png
--------------------------------------------------------------------------------
/docs/assets/design/prefix_caching/example-time-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/prefix_caching/example-time-4.png
--------------------------------------------------------------------------------
/docs/assets/design/prefix_caching/example-time-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/prefix_caching/example-time-5.png
--------------------------------------------------------------------------------
/docs/assets/design/prefix_caching/example-time-6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/prefix_caching/example-time-6.png
--------------------------------------------------------------------------------
/docs/assets/design/prefix_caching/example-time-7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/prefix_caching/example-time-7.png
--------------------------------------------------------------------------------
/tools/profiler/nsys_profile_tools/images/html_tbl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/tools/profiler/nsys_profile_tools/images/html_tbl.png
--------------------------------------------------------------------------------
/vllm/transformers_utils/chat_templates/template_basic.jinja:
--------------------------------------------------------------------------------
1 | {%- for message in messages -%}
2 | {{- message['content'] -}}
3 | {%- endfor -%}
4 |
--------------------------------------------------------------------------------
/docs/assets/deployment/anything-llm-chat-without-doc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/deployment/anything-llm-chat-without-doc.png
--------------------------------------------------------------------------------
/docs/assets/deployment/architecture_helm_deployment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/deployment/architecture_helm_deployment.png
--------------------------------------------------------------------------------
/docs/assets/design/hybrid_kv_cache_manager/full_attn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/hybrid_kv_cache_manager/full_attn.png
--------------------------------------------------------------------------------
/docs/assets/design/hybrid_kv_cache_manager/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/hybrid_kv_cache_manager/overview.png
--------------------------------------------------------------------------------
/docs/assets/design/hybrid_kv_cache_manager/sw_attn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/hybrid_kv_cache_manager/sw_attn.png
--------------------------------------------------------------------------------
/vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg
--------------------------------------------------------------------------------
/vllm/transformers_utils/chat_templates/template_fuyu.jinja:
--------------------------------------------------------------------------------
1 | {%- for message in messages -%}
2 | {{- message['content'] + '\n' -}}
3 | {%- endfor -%}
4 |
--------------------------------------------------------------------------------
/docs/assets/contributing/dockerfile-stages-dependency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/contributing/dockerfile-stages-dependency.png
--------------------------------------------------------------------------------
/docs/assets/design/arch_overview/llm_engine.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/arch_overview/llm_engine.excalidraw.png
--------------------------------------------------------------------------------
/docs/assets/features/disagg_prefill/high_level_design.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/features/disagg_prefill/high_level_design.png
--------------------------------------------------------------------------------
/docs/cli/.nav.yml:
--------------------------------------------------------------------------------
1 | nav:
2 | - README.md
3 | - serve.md
4 | - chat.md
5 | - complete.md
6 | - run-batch.md
7 | - vllm bench:
8 | - bench/*.md
9 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to vLLM
2 |
3 | You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing).
4 |
--------------------------------------------------------------------------------
/docs/assets/design/arch_overview/entrypoints.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/arch_overview/entrypoints.excalidraw.png
--------------------------------------------------------------------------------
/docs/assets/design/hybrid_kv_cache_manager/memory_layout.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/hybrid_kv_cache_manager/memory_layout.png
--------------------------------------------------------------------------------
/requirements/dev.txt:
--------------------------------------------------------------------------------
1 | -r lint.txt
2 | -r test.txt
3 |
4 | # Avoid adding requirements directly to this file.
5 | # Instead, modify the two files referenced above.
6 |
--------------------------------------------------------------------------------
/vllm/transformers_utils/configs/speculators/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
--------------------------------------------------------------------------------
/docs/assets/design/fused_moe_modular_kernel/fused_moe_batched.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/fused_moe_modular_kernel/fused_moe_batched.png
--------------------------------------------------------------------------------
/tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml:
--------------------------------------------------------------------------------
1 | model_name: "Qwen/Qwen3-0.6B-FP8"
2 | accuracy_threshold: 0.375
3 | num_questions: 1319
4 | num_fewshot: 5
5 | max_model_len: 4096
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/README.md:
--------------------------------------------------------------------------------
1 | # Quantization Kernel Config
2 |
3 | Use scripts under `benchmarks/kernels/` to generate these config files.
4 |
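For example, generating these configs typically means running one of the tuning scripts from the repository root. A sketch under the assumption that `benchmark_moe.py` and its `--tune` flag are among the scripts in that directory (check `benchmarks/kernels/` for the current entry points):

```bash
# Illustrative only: the script name and flag are assumptions, not taken from this README.
python benchmarks/kernels/benchmark_moe.py --tune
```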
--------------------------------------------------------------------------------
/docs/assets/design/fused_moe_modular_kernel/fused_experts_blocks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/fused_moe_modular_kernel/fused_experts_blocks.png
--------------------------------------------------------------------------------
/docs/cli/serve.md:
--------------------------------------------------------------------------------
1 | # vllm serve
2 |
3 | ## JSON CLI Arguments
4 |
5 | --8<-- "docs/cli/json_tip.inc.md"
6 |
7 | ## Options
8 |
9 | --8<-- "docs/argparse/serve.md"
10 |
--------------------------------------------------------------------------------
/docs/assets/design/fused_moe_modular_kernel/fused_moe_non_batched.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/fused_moe_modular_kernel/fused_moe_non_batched.png
--------------------------------------------------------------------------------
/docs/assets/design/hybrid_kv_cache_manager/basic_grouping_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/hybrid_kv_cache_manager/basic_grouping_example.png
--------------------------------------------------------------------------------
/vllm/benchmarks/lib/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | """Benchmark library utilities."""
4 |
--------------------------------------------------------------------------------
/docs/cli/run-batch.md:
--------------------------------------------------------------------------------
1 | # vllm run-batch
2 |
3 | ## JSON CLI Arguments
4 |
5 | --8<-- "docs/cli/json_tip.inc.md"
6 |
7 | ## Options
8 |
9 | --8<-- "docs/argparse/run-batch.md"
10 |
--------------------------------------------------------------------------------
/docs/assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png
--------------------------------------------------------------------------------
/docs/cli/bench/serve.md:
--------------------------------------------------------------------------------
1 | # vllm bench serve
2 |
3 | ## JSON CLI Arguments
4 |
5 | --8<-- "docs/cli/json_tip.inc.md"
6 |
7 | ## Options
8 |
9 | --8<-- "docs/argparse/bench_serve.md"
10 |
--------------------------------------------------------------------------------
/docs/cli/bench/latency.md:
--------------------------------------------------------------------------------
1 | # vllm bench latency
2 |
3 | ## JSON CLI Arguments
4 |
5 | --8<-- "docs/cli/json_tip.inc.md"
6 |
7 | ## Options
8 |
9 | --8<-- "docs/argparse/bench_latency.md"
10 |
--------------------------------------------------------------------------------
/examples/online_serving/chart-helm/templates/custom-objects.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.customObjects }}
2 | {{- range .Values.customObjects }}
3 | {{- tpl (. | toYaml) $ }}
4 | ---
5 | {{- end }}
6 | {{- end }}
--------------------------------------------------------------------------------
/tests/kv_transfer/test_lookup_buffer.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | RANK=0 python3 test_lookup_buffer.py &
3 | PID0=$!
4 | RANK=1 python3 test_lookup_buffer.py &
5 | PID1=$!
6 |
7 | wait $PID0
8 | wait $PID1
9 |
--------------------------------------------------------------------------------
/tests/kv_transfer/test_send_recv.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | RANK=0 python3 test_send_recv.py &
4 | PID0=$!
5 | RANK=1 python3 test_send_recv.py &
6 | PID1=$!
7 |
8 | wait $PID0
9 | wait $PID1
10 |
--------------------------------------------------------------------------------
/docs/cli/bench/throughput.md:
--------------------------------------------------------------------------------
1 | # vllm bench throughput
2 |
3 | ## JSON CLI Arguments
4 |
5 | --8<-- "docs/cli/json_tip.inc.md"
6 |
7 | ## Options
8 |
9 | --8<-- "docs/argparse/bench_throughput.md"
10 |
--------------------------------------------------------------------------------
/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml:
--------------------------------------------------------------------------------
1 | model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
2 | accuracy_threshold: 0.45
3 | num_questions: 1319
4 | num_fewshot: 5
5 | max_model_len: 4096
--------------------------------------------------------------------------------
/tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml:
--------------------------------------------------------------------------------
1 | model_name: "RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8"
2 | accuracy_threshold: 0.31
3 | num_questions: 1319
4 | num_fewshot: 5
5 | max_model_len: 4096
--------------------------------------------------------------------------------
/tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml:
--------------------------------------------------------------------------------
1 | model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
2 | accuracy_threshold: 0.60
3 | num_questions: 1319
4 | num_fewshot: 5
5 | max_model_len: 4096
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 | contact_links:
3 | - name: Questions
4 | url: https://discuss.vllm.ai
5 | about: Ask questions and discuss with other vLLM community members
6 |
--------------------------------------------------------------------------------
/csrc/attention/attention_dtypes.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "attention_generic.cuh"
4 | #include "dtype_float16.cuh"
5 | #include "dtype_float32.cuh"
6 | #include "dtype_bfloat16.cuh"
7 | #include "dtype_fp8.cuh"
8 |
--------------------------------------------------------------------------------
/tests/config/test_config_with_model.yaml:
--------------------------------------------------------------------------------
1 | # Same as test_config.yaml but with model specified
2 | model: config-model
3 | port: 12312
4 | served_model_name: mymodel
5 | tensor_parallel_size: 2
6 | trust_remote_code: true
7 |
--------------------------------------------------------------------------------
/tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml:
--------------------------------------------------------------------------------
1 | model_name: "RedHatAI/DeepSeek-Coder-V2-Lite-Instruct-FP8"
2 | accuracy_threshold: 0.72
3 | num_questions: 1319
4 | num_fewshot: 5
5 | max_model_len: 4096
6 |
7 |
--------------------------------------------------------------------------------
/tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml:
--------------------------------------------------------------------------------
1 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
2 | accuracy_threshold: 0.74
3 | num_questions: 1319
4 | num_fewshot: 5
5 | max_model_len: 4096
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/models-large.txt:
--------------------------------------------------------------------------------
1 | Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
2 | Meta-Llama-3-70B-Instruct.yaml
3 | Mixtral-8x7B-Instruct-v0.1.yaml
4 | Qwen2-57B-A14-Instruct.yaml
5 | DeepSeek-V2-Lite-Chat.yaml
6 |
--------------------------------------------------------------------------------
/requirements/build.txt:
--------------------------------------------------------------------------------
1 | # Should be mirrored in pyproject.toml
2 | cmake>=3.26.1
3 | ninja
4 | packaging>=24.2
5 | setuptools>=77.0.3,<80.0.0
6 | setuptools-scm>=8
7 | torch==2.8.0
8 | wheel
9 | jinja2>=3.1.6
10 | regex
11 | build
12 |
--------------------------------------------------------------------------------
/vllm/distributed/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from .communication_op import *
5 | from .parallel_state import *
6 | from .utils import *
7 |
--------------------------------------------------------------------------------
/.markdownlint.yaml:
--------------------------------------------------------------------------------
1 | MD007:
2 | indent: 4
3 | MD013: false
4 | MD024:
5 | siblings_only: true
6 | MD033: false
7 | MD042: false
8 | MD045: false
9 | MD046: false
10 | MD051: false
11 | MD052: false
12 | MD053: false
13 | MD059: false
14 |
--------------------------------------------------------------------------------
/tests/utils_/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | """
4 | This module is named `utils_` instead of `utils` to avoid obscuring
5 | `tests/utils.py`.
6 | """
7 |
--------------------------------------------------------------------------------
/tests/v1/shutdown/utils.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | """Shutdown test utils"""
4 |
5 | SHUTDOWN_TEST_TIMEOUT_SEC = 120
6 | SHUTDOWN_TEST_THRESHOLD_BYTES = 2 * 2**30
7 |
--------------------------------------------------------------------------------
/tests/evals/gsm8k/configs/models-small.txt:
--------------------------------------------------------------------------------
1 | Qwen3-0.6B-FP8.yaml
2 | Llama-3.2-1B-Instruct-INT8-CT.yaml
3 | Llama-3-8B-Instruct-nonuniform-CT.yaml
4 | Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
5 | Qwen1.5-MoE-W4A16-CT.yaml
6 | DeepSeek-V2-Lite-Instruct-FP8.yaml
7 |
--------------------------------------------------------------------------------
/vllm/logging_utils/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.logging_utils.formatter import NewLineFormatter
5 |
6 | __all__ = [
7 | "NewLineFormatter",
8 | ]
9 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include requirements/common.txt
3 | include requirements/cuda.txt
4 | include requirements/rocm.txt
5 | include requirements/cpu.txt
6 | include CMakeLists.txt
7 |
8 | recursive-include cmake *
9 | recursive-include csrc *
10 |
--------------------------------------------------------------------------------
/docs/mkdocs/overrides/main.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 |
3 | {% block announce %}
4 | You are viewing the latest developer preview docs. Click here to view docs for the latest stable release.
5 | {% endblock %}
6 |
--------------------------------------------------------------------------------
/vllm/distributed/eplb/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | '''
4 | Expert parallelism load balancer (EPLB).
5 | '''
6 |
7 | from .eplb_state import *
8 | from .rebalance_algo import *
9 |
--------------------------------------------------------------------------------
/vllm/transformers_utils/chat_templates/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | from .registry import get_chat_template_fallback_path
4 |
5 | __all__ = ["get_chat_template_fallback_path"]
6 |
--------------------------------------------------------------------------------
/examples/online_serving/prometheus_grafana/prometheus.yaml:
--------------------------------------------------------------------------------
1 | # prometheus.yaml
2 | global:
3 | scrape_interval: 5s
4 | evaluation_interval: 30s
5 |
6 | scrape_configs:
7 | - job_name: vllm
8 | static_configs:
9 | - targets:
10 | - 'host.docker.internal:8000'
11 |
--------------------------------------------------------------------------------
/examples/online_serving/chart-helm/templates/poddisruptionbudget.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: policy/v1
2 | kind: PodDisruptionBudget
3 | metadata:
4 | name: "{{ .Release.Name }}-pdb"
5 | namespace: {{ .Release.Namespace }}
6 | spec:
7 | maxUnavailable: {{ default 1 .Values.maxUnavailablePodDisruptionBudget }}
--------------------------------------------------------------------------------
/vllm/lora/layers/qkv_x_parallel_linear.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | from .base import BaseLayerWithLoRA
4 |
5 |
6 | # TODO: Implement this
7 | class QKVCrossParallelLinearWithLoRA(BaseLayerWithLoRA):
8 | pass
9 |
--------------------------------------------------------------------------------
/format.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | echo "vLLM linting system has been moved from format.sh to pre-commit hooks."
4 | echo "Please run 'pip install -r requirements/lint.txt', followed by"
5 | echo "'pre-commit install' to install the pre-commit hooks."
6 | echo "Then linters will run automatically before each commit."
--------------------------------------------------------------------------------
/tests/vllm_test_utils/setup.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from setuptools import setup
5 |
6 | setup(
7 | name='vllm_test_utils',
8 | version='0.1',
9 | packages=['vllm_test_utils'],
10 | )
11 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/shared_fused_moe/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | from vllm.model_executor.layers.shared_fused_moe.shared_fused_moe import (
4 | SharedFusedMoE)
5 |
6 | __all__ = ["SharedFusedMoE"]
7 |
--------------------------------------------------------------------------------
/examples/online_serving/structured_outputs/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "examples-online-structured-outputs"
3 | requires-python = ">=3.9, <3.13"
4 | dependencies = ["openai==1.78.1", "pydantic==2.11.4"]
5 | version = "0.0.0"
6 |
7 | [project.scripts]
8 | structured-outputs = "structured_outputs:main"
9 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from .layer_utils import replace_parameter, update_tensor_inplace
5 |
6 | __all__ = ['update_tensor_inplace', 'replace_parameter']
7 |
--------------------------------------------------------------------------------
/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 |
5 | def register_prithvi():
6 | return "prithvi_io_processor.prithvi_processor.PrithviMultimodalDataProcessor" # noqa: E501
7 |
--------------------------------------------------------------------------------
/docs/deployment/frameworks/modal.md:
--------------------------------------------------------------------------------
1 | # Modal
2 |
3 | vLLM can be run on cloud GPUs with [Modal](https://modal.com), a serverless computing platform designed for fast auto-scaling.
4 |
5 | For details on how to deploy vLLM on Modal, see [this tutorial in the Modal documentation](https://modal.com/docs/examples/vllm_inference).
6 |
--------------------------------------------------------------------------------
/examples/online_serving/chart-helm/templates/secrets.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Secret
3 | metadata:
4 | name: "{{ .Release.Name }}-secrets"
5 | namespace: {{ .Release.Namespace }}
6 | type: Opaque
7 | data:
8 | {{- range $key, $val := .Values.secrets }}
9 | {{ $key }}: {{ $val | b64enc | quote }}
10 | {{- end }}
--------------------------------------------------------------------------------
/.gemini/config.yaml:
--------------------------------------------------------------------------------
1 | # https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github
2 | have_fun: false # Just review the code
3 | code_review:
4 | comment_severity_threshold: HIGH # Reduce quantity of comments
5 | pull_request_opened:
6 | summary: false # Don't summarize the PR in a separate comment
7 |
--------------------------------------------------------------------------------
/docs/getting_started/installation/device.template.md:
--------------------------------------------------------------------------------
1 | # Installation
2 |
3 | ## Requirements
4 |
5 | ## Set up using Python
6 |
7 | ### Pre-built wheels
8 |
9 | ### Build wheel from source
10 |
11 | ## Set up using Docker
12 |
13 | ### Pre-built images
14 |
15 | ### Build image from source
16 |
17 | ## Extra information
18 |
--------------------------------------------------------------------------------
/docs/deployment/integrations/kserve.md:
--------------------------------------------------------------------------------
1 | # KServe
2 |
3 | vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.
4 |
5 | Please see [this guide](https://kserve.github.io/website/latest/modelserving/v1beta1/llm/huggingface/) for more details on using vLLM with KServe.
6 |
--------------------------------------------------------------------------------
/examples/online_serving/chart-helm/templates/configmap.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.configs -}}
2 | apiVersion: v1
3 | kind: ConfigMap
4 | metadata:
5 | name: "{{ .Release.Name }}-configs"
6 | namespace: {{ .Release.Namespace }}
7 | data:
8 | {{- with .Values.configs }}
9 | {{- toYaml . | nindent 2 }}
10 | {{- end }}
11 | {{- end -}}
--------------------------------------------------------------------------------
/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | from vllm.distributed.kv_transfer.kv_connector.v1.base import (
4 | KVConnectorBase_V1, KVConnectorRole)
5 |
6 | __all__ = ["KVConnectorRole", "KVConnectorBase_V1"]
7 |
--------------------------------------------------------------------------------
/examples/template_chatml.jinja:
--------------------------------------------------------------------------------
1 | {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
2 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/models-small.txt:
--------------------------------------------------------------------------------
1 | Qwen2.5-1.5B-Instruct.yaml
2 | Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
3 | Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
4 | Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
5 | Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
6 | Qwen1.5-MoE-W4A16-compressed-tensors.yaml
7 |
--------------------------------------------------------------------------------
/examples/online_serving/disaggregated_serving/README.md:
--------------------------------------------------------------------------------
1 | # Disaggregated Serving
2 |
3 | This example contains scripts that demonstrate the disaggregated serving features of vLLM.
4 |
5 | ## Files
6 |
7 | - `disagg_proxy_demo.py` - Demonstrates XpYd (X prefill instances, Y decode instances).
8 | - `kv_events.sh` - Demonstrates KV cache event publishing.
9 |
--------------------------------------------------------------------------------
/vllm/lora/ops/ipex_ops/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.lora.ops.ipex_ops.lora_ops import (bgmv_expand, bgmv_expand_slice,
5 | bgmv_shrink)
6 |
7 | __all__ = ["bgmv_expand", "bgmv_expand_slice", "bgmv_shrink"]
8 |
--------------------------------------------------------------------------------
/vllm/lora/ops/xla_ops/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.lora.ops.xla_ops.lora_ops import (bgmv_expand, bgmv_expand_slice,
5 | bgmv_shrink)
6 |
7 | __all__ = ["bgmv_expand", "bgmv_expand_slice", "bgmv_shrink"]
8 |
--------------------------------------------------------------------------------
/.github/workflows/matchers/mypy.json:
--------------------------------------------------------------------------------
1 | {
2 | "problemMatcher": [
3 | {
4 | "owner": "mypy",
5 | "pattern": [
6 | {
7 | "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$",
8 | "file": 1,
9 | "line": 2,
10 | "severity": 3,
11 | "message": 4
12 | }
13 | ]
14 | }
15 | ]
16 | }
17 |
--------------------------------------------------------------------------------
/tests/basic_correctness/test_cpu_offload.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from ..utils import compare_two_settings
5 |
6 |
7 | def test_cpu_offload():
8 | compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [],
9 | ["--cpu-offload-gb", "1"])
10 |
--------------------------------------------------------------------------------
/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from lmdeploy.serve.openai.api_client import APIClient
5 |
6 | api_client = APIClient("http://localhost:8000")
7 | model_name = api_client.available_models[0]
8 |
9 | print(model_name)
10 |
--------------------------------------------------------------------------------
/docs/deployment/integrations/llmaz.md:
--------------------------------------------------------------------------------
1 | # llmaz
2 |
3 | [llmaz](https://github.com/InftyAI/llmaz) is an easy-to-use and advanced inference platform for large language models on Kubernetes, aimed at production use. It uses vLLM as the default model serving backend.
4 |
5 | Please refer to the [Quick Start](https://github.com/InftyAI/llmaz?tab=readme-ov-file#quick-start) for more details.
6 |
--------------------------------------------------------------------------------
/.buildkite/scripts/tpu/config_v6e_1.env:
--------------------------------------------------------------------------------
1 | # Environment config
2 | TEST_NAME=llama8b
3 | CONTAINER_NAME=tpu-test
4 |
5 | # vllm config
6 | MODEL=meta-llama/Llama-3.1-8B-Instruct
7 | MAX_NUM_SEQS=256
8 | MAX_NUM_BATCHED_TOKENS=1024
9 | TENSOR_PARALLEL_SIZE=1
10 | MAX_MODEL_LEN=2048
11 | DOWNLOAD_DIR=/mnt/disks/persist
12 | EXPECTED_THROUGHPUT=8.0
13 | INPUT_LEN=1800
14 | OUTPUT_LEN=128
15 |
--------------------------------------------------------------------------------
/requirements/rocm-build.txt:
--------------------------------------------------------------------------------
1 | # Common dependencies
2 | -r common.txt
3 |
4 | --extra-index-url https://download.pytorch.org/whl/rocm6.3
5 | torch==2.8.0
6 | torchvision==0.23.0
7 | torchaudio==2.8.0
8 |
9 | triton==3.3.0
10 | cmake>=3.26.1,<4
11 | packaging>=24.2
12 | setuptools>=77.0.3,<80.0.0
13 | setuptools-scm>=8
14 | wheel
15 | jinja2>=3.1.6
16 | amdsmi==6.2.4
17 | timm>=1.0.17
18 |
--------------------------------------------------------------------------------
/tests/distributed/test_distributed_oot.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from ..entrypoints.openai.test_oot_registration import (
5 | run_and_test_dummy_opt_api_server)
6 |
7 |
8 | def test_distributed_oot(dummy_opt_path: str):
9 | run_and_test_dummy_opt_api_server(dummy_opt_path, tp=2)
10 |
--------------------------------------------------------------------------------
/vllm/lora/punica_wrapper/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase
5 | from vllm.lora.punica_wrapper.punica_selector import get_punica_wrapper
6 |
7 | __all__ = [
8 | "PunicaWrapperBase",
9 | "get_punica_wrapper",
10 | ]
11 |
--------------------------------------------------------------------------------
/examples/online_serving/chart-helm/templates/pvc.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.extraInit }}
2 | apiVersion: v1
3 | kind: PersistentVolumeClaim
4 | metadata:
5 | name: "{{ .Release.Name }}-storage-claim"
6 | namespace: {{ .Release.Namespace }}
7 | spec:
8 | accessModes:
9 | - ReadWriteOnce
10 | resources:
11 | requests:
12 | storage: {{ .Values.extraInit.pvcStorage }}
13 | {{- end }}
--------------------------------------------------------------------------------
/docs/configuration/README.md:
--------------------------------------------------------------------------------
1 | # Configuration Options
2 |
3 | This section lists the most common options for running vLLM.
4 |
5 | There are three main levels of configuration, from highest priority to lowest priority:
6 |
7 | - [Request parameters][completions-api] and [input arguments][sampling-params]
8 | - [Engine arguments](./engine_args.md)
9 | - [Environment variables](./env_vars.md)
10 |
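As a minimal sketch of the lower two levels (the flag and environment variable below are illustrative assumptions about a typical invocation, not taken from this file), an engine argument configures the whole server instance, while an environment variable sits at the lowest priority level:

```bash
# Engine argument: configures the whole server instance.
vllm serve Qwen/Qwen2.5-1.5B-Instruct --max-model-len 4096

# Environment variable: lowest of the three priority levels.
VLLM_LOGGING_LEVEL=DEBUG vllm serve Qwen/Qwen2.5-1.5B-Instruct
```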
--------------------------------------------------------------------------------
/requirements/cpu-build.txt:
--------------------------------------------------------------------------------
1 | # Temporarily used for x86 CPU backend to avoid performance regression of torch>2.6.0+cpu,
2 | # see https://github.com/pytorch/pytorch/pull/151218
3 | cmake>=3.26.1
4 | ninja
5 | packaging>=24.2
6 | setuptools>=77.0.3,<80.0.0
7 | setuptools-scm>=8
8 | --extra-index-url https://download.pytorch.org/whl/cpu
9 | torch==2.6.0+cpu
10 | wheel
11 | jinja2>=3.1.6
12 | regex
13 |
--------------------------------------------------------------------------------
/requirements/docs.txt:
--------------------------------------------------------------------------------
1 | mkdocs
2 | mkdocs-api-autonav
3 | mkdocs-material
4 | mkdocstrings-python
5 | mkdocs-gen-files
6 | mkdocs-awesome-nav
7 | mkdocs-glightbox
8 | mkdocs-git-revision-date-localized-plugin
9 | mkdocs-minify-plugin
10 | regex
11 | ruff
12 |
13 | # Required for argparse hook only
14 | -f https://download.pytorch.org/whl/cpu
15 | cachetools
16 | msgspec
17 | pydantic
18 | torch
19 |
--------------------------------------------------------------------------------
/vllm/entrypoints/constants.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | """
4 | Shared constants for vLLM entrypoints.
5 | """
6 |
7 | # HTTP header limits for h11 parser
8 | # These constants help mitigate header abuse attacks
9 | H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT = 4194304 # 4 MB
10 | H11_MAX_HEADER_COUNT_DEFAULT = 256
11 |
--------------------------------------------------------------------------------
/examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml:
--------------------------------------------------------------------------------
1 | local_cpu: False
2 | max_local_cpu_size: 0
3 | #local_disk:
4 | max_local_disk_size: 0
5 | remote_serde: NULL
6 |
7 | enable_nixl: True
8 | nixl_role: "receiver"
9 | nixl_peer_host: "localhost"
10 | nixl_peer_port: 55555
11 | nixl_buffer_size: 1073741824 # 1GB
12 | nixl_buffer_device: "cuda"
13 | nixl_enable_gc: True
14 |
--------------------------------------------------------------------------------
/examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml:
--------------------------------------------------------------------------------
1 | local_cpu: False
2 | max_local_cpu_size: 0
3 | #local_disk:
4 | max_local_disk_size: 0
5 | remote_serde: NULL
6 |
7 | enable_nixl: True
8 | nixl_role: "sender"
9 | nixl_peer_host: "localhost"
10 | nixl_peer_port: 55555
11 | nixl_buffer_size: 1073741824 # 1GB
12 | nixl_buffer_device: "cuda"
13 | nixl_enable_gc: True
14 |
--------------------------------------------------------------------------------
/.buildkite/scripts/tpu/quantized_v6e_1.env:
--------------------------------------------------------------------------------
1 | # Environment config
2 | TEST_NAME=llama8bw8a8
3 | CONTAINER_NAME=tpu-test
4 |
5 | # vllm config
6 | MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8
7 | MAX_NUM_SEQS=128
8 | MAX_NUM_BATCHED_TOKENS=1024
9 | TENSOR_PARALLEL_SIZE=1
10 | MAX_MODEL_LEN=2048
11 | DOWNLOAD_DIR=/mnt/disks/persist
12 | EXPECTED_THROUGHPUT=10.0
13 | INPUT_LEN=1800
14 | OUTPUT_LEN=128
15 |
--------------------------------------------------------------------------------
/.github/workflows/matchers/markdownlint.json:
--------------------------------------------------------------------------------
1 | {
2 | "problemMatcher": [
3 | {
4 | "owner": "markdownlint",
5 | "pattern": [
6 | {
7 | "regexp": "^([^:]*):(\\d+):?(\\d+)?\\s([\\w-\\/]*)\\s(.*)$",
8 | "file": 1,
9 | "line": 2,
10 | "column": 3,
11 | "code": 4,
12 | "message": 5
13 | }
14 | ]
15 | }
16 | ]
17 | }
--------------------------------------------------------------------------------
/tests/test_embedded_commit.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | import vllm
5 |
6 |
7 | def test_embedded_commit_defined():
8 | assert hasattr(vllm, "__version__")
9 | assert hasattr(vllm, "__version_tuple__")
10 | assert vllm.__version__ != "dev"
11 | assert vllm.__version_tuple__ != (0, 0, "dev")
12 |
--------------------------------------------------------------------------------
/tools/check_repo.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Checks whether the repo is clean and whether tags are available (necessary to correctly produce vllm version at build time)
3 |
4 | if ! git diff --quiet; then
5 | echo "Repo is dirty" >&2
6 |
7 | exit 1
8 | fi
9 |
10 | if ! git describe --tags; then
11 | echo "No tags are present. Is this a shallow clone? git fetch --unshallow --tags" >&2
12 |
13 | exit 1
14 | fi
15 |
--------------------------------------------------------------------------------
/docs/cli/json_tip.inc.md:
--------------------------------------------------------------------------------
1 | When passing JSON CLI arguments, the following sets of arguments are equivalent:
2 |
3 | - `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`
4 | - `--json-arg.key1 value1 --json-arg.key2.key3 value2`
5 |
6 | Additionally, list elements can be passed individually using `+`:
7 |
8 | - `--json-arg '{"key4": ["value3", "value4", "value5"]}'`
9 | - `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'`
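As a concrete sketch, the same equivalence applies when such flags are attached to a real command like `vllm serve`; the generic `--json-arg` placeholder from the tip is kept here, so substitute whichever JSON-valued argument you actually need:

```bash
# Hypothetical placeholder flag, reusing --json-arg from the tip above.
vllm serve Qwen/Qwen2.5-1.5B-Instruct \
  --json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'

# Equivalent dotted form of the same configuration.
vllm serve Qwen/Qwen2.5-1.5B-Instruct \
  --json-arg.key1 value1 \
  --json-arg.key2.key3 value2
```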
--------------------------------------------------------------------------------
/vllm/transformers_utils/chat_templates/template_blip2.jinja:
--------------------------------------------------------------------------------
1 | {%- for message in messages -%}
2 | {%- if message['role'] == 'user' -%}
3 | {{- 'Question: ' + message['content'] + ' ' -}}
4 | {%- elif message['role'] == 'assistant' -%}
5 | {{- 'Answer: ' + message['content'] + ' ' -}}
6 | {%- endif -%}
7 | {%- endfor -%}
8 |
9 | {%- if add_generation_prompt -%}
10 | {{- 'Answer:' -}}
11 | {% endif %}
12 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml:
--------------------------------------------------------------------------------
1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
2 | model_name: "Qwen/Qwen2.5-1.5B-Instruct"
3 | tasks:
4 | - name: "gsm8k"
5 | metrics:
6 | - name: "exact_match,strict-match"
7 | value: 0.54
8 | - name: "exact_match,flexible-extract"
9 | value: 0.59
10 | limit: 1319
11 | num_fewshot: 5
12 |
--------------------------------------------------------------------------------
/docs/examples/README.md:
--------------------------------------------------------------------------------
1 | # Examples
2 |
3 | vLLM's examples are split into three categories:
4 |
5 | - If you are using vLLM from within Python code, see the *Offline Inference* section.
6 | - If you are using vLLM from an HTTP application or client, see the *Online Serving* section.
7 | - For examples of using some of vLLM's advanced features (e.g., LMCache or Tensorizer) that are not specific to either of the above use cases, see the *Others* section.
8 |
--------------------------------------------------------------------------------
/examples/offline_inference/disaggregated-prefill-v1/run.sh:
--------------------------------------------------------------------------------
1 | rm -rf local_storage/
2 |
3 | if [ -f "output.txt" ]; then
4 | rm output.txt
5 | fi
6 |
7 | # The directory of current script
8 | SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
9 |
10 | VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/prefill_example.py"
11 | VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/decode_example.py"
12 |
--------------------------------------------------------------------------------
/tests/vllm_test_utils/vllm_test_utils/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | """
4 | vllm_test_utils is a package for vLLM testing utilities.
5 | It does not import any vLLM modules.
6 | """
7 |
8 | from .blame import BlameResult, blame
9 | from .monitor import MonitoredValues, monitor
10 |
11 | __all__ = ["blame", "BlameResult", "monitor", "MonitoredValues"]
12 |
--------------------------------------------------------------------------------
/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from typing import Optional
5 |
6 |
7 | def dummy_platform_plugin() -> Optional[str]:
8 | return "vllm_add_dummy_platform.dummy_platform.DummyPlatform"
9 |
10 |
11 | def register_ops():
12 | import vllm_add_dummy_platform.dummy_custom_ops # noqa
13 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/fla/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | # SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
4 | #
5 | # This file contains code copied from the flash-linear-attention project.
6 | # The original source code was licensed under the MIT license and included
7 | # the following copyright notice:
8 | # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
9 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/quark/schemes/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from .quark_scheme import QuarkScheme
5 | from .quark_w4a4_mxfp4 import QuarkW4A4MXFP4
6 | from .quark_w8a8_fp8 import QuarkW8A8Fp8
7 | from .quark_w8a8_int8 import QuarkW8A8Int8
8 |
9 | __all__ = ["QuarkScheme", "QuarkW8A8Fp8", "QuarkW8A8Int8", "QuarkW4A4MXFP4"]
10 |
--------------------------------------------------------------------------------
/vllm/tasks.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | from typing import Literal, get_args
4 |
5 | GenerationTask = Literal["generate", "transcription"]
6 | GENERATION_TASKS = get_args(GenerationTask)
7 |
8 | PoolingTask = Literal["encode", "embed", "classify", "score"]
9 | POOLING_TASKS = get_args(PoolingTask)
10 |
11 | SupportedTask = Literal[GenerationTask, PoolingTask]
12 |
--------------------------------------------------------------------------------
/docs/deployment/frameworks/triton.md:
--------------------------------------------------------------------------------
1 | # NVIDIA Triton
2 |
3 | The [Triton Inference Server](https://github.com/triton-inference-server) hosts a tutorial demonstrating how to quickly deploy a simple [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) model using vLLM. Please see [Deploying a vLLM model in Triton](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton) for more details.
4 |
--------------------------------------------------------------------------------
/tests/plugins/vllm_add_dummy_model/setup.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from setuptools import setup
5 |
6 | setup(name='vllm_add_dummy_model',
7 | version='0.1',
8 | packages=['vllm_add_dummy_model'],
9 | entry_points={
10 | 'vllm.general_plugins':
11 | ["register_dummy_model = vllm_add_dummy_model:register"]
12 | })
13 |
--------------------------------------------------------------------------------
/vllm/distributed/kv_transfer/kv_connector/base.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | """Defines the base type for KV cache connectors."""
4 |
5 | from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1
6 |
7 | KVConnectorBase = KVConnectorBase_V1
8 | KVConnectorBaseType = KVConnectorBase_V1
9 |
10 | __all__ = ["KVConnectorBase", "KVConnectorBaseType"]
11 |
--------------------------------------------------------------------------------
/vllm/transformers_utils/tokenizers/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from .mistral import (MistralTokenizer, maybe_serialize_tool_calls,
5 | truncate_tool_call_ids, validate_request_params)
6 |
7 | __all__ = [
8 | "MistralTokenizer", "maybe_serialize_tool_calls", "truncate_tool_call_ids",
9 | "validate_request_params"
10 | ]
11 |
--------------------------------------------------------------------------------
/docs/getting_started/installation/python_env_setup.inc.md:
--------------------------------------------------------------------------------
1 | It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following commands:
2 |
3 | ```bash
4 | uv venv --python 3.12 --seed
5 | source .venv/bin/activate
6 | ```
7 |
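Assuming the environment above is active, vLLM itself can then be installed with the same `uv` tooling (a minimal sketch; pin a specific version if you need reproducible builds):

```bash
uv pip install vllm
```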
--------------------------------------------------------------------------------
/docs/models/extensions/fastsafetensor.md:
--------------------------------------------------------------------------------
1 | Loading Model weights with fastsafetensors
2 | ===================================================================
3 |
4 | Using the fastsafetensors library enables loading model weights into GPU memory by leveraging GPU direct storage. See [their GitHub repository](https://github.com/foundation-model-stack/fastsafetensors) for more details.
5 |
6 | To enable this feature, use the ``--load-format fastsafetensors`` command-line argument.
7 |
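A minimal sketch of what that looks like on the command line (the model name is an arbitrary example, not something this page prescribes):

```bash
vllm serve meta-llama/Llama-3.1-8B-Instruct --load-format fastsafetensors
```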
--------------------------------------------------------------------------------
/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.attention.backends.placeholder_attn import (
5 | PlaceholderAttentionBackend)
6 |
7 |
8 | class DummyAttentionBackend(PlaceholderAttentionBackend):
9 |
10 | @staticmethod
11 | def get_name() -> str:
12 | return "Dummy_Backend"
13 |
--------------------------------------------------------------------------------
/vllm/lora/ops/triton_ops/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.lora.ops.triton_ops.lora_expand_op import lora_expand
5 | from vllm.lora.ops.triton_ops.lora_kernel_metadata import LoRAKernelMeta
6 | from vllm.lora.ops.triton_ops.lora_shrink_op import lora_shrink
7 |
8 | __all__ = [
9 | "lora_expand",
10 | "lora_shrink",
11 | "LoRAKernelMeta",
12 | ]
13 |
--------------------------------------------------------------------------------
/vllm/transformers_utils/chat_templates/template_chatml.jinja:
--------------------------------------------------------------------------------
1 | {%- for message in messages -%}
2 | {{- '<|im_start|>' + message['role'] + '\n' + message['content'] -}}
3 | {%- if (loop.last and add_generation_prompt) or not loop.last -%}
4 | {{- '<|im_end|>' + '\n' -}}
5 | {%- endif -%}
6 | {%- endfor -%}
7 |
8 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
9 | {{- '<|im_start|>assistant\n' -}}
10 | {%- endif -%}
11 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | from typing import NamedTuple
4 |
5 | from compressed_tensors.transform import TransformArgs, TransformScheme
6 |
7 | __all__ = ["TransformTuple"]
8 |
9 |
10 | class TransformTuple(NamedTuple):
11 | scheme_name: str
12 | scheme: TransformScheme
13 | args: TransformArgs
14 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
3 | model_name: "Qwen/Qwen2-57B-A14B-Instruct"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.792
9 | - name: "exact_match,flexible-extract"
10 | value: 0.824
11 | limit: 250
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml:
--------------------------------------------------------------------------------
1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
2 | model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
3 | tasks:
4 | - name: "gsm8k"
5 | metrics:
6 | - name: "exact_match,strict-match"
7 | value: 0.335
8 | - name: "exact_match,flexible-extract"
9 | value: 0.323
10 | limit: 1319
11 | num_fewshot: 5
12 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml:
--------------------------------------------------------------------------------
1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
2 | model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
3 | tasks:
4 | - name: "gsm8k"
5 | metrics:
6 | - name: "exact_match,strict-match"
7 | value: 0.47
8 | - name: "exact_match,flexible-extract"
9 | value: 0.64
10 | limit: 1319
11 | num_fewshot: 5
12 |
--------------------------------------------------------------------------------
/docs/deployment/frameworks/bentoml.md:
--------------------------------------------------------------------------------
1 | # BentoML
2 |
3 | [BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes.
4 |
5 | For details, see the tutorial [vLLM inference in the BentoML documentation](https://docs.bentoml.com/en/latest/use-cases/large-language-models/vllm.html).
6 |
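7 | Because the deployed server exposes OpenAI-compatible endpoints, any OpenAI client can query it. A minimal sketch, assuming the Bento is served locally on port 3000 and hosts the model id shown below (both values depend on your Bento configuration and are only placeholders):
8 |
9 | ```python
10 | from openai import OpenAI
11 |
12 | # Base URL and model id are placeholders; adjust them to your deployment.
13 | client = OpenAI(base_url="http://localhost:3000/v1", api_key="EMPTY")
14 | completion = client.chat.completions.create(
15 |     model="meta-llama/Meta-Llama-3-8B-Instruct",
16 |     messages=[{"role": "user", "content": "Hello!"}],
17 | )
18 | print(completion.choices[0].message.content)
19 | ```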
--------------------------------------------------------------------------------
/docs/usage/README.md:
--------------------------------------------------------------------------------
1 | # Using vLLM
2 |
3 | First, vLLM must be [installed](../getting_started/installation) for your chosen device in either a Python or Docker environment.
4 |
5 | Then, vLLM supports the following usage patterns:
6 |
7 | - [Inference and Serving](../serving/offline_inference.md): Run a single instance of a model (a minimal example is sketched below).
8 | - [Deployment](../deployment/docker.md): Scale up model instances for production.
9 | - [Training](../training/rlhf.md): Train or fine-tune a model.
10 |
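11 | As a quick orientation, here is a minimal offline-inference sketch (the model name is only an illustration):
12 |
13 | ```python
14 | from vllm import LLM, SamplingParams
15 |
16 | # Load a small model and generate a short completion for one prompt.
17 | llm = LLM(model="facebook/opt-125m")
18 | outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
19 | print(outputs[0].outputs[0].text)
20 | ```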
--------------------------------------------------------------------------------
/examples/template_teleflm.jinja:
--------------------------------------------------------------------------------
1 | {%- for message in messages %}
2 | {%- if message['role'] == 'user' %}
3 | {{- '<_user>' + message['content']|trim }}
4 | {%- elif message['role'] == 'system' %}
5 | {{- '<_system>' + message['content']|trim }}
6 | {%- elif message['role'] == 'assistant' %}
7 | {{- '<_bot>' + message['content'] }}
8 | {%- endif %}
9 | {%- endfor %}
10 | {%- if add_generation_prompt %}
11 | {{- '<_bot>' }}
12 | {%- endif %}
13 |
--------------------------------------------------------------------------------
/.buildkite/scripts/rerun-test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Usage: ./rerun_test.sh path/to/test.py::test_name
4 |
5 | # Check if argument is given
6 | if [ $# -lt 1 ]; then
7 | echo "Usage: $0 path/to/test.py::test_name"
8 | echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
9 | exit 1
10 | fi
11 |
12 | TEST=$1
13 | COUNT=1
14 |
15 | while pytest -sv "$TEST"; do
16 | COUNT=$((COUNT + 1))
17 | echo "RUN NUMBER ${COUNT}"
18 | done
19 |
--------------------------------------------------------------------------------
/csrc/cutlass_extensions/common.cpp:
--------------------------------------------------------------------------------
1 | #include "cutlass_extensions/common.hpp"
2 |
3 | int32_t get_sm_version_num() {
4 | int32_t major_capability, minor_capability;
5 | cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor,
6 | 0);
7 | cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor,
8 | 0);
9 | int32_t version_num = major_capability * 10 + minor_capability;
10 | return version_num;
11 | }
--------------------------------------------------------------------------------
/examples/online_serving/chart-helm/templates/service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: "{{ .Release.Name }}-service"
5 | namespace: {{ .Release.Namespace }}
6 | spec:
7 | type: ClusterIP
8 | ports:
9 | - name: {{ include "chart.service-port-name" . }}
10 | port: {{ include "chart.service-port" . }}
11 | targetPort: {{ include "chart.container-port-name" . }}
12 | protocol: TCP
13 | selector:
14 | {{- include "chart.labels" . | nindent 4 }}
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
3 | model_name: "mgoin/Minitron-4B-Base-FP8"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.231
9 | - name: "exact_match,flexible-extract"
10 | value: 0.22
11 | limit: 1000
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/csrc/cub_helpers.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #ifndef USE_ROCM
4 | #include <cub/cub.cuh>
5 | #if CUB_VERSION >= 200800
6 | #include <cuda/functional>
7 | using CubAddOp = cuda::std::plus<>;
8 | using CubMaxOp = cuda::maximum<>;
9 | #else // if CUB_VERSION < 200800
10 | using CubAddOp = cub::Sum;
11 | using CubMaxOp = cub::Max;
12 | #endif // CUB_VERSION
13 | #else
14 | #include <hipcub/hipcub.hpp>
15 | using CubAddOp = cub::Sum;
16 | using CubMaxOp = cub::Max;
17 | #endif // USE_ROCM
18 |
--------------------------------------------------------------------------------
/tools/png-lint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Ensure that *.excalidraw.png files have the excalidraw metadata
4 | # embedded in them. This ensures they can be loaded back into
5 | # the tool and edited in the future.
6 |
7 | find . -iname '*.excalidraw.png' | while read -r file; do
8 | if git check-ignore -q "$file"; then
9 | continue
10 | fi
11 | if ! grep -q "excalidraw+json" "$file"; then
12 | echo "$file was not exported from excalidraw with 'Embed Scene' enabled."
13 | exit 1
14 | fi
15 | done
16 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml:
--------------------------------------------------------------------------------
1 | # For hf script, without -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
3 | model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.756
9 | - name: "exact_match,flexible-extract"
10 | value: 0.752
11 | limit: 250
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | /.venv
2 | /build
3 | dist
4 | vllm/*.so
5 |
6 | # Byte-compiled / optimized / DLL files
7 | __pycache__/
8 | *.py[cod]
9 | *$py.class
10 |
11 | .mypy_cache
12 |
13 | # Distribution / packaging
14 | .Python
15 | /build/
16 | cmake-build-*/
17 | CMakeUserPresets.json
18 | develop-eggs/
19 | /dist/
20 | downloads/
21 | eggs/
22 | .eggs/
23 | lib/
24 | lib64/
25 | parts/
26 | sdist/
27 | var/
28 | wheels/
29 | share/python-wheels/
30 | *.egg-info/
31 | .installed.cfg
32 | *.egg
33 | MANIFEST
34 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
3 | model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.671
9 | - name: "exact_match,flexible-extract"
10 | value: 0.664
11 | limit: 1000
12 | num_fewshot: 5
13 | trust_remote_code: True
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml:
--------------------------------------------------------------------------------
1 | # For hf script, without -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
3 | model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.892
9 | - name: "exact_match,flexible-extract"
10 | value: 0.892
11 | limit: 250
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
3 | model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.624
9 | - name: "exact_match,flexible-extract"
10 | value: 0.624
11 | limit: 250
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/tests/plugins/prithvi_io_processor_plugin/setup.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from setuptools import setup
5 |
6 | setup(
7 | name="prithvi_io_processor_plugin",
8 | version="0.1",
9 | packages=["prithvi_io_processor"],
10 | entry_points={
11 | "vllm.io_processor_plugins": [
12 | "prithvi_to_tiff = prithvi_io_processor:register_prithvi", # noqa: E501
13 | ]
14 | },
15 | )
16 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml:
--------------------------------------------------------------------------------
1 | # For hf script, without -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
3 | model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.616
9 | - name: "exact_match,flexible-extract"
10 | value: 0.632
11 | limit: 250
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/scripts/ci-clean-log.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Usage: ./ci-clean-log.sh ci.log
3 | # This script strips timestamps and color codes from CI log files.
4 |
5 | # Check if argument is given
6 | if [ $# -lt 1 ]; then
7 | echo "Usage: $0 ci.log"
8 | exit 1
9 | fi
10 |
11 | INPUT_FILE="$1"
12 |
13 | # Strip timestamps
14 | sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"
15 |
16 | # Strip colorization
17 | sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"
18 |
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | # Read the Docs configuration file
2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
3 |
4 | version: 2
5 |
6 | build:
7 | os: ubuntu-22.04
8 | tools:
9 | python: "3.12"
10 | jobs:
11 | post_checkout:
12 | - git fetch --unshallow || true
13 |
14 | mkdocs:
15 | configuration: mkdocs.yaml
16 |
17 | # Optionally declare the Python requirements required to build your docs
18 | python:
19 | install:
20 | - requirements: requirements/docs.txt
21 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
3 | model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.578
9 | - name: "exact_match,flexible-extract"
10 | value: 0.585
11 | limit: 1000
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/tests/kernels/attention/conftest.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | import pytest
5 |
6 | from vllm.utils import (create_kv_caches_with_random,
7 | create_kv_caches_with_random_flash)
8 |
9 |
10 | @pytest.fixture()
11 | def kv_cache_factory():
12 | return create_kv_caches_with_random
13 |
14 |
15 | @pytest.fixture()
16 | def kv_cache_factory_flashinfer():
17 | return create_kv_caches_with_random_flash
18 |
--------------------------------------------------------------------------------
/.buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # This script builds the CPU docker image and runs the offline inference inside the container.
4 | # It serves as a sanity check for compilation and basic model usage.
5 | set -ex
6 |
7 | # Setup cleanup
8 | remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; }
9 | trap remove_docker_container EXIT
10 | remove_docker_container
11 |
12 | # Try building the docker image
13 | docker build -t cpu-test -f docker/Dockerfile.s390x .
14 |
--------------------------------------------------------------------------------
/docs/mkdocs/javascript/mathjax.js:
--------------------------------------------------------------------------------
1 | // Enables MathJax rendering
2 | window.MathJax = {
3 | tex: {
4 | inlineMath: [["\\(", "\\)"]],
5 | displayMath: [["\\[", "\\]"]],
6 | processEscapes: true,
7 | processEnvironments: true
8 | },
9 | options: {
10 | ignoreHtmlClass: ".*|",
11 | processHtmlClass: "arithmatex"
12 | }
13 | };
14 |
15 | document$.subscribe(() => {
16 | MathJax.startup.output.clearCache()
17 | MathJax.typesetClear()
18 | MathJax.texReset()
19 | MathJax.typesetPromise()
20 | })
21 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
3 | model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.753
9 | - name: "exact_match,flexible-extract"
10 | value: 0.753
11 | limit: 1000
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
3 | model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.86
9 | - name: "exact_match,flexible-extract"
10 | value: 0.86
11 | limit: 250
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.shellcheckrc:
--------------------------------------------------------------------------------
1 | # rules currently disabled:
2 | #
3 | # SC1091 (info): Not following: was not specified as input (see shellcheck -x)
4 | # SC2004 (style): $/${} is unnecessary on arithmetic variables.
5 | # SC2129 (style): Consider using { cmd1; cmd2; } >> file instead of individual redirects.
6 | # SC2155 (warning): Declare and assign separately to avoid masking return values.
7 | # SC2164 (warning): Use 'cd ... || exit' or 'cd ... || return' in case cd fails.
8 | #
9 | disable=SC1091,SC2004,SC2129,SC2155,SC2164
10 |
--------------------------------------------------------------------------------
/.github/workflows/matchers/actionlint.json:
--------------------------------------------------------------------------------
1 | {
2 | "problemMatcher": [
3 | {
4 | "owner": "actionlint",
5 | "pattern": [
6 | {
7 | "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$",
8 | "file": 1,
9 | "line": 2,
10 | "column": 3,
11 | "message": 4,
12 | "code": 5
13 | }
14 | ]
15 | }
16 | ]
17 | }
18 |
--------------------------------------------------------------------------------
/csrc/cpu/cpu_types.hpp:
--------------------------------------------------------------------------------
1 | #ifndef CPU_TYPES_HPP
2 | #define CPU_TYPES_HPP
3 |
4 | #if defined(__x86_64__)
5 | // x86 implementation
6 | #include "cpu_types_x86.hpp"
7 | #elif defined(__POWER9_VECTOR__)
8 | // ppc implementation
9 | #include "cpu_types_vsx.hpp"
10 | #elif defined(__s390x__)
11 | // s390 implementation
12 | #include "cpu_types_vxe.hpp"
13 | #elif defined(__aarch64__)
14 | // arm implementation
15 | #include "cpu_types_arm.hpp"
16 | #else
17 | #warning "unsupported vLLM cpu implementation"
18 | #endif
19 |
20 | #endif
--------------------------------------------------------------------------------
/examples/template_falcon.jinja:
--------------------------------------------------------------------------------
1 | {%- for message in messages -%}
2 | {%- if message['role'] == 'user' -%}
3 | {{- 'User: ' + message['content'] -}}
4 | {%- elif message['role'] == 'assistant' -%}
5 | {{- 'Assistant: ' + message['content'] -}}
6 | {%- endif -%}
7 | {%- if (loop.last and add_generation_prompt) or not loop.last -%}
8 | {{- '\n' -}}
9 | {%- endif -%}
10 | {%- endfor -%}
11 |
12 |
13 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
14 | {{- 'Assistant:' -}}
15 | {% endif %}
--------------------------------------------------------------------------------
/vllm/entrypoints/cli/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | from vllm.entrypoints.cli.benchmark.latency import BenchmarkLatencySubcommand
4 | from vllm.entrypoints.cli.benchmark.serve import BenchmarkServingSubcommand
5 | from vllm.entrypoints.cli.benchmark.throughput import (
6 | BenchmarkThroughputSubcommand)
7 |
8 | __all__: list[str] = [
9 | "BenchmarkLatencySubcommand",
10 | "BenchmarkServingSubcommand",
11 | "BenchmarkThroughputSubcommand",
12 | ]
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
3 | model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.30
9 | - name: "exact_match,flexible-extract"
10 | value: 0.465
11 | limit: 1319
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/tests/models/language/generation_ppl_test/test_gpt.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import pytest
4 |
5 | from tests.models.utils import GenerateModelInfo
6 |
7 | from .ppl_utils import wikitext_ppl_test
8 |
9 | MODELS = [GenerateModelInfo("openai-community/gpt2-large")]
10 |
11 |
12 | @pytest.mark.parametrize("model_info", MODELS)
13 | def test_ppl(hf_runner, vllm_runner, model_info: GenerateModelInfo):
14 | wikitext_ppl_test(hf_runner, vllm_runner, model_info)
15 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
3 | model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.593
9 | - name: "exact_match,flexible-extract"
10 | value: 0.588
11 | limit: 1000
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "4": {
3 | "BLOCK_SIZE_M": 16,
4 | "BLOCK_SIZE_N": 32,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 64,
7 | "num_warps": 4,
8 | "num_stages": 3
9 | },
10 | "8": {
11 | "BLOCK_SIZE_M": 16,
12 | "BLOCK_SIZE_N": 32,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 32,
15 | "num_warps": 4,
16 | "num_stages": 4
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
3 | model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.595
9 | - name: "exact_match,flexible-extract"
10 | value: 0.582
11 | limit: 1000
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/vllm/model_executor/models/phi3.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | # Adapted from llama.py
5 | """Inference-only Phi3 model code inherit from Llama.py"""
6 |
7 | from vllm.model_executor.models.llama import LlamaForCausalLM
8 |
9 |
10 | class Phi3ForCausalLM(LlamaForCausalLM):
11 |
12 | packed_modules_mapping = {
13 | "qkv_proj": [
14 | "qkv_proj",
15 | ],
16 | "gate_up_proj": [
17 | "gate_up_proj",
18 | ],
19 | }
20 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml:
--------------------------------------------------------------------------------
1 | # For hf script, without -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
3 | model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.905
9 | - name: "exact_match,flexible-extract"
10 | value: 0.905
11 | limit: 1000
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
3 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.753
9 | - name: "exact_match,flexible-extract"
10 | value: 0.753
11 | limit: 1000
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
3 | model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.356
9 | - name: "exact_match,flexible-extract"
10 | value: 0.358
11 | limit: 1000
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/vllm/scripts.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.entrypoints.cli.main import main as vllm_main
5 | from vllm.logger import init_logger
6 |
7 | logger = init_logger(__name__)
8 |
9 |
10 | # Backwards compatibility for the move from vllm.scripts to
11 | # vllm.entrypoints.cli.main
12 | def main():
13 | logger.warning("vllm.scripts.main() is deprecated. Please re-install "
14 | "vllm or use vllm.entrypoints.cli.main.main() instead.")
15 | vllm_main()
16 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
3 | model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.755
9 | - name: "exact_match,flexible-extract"
10 | value: 0.755
11 | limit: 1000
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/docs/deployment/frameworks/lobe-chat.md:
--------------------------------------------------------------------------------
1 | # Lobe Chat
2 |
3 | [Lobe Chat](https://github.com/lobehub/lobe-chat) is an open-source, modern-design ChatGPT/LLMs UI/Framework.
4 |
5 | It supports speech synthesis, multi-modal interactions, and an extensible (function call) plugin system.
6 |
7 | It offers one-click, free deployment of your private OpenAI ChatGPT/Claude/Gemini/Groq/Ollama chat application.
8 |
9 | It supports vLLM as an AI model provider to efficiently serve large language models.
10 |
11 | For details, see the tutorial [Using vLLM in LobeChat](https://lobehub.com/docs/usage/providers/vllm).
12 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
3 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.758
9 | - name: "exact_match,flexible-extract"
10 | value: 0.759
11 | limit: 1000
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/benchmarks/benchmark_serving.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import sys
4 |
5 | if __name__ == "__main__":
6 | print("""DEPRECATED: This script has been moved to the vLLM CLI.
7 |
8 | Please use the following command instead:
9 | vllm bench serve
10 |
11 | For help with the new command, run:
12 | vllm bench serve --help
13 |
14 | Alternatively, you can run the new command directly with:
15 | python -m vllm.entrypoints.cli.main bench serve --help
16 | """)
17 | sys.exit(1)
18 |
--------------------------------------------------------------------------------
/benchmarks/benchmark_latency.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import sys
4 |
5 | if __name__ == "__main__":
6 | print("""DEPRECATED: This script has been moved to the vLLM CLI.
7 |
8 | Please use the following command instead:
9 | vllm bench latency
10 |
11 | For help with the new command, run:
12 | vllm bench latency --help
13 |
14 | Alternatively, you can run the new command directly with:
15 | python -m vllm.entrypoints.cli.main bench latency --help
16 | """)
17 | sys.exit(1)
18 |
--------------------------------------------------------------------------------
/tools/ep_kernels/configure_system_drivers.sh:
--------------------------------------------------------------------------------
1 | set -ex
2 |
3 | # turn on IBGDA
4 | echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;"' | tee -a /etc/modprobe.d/nvidia.conf
5 |
6 | if command -v update-initramfs &> /dev/null; then
7 | # for Debian/Ubuntu
8 | sudo update-initramfs -u
9 | elif command -v dracut &> /dev/null; then
10 | # for Fedora/CentOS
11 | sudo dracut --force
12 | else
13 | echo "No supported initramfs update tool found."
14 | exit 1
15 | fi
16 |
17 | echo "Please reboot the system to apply the changes"
18 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
3 | model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.6353
9 | - name: "exact_match,flexible-extract"
10 | value: 0.637
11 | limit: null
12 | num_fewshot: null
13 |
--------------------------------------------------------------------------------
/csrc/quantization/per_token_group_quant_8bit.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <torch/all.h>
3 |
4 | // TODO(wentao): refactor the folder to 8bit, then includes fp8 and int8 folders
5 | // 8-bit per-token-group quantization helper used by both FP8 and INT8
6 | void per_token_group_quant_8bit(const torch::Tensor& input,
7 | torch::Tensor& output_q,
8 | torch::Tensor& output_s, int64_t group_size,
9 | double eps, double min_8bit, double max_8bit,
10 | bool scale_ue8m0 = false);
--------------------------------------------------------------------------------
/benchmarks/benchmark_throughput.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import sys
4 |
5 | if __name__ == "__main__":
6 | print("""DEPRECATED: This script has been moved to the vLLM CLI.
7 |
8 | Please use the following command instead:
9 | vllm bench throughput
10 |
11 | For help with the new command, run:
12 | vllm bench throughput --help
13 |
14 | Alternatively, you can run the new command directly with:
15 | python -m vllm.entrypoints.cli.main bench throughput --help
16 | """)
17 | sys.exit(1)
18 |
--------------------------------------------------------------------------------
/examples/template_baichuan.jinja:
--------------------------------------------------------------------------------
1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
2 |
3 | {%- for message in messages -%}
4 | {%- if message['role'] == 'user' -%}
5 | {{- '<reserved_106>' + message['content'] -}}
6 | {%- elif message['role'] == 'assistant' -%}
7 | {{- '<reserved_107>' + message['content'] -}}
8 | {%- endif -%}
9 | {%- endfor -%}
10 |
11 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
12 | {{- '<reserved_107>' -}}
13 | {% endif %}
--------------------------------------------------------------------------------
/benchmarks/structured_schemas/structured_schema_1.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "object",
3 | "properties": {
4 | "name": { "type": "string" },
5 | "email": { "type": "string" },
6 | "street": { "type": "string" },
7 | "city": { "type": "string" },
8 | "state": { "type": "string" },
9 | "zip": { "type": "string" },
10 | "phone": { "type": "string" },
11 | "website": { "type": "string" },
12 | "company": { "type": "string" },
13 | "age": { "type": "integer" }
14 | },
15 | "required": [
16 | "name",
17 | "email"
18 | ]
19 | }
20 |
--------------------------------------------------------------------------------
/vllm/distributed/kv_transfer/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.distributed.kv_transfer.kv_transfer_state import (
5 | KVConnectorBaseType, ensure_kv_transfer_initialized,
6 | ensure_kv_transfer_shutdown, get_kv_transfer_group, has_kv_transfer_group,
7 | is_v1_kv_transfer_group)
8 |
9 | __all__ = [
10 | "get_kv_transfer_group", "has_kv_transfer_group",
11 | "is_v1_kv_transfer_group", "ensure_kv_transfer_initialized",
12 | "ensure_kv_transfer_shutdown", "KVConnectorBaseType"
13 | ]
14 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
3 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.728
9 | - name: "exact_match,flexible-extract"
10 | value: 0.728
11 | limit: 250
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/tests/plugins/vllm_add_dummy_platform/setup.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from setuptools import setup
5 |
6 | setup(
7 | name='vllm_add_dummy_platform',
8 | version='0.1',
9 | packages=['vllm_add_dummy_platform'],
10 | entry_points={
11 | 'vllm.platform_plugins': [
12 | "dummy_platform_plugin = vllm_add_dummy_platform:dummy_platform_plugin" # noqa
13 | ],
14 | "vllm.general_plugins":
15 | ["dummy_custom_ops = vllm_add_dummy_platform:register_ops"],
16 | })
17 |
--------------------------------------------------------------------------------
/vllm/v1/worker/ubatch_utils.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | from dataclasses import dataclass
4 |
5 | from typing_extensions import TypeAlias
6 |
7 |
8 | @dataclass
9 | class UBatchSlice:
10 | request_slice: slice
11 | token_slice: slice
12 |
13 |
14 | UBatchSlices: TypeAlias = list[UBatchSlice]
15 |
16 |
17 | def is_second_ubatch_empty(orig_num_tokens_per_ubatch: int,
18 | padded_num_tokens_per_ubatch: int) -> bool:
19 | return padded_num_tokens_per_ubatch >= 2 * orig_num_tokens_per_ubatch
20 |
--------------------------------------------------------------------------------
/examples/offline_inference/openai_batch/openai_example_batch.jsonl:
--------------------------------------------------------------------------------
1 | {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
2 | {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
3 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
3 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.752
9 | - name: "exact_match,flexible-extract"
10 | value: 0.754
11 | limit: 1000
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml:
--------------------------------------------------------------------------------
1 | # For vllm script, with -t option (tensor parallel size).
2 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
3 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
4 | tasks:
5 | - name: "gsm8k"
6 | metrics:
7 | - name: "exact_match,strict-match"
8 | value: 0.764
9 | - name: "exact_match,flexible-extract"
10 | value: 0.764
11 | limit: 250
12 | num_fewshot: 5
13 |
--------------------------------------------------------------------------------
/tests/weight_loading/models-large.txt:
--------------------------------------------------------------------------------
1 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main
2 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main
3 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W8A16-quantized, main
4 | compressed-tensors, nm-testing/test-w4a16-mixtral-actorder-group, main
5 | gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, main
6 | gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, gptq-8bit-128g-actorder_True
7 | awq_marlin, casperhansen/deepseek-coder-v2-instruct-awq, main
8 | compressed-tensors, RedHatAI/Llama-4-Scout-17B-16E-Instruct-quantized.w4a16, main
--------------------------------------------------------------------------------
/vllm/lora/ops/torch_ops/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.lora.ops.torch_ops.lora_ops import bgmv_expand # noqa: F401
5 | from vllm.lora.ops.torch_ops.lora_ops import (bgmv_expand_slice, bgmv_shrink,
6 | sgmv_expand, sgmv_expand_slice,
7 | sgmv_shrink)
8 |
9 | __all__ = [
10 | "bgmv_expand",
11 | "bgmv_expand_slice",
12 | "bgmv_shrink",
13 | "sgmv_expand",
14 | "sgmv_expand_slice",
15 | "sgmv_shrink",
16 | ]
17 |
--------------------------------------------------------------------------------
/vllm/triton_utils/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.triton_utils.importing import (HAS_TRITON, TritonLanguagePlaceholder,
5 | TritonPlaceholder)
6 |
7 | if HAS_TRITON:
8 | import triton
9 | import triton.language as tl
10 | import triton.language.extra.libdevice as tldevice
11 | else:
12 | triton = TritonPlaceholder()
13 | tl = TritonLanguagePlaceholder()
14 | tldevice = TritonLanguagePlaceholder()
15 |
16 | __all__ = ["HAS_TRITON", "triton", "tl", "tldevice"]
17 |
--------------------------------------------------------------------------------
/requirements/rocm.txt:
--------------------------------------------------------------------------------
1 | # Common dependencies
2 | -r common.txt
3 |
4 | numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
5 | numba == 0.61.2; python_version > '3.9'
6 |
7 | # Dependencies for AMD GPUs
8 | boto3
9 | botocore
10 | datasets
11 | ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
12 | peft
13 | pytest-asyncio
14 | tensorizer==2.10.1
15 | packaging>=24.2
16 | setuptools>=77.0.3,<80.0.0
17 | setuptools-scm>=8
18 | runai-model-streamer==0.11.0
19 | runai-model-streamer-s3==0.11.0
20 | conch-triton-kernels==1.2.1
21 | timm>=1.0.17
--------------------------------------------------------------------------------
/vllm/ray/lazy_utils.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 |
5 | def is_ray_initialized():
6 | """Check if Ray is initialized."""
7 | try:
8 | import ray
9 | return ray.is_initialized()
10 | except ImportError:
11 | return False
12 |
13 |
14 | def is_in_ray_actor():
15 | """Check if we are in a Ray actor."""
16 |
17 | try:
18 | import ray
19 | return (ray.is_initialized()
20 | and ray.get_runtime_context().get_actor_id() is not None)
21 | except ImportError:
22 | return False
23 |
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | source = vllm
3 | omit =
4 | */tests/*
5 | */test_*
6 | */__pycache__/*
7 | */build/*
8 | */dist/*
9 | */vllm.egg-info/*
10 | */third_party/*
11 | */examples/*
12 | */benchmarks/*
13 | */docs/*
14 |
15 | [report]
16 | exclude_lines =
17 | pragma: no cover
18 | def __repr__
19 | if self.debug:
20 | if settings.DEBUG
21 | raise AssertionError
22 | raise NotImplementedError
23 | if 0:
24 | if __name__ == .__main__.:
25 | class .*\bProtocol\):
26 | @(abc\.)?abstractmethod
27 |
28 | [html]
29 | directory = htmlcov
30 |
31 | [xml]
32 | output = coverage.xml
33 |
--------------------------------------------------------------------------------
/examples/template_falcon_180b.jinja:
--------------------------------------------------------------------------------
1 | {%- for message in messages -%}
2 | {%- if message['role'] == 'system' -%}
3 | {{- 'System: ' + message['content'] -}}
4 | {%- elif message['role'] == 'user' -%}
5 | {{- 'User: ' + message['content'] -}}
6 | {%- elif message['role'] == 'assistant' -%}
7 | {{- 'Falcon: ' + message['content'] -}}
8 | {%- endif -%}
9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%}
10 | {{- '\n' -}}
11 | {%- endif -%}
12 | {%- endfor -%}
13 |
14 |
15 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
16 | {{- 'Falcon:' -}}
17 | {% endif %}
--------------------------------------------------------------------------------
/tests/models/language/generation_ppl_test/test_gemma.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import pytest
4 |
5 | from tests.models.utils import GenerateModelInfo
6 |
7 | from .ppl_utils import wikitext_ppl_test
8 |
9 | MODELS = [
10 | GenerateModelInfo("google/gemma-2b"),
11 | GenerateModelInfo("google/gemma-2-2b"),
12 | GenerateModelInfo("google/gemma-3-4b-it"),
13 | ]
14 |
15 |
16 | @pytest.mark.parametrize("model_info", MODELS)
17 | def test_ppl(hf_runner, vllm_runner, model_info: GenerateModelInfo):
18 | wikitext_ppl_test(hf_runner, vllm_runner, model_info)
19 |
--------------------------------------------------------------------------------
/tests/test_outputs.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.outputs import RequestOutput
5 |
6 |
7 | def test_request_output_forward_compatible():
8 | output = RequestOutput(request_id="test_request_id",
9 | prompt="test prompt",
10 | prompt_token_ids=[1, 2, 3],
11 | prompt_logprobs=None,
12 | outputs=[],
13 | finished=False,
14 | example_arg_added_in_new_version="some_value")
15 | assert output is not None
16 |
--------------------------------------------------------------------------------
/vllm/model_executor/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.model_executor.parameter import (BasevLLMParameter,
5 | PackedvLLMParameter)
6 | from vllm.model_executor.sampling_metadata import (SamplingMetadata,
7 | SamplingMetadataCache)
8 | from vllm.model_executor.utils import set_random_seed
9 |
10 | __all__ = [
11 | "SamplingMetadata",
12 | "SamplingMetadataCache",
13 | "set_random_seed",
14 | "BasevLLMParameter",
15 | "PackedvLLMParameter",
16 | ]
17 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/fused_moe/configs/README:
--------------------------------------------------------------------------------
1 | This directory contains tuned configurations for different settings of the fused_moe kernel.
2 | For different settings of
3 | - E (number of experts)
4 | - N (intermediate size)
5 | - device_name (torch.cuda.get_device_name())
6 | the JSON file contains a mapping from M (batch size) to the chosen configuration.
7 |
8 | The example configurations provided are for the Mixtral model for TP2 on H100
9 | and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have
10 | N = 7168 and for TP4 we have N = 3584.
11 |
12 | See `benchmarks/kernels/benchmark_moe.py` for how to generate these config files.
13 |
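14 | As an illustration only (not code from this repository), a tuned config can be
15 | read and the entry for the closest benchmarked batch size picked roughly like
16 | this; the file name below is made up for the example:
17 |
18 |     import json
19 |
20 |     with open("E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json") as f:
21 |         configs = {int(m): cfg for m, cfg in json.load(f).items()}
22 |
23 |     M = 12  # current batch size
24 |     # Fall back to the tuned entry whose batch size is closest to M.
25 |     best = configs[min(configs, key=lambda m: abs(m - M))]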
--------------------------------------------------------------------------------
/vllm/v1/spec_decode/utils.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | from vllm.sampling_params import SamplingParams
4 |
5 | _SAMPLING_EPS = 1e-5
6 |
7 |
8 | def is_spec_decode_unsupported(sampling_params: SamplingParams) -> bool:
9 | """True if request is incompatible with speculative decoding"""
10 | return (sampling_params.frequency_penalty != 0.0
11 | or sampling_params.presence_penalty != 0.0
12 | or sampling_params.repetition_penalty != 1.0
13 | or sampling_params.min_p > _SAMPLING_EPS
14 | or sampling_params.logprobs is not None)
15 |
--------------------------------------------------------------------------------
/requirements/xpu.txt:
--------------------------------------------------------------------------------
1 | # Common dependencies
2 | -r common.txt
3 |
4 | ray>=2.9
5 | cmake>=3.26.1
6 | packaging>=24.2
7 | setuptools-scm>=8
8 | setuptools>=77.0.3,<80.0.0
9 | wheel
10 | jinja2>=3.1.6
11 | datasets # for benchmark scripts
12 | numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
13 | nixl==0.3.0 # for PD disaggregation
14 | torch==2.8.0+xpu
15 | torchaudio
16 | torchvision
17 | --extra-index-url=https://download.pytorch.org/whl/xpu
18 |
19 | intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post0%2Bxpu-cp312-cp312-linux_x86_64.whl
20 |
--------------------------------------------------------------------------------
/examples/online_serving/prometheus_grafana/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | # docker-compose.yaml
2 | version: "3"
3 |
4 | services:
5 | prometheus:
6 | image: prom/prometheus:latest
7 | extra_hosts:
8 | - "host.docker.internal:host-gateway" # allow a direct connection from container to the local machine
9 | ports:
10 | - "9090:9090" # the default port used by Prometheus
11 | volumes:
12 | - ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml # mount Prometheus config file
13 |
14 | grafana:
15 | image: grafana/grafana:latest
16 | depends_on:
17 | - prometheus
18 | ports:
19 | - "3000:3000" # the default port used by Grafana
20 |
--------------------------------------------------------------------------------
/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "test_name": "llama8B_tp1_genai_perf",
4 | "qps_list": [4,8,16,32],
5 | "common_parameters": {
6 | "model": "meta-llama/Meta-Llama-3-8B-Instruct",
7 | "tp": 1,
8 | "port": 8000,
9 | "num_prompts": 500,
10 | "reuse_server": false
11 | },
12 | "vllm_server_parameters": {
13 | "disable_log_stats": "",
14 | "gpu_memory_utilization": 0.9,
15 | "max_num_seqs": 512,
16 | "dtype": "bfloat16"
17 | },
18 | "genai_perf_input_parameters": {
19 | }
20 | }
21 | ]
--------------------------------------------------------------------------------
/.github/workflows/scripts/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -eux
3 |
4 | python_executable=python$1
5 | cuda_home=/usr/local/cuda-$2
6 |
7 | # Update paths
8 | PATH=${cuda_home}/bin:$PATH
9 | LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
10 |
11 | # Install requirements
12 | $python_executable -m pip install -r requirements/build.txt -r requirements/cuda.txt
13 |
14 | # Limit the number of parallel jobs to avoid OOM
15 | export MAX_JOBS=1
16 | # Make sure release wheels are built for the following architectures
17 | export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
18 |
19 | bash tools/check_repo.sh
20 |
21 | # Build
22 | $python_executable setup.py bdist_wheel --dist-dir=dist
23 |
--------------------------------------------------------------------------------
/tests/tokenization/test_do_lower_case.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | import pytest
5 |
6 | from vllm.transformers_utils.tokenizer import get_tokenizer
7 |
8 | TOKENIZER_NAMES = ["BAAI/bge-base-en"]
9 |
10 |
11 | @pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES)
12 | @pytest.mark.parametrize("n_tokens", [510])
13 | def test_special_tokens(tokenizer_name: str, n_tokens: int):
14 | tokenizer = get_tokenizer(tokenizer_name, revision="main")
15 |
16 | prompts = '[UNK]' * n_tokens
17 | prompt_token_ids = tokenizer.encode(prompts)
18 | assert len(prompt_token_ids) == n_tokens + 2
19 |
--------------------------------------------------------------------------------
/vllm/transformers_utils/config_parser_base.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from abc import ABC, abstractmethod
5 | from pathlib import Path
6 | from typing import Optional, Union
7 |
8 | from transformers import PretrainedConfig
9 |
10 |
11 | class ConfigParserBase(ABC):
12 |
13 | @abstractmethod
14 | def parse(self,
15 | model: Union[str, Path],
16 | trust_remote_code: bool,
17 | revision: Optional[str] = None,
18 | code_revision: Optional[str] = None,
19 | **kwargs) -> tuple[dict, PretrainedConfig]:
20 | raise NotImplementedError
21 |
--------------------------------------------------------------------------------
/examples/template_chatglm.jinja:
--------------------------------------------------------------------------------
1 | {%- set counter = namespace(index=0) -%}
2 | {%- for message in messages -%}
3 | {%- if message['role'] == 'user' -%}
4 | {{- '[Round ' + counter.index|string + ']\n问:' + message['content'] -}}
5 | {%- set counter.index = counter.index + 1 -%}
6 | {%- endif -%}
7 | {%- if message['role'] == 'assistant' -%}
8 | {{- '\n答:' + message['content'] -}}
9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%}
10 | {{- '\n' -}}
11 | {%- endif -%}
12 | {%- endif -%}
13 | {%- endfor -%}
14 |
15 |
16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
17 | {{- '\n答:' -}}
18 | {%- endif -%}
--------------------------------------------------------------------------------
/examples/template_chatglm2.jinja:
--------------------------------------------------------------------------------
1 | {%- set counter = namespace(index=1) -%}
2 | {%- for message in messages -%}
3 | {%- if message['role'] == 'user' -%}
4 | {{- '[Round ' + counter.index|string + ']\n\n问:' + message['content'] -}}
5 | {%- set counter.index = counter.index + 1 -%}
6 | {%- endif -%}
7 | {%- if message['role'] == 'assistant' -%}
8 | {{- '\n\n答:' + message['content'] -}}
9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%}
10 | {{- '\n\n' -}}
11 | {%- endif -%}
12 | {%- endif -%}
13 | {%- endfor -%}
14 |
15 |
16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
17 | {{- '\n\n答:' -}}
18 | {%- endif -%}
--------------------------------------------------------------------------------
/docs/serving/integrations/llamaindex.md:
--------------------------------------------------------------------------------
1 | # LlamaIndex
2 |
3 | vLLM is also available via [LlamaIndex](https://github.com/run-llama/llama_index).
4 |
5 | To install LlamaIndex, run
6 |
7 | ```bash
8 | pip install llama-index-llms-vllm -q
9 | ```
10 |
11 | To run inference on one or multiple GPUs, use the `Vllm` class from `llama_index`.
12 |
13 | ```python
14 | from llama_index.llms.vllm import Vllm
15 |
16 | llm = Vllm(
17 | model="microsoft/Orca-2-7b",
18 | tensor_parallel_size=4,
19 | max_new_tokens=100,
20 | vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5},
21 | )
22 | ```
23 |
24 | Please refer to this [Tutorial](https://docs.llamaindex.ai/en/latest/examples/llm/vllm/) for more details.
25 |
--------------------------------------------------------------------------------
/tests/benchmarks/test_latency_cli.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import subprocess
4 |
5 | import pytest
6 |
7 | MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
8 |
9 |
10 | @pytest.mark.benchmark
11 | def test_bench_latency():
12 | command = [
13 | "vllm", "bench", "latency", "--model", MODEL_NAME, "--input-len", "32",
14 | "--output-len", "1", "--enforce-eager", "--load-format", "dummy"
15 | ]
16 | result = subprocess.run(command, capture_output=True, text=True)
17 | print(result.stdout)
18 | print(result.stderr)
19 |
20 | assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
21 |
--------------------------------------------------------------------------------
/tests/benchmarks/test_throughput_cli.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import subprocess
4 |
5 | import pytest
6 |
7 | MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
8 |
9 |
10 | @pytest.mark.benchmark
11 | def test_bench_throughput():
12 | command = [
13 | "vllm", "bench", "throughput", "--model", MODEL_NAME, "--input-len",
14 | "32", "--output-len", "1", "--enforce-eager", "--load-format", "dummy"
15 | ]
16 | result = subprocess.run(command, capture_output=True, text=True)
17 | print(result.stdout)
18 | print(result.stderr)
19 |
20 | assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
21 |
--------------------------------------------------------------------------------
/tests/kernels/core/test_permute_cols.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | import pytest
5 | import torch
6 |
7 | from tests.kernels.utils import opcheck
8 | from vllm._custom_ops import permute_cols
9 |
10 |
11 | @pytest.mark.parametrize('shape', [(1, 512), (544, 4096), (67, 8192)])
12 | @pytest.mark.parametrize('dtype', [torch.bfloat16, torch.float16])
13 | def test_permute_cols(shape, dtype):
14 | x = torch.randn(shape, dtype=dtype).cuda()
15 | perm = torch.randperm(x.shape[1]).to(torch.int).cuda()
16 | opcheck(torch.ops._C.permute_cols, (x, perm))
17 | y = permute_cols(x, perm)
18 | torch.testing.assert_close(y, x[:, perm])
--------------------------------------------------------------------------------
/tests/kernels/allclose_default.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | import torch
5 |
6 | # Reference default values of atol and rtol are from
7 | # https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67
8 | default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5}
9 | default_rtol = {
10 | torch.float16: 1e-3,
11 | torch.bfloat16: 1.6e-2,
12 | torch.float: 1.3e-6
13 | }
14 |
15 |
16 | def get_default_atol(output) -> float:
17 | return default_atol[output.dtype]
18 |
19 |
20 | def get_default_rtol(output) -> float:
21 | return default_rtol[output.dtype]
22 |
--------------------------------------------------------------------------------
/.github/workflows/scripts/create_release.js:
--------------------------------------------------------------------------------
1 | // Uses GitHub's API to create the release and wait for the result.
2 | // We use a JS script because the GitHub CLI doesn't provide a way to wait for the release's creation; it returns immediately.
3 |
4 | module.exports = async (github, context, core) => {
5 | try {
6 | const response = await github.rest.repos.createRelease({
7 | draft: false,
8 | generate_release_notes: true,
9 | name: process.env.RELEASE_TAG,
10 | owner: context.repo.owner,
11 | prerelease: true,
12 | repo: context.repo.repo,
13 | tag_name: process.env.RELEASE_TAG,
14 | });
15 |
16 | core.setOutput('upload_url', response.data.upload_url);
17 | } catch (error) {
18 | core.setFailed(error.message);
19 | }
20 | }
--------------------------------------------------------------------------------
/tests/evals/gpt_oss/conftest.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | """
4 | Pytest configuration for GPT-OSS evaluation tests.
5 | """
6 |
7 |
8 | def pytest_addoption(parser):
9 | """Add command line options for pytest."""
10 | parser.addoption("--model", action="store", help="Model name to evaluate")
11 | parser.addoption("--metric",
12 | action="store",
13 | type=float,
14 | help="Expected metric threshold")
15 | parser.addoption("--server-args",
16 | action="store",
17 | default="",
18 | help="Additional server arguments")
19 |
--------------------------------------------------------------------------------
/vllm/transformers_utils/processors/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | """
4 | Multi-modal processors may be defined in this directory for the following
5 | reasons:
6 |
7 | - There is no processor defined on the HF Hub or in the Transformers library.
8 | - The existing processor needs to be overridden to support vLLM.
9 | """
10 |
11 | from vllm.transformers_utils.processors.deepseek_vl2 import (
12 | DeepseekVLV2Processor)
13 | from vllm.transformers_utils.processors.ovis import OvisProcessor
14 | from vllm.transformers_utils.processors.ovis2_5 import Ovis2_5Processor
15 |
16 | __all__ = ["DeepseekVLV2Processor", "OvisProcessor", "Ovis2_5Processor"]
17 |
--------------------------------------------------------------------------------
/benchmarks/multi_turn/bench_utils.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import logging
4 | from enum import Enum
5 |
6 |
7 | class Color(Enum):
8 | RED = "\033[91m"
9 | GREEN = "\033[92m"
10 | BLUE = "\033[94m"
11 | PURPLE = "\033[95m"
12 | CYAN = "\033[96m"
13 | YELLOW = "\033[93m"
14 | RESET = "\033[0m"
15 |
16 | def __str__(self):
17 | return self.value
18 |
19 |
20 | TEXT_SEPARATOR = "-" * 100
21 |
22 | # Configure the logger
23 | logging.basicConfig(
24 | level=logging.INFO,
25 | format="%(asctime)s [%(levelname)s] - %(message)s",
26 | datefmt="%d-%m-%Y %H:%M:%S",
27 | )
28 | logger = logging.getLogger(__name__)
29 |
--------------------------------------------------------------------------------
/examples/offline_inference/disaggregated-prefill-v1/README.md:
--------------------------------------------------------------------------------
1 | # Disaggregated Prefill V1
2 |
3 | This example contains scripts that demonstrate disaggregated prefill in the offline setting of vLLM.
4 |
5 | ## Files
6 |
7 | - `run.sh` - A helper script that will run `prefill_example.py` and `decode_example.py` sequentially.
8 | - Make sure you are in the `examples/offline_inference/disaggregated-prefill-v1` directory before running `run.sh`.
9 | - `prefill_example.py` - A script which performs prefill only, saving the KV state to the `local_storage` directory and the prompts to `output.txt`.
10 | - `decode_example.py` - A script which performs decode only, loading the KV state from the `local_storage` directory and the prompts from `output.txt`.
11 |
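12 | ## Example usage
13 |
14 | A minimal sketch of the end-to-end flow, run from the repository root and using only the files listed above:
15 |
16 | ```bash
17 | cd examples/offline_inference/disaggregated-prefill-v1
18 | bash run.sh  # runs prefill_example.py, then decode_example.py
19 | ```
20 |
21 | After the run, the saved KV state is under `local_storage/` and the prompts are in `output.txt`.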
--------------------------------------------------------------------------------
/use_existing_torch.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | import glob
5 |
6 | requires_files = glob.glob('requirements/*.txt')
7 | requires_files += ["pyproject.toml"]
8 | for file in requires_files:
9 | print(f">>> cleaning {file}")
10 | with open(file) as f:
11 | lines = f.readlines()
12 | if "torch" in "".join(lines).lower():
13 | print("removed:")
14 | with open(file, 'w') as f:
15 | for line in lines:
16 | if 'torch' not in line.lower():
17 | f.write(line)
18 | else:
19 | print(line.strip())
20 | print(f"<<< done cleaning {file}")
21 | print()
--------------------------------------------------------------------------------
/vllm/model_executor/layers/fla/ops/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | # SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
4 | #
5 | # This file contains code copied from the flash-linear-attention project.
6 | # The original source code was licensed under the MIT license and included
7 | # the following copyright notice:
8 | # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
9 | from .chunk import chunk_gated_delta_rule
10 | from .fused_recurrent import fused_recurrent_gated_delta_rule
11 | from .layernorm_guard import RMSNormGated
12 |
13 | __all__ = [
14 | "RMSNormGated",
15 | "chunk_gated_delta_rule",
16 | "fused_recurrent_gated_delta_rule",
17 | ]
18 |
--------------------------------------------------------------------------------
/vllm/entrypoints/cli/benchmark/serve.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import argparse
4 |
5 | from vllm.benchmarks.serve import add_cli_args, main
6 | from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
7 |
8 |
9 | class BenchmarkServingSubcommand(BenchmarkSubcommandBase):
10 | """ The `serve` subcommand for vllm bench. """
11 |
12 | name = "serve"
13 | help = "Benchmark the online serving throughput."
14 |
15 | @classmethod
16 | def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
17 | add_cli_args(parser)
18 |
19 | @staticmethod
20 | def cmd(args: argparse.Namespace) -> None:
21 | main(args)
22 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/mxfp8_utils.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | import torch
5 |
6 | from vllm.logger import init_logger
7 |
8 | logger = init_logger(__name__)
9 |
10 |
11 | def mxfp8_quantize(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
12 |
13 | try:
14 | from flashinfer import mxfp8_quantize
15 | except ImportError as err:
16 | raise ImportError("The package `flashinfer` is required to do "
17 | "MX-FP8 quantization. Please install it with "
18 | "`pip install flashinfer`") from err
19 |
20 | return mxfp8_quantize(x, is_sf_swizzled_layout=False)
21 |
--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
1 | BasedOnStyle: Google
2 | UseTab: Never
3 | IndentWidth: 2
4 | ColumnLimit: 80
5 |
6 | # Force pointers to the type for C++.
7 | DerivePointerAlignment: false
8 | PointerAlignment: Left
9 |
10 | # Reordering #include statements can (and currently will) introduce errors
11 | SortIncludes: false
12 |
13 | # Style choices
14 | AlignConsecutiveAssignments: false
15 | AlignConsecutiveDeclarations: false
16 | IndentPPDirectives: BeforeHash
17 |
18 | IncludeCategories:
19 | - Regex: '^<'
20 | Priority: 4
21 | - Regex: '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/'
22 | Priority: 3
23 | - Regex: '^"(qoda|\.\.)/'
24 | Priority: 2
25 | - Regex: '.*'
26 | Priority: 1
27 |
--------------------------------------------------------------------------------
/examples/template_vlm2vec.jinja:
--------------------------------------------------------------------------------
1 | {%- if messages | length > 1 -%}
2 | {{ raise_exception('Embedding models should only embed one message at a time') }}
3 | {%- endif -%}
4 |
5 | {% set vars = namespace(parts=[], next_image_id=1) %}
6 | {%- for message in messages -%}
7 | {%- for content in message['content'] -%}
8 | {%- if content['type'] == 'text' -%}
9 | {%- set vars.parts = vars.parts + [content['text']] %}
10 | {%- elif content['type'] == 'image' -%}
11 | {%- set vars.parts = vars.parts + ['<|image_{i:d}|>'.format(i=vars.next_image_id)] %}
12 | {%- set vars.next_image_id = vars.next_image_id + 1 %}
13 | {%- endif -%}
14 | {%- endfor -%}
15 | {%- endfor -%}
16 | {{ vars.parts | join(' ') }}
17 |
--------------------------------------------------------------------------------
/vllm/attention/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.attention.backends.abstract import (AttentionBackend,
5 | AttentionMetadata,
6 | AttentionMetadataBuilder,
7 | AttentionState, AttentionType)
8 | from vllm.attention.layer import Attention
9 | from vllm.attention.selector import get_attn_backend
10 |
11 | __all__ = [
12 | "Attention",
13 | "AttentionBackend",
14 | "AttentionMetadata",
15 | "AttentionType",
16 | "AttentionMetadataBuilder",
17 | "AttentionState",
18 | "get_attn_backend",
19 | ]
20 |
--------------------------------------------------------------------------------
/tests/models/language/generation_ppl_test/test_qwen.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | import pytest
5 |
6 | from tests.models.utils import GenerateModelInfo
7 |
8 | from .ppl_utils import wikitext_ppl_test
9 |
10 | MODELS = [
11 | GenerateModelInfo("Qwen/Qwen3-0.6B"),
12 | GenerateModelInfo("Qwen/Qwen3-0.6B-FP8"),
13 | # transformers:
14 | # Loading a GPTQ quantized model requires optimum, gptqmodel
15 | # GenerateModelInfo("Qwen/Qwen3-0.6B-GPTQ-Int8"),
16 | ]
17 |
18 |
19 | @pytest.mark.parametrize("model_info", MODELS)
20 | def test_ppl(hf_runner, vllm_runner, model_info: GenerateModelInfo):
21 | wikitext_ppl_test(hf_runner, vllm_runner, model_info)
22 |
--------------------------------------------------------------------------------
/tests/quantization/utils.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.model_executor.layers.quantization import get_quantization_config
5 | from vllm.platforms import current_platform
6 |
7 |
8 | def is_quant_method_supported(quant_method: str) -> bool:
9 | # Currently, all quantization methods require Nvidia or AMD GPUs
10 | if not (current_platform.is_cuda() or current_platform.is_rocm()):
11 | return False
12 |
13 | capability = current_platform.get_device_capability()
14 | assert capability is not None
15 |
16 | min_capability = get_quantization_config(quant_method).get_min_capability()
17 |
18 | return capability.to_int() >= min_capability
19 |
--------------------------------------------------------------------------------
/.github/scale-config.yml:
--------------------------------------------------------------------------------
1 | # scale-config.yml:
2 | # Powers what instance types are available for GHA auto-scaled
3 | # runners. Runners listed here will be available as self hosted
4 | # runners, configuration is directly pulled from the main branch.
5 | # runner_types:
6 | # runner_label:
7 | # instance_type: m4.large
8 | # os: linux
9 | # # min_available defaults to the global cfg in the ALI Terraform
10 | # min_available: undefined
11 | # # when max_available value is not defined, no max runners is enforced
12 | # max_available: undefined
13 | # disk_size: 50
14 | # is_ephemeral: true
15 |
16 | runner_types:
17 | linux.2xlarge:
18 | disk_size: 150
19 | instance_type: c5.2xlarge
20 | is_ephemeral: true
21 | os: linux
22 |
--------------------------------------------------------------------------------
/vllm/entrypoints/cli/benchmark/latency.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import argparse
4 |
5 | from vllm.benchmarks.latency import add_cli_args, main
6 | from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
7 |
8 |
9 | class BenchmarkLatencySubcommand(BenchmarkSubcommandBase):
10 | """ The `latency` subcommand for vllm bench. """
11 |
12 | name = "latency"
13 | help = "Benchmark the latency of a single batch of requests."
14 |
15 | @classmethod
16 | def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
17 | add_cli_args(parser)
18 |
19 | @staticmethod
20 | def cmd(args: argparse.Namespace) -> None:
21 | main(args)
22 |
--------------------------------------------------------------------------------
/vllm/entrypoints/cli/benchmark/throughput.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import argparse
4 |
5 | from vllm.benchmarks.throughput import add_cli_args, main
6 | from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
7 |
8 |
9 | class BenchmarkThroughputSubcommand(BenchmarkSubcommandBase):
10 | """ The `throughput` subcommand for vllm bench. """
11 |
12 | name = "throughput"
13 | help = "Benchmark offline inference throughput."
14 |
15 | @classmethod
16 | def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
17 | add_cli_args(parser)
18 |
19 | @staticmethod
20 | def cmd(args: argparse.Namespace) -> None:
21 | main(args)
22 |
--------------------------------------------------------------------------------
/.github/workflows/scripts/cuda-install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Replace '.' with '-' ex: 11.8 -> 11-8
4 | cuda_version=$(echo "$1" | tr "." "-")
5 | # Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004
6 | OS=$(echo "$2" | tr -d ".\-")
7 |
8 | # Installs CUDA
9 | wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb"
10 | sudo dpkg -i cuda-keyring_1.1-1_all.deb
11 | rm cuda-keyring_1.1-1_all.deb
12 | sudo apt -qq update
13 | sudo apt -y install "cuda-${cuda_version}" "cuda-nvcc-${cuda_version}" "cuda-libraries-dev-${cuda_version}"
14 | sudo apt clean
15 |
16 | # Test nvcc
17 | PATH=/usr/local/cuda-$1/bin:${PATH}
18 | nvcc --version
19 |
20 | # Log gcc, g++, c++ versions
21 | gcc --version
22 | g++ --version
23 | c++ --version
24 |
--------------------------------------------------------------------------------
/requirements/cuda.txt:
--------------------------------------------------------------------------------
1 | # Common dependencies
2 | -r common.txt
3 |
4 | numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
5 | numba == 0.61.2; python_version > '3.9'
6 |
7 | # Dependencies for NVIDIA GPUs
8 | ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
9 | torch==2.8.0
10 | torchaudio==2.8.0
11 | # These must be updated alongside torch
12 | torchvision==0.23.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
13 | # https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1
14 | xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.8
15 |
--------------------------------------------------------------------------------
/tests/entrypoints/openai/conftest.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import pytest
4 |
5 | from vllm.assets.audio import AudioAsset
6 |
7 |
8 | @pytest.fixture
9 | def mary_had_lamb():
10 | path = AudioAsset('mary_had_lamb').get_local_path()
11 | with open(str(path), "rb") as f:
12 | yield f
13 |
14 |
15 | @pytest.fixture
16 | def winning_call():
17 | path = AudioAsset('winning_call').get_local_path()
18 | with open(str(path), "rb") as f:
19 | yield f
20 |
21 |
22 | @pytest.fixture
23 | def foscolo():
24 | # Test translation it->en
25 | path = AudioAsset('azacinto_foscolo').get_local_path()
26 | with open(str(path), "rb") as f:
27 | yield f
28 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 64,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 64,
7 | "num_warps": 4,
8 | "num_stages": 3
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 64,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 32,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 32,
23 | "num_warps": 4,
24 | "num_stages": 3
25 | }
26 | }
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 64,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 32,
7 | "num_warps": 4,
8 | "num_stages": 3
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 64,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 1,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 64,
23 | "num_warps": 4,
24 | "num_stages": 3
25 | }
26 | }
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 128,
4 | "BLOCK_SIZE_N": 64,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 1,
7 | "num_warps": 4,
8 | "num_stages": 3
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 64,
15 | "num_warps": 4,
16 | "num_stages": 2
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 64,
23 | "num_warps": 4,
24 | "num_stages": 3
25 | }
26 | }
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 64,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 32,
7 | "num_warps": 4,
8 | "num_stages": 3
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 64,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 1,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 16,
23 | "num_warps": 4,
24 | "num_stages": 3
25 | }
26 | }
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 64,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 64,
7 | "num_warps": 4,
8 | "num_stages": 4
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 64,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 1,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 1,
23 | "num_warps": 8,
24 | "num_stages": 5
25 | }
26 | }
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 1,
7 | "num_warps": 4,
8 | "num_stages": 3
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 1,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 1,
23 | "num_warps": 4,
24 | "num_stages": 3
25 | }
26 | }
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 64,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 64,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 1,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 64,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 64,
23 | "num_warps": 4,
24 | "num_stages": 3
25 | }
26 | }
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 16,
7 | "num_warps": 4,
8 | "num_stages": 3
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 1,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 1,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 1,
7 | "num_warps": 4,
8 | "num_stages": 3
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 1,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 64,
23 | "num_warps": 4,
24 | "num_stages": 3
25 | }
26 | }
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 16,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 64,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 16,
15 | "num_warps": 4,
16 | "num_stages": 2
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 128,
20 | "BLOCK_SIZE_N": 64,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 1,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
--------------------------------------------------------------------------------
/requirements/rocm-test.txt:
--------------------------------------------------------------------------------
1 | # Common dependencies
2 | -r common.txt
3 | tblib==3.1.0
4 |
5 | # entrypoints test
6 | # librosa==0.10.2.post1 # required by audio tests in entrypoints/openai
7 | audioread==3.0.1
8 | cffi==1.17.1
9 | decorator==5.2.1
10 | lazy-loader==0.4
11 | platformdirs==4.3.6
12 | pooch==1.8.2
13 | #pycparse==2.22
14 | soundfile==0.13.1
15 | soxr==0.5.0.post1
16 | librosa==0.10.2.post1
17 |
18 | # entrypoints test
19 | #vllm[video] # required by entrypoints/openai/test_video.py
20 | decord==0.6.0
21 |
22 | # entrypoints test
23 | #sentence-transformers # required by entrypoints/openai/test_score.py
24 | sentence-transformers==3.4.1
25 |
26 | # Basic Models Test
27 | matplotlib==3.10.3
28 |
29 | # Multi-Modal Models Test (Extended) 3
30 | blobfile==3.0.0
31 |
32 |
33 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 64,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 1,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 64,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 1,
15 | "num_warps": 4,
16 | "num_stages": 2
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 64,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 1,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 32,
7 | "num_warps": 4,
8 | "num_stages": 3
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 32,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 32,
23 | "num_warps": 4,
24 | "num_stages": 3
25 | }
26 | }
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 64,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 1,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 128,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 64,
14 | "GROUP_SIZE_M": 1,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 128,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 64,
22 | "GROUP_SIZE_M": 1,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 64,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 1,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 128,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 64,
14 | "GROUP_SIZE_M": 1,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 128,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 64,
22 | "GROUP_SIZE_M": 1,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 128,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 64,
6 | "GROUP_SIZE_M": 1,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 128,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 64,
14 | "GROUP_SIZE_M": 1,
15 | "num_warps": 4,
16 | "num_stages": 2
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 128,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 64,
22 | "GROUP_SIZE_M": 1,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 128,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 64,
6 | "GROUP_SIZE_M": 64,
7 | "num_warps": 4,
8 | "num_stages": 3
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 128,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 64,
14 | "GROUP_SIZE_M": 1,
15 | "num_warps": 4,
16 | "num_stages": 2
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 128,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 64,
22 | "GROUP_SIZE_M": 1,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 64,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 32,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 64,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 16,
15 | "num_warps": 4,
16 | "num_stages": 2
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 64,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 1,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 16,
7 | "num_warps": 4,
8 | "num_stages": 3
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 32,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 32,
23 | "num_warps": 4,
24 | "num_stages": 3
25 | }
26 | }
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 64,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 128,
6 | "GROUP_SIZE_M": 16,
7 | "num_warps": 4,
8 | "num_stages": 3
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 64,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 128,
14 | "GROUP_SIZE_M": 32,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 64,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 128,
22 | "GROUP_SIZE_M": 64,
23 | "num_warps": 4,
24 | "num_stages": 3
25 | }
26 | }
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 128,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 64,
6 | "GROUP_SIZE_M": 64,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 128,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 64,
14 | "GROUP_SIZE_M": 1,
15 | "num_warps": 4,
16 | "num_stages": 2
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 128,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 64,
22 | "GROUP_SIZE_M": 1,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 128,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 64,
6 | "GROUP_SIZE_M": 16,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 128,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 64,
14 | "GROUP_SIZE_M": 64,
15 | "num_warps": 4,
16 | "num_stages": 2
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 128,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 64,
22 | "GROUP_SIZE_M": 64,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 128,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 64,
6 | "GROUP_SIZE_M": 32,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 128,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 64,
14 | "GROUP_SIZE_M": 64,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 128,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 64,
22 | "GROUP_SIZE_M": 64,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 128,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 64,
6 | "GROUP_SIZE_M": 64,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 128,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 64,
14 | "GROUP_SIZE_M": 64,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 128,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 64,
22 | "GROUP_SIZE_M": 64,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 128,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 64,
6 | "GROUP_SIZE_M": 64,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 128,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 64,
14 | "GROUP_SIZE_M": 64,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 128,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 64,
22 | "GROUP_SIZE_M": 64,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 128,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 64,
6 | "GROUP_SIZE_M": 64,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 128,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 64,
14 | "GROUP_SIZE_M": 64,
15 | "num_warps": 4,
16 | "num_stages": 2
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 128,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 64,
22 | "GROUP_SIZE_M": 1,
23 | "num_warps": 4,
24 | "num_stages": 2
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json:
--------------------------------------------------------------------------------
1 | {
2 | "2048": {
3 | "BLOCK_SIZE_M": 128,
4 | "BLOCK_SIZE_N": 128,
5 | "BLOCK_SIZE_K": 64,
6 | "GROUP_SIZE_M": 64,
7 | "num_warps": 4,
8 | "num_stages": 2
9 | },
10 | "3072": {
11 | "BLOCK_SIZE_M": 128,
12 | "BLOCK_SIZE_N": 128,
13 | "BLOCK_SIZE_K": 64,
14 | "GROUP_SIZE_M": 32,
15 | "num_warps": 4,
16 | "num_stages": 3
17 | },
18 | "4096": {
19 | "BLOCK_SIZE_M": 128,
20 | "BLOCK_SIZE_N": 128,
21 | "BLOCK_SIZE_K": 64,
22 | "GROUP_SIZE_M": 64,
23 | "num_warps": 4,
24 | "num_stages": 3
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/tests/fastsafetensors_loader/test_fastsafetensors_loader.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm import SamplingParams
5 |
6 | test_model = "openai-community/gpt2"
7 |
8 | prompts = [
9 | "Hello, my name is",
10 | "The president of the United States is",
11 | "The capital of France is",
12 | "The future of AI is",
13 | ]
14 | # Create a sampling params object.
15 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0)
16 |
17 |
18 | def test_model_loader_download_files(vllm_runner):
19 | with vllm_runner(test_model, load_format="fastsafetensors") as llm:
20 | deserialized_outputs = llm.generate(prompts, sampling_params)
21 | assert deserialized_outputs
22 |
--------------------------------------------------------------------------------
/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_custom_ops.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | import torch
5 |
6 | from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
7 |
8 |
9 | # Register DummyRotaryEmbedding as an out-of-tree implementation of RotaryEmbedding.
10 | @RotaryEmbedding.register_oot
11 | class DummyRotaryEmbedding(RotaryEmbedding):
12 | """Dummy rotary positional embedding used by the dummy platform plugin."""
13 |
14 | def __init__(self, *args, **kwargs):
15 | super().__init__(*args, **kwargs)
16 | self.addition_config = True
17 |
18 | def forward_oot(self, *args,
19 | **kwargs) -> tuple[torch.Tensor, torch.Tensor]:
20 | return super().forward_oot(*args, **kwargs)
21 |
--------------------------------------------------------------------------------
/.github/workflows/scripts/pytorch-install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python_executable=python$1
4 | pytorch_version=$2
5 | cuda_version=$3
6 |
7 | # Install torch
8 | $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya
9 | $python_executable -m pip install torch=="${pytorch_version}+cu${cuda_version//./}" --extra-index-url "https://download.pytorch.org/whl/cu${cuda_version//./}"
10 |
11 | # Print version information
12 | $python_executable --version
13 | $python_executable -c "import torch; print('PyTorch:', torch.__version__)"
14 | $python_executable -c "import torch; print('CUDA:', torch.version.cuda)"
15 | $python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)"
16 |
--------------------------------------------------------------------------------
/examples/template_alpaca.jinja:
--------------------------------------------------------------------------------
1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
2 |
3 | {% for message in messages %}
4 | {% if message['role'] == 'user' %}
5 | ### Instruction:
6 | {{ message['content']|trim -}}
7 | {% if not loop.last %}
8 |
9 |
10 | {% endif %}
11 | {% elif message['role'] == 'assistant' %}
12 | ### Response:
13 | {{ message['content']|trim -}}
14 | {% if not loop.last %}
15 |
16 |
17 | {% endif %}
18 | {% elif message['role'] == 'user_context' %}
19 | ### Input:
20 | {{ message['content']|trim -}}
21 | {% if not loop.last %}
22 |
23 |
24 | {% endif %}
25 | {% endif %}
26 | {% endfor %}
27 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
28 | ### Response:
29 | {% endif %}
--------------------------------------------------------------------------------
/vllm/entrypoints/cli/benchmark/base.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import argparse
4 |
5 | from vllm.entrypoints.cli.types import CLISubcommand
6 |
7 |
8 | class BenchmarkSubcommandBase(CLISubcommand):
9 | """ The base class of subcommands for vllm bench. """
10 |
11 | help: str
12 |
13 | @classmethod
14 | def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
15 | """Add the CLI arguments to the parser."""
16 | raise NotImplementedError
17 |
18 | @staticmethod
19 | def cmd(args: argparse.Namespace) -> None:
20 | """Run the benchmark.
21 |
22 | Args:
23 | args: The arguments to the command.
24 | """
25 | raise NotImplementedError
26 |
--------------------------------------------------------------------------------
/csrc/attention/dtype_fp8.cuh:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "attention_generic.cuh"
4 |
5 | #include <stdint.h>
6 | #ifdef ENABLE_FP8
7 | #ifndef USE_ROCM
8 | #include <cuda_fp8.h>
9 | #endif // USE_ROCM
10 | #endif // ENABLE_FP8
11 |
12 | namespace vllm {
13 |
14 | enum class Fp8KVCacheDataType {
15 | kAuto = 0,
16 | kFp8E4M3 = 1,
17 | kFp8E5M2 = 2,
18 | };
19 |
20 | // fp8 vector types for quantization of kv cache
21 | template <>
22 | struct Vec<uint8_t, 1> {
23 | using Type = uint8_t;
24 | };
25 |
26 | template <>
27 | struct Vec<uint8_t, 2> {
28 | using Type = uint16_t;
29 | };
30 |
31 | template <>
32 | struct Vec<uint8_t, 4> {
33 | using Type = uint32_t;
34 | };
35 |
36 | template <>
37 | struct Vec<uint8_t, 8> {
38 | using Type = uint2;
39 | };
40 |
41 | } // namespace vllm
42 |
--------------------------------------------------------------------------------
/tests/prompts/example.txt:
--------------------------------------------------------------------------------
1 | vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.
2 | Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.
3 | Compare and contrast artificial intelligence with human intelligence in terms of processing information.
4 | Describe the basic components of a neural network and how it can be trained.
5 | Write a short story about a robot that dreams for the first time.
6 | Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.
7 | Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.
8 | Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.'
9 |
--------------------------------------------------------------------------------
/vllm/v1/engine/exceptions.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | class EngineGenerateError(Exception):
4 | """Raised when an AsyncLLM.generate() call fails. Recoverable."""
5 | pass
6 |
7 |
8 | class EngineDeadError(Exception):
9 | """Raised when the EngineCore dies. Unrecoverable."""
10 |
11 | def __init__(self, *args, suppress_context: bool = False, **kwargs):
12 | ENGINE_DEAD_MESSAGE = "EngineCore encountered an issue. See stack trace (above) for the root cause." # noqa: E501
13 |
14 | super().__init__(ENGINE_DEAD_MESSAGE, *args, **kwargs)
15 | # Make stack trace clearer when using with LLMEngine by
16 | # silencing irrelevant ZMQError.
17 | self.__suppress_context__ = suppress_context
18 |
--------------------------------------------------------------------------------
/docs/deployment/integrations/kubeai.md:
--------------------------------------------------------------------------------
1 | # KubeAI
2 |
3 | [KubeAI](https://github.com/substratusai/kubeai) is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load-based autoscaling, model caching, and much more is provided out of the box with zero external dependencies.
4 |
5 | Please see the Installation Guides for environment-specific instructions:
6 |
7 | - [Any Kubernetes Cluster](https://www.kubeai.org/installation/any/)
8 | - [EKS](https://www.kubeai.org/installation/eks/)
9 | - [GKE](https://www.kubeai.org/installation/gke/)
10 |
11 | Once you have KubeAI installed, you can
12 | [configure text generation models](https://www.kubeai.org/how-to/configure-text-generation-models/)
13 | using vLLM.
14 |
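15 | For reference, a typical Helm-based install looks roughly like the commands below. The chart location and release name here follow the KubeAI documentation and may change, so treat this as an illustrative sketch and rely on the linked guides for your environment:
16 |
17 | ```bash
18 | helm repo add kubeai https://www.kubeai.org
19 | helm repo update
20 | helm install kubeai kubeai/kubeai --wait
21 | ```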
--------------------------------------------------------------------------------
/docs/mkdocs/javascript/run_llm_widget.js:
--------------------------------------------------------------------------------
1 | // Add RunLLM widget
2 | document.addEventListener("DOMContentLoaded", function () {
3 | var script = document.createElement("script");
4 | script.type = "module";
5 | script.id = "runllm-widget-script";
6 |
7 | script.src = "https://widget.runllm.com";
8 |
9 | script.setAttribute("version", "stable");
10 | script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget.
11 | script.setAttribute("runllm-name", "vLLM");
12 | script.setAttribute("runllm-position", "BOTTOM_RIGHT");
13 | script.setAttribute("runllm-position-y", "120px");
14 | script.setAttribute("runllm-position-x", "20px");
15 | script.setAttribute("runllm-assistant-id", "207");
16 |
17 | script.async = true;
18 | document.head.appendChild(script);
19 | });
20 |
--------------------------------------------------------------------------------
/vllm/model_executor/layers/attention_layer_base.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | """Base class for attention-like layers."""
4 | from abc import ABC, abstractmethod
5 | from typing import TYPE_CHECKING
6 |
7 | if TYPE_CHECKING:
8 | from vllm.attention.backends.abstract import AttentionBackend
9 |
10 |
11 | class AttentionLayerBase(ABC):
12 | """
13 | Base class for attention-like layers (Attention, Mamba, etc.)
14 | that support the v1 engine.
15 |
16 | This provides a common interface for getting attention backends
17 | from different layer types.
18 | """
19 |
20 | @abstractmethod
21 | def get_attn_backend(self) -> type["AttentionBackend"]:
22 | """Get the attention backend class for this layer."""
23 | pass
24 |
--------------------------------------------------------------------------------
/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from typing import Optional
5 |
6 | import torch
7 |
8 | from vllm.model_executor.models.opt import OPTForCausalLM
9 | from vllm.model_executor.sampling_metadata import SamplingMetadata
10 |
11 |
12 | class MyOPTForCausalLM(OPTForCausalLM):
13 |
14 | def compute_logits(
15 | self, hidden_states: torch.Tensor,
16 | sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
17 | # this dummy model always predicts the first token
18 | logits = super().compute_logits(hidden_states, sampling_metadata)
19 | if logits is not None:
20 | logits.zero_()
21 | logits[:, 0] += 1.0
22 | return logits
23 |
--------------------------------------------------------------------------------
/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
3 | if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
4 | URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
5 | else
6 | URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
7 | fi
8 |
9 | TIMEOUT_SECONDS=10
10 |
11 | retries=0
12 | while [ $retries -lt 1000 ]; do
13 | if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
14 | exit 0
15 | fi
16 |
17 | echo "Waiting for image to be available..."
18 |
19 | retries=$((retries + 1))
20 | sleep 5
21 | done
22 |
23 | exit 1
24 |
--------------------------------------------------------------------------------
/tests/tokenization/test_tokenizer.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | import pytest
5 | from transformers import PreTrainedTokenizerBase
6 |
7 | from vllm.transformers_utils.tokenizer import get_tokenizer
8 |
9 | TOKENIZER_NAMES = [
10 | "facebook/opt-125m",
11 | "gpt2",
12 | ]
13 |
14 |
15 | @pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES)
16 | def test_tokenizer_revision(tokenizer_name: str):
17 |     # Assume that the "main" branch always exists
18 | tokenizer = get_tokenizer(tokenizer_name, revision="main")
19 | assert isinstance(tokenizer, PreTrainedTokenizerBase)
20 |
21 |     # Assume that the "never" branch does not exist
22 | with pytest.raises(OSError, match='not a valid git identifier'):
23 | get_tokenizer(tokenizer_name, revision="never")
24 |
--------------------------------------------------------------------------------
/docs/mkdocs/overrides/partials/toc-item.html:
--------------------------------------------------------------------------------
[Markup stripped during extraction: this override partial renders {{ toc_item.title }} as a table-of-contents entry and, within {% if toc_item.children %} ... {% endif %}, recursively renders the child items.]
--------------------------------------------------------------------------------
/tests/standalone_tests/python_only_compile.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # This script tests if the python only compilation works correctly
3 | # for users who do not have any compilers installed on their system
4 |
5 | set -e
6 | set -x
7 |
8 | cd /vllm-workspace/
9 |
10 | # uninstall vllm
11 | pip3 uninstall -y vllm
12 | # restore the original files
13 | mv src/vllm ./vllm
14 |
15 | # remove all compilers
16 | apt remove --purge build-essential -y
17 | apt autoremove -y
18 |
19 | echo 'import os; os.system("touch /tmp/changed.file")' >> vllm/__init__.py
20 |
21 | VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL=1 VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e .
22 |
23 | # Check that vllm can be imported
24 | python3 -c 'import vllm'
25 |
26 | # Check that the sentinel file created by the patched __init__.py exists
27 | if [ ! -f /tmp/changed.file ]; then
28 | echo "changed.file was not created, python only compilation failed"
29 | exit 1
30 | fi
31 |
--------------------------------------------------------------------------------
/vllm/env_override.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import os
4 |
5 | import torch
6 |
7 | from vllm.logger import init_logger
8 |
9 | logger = init_logger(__name__)
10 |
11 | # set some common config/environment variables that should be set
12 | # for all processes created by vllm and all processes
13 | # that interact with vllm workers.
14 | # These overrides are applied whenever `import vllm` is called.
15 |
16 | # see https://github.com/vllm-project/vllm/pull/15951
17 | # it avoids unintentional cuda initialization from torch.cuda.is_available()
18 | os.environ['PYTORCH_NVML_BASED_CUDA_CHECK'] = '1'
19 |
20 | # see https://github.com/vllm-project/vllm/issues/10480
21 | os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1'
22 | # see https://github.com/vllm-project/vllm/issues/10619
23 | torch._inductor.config.compile_threads = 1
24 |
--------------------------------------------------------------------------------
/docs/mkdocs/hooks/remove_announcement.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import os
4 | from pathlib import Path
5 | from typing import Literal
6 |
7 |
8 | def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
9 | # see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa
10 | if os.getenv('READTHEDOCS_VERSION_TYPE') == "tag":
11 | # remove the warning banner if the version is a tagged release
12 | mkdocs_dir = Path(__file__).parent.parent
13 | announcement_path = mkdocs_dir / "overrides/main.html"
14 | # The file might be removed already if the build is triggered multiple
15 |         # times (Read the Docs builds both HTML and PDF versions separately)
16 | if announcement_path.exists():
17 | os.remove(announcement_path)
18 |
--------------------------------------------------------------------------------
/vllm/lora/punica_wrapper/punica_selector.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from vllm.logger import init_logger
5 | from vllm.platforms import current_platform
6 | from vllm.utils import resolve_obj_by_qualname
7 |
8 | from .punica_base import PunicaWrapperBase
9 |
10 | logger = init_logger(__name__)
11 |
12 |
13 | def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase:
14 | punica_wrapper_qualname = current_platform.get_punica_wrapper()
15 | punica_wrapper_cls = resolve_obj_by_qualname(punica_wrapper_qualname)
16 | punica_wrapper = punica_wrapper_cls(*args, **kwargs)
17 |     assert punica_wrapper is not None, (
18 |         f"Invalid punica wrapper qualname: {punica_wrapper_qualname}")
19 | logger.info_once("Using %s.", punica_wrapper_qualname.rsplit(".", 1)[1])
20 | return punica_wrapper
21 |
--------------------------------------------------------------------------------
/csrc/core/math.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include <climits>
4 | #include <cstdint>
5 |
6 | inline constexpr uint32_t next_pow_2(uint32_t const num) {
7 | if (num <= 1) return num;
8 | return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
9 | }
10 |
11 | template <typename A, typename B>
12 | static inline constexpr auto div_ceil(A a, B b) {
13 | return (a + b - 1) / b;
14 | }
15 |
16 | // Round a down to the next multiple of b. The caller is responsible for making
17 | // sure that b is non-zero
18 | template <typename T>
19 | inline constexpr T round_to_previous_multiple_of(T a, T b) {
20 | return a % b == 0 ? a : (a / b) * b;
21 | }
22 |
23 | // Round a up to the next multiple of b. The caller is responsible for making
24 | // sure that b is non-zero
25 | template <typename T>
26 | inline constexpr T round_to_next_multiple_of(T a, T b) {
27 | return a % b == 0 ? a : ((a / b) + 1) * b;
28 | }
29 |
--------------------------------------------------------------------------------
/vllm/plugins/lora_resolvers/README.md:
--------------------------------------------------------------------------------
1 | # LoRA Resolver Plugins
2 |
3 | This directory contains vLLM general plugins for dynamically discovering and loading LoRA adapters
4 | via the LoRAResolver plugin framework.
5 |
6 | Note that `VLLM_ALLOW_RUNTIME_LORA_UPDATING` must be set to true to allow LoRA resolver plugins
7 | to work, and `VLLM_PLUGINS` must be set to include the desired resolver plugins.
8 |
9 | ## lora_filesystem_resolver
10 |
11 | This LoRA Resolver is installed with vLLM by default.
12 | To use, set `VLLM_PLUGIN_LORA_CACHE_DIR` to a local directory. When vLLM receives a request
13 | for a LoRA adapter `foobar` that it does not currently recognize, it will look in that local directory
14 | for a subdirectory `foobar` containing a LoRA adapter. If such an adapter exists, it will
15 | load that adapter and then service the request as normal. The adapter then remains available
16 | for future requests.
17 |
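18 | For example, here is a minimal setup sketch. The cache directory, the served model, and the
19 | plugin name passed to `VLLM_PLUGINS` are illustrative assumptions, not required values:
20 |
21 | ```bash
22 | # Hypothetical layout: /srv/lora-adapters/foobar/ contains the "foobar" LoRA adapter.
23 | export VLLM_ALLOW_RUNTIME_LORA_UPDATING=true
24 | export VLLM_PLUGINS=lora_filesystem_resolver
25 | export VLLM_PLUGIN_LORA_CACHE_DIR=/srv/lora-adapters
26 |
27 | # Start an OpenAI-compatible server with LoRA enabled; the first request that
28 | # names the adapter "foobar" is resolved from the cache directory above.
29 | vllm serve meta-llama/Llama-3.1-8B-Instruct --enable-lora
30 | ```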
--------------------------------------------------------------------------------
/.github/workflows/add_label_automerge.yml:
--------------------------------------------------------------------------------
1 | name: Add label on auto-merge enabled
2 | permissions:
3 | pull-requests: write
4 | on:
5 | pull_request_target:
6 | types:
7 | - auto_merge_enabled
8 | jobs:
9 | add-label-on-auto-merge:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - name: Add label
13 | uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
14 | with:
15 | script: |
16 | github.rest.issues.addLabels({
17 | owner: context.repo.owner,
18 | repo: context.repo.repo,
19 | issue_number: context.issue.number,
20 | labels: ['ready']
21 | })
22 | env:
23 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
24 |
--------------------------------------------------------------------------------
/tests/v1/test_request.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | from vllm.v1.request import RequestStatus
4 |
5 |
6 | def test_request_status_fmt_str():
7 | """Test that the string representation of RequestStatus is correct."""
8 | assert f"{RequestStatus.WAITING}" == "WAITING"
9 | assert f"{RequestStatus.WAITING_FOR_FSM}" == "WAITING_FOR_FSM"
10 | assert f"{RequestStatus.WAITING_FOR_REMOTE_KVS}" == "WAITING_FOR_REMOTE_KVS"
11 | assert f"{RequestStatus.RUNNING}" == "RUNNING"
12 | assert f"{RequestStatus.PREEMPTED}" == "PREEMPTED"
13 | assert f"{RequestStatus.FINISHED_STOPPED}" == "FINISHED_STOPPED"
14 | assert f"{RequestStatus.FINISHED_LENGTH_CAPPED}" == "FINISHED_LENGTH_CAPPED"
15 | assert f"{RequestStatus.FINISHED_ABORTED}" == "FINISHED_ABORTED"
16 | assert f"{RequestStatus.FINISHED_IGNORED}" == "FINISHED_IGNORED"
17 |
--------------------------------------------------------------------------------
/vllm/entrypoints/cli/types.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | from __future__ import annotations
5 |
6 | import argparse
7 | import typing
8 |
9 | if typing.TYPE_CHECKING:
10 | from vllm.utils import FlexibleArgumentParser
11 |
12 |
13 | class CLISubcommand:
14 | """Base class for CLI argument handlers."""
15 |
16 | name: str
17 |
18 | @staticmethod
19 | def cmd(args: argparse.Namespace) -> None:
20 | raise NotImplementedError("Subclasses should implement this method")
21 |
22 | def validate(self, args: argparse.Namespace) -> None:
23 | # No validation by default
24 | pass
25 |
26 | def subparser_init(
27 | self,
28 | subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
29 | raise NotImplementedError("Subclasses should implement this method")
30 |
--------------------------------------------------------------------------------
/vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja:
--------------------------------------------------------------------------------
1 | {%- if messages[0]['role'] == 'system' -%}
2 | {%- set system_message = messages[0]['content'] -%}
3 | {%- set messages = messages[1:] -%}
4 | {%- else -%}
5 | {% set system_message = '' -%}
6 | {%- endif -%}
7 |
8 | {{ bos_token + system_message }}
9 | {%- for message in messages -%}
10 | {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
11 | {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
12 | {%- endif -%}
13 |
14 | {%- if message['role'] == 'user' -%}
15 | {{ '<|User|>: ' + message['content'] + '\n\n' }}
16 | {%- elif message['role'] == 'assistant' -%}
17 | {{ '<|Assistant|>: ' + message['content'] + eos_token + '\n\n' }}
18 | {%- endif -%}
19 | {%- endfor -%}
20 |
21 | {%- if add_generation_prompt -%}
22 | {{ '<|Assistant|>: ' }}
23 | {%- endif -%}
24 |
--------------------------------------------------------------------------------
/tests/test_seed_behavior.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 | import random
4 |
5 | import numpy as np
6 | import torch
7 |
8 | from vllm.platforms.interface import Platform
9 |
10 |
11 | def test_seed_behavior():
12 | # Test with a specific seed
13 | Platform.seed_everything(42)
14 | random_value_1 = random.randint(0, 100)
15 | np_random_value_1 = np.random.randint(0, 100)
16 | torch_random_value_1 = torch.randint(0, 100, (1, )).item()
17 |
18 | Platform.seed_everything(42)
19 | random_value_2 = random.randint(0, 100)
20 | np_random_value_2 = np.random.randint(0, 100)
21 | torch_random_value_2 = torch.randint(0, 100, (1, )).item()
22 |
23 | assert random_value_1 == random_value_2
24 | assert np_random_value_1 == np_random_value_2
25 | assert torch_random_value_1 == torch_random_value_2
26 |
--------------------------------------------------------------------------------
/docs/serving/integrations/langchain.md:
--------------------------------------------------------------------------------
1 | # LangChain
2 |
3 | vLLM is also available via [LangChain](https://github.com/langchain-ai/langchain).
4 |
5 | To install LangChain, run
6 |
7 | ```bash
8 | pip install langchain langchain_community -q
9 | ```
10 |
11 | To run inference on a single GPU or on multiple GPUs, use the `VLLM` class from `langchain_community`.
12 |
13 | ??? code
14 |
15 | ```python
16 | from langchain_community.llms import VLLM
17 |
18 | llm = VLLM(model="mosaicml/mpt-7b",
19 | trust_remote_code=True, # mandatory for hf models
20 | max_new_tokens=128,
21 | top_k=10,
22 | top_p=0.95,
23 | temperature=0.8,
24 | # tensor_parallel_size=... # for distributed inference
25 | )
26 |
27 | print(llm("What is the capital of France ?"))
28 | ```
29 |
30 | Please refer to this [Tutorial](https://python.langchain.com/docs/integrations/llms/vllm) for more details.
31 |
--------------------------------------------------------------------------------
/examples/online_serving/chart-helm/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: chart-vllm
3 | description: A Helm chart for deploying vLLM
4 |
5 | # A chart can be either an 'application' or a 'library' chart.
6 | #
7 | # Application charts are a collection of templates that can be packaged into versioned archives
8 | # to be deployed.
9 | #
10 | # Library charts provide useful utilities or functions for the chart developer. They're included as
11 | # a dependency of application charts to inject those utilities and functions into the rendering
12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed.
13 | type: application
14 |
15 | # This is the chart version. This version number should be incremented each time you make changes
16 | # to the chart and its templates, including the app version.
17 | # Versions are expected to follow Semantic Versioning (https://semver.org/)
18 | version: 0.0.1
19 |
20 | maintainers:
21 | - name: mfournioux
22 |
--------------------------------------------------------------------------------
/tests/entrypoints/llm/test_prompt_validation.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | import pytest
5 |
6 | from vllm import LLM
7 |
8 |
9 | @pytest.fixture(autouse=True)
10 | def v1(run_with_both_engines):
11 | # Simple autouse wrapper to run both engines for each test
12 | # This can be promoted up to conftest.py to run for every
13 | # test in a package
14 | pass
15 |
16 |
17 | def test_empty_prompt():
18 | llm = LLM(model="openai-community/gpt2", enforce_eager=True)
19 | with pytest.raises(ValueError, match='decoder prompt cannot be empty'):
20 | llm.generate([""])
21 |
22 |
23 | @pytest.mark.skip_v1
24 | def test_out_of_vocab_token():
25 | llm = LLM(model="openai-community/gpt2", enforce_eager=True)
26 | with pytest.raises(ValueError, match='out of vocabulary'):
27 | llm.generate({"prompt_token_ids": [999999]})
28 |
--------------------------------------------------------------------------------
/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: Apache-2.0
2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3 |
4 | import argparse
5 |
6 | from transformers import AutoTokenizer
7 |
8 |
9 | def main(model, cachedir):
10 | # Load the tokenizer and save it to the specified directory
11 | tokenizer = AutoTokenizer.from_pretrained(model)
12 | tokenizer.save_pretrained(cachedir)
13 | print(f"Tokenizer saved to {cachedir}")
14 |
15 |
16 | if __name__ == "__main__":
17 | parser = argparse.ArgumentParser(
18 | description="Download and save Hugging Face tokenizer"
19 | )
20 | parser.add_argument("--model", type=str, required=True, help="Name of the model")
21 | parser.add_argument(
22 | "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
23 | )
24 |
25 | args = parser.parse_args()
26 | main(args.model, args.cachedir)
27 |
--------------------------------------------------------------------------------
/.github/workflows/cleanup_pr_body.yml:
--------------------------------------------------------------------------------
1 | name: Cleanup PR Body
2 |
3 | on:
4 | pull_request_target:
5 | types: [opened, reopened, edited]
6 |
7 | permissions:
8 | pull-requests: write
9 |
10 | jobs:
11 | update-description:
12 | runs-on: ubuntu-latest
13 |
14 | steps:
15 | - name: Checkout repository
16 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
17 |
18 | - name: Set up Python
19 | uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
20 | with:
21 | python-version: '3.12'
22 |
23 | - name: Install Python dependencies
24 | run: |
25 | python3 -m pip install --upgrade pip
26 | python3 -m pip install regex
27 |
28 | - name: Update PR description
29 | env:
30 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
31 | run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
32 |
--------------------------------------------------------------------------------
/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu:
--------------------------------------------------------------------------------
1 | #include "c3x/scaled_mm_helper.hpp"
2 | #include "c3x/scaled_mm_kernels.hpp"
3 |
4 | /*
5 | This file defines quantized GEMM operations using the CUTLASS 3.x API, for
6 | NVIDIA GPUs with sm100 (Blackwell).
7 | */
8 |
9 | #if defined ENABLE_SCALED_MM_SM100 && ENABLE_SCALED_MM_SM100
10 |
11 | void cutlass_scaled_mm_sm100(torch::Tensor& c, torch::Tensor const& a,
12 | torch::Tensor const& b,
13 | torch::Tensor const& a_scales,
14 | torch::Tensor const& b_scales,
15 |                              std::optional<torch::Tensor> const& bias) {
16 | dispatch_scaled_mm(c, a, b, a_scales, b_scales, bias,
17 | vllm::cutlass_scaled_mm_sm100_fp8,
18 | nullptr, // int8 not supported on SM100
19 | vllm::cutlass_scaled_mm_blockwise_sm100_fp8);
20 | }
21 |
22 | #endif
23 |
--------------------------------------------------------------------------------
/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu:
--------------------------------------------------------------------------------
1 | #include "c3x/scaled_mm_helper.hpp"
2 | #include "c3x/scaled_mm_kernels.hpp"
3 |
4 | /*
5 | This file defines quantized GEMM operations using the CUTLASS 3.x API, for
6 | NVIDIA GPUs with sm120 (Blackwell).
7 | */
8 |
9 | #if defined ENABLE_SCALED_MM_SM120 && ENABLE_SCALED_MM_SM120
10 |
11 | void cutlass_scaled_mm_sm120(torch::Tensor& c, torch::Tensor const& a,
12 | torch::Tensor const& b,
13 | torch::Tensor const& a_scales,
14 | torch::Tensor const& b_scales,
15 |                              std::optional<torch::Tensor> const& bias) {
16 | dispatch_scaled_mm(c, a, b, a_scales, b_scales, bias,
17 | vllm::cutlass_scaled_mm_sm120_fp8,
18 | nullptr, // int8 not supported on SM120
19 | vllm::cutlass_scaled_mm_blockwise_sm120_fp8);
20 | }
21 |
22 | #endif
23 |
--------------------------------------------------------------------------------
/csrc/quantization/gptq/qdq_8.cuh:
--------------------------------------------------------------------------------
1 | /*
2 | Copied from https://github.com/turboderp/exllamav2
3 | */
4 |
5 | #ifndef _qdq_8_cuh
6 | #define _qdq_8_cuh
7 |
8 | #include "qdq_util.cuh"
9 |
10 | namespace vllm {
11 | namespace gptq {
12 |
13 | __forceinline__ __device__ void shuffle_8bit_4(uint32_t* q, int stride) {}
14 |
15 | __forceinline__ __device__ void dequant_8bit_8(const uint32_t q_0,
16 | const uint32_t q_1,
17 | half2 (&dq)[4], int stride,
18 | const uint32_t zero) {
19 | half dqh[8];
20 | for (int i = 0; i < 4; i++) dqh[i] = dq_ns(exb(q_0, i * 8, 0xff), zero);
21 | for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), zero);
22 |
23 | for (int i = 0; i < 4; i++)
24 | dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]);
25 | }
26 |
27 | } // namespace gptq
28 | } // namespace vllm
29 |
30 | #endif
31 |
--------------------------------------------------------------------------------
/benchmarks/multi_turn/generate_multi_turn.json:
--------------------------------------------------------------------------------
1 | {
2 | "filetype": "generate_conversations",
3 | "num_conversations": 24,
4 | "text_files": ["pg1184.txt"],
5 | "print_stats": false,
6 | "prompt_input": {
7 | "num_turns": {
8 | "distribution": "uniform",
9 | "min": 12,
10 | "max": 18
11 | },
12 | "common_prefix_num_tokens": {
13 | "distribution": "constant",
14 | "value": 500
15 | },
16 | "prefix_num_tokens": {
17 | "distribution": "lognormal",
18 | "average": 1000,
19 | "max": 5000
20 | },
21 | "num_tokens": {
22 | "distribution": "uniform",
23 | "min": 120,
24 | "max": 160
25 | }
26 | },
27 | "prompt_output": {
28 | "num_tokens": {
29 | "distribution": "uniform",
30 | "min": 80,
31 | "max": 120
32 | }
33 | }
34 | }
--------------------------------------------------------------------------------