├── .buildkite ├── check-wheel-size.py ├── lm-eval-harness │ ├── configs │ │ ├── DeepSeek-V2-Lite-Chat.yaml │ │ ├── Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml │ │ ├── Meta-Llama-3-70B-Instruct.yaml │ │ ├── Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml │ │ ├── Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct-FP8.yaml │ │ ├── Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml │ │ ├── Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct.yaml │ │ ├── Meta-Llama-3-8B-QQQ.yaml │ │ ├── Minitron-4B-Base-FP8.yaml │ │ ├── Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml │ │ ├── Mixtral-8x7B-Instruct-v0.1-FP8.yaml │ │ ├── Mixtral-8x7B-Instruct-v0.1.yaml │ │ ├── Qwen2-1.5B-Instruct-FP8W8.yaml │ │ ├── Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml │ │ ├── Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml │ │ ├── Qwen2-57B-A14-Instruct.yaml │ │ ├── models-large.txt │ │ └── models-small.txt │ ├── run-lm-eval-gsm-hf-baseline.sh │ ├── run-lm-eval-gsm-vllm-baseline.sh │ ├── run-tests.sh │ └── test_lm_eval_correctness.py ├── nightly-benchmarks │ ├── README.md │ ├── benchmark-pipeline.yaml │ ├── nightly-annotation.md │ ├── nightly-descriptions.md │ ├── nightly-pipeline.yaml │ ├── performance-benchmarks-descriptions.md │ ├── scripts │ │ ├── convert-results-json-to-markdown.py │ │ ├── download-tokenizer.py │ │ ├── generate-nightly-markdown.py │ │ ├── get-lmdeploy-modelname.py │ │ ├── launch-server.sh │ │ ├── nightly-annotate.sh │ │ ├── run-nightly-benchmarks.sh │ │ ├── run-performance-benchmarks.sh │ │ ├── summary-nightly-results.py │ │ └── wait-for-image.sh │ └── tests │ │ ├── latency-tests.json │ │ ├── nightly-tests.json │ │ ├── serving-tests.json │ │ └── throughput-tests.json ├── release-pipeline.yaml ├── run-amd-test.sh ├── run-benchmarks.sh ├── run-cpu-test-ppc64le.sh ├── run-cpu-test.sh ├── run-multi-node-test.sh ├── run-neuron-test.sh ├── run-openvino-test.sh ├── run-tpu-test.sh ├── run-xpu-test.sh └── test-pipeline.yaml ├── .clang-format ├── .dockerignore ├── .github ├── CODEOWNERS ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── 100-documentation.yml │ ├── 200-installation.yml │ ├── 300-usage.yml │ ├── 400-bug report.yml │ ├── 500-feature request.yml │ ├── 600-new model.yml │ ├── 700-performance discussion.yml │ ├── 750-RFC.yml │ ├── 800-misc discussion.yml │ └── config.yml ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml └── workflows │ ├── actionlint.yml │ ├── add_label_automerge.yml │ ├── clang-format.yml │ ├── matchers │ └── actionlint.json │ ├── mypy.yaml │ ├── publish.yml │ ├── reminder_comment.yml │ ├── ruff.yml │ ├── scripts │ ├── build.sh │ ├── create_release.js │ ├── cuda-install.sh │ ├── env.sh │ └── pytorch-install.sh │ └── yapf.yml ├── .gitignore ├── .readthedocs.yaml ├── .yapfignore ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Dockerfile ├── Dockerfile.cpu ├── Dockerfile.neuron ├── Dockerfile.openvino ├── Dockerfile.ppc64le ├── Dockerfile.rocm ├── Dockerfile.tpu ├── Dockerfile.xpu ├── LICENSE ├── MANIFEST.in ├── README.md ├── SECURITY.md ├── benchmarks ├── README.md ├── backend_request_func.py ├── benchmark_latency.py ├── benchmark_prefix_caching.py ├── benchmark_prioritization.py ├── benchmark_serving.py ├── benchmark_throughput.py ├── cutlass_benchmarks │ ├── w8a8_benchmarks.py │ └── weight_shapes.py ├── kernels │ ├── benchmark_aqlm.py │ ├── 
benchmark_layernorm.py │ ├── benchmark_machete.py │ ├── benchmark_marlin.py │ ├── benchmark_moe.py │ ├── benchmark_paged_attention.py │ ├── benchmark_quant.py │ ├── benchmark_rope.py │ ├── benchmark_shapes.py │ ├── graph_machete_bench.py │ ├── requirements.txt │ └── weight_shapes.py ├── launch_tgi_server.sh ├── overheads │ └── benchmark_hashing.py └── sonnet.txt ├── cmake ├── cpu_extension.cmake ├── hipify.py └── utils.cmake ├── collect_env.py ├── csrc ├── activation_kernels.cu ├── attention │ ├── attention_dtypes.h │ ├── attention_generic.cuh │ ├── attention_kernels.cu │ ├── attention_utils.cuh │ ├── dtype_bfloat16.cuh │ ├── dtype_float16.cuh │ ├── dtype_float32.cuh │ └── dtype_fp8.cuh ├── cache.h ├── cache_kernels.cu ├── core │ ├── exception.hpp │ ├── registration.h │ ├── scalar_type.hpp │ └── torch_bindings.cpp ├── cpu │ ├── activation.cpp │ ├── attention.cpp │ ├── cache.cpp │ ├── cpu_types.hpp │ ├── cpu_types_vsx.hpp │ ├── cpu_types_x86.hpp │ ├── dnnl_helper.hpp │ ├── layernorm.cpp │ ├── pos_encoding.cpp │ ├── quant.cpp │ ├── torch_bindings.cpp │ └── utils.cpp ├── cuda_compat.h ├── cuda_utils.h ├── cuda_utils_kernels.cu ├── custom_all_reduce.cu ├── custom_all_reduce.cuh ├── custom_all_reduce_test.cu ├── cutlass_extensions │ ├── cute_utils.cuh │ ├── torch_utils.hpp │ ├── vllm_collective_builder.cuh │ ├── vllm_custom_types.cuh │ ├── vllm_cutlass_library_extension.py │ └── vllm_numeric_conversion.cuh ├── dispatch_utils.h ├── layernorm_kernels.cu ├── mamba │ ├── causal_conv1d │ │ ├── causal_conv1d.cu │ │ ├── causal_conv1d.h │ │ └── static_switch.h │ └── mamba_ssm │ │ ├── selective_scan.h │ │ ├── selective_scan_fwd.cu │ │ └── static_switch.h ├── moe │ ├── marlin_kernels │ │ ├── marlin_moe_kernel.h │ │ ├── marlin_moe_kernel_ku4.cu │ │ ├── marlin_moe_kernel_ku4.h │ │ ├── marlin_moe_kernel_ku4b8.cu │ │ ├── marlin_moe_kernel_ku4b8.h │ │ ├── marlin_moe_kernel_ku8b128.cu │ │ └── marlin_moe_kernel_ku8b128.h │ ├── marlin_moe_ops.cu │ ├── moe_ops.h │ ├── topk_softmax_kernels.cu │ └── torch_bindings.cpp ├── moe_align_block_size_kernels.cu ├── ops.h ├── permute_cols.cu ├── pos_encoding_kernels.cu ├── prepare_inputs │ ├── advance_step.cu │ └── advance_step.cuh ├── quantization │ ├── aqlm │ │ └── gemm_kernels.cu │ ├── awq │ │ ├── dequantize.cuh │ │ └── gemm_kernels.cu │ ├── compressed_tensors │ │ └── int8_quant_kernels.cu │ ├── cutlass_w8a8 │ │ ├── Epilogues.md │ │ ├── broadcast_load_epilogue_c2x.hpp │ │ ├── broadcast_load_epilogue_c3x.hpp │ │ ├── common.hpp │ │ ├── scaled_mm_c2x.cu │ │ ├── scaled_mm_c2x.cuh │ │ ├── scaled_mm_c2x_sm75_dispatch.cuh │ │ ├── scaled_mm_c2x_sm80_dispatch.cuh │ │ ├── scaled_mm_c2x_sm89_fp8_dispatch.cuh │ │ ├── scaled_mm_c2x_sm89_int8_dispatch.cuh │ │ ├── scaled_mm_c3x.cu │ │ └── scaled_mm_entry.cu │ ├── fp8 │ │ ├── amd │ │ │ ├── hip_float8.h │ │ │ ├── hip_float8_impl.h │ │ │ └── quant_utils.cuh │ │ ├── common.cu │ │ ├── fp8_marlin.cu │ │ └── nvidia │ │ │ └── quant_utils.cuh │ ├── gguf │ │ ├── dequantize.cuh │ │ ├── ggml-common.h │ │ ├── gguf_kernel.cu │ │ ├── mmq.cuh │ │ ├── mmvq.cuh │ │ └── vecdotq.cuh │ ├── gptq │ │ ├── compat.cuh │ │ ├── matrix_view.cuh │ │ ├── q_gemm.cu │ │ ├── qdq_2.cuh │ │ ├── qdq_3.cuh │ │ ├── qdq_4.cuh │ │ ├── qdq_8.cuh │ │ └── qdq_util.cuh │ ├── gptq_marlin │ │ ├── awq_marlin_repack.cu │ │ ├── gptq_marlin.cu │ │ ├── gptq_marlin_repack.cu │ │ ├── marlin.cuh │ │ └── marlin_dtypes.cuh │ ├── machete │ │ ├── Readme.md │ │ ├── generate.py │ │ ├── machete_collective_builder.cuh │ │ ├── machete_interleaving_utils.cuh │ │ ├── machete_mainloop.cuh │ │ ├── 
machete_mm_kernel.cuh │ │ ├── machete_mm_launcher.cuh │ │ ├── machete_prepack_kernel.cuh │ │ ├── machete_prepack_launcher.cuh │ │ ├── machete_prepacked_layout.cuh │ │ └── machete_pytorch.cu │ └── marlin │ │ ├── dense │ │ ├── LICENSE │ │ ├── common │ │ │ ├── base.h │ │ │ └── mem.h │ │ └── marlin_cuda_kernel.cu │ │ ├── qqq │ │ └── marlin_qqq_gemm_kernel.cu │ │ └── sparse │ │ ├── LICENSE │ │ ├── common │ │ ├── base.h │ │ ├── mem.h │ │ └── mma.h │ │ └── marlin_24_cuda_kernel.cu ├── rocm │ ├── attention.cu │ ├── ops.h │ └── torch_bindings.cpp └── torch_bindings.cpp ├── docs ├── Makefile ├── README.md ├── make.bat ├── requirements-docs.txt └── source │ ├── _static │ └── custom.js │ ├── _templates │ └── sections │ │ └── header.html │ ├── assets │ ├── dev │ │ └── dockerfile-stages-dependency.png │ ├── kernel │ │ ├── k_vecs.png │ │ ├── key.png │ │ ├── logits_vec.png │ │ ├── q_vecs.png │ │ ├── query.png │ │ ├── v_vec.png │ │ └── value.png │ └── logos │ │ ├── vllm-logo-only-light.png │ │ ├── vllm-logo-text-dark.png │ │ └── vllm-logo-text-light.png │ ├── automatic_prefix_caching │ ├── apc.rst │ └── details.md │ ├── community │ ├── meetups.rst │ └── sponsors.md │ ├── conf.py │ ├── dev │ ├── dockerfile │ │ └── dockerfile.rst │ ├── engine │ │ ├── async_llm_engine.rst │ │ ├── engine_index.rst │ │ └── llm_engine.rst │ ├── input_processing │ │ ├── input_processing_pipeline.rst │ │ └── model_inputs_index.rst │ ├── kernel │ │ └── paged_attention.rst │ ├── multimodal │ │ ├── adding_multimodal_plugin.rst │ │ └── multimodal_index.rst │ ├── offline_inference │ │ ├── llm.rst │ │ ├── llm_inputs.rst │ │ └── offline_index.rst │ ├── profiling │ │ └── profiling_index.rst │ └── sampling_params.rst │ ├── generate_examples.py │ ├── getting_started │ ├── amd-installation.rst │ ├── cpu-installation.rst │ ├── debugging.rst │ ├── examples │ │ └── examples_index.template.rst │ ├── installation.rst │ ├── neuron-installation.rst │ ├── openvino-installation.rst │ ├── quickstart.rst │ ├── tpu-installation.rst │ └── xpu-installation.rst │ ├── index.rst │ ├── models │ ├── adding_model.rst │ ├── enabling_multimodal_inputs.rst │ ├── engine_args.rst │ ├── lora.rst │ ├── performance.rst │ ├── spec_decode.rst │ ├── supported_models.rst │ └── vlm.rst │ ├── performance_benchmark │ └── benchmarks.rst │ ├── quantization │ ├── auto_awq.rst │ ├── bnb.rst │ ├── fp8.rst │ ├── fp8_e4m3_kvcache.rst │ ├── fp8_e5m2_kvcache.rst │ ├── gguf.rst │ ├── int8.rst │ └── supported_hardware.rst │ └── serving │ ├── compatibility_matrix.rst │ ├── deploying_with_bentoml.rst │ ├── deploying_with_cerebrium.rst │ ├── deploying_with_docker.rst │ ├── deploying_with_dstack.rst │ ├── deploying_with_k8s.rst │ ├── deploying_with_kserve.rst │ ├── deploying_with_lws.rst │ ├── deploying_with_triton.rst │ ├── distributed_serving.rst │ ├── env_vars.rst │ ├── faq.rst │ ├── integrations.rst │ ├── metrics.rst │ ├── openai_compatible_server.md │ ├── run_on_sky.rst │ ├── serving_with_langchain.rst │ ├── serving_with_llamaindex.rst │ ├── tensorizer.rst │ └── usage_stats.md ├── examples ├── api_client.py ├── aqlm_example.py ├── cpu_offload.py ├── fp8 │ ├── README.md │ ├── extract_scales.py │ └── quantizer │ │ ├── README.md │ │ └── quantize.py ├── gguf_inference.py ├── gradio_openai_chatbot_webserver.py ├── gradio_webserver.py ├── llm_engine_example.py ├── logging_configuration.md ├── lora_with_quantization_inference.py ├── multilora_inference.py ├── offline_chat_with_tools.py ├── offline_inference.py ├── offline_inference_arctic.py ├── offline_inference_audio_language.py ├── 
offline_inference_chat.py ├── offline_inference_distributed.py ├── offline_inference_embedding.py ├── offline_inference_encoder_decoder.py ├── offline_inference_mlpspeculator.py ├── offline_inference_neuron.py ├── offline_inference_neuron_int8_quantization.py ├── offline_inference_openai.md ├── offline_inference_pixtral.py ├── offline_inference_tpu.py ├── offline_inference_vision_language.py ├── offline_inference_vision_language_multi_image.py ├── offline_inference_with_prefix.py ├── offline_inference_with_profiler.py ├── openai_audio_api_client.py ├── openai_chat_completion_client.py ├── openai_chat_completion_client_with_tools.py ├── openai_completion_client.py ├── openai_embedding_client.py ├── openai_example_batch.jsonl ├── openai_vision_api_client.py ├── production_monitoring │ ├── Otel.md │ ├── README.md │ ├── docker-compose.yaml │ ├── dummy_client.py │ ├── grafana.json │ └── prometheus.yaml ├── run_cluster.sh ├── save_sharded_state.py ├── template_alpaca.jinja ├── template_baichuan.jinja ├── template_blip2.jinja ├── template_chatglm.jinja ├── template_chatglm2.jinja ├── template_chatml.jinja ├── template_falcon.jinja ├── template_falcon_180b.jinja ├── template_inkbot.jinja ├── template_llava.jinja ├── tensorize_vllm_model.py ├── tool_chat_template_hermes.jinja ├── tool_chat_template_internlm2_tool.jinja ├── tool_chat_template_llama3.1_json.jinja ├── tool_chat_template_llama3.2_json.jinja ├── tool_chat_template_mistral.jinja └── tool_chat_template_mistral_parallel.jinja ├── find_cuda_init.py ├── format.sh ├── pyproject.toml ├── python_only_dev.py ├── requirements-build.txt ├── requirements-common.txt ├── requirements-cpu.txt ├── requirements-cuda.txt ├── requirements-dev.txt ├── requirements-lint.txt ├── requirements-neuron.txt ├── requirements-openvino.txt ├── requirements-rocm.txt ├── requirements-test.txt ├── requirements-tpu.txt ├── requirements-xpu.txt ├── setup.py ├── tests ├── __init__.py ├── async_engine │ ├── __init__.py │ ├── api_server_async_engine.py │ ├── test_api_server.py │ ├── test_async_llm_engine.py │ └── test_request_tracker.py ├── basic_correctness │ ├── __init__.py │ ├── test_basic_correctness.py │ ├── test_chunked_prefill.py │ ├── test_cpu_offload.py │ └── test_preemption.py ├── compile │ ├── __init__.py │ ├── test_basic_correctness.py │ ├── test_full_graph.py │ ├── test_wrapper.py │ └── utils.py ├── conftest.py ├── core │ ├── __init__.py │ ├── block │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── e2e │ │ │ ├── __init__.py │ │ │ ├── conftest.py │ │ │ ├── test_correctness.py │ │ │ └── test_correctness_sliding_window.py │ │ ├── test_block_manager_v2.py │ │ ├── test_block_table.py │ │ ├── test_common.py │ │ ├── test_cpu_gpu_block_allocator.py │ │ ├── test_naive_block.py │ │ └── test_prefix_caching_block.py │ ├── test_block_manager.py │ ├── test_chunked_prefill_scheduler.py │ ├── test_num_computed_tokens_update.py │ ├── test_scheduler.py │ ├── test_scheduler_encoder_decoder.py │ ├── test_serialization.py │ └── utils.py ├── data │ └── test_config.yaml ├── distributed │ ├── __init__.py │ ├── test_comm_ops.py │ ├── test_custom_all_reduce.py │ ├── test_distributed_oot.py │ ├── test_multi_node_assignment.py │ ├── test_pipeline_parallel.py │ ├── test_pipeline_partition.py │ ├── test_pp_cudagraph.py │ ├── test_pynccl.py │ ├── test_same_node.py │ ├── test_shm_broadcast.py │ └── test_utils.py ├── encoder_decoder │ ├── __init__.py │ └── test_e2e_correctness.py ├── engine │ ├── __init__.py │ ├── output_processor │ │ ├── __init__.py │ │ ├── test_multi_step.py │ │ └── 
test_stop_checker.py │ ├── test_arg_utils.py │ ├── test_computed_prefix_blocks.py │ ├── test_custom_executor.py │ ├── test_detokenization.py │ ├── test_multiproc_workers.py │ ├── test_skip_tokenizer_init.py │ ├── test_stop_reason.py │ └── test_stop_strings.py ├── entrypoints │ ├── __init__.py │ ├── conftest.py │ ├── llm │ │ ├── __init__.py │ │ ├── test_encode.py │ │ ├── test_generate.py │ │ ├── test_generate_multiple_loras.py │ │ ├── test_guided_generate.py │ │ ├── test_lazy_outlines.py │ │ └── test_prompt_validation.py │ ├── offline_mode │ │ ├── __init__.py │ │ └── test_offline_mode.py │ ├── openai │ │ ├── __init__.py │ │ ├── test_accuracy.py │ │ ├── test_audio.py │ │ ├── test_basic.py │ │ ├── test_chat.py │ │ ├── test_chat_template.py │ │ ├── test_cli_args.py │ │ ├── test_completion.py │ │ ├── test_embedding.py │ │ ├── test_encoder_decoder.py │ │ ├── test_lora_lineage.py │ │ ├── test_metrics.py │ │ ├── test_models.py │ │ ├── test_oot_registration.py │ │ ├── test_prompt_validation.py │ │ ├── test_return_tokens_as_ids.py │ │ ├── test_run_batch.py │ │ ├── test_serving_chat.py │ │ ├── test_serving_engine.py │ │ ├── test_shutdown.py │ │ ├── test_tokenization.py │ │ └── test_vision.py │ └── test_chat_utils.py ├── fp8_kv │ ├── llama2-70b-fp8-kv │ │ └── kv_cache_scales.json │ └── llama2-7b-fp8-kv │ │ └── kv_cache_scales.json ├── kernels │ ├── __init__.py │ ├── allclose_default.py │ ├── conftest.py │ ├── quant_utils.py │ ├── test_activation.py │ ├── test_aqlm.py │ ├── test_attention.py │ ├── test_attention_selector.py │ ├── test_awq.py │ ├── test_awq_marlin.py │ ├── test_awq_triton.py │ ├── test_blocksparse_attention.py │ ├── test_cache.py │ ├── test_causal_conv1d.py │ ├── test_cutlass.py │ ├── test_encoder_decoder_attn.py │ ├── test_flash_attn.py │ ├── test_flashinfer.py │ ├── test_fp8_quant.py │ ├── test_ggml.py │ ├── test_gguf.py │ ├── test_gptq.py │ ├── test_int8_quant.py │ ├── test_layernorm.py │ ├── test_machete_gemm.py │ ├── test_mamba_ssm.py │ ├── test_marlin_gemm.py │ ├── test_moe.py │ ├── test_permute_cols.py │ ├── test_pos_encoding.py │ ├── test_prefix_prefill.py │ ├── test_rotary_embedding.py │ ├── test_utils.py │ └── utils.py ├── lora │ ├── __init__.py │ ├── conftest.py │ ├── data │ │ ├── __init__.py │ │ └── long_context_test_data.py │ ├── test_baichuan.py │ ├── test_chatglm3.py │ ├── test_gemma.py │ ├── test_layers.py │ ├── test_llama.py │ ├── test_long_context.py │ ├── test_lora_checkpoints.py │ ├── test_lora_huggingface.py │ ├── test_lora_manager.py │ ├── test_minicpmv.py │ ├── test_minicpmv_tp.py │ ├── test_mixtral.py │ ├── test_phi.py │ ├── test_punica_sizes.py │ ├── test_punica_variation.py │ ├── test_quant_model.py │ ├── test_tokenizer_group.py │ ├── test_utils.py │ ├── test_worker.py │ └── utils.py ├── metrics │ ├── __init__.py │ └── test_metrics.py ├── model_executor │ ├── __init__.py │ ├── conftest.py │ ├── test_guided_processors.py │ └── weight_utils.py ├── models │ ├── __init__.py │ ├── decoder_only │ │ ├── __init__.py │ │ ├── audio_language │ │ │ ├── __init__.py │ │ │ └── test_ultravox.py │ │ ├── language │ │ │ ├── __init__.py │ │ │ ├── test_aqlm.py │ │ │ ├── test_big_models.py │ │ │ ├── test_danube3_4b.py │ │ │ ├── test_fp8.py │ │ │ ├── test_gguf.py │ │ │ ├── test_gptq_marlin.py │ │ │ ├── test_gptq_marlin_24.py │ │ │ ├── test_granite.py │ │ │ ├── test_granitemoe.py │ │ │ ├── test_jamba.py │ │ │ ├── test_mamba.py │ │ │ ├── test_marlin.py │ │ │ ├── test_mistral.py │ │ │ ├── test_modelopt.py │ │ │ ├── test_models.py │ │ │ └── test_phimoe.py │ │ └── vision_language │ │ │ 
├── __init__.py │ │ │ ├── test_blip2.py │ │ │ ├── test_broadcast.py │ │ │ ├── test_chameleon.py │ │ │ ├── test_fuyu.py │ │ │ ├── test_glm4.py │ │ │ ├── test_intern_vit.py │ │ │ ├── test_internvl.py │ │ │ ├── test_llava.py │ │ │ ├── test_llava_image_embeds.py │ │ │ ├── test_llava_next.py │ │ │ ├── test_llava_next_video.py │ │ │ ├── test_llava_onevision.py │ │ │ ├── test_minicpmv.py │ │ │ ├── test_paligemma.py │ │ │ ├── test_phi3v.py │ │ │ ├── test_pixtral.py │ │ │ └── test_qwen.py │ ├── embedding │ │ ├── __init__.py │ │ └── language │ │ │ ├── __init__.py │ │ │ └── test_embedding.py │ ├── encoder_decoder │ │ ├── __init__.py │ │ ├── language │ │ │ ├── __init__.py │ │ │ └── test_bart.py │ │ └── vision_language │ │ │ ├── __init__.py │ │ │ ├── test_broadcast.py │ │ │ └── test_mllama.py │ ├── fixtures │ │ ├── pixtral_chat.json │ │ └── pixtral_chat_engine.json │ ├── test_oot_registration.py │ ├── test_registry.py │ └── utils.py ├── mq_llm_engine │ ├── __init__.py │ ├── test_abort.py │ ├── test_error_handling.py │ ├── test_load.py │ └── utils.py ├── multi_step │ ├── __init__.py │ ├── test_correctness_async_llm.py │ └── test_correctness_llm.py ├── multimodal │ ├── __init__.py │ ├── test_base.py │ ├── test_mapper.py │ ├── test_processor_kwargs.py │ └── test_utils.py ├── plugins │ └── vllm_add_dummy_model │ │ ├── setup.py │ │ └── vllm_add_dummy_model │ │ ├── __init__.py │ │ ├── my_gemma_embedding.py │ │ ├── my_llava.py │ │ └── my_opt.py ├── prefix_caching │ ├── __init__.py │ ├── test_disable_sliding_window.py │ └── test_prefix_caching.py ├── prompt_adapter │ ├── test_bloom.py │ ├── test_multi_adapter_inference.py │ └── test_pa_lora.py ├── prompts │ ├── example.txt │ └── summary.txt ├── quantization │ ├── __init__.py │ ├── test_bitsandbytes.py │ ├── test_compressed_tensors.py │ ├── test_configs.py │ ├── test_cpu_offload.py │ ├── test_experts_int8.py │ ├── test_fp8.py │ ├── test_ipex_quant.py │ ├── test_lm_head.py │ └── utils.py ├── samplers │ ├── __init__.py │ ├── test_beam_search.py │ ├── test_ignore_eos.py │ ├── test_logits_processor.py │ ├── test_logprobs.py │ ├── test_ranks.py │ ├── test_rejection_sampler.py │ ├── test_sampler.py │ ├── test_seeded_generate.py │ └── test_typical_acceptance_sampler.py ├── spec_decode │ ├── __init__.py │ ├── e2e │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── test_compatibility.py │ │ ├── test_eagle_correctness.py │ │ ├── test_integration.py │ │ ├── test_integration_dist_tp2.py │ │ ├── test_integration_dist_tp4.py │ │ ├── test_logprobs.py │ │ ├── test_medusa_correctness.py │ │ ├── test_mlp_correctness.py │ │ ├── test_multistep_correctness.py │ │ ├── test_ngram_correctness.py │ │ └── test_seed.py │ ├── test_batch_expansion.py │ ├── test_dynamic_spec_decode.py │ ├── test_metrics.py │ ├── test_multi_step_worker.py │ ├── test_ngram_worker.py │ ├── test_scorer.py │ ├── test_spec_decode_worker.py │ ├── test_utils.py │ └── utils.py ├── tensorizer_loader │ ├── __init__.py │ ├── conftest.py │ └── test_tensorizer.py ├── test_cache_block_hashing.py ├── test_config.py ├── test_embedded_commit.py ├── test_inputs.py ├── test_logger.py ├── test_logits_processor.py ├── test_regression.py ├── test_sampling_params.py ├── test_scalartype.py ├── test_sequence.py ├── test_sharded_state_loader.py ├── test_utils.py ├── tokenization │ ├── __init__.py │ ├── test_cached_tokenizer.py │ ├── test_detokenize.py │ ├── test_get_eos.py │ ├── test_tokenizer.py │ └── test_tokenizer_group.py ├── tool_use │ ├── __init__.py │ ├── conftest.py │ ├── test_chat_completion_request_validations.py │ ├── 
test_chat_completions.py │ ├── test_parallel_tool_calls.py │ ├── test_tool_calls.py │ └── utils.py ├── tpu │ ├── __init__.py │ ├── test_compilation.py │ └── test_custom_dispatcher.py ├── tracing │ ├── __init__.py │ └── test_tracing.py ├── utils.py ├── weight_loading │ ├── models-large.txt │ ├── models.txt │ ├── run_model_weight_loading_test.sh │ └── test_weight_loading.py └── worker │ ├── __init__.py │ ├── test_encoder_decoder_model_runner.py │ ├── test_model_input.py │ ├── test_model_runner.py │ └── test_swap.py ├── tools ├── actionlint.sh ├── mypy.sh └── report_build_time_ninja.py ├── use_existing_torch.py └── vllm ├── __init__.py ├── _core_ext.py ├── _custom_ops.py ├── _ipex_ops.py ├── adapter_commons ├── __init__.py ├── layers.py ├── models.py ├── request.py ├── utils.py └── worker_manager.py ├── assets ├── __init__.py ├── audio.py ├── base.py ├── image.py └── video.py ├── attention ├── __init__.py ├── backends │ ├── __init__.py │ ├── abstract.py │ ├── blocksparse_attn.py │ ├── flash_attn.py │ ├── flashinfer.py │ ├── ipex_attn.py │ ├── openvino.py │ ├── pallas.py │ ├── placeholder_attn.py │ ├── rocm_flash_attn.py │ ├── torch_sdpa.py │ ├── utils.py │ └── xformers.py ├── layer.py ├── ops │ ├── __init__.py │ ├── blocksparse_attention │ │ ├── __init__.py │ │ ├── blocksparse_attention_kernel.py │ │ ├── interface.py │ │ └── utils.py │ ├── ipex_attn.py │ ├── paged_attn.py │ ├── prefix_prefill.py │ └── triton_flash_attention.py └── selector.py ├── beam_search.py ├── block.py ├── compilation ├── __init__.py ├── backends.py ├── compile_context.py ├── decorators.py ├── levels.py └── wrapper.py ├── config.py ├── connections.py ├── core ├── __init__.py ├── block │ ├── __init__.py │ ├── block_table.py │ ├── common.py │ ├── cpu_gpu_block_allocator.py │ ├── interfaces.py │ ├── naive_block.py │ ├── prefix_caching_block.py │ └── utils.py ├── block_manager_v1.py ├── block_manager_v2.py ├── evictor_v1.py ├── evictor_v2.py ├── interfaces.py ├── placeholder_block_space_manager.py └── scheduler.py ├── distributed ├── __init__.py ├── communication_op.py ├── device_communicators │ ├── __init__.py │ ├── cuda_wrapper.py │ ├── custom_all_reduce.py │ ├── custom_all_reduce_utils.py │ ├── pynccl.py │ ├── pynccl_wrapper.py │ ├── shm_broadcast.py │ └── tpu_communicator.py ├── parallel_state.py └── utils.py ├── engine ├── __init__.py ├── arg_utils.py ├── async_llm_engine.py ├── async_timeout.py ├── llm_engine.py ├── metrics.py ├── metrics_types.py ├── multiprocessing │ ├── __init__.py │ ├── client.py │ └── engine.py ├── output_processor │ ├── __init__.py │ ├── interfaces.py │ ├── multi_step.py │ ├── single_step.py │ ├── stop_checker.py │ └── util.py └── protocol.py ├── entrypoints ├── __init__.py ├── api_server.py ├── chat_utils.py ├── launcher.py ├── llm.py ├── logger.py └── openai │ ├── __init__.py │ ├── api_server.py │ ├── cli_args.py │ ├── logits_processors.py │ ├── protocol.py │ ├── run_batch.py │ ├── serving_chat.py │ ├── serving_completion.py │ ├── serving_embedding.py │ ├── serving_engine.py │ ├── serving_tokenization.py │ └── tool_parsers │ ├── __init__.py │ ├── abstract_tool_parser.py │ ├── hermes_tool_parser.py │ ├── internlm2_tool_parser.py │ ├── llama_tool_parser.py │ ├── mistral_tool_parser.py │ └── utils.py ├── envs.py ├── executor ├── __init__.py ├── cpu_executor.py ├── distributed_gpu_executor.py ├── executor_base.py ├── gpu_executor.py ├── msgspec_utils.py ├── multiproc_gpu_executor.py ├── multiproc_worker_utils.py ├── multiproc_xpu_executor.py ├── neuron_executor.py ├── openvino_executor.py ├── 
ray_gpu_executor.py ├── ray_tpu_executor.py ├── ray_utils.py ├── ray_xpu_executor.py ├── tpu_executor.py └── xpu_executor.py ├── forward_context.py ├── inputs ├── __init__.py ├── data.py ├── parse.py ├── preprocess.py └── registry.py ├── logger.py ├── logging ├── __init__.py └── formatter.py ├── lora ├── __init__.py ├── fully_sharded_layers.py ├── layers.py ├── lora.py ├── models.py ├── ops │ ├── __init__.py │ ├── bgmv_expand.py │ ├── bgmv_expand_slice.py │ ├── bgmv_shrink.py │ ├── sgmv_expand.py │ ├── sgmv_expand_slice.py │ ├── sgmv_shrink.py │ └── utils.py ├── punica.py ├── request.py ├── utils.py └── worker_manager.py ├── model_executor ├── __init__.py ├── custom_op.py ├── guided_decoding │ ├── __init__.py │ ├── guided_fields.py │ ├── lm_format_enforcer_decoding.py │ ├── outlines_decoding.py │ └── outlines_logits_processors.py ├── layers │ ├── __init__.py │ ├── activation.py │ ├── fused_moe │ │ ├── __init__.py │ │ ├── configs │ │ │ ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json │ │ │ ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json │ │ │ ├── E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json │ │ │ ├── E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=14336,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=1792,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── 
E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=3584,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_L40S.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=7168,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ └── README │ │ ├── fused_marlin_moe.py │ │ ├── fused_moe.py │ │ ├── layer.py │ │ └── moe_pallas.py │ ├── layernorm.py │ ├── linear.py │ ├── logits_processor.py │ ├── mamba │ │ ├── __init__.py │ │ └── ops │ │ │ ├── __init__.py │ │ │ ├── causal_conv1d.py │ │ │ └── mamba_ssm.py │ ├── pooler.py │ ├── quantization │ │ ├── __init__.py │ │ ├── aqlm.py │ │ ├── awq.py │ │ ├── awq_marlin.py │ │ ├── awq_triton.py │ │ ├── base_config.py │ │ ├── bitsandbytes.py │ │ ├── compressed_tensors │ │ │ ├── __init__.py │ │ │ ├── compressed_tensors.py │ │ │ ├── compressed_tensors_moe.py │ │ │ ├── schemes │ │ │ │ ├── __init__.py │ │ │ │ ├── compressed_tensors_scheme.py │ │ │ │ ├── compressed_tensors_w4a16_24.py │ │ │ │ ├── compressed_tensors_w8a16_fp8.py │ │ │ │ ├── compressed_tensors_w8a8_fp8.py │ │ │ │ ├── compressed_tensors_w8a8_int8.py │ │ │ │ └── compressed_tensors_wNa16.py │ │ │ └── utils.py │ │ ├── deepspeedfp.py │ │ ├── experts_int8.py │ │ ├── fbgemm_fp8.py │ │ ├── fp8.py │ │ ├── gguf.py │ │ ├── gptq.py │ │ ├── gptq_marlin.py │ │ ├── gptq_marlin_24.py │ │ ├── ipex_quant.py │ │ ├── kernels │ │ │ ├── MPLinearKernel.py │ │ │ ├── __init__.py │ │ │ ├── machete.py │ │ │ └── marlin.py │ │ ├── kv_cache.py │ │ ├── marlin.py │ │ ├── modelopt.py │ │ ├── neuron_quant.py │ │ ├── qqq.py │ │ ├── schema.py │ │ ├── tpu_int8.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── layer_utils.py │ │ │ ├── machete_utils.py │ │ │ ├── marlin_utils.py │ │ │ ├── marlin_utils_fp8.py │ │ │ ├── marlin_utils_test.py │ │ │ ├── marlin_utils_test_24.py │ │ │ ├── marlin_utils_test_qqq.py │ │ │ ├── quant_utils.py │ │ │ └── w8a8_utils.py │ ├── rejection_sampler.py │ ├── resampler.py │ ├── rotary_embedding.py │ ├── sampler.py │ ├── spec_decode_base_sampler.py │ ├── typical_acceptance_sampler.py │ └── vocab_parallel_embedding.py ├── model_loader │ ├── __init__.py │ ├── loader.py │ ├── neuron.py │ ├── openvino.py │ ├── tensorizer.py │ ├── utils.py │ └── weight_utils.py ├── models │ ├── __init__.py │ ├── arctic.py │ ├── baichuan.py │ ├── bart.py │ ├── blip.py │ ├── blip2.py │ ├── bloom.py │ ├── chameleon.py │ ├── chatglm.py │ ├── clip.py │ ├── commandr.py │ ├── dbrx.py │ ├── decilm.py │ ├── deepseek.py │ ├── deepseek_v2.py │ ├── eagle.py │ ├── exaone.py │ ├── falcon.py │ ├── fuyu.py │ ├── gemma.py │ ├── gemma2.py │ ├── gemma2_embedding.py │ ├── glm4_vision_encoder.py │ ├── 
gpt2.py │ ├── gpt_bigcode.py │ ├── gpt_j.py │ ├── gpt_neox.py │ ├── granite.py │ ├── granitemoe.py │ ├── idefics2_vision_model.py │ ├── interfaces.py │ ├── interfaces_base.py │ ├── intern_vit.py │ ├── internlm2.py │ ├── internvl.py │ ├── jais.py │ ├── jamba.py │ ├── llama.py │ ├── llama_embedding.py │ ├── llava.py │ ├── llava_next.py │ ├── llava_next_video.py │ ├── llava_onevision.py │ ├── mamba.py │ ├── mamba_cache.py │ ├── medusa.py │ ├── minicpm.py │ ├── minicpm3.py │ ├── minicpmv.py │ ├── mixtral.py │ ├── mixtral_quant.py │ ├── mllama.py │ ├── mlp_speculator.py │ ├── module_mapping.py │ ├── molmo.py │ ├── mpt.py │ ├── nemotron.py │ ├── nvlm_d.py │ ├── olmo.py │ ├── olmoe.py │ ├── opt.py │ ├── orion.py │ ├── paligemma.py │ ├── persimmon.py │ ├── phi.py │ ├── phi3.py │ ├── phi3_small.py │ ├── phi3v.py │ ├── phimoe.py │ ├── pixtral.py │ ├── qwen.py │ ├── qwen2.py │ ├── qwen2_moe.py │ ├── qwen2_rm.py │ ├── qwen2_vl.py │ ├── registry.py │ ├── siglip.py │ ├── solar.py │ ├── stablelm.py │ ├── starcoder2.py │ ├── ultravox.py │ ├── utils.py │ └── xverse.py ├── parameter.py ├── pooling_metadata.py ├── sampling_metadata.py └── utils.py ├── multimodal ├── __init__.py ├── audio.py ├── base.py ├── image.py ├── registry.py ├── utils.py └── video.py ├── outputs.py ├── platforms ├── __init__.py ├── cpu.py ├── cuda.py ├── interface.py ├── rocm.py ├── tpu.py └── xpu.py ├── plugins └── __init__.py ├── pooling_params.py ├── prompt_adapter ├── __init__.py ├── layers.py ├── models.py ├── request.py ├── utils.py └── worker_manager.py ├── py.typed ├── sampling_params.py ├── scalar_type.py ├── scripts.py ├── sequence.py ├── spec_decode ├── __init__.py ├── batch_expansion.py ├── draft_model_runner.py ├── interfaces.py ├── medusa_worker.py ├── metrics.py ├── mlp_speculator_worker.py ├── mqa_scorer.py ├── multi_step_worker.py ├── ngram_worker.py ├── proposer_worker_base.py ├── smaller_tp_proposer_worker.py ├── spec_decode_worker.py ├── target_model_runner.py ├── top1_proposer.py └── util.py ├── tracing.py ├── transformers_utils ├── __init__.py ├── config.py ├── configs │ ├── __init__.py │ ├── arctic.py │ ├── chatglm.py │ ├── dbrx.py │ ├── eagle.py │ ├── exaone.py │ ├── falcon.py │ ├── internvl.py │ ├── jais.py │ ├── medusa.py │ ├── mllama.py │ ├── mlp_speculator.py │ ├── mpt.py │ ├── nemotron.py │ ├── nvlm_d.py │ ├── qwen2vl.py │ ├── solar.py │ └── ultravox.py ├── detokenizer.py ├── processor.py ├── tokenizer.py ├── tokenizer_group │ ├── __init__.py │ ├── base_tokenizer_group.py │ ├── ray_tokenizer_group.py │ └── tokenizer_group.py ├── tokenizers │ ├── __init__.py │ └── mistral.py └── utils.py ├── triton_utils ├── __init__.py ├── custom_cache_manager.py ├── importing.py └── libentry.py ├── usage ├── __init__.py └── usage_lib.py ├── utils.py ├── version.py ├── vllm_flash_attn └── .gitkeep └── worker ├── __init__.py ├── cache_engine.py ├── cpu_enc_dec_model_runner.py ├── cpu_model_runner.py ├── cpu_worker.py ├── embedding_model_runner.py ├── enc_dec_model_runner.py ├── model_runner.py ├── model_runner_base.py ├── multi_step_model_runner.py ├── multi_step_tpu_worker.py ├── multi_step_worker.py ├── neuron_model_runner.py ├── neuron_worker.py ├── openvino_model_runner.py ├── openvino_worker.py ├── tpu_model_runner.py ├── tpu_worker.py ├── utils.py ├── worker.py ├── worker_base.py ├── xpu_model_runner.py └── xpu_worker.py /.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml: -------------------------------------------------------------------------------- 1 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m 
deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2 2 | model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.671 8 | - name: "exact_match,flexible-extract" 9 | value: 0.664 10 | limit: 1000 11 | num_fewshot: 5 12 | trust_remote_code: True -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml: -------------------------------------------------------------------------------- 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 2 | model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.905 8 | - name: "exact_match,flexible-extract" 9 | value: 0.905 10 | limit: 1000 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml: -------------------------------------------------------------------------------- 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5 2 | model_name: "meta-llama/Meta-Llama-3-70B-Instruct" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.892 8 | - name: "exact_match,flexible-extract" 9 | value: 0.892 10 | limit: 250 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1 2 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.752 8 | - name: "exact_match,flexible-extract" 9 | value: 0.754 10 | limit: 1000 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml: -------------------------------------------------------------------------------- 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1 2 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.753 8 | - name: "exact_match,flexible-extract" 9 | value: 0.753 10 | limit: 1000 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1 2 | model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 
| - name: "exact_match,strict-match" 7 | value: 0.755 8 | - name: "exact_match,flexible-extract" 9 | value: 0.755 10 | limit: 1000 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml: -------------------------------------------------------------------------------- 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1 2 | model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.753 8 | - name: "exact_match,flexible-extract" 9 | value: 0.753 10 | limit: 1000 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml: -------------------------------------------------------------------------------- 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1 2 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.764 8 | - name: "exact_match,flexible-extract" 9 | value: 0.764 10 | limit: 250 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1 2 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.728 8 | - name: "exact_match,flexible-extract" 9 | value: 0.728 10 | limit: 250 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1 2 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.758 8 | - name: "exact_match,flexible-extract" 9 | value: 0.759 10 | limit: 1000 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml: -------------------------------------------------------------------------------- 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1 2 | model_name: "meta-llama/Meta-Llama-3-8B-Instruct" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.756 8 | - name: "exact_match,flexible-extract" 9 | value: 0.752 10 | limit: 250 11 | num_fewshot: 5 12 | 
-------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml: -------------------------------------------------------------------------------- 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1 2 | model_name: "HandH1998/QQQ-Llama-3-8b-g128" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.419 8 | - name: "exact_match,flexible-extract" 9 | value: 0.416 10 | limit: 1000 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml: -------------------------------------------------------------------------------- 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1 2 | model_name: "mgoin/Minitron-4B-Base-FP8" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.233 8 | - name: "exact_match,flexible-extract" 9 | value: 0.236 10 | limit: 1000 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml: -------------------------------------------------------------------------------- 1 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8 2 | model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.86 8 | - name: "exact_match,flexible-extract" 9 | value: 0.86 10 | limit: 250 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml: -------------------------------------------------------------------------------- 1 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4 2 | model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.624 8 | - name: "exact_match,flexible-extract" 9 | value: 0.624 10 | limit: 250 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml: -------------------------------------------------------------------------------- 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4 2 | model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.616 8 | - name: "exact_match,flexible-extract" 9 | value: 0.632 10 | limit: 250 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml: -------------------------------------------------------------------------------- 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1 2 | model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - 
name: "exact_match,strict-match" 7 | value: 0.578 8 | - name: "exact_match,flexible-extract" 9 | value: 0.585 10 | limit: 1000 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 2 | model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.593 8 | - name: "exact_match,flexible-extract" 9 | value: 0.588 10 | limit: 1000 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml: -------------------------------------------------------------------------------- 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1 2 | model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.595 8 | - name: "exact_match,flexible-extract" 9 | value: 0.582 10 | limit: 1000 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml: -------------------------------------------------------------------------------- 1 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4 2 | model_name: "Qwen/Qwen2-57B-A14B-Instruct" 3 | tasks: 4 | - name: "gsm8k" 5 | metrics: 6 | - name: "exact_match,strict-match" 7 | value: 0.792 8 | - name: "exact_match,flexible-extract" 9 | value: 0.824 10 | limit: 250 11 | num_fewshot: 5 12 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/models-large.txt: -------------------------------------------------------------------------------- 1 | Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml 2 | Meta-Llama-3-70B-Instruct.yaml 3 | Mixtral-8x7B-Instruct-v0.1.yaml 4 | Qwen2-57B-A14-Instruct.yaml 5 | DeepSeek-V2-Lite-Chat.yaml 6 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/configs/models-small.txt: -------------------------------------------------------------------------------- 1 | Meta-Llama-3-8B-Instruct.yaml 2 | Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml 3 | Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml 4 | Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml 5 | Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml 6 | Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml 7 | Minitron-4B-Base-FP8.yaml 8 | Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml 9 | Qwen2-1.5B-Instruct-FP8W8.yaml 10 | Meta-Llama-3-8B-QQQ.yaml 11 | -------------------------------------------------------------------------------- /.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # We can use this script to compute baseline accuracy on GSM for transformers. 
3 | # 4 | # Make sure you have lm-eval-harness installed: 5 | # pip install lm-eval==0.4.4 6 | 7 | usage() { 8 | echo`` 9 | echo "Runs lm eval harness on GSM8k using huggingface transformers." 10 | echo "This pathway is intended to be used to create baselines for " 11 | echo "our automated nm-test-accuracy workflow" 12 | echo 13 | echo "usage: ${0} " 14 | echo 15 | echo " -m - huggingface stub or local directory of the model" 16 | echo " -b - batch size to run the evaluation at" 17 | echo " -l - limit number of samples to run" 18 | echo " -f - number of fewshot samples to use" 19 | echo 20 | } 21 | 22 | while getopts "m:b:l:f:" OPT; do 23 | case ${OPT} in 24 | m ) 25 | MODEL="$OPTARG" 26 | ;; 27 | b ) 28 | BATCH_SIZE="$OPTARG" 29 | ;; 30 | l ) 31 | LIMIT="$OPTARG" 32 | ;; 33 | f ) 34 | FEWSHOT="$OPTARG" 35 | ;; 36 | \? ) 37 | usage 38 | exit 1 39 | ;; 40 | esac 41 | done 42 | 43 | lm_eval --model hf \ 44 | --model_args pretrained=$MODEL,parallelize=True \ 45 | --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \ 46 | --batch_size $BATCH_SIZE 47 | -------------------------------------------------------------------------------- /.buildkite/nightly-benchmarks/nightly-annotation.md: -------------------------------------------------------------------------------- 1 | 2 | ## Description 3 | 4 | This file contains the downloading link for benchmarking results. 5 | 6 | - [benchmarking pipeline](artifact://nightly-pipeline.yaml) 7 | - [benchmarking results](artifact://results.zip) 8 | - [benchmarking code](artifact://nightly-benchmarks.zip) 9 | 10 | Please download the visualization scripts in the post 11 | 12 | 13 | ## Results reproduction 14 | 15 | - Find the docker we use in `benchmarking pipeline` 16 | - Deploy the docker, and inside the docker: 17 | - Download `nightly-benchmarks.zip`. 18 | - In the same folder, run the following code 19 | ``` 20 | export HF_TOKEN= 21 | apt update 22 | apt install -y git 23 | unzip nightly-benchmarks.zip 24 | VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh 25 | ``` 26 | 27 | And the results will be inside `./benchmarks/results`. 
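Once the reproduction steps above finish, a quick way to see what landed in `./benchmarks/results` is to walk the directory and summarize each JSON file. A rough sketch follows; it assumes JSON outputs (which the `convert-results-json-to-markdown.py` script elsewhere in the tree suggests) and ignores any other artifact types:

```python
# Rough inspection helper, not part of the benchmarking pipeline itself.
# Assumes the run left JSON result files under ./benchmarks/results.
import json
from pathlib import Path

results_dir = Path("./benchmarks/results")
for path in sorted(results_dir.glob("*.json")):
    try:
        data = json.loads(path.read_text())
    except json.JSONDecodeError:
        print(f"{path.name}: not valid JSON, skipping")
        continue
    # Show the top-level keys (or list length) as a quick sanity check.
    summary = list(data)[:8] if isinstance(data, dict) else f"list of {len(data)} entries"
    print(f"{path.name}: {summary}")
```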
28 | 29 | -------------------------------------------------------------------------------- /.buildkite/nightly-benchmarks/scripts/download-tokenizer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from transformers import AutoTokenizer 4 | 5 | 6 | def main(model, cachedir): 7 | # Load the tokenizer and save it to the specified directory 8 | tokenizer = AutoTokenizer.from_pretrained(model) 9 | tokenizer.save_pretrained(cachedir) 10 | print(f"Tokenizer saved to {cachedir}") 11 | 12 | 13 | if __name__ == "__main__": 14 | parser = argparse.ArgumentParser( 15 | description="Download and save Hugging Face tokenizer") 16 | parser.add_argument("--model", 17 | type=str, 18 | required=True, 19 | help="Name of the model") 20 | parser.add_argument("--cachedir", 21 | type=str, 22 | required=True, 23 | help="Directory to save the tokenizer") 24 | 25 | args = parser.parse_args() 26 | main(args.model, args.cachedir) 27 | -------------------------------------------------------------------------------- /.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py: -------------------------------------------------------------------------------- 1 | from lmdeploy.serve.openai.api_client import APIClient 2 | 3 | api_client = APIClient("http://localhost:8000") 4 | model_name = api_client.available_models[0] 5 | 6 | print(model_name) 7 | -------------------------------------------------------------------------------- /.buildkite/nightly-benchmarks/scripts/wait-for-image.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token) 3 | URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT" 4 | 5 | TIMEOUT_SECONDS=10 6 | 7 | retries=0 8 | while [ $retries -lt 1000 ]; do 9 | if [ $(curl -s --max-time $TIMEOUT_SECONDS -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then 10 | exit 0 11 | fi 12 | 13 | echo "Waiting for image to be available..." 
14 | 15 | retries=$((retries + 1)) 16 | sleep 5 17 | done 18 | 19 | exit 1 -------------------------------------------------------------------------------- /.buildkite/nightly-benchmarks/tests/latency-tests.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "test_name": "latency_llama8B_tp1", 4 | "parameters": { 5 | "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", 6 | "tensor_parallel_size": 1, 7 | "load_format": "dummy", 8 | "num_iters_warmup": 5, 9 | "num_iters": 15 10 | } 11 | }, 12 | { 13 | "test_name": "latency_llama70B_tp4", 14 | "parameters": { 15 | "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", 16 | "tensor_parallel_size": 4, 17 | "load_format": "dummy", 18 | "num-iters-warmup": 5, 19 | "num-iters": 15 20 | } 21 | }, 22 | { 23 | "test_name": "latency_mixtral8x7B_tp2", 24 | "parameters": { 25 | "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", 26 | "tensor_parallel_size": 2, 27 | "load_format": "dummy", 28 | "num-iters-warmup": 5, 29 | "num-iters": 15 30 | } 31 | } 32 | ] -------------------------------------------------------------------------------- /.buildkite/nightly-benchmarks/tests/throughput-tests.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "test_name": "throughput_llama8B_tp1", 4 | "parameters": { 5 | "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", 6 | "tensor_parallel_size": 1, 7 | "load_format": "dummy", 8 | "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", 9 | "num_prompts": 200, 10 | "backend": "vllm" 11 | } 12 | }, 13 | { 14 | "test_name": "throughput_llama70B_tp4", 15 | "parameters": { 16 | "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", 17 | "tensor_parallel_size": 4, 18 | "load_format": "dummy", 19 | "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", 20 | "num_prompts": 200, 21 | "backend": "vllm" 22 | } 23 | }, 24 | { 25 | "test_name": "throughput_mixtral8x7B_tp2", 26 | "parameters": { 27 | "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", 28 | "tensor_parallel_size": 2, 29 | "load_format": "dummy", 30 | "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", 31 | "num_prompts": 200, 32 | "backend": "vllm" 33 | } 34 | } 35 | ] -------------------------------------------------------------------------------- /.buildkite/run-openvino-test.sh: -------------------------------------------------------------------------------- 1 | # This script build the OpenVINO docker image and run the offline inference inside the container. 2 | # It serves a sanity check for compilation and basic model usage. 3 | set -ex 4 | 5 | # Try building the docker image 6 | docker build -t openvino-test -f Dockerfile.openvino . 7 | 8 | # Setup cleanup 9 | remove_docker_container() { docker rm -f openvino-test || true; } 10 | trap remove_docker_container EXIT 11 | remove_docker_container 12 | 13 | # Run the image and launch offline inference 14 | docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py 15 | -------------------------------------------------------------------------------- /.buildkite/run-tpu-test.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | # Build the docker image. 4 | docker build -f Dockerfile.tpu -t vllm-tpu . 5 | 6 | # Set up cleanup. 
7 | remove_docker_container() { docker rm -f tpu-test || true; } 8 | trap remove_docker_container EXIT 9 | # Remove the container that might not be cleaned up in the previous run. 10 | remove_docker_container 11 | 12 | # For HF_TOKEN. 13 | source /etc/environment 14 | # Run a simple end-to-end example. 15 | docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py" 16 | -------------------------------------------------------------------------------- /.buildkite/run-xpu-test.sh: -------------------------------------------------------------------------------- 1 | # This script build the CPU docker image and run the offline inference inside the container. 2 | # It serves a sanity check for compilation and basic model usage. 3 | set -ex 4 | 5 | # Try building the docker image 6 | docker build -t xpu-test -f Dockerfile.xpu . 7 | 8 | # Setup cleanup 9 | remove_docker_container() { docker rm -f xpu-test || true; } 10 | trap remove_docker_container EXIT 11 | remove_docker_container 12 | 13 | # Run the image and launch offline inference 14 | docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py 15 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | UseTab: Never 3 | IndentWidth: 2 4 | ColumnLimit: 80 5 | 6 | # Force pointers to the type for C++. 
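# (i.e. `int* ptr` rather than `int *ptr`)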
7 | DerivePointerAlignment: false 8 | PointerAlignment: Left 9 | 10 | # Reordering #include statements can (and currently will) introduce errors 11 | SortIncludes: false 12 | 13 | # Style choices 14 | AlignConsecutiveAssignments: false 15 | AlignConsecutiveDeclarations: false 16 | IndentPPDirectives: BeforeHash 17 | 18 | IncludeCategories: 19 | - Regex: '^<' 20 | Priority: 4 21 | - Regex: '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/' 22 | Priority: 3 23 | - Regex: '^"(qoda|\.\.)/' 24 | Priority: 2 25 | - Regex: '.*' 26 | Priority: 1 27 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | /.github/ 2 | /.venv 3 | /build 4 | dist 5 | vllm/*.so 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | .mypy_cache 13 | 14 | # Distribution / packaging 15 | .Python 16 | /build/ 17 | cmake-build-*/ 18 | CMakeUserPresets.json 19 | develop-eggs/ 20 | /dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [vllm-project] 2 | open_collective: [vllm] 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/100-documentation.yml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation 2 | description: Report an issue related to https://docs.vllm.ai/ 3 | title: "[Doc]: " 4 | labels: ["documentation"] 5 | 6 | body: 7 | - type: textarea 8 | attributes: 9 | label: 📚 The doc issue 10 | description: > 11 | A clear and concise description of what content in https://docs.vllm.ai/ is an issue. 12 | validations: 13 | required: true 14 | - type: textarea 15 | attributes: 16 | label: Suggest a potential alternative/fix 17 | description: > 18 | Tell us how we could improve the documentation in this regard. 19 | - type: markdown 20 | attributes: 21 | value: > 22 | Thanks for contributing 🎉! 23 | - type: checkboxes 24 | id: askllm 25 | attributes: 26 | label: Before submitting a new issue... 27 | options: 28 | - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. 29 | required: true 30 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/800-misc discussion.yml: -------------------------------------------------------------------------------- 1 | name: 🎲 Misc/random discussions that do not fit into the above categories. 2 | description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues. 3 | title: "[Misc]: " 4 | labels: ["misc"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 
11 | - type: textarea 12 | attributes: 13 | label: Anything you want to discuss about vllm. 14 | description: > 15 | Anything you want to discuss about vllm. 16 | validations: 17 | required: true 18 | - type: markdown 19 | attributes: 20 | value: > 21 | Thanks for contributing 🎉! 22 | - type: checkboxes 23 | id: askllm 24 | attributes: 25 | label: Before submitting a new issue... 26 | options: 27 | - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. 28 | required: true 29 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | # Maintain dependencies for GitHub Actions 4 | - package-ecosystem: "github-actions" 5 | directory: "/" 6 | schedule: 7 | interval: "weekly" 8 | -------------------------------------------------------------------------------- /.github/workflows/actionlint.yml: -------------------------------------------------------------------------------- 1 | name: Lint GitHub Actions workflows 2 | on: 3 | push: 4 | branches: 5 | - "main" 6 | paths: 7 | - '.github/workflows/*.ya?ml' 8 | - '.github/workflows/actionlint.*' 9 | pull_request: 10 | branches: 11 | - "main" 12 | paths: 13 | - '.github/workflows/*.ya?ml' 14 | - '.github/workflows/actionlint.*' 15 | 16 | env: 17 | LC_ALL: en_US.UTF-8 18 | 19 | defaults: 20 | run: 21 | shell: bash 22 | 23 | permissions: 24 | contents: read 25 | 26 | jobs: 27 | actionlint: 28 | runs-on: ubuntu-latest 29 | steps: 30 | - name: "Checkout" 31 | uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 32 | with: 33 | fetch-depth: 0 34 | 35 | - name: "Run actionlint" 36 | run: | 37 | tools/actionlint.sh -color 38 | -------------------------------------------------------------------------------- /.github/workflows/add_label_automerge.yml: -------------------------------------------------------------------------------- 1 | name: Add label on auto-merge enabled 2 | on: 3 | pull_request_target: 4 | types: 5 | - auto_merge_enabled 6 | jobs: 7 | add-label-on-auto-merge: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Add label 11 | uses: actions/github-script@v7 12 | with: 13 | script: | 14 | github.rest.issues.addLabels({ 15 | owner: context.repo.owner, 16 | repo: context.repo.repo, 17 | issue_number: context.issue.number, 18 | labels: ['ready'] 19 | }) 20 | env: 21 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 22 | -------------------------------------------------------------------------------- /.github/workflows/clang-format.yml: -------------------------------------------------------------------------------- 1 | name: clang-format 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - main 9 | pull_request: 10 | branches: 11 | - main 12 | 13 | jobs: 14 | clang-format: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: ["3.11"] 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v5 23 | 
with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install clang-format==18.1.5 29 | - name: Running clang-format 30 | run: | 31 | EXCLUDES=( 32 | 'csrc/moe/topk_softmax_kernels.cu' 33 | 'csrc/quantization/gguf/ggml-common.h' 34 | 'csrc/quantization/gguf/dequantize.cuh' 35 | 'csrc/quantization/gguf/vecdotq.cuh' 36 | 'csrc/quantization/gguf/mmq.cuh' 37 | 'csrc/quantization/gguf/mmvq.cuh' 38 | ) 39 | find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \ 40 | | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \ 41 | | xargs clang-format --dry-run --Werror -------------------------------------------------------------------------------- /.github/workflows/matchers/actionlint.json: -------------------------------------------------------------------------------- 1 | { 2 | "problemMatcher": [ 3 | { 4 | "owner": "actionlint", 5 | "pattern": [ 6 | { 7 | "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$", 8 | "file": 1, 9 | "line": 2, 10 | "column": 3, 11 | "message": 4, 12 | "code": 5 13 | } 14 | ] 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /.github/workflows/mypy.yaml: -------------------------------------------------------------------------------- 1 | name: mypy 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - main 9 | pull_request: 10 | branches: 11 | - main 12 | 13 | jobs: 14 | mypy: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install mypy==1.11.1 29 | pip install types-setuptools 30 | pip install types-PyYAML 31 | pip install types-requests 32 | pip install types-setuptools 33 | - name: Mypy 34 | run: | 35 | tools/mypy.sh 36 | -------------------------------------------------------------------------------- /.github/workflows/reminder_comment.yml: -------------------------------------------------------------------------------- 1 | name: PR Reminder Comment Bot 2 | on: 3 | pull_request_target: 4 | types: [opened] 5 | 6 | jobs: 7 | pr_reminder: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Remind to run full CI on PR 11 | uses: actions/github-script@v7 12 | with: 13 | script: | 14 | github.rest.issues.createComment({ 15 | owner: context.repo.owner, 16 | repo: context.repo.repo, 17 | issue_number: context.issue.number, 18 | body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. 
\n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀' 19 | }) 20 | env: 21 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 22 | -------------------------------------------------------------------------------- /.github/workflows/ruff.yml: -------------------------------------------------------------------------------- 1 | name: ruff 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - main 9 | pull_request: 10 | branches: 11 | - main 12 | 13 | jobs: 14 | ruff: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install -r requirements-lint.txt 29 | - name: Analysing the code with ruff 30 | run: | 31 | ruff check . 32 | - name: Spelling check with codespell 33 | run: | 34 | codespell --toml pyproject.toml 35 | - name: Run isort 36 | run: | 37 | isort . --check-only 38 | -------------------------------------------------------------------------------- /.github/workflows/scripts/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python_executable=python$1 4 | cuda_home=/usr/local/cuda-$2 5 | 6 | # Update paths 7 | PATH=${cuda_home}/bin:$PATH 8 | LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH 9 | 10 | # Install requirements 11 | $python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt 12 | 13 | # Limit the number of parallel jobs to avoid OOM 14 | export MAX_JOBS=1 15 | # Make sure release wheels are built for the following architectures 16 | export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" 17 | export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real" 18 | # Build 19 | $python_executable setup.py bdist_wheel --dist-dir=dist 20 | -------------------------------------------------------------------------------- /.github/workflows/scripts/create_release.js: -------------------------------------------------------------------------------- 1 | // Uses Github's API to create the release and wait for result. 2 | // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately. 3 | 4 | module.exports = async (github, context, core) => { 5 | try { 6 | const response = await github.rest.repos.createRelease({ 7 | draft: false, 8 | generate_release_notes: true, 9 | name: process.env.RELEASE_TAG, 10 | owner: context.repo.owner, 11 | prerelease: true, 12 | repo: context.repo.repo, 13 | tag_name: process.env.RELEASE_TAG, 14 | }); 15 | 16 | core.setOutput('upload_url', response.data.upload_url); 17 | } catch (error) { 18 | core.setFailed(error.message); 19 | } 20 | } -------------------------------------------------------------------------------- /.github/workflows/scripts/cuda-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Replace '.' with '-' ex: 11.8 -> 11-8 4 | cuda_version=$(echo $1 | tr "." "-") 5 | # Removes '-' and '.' 
ex: ubuntu-20.04 -> ubuntu2004 6 | OS=$(echo $2 | tr -d ".\-") 7 | 8 | # Installs CUDA 9 | wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb 10 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 11 | rm cuda-keyring_1.1-1_all.deb 12 | sudo apt -qq update 13 | sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version} 14 | sudo apt clean 15 | 16 | # Test nvcc 17 | PATH=/usr/local/cuda-$1/bin:${PATH} 18 | nvcc --version 19 | 20 | # Log gcc, g++, c++ versions 21 | gcc --version 22 | g++ --version 23 | c++ --version 24 | -------------------------------------------------------------------------------- /.github/workflows/scripts/env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This file installs common linux environment tools 4 | 5 | export LANG C.UTF-8 6 | 7 | # python_version=$1 8 | 9 | sudo apt-get update && \ 10 | sudo apt-get install -y --no-install-recommends \ 11 | software-properties-common \ 12 | 13 | sudo apt-get install -y --no-install-recommends \ 14 | build-essential \ 15 | apt-utils \ 16 | ca-certificates \ 17 | wget \ 18 | git \ 19 | vim \ 20 | libssl-dev \ 21 | curl \ 22 | unzip \ 23 | unrar \ 24 | cmake \ 25 | net-tools \ 26 | sudo \ 27 | autotools-dev \ 28 | rsync \ 29 | jq \ 30 | openssh-server \ 31 | tmux \ 32 | screen \ 33 | htop \ 34 | pdsh \ 35 | openssh-client \ 36 | lshw \ 37 | dmidecode \ 38 | util-linux \ 39 | automake \ 40 | autoconf \ 41 | libtool \ 42 | net-tools \ 43 | pciutils \ 44 | libpci-dev \ 45 | libaio-dev \ 46 | libcap2 \ 47 | libtinfo5 \ 48 | fakeroot \ 49 | devscripts \ 50 | debhelper \ 51 | nfs-common 52 | 53 | # Remove github bloat files to free up disk space 54 | sudo rm -rf "/usr/local/share/boost" 55 | sudo rm -rf "$AGENT_TOOLSDIRECTORY" 56 | sudo rm -rf "/usr/share/dotnet" 57 | -------------------------------------------------------------------------------- /.github/workflows/scripts/pytorch-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python_executable=python$1 4 | pytorch_version=$2 5 | cuda_version=$3 6 | 7 | # Install torch 8 | $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya 9 | $python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./} 10 | 11 | # Print version information 12 | $python_executable --version 13 | $python_executable -c "import torch; print('PyTorch:', torch.__version__)" 14 | $python_executable -c "import torch; print('CUDA:', torch.version.cuda)" 15 | $python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)" 16 | -------------------------------------------------------------------------------- /.github/workflows/yapf.yml: -------------------------------------------------------------------------------- 1 | name: yapf 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - main 9 | pull_request: 10 | branches: 11 | - main 12 | jobs: 13 | yapf: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 18 | steps: 19 | - uses: actions/checkout@v4 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v5 22 
| with: 23 | python-version: ${{ matrix.python-version }} 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install yapf==0.32.0 28 | pip install toml==0.10.2 29 | - name: Running yapf 30 | run: | 31 | yapf --diff --recursive . 32 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | 6 | build: 7 | os: ubuntu-22.04 8 | tools: 9 | python: "3.8" 10 | 11 | sphinx: 12 | configuration: docs/source/conf.py 13 | fail_on_warning: true 14 | 15 | # If using Sphinx, optionally build your docs in additional formats such as PDF 16 | formats: [] 17 | 18 | # Optionally declare the Python requirements required to build your docs 19 | python: 20 | install: 21 | - requirements: docs/requirements-docs.txt 22 | 23 | -------------------------------------------------------------------------------- /.yapfignore: -------------------------------------------------------------------------------- 1 | collect_env.py 2 | -------------------------------------------------------------------------------- /Dockerfile.openvino: -------------------------------------------------------------------------------- 1 | # The vLLM Dockerfile is used to construct vLLM image that can be directly used 2 | # to run the OpenAI compatible server. 3 | 4 | FROM ubuntu:22.04 AS dev 5 | 6 | RUN apt-get update -y && \ 7 | apt-get install -y \ 8 | git python3-pip \ 9 | ffmpeg libsm6 libxext6 libgl1 10 | WORKDIR /workspace 11 | 12 | COPY . . 13 | 14 | # install build requirements 15 | RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt 16 | # build vLLM with OpenVINO backend 17 | RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/ 18 | 19 | COPY examples/ /workspace/vllm/examples 20 | COPY benchmarks/ /workspace/vllm/benchmarks 21 | 22 | CMD ["/bin/bash"] 23 | -------------------------------------------------------------------------------- /Dockerfile.tpu: -------------------------------------------------------------------------------- 1 | ARG NIGHTLY_DATE="20240828" 2 | ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE" 3 | 4 | FROM $BASE_IMAGE 5 | WORKDIR /workspace 6 | 7 | # Install some basic utilities 8 | RUN apt-get update && apt-get install -y \ 9 | git \ 10 | ffmpeg libsm6 libxext6 libgl1 11 | 12 | # Install the TPU and Pallas dependencies. 13 | RUN --mount=type=cache,target=/root/.cache/pip \ 14 | python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html 15 | RUN --mount=type=cache,target=/root/.cache/pip \ 16 | python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html 17 | 18 | # Build vLLM. 19 | COPY . 
/workspace/vllm 20 | ENV VLLM_TARGET_DEVICE="tpu" 21 | RUN --mount=type=cache,target=/root/.cache/pip \ 22 | --mount=type=bind,source=.git,target=.git \ 23 | cd /workspace/vllm && \ 24 | python3 -m pip install \ 25 | cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \ 26 | -r requirements-tpu.txt 27 | RUN cd /workspace/vllm && python3 setup.py develop 28 | 29 | CMD ["/bin/bash"] 30 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include requirements-common.txt 3 | include requirements-cuda.txt 4 | include requirements-rocm.txt 5 | include requirements-neuron.txt 6 | include requirements-cpu.txt 7 | include CMakeLists.txt 8 | 9 | recursive-include cmake * 10 | recursive-include csrc * 11 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Reporting a Vulnerability 4 | 5 | If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. 6 | 7 | Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). 8 | 9 | --- 10 | 11 | Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models. 12 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking vLLM 2 | 3 | ## Downloading the ShareGPT dataset 4 | 5 | You can download the dataset by running: 6 | ```bash 7 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json 8 | ``` 9 | -------------------------------------------------------------------------------- /benchmarks/cutlass_benchmarks/weight_shapes.py: -------------------------------------------------------------------------------- 1 | # Weight Shapes are in the format 2 | # ([K, N], TP_SPLIT_DIM) 3 | # Example: 4 | # A shape of ([14336, 4096], 0) indicates the following GEMM shape, 5 | # - TP1 : K = 14336, N = 4096 6 | # - TP2 : K = 7168, N = 4096 7 | # A shape of ([4096, 6144], 1) indicates the following GEMM shape, 8 | # - TP1 : K = 4096, N = 6144 9 | # - TP4 : K = 4096, N = 1536 10 | 11 | # TP1 shapes 12 | WEIGHT_SHAPES = { 13 | "mistralai/Mistral-7B-v0.1": [ 14 | ([4096, 6144], 1), 15 | ([4096, 4096], 0), 16 | ([4096, 28672], 1), 17 | ([14336, 4096], 0), 18 | ], 19 | "meta-llama/Llama-2-7b-hf": [ 20 | ([4096, 12288], 1), 21 | ([4096, 4096], 0), 22 | ([4096, 22016], 1), 23 | ([11008, 4096], 0), 24 | ], 25 | "meta-llama/Llama-3-8b": [ 26 | ([4096, 6144], 1), 27 | ([4096, 4096], 0), 28 | ([4096, 28672], 1), 29 | ([14336, 4096], 0), 30 | ], 31 | "meta-llama/Llama-2-13b-hf": [ 32 | ([5120, 15360], 1), 33 | ([5120, 5120], 0), 34 | ([5120, 27648], 1), 35 | ([13824, 5120], 0), 36 | ], 37 | "meta-llama/Llama-2-70b-hf": [ 38 | ([8192, 10240], 1), 39 | ([8192, 8192], 0), 40 | ([8192, 57344], 1), 41 | ([28672, 8192], 0), 42 | ], 43 | } 44 | -------------------------------------------------------------------------------- 
/benchmarks/kernels/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas -------------------------------------------------------------------------------- /benchmarks/kernels/weight_shapes.py: -------------------------------------------------------------------------------- 1 | # Weight Shapes are in the format 2 | # ([K, N], TP_SPLIT_DIM) 3 | # Example: 4 | # A shape of ([14336, 4096], 0) indicates the following GEMM shape, 5 | # - TP1 : K = 14336, N = 4096 6 | # - TP2 : K = 7168, N = 4096 7 | # A shape of ([4096, 6144], 1) indicates the following GEMM shape, 8 | # - TP1 : K = 4096, N = 6144 9 | # - TP4 : K = 4096, N = 1536 10 | 11 | # TP1 shapes 12 | WEIGHT_SHAPES = { 13 | "mistralai/Mistral-7B-v0.1": [ 14 | ([4096, 6144], 1), 15 | ([4096, 4096], 0), 16 | ([4096, 28672], 1), 17 | ([14336, 4096], 0), 18 | ], 19 | "meta-llama/Llama-2-7b-hf": [ 20 | ([4096, 12288], 1), 21 | ([4096, 4096], 0), 22 | ([4096, 22016], 1), 23 | ([11008, 4096], 0), 24 | ], 25 | "meta-llama/Llama-3-8b": [ 26 | ([4096, 6144], 1), 27 | ([4096, 4096], 0), 28 | ([4096, 28672], 1), 29 | ([14336, 4096], 0), 30 | ], 31 | "meta-llama/Llama-2-13b-hf": [ 32 | ([5120, 15360], 1), 33 | ([5120, 5120], 0), 34 | ([5120, 27648], 1), 35 | ([13824, 5120], 0), 36 | ], 37 | "meta-llama/Llama-2-70b-hf": [ 38 | ([8192, 10240], 1), 39 | ([8192, 8192], 0), 40 | ([8192, 57344], 1), 41 | ([28672, 8192], 0), 42 | ], 43 | } 44 | -------------------------------------------------------------------------------- /benchmarks/launch_tgi_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PORT=8000 4 | MODEL=$1 5 | TOKENS=$2 6 | 7 | docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \ 8 | -v $PWD/data:/data \ 9 | ghcr.io/huggingface/text-generation-inference:2.2.0 \ 10 | --model-id $MODEL \ 11 | --sharded false \ 12 | --max-input-length 1024 \ 13 | --max-total-tokens 2048 \ 14 | --max-best-of 5 \ 15 | --max-concurrent-requests 5000 \ 16 | --max-batch-total-tokens $TOKENS 17 | -------------------------------------------------------------------------------- /csrc/attention/attention_dtypes.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.cuh" 4 | #include "dtype_float16.cuh" 5 | #include "dtype_float32.cuh" 6 | #include "dtype_bfloat16.cuh" 7 | #include "dtype_fp8.cuh" 8 | -------------------------------------------------------------------------------- /csrc/attention/dtype_fp8.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.cuh" 4 | 5 | #include 6 | #ifdef ENABLE_FP8 7 | #ifndef USE_ROCM 8 | #include 9 | #endif // USE_ROCM 10 | #endif // ENABLE_FP8 11 | 12 | namespace vllm { 13 | 14 | enum class Fp8KVCacheDataType { 15 | kAuto = 0, 16 | kFp8E4M3 = 1, 17 | kFp8E5M2 = 2, 18 | }; 19 | 20 | // fp8 vector types for quantization of kv cache 21 | template <> 22 | struct Vec { 23 | using Type = uint8_t; 24 | }; 25 | 26 | template <> 27 | struct Vec { 28 | using Type = uint16_t; 29 | }; 30 | 31 | template <> 32 | struct Vec { 33 | using Type = uint32_t; 34 | }; 35 | 36 | template <> 37 | struct Vec { 38 | using Type = uint2; 39 | }; 40 | 41 | } // namespace vllm 42 | -------------------------------------------------------------------------------- /csrc/core/exception.hpp: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define VLLM_IMPLIES(p, q) (!(p) || (q)) 4 | -------------------------------------------------------------------------------- /csrc/core/registration.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #define _CONCAT(A, B) A##B 6 | #define CONCAT(A, B) _CONCAT(A, B) 7 | 8 | #define _STRINGIFY(A) #A 9 | #define STRINGIFY(A) _STRINGIFY(A) 10 | 11 | // A version of the TORCH_LIBRARY macro that expands the NAME, i.e. so NAME 12 | // could be a macro instead of a literal token. 13 | #define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE) 14 | 15 | // A version of the TORCH_LIBRARY_IMPL macro that expands the NAME, i.e. so NAME 16 | // could be a macro instead of a literal token. 17 | #define TORCH_LIBRARY_IMPL_EXPAND(NAME, DEVICE, MODULE) \ 18 | TORCH_LIBRARY_IMPL(NAME, DEVICE, MODULE) 19 | 20 | // REGISTER_EXTENSION allows the shared library to be loaded and initialized 21 | // via python's import statement. 22 | #define REGISTER_EXTENSION(NAME) \ 23 | PyMODINIT_FUNC CONCAT(PyInit_, NAME)() { \ 24 | static struct PyModuleDef module = {PyModuleDef_HEAD_INIT, \ 25 | STRINGIFY(NAME), nullptr, 0, nullptr}; \ 26 | return PyModule_Create(&module); \ 27 | } 28 | -------------------------------------------------------------------------------- /csrc/core/torch_bindings.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "scalar_type.hpp" 4 | #include "registration.h" 5 | 6 | // Note the CORE exstension will be built for (almost) all hardware targets so 7 | // new additions must account for this. (currently not built for TPU and Neuron) 8 | 9 | TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, lib) { 10 | // ScalarType, a custom class for representing data types that supports 11 | // quantized types, declared here so it can be used when creating interfaces 12 | // for custom ops. 
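// (Exposed on the Python side as torch.classes._core_C.ScalarType; the MoE bindings further below refer to it by that name.)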
13 | vllm::ScalarTypeTorch::bind_class(lib); 14 | } 15 | 16 | REGISTER_EXTENSION(TORCH_EXTENSION_NAME) 17 | -------------------------------------------------------------------------------- /csrc/cpu/cpu_types.hpp: -------------------------------------------------------------------------------- 1 | 2 | #ifndef CPU_TYPES_HPP 3 | #define CPU_TYPES_HPP 4 | 5 | #if defined(__x86_64__) 6 | //x86 implementation 7 | #include "cpu_types_x86.hpp" 8 | #elif defined(__POWER9_VECTOR__) 9 | //ppc implementation 10 | #include "cpu_types_vsx.hpp" 11 | #else 12 | #warning "unsupported vLLM cpu implementation" 13 | #endif 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /csrc/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if defined(__CUDACC__) || defined(_NVHPC_CUDA) 4 | #define HOST_DEVICE_INLINE __forceinline__ __host__ __device__ 5 | #define DEVICE_INLINE __forceinline__ __device__ 6 | #define HOST_INLINE __forceinline__ __host__ 7 | #else 8 | #define HOST_DEVICE_INLINE inline 9 | #define DEVICE_INLINE inline 10 | #define HOST_INLINE inline 11 | #endif 12 | 13 | int64_t get_device_attribute(int64_t attribute, int64_t device_id); 14 | 15 | int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id); 16 | -------------------------------------------------------------------------------- /csrc/cuda_utils_kernels.cu: -------------------------------------------------------------------------------- 1 | #ifdef USE_ROCM 2 | #include 3 | #include 4 | #endif 5 | int64_t get_device_attribute(int64_t attribute, int64_t device_id) { 6 | int device, value; 7 | if (device_id < 0) { 8 | cudaGetDevice(&device); 9 | } else { 10 | device = device_id; 11 | } 12 | cudaDeviceGetAttribute(&value, static_cast(attribute), 13 | device); 14 | return value; 15 | } 16 | 17 | int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id) { 18 | int64_t attribute; 19 | // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html 20 | // cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74 21 | 22 | #ifdef USE_ROCM 23 | attribute = hipDeviceAttributeMaxSharedMemoryPerBlock; 24 | #else 25 | attribute = cudaDevAttrMaxSharedMemoryPerBlockOptin; 26 | #endif 27 | 28 | return get_device_attribute(attribute, device_id); 29 | } 30 | -------------------------------------------------------------------------------- /csrc/mamba/mamba_ssm/static_switch.h: -------------------------------------------------------------------------------- 1 | // Inspired by 2 | // https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h 3 | // and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h 4 | 5 | // clang-format off 6 | // adapted from https://github.com/state-spaces/mamba/blob/main/csrc/selective_scan/static_switch.h 7 | #pragma once 8 | 9 | /// @param COND - a boolean expression to switch by 10 | /// @param CONST_NAME - a name given for the constexpr bool variable. 11 | /// @param ... - code to execute for true and false 12 | /// 13 | /// Usage: 14 | /// ``` 15 | /// BOOL_SWITCH(flag, BoolConst, [&] { 16 | /// some_function(...); 17 | /// }); 18 | /// ``` 19 | #define BOOL_SWITCH(COND, CONST_NAME, ...) 
\ 20 | [&] { \ 21 | if (COND) { \ 22 | constexpr bool CONST_NAME = true; \ 23 | return __VA_ARGS__(); \ 24 | } else { \ 25 | constexpr bool CONST_NAME = false; \ 26 | return __VA_ARGS__(); \ 27 | } \ 28 | }() 29 | -------------------------------------------------------------------------------- /csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu: -------------------------------------------------------------------------------- 1 | #include "marlin_moe_kernel_ku4.h" 2 | 3 | namespace marlin_moe { 4 | 5 | // We return bool so we can create these different kernel calls as a sequence 6 | // of if-elseif's. 7 | bool call_marlin_moe_kernel_ku4( 8 | vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, 9 | bool has_act_order, int group_blocks, int num_threads, int blocks, 10 | int max_shared_mem, cudaStream_t stream, const int4* A_ptr, 11 | const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, 12 | const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, 13 | const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, 14 | int expert_idx, int num_experts, int topk, int prob_m, int prob_n, 15 | int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, 16 | int m_block, int max_par, int cfg_max_m_blocks) { 17 | bool has_zp = true; 18 | 19 | if (false) { 20 | } 21 | AWQ_CALL_IF_MOE(vllm::kU4, 16, 4, 256) 22 | AWQ_CALL_IF_MOE(vllm::kU4, 8, 8, 256) 23 | AWQ_CALL_IF_MOE(vllm::kU4, 8, 4, 128) 24 | AWQ_CALL_IF_MOE(vllm::kU4, 4, 8, 128) 25 | else { 26 | return false; 27 | } 28 | return true; 29 | } 30 | 31 | } // namespace marlin_moe 32 | -------------------------------------------------------------------------------- /csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "marlin_moe_kernel.h" 4 | 5 | namespace marlin_moe { 6 | 7 | // We return bool so we can create these different kernel calls as a sequence 8 | // of if-elseif's. 9 | bool call_marlin_moe_kernel_ku4( 10 | vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, 11 | bool has_act_order, int group_blocks, int num_threads, int blocks, 12 | int max_shared_mem, cudaStream_t stream, const int4* A_ptr, 13 | const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, 14 | const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, 15 | const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, 16 | int expert_idx, int num_experts, int topk, int prob_m, int prob_n, 17 | int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, 18 | int m_block, int max_par, int cfg_max_m_blocks); 19 | 20 | } // namespace marlin_moe 21 | -------------------------------------------------------------------------------- /csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu: -------------------------------------------------------------------------------- 1 | #include "marlin_moe_kernel_ku4b8.h" 2 | 3 | namespace marlin_moe { 4 | 5 | // We return bool so we can create these different kernel calls as a sequence 6 | // of if-elseif's. 
7 | bool call_marlin_moe_kernel_ku4b8( 8 | vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, 9 | bool has_act_order, int group_blocks, int num_threads, int blocks, 10 | int max_shared_mem, cudaStream_t stream, const int4* A_ptr, 11 | const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, 12 | const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, 13 | const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, 14 | int expert_idx, int num_experts, int topk, int prob_m, int prob_n, 15 | int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, 16 | int m_block, int max_par, int cfg_max_m_blocks) { 17 | bool has_zp = false; 18 | 19 | if (false) { 20 | } 21 | GPTQ_CALL_IF_MOE(vllm::kU4B8, 16, 4, 256) 22 | GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 8, 256) 23 | GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 4, 128) 24 | GPTQ_CALL_IF_MOE(vllm::kU4B8, 4, 8, 128) 25 | else { 26 | return false; 27 | } 28 | return true; 29 | } 30 | 31 | } // namespace marlin_moe 32 | -------------------------------------------------------------------------------- /csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "marlin_moe_kernel.h" 4 | 5 | namespace marlin_moe { 6 | 7 | // We return bool so we can create these different kernel calls as a sequence 8 | // of if-elseif's. 9 | bool call_marlin_moe_kernel_ku4b8( 10 | vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, 11 | bool has_act_order, int group_blocks, int num_threads, int blocks, 12 | int max_shared_mem, cudaStream_t stream, const int4* A_ptr, 13 | const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, 14 | const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, 15 | const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, 16 | int expert_idx, int num_experts, int topk, int prob_m, int prob_n, 17 | int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, 18 | int m_block, int max_par, int cfg_max_m_blocks); 19 | 20 | } // namespace marlin_moe 21 | -------------------------------------------------------------------------------- /csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu: -------------------------------------------------------------------------------- 1 | #include "marlin_moe_kernel_ku8b128.h" 2 | 3 | namespace marlin_moe { 4 | 5 | // We return bool so we can create these different kernel calls as a sequence 6 | // of if-elseif's. 
7 | bool call_marlin_moe_kernel_ku8b128( 8 | vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, 9 | bool has_act_order, int group_blocks, int num_threads, int blocks, 10 | int max_shared_mem, cudaStream_t stream, const int4* A_ptr, 11 | const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, 12 | const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, 13 | const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, 14 | int expert_idx, int num_experts, int topk, int prob_m, int prob_n, 15 | int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, 16 | int m_block, int max_par, int cfg_max_m_blocks) { 17 | bool has_zp = false; 18 | 19 | if (false) { 20 | } 21 | GPTQ_CALL_IF_MOE(vllm::kU8B128, 16, 4, 256) 22 | GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 8, 256) 23 | GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 4, 128) 24 | GPTQ_CALL_IF_MOE(vllm::kU8B128, 4, 8, 128) 25 | else { 26 | return false; 27 | } 28 | return true; 29 | } 30 | 31 | } // namespace marlin_moe 32 | -------------------------------------------------------------------------------- /csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "marlin_moe_kernel.h" 4 | 5 | namespace marlin_moe { 6 | 7 | bool call_marlin_moe_kernel_ku8b128( 8 | vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, 9 | bool has_act_order, int group_blocks, int num_threads, int blocks, 10 | int max_shared_mem, cudaStream_t stream, const int4* A_ptr, 11 | const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, 12 | const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, 13 | const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, 14 | int expert_idx, int num_experts, int topk, int prob_m, int prob_n, 15 | int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, 16 | int m_block, int max_par, int cfg_max_m_blocks); 17 | 18 | } 19 | -------------------------------------------------------------------------------- /csrc/moe/moe_ops.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices, 6 | torch::Tensor& token_expert_indices, 7 | torch::Tensor& gating_output); 8 | -------------------------------------------------------------------------------- /csrc/moe/torch_bindings.cpp: -------------------------------------------------------------------------------- 1 | #include "core/registration.h" 2 | #include "moe_ops.h" 3 | 4 | TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { 5 | // Apply topk softmax to the gating outputs. 6 | m.def( 7 | "topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! " 8 | "token_expert_indices, Tensor gating_output) -> ()"); 9 | m.impl("topk_softmax", torch::kCUDA, &topk_softmax); 10 | 11 | #ifndef USE_ROCM 12 | m.def( 13 | "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, " 14 | "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! " 15 | "b_zeros, Tensor! g_idx, Tensor! perm, Tensor! 
workspace, " 16 | "__torch__.torch.classes._core_C.ScalarType b_q_type, int size_m, " 17 | "int size_n, int size_k, bool is_k_full, int num_experts, int topk, " 18 | "int moe_block_size, bool replicate_input, bool apply_weights)" 19 | " -> Tensor"); 20 | // conditionally compiled so impl registration is in source file 21 | #endif 22 | } 23 | 24 | REGISTER_EXTENSION(TORCH_EXTENSION_NAME) 25 | -------------------------------------------------------------------------------- /csrc/prepare_inputs/advance_step.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace prepare_inputs { 13 | 14 | static constexpr int max_threads = 256; 15 | static constexpr bool logging = false; 16 | 17 | constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; } 18 | 19 | } // namespace prepare_inputs 20 | -------------------------------------------------------------------------------- /csrc/quantization/cutlass_w8a8/common.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cutlass/cutlass.h" 4 | #include 5 | 6 | /** 7 | * Helper function for checking CUTLASS errors 8 | */ 9 | #define CUTLASS_CHECK(status) \ 10 | { \ 11 | TORCH_CHECK(status == cutlass::Status::kSuccess, \ 12 | cutlassGetStatusString(status)) \ 13 | } 14 | 15 | inline uint32_t next_pow_2(uint32_t const num) { 16 | if (num <= 1) return num; 17 | return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); 18 | } 19 | 20 | inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { 21 | int max_shared_mem_per_block_opt_in = 0; 22 | cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in, 23 | cudaDevAttrMaxSharedMemoryPerBlockOptin, 24 | device); 25 | return max_shared_mem_per_block_opt_in; 26 | } 27 | 28 | -------------------------------------------------------------------------------- /csrc/quantization/gptq/qdq_8.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Copied from https://github.com/turboderp/exllamav2 3 | */ 4 | 5 | #ifndef _qdq_8_cuh 6 | #define _qdq_8_cuh 7 | 8 | #include "qdq_util.cuh" 9 | 10 | namespace vllm { 11 | namespace gptq { 12 | 13 | __forceinline__ __device__ void shuffle_8bit_4(uint32_t* q, int stride) {} 14 | 15 | __forceinline__ __device__ void dequant_8bit_8(const uint32_t q_0, 16 | const uint32_t q_1, 17 | half2 (&dq)[4], int stride, 18 | const uint32_t zero) { 19 | half dqh[8]; 20 | for (int i = 0; i < 4; i++) dqh[i] = dq_ns(exb(q_0, i * 8, 0xff), zero); 21 | for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), zero); 22 | 23 | for (int i = 0; i < 4; i++) 24 | dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); 25 | } 26 | 27 | } // namespace gptq 28 | } // namespace vllm 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /csrc/quantization/marlin/dense/common/base.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Modified by HandH1998 3 | * Modified by Neural Magic 4 | * Copyright (C) Marlin.2024 Elias Frantar 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 
8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | #pragma once 20 | 21 | constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; } 22 | 23 | // Instances of `Vec` are used to organize groups of >>registers<<, as needed 24 | // for instance as inputs to tensor core operations. Consequently, all 25 | // corresponding index accesses must be compile-time constants, which is why we 26 | // extensively use `#pragma unroll` throughout the kernel code to guarantee 27 | // this. 28 | template 29 | struct Vec { 30 | T elems[n]; 31 | __device__ T& operator[](int i) { return elems[i]; } 32 | }; 33 | -------------------------------------------------------------------------------- /csrc/rocm/ops.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums, 6 | torch::Tensor& max_logits, torch::Tensor& tmp_out, 7 | torch::Tensor& query, torch::Tensor& key_cache, 8 | torch::Tensor& value_cache, int64_t num_kv_heads, 9 | double scale, torch::Tensor& block_tables, 10 | torch::Tensor& context_lens, int64_t block_size, 11 | int64_t max_context_len, 12 | const c10::optional& alibi_slopes, 13 | const std::string& kv_cache_dtype, double k_scale, 14 | double v_scale); 15 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # vLLM documents 2 | 3 | ## Build the docs 4 | 5 | ```bash 6 | # Install dependencies. 7 | pip install -r requirements-docs.txt 8 | 9 | # Build the docs. 10 | make clean 11 | make html 12 | ``` 13 | 14 | ## Open the docs with your browser 15 | 16 | ```bash 17 | python -m http.server -d build/html/ 18 | ``` 19 | Launch your browser and open localhost:8000. 
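If port 8000 is already in use (for example by a locally running vLLM server, which defaults to the same port), the same directory can be served on another, arbitrarily chosen port:

```bash
python -m http.server 3000 -d build/html/
```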
20 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | sphinx==6.2.1 2 | sphinx-book-theme==1.0.1 3 | sphinx-copybutton==0.5.2 4 | myst-parser==2.0.0 5 | sphinx-argparse==0.4.0 6 | msgspec 7 | cloudpickle 8 | 9 | # packages to install to build the documentation 10 | pydantic >= 2.8 11 | -f https://download.pytorch.org/whl/cpu 12 | torch 13 | py-cpuinfo 14 | transformers 15 | mistral_common >= 1.3.4 16 | openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args 17 | partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args -------------------------------------------------------------------------------- /docs/source/_static/custom.js: -------------------------------------------------------------------------------- 1 | document.addEventListener("DOMContentLoaded", function () { 2 | var script = document.createElement("script"); 3 | script.type = "module"; 4 | script.id = "runllm-widget-script" 5 | 6 | script.src = "https://widget.runllm.com"; 7 | 8 | script.setAttribute("version", "stable"); 9 | script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget. 10 | script.setAttribute("runllm-name", "vLLM"); 11 | script.setAttribute("runllm-position", "BOTTOM_RIGHT"); 12 | script.setAttribute("runllm-assistant-id", "207"); 13 | 14 | script.async = true; 15 | document.head.appendChild(script); 16 | }); -------------------------------------------------------------------------------- /docs/source/_templates/sections/header.html: -------------------------------------------------------------------------------- 1 | 36 | 37 |
38 | You are viewing the latest developer preview docs. Click here to view docs for the latest stable release. 39 |
40 | -------------------------------------------------------------------------------- /docs/source/assets/dev/dockerfile-stages-dependency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/docs/source/assets/dev/dockerfile-stages-dependency.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/k_vecs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/docs/source/assets/kernel/k_vecs.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/key.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/docs/source/assets/kernel/key.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/logits_vec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/docs/source/assets/kernel/logits_vec.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/q_vecs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/docs/source/assets/kernel/q_vecs.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/query.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/docs/source/assets/kernel/query.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/v_vec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/docs/source/assets/kernel/v_vec.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/value.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/docs/source/assets/kernel/value.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-only-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/docs/source/assets/logos/vllm-logo-only-light.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-text-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/docs/source/assets/logos/vllm-logo-text-dark.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-text-light.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/docs/source/assets/logos/vllm-logo-text-light.png -------------------------------------------------------------------------------- /docs/source/community/sponsors.md: -------------------------------------------------------------------------------- 1 | # Sponsors 2 | 3 | vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support! 4 | 5 | 6 | 7 | 8 | - a16z 9 | - AMD 10 | - Anyscale 11 | - AWS 12 | - Crusoe Cloud 13 | - Databricks 14 | - DeepInfra 15 | - Dropbox 16 | - Google Cloud 17 | - Lambda Lab 18 | - NVIDIA 19 | - Replicate 20 | - Roblox 21 | - RunPod 22 | - Sequoia Capital 23 | - Skywork AI 24 | - Trainy 25 | - UC Berkeley 26 | - UC San Diego 27 | - ZhenFund 28 | 29 | We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM. 30 | -------------------------------------------------------------------------------- /docs/source/dev/engine/async_llm_engine.rst: -------------------------------------------------------------------------------- 1 | AsyncLLMEngine 2 | ================================= 3 | 4 | .. autoclass:: vllm.AsyncLLMEngine 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/source/dev/engine/engine_index.rst: -------------------------------------------------------------------------------- 1 | vLLM Engine 2 | ================================= 3 | 4 | .. automodule:: vllm.engine 5 | .. currentmodule:: vllm.engine 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | :caption: Engines 10 | 11 | llm_engine 12 | async_llm_engine 13 | 14 | -------------------------------------------------------------------------------- /docs/source/dev/engine/llm_engine.rst: -------------------------------------------------------------------------------- 1 | LLMEngine 2 | ================================= 3 | 4 | .. autoclass:: vllm.LLMEngine 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/source/dev/input_processing/input_processing_pipeline.rst: -------------------------------------------------------------------------------- 1 | .. _input_processing_pipeline: 2 | 3 | Input Processing Pipeline 4 | ========================= 5 | 6 | 1. Input data is passed to :class:`~vllm.LLMEngine` (or :class:`~vllm.AsyncLLMEngine`). 7 | 8 | 2. Tokenize the data if necessary. 9 | 10 | 3. Process the inputs using :meth:`INPUT_REGISTRY.process_input `. 11 | 12 | - For example, add placeholder tokens to reserve KV cache for multi-modal embeddings. 13 | 14 | 4. Send the processed inputs to :class:`~vllm.executor.executor_base.ExecutorBase`. 15 | 16 | 5. Distribute the inputs via :class:`~vllm.worker.worker_base.WorkerBase` to :class:`~vllm.worker.model_runner_base.ModelRunnerBase`. 17 | 18 | 6. If the data contains multi-modal data, convert it into keyword arguments using :meth:`MULTIMODAL_REGISTRY.map_input `. 19 | 20 | - For example, convert a :class:`PIL.Image.Image` input to its pixel values for a vision model. 
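As a rough sketch of how steps 3 and 6 are hooked up in practice, a model implementation can register its own callbacks with the two registries via decorators. The class and function names below are hypothetical and the hooks are no-ops; the real registrations live alongside the model definitions (e.g. under vllm/model_executor/models/):

    from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
    from vllm.multimodal import MULTIMODAL_REGISTRY


    def my_input_processor(ctx: InputContext, llm_inputs: LLMInputs) -> LLMInputs:
        # Step 3: e.g. insert placeholder tokens that reserve KV cache space
        # for the multi-modal embeddings (left unchanged in this sketch).
        return llm_inputs


    @MULTIMODAL_REGISTRY.register_image_input_mapper()  # step 6: image -> kwargs
    @INPUT_REGISTRY.register_input_processor(my_input_processor)  # step 3
    class MyMultiModalModel:  # a real model would be an nn.Module subclass
        ...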
21 | -------------------------------------------------------------------------------- /docs/source/dev/input_processing/model_inputs_index.rst: -------------------------------------------------------------------------------- 1 | .. _input_processing: 2 | 3 | Input Processing 4 | ================ 5 | 6 | .. currentmodule:: vllm.inputs 7 | 8 | Each model can override parts of vLLM's :ref:`input processing pipeline ` via 9 | :data:`~vllm.inputs.INPUT_REGISTRY` and :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`. 10 | 11 | Currently, this mechanism is only utilized in :ref:`multi-modal ` models for preprocessing multi-modal input 12 | data in addition to input prompt, but it can be extended to text-only language models when needed. 13 | 14 | Guides 15 | ++++++ 16 | 17 | .. toctree:: 18 | :maxdepth: 1 19 | 20 | input_processing_pipeline 21 | 22 | Module Contents 23 | +++++++++++++++ 24 | 25 | LLM Engine Inputs 26 | ----------------- 27 | 28 | .. autoclass:: vllm.inputs.LLMInputs 29 | :members: 30 | :show-inheritance: 31 | 32 | Registry 33 | -------- 34 | 35 | .. autodata:: vllm.inputs.INPUT_REGISTRY 36 | 37 | .. automodule:: vllm.inputs.registry 38 | :members: 39 | :show-inheritance: 40 | -------------------------------------------------------------------------------- /docs/source/dev/multimodal/adding_multimodal_plugin.rst: -------------------------------------------------------------------------------- 1 | .. _adding_multimodal_plugin: 2 | 3 | Adding a Multimodal Plugin 4 | ========================== 5 | 6 | This document teaches you how to add a new modality to vLLM. 7 | 8 | Each modality in vLLM is represented by a :class:`~vllm.multimodal.MultiModalPlugin` and registered to :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`. 9 | For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to :meth:`~vllm.multimodal.MultiModalRegistry.register_plugin`. 10 | 11 | The remainder of this document details how to define custom :class:`~vllm.multimodal.MultiModalPlugin` s. 12 | 13 | .. note:: 14 | This article is a work in progress. 15 | 16 | .. 17 | TODO: Add more instructions on how to add new plugins once embeddings is in. 18 | -------------------------------------------------------------------------------- /docs/source/dev/offline_inference/llm.rst: -------------------------------------------------------------------------------- 1 | LLM Class 2 | ========= 3 | 4 | .. autoclass:: vllm.LLM 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/source/dev/offline_inference/llm_inputs.rst: -------------------------------------------------------------------------------- 1 | LLM Inputs 2 | ========== 3 | 4 | .. autodata:: vllm.inputs.PromptType 5 | 6 | .. autoclass:: vllm.inputs.TextPrompt 7 | :show-inheritance: 8 | :members: 9 | :member-order: bysource 10 | 11 | .. autoclass:: vllm.inputs.TokensPrompt 12 | :show-inheritance: 13 | :members: 14 | :member-order: bysource 15 | -------------------------------------------------------------------------------- /docs/source/dev/offline_inference/offline_index.rst: -------------------------------------------------------------------------------- 1 | Offline Inference 2 | ================================= 3 | 4 | .. 
toctree:: 5 | :maxdepth: 1 6 | 7 | llm 8 | llm_inputs 9 | -------------------------------------------------------------------------------- /docs/source/dev/sampling_params.rst: -------------------------------------------------------------------------------- 1 | Sampling Parameters 2 | =================== 3 | 4 | .. autoclass:: vllm.SamplingParams 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/getting_started/examples/examples_index.template.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | ================================= 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | :caption: Scripts 7 | 8 | %EXAMPLE_DOCS% 9 | -------------------------------------------------------------------------------- /docs/source/models/engine_args.rst: -------------------------------------------------------------------------------- 1 | .. _engine_args: 2 | 3 | Engine Arguments 4 | ================ 5 | 6 | Below, you can find an explanation of every engine argument for vLLM: 7 | 8 | .. argparse:: 9 | :module: vllm.engine.arg_utils 10 | :func: _engine_args_parser 11 | :prog: vllm serve 12 | :nodefaultconst: 13 | 14 | Async Engine Arguments 15 | ---------------------- 16 | 17 | Below are the additional arguments related to the asynchronous engine: 18 | 19 | .. argparse:: 20 | :module: vllm.engine.arg_utils 21 | :func: _async_engine_args_parser 22 | :prog: vllm serve 23 | :nodefaultconst: -------------------------------------------------------------------------------- /docs/source/performance_benchmark/benchmarks.rst: -------------------------------------------------------------------------------- 1 | .. _benchmarks: 2 | 3 | Benchmark suites of vLLM 4 | ======================== 5 | 6 | 7 | 8 | vLLM contains two sets of benchmarks: 9 | 10 | + **Performance benchmarks**: benchmark vLLM's performance under various workloads at a high frequency (when a pull request (PR for short) of vLLM is being merged). See `vLLM performance dashboard `_ for the latest performance results. 11 | 12 | + **Nightly benchmarks**: compare vLLM's performance against alternatives (tgi, trt-llm, and lmdeploy) when there are major updates of vLLM (e.g., bumping up to a new version). The latest results are available in the `vLLM GitHub README `_. 13 | 14 | 15 | Trigger a benchmark 16 | ------------------- 17 | 18 | The performance benchmarks and nightly benchmarks can be triggered by submitting a PR to vLLM and labeling the PR with `perf-benchmarks` and `nightly-benchmarks`. 19 | 20 | 21 | .. note:: 22 | 23 | Please refer to `vLLM performance benchmark descriptions `_ and `vLLM nightly benchmark descriptions `_ for detailed descriptions of the benchmark environment, workload, and metrics. 24 | -------------------------------------------------------------------------------- /docs/source/quantization/fp8_e5m2_kvcache.rst: -------------------------------------------------------------------------------- 1 | .. _fp8_kv_cache: 2 | 3 | FP8 E5M2 KV Cache 4 | ================== 5 | 6 | The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits. 7 | The FP8 data format retains 2~3 mantissa bits and supports conversion between float/fp16/bfloat16 and fp8. 8 | 9 | Here is an example of how to enable this feature: 10 | 11 | .. code-block:: python 12 | 13 | from vllm import LLM, SamplingParams 14 | # Sample prompts.
15 | prompts = [ 16 | "Hello, my name is", 17 | "The president of the United States is", 18 | "The capital of France is", 19 | "The future of AI is", 20 | ] 21 | # Create a sampling params object. 22 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 23 | # Create an LLM. 24 | llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8") 25 | # Generate texts from the prompts. The output is a list of RequestOutput objects 26 | # that contain the prompt, generated text, and other information. 27 | outputs = llm.generate(prompts, sampling_params) 28 | # Print the outputs. 29 | for output in outputs: 30 | prompt = output.prompt 31 | generated_text = output.outputs[0].text 32 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 33 | 34 | 35 | -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_bentoml.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_bentoml: 2 | 3 | Deploying with BentoML 4 | ====================== 5 | 6 | `BentoML `_ allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes. 7 | 8 | For details, see the tutorial `vLLM inference in the BentoML documentation `_. -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_kserve.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_kserve: 2 | 3 | Deploying with KServe 4 | ============================ 5 | 6 | vLLM can be deployed with `KServe `_ on Kubernetes for highly scalable distributed model serving. 7 | 8 | Please see `this guide `_ for more details on using vLLM with KServe. 9 | -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_lws.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_lws: 2 | 3 | Deploying with LWS 4 | ============================ 5 | 6 | LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads. 7 | A major use case is for multi-host/multi-node distributed inference. 8 | 9 | vLLM can be deployed with `LWS `_ on Kubernetes for distributed model serving. 10 | 11 | Please see `this guide `_ for more details on 12 | deploying vLLM on Kubernetes using LWS. 13 | -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_triton.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_triton: 2 | 3 | Deploying with NVIDIA Triton 4 | ============================ 5 | 6 | The `Triton Inference Server `_ hosts a tutorial demonstrating how to quickly deploy a simple `facebook/opt-125m `_ model using vLLM. Please see `Deploying a vLLM model in Triton `_ for more details. 7 | -------------------------------------------------------------------------------- /docs/source/serving/env_vars.rst: -------------------------------------------------------------------------------- 1 | Environment Variables 2 | ======================== 3 | 4 | vLLM uses the following environment variables to configure the system: 5 | 6 | ..
warning:: 7 | Please note that ``VLLM_PORT`` and ``VLLM_HOST_IP`` set the port and IP for vLLM's **internal usage**. They are not the port and IP for the API server. If you use ``--host $VLLM_HOST_IP`` and ``--port $VLLM_PORT`` to start the API server, it will not work. 8 | 9 | All environment variables used by vLLM are prefixed with ``VLLM_``. **Special care should be taken for Kubernetes users**: please do not name the service ``vllm``, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because `Kubernetes sets environment variables for each service with the capitalized service name as the prefix `_. 10 | 11 | .. literalinclude:: ../../../vllm/envs.py 12 | :language: python 13 | :start-after: begin-env-vars-definition 14 | :end-before: end-env-vars-definition 15 | -------------------------------------------------------------------------------- /docs/source/serving/integrations.rst: -------------------------------------------------------------------------------- 1 | Integrations 2 | ------------ 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | 7 | run_on_sky 8 | deploying_with_kserve 9 | deploying_with_triton 10 | deploying_with_bentoml 11 | deploying_with_cerebrium 12 | deploying_with_lws 13 | deploying_with_dstack 14 | serving_with_langchain 15 | serving_with_llamaindex 16 | -------------------------------------------------------------------------------- /docs/source/serving/metrics.rst: -------------------------------------------------------------------------------- 1 | Production Metrics 2 | ================== 3 | 4 | vLLM exposes a number of metrics that can be used to monitor the health of the 5 | system. These metrics are exposed via the `/metrics` endpoint on the vLLM 6 | OpenAI-compatible API server. 7 | 8 | The following metrics are exposed: 9 | 10 | .. literalinclude:: ../../../vllm/engine/metrics.py 11 | :language: python 12 | :start-after: begin-metrics-definitions 13 | :end-before: end-metrics-definitions 14 | -------------------------------------------------------------------------------- /docs/source/serving/serving_with_langchain.rst: -------------------------------------------------------------------------------- 1 | .. _run_on_langchain: 2 | 3 | Serving with Langchain 4 | ============================ 5 | 6 | vLLM is also available via `Langchain `_ . 7 | 8 | To install langchain, run 9 | 10 | .. code-block:: console 11 | 12 | $ pip install langchain langchain_community -q 13 | 14 | To run inference on a single or multiple GPUs, use the ``VLLM`` class from ``langchain``. 15 | 16 | .. code-block:: python 17 | 18 | from langchain_community.llms import VLLM 19 | 20 | llm = VLLM(model="mosaicml/mpt-7b", 21 | trust_remote_code=True, # mandatory for hf models 22 | max_new_tokens=128, 23 | top_k=10, 24 | top_p=0.95, 25 | temperature=0.8, 26 | # tensor_parallel_size=... # for distributed inference 27 | ) 28 | 29 | print(llm("What is the capital of France ?")) 30 | 31 | Please refer to this `Tutorial `_ for more details. 32 | -------------------------------------------------------------------------------- /docs/source/serving/serving_with_llamaindex.rst: -------------------------------------------------------------------------------- 1 | .. _run_on_llamaindex: 2 | 3 | Serving with llama_index 4 | ============================ 5 | 6 | vLLM is also available via `llama_index `_ . 7 | 8 | To install llamaindex, run 9 | 10 | ..
code-block:: console 11 | 12 | $ pip install llama-index-llms-vllm -q 13 | 14 | To run inference on a single or multiple GPUs, use the ``Vllm`` class from ``llamaindex``. 15 | 16 | .. code-block:: python 17 | 18 | from llama_index.llms.vllm import Vllm 19 | 20 | llm = Vllm( 21 | model="microsoft/Orca-2-7b", 22 | tensor_parallel_size=4, 23 | max_new_tokens=100, 24 | vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5}, 25 | ) 26 | 27 | Please refer to this `Tutorial `_ for more details. 28 | -------------------------------------------------------------------------------- /docs/source/serving/tensorizer.rst: -------------------------------------------------------------------------------- 1 | .. _tensorizer: 2 | 3 | Loading Models with CoreWeave's Tensorizer 4 | ========================================== 5 | vLLM supports loading models with `CoreWeave's Tensorizer `_. 6 | vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or an S3 endpoint can be deserialized 7 | extremely quickly at runtime, directly to the GPU, resulting in significantly 8 | shorter Pod startup times and reduced CPU memory usage. Tensor encryption is also supported. 9 | 10 | For more information on CoreWeave's Tensorizer, please refer to 11 | `CoreWeave's Tensorizer documentation `_. For more information on serializing a vLLM model, as well as a general usage guide to using Tensorizer with vLLM, see 12 | the `vLLM example script `_. -------------------------------------------------------------------------------- /examples/cpu_offload.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | # Sample prompts. 4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | # Create a sampling params object. 11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 12 | 13 | # Create an LLM. 14 | llm = LLM(model="meta-llama/Llama-2-13b-chat-hf", cpu_offload_gb=10) 15 | # Generate texts from the prompts. The output is a list of RequestOutput objects 16 | # that contain the prompt, generated text, and other information. 17 | outputs = llm.generate(prompts, sampling_params) 18 | # Print the outputs. 19 | for output in outputs: 20 | prompt = output.prompt 21 | generated_text = output.outputs[0].text 22 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 23 | -------------------------------------------------------------------------------- /examples/offline_inference.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | # Sample prompts. 4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | # Create a sampling params object. 11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 12 | 13 | # Create an LLM. 14 | llm = LLM(model="facebook/opt-125m") 15 | # Generate texts from the prompts. The output is a list of RequestOutput objects 16 | # that contain the prompt, generated text, and other information. 17 | outputs = llm.generate(prompts, sampling_params) 18 | # Print the outputs.
19 | for output in outputs: 20 | prompt = output.prompt 21 | generated_text = output.outputs[0].text 22 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 23 | -------------------------------------------------------------------------------- /examples/offline_inference_arctic.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | # Sample prompts. 4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | # Create a sampling params object. 11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 12 | 13 | # Create an LLM. 14 | llm = LLM(model="snowflake/snowflake-arctic-instruct", 15 | quantization="deepspeedfp", 16 | tensor_parallel_size=8, 17 | trust_remote_code=True) 18 | # Generate texts from the prompts. The output is a list of RequestOutput objects 19 | # that contain the prompt, generated text, and other information. 20 | 21 | outputs = llm.generate(prompts, sampling_params) 22 | # Print the outputs. 23 | for output in outputs: 24 | prompt = output.prompt 25 | generated_text = output.outputs[0].text 26 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 27 | -------------------------------------------------------------------------------- /examples/offline_inference_embedding.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM 2 | 3 | # Sample prompts. 4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | 11 | # Create an LLM. 12 | model = LLM(model="intfloat/e5-mistral-7b-instruct", enforce_eager=True) 13 | # Generate embedding. The output is a list of EmbeddingRequestOutputs. 14 | outputs = model.encode(prompts) 15 | # Print the outputs. 16 | for output in outputs: 17 | print(output.outputs.embedding) # list of 4096 floats 18 | -------------------------------------------------------------------------------- /examples/offline_inference_tpu.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | prompts = [ 4 | "A robot may not injure a human being", 5 | "It is only with the heart that one can see rightly;", 6 | "The greatest glory in living lies not in never falling,", 7 | ] 8 | answers = [ 9 | " or, through inaction, allow a human being to come to harm.", 10 | " what is essential is invisible to the eye.", 11 | " but in rising every time we fall.", 12 | ] 13 | N = 1 14 | # Currently, top-p sampling is disabled. `top_p` should be 1.0. 15 | sampling_params = SamplingParams(temperature=0.7, 16 | top_p=1.0, 17 | n=N, 18 | max_tokens=16) 19 | 20 | # Set `enforce_eager=True` to avoid ahead-of-time compilation. 21 | # In real workloads, `enforce_eager` should be `False`.
22 | llm = LLM(model="google/gemma-2b", enforce_eager=True) 23 | outputs = llm.generate(prompts, sampling_params) 24 | for output, answer in zip(outputs, answers): 25 | prompt = output.prompt 26 | generated_text = output.outputs[0].text 27 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 28 | assert generated_text.startswith(answer) 29 | -------------------------------------------------------------------------------- /examples/offline_inference_with_profiler.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from vllm import LLM, SamplingParams 4 | 5 | # enable torch profiler, can also be set on cmd line 6 | os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile" 7 | 8 | # Sample prompts. 9 | prompts = [ 10 | "Hello, my name is", 11 | "The president of the United States is", 12 | "The capital of France is", 13 | "The future of AI is", 14 | ] 15 | # Create a sampling params object. 16 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 17 | 18 | # Create an LLM. 19 | llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1) 20 | 21 | llm.start_profile() 22 | 23 | # Generate texts from the prompts. The output is a list of RequestOutput objects 24 | # that contain the prompt, generated text, and other information. 25 | outputs = llm.generate(prompts, sampling_params) 26 | 27 | llm.stop_profile() 28 | 29 | # Print the outputs. 30 | for output in outputs: 31 | prompt = output.prompt 32 | generated_text = output.outputs[0].text 33 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 34 | -------------------------------------------------------------------------------- /examples/openai_chat_completion_client.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 4 | openai_api_key = "EMPTY" 5 | openai_api_base = "http://localhost:8000/v1" 6 | 7 | client = OpenAI( 8 | # defaults to os.environ.get("OPENAI_API_KEY") 9 | api_key=openai_api_key, 10 | base_url=openai_api_base, 11 | ) 12 | 13 | models = client.models.list() 14 | model = models.data[0].id 15 | 16 | chat_completion = client.chat.completions.create( 17 | messages=[{ 18 | "role": "system", 19 | "content": "You are a helpful assistant." 20 | }, { 21 | "role": "user", 22 | "content": "Who won the world series in 2020?" 23 | }, { 24 | "role": 25 | "assistant", 26 | "content": 27 | "The Los Angeles Dodgers won the World Series in 2020." 28 | }, { 29 | "role": "user", 30 | "content": "Where was it played?" 31 | }], 32 | model=model, 33 | ) 34 | 35 | print("Chat completion results:") 36 | print(chat_completion) 37 | -------------------------------------------------------------------------------- /examples/openai_completion_client.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 
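# Assumes a vLLM OpenAI-compatible server is already listening on localhost:8000,
# e.g. one started with `vllm serve <model>`; unless the server was launched with
# an --api-key, it does not check the key, so the "EMPTY" placeholder below works.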
4 | openai_api_key = "EMPTY" 5 | openai_api_base = "http://localhost:8000/v1" 6 | 7 | client = OpenAI( 8 | # defaults to os.environ.get("OPENAI_API_KEY") 9 | api_key=openai_api_key, 10 | base_url=openai_api_base, 11 | ) 12 | 13 | models = client.models.list() 14 | model = models.data[0].id 15 | 16 | # Completion API 17 | stream = False 18 | completion = client.completions.create( 19 | model=model, 20 | prompt="A robot may not injure a human being", 21 | echo=False, 22 | n=2, 23 | stream=stream, 24 | logprobs=3) 25 | 26 | print("Completion results:") 27 | if stream: 28 | for c in completion: 29 | print(c) 30 | else: 31 | print(completion) 32 | -------------------------------------------------------------------------------- /examples/openai_embedding_client.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 4 | openai_api_key = "EMPTY" 5 | openai_api_base = "http://localhost:8000/v1" 6 | 7 | client = OpenAI( 8 | # defaults to os.environ.get("OPENAI_API_KEY") 9 | api_key=openai_api_key, 10 | base_url=openai_api_base, 11 | ) 12 | 13 | models = client.models.list() 14 | model = models.data[0].id 15 | 16 | responses = client.embeddings.create( 17 | input=[ 18 | "Hello my name is", 19 | "The best thing about vLLM is that it supports many different models" 20 | ], 21 | model=model, 22 | ) 23 | 24 | for data in responses.data: 25 | print(data.embedding) # list of float of len 4096 26 | -------------------------------------------------------------------------------- /examples/openai_example_batch.jsonl: -------------------------------------------------------------------------------- 1 | {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} 2 | {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} 3 | -------------------------------------------------------------------------------- /examples/production_monitoring/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # docker-compose.yaml 2 | version: "3" 3 | 4 | services: 5 | prometheus: 6 | image: prom/prometheus:latest 7 | extra_hosts: 8 | - "host.docker.internal:host-gateway" # allow a direct connection from container to the local machine 9 | ports: 10 | - "9090:9090" # the default port used by Prometheus 11 | volumes: 12 | - ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml # mount Prometheus config file 13 | 14 | grafana: 15 | image: grafana/grafana:latest 16 | depends_on: 17 | - prometheus 18 | ports: 19 | - "3000:3000" # the default port used by Grafana 20 | -------------------------------------------------------------------------------- /examples/production_monitoring/prometheus.yaml: -------------------------------------------------------------------------------- 1 | # prometheus.yaml 2 | global: 3 | scrape_interval: 5s 4 | evaluation_interval: 30s 5 | 6 | scrape_configs: 7 | - job_name: vllm 8 | static_configs: 9 | - targets: 10 | - 'host.docker.internal:8000' 11 | 
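The Prometheus job above simply scrapes the `/metrics` endpoint exposed by the vLLM OpenAI-compatible server (see the Production Metrics section earlier). As a quick sanity check outside of Prometheus, the same endpoint can be fetched directly; this is a minimal sketch that assumes a server is already running on localhost:8000, the same target configured above:

    import requests

    # Fetch the Prometheus text-format metrics from the running vLLM server.
    resp = requests.get("http://localhost:8000/metrics", timeout=5)
    resp.raise_for_status()

    # vLLM's own counters and gauges are prefixed with "vllm:".
    for line in resp.text.splitlines():
        if line.startswith("vllm:"):
            print(line)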
-------------------------------------------------------------------------------- /examples/template_alpaca.jinja: -------------------------------------------------------------------------------- 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 2 | 3 | {% for message in messages %} 4 | {% if message['role'] == 'user' %} 5 | ### Instruction: 6 | {{ message['content']|trim -}} 7 | {% if not loop.last %} 8 | 9 | 10 | {% endif %} 11 | {% elif message['role'] == 'assistant' %} 12 | ### Response: 13 | {{ message['content']|trim -}} 14 | {% if not loop.last %} 15 | 16 | 17 | {% endif %} 18 | {% elif message['role'] == 'user_context' %} 19 | ### Input: 20 | {{ message['content']|trim -}} 21 | {% if not loop.last %} 22 | 23 | 24 | {% endif %} 25 | {% endif %} 26 | {% endfor %} 27 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} 28 | ### Response: 29 | {% endif %} -------------------------------------------------------------------------------- /examples/template_baichuan.jinja: -------------------------------------------------------------------------------- 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 2 | 3 | {%- for message in messages -%} 4 | {%- if message['role'] == 'user' -%} 5 | {{- '' + message['content'] -}} 6 | {%- elif message['role'] == 'assistant' -%} 7 | {{- '' + message['content'] -}} 8 | {%- endif -%} 9 | {%- endfor -%} 10 | 11 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 12 | {{- '' -}} 13 | {% endif %} -------------------------------------------------------------------------------- /examples/template_blip2.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {%- if message['role'] == 'user' -%} 3 | {{- 'Question: ' + message['content'] + ' ' -}} 4 | {%- elif message['role'] == 'assistant' -%} 5 | {{- 'Answer: ' + message['content'] + ' ' -}} 6 | {%- endif -%} 7 | {%- endfor -%} 8 | 9 | {%- if add_generation_prompt -%} 10 | {{- 'Answer:' -}} 11 | {% endif %} 12 | -------------------------------------------------------------------------------- /examples/template_chatglm.jinja: -------------------------------------------------------------------------------- 1 | {%- set counter = namespace(index=0) -%} 2 | {%- for message in messages -%} 3 | {%- if message['role'] == 'user' -%} 4 | {{- '[Round ' + counter.index|string + ']\n问:' + message['content'] -}} 5 | {%- set counter.index = counter.index + 1 -%} 6 | {%- endif -%} 7 | {%- if message['role'] == 'assistant' -%} 8 | {{- '\n答:' + message['content'] -}} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n' -}} 11 | {%- endif -%} 12 | {%- endif -%} 13 | {%- endfor -%} 14 | 15 | 16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 17 | {{- '\n答:' -}} 18 | {%- endif -%} -------------------------------------------------------------------------------- /examples/template_chatglm2.jinja: -------------------------------------------------------------------------------- 1 | {%- set counter = namespace(index=1) -%} 2 | {%- for message in messages -%} 3 | {%- if message['role'] == 'user' -%} 4 | {{- '[Round ' + counter.index|string + ']\n\n问:' + message['content'] -}} 5 | {%- set counter.index = counter.index + 1 -%} 6 | {%- endif -%} 7 | {%- if message['role'] == 
'assistant' -%} 8 | {{- '\n\n答:' + message['content'] -}} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n\n' -}} 11 | {%- endif -%} 12 | {%- endif -%} 13 | {%- endfor -%} 14 | 15 | 16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 17 | {{- '\n\n答:' -}} 18 | {%- endif -%} -------------------------------------------------------------------------------- /examples/template_chatml.jinja: -------------------------------------------------------------------------------- 1 | {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} 2 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} -------------------------------------------------------------------------------- /examples/template_falcon.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {%- if message['role'] == 'user' -%} 3 | {{- 'User: ' + message['content'] -}} 4 | {%- elif message['role'] == 'assistant' -%} 5 | {{- 'Assistant: ' + message['content'] -}} 6 | {%- endif -%} 7 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 8 | {{- '\n' -}} 9 | {%- endif -%} 10 | {%- endfor -%} 11 | 12 | 13 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 14 | {{- 'Assistant:' -}} 15 | {% endif %} -------------------------------------------------------------------------------- /examples/template_falcon_180b.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {%- if message['role'] == 'system' -%} 3 | {{- 'System: ' + message['content'] -}} 4 | {%- elif message['role'] == 'user' -%} 5 | {{- 'User: ' + message['content'] -}} 6 | {%- elif message['role'] == 'assistant' -%} 7 | {{- 'Falcon: ' + message['content'] -}} 8 | {%- endif -%} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n' -}} 11 | {%- endif -%} 12 | {%- endfor -%} 13 | 14 | 15 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 16 | {{- 'Falcon:' -}} 17 | {% endif %} -------------------------------------------------------------------------------- /examples/template_inkbot.jinja: -------------------------------------------------------------------------------- 1 | <#meta#> 2 | - Date: {{ (messages|selectattr('role', 'equalto', 'meta-current_date')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-current_date')|list) else '' }} 3 | - Task: {{ (messages|selectattr('role', 'equalto', 'meta-task_name')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-task_name')|list) else '' }} 4 | <#system#> 5 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 6 | <#chat#> 7 | {% for message in messages %} 8 | {% if message['role'] == 'user' %} 9 | <#user#> 10 | {{ message['content']|trim -}} 11 | {% if not loop.last %} 12 | 13 | {% endif %} 14 | {% elif message['role'] == 'assistant' %} 15 | <#bot#> 16 | {{ message['content']|trim -}} 17 | {% if not loop.last %} 18 | 19 | {% endif %} 20 | {% elif message['role'] == 'user_context' %} 21 | <#user_context#> 22 | {{ message['content']|trim -}} 23 | {% if not loop.last %} 24 | 25 | {% endif %} 26 | {% endif %} 27 | {% 
endfor %} 28 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} 29 | <#bot#> 30 | {% endif %} -------------------------------------------------------------------------------- /examples/template_llava.jinja: -------------------------------------------------------------------------------- 1 | {%- if messages[0]['role'] == 'system' -%} 2 | {%- set system_message = messages[0]['content'] -%} 3 | {%- set messages = messages[1:] -%} 4 | {%- else -%} 5 | {% set system_message = '' -%} 6 | {%- endif -%} 7 | 8 | {{ bos_token + system_message }} 9 | {%- for message in messages -%} 10 | {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%} 11 | {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} 12 | {%- endif -%} 13 | 14 | {%- if message['role'] == 'user' -%} 15 | {{ 'USER: ' + message['content'] + '\n' }} 16 | {%- elif message['role'] == 'assistant' -%} 17 | {{ 'ASSISTANT: ' + message['content'] + eos_token + '\n' }} 18 | {%- endif -%} 19 | {%- endfor -%} 20 | 21 | {%- if add_generation_prompt -%} 22 | {{ 'ASSISTANT:' }} 23 | {% endif %} 24 | -------------------------------------------------------------------------------- /find_cuda_init.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import traceback 3 | from typing import Callable 4 | from unittest.mock import patch 5 | 6 | 7 | def find_cuda_init(fn: Callable[[], object]) -> None: 8 | """ 9 | Helper function to debug CUDA re-initialization errors. 10 | 11 | If `fn` initializes CUDA, prints the stack trace of how this happens. 12 | """ 13 | from torch.cuda import _lazy_init 14 | 15 | stack = None 16 | 17 | def wrapper(): 18 | nonlocal stack 19 | stack = traceback.extract_stack() 20 | return _lazy_init() 21 | 22 | with patch("torch.cuda._lazy_init", wrapper): 23 | fn() 24 | 25 | if stack is not None: 26 | print("==== CUDA Initialized ====") 27 | print("".join(traceback.format_list(stack)).strip()) 28 | print("==========================") 29 | 30 | 31 | if __name__ == "__main__": 32 | find_cuda_init( 33 | lambda: importlib.import_module("vllm.model_executor.models.llava")) 34 | -------------------------------------------------------------------------------- /requirements-build.txt: -------------------------------------------------------------------------------- 1 | # Should be mirrored in pyproject.toml 2 | cmake>=3.26 3 | ninja 4 | packaging 5 | setuptools>=61 6 | setuptools-scm>=8 7 | torch==2.4.0 8 | wheel 9 | jinja2 10 | -------------------------------------------------------------------------------- /requirements-cpu.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for x86_64 CPUs 5 | torch == 2.4.0+cpu; platform_machine != "ppc64le" 6 | torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch 7 | -------------------------------------------------------------------------------- /requirements-cuda.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for NVIDIA GPUs 5 | ray >= 2.9 6 | nvidia-ml-py # for pynvml package 7 | torch == 2.4.0 8 | # These must be updated alongside torch 9 | torchvision == 0.19 # Required for phi3v processor. 
See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version 10 | xformers == 0.0.27.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.4.0 11 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements-lint.txt 2 | -r requirements-test.txt 3 | 4 | # Avoid adding requirements directly to this file. 5 | # Instead, modify the two files referenced above. 6 | -------------------------------------------------------------------------------- /requirements-lint.txt: -------------------------------------------------------------------------------- 1 | # formatting 2 | yapf==0.32.0 3 | toml==0.10.2 4 | tomli==2.0.1 5 | ruff==0.6.5 6 | codespell==2.3.0 7 | isort==5.13.2 8 | clang-format==18.1.5 9 | 10 | # type checking 11 | mypy==1.11.1 12 | types-PyYAML 13 | types-requests 14 | types-setuptools 15 | -------------------------------------------------------------------------------- /requirements-neuron.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for Neuron devices 5 | transformers-neuronx >= 0.12.0 6 | torch-neuronx >= 2.1.2 7 | neuronx-cc 8 | -------------------------------------------------------------------------------- /requirements-openvino.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | torch == 2.4.0 # should be aligned with "common" vLLM torch version 5 | openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention 6 | 7 | optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version 8 | optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git@main # latest optimum-intel is used to support latest transformers version 9 | -------------------------------------------------------------------------------- /requirements-rocm.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for AMD GPUs 5 | awscli 6 | boto3 7 | botocore 8 | ray >= 2.10.0 9 | peft 10 | pytest-asyncio 11 | tensorizer>=2.9.0 -------------------------------------------------------------------------------- /requirements-test.txt: -------------------------------------------------------------------------------- 1 | # testing 2 | pytest 3 | tensorizer>=2.9.0 4 | pytest-forked 5 | pytest-asyncio 6 | pytest-rerunfailures 7 | pytest-shard 8 | 9 | # testing utils 10 | awscli 11 | einops # required for MPT, qwen-vl and Mamba 12 | httpx 13 | librosa # required for audio tests 14 | opencv-python # required for video tests 15 | peft 16 | requests 17 | ray[adag]==2.35 18 | sentence-transformers # required for embedding 19 | soundfile # required for audio test 20 | compressed-tensors==0.4.0 # required for compressed-tensors 21 | timm # required for internvl test 22 | transformers_stream_generator # required for qwen-vl test 23 | matplotlib # required for qwen-vl test 24 | datamodel_code_generator # required for minicpm3 test 25 | lm-eval[api]==0.4.4 # required for model evaluation test 26 | 27 | # TODO: Add this after fully implementing llava(mantis) 28 | # git+https://github.com/TIGER-AI-Lab/Mantis.git # 
required for llava(mantis) test 29 | 30 | # Benchmarking 31 | aiohttp 32 | 33 | # quantization 34 | bitsandbytes>=0.44.0 35 | buildkite-test-collector==0.1.8 36 | -------------------------------------------------------------------------------- /requirements-tpu.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for TPU 5 | # Currently, the TPU backend uses a nightly version of PyTorch XLA. 6 | # You can install the dependencies in Dockerfile.tpu. 7 | ray[default] 8 | -------------------------------------------------------------------------------- /requirements-xpu.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | ray >= 2.9 5 | cmake>=3.26 6 | ninja 7 | packaging 8 | setuptools-scm>=8 9 | wheel 10 | jinja2 11 | # Following pkgs retrieved from https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ 12 | torch == 2.3.1+cxx11.abi 13 | intel-extension-for-pytorch == 2.3.110+xpu 14 | oneccl_bind_pt == 2.3.100+xpu 15 | 16 | triton-xpu == 3.0.0b2 17 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/__init__.py -------------------------------------------------------------------------------- /tests/async_engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/async_engine/__init__.py -------------------------------------------------------------------------------- /tests/basic_correctness/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/basic_correctness/__init__.py -------------------------------------------------------------------------------- /tests/basic_correctness/test_cpu_offload.py: -------------------------------------------------------------------------------- 1 | from ..utils import compare_two_settings 2 | 3 | 4 | def test_cpu_offload(): 5 | compare_two_settings("meta-llama/Llama-2-7b-hf", [], 6 | ["--cpu-offload-gb", "4"]) 7 | -------------------------------------------------------------------------------- /tests/compile/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/compile/__init__.py -------------------------------------------------------------------------------- /tests/compile/test_full_graph.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.compilation.levels import CompilationLevel 4 | 5 | from ..utils import fork_new_process_for_each_test 6 | from .utils import TEST_MODELS, check_full_graph_support 7 | 8 | 9 | @pytest.mark.parametrize("model_info", TEST_MODELS) 10 | @pytest.mark.parametrize( 11 | "optimization_level", 12 | [CompilationLevel.DYNAMO_ONCE, CompilationLevel.INDUCTOR]) 13 | @fork_new_process_for_each_test 14 | def test_full_graph(model_info, optimization_level): 15 | model = model_info[0] 16 | model_kwargs = 
model_info[1] 17 | check_full_graph_support(model, 18 | model_kwargs, 19 | optimization_level, 20 | tp_size=1) 21 | -------------------------------------------------------------------------------- /tests/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/core/__init__.py -------------------------------------------------------------------------------- /tests/core/block/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/core/block/__init__.py -------------------------------------------------------------------------------- /tests/core/block/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture() 5 | def should_do_global_cleanup_after_test() -> bool: 6 | """Disable the global cleanup fixture for tests in this directory. This 7 | provides a ~10x speedup for unit tests that don't load a model to GPU. 8 | 9 | This requires that tests in this directory clean up after themselves if they 10 | use the GPU. 11 | """ 12 | return False 13 | -------------------------------------------------------------------------------- /tests/core/block/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/core/block/e2e/__init__.py -------------------------------------------------------------------------------- /tests/core/test_serialization.py: -------------------------------------------------------------------------------- 1 | import msgspec 2 | 3 | from vllm.executor.msgspec_utils import decode_hook, encode_hook 4 | from vllm.sequence import ExecuteModelRequest 5 | 6 | from ..spec_decode.utils import create_batch 7 | 8 | 9 | def test_msgspec_serialization(): 10 | num_lookahead_slots = 4 11 | seq_group_metadata_list, _, _ = create_batch(16, num_lookahead_slots) 12 | execute_model_req = ExecuteModelRequest( 13 | seq_group_metadata_list=seq_group_metadata_list, 14 | num_lookahead_slots=num_lookahead_slots, 15 | running_queue_size=4) 16 | 17 | encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook) 18 | decoder = msgspec.msgpack.Decoder(ExecuteModelRequest, 19 | dec_hook=decode_hook) 20 | req = decoder.decode(encoder.encode(execute_model_req)) 21 | expected = execute_model_req.seq_group_metadata_list 22 | actual = req.seq_group_metadata_list 23 | assert (len(expected) == len(actual)) 24 | expected = expected[0] 25 | actual = actual[0] 26 | 27 | assert expected.block_tables == actual.block_tables 28 | assert expected.is_prompt == actual.is_prompt 29 | assert expected.request_id == actual.request_id 30 | assert (expected.seq_data[0].prompt_token_ids == 31 | actual.seq_data[0].prompt_token_ids) 32 | assert (expected.seq_data[0].output_token_ids == 33 | actual.seq_data[0].output_token_ids) 34 | -------------------------------------------------------------------------------- /tests/data/test_config.yaml: -------------------------------------------------------------------------------- 1 | port: 12312 2 | served_model_name: mymodel 3 | tensor_parallel_size: 2 4 | -------------------------------------------------------------------------------- /tests/distributed/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/distributed/__init__.py -------------------------------------------------------------------------------- /tests/distributed/test_distributed_oot.py: -------------------------------------------------------------------------------- 1 | from ..entrypoints.openai.test_oot_registration import ( 2 | run_and_test_dummy_opt_api_server) 3 | 4 | 5 | def test_distributed_oot(dummy_opt_path: str): 6 | run_and_test_dummy_opt_api_server(dummy_opt_path, tp=2) 7 | -------------------------------------------------------------------------------- /tests/distributed/test_pp_cudagraph.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from ..utils import compare_two_settings, fork_new_process_for_each_test 6 | 7 | 8 | @pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [ 9 | (2, "JackFram/llama-160m"), 10 | ]) 11 | @pytest.mark.parametrize("ATTN_BACKEND", [ 12 | "FLASH_ATTN", 13 | "FLASHINFER", 14 | ]) 15 | @fork_new_process_for_each_test 16 | def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND): 17 | cudagraph_args = [ 18 | # use half precision for speed and memory savings in CI environment 19 | "--dtype", 20 | "float16", 21 | "--pipeline-parallel-size", 22 | str(PP_SIZE), 23 | "--distributed-executor-backend", 24 | "mp", 25 | ] 26 | os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND 27 | 28 | eager_args = cudagraph_args + ["--enforce-eager"] 29 | 30 | compare_two_settings(MODEL_NAME, eager_args, cudagraph_args) 31 | -------------------------------------------------------------------------------- /tests/distributed/test_same_node.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch.distributed as dist 4 | 5 | from vllm.distributed.parallel_state import in_the_same_node_as 6 | 7 | if __name__ == "__main__": 8 | dist.init_process_group(backend="gloo") 9 | test_result = all(in_the_same_node_as(dist.group.WORLD, source_rank=0)) 10 | 11 | expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1" 12 | assert test_result == expected, f"Expected {expected}, got {test_result}" 13 | print("Same node test passed!") 14 | -------------------------------------------------------------------------------- /tests/distributed/test_utils.py: -------------------------------------------------------------------------------- 1 | import ray 2 | 3 | import vllm.envs as envs 4 | from vllm.utils import (cuda_device_count_stateless, 5 | update_environment_variables) 6 | 7 | 8 | @ray.remote 9 | class _CUDADeviceCountStatelessTestActor: 10 | 11 | def get_count(self): 12 | return cuda_device_count_stateless() 13 | 14 | def set_cuda_visible_devices(self, cuda_visible_devices: str): 15 | update_environment_variables( 16 | {"CUDA_VISIBLE_DEVICES": cuda_visible_devices}) 17 | 18 | def get_cuda_visible_devices(self): 19 | return envs.CUDA_VISIBLE_DEVICES 20 | 21 | 22 | def test_cuda_device_count_stateless(): 23 | """Test that cuda_device_count_stateless changes return value if 24 | CUDA_VISIBLE_DEVICES is changed.""" 25 | actor = _CUDADeviceCountStatelessTestActor.options( # type: ignore 26 | num_gpus=2).remote() 27 | assert sorted(ray.get( 28 | actor.get_cuda_visible_devices.remote()).split(",")) == ["0", "1"] 29 | assert ray.get(actor.get_count.remote()) == 2 30 | ray.get(actor.set_cuda_visible_devices.remote("0")) 31 
| assert ray.get(actor.get_count.remote()) == 1 32 | ray.get(actor.set_cuda_visible_devices.remote("")) 33 | assert ray.get(actor.get_count.remote()) == 0 34 | -------------------------------------------------------------------------------- /tests/encoder_decoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/encoder_decoder/__init__.py -------------------------------------------------------------------------------- /tests/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/engine/__init__.py -------------------------------------------------------------------------------- /tests/engine/output_processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/engine/output_processor/__init__.py -------------------------------------------------------------------------------- /tests/engine/test_skip_tokenizer_init.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.entrypoints.llm import LLM 4 | from vllm.sampling_params import SamplingParams 5 | 6 | 7 | @pytest.mark.parametrize("model", ["facebook/opt-125m"]) 8 | def test_skip_tokenizer_initialization(model: str): 9 | # This test checks if the flag skip_tokenizer_init skips the initialization 10 | # of tokenizer and detokenizer. The generated output is expected to contain 11 | # token ids. 12 | llm = LLM(model=model, skip_tokenizer_init=True) 13 | sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) 14 | 15 | with pytest.raises(ValueError, match="cannot pass text prompts when"): 16 | llm.generate("abc", sampling_params) 17 | 18 | outputs = llm.generate({"prompt_token_ids": [1, 2, 3]}, 19 | sampling_params=sampling_params) 20 | assert len(outputs) > 0 21 | completions = outputs[0].outputs 22 | assert len(completions) > 0 23 | assert completions[0].text == "" 24 | assert completions[0].token_ids 25 | -------------------------------------------------------------------------------- /tests/entrypoints/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/entrypoints/__init__.py -------------------------------------------------------------------------------- /tests/entrypoints/llm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/entrypoints/llm/__init__.py -------------------------------------------------------------------------------- /tests/entrypoints/llm/test_prompt_validation.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm import LLM 4 | 5 | 6 | def test_empty_prompt(): 7 | llm = LLM(model="gpt2") 8 | with pytest.raises(ValueError, match='Prompt cannot be empty'): 9 | llm.generate([""]) 10 | -------------------------------------------------------------------------------- /tests/entrypoints/offline_mode/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/entrypoints/offline_mode/__init__.py -------------------------------------------------------------------------------- /tests/entrypoints/openai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/entrypoints/openai/__init__.py -------------------------------------------------------------------------------- /tests/entrypoints/openai/test_prompt_validation.py: -------------------------------------------------------------------------------- 1 | # imports for guided decoding tests 2 | import re 3 | 4 | import openai 5 | import pytest 6 | 7 | from ...utils import RemoteOpenAIServer 8 | 9 | 10 | @pytest.mark.asyncio 11 | async def test_empty_prompt(): 12 | model_name = "gpt2" 13 | server_args = ["--enforce-eager"] 14 | with RemoteOpenAIServer(model_name, server_args) as remote_server: 15 | client = remote_server.get_async_client() 16 | 17 | with pytest.raises(openai.BadRequestError, 18 | match=re.compile('.+Prompt cannot be empty.+')): 19 | await client.completions.create(model=model_name, 20 | prompt="", 21 | max_tokens=5, 22 | temperature=0.0) 23 | -------------------------------------------------------------------------------- /tests/kernels/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/kernels/__init__.py -------------------------------------------------------------------------------- /tests/kernels/allclose_default.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | # Reference default values of atol and rtol are from 4 | # https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67 5 | default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5} 6 | default_rtol = { 7 | torch.float16: 1e-3, 8 | torch.bfloat16: 1.6e-2, 9 | torch.float: 1.3e-6 10 | } 11 | 12 | 13 | def get_default_atol(output) -> float: 14 | return default_atol[output.dtype] 15 | 16 | 17 | def get_default_rtol(output) -> float: 18 | return default_rtol[output.dtype] 19 | -------------------------------------------------------------------------------- /tests/kernels/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.utils import (create_kv_caches_with_random, 4 | create_kv_caches_with_random_flash) 5 | 6 | 7 | @pytest.fixture() 8 | def kv_cache_factory(): 9 | return create_kv_caches_with_random 10 | 11 | 12 | @pytest.fixture() 13 | def kv_cache_factory_flashinfer(): 14 | return create_kv_caches_with_random_flash 15 | -------------------------------------------------------------------------------- /tests/kernels/test_ggml.py: -------------------------------------------------------------------------------- 1 | import gguf 2 | import pytest 3 | import torch 4 | 5 | from tests.kernels.utils import opcheck 6 | from vllm import _custom_ops as ops # noqa: F401 7 | 8 | 9 | @pytest.mark.parametrize("quant_type", [12]) 10 | def test_ggml_opcheck(quant_type): 11 | block_size, type_size = gguf.GGML_QUANT_SIZES[quant_type] 12 | shape = [256, 1152] 13 | qweight = 
torch.randint(0, 100, shape, device='cuda', dtype=torch.uint8) 14 | m = qweight.shape[0] 15 | n = qweight.shape[1] // type_size * block_size 16 | opcheck(torch.ops._C.ggml_dequantize, (qweight, quant_type, m, n)) 17 | 18 | x = torch.rand((m, 512), device='cuda', dtype=torch.float16) 19 | opcheck(torch.ops._C.ggml_mul_mat_a8, 20 | (qweight, x, quant_type, qweight.shape[0])) 21 | opcheck(torch.ops._C.ggml_mul_mat_vec_a8, 22 | (qweight, x, quant_type, qweight.shape[0])) 23 | -------------------------------------------------------------------------------- /tests/kernels/test_gptq.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from tests.kernels.utils import opcheck 4 | from vllm import _custom_ops as ops # noqa: F401 5 | 6 | 7 | def test_gptq_shuffle_opcheck(): 8 | weight = torch.randint(-2000000, 9 | 2000000, (1792, 4096), 10 | device='cuda', 11 | dtype=torch.int32) 12 | perm = torch.empty((0, ), device='cuda', dtype=torch.int32) 13 | bit = 4 14 | opcheck(torch.ops._C.gptq_shuffle, (weight, perm, bit)) 15 | 16 | 17 | def test_gptq_gemm_opcheck(): 18 | a = torch.rand((240, 4096), device='cuda', dtype=torch.float16) 19 | weight = torch.randint(-2000000, 20 | 2000000, (512, 6144), 21 | device='cuda', 22 | dtype=torch.int32) 23 | zeros = torch.zeros((32, 768), device='cuda', dtype=torch.int32) 24 | scales = torch.rand((32, 6144), device='cuda', dtype=torch.float16) 25 | idx = torch.empty((0, ), device='cuda', dtype=torch.int32) 26 | use_exllama = True 27 | bit = 4 28 | opcheck(torch.ops._C.gptq_gemm, 29 | (a, weight, zeros, scales, idx, use_exllama, bit)) 30 | -------------------------------------------------------------------------------- /tests/kernels/test_permute_cols.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from tests.kernels.utils import opcheck 5 | from vllm._custom_ops import permute_cols 6 | 7 | 8 | @pytest.mark.parametrize('shape', [(1, 512), (544, 4096), (67, 8192)]) 9 | @pytest.mark.parametrize('dtype', [torch.bfloat16, torch.float16]) 10 | def test_permute_cols(shape, dtype): 11 | x = torch.randn(shape, dtype=dtype).cuda() 12 | perm = torch.randperm(x.shape[1]).to(torch.int).cuda() 13 | opcheck(torch.ops._C.permute_cols, (x, perm)) 14 | y = permute_cols(x, perm) 15 | torch.testing.assert_close(y, x[:, perm]) -------------------------------------------------------------------------------- /tests/kernels/test_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for miscellaneous utilities 3 | """ 4 | 5 | import pytest 6 | import torch 7 | 8 | from tests.kernels.utils import opcheck 9 | from vllm.platforms import current_platform 10 | 11 | 12 | def test_convert_fp8_opcheck(): 13 | data = torch.randn((256, 256), dtype=torch.float32, device="cuda") 14 | result = torch.empty_like(data, dtype=torch.float8_e4m3fn) 15 | opcheck(torch.ops._C_cache_ops.convert_fp8, (result, data, 1.0, "fp8")) 16 | 17 | 18 | @pytest.mark.skipif(not current_platform.is_cuda(), 19 | reason="Only supported for CUDA") 20 | def test_cuda_utils_opcheck(): 21 | opcheck(torch.ops._C_cuda_utils.get_device_attribute, (0, 0)) 22 | opcheck( 23 | torch.ops._C_cuda_utils. 
24 | get_max_shared_memory_per_block_device_attribute, (0, )) 25 | -------------------------------------------------------------------------------- /tests/lora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/lora/__init__.py -------------------------------------------------------------------------------- /tests/lora/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/lora/data/__init__.py -------------------------------------------------------------------------------- /tests/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/metrics/__init__.py -------------------------------------------------------------------------------- /tests/model_executor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/model_executor/__init__.py -------------------------------------------------------------------------------- /tests/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/models/__init__.py -------------------------------------------------------------------------------- /tests/models/decoder_only/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/models/decoder_only/__init__.py -------------------------------------------------------------------------------- /tests/models/decoder_only/audio_language/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/models/decoder_only/audio_language/__init__.py -------------------------------------------------------------------------------- /tests/models/decoder_only/language/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/models/decoder_only/language/__init__.py -------------------------------------------------------------------------------- /tests/models/decoder_only/language/test_granite.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM for Granite models using greedy sampling. 2 | 3 | Run `pytest tests/models/test_granite.py`. 
4 | """ 5 | import pytest 6 | 7 | from ...utils import check_logprobs_close 8 | 9 | MODELS = [ 10 | "ibm/PowerLM-3b", 11 | ] 12 | 13 | 14 | @pytest.mark.parametrize("model", MODELS) 15 | @pytest.mark.parametrize("dtype", ["bfloat16"]) 16 | @pytest.mark.parametrize("max_tokens", [64]) 17 | @pytest.mark.parametrize("num_logprobs", [5]) 18 | def test_models( 19 | hf_runner, 20 | vllm_runner, 21 | example_prompts, 22 | model: str, 23 | dtype: str, 24 | max_tokens: int, 25 | num_logprobs: int, 26 | ) -> None: 27 | # TODO(sang): Sliding window should be tested separately. 28 | with hf_runner(model, dtype=dtype) as hf_model: 29 | hf_outputs = hf_model.generate_greedy_logprobs_limit( 30 | example_prompts, max_tokens, num_logprobs) 31 | 32 | with vllm_runner(model, dtype=dtype) as vllm_model: 33 | vllm_outputs = vllm_model.generate_greedy_logprobs( 34 | example_prompts, max_tokens, num_logprobs) 35 | check_logprobs_close( 36 | outputs_0_lst=hf_outputs, 37 | outputs_1_lst=vllm_outputs, 38 | name_0="hf", 39 | name_1="vllm", 40 | ) 41 | -------------------------------------------------------------------------------- /tests/models/decoder_only/language/test_granitemoe.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM for Granite models using greedy sampling. 2 | 3 | Run `pytest tests/models/test_granite.py`. 4 | """ 5 | import pytest 6 | 7 | from ...utils import check_logprobs_close 8 | 9 | MODELS = [ 10 | "ibm/PowerMoE-3b", 11 | ] 12 | 13 | 14 | @pytest.mark.parametrize("model", MODELS) 15 | @pytest.mark.parametrize("dtype", ["bfloat16"]) 16 | @pytest.mark.parametrize("max_tokens", [64]) 17 | @pytest.mark.parametrize("num_logprobs", [5]) 18 | def test_models( 19 | hf_runner, 20 | vllm_runner, 21 | example_prompts, 22 | model: str, 23 | dtype: str, 24 | max_tokens: int, 25 | num_logprobs: int, 26 | ) -> None: 27 | with hf_runner(model, dtype=dtype) as hf_model: 28 | hf_outputs = hf_model.generate_greedy_logprobs_limit( 29 | example_prompts, max_tokens, num_logprobs) 30 | 31 | with vllm_runner(model, dtype=dtype) as vllm_model: 32 | vllm_outputs = vllm_model.generate_greedy_logprobs( 33 | example_prompts, max_tokens, num_logprobs) 34 | check_logprobs_close( 35 | outputs_0_lst=hf_outputs, 36 | outputs_1_lst=vllm_outputs, 37 | name_0="hf", 38 | name_1="vllm", 39 | ) 40 | -------------------------------------------------------------------------------- /tests/models/decoder_only/vision_language/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/models/decoder_only/vision_language/__init__.py -------------------------------------------------------------------------------- /tests/models/embedding/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/models/embedding/__init__.py -------------------------------------------------------------------------------- /tests/models/embedding/language/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/models/embedding/language/__init__.py -------------------------------------------------------------------------------- /tests/models/encoder_decoder/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/models/encoder_decoder/__init__.py -------------------------------------------------------------------------------- /tests/models/encoder_decoder/language/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/models/encoder_decoder/language/__init__.py -------------------------------------------------------------------------------- /tests/models/encoder_decoder/vision_language/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/models/encoder_decoder/vision_language/__init__.py -------------------------------------------------------------------------------- /tests/models/encoder_decoder/vision_language/test_broadcast.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from ....utils import multi_gpu_test 4 | 5 | 6 | @multi_gpu_test(num_gpus=2) 7 | @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) 8 | @pytest.mark.parametrize("model", [ 9 | "meta-llama/Llama-3.2-11B-Vision-Instruct", 10 | ]) 11 | def test_models(hf_runner, vllm_runner, image_assets, 12 | distributed_executor_backend, model) -> None: 13 | 14 | dtype = "half" 15 | max_tokens = 5 16 | num_logprobs = 5 17 | tensor_parallel_size = 2 18 | 19 | if model.startswith("meta-llama/Llama-3.2-11B-Vision-Instruct"): 20 | from .test_mllama import models, run_test 21 | else: 22 | raise NotImplementedError(f"Unsupported model: {model}") 23 | 24 | run_test( 25 | hf_runner, 26 | vllm_runner, 27 | image_assets, 28 | model=models[0], 29 | size_factors=[0.25, 0.5, 1.0], 30 | dtype=dtype, 31 | max_tokens=max_tokens, 32 | num_logprobs=num_logprobs, 33 | tensor_parallel_size=tensor_parallel_size, 34 | distributed_executor_backend=distributed_executor_backend, 35 | ) 36 | -------------------------------------------------------------------------------- /tests/mq_llm_engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/mq_llm_engine/__init__.py -------------------------------------------------------------------------------- /tests/multi_step/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/multi_step/__init__.py -------------------------------------------------------------------------------- /tests/multimodal/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/multimodal/__init__.py -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_model/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name='vllm_add_dummy_model', 4 | version='0.1', 5 | packages=['vllm_add_dummy_model'], 6 | entry_points={ 7 | 'vllm.general_plugins': 8 | 
["register_dummy_model = vllm_add_dummy_model:register"] 9 | }) 10 | -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm import ModelRegistry 2 | 3 | 4 | def register(): 5 | # Test directly passing the model 6 | from .my_opt import MyOPTForCausalLM 7 | 8 | if "MyOPTForCausalLM" not in ModelRegistry.get_supported_archs(): 9 | ModelRegistry.register_model("MyOPTForCausalLM", MyOPTForCausalLM) 10 | 11 | # Test passing lazy model 12 | if "MyGemma2Embedding" not in ModelRegistry.get_supported_archs(): 13 | ModelRegistry.register_model( 14 | "MyGemma2Embedding", 15 | "vllm_add_dummy_model.my_gemma_embedding:MyGemma2Embedding", 16 | ) 17 | 18 | if "MyLlava" not in ModelRegistry.get_supported_archs(): 19 | ModelRegistry.register_model("MyLlava", 20 | "vllm_add_dummy_model.my_llava:MyLlava") 21 | -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Union 2 | 3 | import torch 4 | 5 | from vllm.attention import AttentionMetadata 6 | from vllm.model_executor.models.gemma2_embedding import Gemma2EmbeddingModel 7 | from vllm.sequence import IntermediateTensors 8 | 9 | 10 | class MyGemma2Embedding(Gemma2EmbeddingModel): 11 | 12 | def forward( 13 | self, 14 | input_ids: torch.Tensor, 15 | positions: torch.Tensor, 16 | kv_caches: List[torch.Tensor], 17 | attn_metadata: AttentionMetadata, 18 | intermediate_tensors: Optional[IntermediateTensors] = None, 19 | inputs_embeds: Optional[torch.Tensor] = None, 20 | ) -> Union[torch.Tensor, IntermediateTensors]: 21 | hidden_states = super().forward( 22 | input_ids, 23 | positions, 24 | kv_caches, 25 | attn_metadata, 26 | intermediate_tensors=intermediate_tensors, 27 | inputs_embeds=inputs_embeds, 28 | ) 29 | 30 | if isinstance(hidden_states, IntermediateTensors): 31 | return hidden_states 32 | 33 | # Return all-zero embeddings 34 | return torch.zeros_like(hidden_states) 35 | -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import torch 4 | 5 | from vllm.inputs import INPUT_REGISTRY 6 | from vllm.model_executor.models.llava import (LlavaForConditionalGeneration, 7 | dummy_data_for_llava, 8 | get_max_llava_image_tokens, 9 | input_processor_for_llava) 10 | from vllm.model_executor.sampling_metadata import SamplingMetadata 11 | from vllm.multimodal import MULTIMODAL_REGISTRY 12 | 13 | 14 | @MULTIMODAL_REGISTRY.register_image_input_mapper() 15 | @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens) 16 | @INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava) 17 | @INPUT_REGISTRY.register_input_processor(input_processor_for_llava) 18 | class MyLlava(LlavaForConditionalGeneration): 19 | 20 | def compute_logits( 21 | self, hidden_states: torch.Tensor, 22 | sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]: 23 | # this dummy model always predicts the first token 24 | logits = super().compute_logits(hidden_states, sampling_metadata) 25 | if logits is not None: 26 | logits.zero_() 27 | logits[:, 0] += 1.0 28 
| return logits 29 | -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import torch 4 | 5 | from vllm.model_executor.models.opt import OPTForCausalLM 6 | from vllm.model_executor.sampling_metadata import SamplingMetadata 7 | 8 | 9 | class MyOPTForCausalLM(OPTForCausalLM): 10 | 11 | def compute_logits( 12 | self, hidden_states: torch.Tensor, 13 | sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]: 14 | # this dummy model always predicts the first token 15 | logits = super().compute_logits(hidden_states, sampling_metadata) 16 | if logits is not None: 17 | logits.zero_() 18 | logits[:, 0] += 1.0 19 | return logits 20 | -------------------------------------------------------------------------------- /tests/prefix_caching/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/prefix_caching/__init__.py -------------------------------------------------------------------------------- /tests/prompts/example.txt: -------------------------------------------------------------------------------- 1 | vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. 2 | Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020. 3 | Compare and contrast artificial intelligence with human intelligence in terms of processing information. 4 | Describe the basic components of a neural network and how it can be trained. 5 | Write a short story about a robot that dreams for the first time. 6 | Analyze the impact of the COVID-19 pandemic on global economic structures and future business models. 7 | Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies. 8 | Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.' 
9 | -------------------------------------------------------------------------------- /tests/quantization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/quantization/__init__.py -------------------------------------------------------------------------------- /tests/quantization/test_experts_int8.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | """Tests experts_int8 quantization startup and generation, 3 | doesn't test correctness 4 | """ 5 | import pytest 6 | 7 | from tests.quantization.utils import is_quant_method_supported 8 | 9 | MODELS = ["ai21labs/Jamba-tiny-random"] 10 | 11 | 12 | @pytest.mark.skipif(not is_quant_method_supported("experts_int8"), 13 | reason="ExpertsInt8 is not supported on this GPU type.") 14 | @pytest.mark.parametrize("model", MODELS) 15 | @pytest.mark.parametrize("dtype", ["bfloat16"]) 16 | @pytest.mark.parametrize("max_tokens", [10]) 17 | def test_model_experts_int8_startup( 18 | hf_runner, 19 | vllm_runner, 20 | example_prompts, 21 | model: str, 22 | dtype: str, 23 | max_tokens: int, 24 | ) -> None: 25 | 26 | with vllm_runner(model, dtype=dtype, 27 | quantization="experts_int8") as vllm_model: 28 | vllm_model.generate_greedy(example_prompts, max_tokens) 29 | -------------------------------------------------------------------------------- /tests/quantization/test_ipex_quant.py: -------------------------------------------------------------------------------- 1 | """Test model set-up and inference for quantized HF models supported 2 | on the CPU backend using IPEX (including AWQ). 3 | 4 | Validating the configuration and printing results for manual checking. 5 | 6 | Run `pytest tests/quantization/test_ipex_quant.py`. 
7 | """ 8 | 9 | import pytest 10 | 11 | from vllm.platforms import current_platform 12 | 13 | MODELS = [ 14 | "casperhansen/llama-3-8b-instruct-awq", 15 | ] 16 | DTYPE = ["bfloat16"] 17 | 18 | 19 | @pytest.mark.skipif(not current_platform.is_cpu(), 20 | reason="only supports the CPU backend.") 21 | @pytest.mark.parametrize("model", MODELS) 22 | @pytest.mark.parametrize("dtype", DTYPE) 23 | def test_ipex_quant(vllm_runner, model, dtype): 24 | with vllm_runner(model, dtype=dtype) as llm: 25 | output = llm.generate_greedy(["The capital of France is"], 26 | max_tokens=32) 27 | assert output 28 | print(output) 29 | -------------------------------------------------------------------------------- /tests/quantization/utils.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS 2 | from vllm.platforms import current_platform 3 | 4 | 5 | def is_quant_method_supported(quant_method: str) -> bool: 6 | # Currently, all quantization methods require Nvidia or AMD GPUs 7 | if not (current_platform.is_cuda() or current_platform.is_rocm()): 8 | return False 9 | 10 | capability = current_platform.get_device_capability() 11 | assert capability is not None 12 | 13 | min_capability = QUANTIZATION_METHODS[quant_method].get_min_capability() 14 | 15 | return capability.to_int() >= min_capability 16 | -------------------------------------------------------------------------------- /tests/samplers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/samplers/__init__.py -------------------------------------------------------------------------------- /tests/samplers/test_ignore_eos.py: -------------------------------------------------------------------------------- 1 | """Make sure ignore_eos works. 2 | 3 | Run `pytest tests/samplers/test_ignore_eos.py`. 4 | """ 5 | 6 | import pytest 7 | 8 | from vllm import SamplingParams 9 | 10 | # We also test with llama because it has generation_config to specify EOS 11 | # (past regression). 
12 | MODELS = ["facebook/opt-125m", "meta-llama/Llama-2-7b-hf"] 13 | 14 | 15 | @pytest.mark.parametrize("model", MODELS) 16 | @pytest.mark.parametrize("dtype", ["half"]) 17 | @pytest.mark.parametrize("max_tokens", [512]) 18 | def test_ignore_eos( 19 | vllm_runner, 20 | example_prompts, 21 | model: str, 22 | dtype: str, 23 | max_tokens: int, 24 | ) -> None: 25 | with vllm_runner(model, dtype=dtype) as vllm_model: 26 | sampling_params = SamplingParams(max_tokens=max_tokens, 27 | ignore_eos=True) 28 | 29 | for prompt in example_prompts: 30 | ignore_eos_output = vllm_model.model.generate( 31 | prompt, sampling_params=sampling_params) 32 | output_length = len(ignore_eos_output[0].outputs[0].token_ids) 33 | assert output_length == max_tokens 34 | -------------------------------------------------------------------------------- /tests/spec_decode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/spec_decode/__init__.py -------------------------------------------------------------------------------- /tests/spec_decode/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/spec_decode/e2e/__init__.py -------------------------------------------------------------------------------- /tests/tensorizer_loader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/tensorizer_loader/__init__.py -------------------------------------------------------------------------------- /tests/test_embedded_commit.py: -------------------------------------------------------------------------------- 1 | import vllm 2 | 3 | 4 | def test_embedded_commit_defined(): 5 | assert hasattr(vllm, "__version__") 6 | assert hasattr(vllm, "__version_tuple__") 7 | assert vllm.__version__ != "dev" 8 | assert vllm.__version_tuple__ != (0, 0, "dev") 9 | -------------------------------------------------------------------------------- /tests/test_sampling_params.py: -------------------------------------------------------------------------------- 1 | """Tests for the SamplingParams class. 
2 | """ 3 | from vllm import SamplingParams 4 | 5 | 6 | def test_max_tokens_none(): 7 | """max_tokens=None should be allowed""" 8 | SamplingParams(temperature=0.01, top_p=0.1, max_tokens=None) 9 | 10 | 11 | if __name__ == "__main__": 12 | import pytest 13 | pytest.main([__file__]) 14 | -------------------------------------------------------------------------------- /tests/test_scalartype.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from vllm.scalar_type import scalar_types 5 | 6 | 7 | @pytest.mark.parametrize("type_tuple", ( 8 | (-8, 7, scalar_types.int4), 9 | (0, 15, scalar_types.uint4), 10 | (-8, 7, scalar_types.uint4b8), 11 | (-128, 127, scalar_types.uint8b128), 12 | (-28., 28., scalar_types.float6_e3m2f), 13 | (torch.int8, scalar_types.int8), 14 | (torch.uint8, scalar_types.uint8), 15 | (torch.float8_e5m2, scalar_types.float8_e5m2), 16 | (torch.float8_e4m3fn, scalar_types.float8_e4m3fn), 17 | (torch.bfloat16, scalar_types.float16_e8m7), 18 | (torch.float16, scalar_types.float16_e5m10), 19 | ), 20 | ids=lambda x: str(x)) 21 | def test_scalar_type_min_max(type_tuple): 22 | print(type_tuple) 23 | if len(type_tuple) == 3: 24 | min, max, t = type_tuple 25 | else: 26 | torch_type, t = type_tuple 27 | if torch_type.is_floating_point: 28 | min = torch.finfo(torch_type).min 29 | max = torch.finfo(torch_type).max 30 | else: 31 | min = torch.iinfo(torch_type).min 32 | max = torch.iinfo(torch_type).max 33 | 34 | print(t, min, max, t.min(), t.max()) 35 | assert min == t.min() 36 | assert max == t.max() 37 | -------------------------------------------------------------------------------- /tests/tokenization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/tokenization/__init__.py -------------------------------------------------------------------------------- /tests/tokenization/test_cached_tokenizer.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | from transformers import AutoTokenizer 4 | 5 | from vllm.transformers_utils.tokenizer import get_cached_tokenizer 6 | 7 | 8 | def test_cached_tokenizer(): 9 | reference_tokenizer = AutoTokenizer.from_pretrained("gpt2") 10 | reference_tokenizer.add_special_tokens({"cls_token": ""}) 11 | reference_tokenizer.add_special_tokens( 12 | {"additional_special_tokens": [""]}) 13 | cached_tokenizer = get_cached_tokenizer(deepcopy(reference_tokenizer)) 14 | 15 | assert reference_tokenizer.encode("prompt") == cached_tokenizer.encode( 16 | "prompt") 17 | assert set(reference_tokenizer.all_special_ids) == set( 18 | cached_tokenizer.all_special_ids) 19 | assert set(reference_tokenizer.all_special_tokens) == set( 20 | cached_tokenizer.all_special_tokens) 21 | assert set(reference_tokenizer.all_special_tokens_extended) == set( 22 | cached_tokenizer.all_special_tokens_extended) 23 | -------------------------------------------------------------------------------- /tests/tokenization/test_get_eos.py: -------------------------------------------------------------------------------- 1 | """ 2 | This test file includes some cases where it is inappropriate to 3 | only get the `eos_token_id` from the tokenizer as defined by 4 | :meth:`vllm.LLMEngine._get_eos_token_id`. 
5 | """ 6 | from vllm.transformers_utils.config import try_get_generation_config 7 | from vllm.transformers_utils.tokenizer import get_tokenizer 8 | 9 | 10 | def test_get_llama3_eos_token(): 11 | model_name = "meta-llama/Meta-Llama-3-8B-Instruct" 12 | 13 | tokenizer = get_tokenizer(model_name) 14 | assert tokenizer.eos_token_id == 128009 15 | 16 | generation_config = try_get_generation_config(model_name, 17 | trust_remote_code=False) 18 | assert generation_config is not None 19 | assert generation_config.eos_token_id == [128001, 128009] 20 | 21 | 22 | def test_get_blip2_eos_token(): 23 | model_name = "Salesforce/blip2-opt-2.7b" 24 | 25 | tokenizer = get_tokenizer(model_name) 26 | assert tokenizer.eos_token_id == 2 27 | 28 | generation_config = try_get_generation_config(model_name, 29 | trust_remote_code=False) 30 | assert generation_config is not None 31 | assert generation_config.eos_token_id == 50118 32 | -------------------------------------------------------------------------------- /tests/tokenization/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from transformers import PreTrainedTokenizerBase 3 | 4 | from vllm.transformers_utils.tokenizer import get_tokenizer 5 | 6 | TOKENIZER_NAMES = [ 7 | "facebook/opt-125m", 8 | "gpt2", 9 | ] 10 | 11 | 12 | @pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES) 13 | def test_tokenizer_revision(tokenizer_name: str): 14 | # Assume that "main" branch always exists 15 | tokenizer = get_tokenizer(tokenizer_name, revision="main") 16 | assert isinstance(tokenizer, PreTrainedTokenizerBase) 17 | 18 | # Assume that "never" branch always does not exist 19 | with pytest.raises(OSError, match='not a valid git identifier'): 20 | get_tokenizer(tokenizer_name, revision="never") 21 | -------------------------------------------------------------------------------- /tests/tool_use/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/tool_use/__init__.py -------------------------------------------------------------------------------- /tests/tool_use/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pytest_asyncio 3 | from huggingface_hub import snapshot_download 4 | 5 | from tests.utils import RemoteOpenAIServer 6 | 7 | from .utils import ARGS, CONFIGS, ServerConfig 8 | 9 | 10 | # for each server config, download the model and return the config 11 | @pytest.fixture(scope="session", params=CONFIGS.keys()) 12 | def server_config(request): 13 | config = CONFIGS[request.param] 14 | # download model and tokenizer using transformers 15 | snapshot_download(config["model"]) 16 | yield CONFIGS[request.param] 17 | 18 | 19 | # run this for each server config 20 | @pytest.fixture(scope="session") 21 | def server(request, server_config: ServerConfig): 22 | model = server_config["model"] 23 | args_for_model = server_config["arguments"] 24 | with RemoteOpenAIServer(model, ARGS + args_for_model, 25 | max_wait_seconds=480) as server: 26 | yield server 27 | 28 | 29 | @pytest_asyncio.fixture 30 | async def client(server: RemoteOpenAIServer): 31 | async with server.get_async_client() as async_client: 32 | yield async_client 33 | -------------------------------------------------------------------------------- /tests/tpu/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/tpu/__init__.py -------------------------------------------------------------------------------- /tests/tpu/test_custom_dispatcher.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from vllm.compilation.levels import CompilationLevel 4 | 5 | from ..utils import compare_two_settings 6 | 7 | # --enforce-eager on TPU causes graph compilation 8 | # this times out default Health Check in the MQLLMEngine, 9 | # so we set the timeout here to 30s 10 | os.environ["VLLM_RPC_TIMEOUT"] = "30000" 11 | 12 | 13 | def test_custom_dispatcher(): 14 | compare_two_settings( 15 | "google/gemma-2b", 16 | arg1=["--enforce-eager"], 17 | arg2=["--enforce-eager"], 18 | env1={"VLLM_TORCH_COMPILE_LEVEL": str(CompilationLevel.DYNAMO_ONCE)}, 19 | env2={"VLLM_TORCH_COMPILE_LEVEL": str(CompilationLevel.DYNAMO_AS_IS)}) 20 | -------------------------------------------------------------------------------- /tests/tracing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/tracing/__init__.py -------------------------------------------------------------------------------- /tests/weight_loading/models-large.txt: -------------------------------------------------------------------------------- 1 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main 2 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main 3 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W8A16-quantized, main 4 | gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, main 5 | awq_marlin, casperhansen/deepseek-coder-v2-instruct-awq, main -------------------------------------------------------------------------------- /tests/weight_loading/run_model_weight_loading_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SUCCESS=0 3 | 4 | while getopts "c:" OPT; do 5 | case ${OPT} in 6 | c ) 7 | CONFIG="$OPTARG" 8 | ;; 9 | \? ) 10 | usage 11 | exit 1 12 | ;; 13 | esac 14 | done 15 | 16 | 17 | IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG 18 | 19 | for MODEL_CONFIG in "${MODEL_CONFIGS[@]}" 20 | do 21 | LOCAL_SUCCESS=0 22 | IFS=', ' read -r -a array <<< "$MODEL_CONFIG" 23 | 24 | echo "=== RUNNING MODEL: $MODEL_CONFIG ===" 25 | 26 | export QUANTIZATION=${array[0]} 27 | export MODEL_NAME=${array[1]} 28 | export REVISION=${array[2]} 29 | pytest -s weight_loading/test_weight_loading.py || LOCAL_SUCCESS=$? 
30 | 31 | if [[ $LOCAL_SUCCESS == 0 ]]; then 32 | echo "=== PASSED MODEL: ${MODEL_CONFIG} ===" 33 | else 34 | echo "=== FAILED MODEL: ${MODEL_CONFIG} ===" 35 | fi 36 | 37 | SUCCESS=$((SUCCESS + LOCAL_SUCCESS)) 38 | 39 | done 40 | 41 | if [ "${SUCCESS}" -eq "0" ]; then 42 | exit 0 43 | else 44 | exit 1 45 | fi 46 | -------------------------------------------------------------------------------- /tests/weight_loading/test_weight_loading.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | 5 | MAX_MODEL_LEN = 1024 6 | MODEL_NAME = os.environ.get("MODEL_NAME", 7 | "robertgshaw2/zephyr-7b-beta-channelwise-gptq") 8 | REVISION = os.environ.get("REVISION", "main") 9 | QUANTIZATION = os.environ.get("QUANTIZATION", "gptq_marlin") 10 | 11 | 12 | def test_weight_loading(vllm_runner): 13 | """ 14 | Test parameter weight loading with tp>1. 15 | """ 16 | with vllm_runner(model_name=MODEL_NAME, 17 | revision=REVISION, 18 | dtype=torch.half if QUANTIZATION == "gptq" else "auto", 19 | quantization=QUANTIZATION, 20 | max_model_len=MAX_MODEL_LEN, 21 | tensor_parallel_size=2) as model: 22 | 23 | output = model.generate_greedy("Hello world!", max_tokens=20) 24 | print(output) 25 | assert output 26 | -------------------------------------------------------------------------------- /tests/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/worker/__init__.py -------------------------------------------------------------------------------- /tools/actionlint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if command -v actionlint &> /dev/null; then 4 | actionlint "$@" 5 | exit 0 6 | elif [ -x ./actionlint ]; then 7 | ./actionlint "$@" 8 | exit 0 9 | fi 10 | 11 | # download a binary to the current directory - v1.7.3 12 | bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash) 13 | ./actionlint "$@" 14 | -------------------------------------------------------------------------------- /tools/mypy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CI=${1:-0} 4 | 5 | run_mypy() { 6 | echo "Running mypy on $1" 7 | if [ $CI -eq 1 ] && [ -z "$1" ]; then 8 | mypy "$@" 9 | return 10 | fi 11 | mypy --follow-imports skip "$@" 12 | } 13 | 14 | run_mypy # Note that this is less strict than CI 15 | run_mypy tests 16 | run_mypy vllm/assets 17 | run_mypy vllm/attention 18 | #run_mypy vllm/compilation 19 | #run_mypy vllm/core 20 | run_mypy vllm/distributed 21 | run_mypy vllm/engine 22 | run_mypy vllm/entrypoints 23 | run_mypy vllm/executor 24 | #run_mypy vllm/inputs 25 | run_mypy vllm/logging 26 | run_mypy vllm/lora 27 | run_mypy vllm/model_executor 28 | run_mypy vllm/multimodal 29 | run_mypy vllm/platforms 30 | run_mypy vllm/plugins 31 | run_mypy vllm/prompt_adapter 32 | run_mypy vllm/spec_decode 33 | run_mypy vllm/transformers_utils 34 | run_mypy vllm/usage 35 | #run_mypy vllm/vllm_flash_attn 36 | run_mypy vllm/worker 37 | -------------------------------------------------------------------------------- /use_existing_torch.py: -------------------------------------------------------------------------------- 1 | import glob 2 | 3 | requires_files = glob.glob('requirements*.txt') 4 | requires_files += ["pyproject.toml"] 5 
| for file in requires_files: 6 | print(f">>> cleaning {file}") 7 | with open(file, 'r') as f: 8 | lines = f.readlines() 9 | if "torch" in "".join(lines).lower(): 10 | print("removed:") 11 | with open(file, 'w') as f: 12 | for line in lines: 13 | if 'torch' not in line.lower(): 14 | f.write(line) 15 | else: 16 | print(line.strip()) 17 | print(f"<<< done cleaning {file}") 18 | print() 19 | -------------------------------------------------------------------------------- /vllm/__init__.py: -------------------------------------------------------------------------------- 1 | """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" 2 | 3 | from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs 4 | from vllm.engine.async_llm_engine import AsyncLLMEngine 5 | from vllm.engine.llm_engine import LLMEngine 6 | from vllm.entrypoints.llm import LLM 7 | from vllm.executor.ray_utils import initialize_ray_cluster 8 | from vllm.inputs import PromptType, TextPrompt, TokensPrompt 9 | from vllm.model_executor.models import ModelRegistry 10 | from vllm.outputs import (CompletionOutput, EmbeddingOutput, 11 | EmbeddingRequestOutput, RequestOutput) 12 | from vllm.pooling_params import PoolingParams 13 | from vllm.sampling_params import SamplingParams 14 | 15 | from .version import __version__, __version_tuple__ 16 | 17 | __all__ = [ 18 | "__version__", 19 | "__version_tuple__", 20 | "LLM", 21 | "ModelRegistry", 22 | "PromptType", 23 | "TextPrompt", 24 | "TokensPrompt", 25 | "SamplingParams", 26 | "RequestOutput", 27 | "CompletionOutput", 28 | "EmbeddingOutput", 29 | "EmbeddingRequestOutput", 30 | "LLMEngine", 31 | "EngineArgs", 32 | "AsyncLLMEngine", 33 | "AsyncEngineArgs", 34 | "initialize_ray_cluster", 35 | "PoolingParams", 36 | ] 37 | -------------------------------------------------------------------------------- /vllm/adapter_commons/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/adapter_commons/__init__.py -------------------------------------------------------------------------------- /vllm/adapter_commons/layers.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Tuple 3 | 4 | 5 | @dataclass 6 | class AdapterMapping: 7 | # Per every token in input_ids: 8 | index_mapping: Tuple[int, ...] 9 | # Per sampled token: 10 | prompt_mapping: Tuple[int, ...] 11 | 12 | def __post_init__(self): 13 | self.index_mapping = tuple(self.index_mapping) 14 | self.prompt_mapping = tuple(self.prompt_mapping) -------------------------------------------------------------------------------- /vllm/adapter_commons/request.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class AdapterRequest(ABC): 5 | """ 6 | Base class for adapter requests. 
7 | """ 8 | 9 | @property 10 | @abstractmethod 11 | def adapter_id(self) -> int: 12 | raise NotImplementedError 13 | 14 | def __post_init__(self) -> None: 15 | if self.adapter_id < 1: 16 | raise ValueError(f"id must be > 0, got {self.adapter_id}") 17 | 18 | def __eq__(self, value: object) -> bool: 19 | return isinstance( 20 | value, self.__class__) and self.adapter_id == value.adapter_id 21 | 22 | def __hash__(self) -> int: 23 | return hash(self.adapter_id) 24 | -------------------------------------------------------------------------------- /vllm/adapter_commons/worker_manager.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Optional, Set 3 | 4 | import torch 5 | 6 | 7 | class AbstractWorkerManager(ABC): 8 | 9 | def __init__(self, device: torch.device): 10 | self.device = device 11 | 12 | @property 13 | @abstractmethod 14 | def is_enabled(self) -> bool: 15 | raise NotImplementedError 16 | 17 | @abstractmethod 18 | def set_active_adapters(self, requests: Set[Any], 19 | mapping: Optional[Any]) -> None: 20 | raise NotImplementedError 21 | 22 | @abstractmethod 23 | def add_adapter(self, adapter_request: Any) -> bool: 24 | raise NotImplementedError 25 | 26 | @abstractmethod 27 | def remove_adapter(self, adapter_id: int) -> bool: 28 | raise NotImplementedError 29 | 30 | @abstractmethod 31 | def remove_all_adapters(self) -> None: 32 | raise NotImplementedError 33 | 34 | @abstractmethod 35 | def list_adapters(self) -> Set[int]: 36 | raise NotImplementedError 37 | -------------------------------------------------------------------------------- /vllm/assets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/assets/__init__.py -------------------------------------------------------------------------------- /vllm/assets/audio.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Literal, Tuple 3 | from urllib.parse import urljoin 4 | 5 | import librosa 6 | import numpy as np 7 | 8 | from vllm.assets.base import get_vllm_public_assets, vLLM_S3_BUCKET_URL 9 | 10 | ASSET_DIR = "multimodal_asset" 11 | 12 | 13 | @dataclass(frozen=True) 14 | class AudioAsset: 15 | name: Literal["winning_call", "mary_had_lamb"] 16 | 17 | @property 18 | def audio_and_sample_rate(self) -> Tuple[np.ndarray, int]: 19 | 20 | audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg", 21 | s3_prefix=ASSET_DIR) 22 | y, sr = librosa.load(audio_path, sr=None) 23 | assert isinstance(sr, int) 24 | return y, sr 25 | 26 | @property 27 | def url(self) -> str: 28 | return urljoin(vLLM_S3_BUCKET_URL, f"{ASSET_DIR}/{self.name}.ogg") 29 | -------------------------------------------------------------------------------- /vllm/assets/base.py: -------------------------------------------------------------------------------- 1 | from functools import lru_cache 2 | from pathlib import Path 3 | from typing import Optional 4 | 5 | import vllm.envs as envs 6 | from vllm.connections import global_http_connection 7 | from vllm.envs import VLLM_IMAGE_FETCH_TIMEOUT 8 | 9 | vLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com" 10 | 11 | 12 | def get_cache_dir() -> Path: 13 | """Get the path to the cache for storing downloaded assets.""" 14 | path = Path(envs.VLLM_ASSETS_CACHE) 15 | 
path.mkdir(parents=True, exist_ok=True) 16 | 17 | return path 18 | 19 | 20 | @lru_cache 21 | def get_vllm_public_assets(filename: str, 22 | s3_prefix: Optional[str] = None) -> Path: 23 | """ 24 | Download an asset file from ``s3://vllm-public-assets`` 25 | and return the path to the downloaded file. 26 | """ 27 | asset_directory = get_cache_dir() / "vllm_public_assets" 28 | asset_directory.mkdir(parents=True, exist_ok=True) 29 | 30 | asset_path = asset_directory / filename 31 | if not asset_path.exists(): 32 | if s3_prefix is not None: 33 | filename = s3_prefix + "/" + filename 34 | global_http_connection.download_file( 35 | f"{vLLM_S3_BUCKET_URL}/{filename}", 36 | asset_path, 37 | timeout=VLLM_IMAGE_FETCH_TIMEOUT) 38 | 39 | return asset_path 40 | -------------------------------------------------------------------------------- /vllm/assets/image.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Literal 3 | 4 | import torch 5 | from PIL import Image 6 | 7 | from vllm.assets.base import get_vllm_public_assets 8 | 9 | VLM_IMAGES_DIR = "vision_model_images" 10 | 11 | 12 | @dataclass(frozen=True) 13 | class ImageAsset: 14 | name: Literal["stop_sign", "cherry_blossom"] 15 | 16 | @property 17 | def pil_image(self) -> Image.Image: 18 | 19 | image_path = get_vllm_public_assets(filename=f"{self.name}.jpg", 20 | s3_prefix=VLM_IMAGES_DIR) 21 | return Image.open(image_path) 22 | 23 | @property 24 | def image_embeds(self) -> torch.Tensor: 25 | """ 26 | Image embeddings, only used for testing purposes with llava 1.5. 27 | """ 28 | image_path = get_vllm_public_assets(filename=f"{self.name}.pt", 29 | s3_prefix=VLM_IMAGES_DIR) 30 | return torch.load(image_path) 31 | -------------------------------------------------------------------------------- /vllm/attention/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.attention.backends.abstract import (AttentionBackend, 2 | AttentionMetadata, 3 | AttentionMetadataBuilder, 4 | AttentionState, AttentionType) 5 | from vllm.attention.layer import Attention 6 | from vllm.attention.selector import get_attn_backend 7 | 8 | __all__ = [ 9 | "Attention", 10 | "AttentionBackend", 11 | "AttentionMetadata", 12 | "AttentionType", 13 | "AttentionMetadataBuilder", 14 | "Attention", 15 | "AttentionState", 16 | "get_attn_backend", 17 | ] 18 | -------------------------------------------------------------------------------- /vllm/attention/backends/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/attention/backends/__init__.py -------------------------------------------------------------------------------- /vllm/attention/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/attention/ops/__init__.py -------------------------------------------------------------------------------- /vllm/attention/ops/blocksparse_attention/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/attention/ops/blocksparse_attention/__init__.py -------------------------------------------------------------------------------- 
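The asset helpers listed above (vllm/assets/base.py, vllm/assets/image.py, vllm/assets/audio.py) lazily download test media from the public S3 bucket on first access and cache them under the VLLM_ASSETS_CACHE directory. A minimal usage sketch, assuming network access to that bucket and an importable vllm installation; the printed attributes are illustrative only:

from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset

# First access downloads the file via get_vllm_public_assets();
# subsequent accesses reuse the cached copy in the assets cache directory.
image = ImageAsset("stop_sign").pil_image
waveform, sample_rate = AudioAsset("winning_call").audio_and_sample_rate

print(image.size)                    # PIL (width, height)
print(waveform.shape, sample_rate)   # decoded waveform and its sample rate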
/vllm/compilation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/compilation/__init__.py -------------------------------------------------------------------------------- /vllm/compilation/compile_context.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | from typing import Any 3 | 4 | _compile_context: Any = None 5 | 6 | 7 | def get_compile_context() -> Any: 8 | """Get the current compile context.""" 9 | return _compile_context 10 | 11 | 12 | @contextmanager 13 | def set_compile_context(context: Any): 14 | """A context manager that stores the current compile context, 15 | usually it is a list of sizes to specialize. 16 | """ 17 | global _compile_context 18 | prev_context = _compile_context 19 | _compile_context = context 20 | try: 21 | yield 22 | finally: 23 | _compile_context = prev_context 24 | -------------------------------------------------------------------------------- /vllm/compilation/levels.py: -------------------------------------------------------------------------------- 1 | # constants for the levels of the compilation process 2 | 3 | 4 | class CompilationLevel: 5 | NO_COMPILATION = 0 6 | DYNAMO_AS_IS = 1 7 | DYNAMO_ONCE = 2 8 | INDUCTOR = 3 9 | INDUCTOR_MAX_AUTOTUNE = 4 10 | -------------------------------------------------------------------------------- /vllm/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/core/__init__.py -------------------------------------------------------------------------------- /vllm/core/block/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/core/block/__init__.py -------------------------------------------------------------------------------- /vllm/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | from .communication_op import * 2 | from .parallel_state import * 3 | from .utils import * 4 | -------------------------------------------------------------------------------- /vllm/distributed/communication_op.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Union 2 | 3 | import torch 4 | import torch.distributed 5 | 6 | from .parallel_state import get_tp_group 7 | 8 | 9 | def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: 10 | """All-reduce the input tensor across model parallel group.""" 11 | return get_tp_group().all_reduce(input_) 12 | 13 | 14 | def tensor_model_parallel_all_gather(input_: torch.Tensor, 15 | dim: int = -1) -> torch.Tensor: 16 | """All-gather the input tensor across model parallel group.""" 17 | return get_tp_group().all_gather(input_, dim) 18 | 19 | 20 | def tensor_model_parallel_gather(input_: torch.Tensor, 21 | dst: int = 0, 22 | dim: int = -1) -> Optional[torch.Tensor]: 23 | """Gather the input tensor across model parallel group.""" 24 | return get_tp_group().gather(input_, dst, dim) 25 | 26 | 27 | def broadcast_tensor_dict(tensor_dict: Optional[Dict[Any, Union[torch.Tensor, 28 | Any]]] = None, 29 | src: int = 0): 30 | if not 
torch.distributed.is_initialized(): 31 | return tensor_dict 32 | return get_tp_group().broadcast_tensor_dict(tensor_dict, src) 33 | -------------------------------------------------------------------------------- /vllm/distributed/device_communicators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/distributed/device_communicators/__init__.py -------------------------------------------------------------------------------- /vllm/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/engine/__init__.py -------------------------------------------------------------------------------- /vllm/engine/output_processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/engine/output_processor/__init__.py -------------------------------------------------------------------------------- /vllm/engine/output_processor/util.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from typing import Sequence as GenericSequence 3 | from typing import Union 4 | 5 | from vllm.model_executor.layers.sampler import SamplerOutput 6 | from vllm.sequence import PoolerOutput, SequenceGroupOutput 7 | 8 | 9 | def create_output_by_sequence_group( 10 | outputs: GenericSequence[Union[SamplerOutput, PoolerOutput]], 11 | num_seq_groups: int) -> List[List[SequenceGroupOutput]]: 12 | """Helper method which transforms a 2d list organized by 13 | [step][sequence group] into [sequence group][step]. 
14 | """ 15 | output_by_sequence_group: List[List[SequenceGroupOutput]] = [ 16 | [] for _ in range(num_seq_groups) 17 | ] 18 | for step in outputs: 19 | for i, sequence_group_output in enumerate(step): 20 | output_by_sequence_group[i].append(sequence_group_output) 21 | 22 | return output_by_sequence_group 23 | -------------------------------------------------------------------------------- /vllm/entrypoints/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/entrypoints/__init__.py -------------------------------------------------------------------------------- /vllm/entrypoints/openai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/entrypoints/openai/__init__.py -------------------------------------------------------------------------------- /vllm/entrypoints/openai/tool_parsers/__init__.py: -------------------------------------------------------------------------------- 1 | from .abstract_tool_parser import ToolParser, ToolParserManager 2 | from .hermes_tool_parser import Hermes2ProToolParser 3 | from .internlm2_tool_parser import Internlm2ToolParser 4 | from .llama_tool_parser import Llama3JsonToolParser 5 | from .mistral_tool_parser import MistralToolParser 6 | 7 | __all__ = [ 8 | "ToolParser", "ToolParserManager", "Hermes2ProToolParser", 9 | "MistralToolParser", "Internlm2ToolParser", "Llama3JsonToolParser" 10 | ] 11 | -------------------------------------------------------------------------------- /vllm/executor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/executor/__init__.py -------------------------------------------------------------------------------- /vllm/executor/msgspec_utils.py: -------------------------------------------------------------------------------- 1 | from array import array 2 | from typing import Any, Type 3 | 4 | from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE 5 | 6 | 7 | def encode_hook(obj: Any) -> Any: 8 | """Custom msgspec enc hook that supports array types. 9 | 10 | See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder 11 | """ 12 | if isinstance(obj, array): 13 | assert obj.typecode == VLLM_TOKEN_ID_ARRAY_TYPE, ( 14 | f"vLLM array type should use '{VLLM_TOKEN_ID_ARRAY_TYPE}' type. " 15 | f"Given array has a type code of {obj.typecode}.") 16 | return obj.tobytes() 17 | 18 | 19 | def decode_hook(type: Type, obj: Any) -> Any: 20 | """Custom msgspec dec hook that supports array types. 
21 | 22 | See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder 23 | """ 24 | if type is array: 25 | deserialized = array(VLLM_TOKEN_ID_ARRAY_TYPE) 26 | deserialized.frombytes(obj) 27 | return deserialized 28 | -------------------------------------------------------------------------------- /vllm/executor/multiproc_xpu_executor.py: -------------------------------------------------------------------------------- 1 | import vllm.envs as envs 2 | from vllm.executor.multiproc_gpu_executor import ( 3 | MultiprocessingGPUExecutor, MultiprocessingGPUExecutorAsync) 4 | from vllm.executor.xpu_executor import XPUExecutor 5 | from vllm.logger import init_logger 6 | from vllm.utils import make_async 7 | 8 | logger = init_logger(__name__) 9 | 10 | 11 | class MultiprocessingXPUExecutor(MultiprocessingGPUExecutor, XPUExecutor): 12 | """Python multiprocessing-based multi-XPU executor""" 13 | 14 | def _check_executor_parameters(self): 15 | mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD 16 | if mp_method != "spawn": 17 | raise RuntimeError( 18 | "XPU multiprocess executor only support spawn as mp method") 19 | 20 | 21 | class MultiprocessingXPUExecutorAsync(MultiprocessingXPUExecutor, 22 | MultiprocessingGPUExecutorAsync): 23 | 24 | def __init__(self, *args, **kwargs): 25 | super().__init__(*args, **kwargs) 26 | self.driver_exec_model = make_async(self.driver_worker.execute_model) 27 | -------------------------------------------------------------------------------- /vllm/forward_context.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | from typing import Any 3 | 4 | _forward_context: Any = None 5 | 6 | 7 | def get_forward_context() -> Any: 8 | """Get the current forward context.""" 9 | return _forward_context 10 | 11 | 12 | @contextmanager 13 | def set_forward_context(context: Any): 14 | """A context manager that stores the current forward context, 15 | can be attention metadata, etc.""" 16 | global _forward_context 17 | prev_context = _forward_context 18 | _forward_context = context 19 | try: 20 | yield 21 | finally: 22 | _forward_context = prev_context 23 | -------------------------------------------------------------------------------- /vllm/logging/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.logging.formatter import NewLineFormatter 2 | 3 | __all__ = [ 4 | "NewLineFormatter", 5 | ] 6 | -------------------------------------------------------------------------------- /vllm/logging/formatter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | class NewLineFormatter(logging.Formatter): 5 | """Adds logging prefix to newlines to align multi-line messages.""" 6 | 7 | def __init__(self, fmt, datefmt=None, style="%"): 8 | logging.Formatter.__init__(self, fmt, datefmt, style) 9 | 10 | def format(self, record): 11 | msg = logging.Formatter.format(self, record) 12 | if record.message != "": 13 | parts = msg.split(record.message) 14 | msg = msg.replace("\n", "\r\n" + parts[0]) 15 | return msg 16 | -------------------------------------------------------------------------------- /vllm/lora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/lora/__init__.py -------------------------------------------------------------------------------- 
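The forward-context helper above follows the same pattern as the compile-context module earlier in this dump: a module-level global plus a context manager that restores the previous value on exit, so per-step metadata can be reached from deep inside the model without being threaded through every call signature. A minimal usage sketch; the attn_metadata dict is a made-up stand-in for whatever the model runner actually builds:

from vllm.forward_context import get_forward_context, set_forward_context

attn_metadata = {"num_prefill_tokens": 128}  # hypothetical per-step metadata

with set_forward_context(attn_metadata):
    # Code running inside the forward pass can fetch the current context
    # without it being passed down explicitly.
    assert get_forward_context() is attn_metadata

# On exit the previous context (None here) is restored.
assert get_forward_context() is None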
/vllm/lora/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/lora/ops/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.parameter import (BasevLLMParameter, 2 | PackedvLLMParameter) 3 | from vllm.model_executor.sampling_metadata import (SamplingMetadata, 4 | SamplingMetadataCache) 5 | from vllm.model_executor.utils import set_random_seed 6 | 7 | __all__ = [ 8 | "SamplingMetadata", 9 | "SamplingMetadataCache", 10 | "set_random_seed", 11 | "BasevLLMParameter", 12 | "PackedvLLMParameter", 13 | ] 14 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/model_executor/layers/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/fused_moe/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.layers.fused_moe.layer import ( 2 | FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) 3 | from vllm.triton_utils import HAS_TRITON 4 | 5 | __all__ = [ 6 | "FusedMoE", 7 | "FusedMoEMethodBase", 8 | "FusedMoeWeightScaleSupported", 9 | ] 10 | 11 | if HAS_TRITON: 12 | from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( 13 | fused_marlin_moe, single_marlin_moe) 14 | from vllm.model_executor.layers.fused_moe.fused_moe import ( 15 | fused_experts, fused_moe, fused_topk, get_config_file_name, 16 | grouped_topk) 17 | 18 | __all__ += [ 19 | "fused_marlin_moe", 20 | "single_marlin_moe", 21 | "fused_moe", 22 | "fused_topk", 23 | "fused_experts", 24 | "get_config_file_name", 25 | "grouped_topk", 26 | ] 27 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/fused_moe/configs/README: -------------------------------------------------------------------------------- 1 | This directory contains tuned configurations for different settings of the fused_moe kernel. 2 | For different settings of 3 | - E (number of experts) 4 | - N (intermediate size) 5 | - device_name (torch.cuda.get_device_name()) 6 | the JSON file contains a mapping from M (batch size) to the chosen configuration. 7 | 8 | The example configurations provided are for the Mixtral model for TP2 on H100 9 | and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have 10 | N = 7168 and for TP4 we have N = 3584. 
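To make the README's mapping concrete, here is a small sketch of how such a per-(E, N, device) JSON could be consumed. The keys and values are illustrative placeholders rather than the contents of a shipped config, and the closest-M lookup is just one plausible selection rule for batch sizes that were not tuned directly:

# Hypothetical tuned entries: batch size M -> Triton launch parameters.
example_config = {
    "1":  {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64,  "num_warps": 4},
    "64": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "num_warps": 8},
}

def pick_config(config: dict, m: int) -> dict:
    """Pick the tuned entry whose batch size is closest to the actual M."""
    key = min(config, key=lambda k: abs(int(k) - m))
    return config[key]

print(pick_config(example_config, 48))  # falls back to the "64" entry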
11 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/mamba/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/model_executor/layers/mamba/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/mamba/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/model_executor/layers/mamba/ops/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/compressed_tensors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py: -------------------------------------------------------------------------------- 1 | from .compressed_tensors_scheme import CompressedTensorsScheme 2 | from .compressed_tensors_w4a16_24 import (W4A16SPARSE24_SUPPORTED_BITS, 3 | CompressedTensorsW4A16Sparse24) 4 | from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8 5 | from .compressed_tensors_w8a8_int8 import CompressedTensorsW8A8Int8 6 | from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8 7 | from .compressed_tensors_wNa16 import (WNA16_SUPPORTED_BITS, 8 | CompressedTensorsWNA16) 9 | 10 | __all__ = [ 11 | "CompressedTensorsScheme", 12 | "CompressedTensorsWNA16", 13 | "CompressedTensorsW8A16Fp8", 14 | "CompressedTensorsW4A16Sparse24", 15 | "CompressedTensorsW8A8Int8", 16 | "CompressedTensorsW8A8Fp8", 17 | "WNA16_SUPPORTED_BITS", 18 | "W4A16SPARSE24_SUPPORTED_BITS", 19 | ] 20 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .layer_utils import replace_parameter, update_tensor_inplace 2 | 3 | __all__ = ['update_tensor_inplace', 'replace_parameter'] 4 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/machete_utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple 2 | 3 | import torch 4 | 5 | from vllm.scalar_type import ScalarType, scalar_types 6 | 7 | MACHETE_SUPPORTED_GROUP_SIZES = [-1, 128] 8 | MACHETE_PREPACKED_BLOCK_SHAPE = [64, 128] 9 | 10 | 11 | def query_machete_supported_quant_types(zero_points: bool) -> List[ScalarType]: 12 | if zero_points: 13 | return [scalar_types.uint4, scalar_types.uint8] 14 | else: 15 | return [scalar_types.uint4b8, scalar_types.uint8b128] 16 | 17 | 18 | def query_machete_supported_act_types(zero_points: bool) -> List[ScalarType]: 19 | return [torch.float16, torch.bfloat16] 20 | 21 | 22 | def check_machete_supports_shape(in_features: int, out_featrues: int) \ 23 | -> Tuple[bool, Optional[str]]: 24 | if in_features % MACHETE_PREPACKED_BLOCK_SHAPE[0] != 0: 25 | return False, "Input features size 
must be divisible by "\ 26 | f"{MACHETE_PREPACKED_BLOCK_SHAPE[0]}" 27 | if out_featrues % MACHETE_PREPACKED_BLOCK_SHAPE[1] != 0: 28 | return False, "Output features size must be divisible by "\ 29 | f"{MACHETE_PREPACKED_BLOCK_SHAPE[1]}" 30 | return True, None 31 | -------------------------------------------------------------------------------- /vllm/model_executor/model_loader/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from torch import nn 4 | 5 | from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, 6 | ModelConfig, ParallelConfig, SchedulerConfig) 7 | from vllm.model_executor.model_loader.loader import (BaseModelLoader, 8 | get_model_loader) 9 | from vllm.model_executor.model_loader.utils import ( 10 | get_architecture_class_name, get_model_architecture) 11 | 12 | 13 | def get_model(*, model_config: ModelConfig, load_config: LoadConfig, 14 | device_config: DeviceConfig, parallel_config: ParallelConfig, 15 | scheduler_config: SchedulerConfig, 16 | lora_config: Optional[LoRAConfig], 17 | cache_config: CacheConfig) -> nn.Module: 18 | loader = get_model_loader(load_config) 19 | return loader.load_model(model_config=model_config, 20 | device_config=device_config, 21 | lora_config=lora_config, 22 | parallel_config=parallel_config, 23 | scheduler_config=scheduler_config, 24 | cache_config=cache_config) 25 | 26 | 27 | __all__ = [ 28 | "get_model", "get_model_loader", "BaseModelLoader", 29 | "get_architecture_class_name", "get_model_architecture" 30 | ] 31 | -------------------------------------------------------------------------------- /vllm/model_executor/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .interfaces import (HasInnerState, SupportsLoRA, SupportsMultiModal, 2 | SupportsPP, has_inner_state, supports_lora, 3 | supports_multimodal, supports_pp) 4 | from .interfaces_base import (VllmModelForEmbedding, 5 | VllmModelForTextGeneration, is_embedding_model, 6 | is_text_generation_model) 7 | from .registry import ModelRegistry 8 | 9 | __all__ = [ 10 | "ModelRegistry", 11 | "VllmModelForEmbedding", 12 | "is_embedding_model", 13 | "VllmModelForTextGeneration", 14 | "is_text_generation_model", 15 | "HasInnerState", 16 | "has_inner_state", 17 | "SupportsLoRA", 18 | "supports_lora", 19 | "SupportsMultiModal", 20 | "supports_multimodal", 21 | "SupportsPP", 22 | "supports_pp", 23 | ] -------------------------------------------------------------------------------- /vllm/model_executor/models/phi3.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Adapted from llama.py 3 | """Inference-only Phi3 model code inherit from Llama.py""" 4 | 5 | from vllm.model_executor.models.llama import LlamaForCausalLM 6 | 7 | 8 | class Phi3ForCausalLM(LlamaForCausalLM): 9 | 10 | packed_modules_mapping = { 11 | "qkv_proj": [ 12 | "qkv_proj", 13 | ], 14 | "gate_up_proj": [ 15 | "gate_up_proj", 16 | ], 17 | } 18 | -------------------------------------------------------------------------------- /vllm/model_executor/utils.py: -------------------------------------------------------------------------------- 1 | """Utils for model executor.""" 2 | from typing import Any, Dict, Optional 3 | 4 | import torch 5 | 6 | from vllm.utils import seed_everything 7 | 8 | 9 | def set_random_seed(seed: int) -> None: 10 | seed_everything(seed) 11 | 12 | 13 | def set_weight_attrs( 14 | weight: torch.Tensor, 15 
| weight_attrs: Optional[Dict[str, Any]], 16 | ): 17 | """Set attributes on a weight tensor. 18 | 19 | This method is used to set attributes on a weight tensor. This method 20 | will not overwrite existing attributes. 21 | 22 | Args: 23 | weight: The weight tensor. 24 | weight_attrs: A dictionary of attributes to set on the weight tensor. 25 | """ 26 | if weight_attrs is None: 27 | return 28 | for key, value in weight_attrs.items(): 29 | assert not hasattr( 30 | weight, key), (f"Overwriting existing tensor attribute: {key}") 31 | setattr(weight, key, value) 32 | -------------------------------------------------------------------------------- /vllm/multimodal/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import (BatchedTensorInputs, MultiModalDataBuiltins, 2 | MultiModalDataDict, MultiModalInputs, MultiModalPlugin, 3 | NestedTensors) 4 | from .registry import MultiModalRegistry 5 | 6 | MULTIMODAL_REGISTRY = MultiModalRegistry() 7 | """ 8 | The global :class:`~MultiModalRegistry` is used by model runners to 9 | dispatch data processing according to its modality and the target model. 10 | 11 | See also: 12 | :ref:`input_processing_pipeline` 13 | """ 14 | 15 | __all__ = [ 16 | "BatchedTensorInputs", 17 | "MultiModalDataBuiltins", 18 | "MultiModalDataDict", 19 | "MultiModalInputs", 20 | "MultiModalPlugin", 21 | "NestedTensors", 22 | "MULTIMODAL_REGISTRY", 23 | "MultiModalRegistry", 24 | ] 25 | -------------------------------------------------------------------------------- /vllm/multimodal/audio.py: -------------------------------------------------------------------------------- 1 | from vllm.inputs.registry import InputContext 2 | from vllm.multimodal.base import MultiModalInputs, MultiModalPlugin 3 | 4 | 5 | class AudioPlugin(MultiModalPlugin): 6 | """Plugin for audio data.""" 7 | 8 | def get_data_key(self) -> str: 9 | return "audio" 10 | 11 | def _default_input_mapper(self, ctx: InputContext, data: object, 12 | **mm_processor_kwargs) -> MultiModalInputs: 13 | raise NotImplementedError("There is no default audio input mapper") 14 | 15 | def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: 16 | raise NotImplementedError( 17 | "There is no default maximum multimodal tokens") 18 | -------------------------------------------------------------------------------- /vllm/platforms/cpu.py: -------------------------------------------------------------------------------- 1 | import psutil 2 | import torch 3 | 4 | from .interface import Platform, PlatformEnum 5 | 6 | 7 | class CpuPlatform(Platform): 8 | _enum = PlatformEnum.CPU 9 | 10 | @classmethod 11 | def get_device_name(cls, device_id: int = 0) -> str: 12 | return "cpu" 13 | 14 | @classmethod 15 | def get_device_total_memory(cls, device_id: int = 0) -> int: 16 | return psutil.virtual_memory().total 17 | 18 | @classmethod 19 | def inference_mode(cls): 20 | return torch.no_grad() 21 | -------------------------------------------------------------------------------- /vllm/platforms/rocm.py: -------------------------------------------------------------------------------- 1 | import os 2 | from functools import lru_cache 3 | 4 | import torch 5 | 6 | from vllm.logger import init_logger 7 | 8 | from .interface import DeviceCapability, Platform, PlatformEnum 9 | 10 | logger = init_logger(__name__) 11 | 12 | if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", None) in ["fork", None]: 13 | logger.warning("`fork` method is not supported by ROCm. 
" 14 | "VLLM_WORKER_MULTIPROC_METHOD is overridden to" 15 | " `spawn` instead.") 16 | os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" 17 | 18 | 19 | class RocmPlatform(Platform): 20 | _enum = PlatformEnum.ROCM 21 | 22 | @classmethod 23 | @lru_cache(maxsize=8) 24 | def get_device_capability(cls, device_id: int = 0) -> DeviceCapability: 25 | major, minor = torch.cuda.get_device_capability(device_id) 26 | return DeviceCapability(major=major, minor=minor) 27 | 28 | @classmethod 29 | @lru_cache(maxsize=8) 30 | def get_device_name(cls, device_id: int = 0) -> str: 31 | return torch.cuda.get_device_name(device_id) 32 | 33 | @classmethod 34 | def get_device_total_memory(cls, device_id: int = 0) -> int: 35 | device_props = torch.cuda.get_device_properties(device_id) 36 | return device_props.total_memory 37 | -------------------------------------------------------------------------------- /vllm/platforms/tpu.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | 5 | import vllm.envs as envs 6 | from vllm.compilation.levels import CompilationLevel 7 | from vllm.plugins import set_torch_compile_backend 8 | 9 | from .interface import Platform, PlatformEnum 10 | 11 | if "VLLM_TORCH_COMPILE_LEVEL" not in os.environ: 12 | os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.DYNAMO_ONCE) 13 | 14 | assert envs.VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.INDUCTOR,\ 15 | "TPU does not support Inductor." 16 | 17 | set_torch_compile_backend("openxla") 18 | 19 | 20 | class TpuPlatform(Platform): 21 | _enum = PlatformEnum.TPU 22 | 23 | @classmethod 24 | def get_device_name(cls, device_id: int = 0) -> str: 25 | raise NotImplementedError 26 | 27 | @classmethod 28 | def get_device_total_memory(cls, device_id: int = 0) -> int: 29 | raise NotImplementedError 30 | 31 | @classmethod 32 | def inference_mode(cls): 33 | return torch.no_grad() 34 | -------------------------------------------------------------------------------- /vllm/platforms/xpu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .interface import DeviceCapability, Platform, PlatformEnum 4 | 5 | 6 | class XPUPlatform(Platform): 7 | _enum = PlatformEnum.XPU 8 | 9 | @staticmethod 10 | def get_device_capability(device_id: int = 0) -> DeviceCapability: 11 | major, minor, *_ = torch.xpu.get_device_capability( 12 | device_id)['version'].split('.') 13 | return DeviceCapability(major=int(major), minor=int(minor)) 14 | 15 | @staticmethod 16 | def get_device_name(device_id: int = 0) -> str: 17 | return torch.xpu.get_device_name(device_id) 18 | 19 | @classmethod 20 | def get_device_total_memory(cls, device_id: int = 0) -> int: 21 | device_props = torch.xpu.get_device_properties(device_id) 22 | return device_props.total_memory 23 | -------------------------------------------------------------------------------- /vllm/pooling_params.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional 2 | 3 | import msgspec 4 | 5 | 6 | class PoolingParams( 7 | msgspec.Struct, 8 | omit_defaults=True, # type: ignore[call-arg] 9 | array_like=True): # type: ignore[call-arg] 10 | """Pooling parameters for pooling. 11 | 12 | Attributes: 13 | additional_data: Any additional data needed for pooling. 
14 | """ 15 | additional_data: Optional[Any] = None 16 | 17 | def clone(self) -> "PoolingParams": 18 | """Returns a deep copy of the PoolingParams instance.""" 19 | return PoolingParams(additional_data=self.additional_data, ) 20 | 21 | def __repr__(self) -> str: 22 | return (f"PoolingParams(" 23 | f"additional_metadata={self.additional_data})") 24 | -------------------------------------------------------------------------------- /vllm/prompt_adapter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/prompt_adapter/__init__.py -------------------------------------------------------------------------------- /vllm/prompt_adapter/request.py: -------------------------------------------------------------------------------- 1 | import msgspec 2 | 3 | from vllm.adapter_commons.request import AdapterRequest 4 | 5 | 6 | class PromptAdapterRequest( 7 | msgspec.Struct, 8 | array_like=True, # type: ignore[call-arg] 9 | omit_defaults=True, # type: ignore[call-arg] 10 | frozen=True): # type: ignore[call-arg] 11 | """ 12 | Request for a Prompt adapter. 13 | """ 14 | __metaclass__ = AdapterRequest 15 | 16 | prompt_adapter_name: str 17 | prompt_adapter_id: int 18 | prompt_adapter_local_path: str 19 | prompt_adapter_num_virtual_tokens: int 20 | 21 | def __hash__(self): 22 | return super().__hash__() 23 | 24 | @property 25 | def adapter_id(self): 26 | return self.prompt_adapter_id 27 | 28 | @property 29 | def name(self): 30 | return self.prompt_adapter_name 31 | 32 | @property 33 | def local_path(self): 34 | return self.prompt_adapter_local_path 35 | -------------------------------------------------------------------------------- /vllm/py.typed: -------------------------------------------------------------------------------- 1 | # Marker file for PEP 561. 2 | # The vllm package uses inline types. 
3 | -------------------------------------------------------------------------------- /vllm/scalar_type.py: -------------------------------------------------------------------------------- 1 | from ._core_ext import NanRepr, ScalarType 2 | 3 | # naming generally follows: https://github.com/jax-ml/ml_dtypes 4 | # for floating point types (leading f) the scheme is: 5 | # `float_em[flags]` 6 | # flags: 7 | # - no-flags: means it follows IEEE 754 conventions 8 | # - f: means finite values only (no infinities) 9 | # - n: means nans are supported (non-standard encoding) 10 | # for integer types the scheme is: 11 | # `[u]int[b]` 12 | # - if bias is not present it means its zero 13 | 14 | 15 | class scalar_types: 16 | int4 = ScalarType.int_(4, None) 17 | uint4 = ScalarType.uint(4, None) 18 | int8 = ScalarType.int_(8, None) 19 | uint8 = ScalarType.uint(8, None) 20 | float8_e4m3fn = ScalarType.float_(4, 3, True, 21 | NanRepr.EXTD_RANGE_MAX_MIN.value) 22 | float8_e5m2 = ScalarType.float_IEEE754(5, 2) 23 | float16_e8m7 = ScalarType.float_IEEE754(8, 7) 24 | float16_e5m10 = ScalarType.float_IEEE754(5, 10) 25 | 26 | # fp6, https://github.com/usyd-fsalab/fp6_llm/tree/main 27 | float6_e3m2f = ScalarType.float_(3, 2, True, NanRepr.NONE.value) 28 | 29 | # "gptq" types 30 | uint4b8 = ScalarType.uint(4, 8) 31 | uint8b128 = ScalarType.uint(8, 128) 32 | 33 | # colloquial names 34 | bfloat16 = float16_e8m7 35 | float16 = float16_e5m10 36 | -------------------------------------------------------------------------------- /vllm/spec_decode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/spec_decode/__init__.py -------------------------------------------------------------------------------- /vllm/transformers_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.envs import VLLM_USE_MODELSCOPE 2 | 3 | if VLLM_USE_MODELSCOPE: 4 | # Patch here, before each import happens 5 | import modelscope 6 | from packaging import version 7 | 8 | # patch_hub begins from modelscope>=1.18.1 9 | if version.parse(modelscope.__version__) <= version.parse('1.18.0'): 10 | raise ImportError( 11 | 'Using vLLM with ModelScope needs modelscope>=1.18.1, please ' 12 | 'install by `pip install modelscope>=1.18.1`') 13 | 14 | from modelscope.utils.hf_util import patch_hub 15 | 16 | # Patch hub to download models from modelscope to speed up. 
17 | patch_hub() 18 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/mllama.py: -------------------------------------------------------------------------------- 1 | from transformers.models.mllama import configuration_mllama as mllama_hf_config 2 | 3 | 4 | class MllamaTextConfig(mllama_hf_config.MllamaTextConfig): 5 | ''' 6 | Use this class to override is_encoder_decoder: 7 | - transformers regards mllama as is_encoder_decoder=False 8 | - vllm needs is_encoder_decoder=True to enable cross-attention 9 | ''' 10 | 11 | def __init__( 12 | self, 13 | **kwargs, 14 | ): 15 | super().__init__(**kwargs) 16 | self.is_encoder_decoder = True 17 | 18 | 19 | class MllamaConfig(mllama_hf_config.MllamaConfig): 20 | 21 | def __init__( 22 | self, 23 | text_config=None, 24 | **kwargs, 25 | ): 26 | if isinstance(text_config, dict): 27 | text_config = MllamaTextConfig(**text_config) 28 | super().__init__(text_config=text_config, **kwargs) 29 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/nvlm_d.py: -------------------------------------------------------------------------------- 1 | # Adapted from 2 | # https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py 3 | # -------------------------------------------------------- 4 | # NVLM-D 5 | # Copyright (c) 2024 NVIDIA 6 | # Licensed under Apache 2.0 License [see LICENSE for details] 7 | # -------------------------------------------------------- 8 | from .internvl import InternVLChatConfig 9 | 10 | 11 | class NVLM_D_Config(InternVLChatConfig): 12 | model_type = 'NVLM_D' 13 | -------------------------------------------------------------------------------- /vllm/transformers_utils/tokenizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .mistral import MistralTokenizer 2 | 3 | __all__ = ["MistralTokenizer"] 4 | -------------------------------------------------------------------------------- /vllm/transformers_utils/utils.py: -------------------------------------------------------------------------------- 1 | from os import PathLike 2 | from pathlib import Path 3 | from typing import Union 4 | 5 | 6 | def check_gguf_file(model: Union[str, PathLike]) -> bool: 7 | """Check if the file is a GGUF model.""" 8 | model = Path(model) 9 | if not model.is_file(): 10 | return False 11 | elif model.suffix == ".gguf": 12 | return True 13 | 14 | with open(model, "rb") as f: 15 | header = f.read(4) 16 | return header == b"GGUF" 17 | -------------------------------------------------------------------------------- /vllm/triton_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.triton_utils.importing import HAS_TRITON 2 | 3 | __all__ = ["HAS_TRITON"] 4 | 5 | if HAS_TRITON: 6 | 7 | from vllm.triton_utils.custom_cache_manager import ( 8 | maybe_set_triton_cache_manager) 9 | from vllm.triton_utils.libentry import libentry 10 | 11 | __all__ += ["maybe_set_triton_cache_manager", "libentry"] 12 | -------------------------------------------------------------------------------- /vllm/triton_utils/importing.py: -------------------------------------------------------------------------------- 1 | from importlib.util import find_spec 2 | 3 | from vllm.logger import init_logger 4 | 5 | logger = init_logger(__name__) 6 | 7 | HAS_TRITON = find_spec("triton") is not None 8 | 9 | if not HAS_TRITON: 10 | logger.info("Triton not 
installed; certain GPU-related functions" 11 | " will not be available.") 12 | -------------------------------------------------------------------------------- /vllm/usage/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/usage/__init__.py -------------------------------------------------------------------------------- /vllm/version.py: -------------------------------------------------------------------------------- 1 | try: 2 | from ._version import __version__, __version_tuple__ 3 | except Exception as e: 4 | import warnings 5 | 6 | warnings.warn(f"Failed to read commit hash:\n{e}", 7 | RuntimeWarning, 8 | stacklevel=2) 9 | 10 | __version__ = "dev" 11 | __version_tuple__ = (0, 0, __version__) 12 | -------------------------------------------------------------------------------- /vllm/vllm_flash_attn/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/vllm_flash_attn/.gitkeep -------------------------------------------------------------------------------- /vllm/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/worker/__init__.py --------------------------------------------------------------------------------
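As a closing note on the naming scheme documented in scalar_type.py: the trailing "b8" in uint4b8 is a bias, so a stored 4-bit code q stands for the logical value q - 8, which is how GPTQ-style weights are centred around zero. A small sketch of that arithmetic; the scale is a made-up example, real kernels apply per-group scales looked up from the checkpoint:

# The `b8` in scalar_types.uint4b8: a stored code q denotes q - 8.
GPTQ_BIAS = 8

def dequantize_code(q: int, scale: float, bias: int = GPTQ_BIAS) -> float:
    """Map an unsigned stored code to its real value: (q - bias) * scale."""
    return (q - bias) * scale

assert dequantize_code(0, 1.0) == -8.0   # smallest representable value
assert dequantize_code(15, 1.0) == 7.0   # largest representable value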