├── .buildkite ├── check-wheel-size.py ├── lm-eval-harness │ ├── configs │ │ ├── DeepSeek-V2-Lite-Chat.yaml │ │ ├── Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml │ │ ├── Meta-Llama-3-70B-Instruct.yaml │ │ ├── Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml │ │ ├── Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct-FP8.yaml │ │ ├── Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml │ │ ├── Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct.yaml │ │ ├── Meta-Llama-3-8B-QQQ.yaml │ │ ├── Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml │ │ ├── Minitron-4B-Base-FP8.yaml │ │ ├── Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml │ │ ├── Mixtral-8x7B-Instruct-v0.1-FP8.yaml │ │ ├── Mixtral-8x7B-Instruct-v0.1.yaml │ │ ├── Qwen2-1.5B-Instruct-FP8W8.yaml │ │ ├── Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml │ │ ├── Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml │ │ ├── Qwen2-57B-A14-Instruct.yaml │ │ ├── models-large.txt │ │ └── models-small.txt │ ├── run-lm-eval-gsm-hf-baseline.sh │ ├── run-lm-eval-gsm-vllm-baseline.sh │ ├── run-tests.sh │ └── test_lm_eval_correctness.py ├── nightly-benchmarks │ ├── README.md │ ├── benchmark-pipeline.yaml │ ├── nightly-annotation.md │ ├── nightly-descriptions.md │ ├── nightly-pipeline.yaml │ ├── performance-benchmarks-descriptions.md │ ├── scripts │ │ ├── convert-results-json-to-markdown.py │ │ ├── download-tokenizer.py │ │ ├── generate-nightly-markdown.py │ │ ├── get-lmdeploy-modelname.py │ │ ├── launch-server.sh │ │ ├── nightly-annotate.sh │ │ ├── run-nightly-benchmarks.sh │ │ ├── run-performance-benchmarks.sh │ │ ├── summary-nightly-results.py │ │ └── wait-for-image.sh │ └── tests │ │ ├── latency-tests.json │ │ ├── nightly-tests.json │ │ ├── serving-tests.json │ │ └── throughput-tests.json ├── release-pipeline.yaml ├── run-amd-test.sh ├── run-benchmarks.sh ├── run-cpu-test-ppc64le.sh ├── run-cpu-test.sh ├── run-hpu-test.sh ├── run-multi-node-test.sh ├── run-neuron-test.sh ├── run-openvino-test.sh ├── run-tpu-test.sh ├── run-xpu-test.sh ├── test-pipeline.yaml └── upload-wheels.sh ├── .clang-format ├── .dockerignore ├── .github ├── CODEOWNERS ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── 100-documentation.yml │ ├── 200-installation.yml │ ├── 300-usage.yml │ ├── 400-bug report.yml │ ├── 500-feature request.yml │ ├── 600-new model.yml │ ├── 700-performance discussion.yml │ ├── 750-RFC.yml │ ├── 800-misc discussion.yml │ └── config.yml ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml ├── mergify.yml ├── scripts │ └── cleanup_pr_body.sh └── workflows │ ├── actionlint.yml │ ├── add_label_automerge.yml │ ├── clang-format.yml │ ├── cleanup_pr_body.yml │ ├── codespell.yml │ ├── matchers │ ├── actionlint.json │ ├── mypy.json │ └── ruff.json │ ├── mypy.yaml │ ├── png-lint.yml │ ├── publish.yml │ ├── reminder_comment.yml │ ├── ruff.yml │ ├── scripts │ ├── build.sh │ ├── create_release.js │ ├── cuda-install.sh │ ├── env.sh │ └── pytorch-install.sh │ ├── shellcheck.yml │ ├── sphinx-lint.yml │ ├── stale.yml │ └── yapf.yml ├── .gitignore ├── .readthedocs.yaml ├── .shellcheckrc ├── .yapfignore ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── DCO ├── Dockerfile ├── Dockerfile.arm ├── Dockerfile.cpu ├── Dockerfile.hpu ├── Dockerfile.neuron ├── Dockerfile.openvino ├── Dockerfile.ppc64le ├── Dockerfile.rocm ├── Dockerfile.tpu ├── Dockerfile.xpu ├── 
LICENSE ├── MANIFEST.in ├── README.md ├── SECURITY.md ├── benchmarks ├── README.md ├── backend_request_func.py ├── benchmark_latency.py ├── benchmark_prefix_caching.py ├── benchmark_prioritization.py ├── benchmark_serving.py ├── benchmark_throughput.py ├── cutlass_benchmarks │ ├── w8a8_benchmarks.py │ └── weight_shapes.py ├── disagg_benchmarks │ ├── disagg_overhead_benchmark.sh │ ├── disagg_performance_benchmark.sh │ ├── disagg_prefill_proxy_server.py │ ├── round_robin_proxy.py │ └── visualize_benchmark_results.py ├── kernels │ ├── benchmark_aqlm.py │ ├── benchmark_layernorm.py │ ├── benchmark_machete.py │ ├── benchmark_marlin.py │ ├── benchmark_moe.py │ ├── benchmark_paged_attention.py │ ├── benchmark_quant.py │ ├── benchmark_rope.py │ ├── benchmark_shapes.py │ ├── graph_machete_bench.py │ ├── requirements.txt │ └── weight_shapes.py ├── launch_tgi_server.sh ├── overheads │ └── benchmark_hashing.py └── sonnet.txt ├── cmake ├── cpu_extension.cmake ├── hipify.py └── utils.cmake ├── collect_env.py ├── csrc ├── activation_kernels.cu ├── attention │ ├── attention_dtypes.h │ ├── attention_generic.cuh │ ├── attention_kernels.cuh │ ├── attention_utils.cuh │ ├── dtype_bfloat16.cuh │ ├── dtype_float16.cuh │ ├── dtype_float32.cuh │ ├── dtype_fp8.cuh │ ├── paged_attention_v1.cu │ └── paged_attention_v2.cu ├── cache.h ├── cache_kernels.cu ├── core │ ├── exception.hpp │ ├── registration.h │ └── scalar_type.hpp ├── cpu │ ├── activation.cpp │ ├── attention.cpp │ ├── cache.cpp │ ├── cpu_types.hpp │ ├── cpu_types_arm.hpp │ ├── cpu_types_vsx.hpp │ ├── cpu_types_x86.hpp │ ├── dnnl_helper.hpp │ ├── layernorm.cpp │ ├── pos_encoding.cpp │ ├── quant.cpp │ ├── torch_bindings.cpp │ └── utils.cpp ├── cuda_compat.h ├── cuda_utils.h ├── cuda_utils_kernels.cu ├── custom_all_reduce.cu ├── custom_all_reduce.cuh ├── custom_all_reduce_test.cu ├── cutlass_extensions │ ├── cute_utils.cuh │ ├── epilogue │ │ ├── broadcast_load_epilogue_c2x.hpp │ │ ├── broadcast_load_epilogue_c3x.hpp │ │ ├── scaled_mm_epilogues_c2x.hpp │ │ └── scaled_mm_epilogues_c3x.hpp │ ├── torch_utils.hpp │ ├── vllm_collective_builder.cuh │ ├── vllm_custom_types.cuh │ ├── vllm_cutlass_library_extension.py │ ├── vllm_numeric_conversion.cuh │ └── vllm_type_utils.cuh ├── dispatch_utils.h ├── layernorm_kernels.cu ├── layernorm_quant_kernels.cu ├── mamba │ ├── causal_conv1d │ │ ├── causal_conv1d.cu │ │ ├── causal_conv1d.h │ │ └── static_switch.h │ └── mamba_ssm │ │ ├── selective_scan.h │ │ ├── selective_scan_fwd.cu │ │ └── static_switch.h ├── moe │ ├── marlin_kernels │ │ ├── marlin_moe_kernel.h │ │ ├── marlin_moe_kernel_ku4.cu │ │ ├── marlin_moe_kernel_ku4.h │ │ ├── marlin_moe_kernel_ku4b8.cu │ │ ├── marlin_moe_kernel_ku4b8.h │ │ ├── marlin_moe_kernel_ku8b128.cu │ │ └── marlin_moe_kernel_ku8b128.h │ ├── marlin_moe_ops.cu │ ├── moe_align_sum_kernels.cu │ ├── moe_ops.h │ ├── topk_softmax_kernels.cu │ └── torch_bindings.cpp ├── ops.h ├── permute_cols.cu ├── pos_encoding_kernels.cu ├── prepare_inputs │ ├── advance_step.cu │ └── advance_step.cuh ├── quantization │ ├── aqlm │ │ └── gemm_kernels.cu │ ├── awq │ │ ├── dequantize.cuh │ │ └── gemm_kernels.cu │ ├── compressed_tensors │ │ └── int8_quant_kernels.cu │ ├── cutlass_w8a8 │ │ ├── Epilogues.md │ │ ├── common.hpp │ │ ├── scaled_mm_c2x.cu │ │ ├── scaled_mm_c2x.cuh │ │ ├── scaled_mm_c2x_sm75_dispatch.cuh │ │ ├── scaled_mm_c2x_sm80_dispatch.cuh │ │ ├── scaled_mm_c2x_sm89_fp8_dispatch.cuh │ │ ├── scaled_mm_c2x_sm89_int8_dispatch.cuh │ │ ├── scaled_mm_c3x.cu │ │ └── scaled_mm_entry.cu │ ├── fp8 │ │ ├── amd │ │ │ ├── 
hip_float8.h │ │ │ ├── hip_float8_impl.h │ │ │ └── quant_utils.cuh │ │ ├── common.cu │ │ ├── common.cuh │ │ ├── fp8_marlin.cu │ │ └── nvidia │ │ │ └── quant_utils.cuh │ ├── gguf │ │ ├── dequantize.cuh │ │ ├── ggml-common.h │ │ ├── gguf_kernel.cu │ │ ├── mmq.cuh │ │ ├── mmvq.cuh │ │ └── vecdotq.cuh │ ├── gptq │ │ ├── compat.cuh │ │ ├── matrix_view.cuh │ │ ├── q_gemm.cu │ │ ├── qdq_2.cuh │ │ ├── qdq_3.cuh │ │ ├── qdq_4.cuh │ │ ├── qdq_8.cuh │ │ └── qdq_util.cuh │ ├── gptq_marlin │ │ ├── awq_marlin_repack.cu │ │ ├── gptq_marlin.cu │ │ ├── gptq_marlin_repack.cu │ │ ├── marlin.cuh │ │ └── marlin_dtypes.cuh │ ├── machete │ │ ├── Readme.md │ │ ├── generate.py │ │ ├── machete_collective_builder.cuh │ │ ├── machete_interleaving_utils.cuh │ │ ├── machete_mainloop.cuh │ │ ├── machete_mm_kernel.cuh │ │ ├── machete_mm_launcher.cuh │ │ ├── machete_prepack_kernel.cuh │ │ ├── machete_prepack_launcher.cuh │ │ ├── machete_prepacked_layout.cuh │ │ └── machete_pytorch.cu │ └── marlin │ │ ├── dense │ │ ├── LICENSE │ │ ├── common │ │ │ ├── base.h │ │ │ └── mem.h │ │ └── marlin_cuda_kernel.cu │ │ ├── qqq │ │ └── marlin_qqq_gemm_kernel.cu │ │ └── sparse │ │ ├── LICENSE │ │ ├── common │ │ ├── base.h │ │ ├── mem.h │ │ └── mma.h │ │ └── marlin_24_cuda_kernel.cu ├── rocm │ ├── attention.cu │ ├── ops.h │ └── torch_bindings.cpp ├── torch_bindings.cpp └── type_convert.cuh ├── docs ├── Makefile ├── README.md ├── make.bat ├── requirements-docs.txt └── source │ ├── _static │ └── custom.js │ ├── _templates │ └── sections │ │ └── header.html │ ├── assets │ ├── design │ │ ├── arch_overview │ │ │ ├── entrypoints.excalidraw.png │ │ │ └── llm_engine.excalidraw.png │ │ └── hierarchy.png │ ├── dev │ │ └── dockerfile-stages-dependency.png │ ├── kernel │ │ ├── k_vecs.png │ │ ├── key.png │ │ ├── logits_vec.png │ │ ├── q_vecs.png │ │ ├── query.png │ │ ├── v_vec.png │ │ └── value.png │ └── logos │ │ ├── vllm-logo-only-light.png │ │ ├── vllm-logo-text-dark.png │ │ └── vllm-logo-text-light.png │ ├── automatic_prefix_caching │ ├── apc.rst │ └── details.md │ ├── community │ ├── meetups.rst │ └── sponsors.md │ ├── conf.py │ ├── contributing │ ├── dockerfile │ │ └── dockerfile.rst │ ├── overview.rst │ └── profiling │ │ └── profiling_index.rst │ ├── design │ ├── arch_overview.rst │ ├── huggingface_integration.rst │ ├── input_processing │ │ ├── input_processing_pipeline.rst │ │ └── model_inputs_index.rst │ ├── kernel │ │ └── paged_attention.rst │ ├── multimodal │ │ ├── adding_multimodal_plugin.rst │ │ └── multimodal_index.rst │ └── plugin_system.rst │ ├── dev │ ├── engine │ │ ├── async_llm_engine.rst │ │ ├── engine_index.rst │ │ └── llm_engine.rst │ ├── offline_inference │ │ ├── llm.rst │ │ ├── llm_inputs.rst │ │ └── offline_index.rst │ ├── pooling_params.rst │ └── sampling_params.rst │ ├── generate_examples.py │ ├── getting_started │ ├── amd-installation.rst │ ├── arm-installation.rst │ ├── cpu-installation.rst │ ├── debugging.rst │ ├── examples │ │ └── examples_index.template.rst │ ├── gaudi-installation.rst │ ├── installation.rst │ ├── neuron-installation.rst │ ├── openvino-installation.rst │ ├── quickstart.rst │ ├── tpu-installation.rst │ └── xpu-installation.rst │ ├── index.rst │ ├── models │ ├── adding_model.rst │ ├── enabling_multimodal_inputs.rst │ ├── engine_args.rst │ ├── lora.rst │ ├── performance.rst │ ├── spec_decode.rst │ ├── structured_outputs.rst │ ├── supported_models.rst │ └── vlm.rst │ ├── performance │ └── benchmarks.rst │ ├── quantization │ ├── auto_awq.rst │ ├── bnb.rst │ ├── fp8.rst │ ├── fp8_e4m3_kvcache.rst │ ├── 
fp8_e5m2_kvcache.rst │ ├── gguf.rst │ ├── int8.rst │ └── supported_hardware.rst │ └── serving │ ├── compatibility_matrix.rst │ ├── deploying_with_bentoml.rst │ ├── deploying_with_cerebrium.rst │ ├── deploying_with_docker.rst │ ├── deploying_with_dstack.rst │ ├── deploying_with_k8s.rst │ ├── deploying_with_kserve.rst │ ├── deploying_with_lws.rst │ ├── deploying_with_nginx.rst │ ├── deploying_with_triton.rst │ ├── distributed_serving.rst │ ├── env_vars.rst │ ├── faq.rst │ ├── integrations.rst │ ├── metrics.rst │ ├── openai_compatible_server.md │ ├── run_on_sky.rst │ ├── serving_with_langchain.rst │ ├── serving_with_llamaindex.rst │ ├── serving_with_llamastack.rst │ ├── tensorizer.rst │ └── usage_stats.md ├── examples ├── api_client.py ├── aqlm_example.py ├── cpu_offload.py ├── disaggregated_prefill.sh ├── florence2_inference.py ├── fp8 │ ├── README.md │ ├── extract_scales.py │ └── quantizer │ │ ├── README.md │ │ └── quantize.py ├── gguf_inference.py ├── gradio_openai_chatbot_webserver.py ├── gradio_webserver.py ├── llm_engine_example.py ├── logging_configuration.md ├── lora_with_quantization_inference.py ├── multilora_inference.py ├── offline_chat_with_tools.py ├── offline_inference.py ├── offline_inference_arctic.py ├── offline_inference_audio_language.py ├── offline_inference_chat.py ├── offline_inference_cli.py ├── offline_inference_distributed.py ├── offline_inference_embedding.py ├── offline_inference_encoder_decoder.py ├── offline_inference_mlpspeculator.py ├── offline_inference_neuron.py ├── offline_inference_neuron_int8_quantization.py ├── offline_inference_openai.md ├── offline_inference_pixtral.py ├── offline_inference_structured_outputs.py ├── offline_inference_tpu.py ├── offline_inference_vision_language.py ├── offline_inference_vision_language_embedding.py ├── offline_inference_vision_language_multi_image.py ├── offline_inference_with_prefix.py ├── offline_inference_with_profiler.py ├── offline_profile.py ├── openai_chat_completion_client.py ├── openai_chat_completion_client_for_multimodal.py ├── openai_chat_completion_client_with_tools.py ├── openai_chat_completion_structured_outputs.py ├── openai_chat_embedding_client_for_multimodal.py ├── openai_completion_client.py ├── openai_cross_encoder_score.py ├── openai_embedding_client.py ├── openai_example_batch.jsonl ├── production_monitoring │ ├── Otel.md │ ├── README.md │ ├── docker-compose.yaml │ ├── dummy_client.py │ ├── grafana.json │ └── prometheus.yaml ├── run_cluster.sh ├── save_sharded_state.py ├── template_alpaca.jinja ├── template_baichuan.jinja ├── template_blip2.jinja ├── template_chatglm.jinja ├── template_chatglm2.jinja ├── template_chatml.jinja ├── template_dse_qwen2_vl.jinja ├── template_falcon.jinja ├── template_falcon_180b.jinja ├── template_inkbot.jinja ├── template_llava.jinja ├── template_vlm2vec.jinja ├── tensorize_vllm_model.py ├── tool_chat_template_granite.jinja ├── tool_chat_template_granite_20b_fc.jinja ├── tool_chat_template_hermes.jinja ├── tool_chat_template_internlm2_tool.jinja ├── tool_chat_template_llama3.1_json.jinja ├── tool_chat_template_llama3.2_json.jinja ├── tool_chat_template_llama3.2_pythonic.jinja ├── tool_chat_template_mistral.jinja ├── tool_chat_template_mistral_parallel.jinja └── tool_chat_template_toolace.jinja ├── find_cuda_init.py ├── format.sh ├── pyproject.toml ├── python_only_dev.py ├── requirements-build.txt ├── requirements-common.txt ├── requirements-cpu.txt ├── requirements-cuda.txt ├── requirements-dev.txt ├── requirements-hpu.txt ├── requirements-lint.txt ├── 
requirements-neuron.txt ├── requirements-openvino.txt ├── requirements-rocm.txt ├── requirements-test.in ├── requirements-test.txt ├── requirements-tpu.txt ├── requirements-xpu.txt ├── setup.py ├── tests ├── __init__.py ├── async_engine │ ├── __init__.py │ ├── api_server_async_engine.py │ ├── test_api_server.py │ ├── test_async_llm_engine.py │ └── test_request_tracker.py ├── basic_correctness │ ├── __init__.py │ ├── test_basic_correctness.py │ ├── test_chunked_prefill.py │ ├── test_cpu_offload.py │ └── test_preemption.py ├── compile │ ├── __init__.py │ ├── backend.py │ ├── piecewise │ │ ├── __init__.py │ │ ├── test_simple.py │ │ └── test_toy_llama.py │ ├── test_basic_correctness.py │ ├── test_full_graph.py │ ├── test_functionalization.py │ ├── test_fusion.py │ ├── test_pass_manager.py │ ├── test_wrapper.py │ └── utils.py ├── conftest.py ├── core │ ├── __init__.py │ ├── block │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── e2e │ │ │ ├── __init__.py │ │ │ ├── conftest.py │ │ │ ├── test_correctness.py │ │ │ └── test_correctness_sliding_window.py │ │ ├── test_block_manager.py │ │ ├── test_block_table.py │ │ ├── test_common.py │ │ ├── test_cpu_gpu_block_allocator.py │ │ ├── test_naive_block.py │ │ └── test_prefix_caching_block.py │ ├── test_chunked_prefill_scheduler.py │ ├── test_num_computed_tokens_update.py │ ├── test_scheduler.py │ ├── test_scheduler_encoder_decoder.py │ ├── test_serialization.py │ └── utils.py ├── data │ └── test_config.yaml ├── distributed │ ├── __init__.py │ ├── test_ca_buffer_sharing.py │ ├── test_comm_ops.py │ ├── test_custom_all_reduce.py │ ├── test_distributed_oot.py │ ├── test_multi_node_assignment.py │ ├── test_pipeline_parallel.py │ ├── test_pipeline_partition.py │ ├── test_pp_cudagraph.py │ ├── test_pynccl.py │ ├── test_same_node.py │ ├── test_shm_broadcast.py │ └── test_utils.py ├── encoder_decoder │ ├── __init__.py │ └── test_e2e_correctness.py ├── engine │ ├── __init__.py │ ├── output_processor │ │ ├── __init__.py │ │ ├── test_multi_step.py │ │ └── test_stop_checker.py │ ├── test_arg_utils.py │ ├── test_computed_prefix_blocks.py │ ├── test_custom_executor.py │ ├── test_detokenization.py │ ├── test_multiproc_workers.py │ ├── test_short_mm_context.py │ ├── test_skip_tokenizer_init.py │ ├── test_stop_reason.py │ └── test_stop_strings.py ├── entrypoints │ ├── __init__.py │ ├── conftest.py │ ├── llm │ │ ├── __init__.py │ │ ├── test_accuracy.py │ │ ├── test_chat.py │ │ ├── test_encode.py │ │ ├── test_generate.py │ │ ├── test_generate_multiple_loras.py │ │ ├── test_guided_generate.py │ │ ├── test_init.py │ │ ├── test_lazy_outlines.py │ │ └── test_prompt_validation.py │ ├── offline_mode │ │ ├── __init__.py │ │ └── test_offline_mode.py │ ├── openai │ │ ├── __init__.py │ │ ├── test_accuracy.py │ │ ├── test_async_tokenization.py │ │ ├── test_audio.py │ │ ├── test_basic.py │ │ ├── test_chat.py │ │ ├── test_chat_echo.py │ │ ├── test_chat_template.py │ │ ├── test_chunked_prompt.py │ │ ├── test_cli_args.py │ │ ├── test_completion.py │ │ ├── test_embedding.py │ │ ├── test_encoder_decoder.py │ │ ├── test_lora_lineage.py │ │ ├── test_metrics.py │ │ ├── test_models.py │ │ ├── test_oot_registration.py │ │ ├── test_prompt_validation.py │ │ ├── test_return_tokens_as_ids.py │ │ ├── test_root_path.py │ │ ├── test_run_batch.py │ │ ├── test_score.py │ │ ├── test_serving_chat.py │ │ ├── test_serving_engine.py │ │ ├── test_shutdown.py │ │ ├── test_tokenization.py │ │ ├── test_video.py │ │ ├── test_vision.py │ │ ├── test_vision_embedding.py │ │ └── tool_parsers │ │ │ ├── __init__.py │ │ │ 
├── test_pythonic_tool_parser.py │ │ │ └── utils.py │ └── test_chat_utils.py ├── fp8_kv │ ├── llama2-70b-fp8-kv │ │ └── kv_cache_scales.json │ └── llama2-7b-fp8-kv │ │ └── kv_cache_scales.json ├── kernels │ ├── __init__.py │ ├── allclose_default.py │ ├── conftest.py │ ├── quant_utils.py │ ├── test_activation.py │ ├── test_aqlm.py │ ├── test_attention.py │ ├── test_attention_selector.py │ ├── test_awq.py │ ├── test_awq_marlin.py │ ├── test_awq_triton.py │ ├── test_blocksparse_attention.py │ ├── test_cache.py │ ├── test_causal_conv1d.py │ ├── test_cutlass.py │ ├── test_encoder_decoder_attn.py │ ├── test_flash_attn.py │ ├── test_flashinfer.py │ ├── test_fp8_quant.py │ ├── test_ggml.py │ ├── test_gguf.py │ ├── test_gptq.py │ ├── test_int8_quant.py │ ├── test_layernorm.py │ ├── test_machete_mm.py │ ├── test_mamba_ssm.py │ ├── test_marlin_gemm.py │ ├── test_moe.py │ ├── test_permute_cols.py │ ├── test_pos_encoding.py │ ├── test_prefix_prefill.py │ ├── test_rotary_embedding.py │ ├── test_triton_scaled_mm.py │ ├── test_utils.py │ └── utils.py ├── kv_transfer │ ├── disagg_test.py │ ├── module_test.py │ ├── test_lookup_buffer.py │ ├── test_lookup_buffer.sh │ ├── test_send_recv.py │ └── test_send_recv.sh ├── lora │ ├── __init__.py │ ├── conftest.py │ ├── data │ │ ├── __init__.py │ │ └── long_context_test_data.py │ ├── test_baichuan.py │ ├── test_chatglm3_tp.py │ ├── test_gemma.py │ ├── test_layers.py │ ├── test_llama_tp.py │ ├── test_long_context.py │ ├── test_lora_bias_e2e.py │ ├── test_lora_checkpoints.py │ ├── test_lora_huggingface.py │ ├── test_lora_manager.py │ ├── test_minicpmv.py │ ├── test_minicpmv_tp.py │ ├── test_mixtral.py │ ├── test_phi.py │ ├── test_punica_sizes.py │ ├── test_punica_variation.py │ ├── test_quant_model.py │ ├── test_tokenizer_group.py │ ├── test_utils.py │ ├── test_worker.py │ └── utils.py ├── metrics │ ├── __init__.py │ └── test_metrics.py ├── model_executor │ ├── __init__.py │ ├── conftest.py │ ├── test_enabled_custom_ops.py │ ├── test_guided_processors.py │ ├── test_model_load_with_params.py │ └── weight_utils.py ├── models │ ├── __init__.py │ ├── decoder_only │ │ ├── __init__.py │ │ ├── audio_language │ │ │ ├── __init__.py │ │ │ └── test_ultravox.py │ │ ├── language │ │ │ ├── __init__.py │ │ │ ├── test_aqlm.py │ │ │ ├── test_fp8.py │ │ │ ├── test_gguf.py │ │ │ ├── test_gptq_marlin.py │ │ │ ├── test_gptq_marlin_24.py │ │ │ ├── test_granite.py │ │ │ ├── test_jamba.py │ │ │ ├── test_mamba.py │ │ │ ├── test_mistral.py │ │ │ ├── test_modelopt.py │ │ │ ├── test_models.py │ │ │ └── test_phimoe.py │ │ └── vision_language │ │ │ ├── __init__.py │ │ │ ├── mm_processor_kwargs │ │ │ ├── __init__.py │ │ │ ├── test_idefics3.py │ │ │ ├── test_internvl.py │ │ │ ├── test_llava_next.py │ │ │ ├── test_phi3v.py │ │ │ ├── test_qwen.py │ │ │ └── test_qwen2_vl.py │ │ │ ├── test_awq.py │ │ │ ├── test_h2ovl.py │ │ │ ├── test_intern_vit.py │ │ │ ├── test_models.py │ │ │ ├── test_phi3v.py │ │ │ ├── test_pixtral.py │ │ │ ├── test_qwen2_vl.py │ │ │ └── vlm_utils │ │ │ ├── __init__.py │ │ │ ├── builders.py │ │ │ ├── case_filtering.py │ │ │ ├── core.py │ │ │ ├── custom_inputs.py │ │ │ ├── model_utils.py │ │ │ ├── runners.py │ │ │ └── types.py │ ├── embedding │ │ ├── __init__.py │ │ ├── language │ │ │ ├── __init__.py │ │ │ ├── test_cls_models.py │ │ │ ├── test_embedding.py │ │ │ └── test_scoring.py │ │ ├── utils.py │ │ └── vision_language │ │ │ ├── __init__.py │ │ │ ├── test_dse_qwen2_vl.py │ │ │ ├── test_llava_next.py │ │ │ └── test_phi3v.py │ ├── encoder_decoder │ │ ├── __init__.py │ │ ├── language 
│ │ │ ├── __init__.py │ │ │ └── test_bart.py │ │ └── vision_language │ │ │ ├── __init__.py │ │ │ ├── test_broadcast.py │ │ │ ├── test_florence2.py │ │ │ └── test_mllama.py │ ├── fixtures │ │ ├── pixtral_chat.json │ │ └── pixtral_chat_engine.json │ ├── registry.py │ ├── test_initialization.py │ ├── test_oot_registration.py │ ├── test_registry.py │ └── utils.py ├── mq_llm_engine │ ├── __init__.py │ ├── test_abort.py │ ├── test_error_handling.py │ ├── test_load.py │ └── utils.py ├── multi_step │ ├── __init__.py │ ├── test_correctness_async_llm.py │ └── test_correctness_llm.py ├── multimodal │ ├── __init__.py │ ├── test_inputs.py │ ├── test_mapper.py │ ├── test_processing.py │ ├── test_processor_kwargs.py │ └── test_utils.py ├── plugins │ └── vllm_add_dummy_model │ │ ├── setup.py │ │ └── vllm_add_dummy_model │ │ ├── __init__.py │ │ ├── my_gemma_embedding.py │ │ ├── my_llava.py │ │ └── my_opt.py ├── prefix_caching │ ├── __init__.py │ ├── test_disable_sliding_window.py │ └── test_prefix_caching.py ├── prompt_adapter │ ├── test_bloom.py │ ├── test_multi_adapter_inference.py │ └── test_pa_lora.py ├── prompts │ ├── example.txt │ └── summary.txt ├── quantization │ ├── __init__.py │ ├── test_bitsandbytes.py │ ├── test_compressed_tensors.py │ ├── test_configs.py │ ├── test_cpu_offload.py │ ├── test_experts_int8.py │ ├── test_fp8.py │ ├── test_ipex_quant.py │ ├── test_lm_head.py │ └── utils.py ├── samplers │ ├── __init__.py │ ├── test_beam_search.py │ ├── test_ignore_eos.py │ ├── test_logits_processor.py │ ├── test_logprobs.py │ ├── test_no_bad_words.py │ ├── test_ranks.py │ ├── test_rejection_sampler.py │ ├── test_sampler.py │ ├── test_seeded_generate.py │ └── test_typical_acceptance_sampler.py ├── spec_decode │ ├── __init__.py │ ├── e2e │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── test_compatibility.py │ │ ├── test_eagle_correctness.py │ │ ├── test_integration.py │ │ ├── test_integration_dist_tp2.py │ │ ├── test_integration_dist_tp4.py │ │ ├── test_logprobs.py │ │ ├── test_medusa_correctness.py │ │ ├── test_mlp_correctness.py │ │ ├── test_multistep_correctness.py │ │ ├── test_ngram_correctness.py │ │ └── test_seed.py │ ├── test_batch_expansion.py │ ├── test_dynamic_spec_decode.py │ ├── test_metrics.py │ ├── test_multi_step_worker.py │ ├── test_ngram_worker.py │ ├── test_scorer.py │ ├── test_spec_decode_worker.py │ ├── test_utils.py │ └── utils.py ├── tensorizer_loader │ ├── __init__.py │ ├── conftest.py │ └── test_tensorizer.py ├── test_cache_block_hashing.py ├── test_config.py ├── test_embedded_commit.py ├── test_inputs.py ├── test_lazy_torch_compile.py ├── test_logger.py ├── test_logits_processor.py ├── test_regression.py ├── test_sampling_params.py ├── test_scalartype.py ├── test_sequence.py ├── test_sharded_state_loader.py ├── test_utils.py ├── tokenization │ ├── __init__.py │ ├── test_cached_tokenizer.py │ ├── test_detokenize.py │ ├── test_get_eos.py │ ├── test_tokenizer.py │ └── test_tokenizer_group.py ├── tool_use │ ├── __init__.py │ ├── conftest.py │ ├── test_chat_completion_request_validations.py │ ├── test_chat_completions.py │ ├── test_jamba_tool_parser.py │ ├── test_parallel_tool_calls.py │ ├── test_tool_calls.py │ └── utils.py ├── tpu │ ├── __init__.py │ ├── test_compilation.py │ └── test_custom_dispatcher.py ├── tracing │ ├── __init__.py │ └── test_tracing.py ├── utils.py ├── v1 │ ├── __init__.py │ ├── core │ │ └── test_prefix_caching.py │ └── engine │ │ ├── __init__.py │ │ ├── test_async_llm.py │ │ ├── test_detokenizer.py │ │ ├── test_engine_args.py │ │ ├── test_engine_core.py │ │ 
└── test_engine_core_client.py ├── vllm_test_utils │ ├── setup.py │ └── vllm_test_utils │ │ ├── __init__.py │ │ └── blame.py ├── weight_loading │ ├── models-large.txt │ ├── models.txt │ ├── run_model_weight_loading_test.sh │ └── test_weight_loading.py └── worker │ ├── __init__.py │ ├── test_encoder_decoder_model_runner.py │ ├── test_model_input.py │ ├── test_model_runner.py │ ├── test_profile.py │ └── test_swap.py ├── tools ├── actionlint.sh ├── check_repo.sh ├── mypy.sh ├── png-lint.sh ├── profiler │ ├── print_layerwise_table.py │ └── visualize_layerwise_profile.py ├── report_build_time_ninja.py ├── shellcheck.sh └── sphinx-lint.sh ├── use_existing_torch.py └── vllm ├── __init__.py ├── _custom_ops.py ├── _ipex_ops.py ├── adapter_commons ├── __init__.py ├── layers.py ├── models.py ├── request.py ├── utils.py └── worker_manager.py ├── assets ├── __init__.py ├── audio.py ├── base.py ├── image.py └── video.py ├── attention ├── __init__.py ├── backends │ ├── __init__.py │ ├── abstract.py │ ├── blocksparse_attn.py │ ├── flash_attn.py │ ├── flashinfer.py │ ├── hpu_attn.py │ ├── ipex_attn.py │ ├── openvino.py │ ├── pallas.py │ ├── placeholder_attn.py │ ├── rocm_flash_attn.py │ ├── torch_sdpa.py │ ├── utils.py │ └── xformers.py ├── layer.py ├── ops │ ├── __init__.py │ ├── blocksparse_attention │ │ ├── __init__.py │ │ ├── blocksparse_attention_kernel.py │ │ ├── interface.py │ │ └── utils.py │ ├── hpu_paged_attn.py │ ├── ipex_attn.py │ ├── paged_attn.py │ ├── prefix_prefill.py │ └── triton_flash_attention.py └── selector.py ├── beam_search.py ├── block.py ├── compilation ├── __init__.py ├── backends.py ├── compile_context.py ├── counter.py ├── decorators.py ├── fix_functionalization.py ├── fusion.py ├── inductor_pass.py ├── pass_manager.py ├── reshapes.py ├── vllm_inductor_pass.py └── wrapper.py ├── config.py ├── connections.py ├── core ├── __init__.py ├── block │ ├── __init__.py │ ├── block_table.py │ ├── common.py │ ├── cpu_gpu_block_allocator.py │ ├── interfaces.py │ ├── naive_block.py │ ├── prefix_caching_block.py │ └── utils.py ├── block_manager.py ├── evictor.py ├── interfaces.py ├── placeholder_block_space_manager.py └── scheduler.py ├── distributed ├── __init__.py ├── communication_op.py ├── device_communicators │ ├── __init__.py │ ├── cuda_wrapper.py │ ├── custom_all_reduce.py │ ├── custom_all_reduce_utils.py │ ├── hpu_communicator.py │ ├── pynccl.py │ ├── pynccl_wrapper.py │ ├── shm_broadcast.py │ ├── tpu_communicator.py │ └── xpu_communicator.py ├── kv_transfer │ ├── README.md │ ├── __init__.py │ ├── disagg_prefill_workflow.jpg │ ├── kv_connector │ │ ├── __init__.py │ │ ├── base.py │ │ ├── factory.py │ │ └── simple_connector.py │ ├── kv_lookup_buffer │ │ ├── __init__.py │ │ ├── base.py │ │ └── simple_buffer.py │ ├── kv_pipe │ │ ├── __init__.py │ │ ├── base.py │ │ └── pynccl_pipe.py │ └── kv_transfer_agent.py ├── parallel_state.py └── utils.py ├── engine ├── __init__.py ├── arg_utils.py ├── async_llm_engine.py ├── async_timeout.py ├── llm_engine.py ├── metrics.py ├── metrics_types.py ├── multiprocessing │ ├── __init__.py │ ├── client.py │ └── engine.py ├── output_processor │ ├── __init__.py │ ├── interfaces.py │ ├── multi_step.py │ ├── single_step.py │ ├── stop_checker.py │ └── util.py └── protocol.py ├── entrypoints ├── __init__.py ├── api_server.py ├── chat_utils.py ├── launcher.py ├── llm.py ├── logger.py └── openai │ ├── __init__.py │ ├── api_server.py │ ├── cli_args.py │ ├── logits_processors.py │ ├── protocol.py │ ├── run_batch.py │ ├── serving_chat.py │ ├── serving_completion.py │ 
├── serving_embedding.py │ ├── serving_engine.py │ ├── serving_score.py │ ├── serving_tokenization.py │ └── tool_parsers │ ├── __init__.py │ ├── abstract_tool_parser.py │ ├── granite_20b_fc_tool_parser.py │ ├── granite_tool_parser.py │ ├── hermes_tool_parser.py │ ├── internlm2_tool_parser.py │ ├── jamba_tool_parser.py │ ├── llama_tool_parser.py │ ├── mistral_tool_parser.py │ ├── pythonic_tool_parser.py │ └── utils.py ├── envs.py ├── executor ├── __init__.py ├── cpu_executor.py ├── distributed_gpu_executor.py ├── executor_base.py ├── gpu_executor.py ├── hpu_executor.py ├── msgspec_utils.py ├── multiproc_gpu_executor.py ├── multiproc_worker_utils.py ├── multiproc_xpu_executor.py ├── neuron_executor.py ├── openvino_executor.py ├── ray_gpu_executor.py ├── ray_hpu_executor.py ├── ray_tpu_executor.py ├── ray_utils.py ├── ray_xpu_executor.py ├── tpu_executor.py └── xpu_executor.py ├── forward_context.py ├── inputs ├── __init__.py ├── data.py ├── parse.py ├── preprocess.py └── registry.py ├── logger.py ├── logging_utils ├── __init__.py └── formatter.py ├── logits_process.py ├── lora ├── __init__.py ├── fully_sharded_layers.py ├── layers.py ├── lora.py ├── models.py ├── ops │ ├── __init__.py │ ├── bgmv_expand.py │ ├── bgmv_expand_slice.py │ ├── bgmv_shrink.py │ ├── sgmv_expand.py │ ├── sgmv_expand_slice.py │ ├── sgmv_shrink.py │ └── utils.py ├── punica.py ├── request.py ├── utils.py └── worker_manager.py ├── model_executor ├── __init__.py ├── custom_op.py ├── guided_decoding │ ├── __init__.py │ ├── guided_fields.py │ ├── lm_format_enforcer_decoding.py │ ├── outlines_decoding.py │ └── outlines_logits_processors.py ├── layers │ ├── __init__.py │ ├── activation.py │ ├── fused_moe │ │ ├── __init__.py │ │ ├── configs │ │ │ ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json │ │ │ ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json │ │ │ ├── E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── 
E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json │ │ │ ├── E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=14336,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=1792,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=3584,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_L40S.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=7168,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ └── README │ │ ├── fused_marlin_moe.py │ │ ├── fused_moe.py │ │ ├── layer.py │ │ └── moe_pallas.py │ ├── layernorm.py │ ├── linear.py │ ├── logits_processor.py │ ├── mamba │ │ ├── __init__.py │ │ ├── mamba_mixer.py │ │ └── ops │ │ │ ├── __init__.py │ │ │ ├── causal_conv1d.py │ │ │ └── mamba_ssm.py │ ├── pooler.py │ ├── quantization │ │ ├── __init__.py │ │ ├── aqlm.py │ │ ├── awq.py │ │ ├── awq_marlin.py │ │ ├── awq_triton.py │ │ ├── base_config.py │ │ ├── bitsandbytes.py │ │ ├── compressed_tensors │ │ │ ├── __init__.py │ │ │ ├── compressed_tensors.py │ │ │ ├── compressed_tensors_moe.py │ │ │ ├── schemes │ │ │ │ ├── __init__.py │ │ │ │ ├── compressed_tensors_scheme.py │ │ │ │ ├── compressed_tensors_w4a16_24.py │ │ │ │ ├── compressed_tensors_w8a16_fp8.py │ │ │ │ ├── compressed_tensors_w8a8_fp8.py │ │ │ │ ├── compressed_tensors_w8a8_int8.py │ │ │ │ └── compressed_tensors_wNa16.py │ │ │ ├── triton_scaled_mm.py │ │ │ └── utils.py │ │ ├── deepspeedfp.py │ │ ├── experts_int8.py │ │ ├── fbgemm_fp8.py │ │ ├── fp8.py │ │ ├── gguf.py │ │ ├── gptq.py │ │ ├── gptq_marlin.py │ │ ├── gptq_marlin_24.py │ │ ├── hqq_marlin.py │ │ ├── ipex_quant.py │ │ ├── kernels │ │ │ ├── MPLinearKernel.py │ │ │ ├── __init__.py │ │ │ ├── exllama.py │ │ │ ├── machete.py │ │ │ └── marlin.py │ │ ├── kv_cache.py │ │ ├── marlin.py │ │ ├── modelopt.py │ │ ├── neuron_quant.py │ │ ├── qqq.py │ │ ├── schema.py │ │ ├── tpu_int8.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── layer_utils.py │ │ │ ├── machete_utils.py │ │ │ ├── marlin_utils.py │ │ │ ├── marlin_utils_fp8.py │ │ │ ├── 
marlin_utils_test.py │ │ │ ├── marlin_utils_test_24.py │ │ │ ├── marlin_utils_test_qqq.py │ │ │ ├── quant_utils.py │ │ │ └── w8a8_utils.py │ ├── rejection_sampler.py │ ├── resampler.py │ ├── rotary_embedding.py │ ├── sampler.py │ ├── spec_decode_base_sampler.py │ ├── typical_acceptance_sampler.py │ └── vocab_parallel_embedding.py ├── model_loader │ ├── __init__.py │ ├── loader.py │ ├── neuron.py │ ├── openvino.py │ ├── tensorizer.py │ ├── utils.py │ └── weight_utils.py ├── models │ ├── __init__.py │ ├── adapters.py │ ├── arctic.py │ ├── aria.py │ ├── baichuan.py │ ├── bart.py │ ├── bert.py │ ├── blip.py │ ├── blip2.py │ ├── bloom.py │ ├── chameleon.py │ ├── chatglm.py │ ├── clip.py │ ├── commandr.py │ ├── dbrx.py │ ├── decilm.py │ ├── deepseek.py │ ├── deepseek_v2.py │ ├── eagle.py │ ├── exaone.py │ ├── falcon.py │ ├── florence2.py │ ├── fuyu.py │ ├── gemma.py │ ├── gemma2.py │ ├── glm.py │ ├── glm4_vision_encoder.py │ ├── gpt2.py │ ├── gpt_bigcode.py │ ├── gpt_j.py │ ├── gpt_neox.py │ ├── granite.py │ ├── granitemoe.py │ ├── h2ovl.py │ ├── idefics2_vision_model.py │ ├── idefics3.py │ ├── interfaces.py │ ├── interfaces_base.py │ ├── intern_vit.py │ ├── internlm2.py │ ├── internlm2_ve.py │ ├── internvl.py │ ├── jais.py │ ├── jamba.py │ ├── llama.py │ ├── llava.py │ ├── llava_next.py │ ├── llava_next_video.py │ ├── llava_onevision.py │ ├── mamba.py │ ├── mamba_cache.py │ ├── medusa.py │ ├── minicpm.py │ ├── minicpm3.py │ ├── minicpmv.py │ ├── mixtral.py │ ├── mixtral_quant.py │ ├── mllama.py │ ├── mlp_speculator.py │ ├── module_mapping.py │ ├── molmo.py │ ├── mpt.py │ ├── nemotron.py │ ├── nvlm_d.py │ ├── olmo.py │ ├── olmo2.py │ ├── olmoe.py │ ├── opt.py │ ├── orion.py │ ├── paligemma.py │ ├── persimmon.py │ ├── phi.py │ ├── phi3.py │ ├── phi3_small.py │ ├── phi3v.py │ ├── phimoe.py │ ├── pixtral.py │ ├── qwen.py │ ├── qwen2.py │ ├── qwen2_audio.py │ ├── qwen2_cls.py │ ├── qwen2_moe.py │ ├── qwen2_rm.py │ ├── qwen2_vl.py │ ├── registry.py │ ├── roberta.py │ ├── siglip.py │ ├── solar.py │ ├── stablelm.py │ ├── starcoder2.py │ ├── telechat2.py │ ├── ultravox.py │ └── utils.py ├── parameter.py ├── pooling_metadata.py ├── sampling_metadata.py └── utils.py ├── multimodal ├── __init__.py ├── audio.py ├── base.py ├── image.py ├── inputs.py ├── processing.py ├── registry.py ├── utils.py └── video.py ├── outputs.py ├── platforms ├── __init__.py ├── cpu.py ├── cuda.py ├── hpu.py ├── interface.py ├── neuron.py ├── openvino.py ├── rocm.py ├── tpu.py └── xpu.py ├── plugins └── __init__.py ├── pooling_params.py ├── profiler ├── __init__.py ├── layerwise_profile.py └── utils.py ├── prompt_adapter ├── __init__.py ├── layers.py ├── models.py ├── request.py ├── utils.py └── worker_manager.py ├── py.typed ├── sampling_params.py ├── scalar_type.py ├── scripts.py ├── sequence.py ├── spec_decode ├── __init__.py ├── batch_expansion.py ├── draft_model_runner.py ├── interfaces.py ├── medusa_worker.py ├── metrics.py ├── mlp_speculator_worker.py ├── mqa_scorer.py ├── multi_step_worker.py ├── ngram_worker.py ├── proposer_worker_base.py ├── smaller_tp_proposer_worker.py ├── spec_decode_worker.py ├── target_model_runner.py ├── top1_proposer.py └── util.py ├── tracing.py ├── transformers_utils ├── __init__.py ├── config.py ├── configs │ ├── __init__.py │ ├── arctic.py │ ├── aria.py │ ├── chatglm.py │ ├── dbrx.py │ ├── eagle.py │ ├── exaone.py │ ├── falcon.py │ ├── h2ovl.py │ ├── internvl.py │ ├── jais.py │ ├── medusa.py │ ├── mllama.py │ ├── mlp_speculator.py │ ├── mpt.py │ ├── nemotron.py │ ├── nvlm_d.py │ ├── olmo2.py 
│ ├── solar.py │ ├── telechat2.py │ └── ultravox.py ├── detokenizer.py ├── detokenizer_utils.py ├── processor.py ├── tokenizer.py ├── tokenizer_group │ ├── __init__.py │ ├── base_tokenizer_group.py │ ├── ray_tokenizer_group.py │ └── tokenizer_group.py ├── tokenizers │ ├── __init__.py │ └── mistral.py └── utils.py ├── triton_utils ├── __init__.py ├── custom_cache_manager.py └── importing.py ├── usage ├── __init__.py └── usage_lib.py ├── utils.py ├── v1 ├── __init__.py ├── attention │ ├── __init__.py │ └── backends │ │ ├── __init__.py │ │ └── flash_attn.py ├── core │ ├── __init__.py │ ├── encoder_cache_manager.py │ ├── kv_cache_manager.py │ ├── kv_cache_utils.py │ └── scheduler.py ├── engine │ ├── __init__.py │ ├── async_llm.py │ ├── async_stream.py │ ├── core.py │ ├── core_client.py │ ├── detokenizer.py │ ├── llm_engine.py │ ├── mm_input_mapper.py │ └── processor.py ├── executor │ ├── __init__.py │ └── gpu_executor.py ├── outputs.py ├── request.py ├── sample │ ├── __init__.py │ ├── metadata.py │ └── sampler.py ├── serial_utils.py ├── utils.py └── worker │ ├── __init__.py │ ├── gpu_model_runner.py │ └── gpu_worker.py ├── version.py ├── vllm_flash_attn └── .gitkeep └── worker ├── __init__.py ├── cache_engine.py ├── cpu_enc_dec_model_runner.py ├── cpu_model_runner.py ├── cpu_pooling_model_runner.py ├── cpu_worker.py ├── enc_dec_model_runner.py ├── hpu_model_runner.py ├── hpu_worker.py ├── model_runner.py ├── model_runner_base.py ├── multi_step_model_runner.py ├── multi_step_tpu_worker.py ├── multi_step_worker.py ├── neuron_model_runner.py ├── neuron_worker.py ├── openvino_model_runner.py ├── openvino_worker.py ├── pooling_model_runner.py ├── tpu_model_runner.py ├── tpu_worker.py ├── utils.py ├── worker.py ├── worker_base.py ├── xpu_model_runner.py └── xpu_worker.py
/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml:
--------------------------------------------------------------------------------
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.671
  - name: "exact_match,flexible-extract"
    value: 0.664
limit: 1000
num_fewshot: 5
trust_remote_code: True
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.905
  - name: "exact_match,flexible-extract"
    value: 0.905
limit: 1000
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.892
  - name: "exact_match,flexible-extract"
    value: 0.892
limit: 250
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.752
  - name: "exact_match,flexible-extract"
    value: 0.754
limit: 1000
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.753
  - name: "exact_match,flexible-extract"
    value: 0.753
limit: 1000
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.755
  - name: "exact_match,flexible-extract"
    value: 0.755
limit: 1000
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.753
  - name: "exact_match,flexible-extract"
    value: 0.753
limit: 1000
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.764
  - name: "exact_match,flexible-extract"
    value: 0.764
limit: 250
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.728
  - name: "exact_match,flexible-extract"
    value: 0.728
limit: 250
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.758
  - name: "exact_match,flexible-extract"
    value: 0.759
limit: 1000
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.756
  - name: "exact_match,flexible-extract"
    value: 0.752
limit: 250
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.419
  - name: "exact_match,flexible-extract"
    value: 0.416
limit: 1000
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.356
  - name: "exact_match,flexible-extract"
    value: 0.358
limit: 1000
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
model_name: "mgoin/Minitron-4B-Base-FP8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.233
  - name: "exact_match,flexible-extract"
    value: 0.236
limit: 1000
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml:
--------------------------------------------------------------------------------
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.86
  - name: "exact_match,flexible-extract"
    value: 0.86
limit: 250
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml:
--------------------------------------------------------------------------------
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.624
  - name: "exact_match,flexible-extract"
    value: 0.624
limit: 250
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.616
  - name: "exact_match,flexible-extract"
    value: 0.632
limit: 250
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.578
  - name: "exact_match,flexible-extract"
    value: 0.585
limit: 1000
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.593
  - name: "exact_match,flexible-extract"
    value: 0.588
limit: 1000
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.595
  - name: "exact_match,flexible-extract"
    value: 0.582
limit: 1000
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml:
--------------------------------------------------------------------------------
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.792
  - name: "exact_match,flexible-extract"
    value: 0.824
limit: 250
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/models-large.txt:
--------------------------------------------------------------------------------
Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/models-small.txt:
--------------------------------------------------------------------------------
Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
Minitron-4B-Base-FP8.yaml
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
Qwen2-1.5B-Instruct-FP8W8.yaml
Meta-Llama-3-8B-QQQ.yaml
--------------------------------------------------------------------------------
/.buildkite/nightly-benchmarks/nightly-annotation.md:
--------------------------------------------------------------------------------

## Description

This file contains the download links for the benchmarking results.

- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
- [benchmarking results](artifact://results.zip)
- [benchmarking code](artifact://nightly-benchmarks.zip)

Please download the visualization scripts in the post.


## Results reproduction

- Find the docker image we use in the `benchmarking pipeline`.
- Deploy the docker image, and inside the container:
  - Download `nightly-benchmarks.zip`.
  - In the same folder, run the following code:
```
export HF_TOKEN=
apt update
apt install -y git
unzip nightly-benchmarks.zip
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
```

The results will be inside `./benchmarks/results`.

--------------------------------------------------------------------------------
/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py:
--------------------------------------------------------------------------------
import argparse

from transformers import AutoTokenizer


def main(model, cachedir):
    # Load the tokenizer and save it to the specified directory
    tokenizer = AutoTokenizer.from_pretrained(model)
    tokenizer.save_pretrained(cachedir)
    print(f"Tokenizer saved to {cachedir}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Download and save Hugging Face tokenizer")
    parser.add_argument("--model",
                        type=str,
                        required=True,
                        help="Name of the model")
    parser.add_argument("--cachedir",
                        type=str,
                        required=True,
                        help="Directory to save the tokenizer")

    args = parser.parse_args()
    main(args.model, args.cachedir)
--------------------------------------------------------------------------------
/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py:
--------------------------------------------------------------------------------
from lmdeploy.serve.openai.api_client import APIClient

api_client = APIClient("http://localhost:8000")
model_name = api_client.available_models[0]

print(model_name)
--------------------------------------------------------------------------------
/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh:
--------------------------------------------------------------------------------
#!/bin/sh
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"

TIMEOUT_SECONDS=10

retries=0
while [ $retries -lt 1000 ]; do
    if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
        exit 0
    fi

    echo "Waiting for image to be available..."
14 | 15 | retries=$((retries + 1)) 16 | sleep 5 17 | done 18 | 19 | exit 1 20 | -------------------------------------------------------------------------------- /.buildkite/nightly-benchmarks/tests/latency-tests.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "test_name": "latency_llama8B_tp1", 4 | "parameters": { 5 | "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", 6 | "tensor_parallel_size": 1, 7 | "load_format": "dummy", 8 | "num_iters_warmup": 5, 9 | "num_iters": 15 10 | } 11 | }, 12 | { 13 | "test_name": "latency_llama70B_tp4", 14 | "parameters": { 15 | "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", 16 | "tensor_parallel_size": 4, 17 | "load_format": "dummy", 18 | "num-iters-warmup": 5, 19 | "num-iters": 15 20 | } 21 | }, 22 | { 23 | "test_name": "latency_mixtral8x7B_tp2", 24 | "parameters": { 25 | "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", 26 | "tensor_parallel_size": 2, 27 | "load_format": "dummy", 28 | "num-iters-warmup": 5, 29 | "num-iters": 15 30 | } 31 | } 32 | ] -------------------------------------------------------------------------------- /.buildkite/nightly-benchmarks/tests/throughput-tests.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "test_name": "throughput_llama8B_tp1", 4 | "parameters": { 5 | "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", 6 | "tensor_parallel_size": 1, 7 | "load_format": "dummy", 8 | "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", 9 | "num_prompts": 200, 10 | "backend": "vllm" 11 | } 12 | }, 13 | { 14 | "test_name": "throughput_llama70B_tp4", 15 | "parameters": { 16 | "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", 17 | "tensor_parallel_size": 4, 18 | "load_format": "dummy", 19 | "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", 20 | "num_prompts": 200, 21 | "backend": "vllm" 22 | } 23 | }, 24 | { 25 | "test_name": "throughput_mixtral8x7B_tp2", 26 | "parameters": { 27 | "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", 28 | "tensor_parallel_size": 2, 29 | "load_format": "dummy", 30 | "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", 31 | "num_prompts": 200, 32 | "backend": "vllm" 33 | } 34 | } 35 | ] -------------------------------------------------------------------------------- /.buildkite/run-cpu-test-ppc64le.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script builds the CPU docker image and runs the offline inference inside the container. 4 | # It serves as a sanity check for compilation and basic model usage. 5 | set -ex 6 | 7 | # Setup cleanup 8 | remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; } 9 | trap remove_docker_container EXIT 10 | remove_docker_container 11 | 12 | # Try building the docker image 13 | docker build -t cpu-test -f Dockerfile.ppc64le . 14 | 15 | -------------------------------------------------------------------------------- /.buildkite/run-hpu-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script builds the HPU docker image and runs the offline inference inside the container. 4 | # It serves as a sanity check for compilation and basic model usage. 5 | set -ex 6 | 7 | # Try building the docker image 8 | docker build -t hpu-test-env -f Dockerfile.hpu .
9 | 10 | # Setup cleanup 11 | remove_docker_container() { docker rm -f hpu-test || true; } 12 | trap remove_docker_container EXIT 13 | remove_docker_container 14 | 15 | # Run the image and launch offline inference 16 | docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py -------------------------------------------------------------------------------- /.buildkite/run-openvino-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script builds the OpenVINO docker image and runs the offline inference inside the container. 4 | # It serves as a sanity check for compilation and basic model usage. 5 | set -ex 6 | 7 | # Try building the docker image 8 | docker build -t openvino-test -f Dockerfile.openvino . 9 | 10 | # Setup cleanup 11 | remove_docker_container() { docker rm -f openvino-test || true; } 12 | trap remove_docker_container EXIT 13 | remove_docker_container 14 | 15 | # Run the image and launch offline inference 16 | docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py 17 | -------------------------------------------------------------------------------- /.buildkite/run-tpu-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | # Build the docker image. 6 | docker build -f Dockerfile.tpu -t vllm-tpu . 7 | 8 | # Set up cleanup. 9 | remove_docker_container() { docker rm -f tpu-test || true; } 10 | trap remove_docker_container EXIT 11 | # Remove the container that might not be cleaned up in the previous run. 12 | remove_docker_container 13 | 14 | # For HF_TOKEN. 15 | source /etc/environment 16 | # Run a simple end-to-end example. 17 | docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py" 18 | -------------------------------------------------------------------------------- /.buildkite/run-xpu-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script builds the XPU docker image and runs the offline inference inside the container. 4 | # It serves as a sanity check for compilation and basic model usage. 5 | set -ex 6 | 7 | # Try building the docker image 8 | docker build -t xpu-test -f Dockerfile.xpu .
9 | 10 | # Setup cleanup 11 | remove_docker_container() { docker rm -f xpu-test || true; } 12 | trap remove_docker_container EXIT 13 | remove_docker_container 14 | 15 | # Run the image and launch offline inference 16 | docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py 17 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | UseTab: Never 3 | IndentWidth: 2 4 | ColumnLimit: 80 5 | 6 | # Force pointers to the type for C++. 7 | DerivePointerAlignment: false 8 | PointerAlignment: Left 9 | 10 | # Reordering #include statements can (and currently will) introduce errors 11 | SortIncludes: false 12 | 13 | # Style choices 14 | AlignConsecutiveAssignments: false 15 | AlignConsecutiveDeclarations: false 16 | IndentPPDirectives: BeforeHash 17 | 18 | IncludeCategories: 19 | - Regex: '^<' 20 | Priority: 4 21 | - Regex: '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/' 22 | Priority: 3 23 | - Regex: '^"(qoda|\.\.)/' 24 | Priority: 2 25 | - Regex: '.*' 26 | Priority: 1 27 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | /.venv 2 | /build 3 | dist 4 | vllm/*.so 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | .mypy_cache 12 | 13 | # Distribution / packaging 14 | .Python 15 | /build/ 16 | cmake-build-*/ 17 | CMakeUserPresets.json 18 | develop-eggs/ 19 | /dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [vllm-project] 2 | open_collective: vllm 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/100-documentation.yml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation 2 | description: Report an issue related to https://docs.vllm.ai/ 3 | title: "[Doc]: " 4 | labels: ["documentation"] 5 | 6 | body: 7 | - type: textarea 8 | attributes: 9 | label: 📚 The doc issue 10 | description: > 11 | A clear and concise description of what content in https://docs.vllm.ai/ is an issue. 12 | validations: 13 | required: true 14 | - type: textarea 15 | attributes: 16 | label: Suggest a potential alternative/fix 17 | description: > 18 | Tell us how we could improve the documentation in this regard. 19 | - type: markdown 20 | attributes: 21 | value: > 22 | Thanks for contributing 🎉! 23 | - type: checkboxes 24 | id: askllm 25 | attributes: 26 | label: Before submitting a new issue... 27 | options: 28 | - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. 
29 | required: true 30 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/800-misc discussion.yml: -------------------------------------------------------------------------------- 1 | name: 🎲 Misc/random discussions that do not fit into the above categories. 2 | description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues. 3 | title: "[Misc]: " 4 | labels: ["misc"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | - type: textarea 12 | attributes: 13 | label: Anything you want to discuss about vllm. 14 | description: > 15 | Anything you want to discuss about vllm. 16 | validations: 17 | required: true 18 | - type: markdown 19 | attributes: 20 | value: > 21 | Thanks for contributing 🎉! 22 | - type: checkboxes 23 | id: askllm 24 | attributes: 25 | label: Before submitting a new issue... 26 | options: 27 | - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. 28 | required: true 29 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | FILL IN THE PR DESCRIPTION HERE 2 | 3 | FIX #xxxx (*link existing issues this PR will resolve*) 4 | 5 | **BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html ** 6 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | # Maintain dependencies for GitHub Actions 4 | - package-ecosystem: "github-actions" 5 | directory: "/" 6 | schedule: 7 | interval: "weekly" 8 | - package-ecosystem: "pip" 9 | directory: "/" 10 | schedule: 11 | interval: "weekly" 12 | labels: ["dependencies"] 13 | open-pull-requests-limit: 5 14 | reviewers: ["khluu", "simon-mo"] 15 | allow: 16 | - dependency-type: "all" 17 | ignore: 18 | - dependency-name: "*" 19 | update-types: ["version-update:semver-patch"] 20 | - dependency-name: "torch" 21 | - dependency-name: "torchvision" 22 | - dependency-name: "xformers" 23 | - dependency-name: "lm-format-enforcer" 24 | - dependency-name: "gguf" 25 | - dependency-name: "compressed-tensors" 26 | - dependency-name: "ray[adag]" 27 | - dependency-name: "lm-eval" 28 | groups: 29 | minor-update: 30 | applies-to: version-updates 31 | update-types: ["minor"] 32 | -------------------------------------------------------------------------------- /.github/workflows/actionlint.yml: -------------------------------------------------------------------------------- 1 | name: Lint GitHub Actions workflows 2 | on: 3 | push: 4 | branches: 5 | - "main" 6 | paths: 7 | - '.github/workflows/*.ya?ml' 8 | - '.github/workflows/actionlint.*' 9 | - 
'.github/workflows/matchers/actionlint.json' 10 | pull_request: 11 | branches: 12 | - "main" 13 | paths: 14 | - '.github/workflows/*.ya?ml' 15 | - '.github/workflows/actionlint.*' 16 | - '.github/workflows/matchers/actionlint.json' 17 | 18 | env: 19 | LC_ALL: en_US.UTF-8 20 | 21 | defaults: 22 | run: 23 | shell: bash 24 | 25 | permissions: 26 | contents: read 27 | 28 | jobs: 29 | actionlint: 30 | runs-on: ubuntu-latest 31 | steps: 32 | - name: "Checkout" 33 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 34 | with: 35 | fetch-depth: 0 36 | 37 | - name: "Run actionlint" 38 | run: | 39 | echo "::add-matcher::.github/workflows/matchers/actionlint.json" 40 | tools/actionlint.sh -color 41 | -------------------------------------------------------------------------------- /.github/workflows/add_label_automerge.yml: -------------------------------------------------------------------------------- 1 | name: Add label on auto-merge enabled 2 | on: 3 | pull_request_target: 4 | types: 5 | - auto_merge_enabled 6 | jobs: 7 | add-label-on-auto-merge: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Add label 11 | uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 12 | with: 13 | script: | 14 | github.rest.issues.addLabels({ 15 | owner: context.repo.owner, 16 | repo: context.repo.repo, 17 | issue_number: context.issue.number, 18 | labels: ['ready'] 19 | }) 20 | env: 21 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 22 | -------------------------------------------------------------------------------- /.github/workflows/cleanup_pr_body.yml: -------------------------------------------------------------------------------- 1 | name: Cleanup PR Body 2 | 3 | on: 4 | pull_request_target: 5 | types: [opened, reopened, edited] 6 | 7 | permissions: 8 | pull-requests: write 9 | 10 | jobs: 11 | update-description: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - name: Checkout repository 16 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 17 | 18 | - name: Set up Python 19 | uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 20 | with: 21 | python-version: '3.12' 22 | 23 | - name: Update PR description 24 | env: 25 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 26 | run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}" 27 | -------------------------------------------------------------------------------- /.github/workflows/matchers/actionlint.json: -------------------------------------------------------------------------------- 1 | { 2 | "problemMatcher": [ 3 | { 4 | "owner": "actionlint", 5 | "pattern": [ 6 | { 7 | "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$", 8 | "file": 1, 9 | "line": 2, 10 | "column": 3, 11 | "message": 4, 12 | "code": 5 13 | } 14 | ] 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /.github/workflows/matchers/mypy.json: -------------------------------------------------------------------------------- 1 | { 2 | "problemMatcher": [ 3 | { 4 | "owner": "mypy", 5 | "pattern": [ 6 | { 7 | "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$", 8 | "file": 1, 9 | "line": 2, 10 | "severity": 3, 11 | "message": 4 12 | } 13 | ] 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /.github/workflows/matchers/ruff.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "problemMatcher": [ 3 | { 4 | "owner": "ruff", 5 | "pattern": [ 6 | { 7 | "regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$", 8 | "file": 1, 9 | "line": 2, 10 | "column": 3, 11 | "code": 4, 12 | "message": 5 13 | } 14 | ] 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /.github/workflows/png-lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint PNG exports from excalidraw 2 | on: 3 | push: 4 | branches: 5 | - "main" 6 | paths: 7 | - '*.excalidraw.png' 8 | - '.github/workflows/png-lint.yml' 9 | pull_request: 10 | branches: 11 | - "main" 12 | paths: 13 | - '*.excalidraw.png' 14 | - '.github/workflows/png-lint.yml' 15 | 16 | env: 17 | LC_ALL: en_US.UTF-8 18 | 19 | defaults: 20 | run: 21 | shell: bash 22 | 23 | permissions: 24 | contents: read 25 | 26 | jobs: 27 | actionlint: 28 | runs-on: ubuntu-latest 29 | steps: 30 | - name: "Checkout" 31 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 32 | with: 33 | fetch-depth: 0 34 | 35 | - name: "Run png-lint.sh to check excalidraw exported images" 36 | run: | 37 | tools/png-lint.sh 38 | -------------------------------------------------------------------------------- /.github/workflows/scripts/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | 4 | python_executable=python$1 5 | cuda_home=/usr/local/cuda-$2 6 | 7 | # Update paths 8 | PATH=${cuda_home}/bin:$PATH 9 | LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH 10 | 11 | # Install requirements 12 | $python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt 13 | 14 | # Limit the number of parallel jobs to avoid OOM 15 | export MAX_JOBS=1 16 | # Make sure release wheels are built for the following architectures 17 | export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" 18 | export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real" 19 | 20 | bash tools/check_repo.sh 21 | 22 | # Build 23 | $python_executable setup.py bdist_wheel --dist-dir=dist 24 | -------------------------------------------------------------------------------- /.github/workflows/scripts/create_release.js: -------------------------------------------------------------------------------- 1 | // Uses Github's API to create the release and wait for result. 2 | // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately. 3 | 4 | module.exports = async (github, context, core) => { 5 | try { 6 | const response = await github.rest.repos.createRelease({ 7 | draft: false, 8 | generate_release_notes: true, 9 | name: process.env.RELEASE_TAG, 10 | owner: context.repo.owner, 11 | prerelease: true, 12 | repo: context.repo.repo, 13 | tag_name: process.env.RELEASE_TAG, 14 | }); 15 | 16 | core.setOutput('upload_url', response.data.upload_url); 17 | } catch (error) { 18 | core.setFailed(error.message); 19 | } 20 | } -------------------------------------------------------------------------------- /.github/workflows/scripts/cuda-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Replace '.' with '-' ex: 11.8 -> 11-8 4 | cuda_version=$(echo "$1" | tr "." "-") 5 | # Removes '-' and '.' 
ex: ubuntu-20.04 -> ubuntu2004 6 | OS=$(echo "$2" | tr -d ".\-") 7 | 8 | # Installs CUDA 9 | wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb" 10 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 11 | rm cuda-keyring_1.1-1_all.deb 12 | sudo apt -qq update 13 | sudo apt -y install "cuda-${cuda_version}" "cuda-nvcc-${cuda_version}" "cuda-libraries-dev-${cuda_version}" 14 | sudo apt clean 15 | 16 | # Test nvcc 17 | PATH=/usr/local/cuda-$1/bin:${PATH} 18 | nvcc --version 19 | 20 | # Log gcc, g++, c++ versions 21 | gcc --version 22 | g++ --version 23 | c++ --version 24 | -------------------------------------------------------------------------------- /.github/workflows/scripts/pytorch-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python_executable=python$1 4 | pytorch_version=$2 5 | cuda_version=$3 6 | 7 | # Install torch 8 | $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya 9 | $python_executable -m pip install torch=="${pytorch_version}+cu${cuda_version//./}" --extra-index-url "https://download.pytorch.org/whl/cu${cuda_version//./}" 10 | 11 | # Print version information 12 | $python_executable --version 13 | $python_executable -c "import torch; print('PyTorch:', torch.__version__)" 14 | $python_executable -c "import torch; print('CUDA:', torch.version.cuda)" 15 | $python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)" 16 | -------------------------------------------------------------------------------- /.github/workflows/shellcheck.yml: -------------------------------------------------------------------------------- 1 | name: Lint shell scripts 2 | on: 3 | push: 4 | branches: 5 | - "main" 6 | paths: 7 | - '**/*.sh' 8 | - '.github/workflows/shellcheck.yml' 9 | pull_request: 10 | branches: 11 | - "main" 12 | paths: 13 | - '**/*.sh' 14 | - '.github/workflows/shellcheck.yml' 15 | 16 | env: 17 | LC_ALL: en_US.UTF-8 18 | 19 | defaults: 20 | run: 21 | shell: bash 22 | 23 | permissions: 24 | contents: read 25 | 26 | jobs: 27 | shellcheck: 28 | runs-on: ubuntu-latest 29 | steps: 30 | - name: "Checkout" 31 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 32 | with: 33 | fetch-depth: 0 34 | 35 | - name: "Check shell scripts" 36 | run: | 37 | tools/shellcheck.sh 38 | -------------------------------------------------------------------------------- /.github/workflows/sphinx-lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths: 8 | - "docs/**" 9 | pull_request: 10 | branches: 11 | - main 12 | paths: 13 | - "docs/**" 14 | 15 | jobs: 16 | sphinx-lint: 17 | runs-on: ubuntu-latest 18 | strategy: 19 | matrix: 20 | python-version: ["3.12"] 21 | steps: 22 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | pip install -r requirements-lint.txt 31 | - name: Linting docs 32 | run: tools/sphinx-lint.sh 33 | -------------------------------------------------------------------------------- 
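The lint workflows above (actionlint, shellcheck, png-lint, sphinx-lint) all defer to helper scripts under `tools/`. A rough sketch of running the same checks locally, assuming a full checkout in which the `tools/` scripts are executable and `requirements-lint.txt` supplies the Python-side linters:

```bash
# Local mirror of the CI lint steps shown in the workflows above
# (a sketch, not an official contributor workflow).
python -m pip install --upgrade pip
pip install -r requirements-lint.txt

tools/actionlint.sh -color   # lint GitHub Actions workflows
tools/shellcheck.sh          # lint shell scripts
tools/png-lint.sh            # check excalidraw PNG exports
tools/sphinx-lint.sh         # lint the Sphinx documentation
```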
/.github/workflows/yapf.yml: -------------------------------------------------------------------------------- 1 | name: yapf 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - main 9 | paths: 10 | - "**/*.py" 11 | - .github/workflows/yapf.yml 12 | pull_request: 13 | branches: 14 | - main 15 | paths: 16 | - "**/*.py" 17 | - .github/workflows/yapf.yml 18 | 19 | jobs: 20 | yapf: 21 | runs-on: ubuntu-latest 22 | strategy: 23 | matrix: 24 | python-version: ["3.12"] 25 | steps: 26 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 27 | - name: Set up Python ${{ matrix.python-version }} 28 | uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 29 | with: 30 | python-version: ${{ matrix.python-version }} 31 | - name: Install dependencies 32 | run: | 33 | python -m pip install --upgrade pip 34 | pip install yapf==0.32.0 35 | pip install toml==0.10.2 36 | - name: Running yapf 37 | run: | 38 | yapf --diff --recursive . 39 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | 6 | build: 7 | os: ubuntu-22.04 8 | tools: 9 | python: "3.12" 10 | 11 | sphinx: 12 | configuration: docs/source/conf.py 13 | fail_on_warning: true 14 | 15 | # If using Sphinx, optionally build your docs in additional formats such as PDF 16 | formats: [] 17 | 18 | # Optionally declare the Python requirements required to build your docs 19 | python: 20 | install: 21 | - requirements: docs/requirements-docs.txt 22 | -------------------------------------------------------------------------------- /.shellcheckrc: -------------------------------------------------------------------------------- 1 | # rules currently disabled: 2 | # 3 | # SC1091 (info): Not following: was not specified as input (see shellcheck -x) 4 | # SC2004 (style): $/${} is unnecessary on arithmetic variables. 5 | # SC2129 (style): Consider using { cmd1; cmd2; } >> file instead of individual redirects. 6 | # SC2155 (warning): Declare and assign separately to avoid masking return values. 7 | # SC2164 (warning): Use 'cd ... || exit' or 'cd ... || return' in case cd fails. 8 | # 9 | disable=SC1091,SC2004,SC2129,SC2155,SC2164 10 | -------------------------------------------------------------------------------- /.yapfignore: -------------------------------------------------------------------------------- 1 | collect_env.py 2 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to vLLM 2 | 3 | You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html). 
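As a local counterpart to the `yapf` workflow above (versions pinned as in that workflow's steps; a sketch, not official contributor guidance):

```bash
# Run the same formatting check the yapf workflow performs in CI.
pip install yapf==0.32.0 toml==0.10.2
yapf --diff --recursive .   # prints a diff for any file that needs reformatting
```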
4 | -------------------------------------------------------------------------------- /Dockerfile.hpu: -------------------------------------------------------------------------------- 1 | FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest 2 | 3 | COPY ./ /workspace/vllm 4 | 5 | WORKDIR /workspace/vllm 6 | 7 | RUN pip install -v -r requirements-hpu.txt 8 | 9 | ENV no_proxy=localhost,127.0.0.1 10 | ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true 11 | 12 | RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install 13 | 14 | # install development dependencies (for testing) 15 | RUN python3 -m pip install -e tests/vllm_test_utils 16 | 17 | WORKDIR /workspace/ 18 | 19 | RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks 20 | 21 | ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] 22 | -------------------------------------------------------------------------------- /Dockerfile.openvino: -------------------------------------------------------------------------------- 1 | # The vLLM Dockerfile is used to construct vLLM image that can be directly used 2 | # to run the OpenAI compatible server. 3 | 4 | FROM ubuntu:22.04 AS dev 5 | 6 | RUN apt-get update -y && \ 7 | apt-get install -y \ 8 | git python3-pip \ 9 | ffmpeg libsm6 libxext6 libgl1 10 | WORKDIR /workspace 11 | 12 | COPY . . 13 | ARG GIT_REPO_CHECK=0 14 | RUN --mount=type=bind,source=.git,target=.git \ 15 | if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi 16 | 17 | # install build requirements 18 | RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt 19 | # build vLLM with OpenVINO backend 20 | RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace 21 | 22 | COPY examples/ /workspace/examples 23 | COPY benchmarks/ /workspace/benchmarks 24 | 25 | # install development dependencies (for testing) 26 | RUN python3 -m pip install -e tests/vllm_test_utils 27 | 28 | CMD ["/bin/bash"] 29 | -------------------------------------------------------------------------------- /Dockerfile.tpu: -------------------------------------------------------------------------------- 1 | ARG NIGHTLY_DATE="20241017" 2 | ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE" 3 | 4 | FROM $BASE_IMAGE 5 | WORKDIR /workspace/vllm 6 | 7 | # Install some basic utilities 8 | RUN apt-get update && apt-get install -y \ 9 | git \ 10 | ffmpeg libsm6 libxext6 libgl1 11 | 12 | # Build vLLM. 13 | COPY . . 
14 | ARG GIT_REPO_CHECK=0 15 | RUN --mount=type=bind,source=.git,target=.git \ 16 | if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi 17 | 18 | ENV VLLM_TARGET_DEVICE="tpu" 19 | RUN --mount=type=cache,target=/root/.cache/pip \ 20 | --mount=type=bind,source=.git,target=.git \ 21 | python3 -m pip install \ 22 | -r requirements-tpu.txt 23 | RUN python3 setup.py develop 24 | 25 | # install development dependencies (for testing) 26 | RUN python3 -m pip install -e tests/vllm_test_utils 27 | 28 | CMD ["/bin/bash"] 29 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include requirements-common.txt 3 | include requirements-cuda.txt 4 | include requirements-rocm.txt 5 | include requirements-neuron.txt 6 | include requirements-cpu.txt 7 | include CMakeLists.txt 8 | 9 | recursive-include cmake * 10 | recursive-include csrc * 11 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Reporting a Vulnerability 4 | 5 | If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. 6 | 7 | Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). 8 | 9 | --- 10 | 11 | Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models. 12 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking vLLM 2 | 3 | ## Downloading the ShareGPT dataset 4 | 5 | You can download the dataset by running: 6 | ```bash 7 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json 8 | ``` 9 | 10 | ## Downloading the ShareGPT4V dataset 11 | 12 | The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts 13 | will ignore a datapoint if the referred image is missing. 
14 | ```bash 15 | wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json 16 | mkdir coco -p 17 | wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip 18 | unzip coco/train2017.zip -d coco/ 19 | ``` 20 | -------------------------------------------------------------------------------- /benchmarks/kernels/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas -------------------------------------------------------------------------------- /benchmarks/launch_tgi_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PORT=8000 4 | MODEL=$1 5 | TOKENS=$2 6 | 7 | docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \ 8 | -v "$PWD/data:/data" \ 9 | ghcr.io/huggingface/text-generation-inference:2.2.0 \ 10 | --model-id "$MODEL" \ 11 | --sharded false \ 12 | --max-input-length 1024 \ 13 | --max-total-tokens 2048 \ 14 | --max-best-of 5 \ 15 | --max-concurrent-requests 5000 \ 16 | --max-batch-total-tokens "$TOKENS" 17 | -------------------------------------------------------------------------------- /csrc/attention/attention_dtypes.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.cuh" 4 | #include "dtype_float16.cuh" 5 | #include "dtype_float32.cuh" 6 | #include "dtype_bfloat16.cuh" 7 | #include "dtype_fp8.cuh" 8 | -------------------------------------------------------------------------------- /csrc/attention/dtype_fp8.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.cuh" 4 | 5 | #include 6 | #ifdef ENABLE_FP8 7 | #ifndef USE_ROCM 8 | #include 9 | #endif // USE_ROCM 10 | #endif // ENABLE_FP8 11 | 12 | namespace vllm { 13 | 14 | enum class Fp8KVCacheDataType { 15 | kAuto = 0, 16 | kFp8E4M3 = 1, 17 | kFp8E5M2 = 2, 18 | }; 19 | 20 | // fp8 vector types for quantization of kv cache 21 | template <> 22 | struct Vec { 23 | using Type = uint8_t; 24 | }; 25 | 26 | template <> 27 | struct Vec { 28 | using Type = uint16_t; 29 | }; 30 | 31 | template <> 32 | struct Vec { 33 | using Type = uint32_t; 34 | }; 35 | 36 | template <> 37 | struct Vec { 38 | using Type = uint2; 39 | }; 40 | 41 | } // namespace vllm 42 | -------------------------------------------------------------------------------- /csrc/core/exception.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define VLLM_IMPLIES(p, q) (!(p) || (q)) 4 | -------------------------------------------------------------------------------- /csrc/core/registration.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #define _CONCAT(A, B) A##B 6 | #define CONCAT(A, B) _CONCAT(A, B) 7 | 8 | #define _STRINGIFY(A) #A 9 | #define STRINGIFY(A) _STRINGIFY(A) 10 | 11 | // A version of the TORCH_LIBRARY macro that expands the NAME, i.e. so NAME 12 | // could be a macro instead of a literal token. 13 | #define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE) 14 | 15 | // A version of the TORCH_LIBRARY_IMPL macro that expands the NAME, i.e. so NAME 16 | // could be a macro instead of a literal token. 
17 | #define TORCH_LIBRARY_IMPL_EXPAND(NAME, DEVICE, MODULE) \ 18 | TORCH_LIBRARY_IMPL(NAME, DEVICE, MODULE) 19 | 20 | // REGISTER_EXTENSION allows the shared library to be loaded and initialized 21 | // via python's import statement. 22 | #define REGISTER_EXTENSION(NAME) \ 23 | PyMODINIT_FUNC CONCAT(PyInit_, NAME)() { \ 24 | static struct PyModuleDef module = {PyModuleDef_HEAD_INIT, \ 25 | STRINGIFY(NAME), nullptr, 0, nullptr}; \ 26 | return PyModule_Create(&module); \ 27 | } 28 | -------------------------------------------------------------------------------- /csrc/cpu/cpu_types.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPU_TYPES_HPP 2 | #define CPU_TYPES_HPP 3 | 4 | #if defined(__x86_64__) 5 | //x86 implementation 6 | #include "cpu_types_x86.hpp" 7 | #elif defined(__POWER9_VECTOR__) 8 | //ppc implementation 9 | #include "cpu_types_vsx.hpp" 10 | #elif defined(__aarch64__) 11 | //arm implementation 12 | #include "cpu_types_arm.hpp" 13 | #else 14 | #warning "unsupported vLLM cpu implementation" 15 | #endif 16 | 17 | #endif -------------------------------------------------------------------------------- /csrc/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if defined(__CUDACC__) || defined(_NVHPC_CUDA) 4 | #define HOST_DEVICE_INLINE __forceinline__ __host__ __device__ 5 | #define DEVICE_INLINE __forceinline__ __device__ 6 | #define HOST_INLINE __forceinline__ __host__ 7 | #else 8 | #define HOST_DEVICE_INLINE inline 9 | #define DEVICE_INLINE inline 10 | #define HOST_INLINE inline 11 | #endif 12 | 13 | int64_t get_device_attribute(int64_t attribute, int64_t device_id); 14 | 15 | int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id); 16 | -------------------------------------------------------------------------------- /csrc/cuda_utils_kernels.cu: -------------------------------------------------------------------------------- 1 | #ifdef USE_ROCM 2 | #include 3 | #include 4 | #endif 5 | int64_t get_device_attribute(int64_t attribute, int64_t device_id) { 6 | int device, value; 7 | if (device_id < 0) { 8 | cudaGetDevice(&device); 9 | } else { 10 | device = device_id; 11 | } 12 | cudaDeviceGetAttribute(&value, static_cast(attribute), 13 | device); 14 | return value; 15 | } 16 | 17 | int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id) { 18 | int64_t attribute; 19 | // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html 20 | // cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74 21 | 22 | #ifdef USE_ROCM 23 | attribute = hipDeviceAttributeMaxSharedMemoryPerBlock; 24 | #else 25 | attribute = cudaDevAttrMaxSharedMemoryPerBlockOptin; 26 | #endif 27 | 28 | return get_device_attribute(attribute, device_id); 29 | } 30 | -------------------------------------------------------------------------------- /csrc/cutlass_extensions/vllm_type_utils.cuh: -------------------------------------------------------------------------------- 1 | #include "cutlass/bfloat16.h" 2 | #include "cutlass/half.h" 3 | #include "cuda_bf16.h" 4 | 5 | #include "cutlass_extensions/vllm_custom_types.cuh" 6 | 7 | namespace cutlass { 8 | 9 | template 10 | struct nameof { 11 | static constexpr char const* value = "unknown"; 12 | }; 13 | 14 | template 15 | inline constexpr auto nameof_v = nameof::value; 16 | 17 | #define NAMEOF_TYPE(T) \ 18 | template <> \ 19 | struct nameof { \ 20 | static constexpr char const* value = #T; \ 21 | 
}; 22 | 23 | NAMEOF_TYPE(float_e4m3_t) 24 | NAMEOF_TYPE(float_e5m2_t) 25 | NAMEOF_TYPE(half_t) 26 | NAMEOF_TYPE(nv_bfloat16) 27 | NAMEOF_TYPE(bfloat16_t) 28 | NAMEOF_TYPE(float) 29 | 30 | NAMEOF_TYPE(int4b_t) 31 | NAMEOF_TYPE(int8_t) 32 | NAMEOF_TYPE(int32_t) 33 | NAMEOF_TYPE(int64_t) 34 | 35 | NAMEOF_TYPE(vllm_uint4b8_t) 36 | NAMEOF_TYPE(uint4b_t) 37 | NAMEOF_TYPE(uint8_t) 38 | NAMEOF_TYPE(vllm_uint8b128_t) 39 | NAMEOF_TYPE(uint32_t) 40 | NAMEOF_TYPE(uint64_t) 41 | 42 | }; // namespace cutlass -------------------------------------------------------------------------------- /csrc/mamba/mamba_ssm/static_switch.h: -------------------------------------------------------------------------------- 1 | // Inspired by 2 | // https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h 3 | // and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h 4 | 5 | // clang-format off 6 | // adapted from https://github.com/state-spaces/mamba/blob/main/csrc/selective_scan/static_switch.h 7 | #pragma once 8 | 9 | /// @param COND - a boolean expression to switch by 10 | /// @param CONST_NAME - a name given for the constexpr bool variable. 11 | /// @param ... - code to execute for true and false 12 | /// 13 | /// Usage: 14 | /// ``` 15 | /// BOOL_SWITCH(flag, BoolConst, [&] { 16 | /// some_function(...); 17 | /// }); 18 | /// ``` 19 | #define BOOL_SWITCH(COND, CONST_NAME, ...) \ 20 | [&] { \ 21 | if (COND) { \ 22 | constexpr bool CONST_NAME = true; \ 23 | return __VA_ARGS__(); \ 24 | } else { \ 25 | constexpr bool CONST_NAME = false; \ 26 | return __VA_ARGS__(); \ 27 | } \ 28 | }() 29 | -------------------------------------------------------------------------------- /csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu: -------------------------------------------------------------------------------- 1 | #include "marlin_moe_kernel_ku4.h" 2 | 3 | namespace marlin_moe { 4 | 5 | // We return bool so we can create these different kernel calls as a sequence 6 | // of if-elseif's. 7 | bool call_marlin_moe_kernel_ku4( 8 | vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, 9 | bool has_act_order, int group_blocks, int num_threads, int blocks, 10 | int max_shared_mem, cudaStream_t stream, const int4* A_ptr, 11 | const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, 12 | const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, 13 | const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, 14 | int expert_idx, int num_experts, int topk, int prob_m, int prob_n, 15 | int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, 16 | int m_block, int max_par, int cfg_max_m_blocks) { 17 | bool has_zp = true; 18 | 19 | if (false) { 20 | } 21 | AWQ_CALL_IF_MOE(vllm::kU4, 16, 4, 256) 22 | AWQ_CALL_IF_MOE(vllm::kU4, 8, 8, 256) 23 | AWQ_CALL_IF_MOE(vllm::kU4, 8, 4, 128) 24 | AWQ_CALL_IF_MOE(vllm::kU4, 4, 8, 128) 25 | else { 26 | return false; 27 | } 28 | return true; 29 | } 30 | 31 | } // namespace marlin_moe 32 | -------------------------------------------------------------------------------- /csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "marlin_moe_kernel.h" 4 | 5 | namespace marlin_moe { 6 | 7 | // We return bool so we can create these different kernel calls as a sequence 8 | // of if-elseif's. 
9 | bool call_marlin_moe_kernel_ku4( 10 | vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, 11 | bool has_act_order, int group_blocks, int num_threads, int blocks, 12 | int max_shared_mem, cudaStream_t stream, const int4* A_ptr, 13 | const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, 14 | const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, 15 | const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, 16 | int expert_idx, int num_experts, int topk, int prob_m, int prob_n, 17 | int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, 18 | int m_block, int max_par, int cfg_max_m_blocks); 19 | 20 | } // namespace marlin_moe 21 | -------------------------------------------------------------------------------- /csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu: -------------------------------------------------------------------------------- 1 | #include "marlin_moe_kernel_ku4b8.h" 2 | 3 | namespace marlin_moe { 4 | 5 | // We return bool so we can create these different kernel calls as a sequence 6 | // of if-elseif's. 7 | bool call_marlin_moe_kernel_ku4b8( 8 | vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, 9 | bool has_act_order, int group_blocks, int num_threads, int blocks, 10 | int max_shared_mem, cudaStream_t stream, const int4* A_ptr, 11 | const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, 12 | const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, 13 | const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, 14 | int expert_idx, int num_experts, int topk, int prob_m, int prob_n, 15 | int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, 16 | int m_block, int max_par, int cfg_max_m_blocks) { 17 | bool has_zp = false; 18 | 19 | if (false) { 20 | } 21 | GPTQ_CALL_IF_MOE(vllm::kU4B8, 16, 4, 256) 22 | GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 8, 256) 23 | GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 4, 128) 24 | GPTQ_CALL_IF_MOE(vllm::kU4B8, 4, 8, 128) 25 | else { 26 | return false; 27 | } 28 | return true; 29 | } 30 | 31 | } // namespace marlin_moe 32 | -------------------------------------------------------------------------------- /csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "marlin_moe_kernel.h" 4 | 5 | namespace marlin_moe { 6 | 7 | // We return bool so we can create these different kernel calls as a sequence 8 | // of if-elseif's. 
9 | bool call_marlin_moe_kernel_ku4b8( 10 | vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, 11 | bool has_act_order, int group_blocks, int num_threads, int blocks, 12 | int max_shared_mem, cudaStream_t stream, const int4* A_ptr, 13 | const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, 14 | const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, 15 | const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, 16 | int expert_idx, int num_experts, int topk, int prob_m, int prob_n, 17 | int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, 18 | int m_block, int max_par, int cfg_max_m_blocks); 19 | 20 | } // namespace marlin_moe 21 | -------------------------------------------------------------------------------- /csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu: -------------------------------------------------------------------------------- 1 | #include "marlin_moe_kernel_ku8b128.h" 2 | 3 | namespace marlin_moe { 4 | 5 | // We return bool so we can create these different kernel calls as a sequence 6 | // of if-elseif's. 7 | bool call_marlin_moe_kernel_ku8b128( 8 | vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, 9 | bool has_act_order, int group_blocks, int num_threads, int blocks, 10 | int max_shared_mem, cudaStream_t stream, const int4* A_ptr, 11 | const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, 12 | const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, 13 | const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, 14 | int expert_idx, int num_experts, int topk, int prob_m, int prob_n, 15 | int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, 16 | int m_block, int max_par, int cfg_max_m_blocks) { 17 | bool has_zp = false; 18 | 19 | if (false) { 20 | } 21 | GPTQ_CALL_IF_MOE(vllm::kU8B128, 16, 4, 256) 22 | GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 8, 256) 23 | GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 4, 128) 24 | GPTQ_CALL_IF_MOE(vllm::kU8B128, 4, 8, 128) 25 | else { 26 | return false; 27 | } 28 | return true; 29 | } 30 | 31 | } // namespace marlin_moe 32 | -------------------------------------------------------------------------------- /csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "marlin_moe_kernel.h" 4 | 5 | namespace marlin_moe { 6 | 7 | bool call_marlin_moe_kernel_ku8b128( 8 | vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, 9 | bool has_act_order, int group_blocks, int num_threads, int blocks, 10 | int max_shared_mem, cudaStream_t stream, const int4* A_ptr, 11 | const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, 12 | const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, 13 | const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, 14 | int expert_idx, int num_experts, int topk, int prob_m, int prob_n, 15 | int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, 16 | int m_block, int max_par, int cfg_max_m_blocks); 17 | 18 | } 19 | -------------------------------------------------------------------------------- /csrc/moe/moe_ops.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices, 6 | torch::Tensor& token_expert_indices, 7 | torch::Tensor& gating_output); 8 | 9 | void moe_sum(torch::Tensor& input, 
torch::Tensor& output); 10 | 11 | void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, 12 | int64_t block_size, torch::Tensor sorted_token_ids, 13 | torch::Tensor experts_ids, 14 | torch::Tensor num_tokens_post_pad); 15 | -------------------------------------------------------------------------------- /csrc/prepare_inputs/advance_step.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace prepare_inputs { 13 | 14 | static constexpr int max_threads = 256; 15 | static constexpr bool logging = false; 16 | 17 | constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; } 18 | 19 | } // namespace prepare_inputs 20 | -------------------------------------------------------------------------------- /csrc/quantization/cutlass_w8a8/common.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cutlass/cutlass.h" 4 | #include 5 | 6 | /** 7 | * Helper function for checking CUTLASS errors 8 | */ 9 | #define CUTLASS_CHECK(status) \ 10 | { \ 11 | TORCH_CHECK(status == cutlass::Status::kSuccess, \ 12 | cutlassGetStatusString(status)) \ 13 | } 14 | 15 | inline uint32_t next_pow_2(uint32_t const num) { 16 | if (num <= 1) return num; 17 | return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); 18 | } 19 | 20 | inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { 21 | int max_shared_mem_per_block_opt_in = 0; 22 | cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in, 23 | cudaDevAttrMaxSharedMemoryPerBlockOptin, 24 | device); 25 | return max_shared_mem_per_block_opt_in; 26 | } 27 | 28 | -------------------------------------------------------------------------------- /csrc/quantization/gptq/qdq_8.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Copied from https://github.com/turboderp/exllamav2 3 | */ 4 | 5 | #ifndef _qdq_8_cuh 6 | #define _qdq_8_cuh 7 | 8 | #include "qdq_util.cuh" 9 | 10 | namespace vllm { 11 | namespace gptq { 12 | 13 | __forceinline__ __device__ void shuffle_8bit_4(uint32_t* q, int stride) {} 14 | 15 | __forceinline__ __device__ void dequant_8bit_8(const uint32_t q_0, 16 | const uint32_t q_1, 17 | half2 (&dq)[4], int stride, 18 | const uint32_t zero) { 19 | half dqh[8]; 20 | for (int i = 0; i < 4; i++) dqh[i] = dq_ns(exb(q_0, i * 8, 0xff), zero); 21 | for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), zero); 22 | 23 | for (int i = 0; i < 4; i++) 24 | dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); 25 | } 26 | 27 | } // namespace gptq 28 | } // namespace vllm 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /csrc/rocm/ops.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums, 6 | torch::Tensor& max_logits, torch::Tensor& tmp_out, 7 | torch::Tensor& query, torch::Tensor& key_cache, 8 | torch::Tensor& value_cache, int64_t num_kv_heads, 9 | double scale, torch::Tensor& block_tables, 10 | torch::Tensor& context_lens, int64_t block_size, 11 | int64_t max_context_len, 12 | const c10::optional& alibi_slopes, 13 | const std::string& kv_cache_dtype, double k_scale, 14 | double v_scale); 15 | 
-------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # vLLM documents 2 | 3 | ## Build the docs 4 | 5 | ```bash 6 | # Install dependencies. 7 | pip install -r requirements-docs.txt 8 | 9 | # Build the docs. 10 | make clean 11 | make html 12 | ``` 13 | 14 | ## Open the docs with your browser 15 | 16 | ```bash 17 | python -m http.server -d build/html/ 18 | ``` 19 | Launch your browser and open localhost:8000. 20 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 
21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | sphinx==6.2.1 2 | sphinx-book-theme==1.0.1 3 | sphinx-copybutton==0.5.2 4 | myst-parser==2.0.0 5 | sphinx-argparse==0.4.0 6 | msgspec 7 | cloudpickle 8 | 9 | # packages to install to build the documentation 10 | pydantic >= 2.8 11 | -f https://download.pytorch.org/whl/cpu 12 | torch 13 | py-cpuinfo 14 | transformers 15 | mistral_common >= 1.3.4 16 | aiohttp 17 | starlette 18 | openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args 19 | partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args -------------------------------------------------------------------------------- /docs/source/_static/custom.js: -------------------------------------------------------------------------------- 1 | document.addEventListener("DOMContentLoaded", function () { 2 | var script = document.createElement("script"); 3 | script.type = "module"; 4 | script.id = "runllm-widget-script" 5 | 6 | script.src = "https://widget.runllm.com"; 7 | 8 | script.setAttribute("version", "stable"); 9 | script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget. 10 | script.setAttribute("runllm-name", "vLLM"); 11 | script.setAttribute("runllm-position", "BOTTOM_RIGHT"); 12 | script.setAttribute("runllm-position-y", "20%"); 13 | script.setAttribute("runllm-position-x", "3%"); 14 | script.setAttribute("runllm-assistant-id", "207"); 15 | 16 | script.async = true; 17 | document.head.appendChild(script); 18 | }); -------------------------------------------------------------------------------- /docs/source/_templates/sections/header.html: -------------------------------------------------------------------------------- 1 | 36 | 37 |
38 | You are viewing the latest developer preview docs. Click here to view docs for the latest stable release. 39 |
40 | -------------------------------------------------------------------------------- /docs/source/assets/design/arch_overview/entrypoints.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/design/arch_overview/entrypoints.excalidraw.png -------------------------------------------------------------------------------- /docs/source/assets/design/arch_overview/llm_engine.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/design/arch_overview/llm_engine.excalidraw.png -------------------------------------------------------------------------------- /docs/source/assets/design/hierarchy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/design/hierarchy.png -------------------------------------------------------------------------------- /docs/source/assets/dev/dockerfile-stages-dependency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/dev/dockerfile-stages-dependency.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/k_vecs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/kernel/k_vecs.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/key.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/kernel/key.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/logits_vec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/kernel/logits_vec.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/q_vecs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/kernel/q_vecs.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/query.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/kernel/query.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/v_vec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/kernel/v_vec.png -------------------------------------------------------------------------------- 
/docs/source/assets/kernel/value.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/kernel/value.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-only-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/logos/vllm-logo-only-light.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-text-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/logos/vllm-logo-text-dark.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-text-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/logos/vllm-logo-text-light.png -------------------------------------------------------------------------------- /docs/source/community/sponsors.md: -------------------------------------------------------------------------------- 1 | # Sponsors 2 | 3 | vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support! 4 | 5 | 6 | 7 | 8 | - a16z 9 | - AMD 10 | - Anyscale 11 | - AWS 12 | - Crusoe Cloud 13 | - Databricks 14 | - DeepInfra 15 | - Dropbox 16 | - Google Cloud 17 | - Lambda Lab 18 | - Nebius 19 | - NVIDIA 20 | - Replicate 21 | - Roblox 22 | - RunPod 23 | - Sequoia Capital 24 | - Skywork AI 25 | - Trainy 26 | - UC Berkeley 27 | - UC San Diego 28 | - ZhenFund 29 | 30 | We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM. 31 | -------------------------------------------------------------------------------- /docs/source/design/input_processing/input_processing_pipeline.rst: -------------------------------------------------------------------------------- 1 | .. _input_processing_pipeline: 2 | 3 | Input Processing Pipeline 4 | ========================= 5 | 6 | 1. Input data is passed to :class:`~vllm.LLMEngine` (or :class:`~vllm.AsyncLLMEngine`). 7 | 8 | 2. Tokenize the data if necessary. 9 | 10 | 3. Process the inputs using :meth:`INPUT_REGISTRY.process_input `. 11 | 12 | - For example, add placeholder tokens to reserve KV cache for multi-modal embeddings. 13 | 14 | 4. Send the processed inputs to :class:`~vllm.executor.executor_base.ExecutorBase`. 15 | 16 | 5. Distribute the inputs via :class:`~vllm.worker.worker_base.WorkerBase` to :class:`~vllm.worker.model_runner_base.ModelRunnerBase`. 17 | 18 | 6. If the data contains multi-modal data, convert it into keyword arguments using :meth:`MULTIMODAL_REGISTRY.map_input `. 19 | 20 | - For example, convert a :class:`PIL.Image.Image` input to its pixel values for a vision model. 
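The pipeline above runs automatically whenever a prompt is submitted to the engine. As a minimal end-to-end sketch from the user side (assuming a LLaVA-style vision-language model; the model name, prompt format, and image path below are illustrative placeholders based on vLLM's multi-modal examples):

.. code-block:: python

   from PIL import Image

   from vllm import LLM, SamplingParams

   # Assumed model; any vLLM-supported vision-language model works similarly.
   llm = LLM(model="llava-hf/llava-1.5-7b-hf")
   image = Image.open("example.jpg")  # placeholder image path

   # Step 1: this dict is the input data handed to the LLMEngine.
   # Steps 2-3 tokenize the prompt and reserve placeholder tokens for the image;
   # step 6 later maps the PIL image to pixel values for the vision encoder.
   outputs = llm.generate(
       {
           "prompt": "USER: <image>\nWhat is shown in this image?\nASSISTANT:",
           "multi_modal_data": {"image": image},
       },
       SamplingParams(temperature=0.0, max_tokens=64),
   )
   print(outputs[0].outputs[0].text)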
21 | -------------------------------------------------------------------------------- /docs/source/design/input_processing/model_inputs_index.rst: -------------------------------------------------------------------------------- 1 | .. _input_processing: 2 | 3 | Input Processing 4 | ================ 5 | 6 | .. currentmodule:: vllm.inputs 7 | 8 | Each model can override parts of vLLM's :ref:`input processing pipeline ` via 9 | :data:`~vllm.inputs.INPUT_REGISTRY` and :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`. 10 | 11 | Currently, this mechanism is only utilized in :ref:`multi-modal ` models for preprocessing multi-modal input 12 | data in addition to input prompt, but it can be extended to text-only language models when needed. 13 | 14 | Guides 15 | ++++++ 16 | 17 | .. toctree:: 18 | :maxdepth: 1 19 | 20 | input_processing_pipeline 21 | 22 | Module Contents 23 | +++++++++++++++ 24 | 25 | LLM Engine Inputs 26 | ----------------- 27 | 28 | .. autoclass:: vllm.inputs.DecoderOnlyInputs 29 | :members: 30 | :show-inheritance: 31 | 32 | Registry 33 | -------- 34 | 35 | .. autodata:: vllm.inputs.INPUT_REGISTRY 36 | 37 | .. automodule:: vllm.inputs.registry 38 | :members: 39 | :show-inheritance: 40 | -------------------------------------------------------------------------------- /docs/source/design/multimodal/adding_multimodal_plugin.rst: -------------------------------------------------------------------------------- 1 | .. _adding_multimodal_plugin: 2 | 3 | Adding a Multimodal Plugin 4 | ========================== 5 | 6 | This document teaches you how to add a new modality to vLLM. 7 | 8 | Each modality in vLLM is represented by a :class:`~vllm.multimodal.MultiModalPlugin` and registered to :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`. 9 | For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to :meth:`~vllm.multimodal.MultiModalRegistry.register_plugin`. 10 | 11 | The remainder of this document details how to define custom :class:`~vllm.multimodal.MultiModalPlugin` s. 12 | 13 | .. note:: 14 | This article is a work in progress. 15 | 16 | .. 17 | TODO: Add more instructions on how to add new plugins once embeddings is in. 18 | -------------------------------------------------------------------------------- /docs/source/dev/engine/async_llm_engine.rst: -------------------------------------------------------------------------------- 1 | AsyncLLMEngine 2 | ================================= 3 | 4 | .. autoclass:: vllm.AsyncLLMEngine 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/source/dev/engine/engine_index.rst: -------------------------------------------------------------------------------- 1 | vLLM Engine 2 | ================================= 3 | 4 | .. automodule:: vllm.engine 5 | .. currentmodule:: vllm.engine 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | :caption: Engines 10 | 11 | llm_engine 12 | async_llm_engine 13 | 14 | -------------------------------------------------------------------------------- /docs/source/dev/engine/llm_engine.rst: -------------------------------------------------------------------------------- 1 | LLMEngine 2 | ================================= 3 | 4 | .. 
autoclass:: vllm.LLMEngine 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/source/dev/offline_inference/llm.rst: -------------------------------------------------------------------------------- 1 | LLM Class 2 | ========= 3 | 4 | .. autoclass:: vllm.LLM 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/source/dev/offline_inference/llm_inputs.rst: -------------------------------------------------------------------------------- 1 | LLM Inputs 2 | ========== 3 | 4 | .. autodata:: vllm.inputs.PromptType 5 | 6 | .. autoclass:: vllm.inputs.TextPrompt 7 | :show-inheritance: 8 | :members: 9 | :member-order: bysource 10 | 11 | .. autoclass:: vllm.inputs.TokensPrompt 12 | :show-inheritance: 13 | :members: 14 | :member-order: bysource 15 | -------------------------------------------------------------------------------- /docs/source/dev/offline_inference/offline_index.rst: -------------------------------------------------------------------------------- 1 | Offline Inference 2 | ================================= 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | 7 | llm 8 | llm_inputs 9 | -------------------------------------------------------------------------------- /docs/source/dev/pooling_params.rst: -------------------------------------------------------------------------------- 1 | Pooling Parameters 2 | ================== 3 | 4 | .. autoclass:: vllm.PoolingParams 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/dev/sampling_params.rst: -------------------------------------------------------------------------------- 1 | Sampling Parameters 2 | =================== 3 | 4 | .. autoclass:: vllm.SamplingParams 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/getting_started/examples/examples_index.template.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | ================================= 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | :caption: Scripts 7 | 8 | %EXAMPLE_DOCS% 9 | -------------------------------------------------------------------------------- /docs/source/models/engine_args.rst: -------------------------------------------------------------------------------- 1 | .. _engine_args: 2 | 3 | Engine Arguments 4 | ================ 5 | 6 | Below, you can find an explanation of every engine argument for vLLM: 7 | 8 | .. argparse:: 9 | :module: vllm.engine.arg_utils 10 | :func: _engine_args_parser 11 | :prog: vllm serve 12 | :nodefaultconst: 13 | 14 | Async Engine Arguments 15 | ---------------------- 16 | 17 | Below are the additional arguments related to the asynchronous engine: 18 | 19 | .. argparse:: 20 | :module: vllm.engine.arg_utils 21 | :func: _async_engine_args_parser 22 | :prog: vllm serve 23 | :nodefaultconst: -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_bentoml.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_bentoml: 2 | 3 | Deploying with BentoML 4 | ====================== 5 | 6 | `BentoML `_ allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes.
7 | 8 | For details, see the tutorial `vLLM inference in the BentoML documentation `_. -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_kserve.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_kserve: 2 | 3 | Deploying with KServe 4 | ============================ 5 | 6 | vLLM can be deployed with `KServe `_ on Kubernetes for highly scalable distributed model serving. 7 | 8 | Please see `this guide `_ for more details on using vLLM with KServe. 9 | -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_lws.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_lws: 2 | 3 | Deploying with LWS 4 | ============================ 5 | 6 | LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads. 7 | A major use case is for multi-host/multi-node distributed inference. 8 | 9 | vLLM can be deployed with `LWS `_ on Kubernetes for distributed model serving. 10 | 11 | Please see `this guide `_ for more details on 12 | deploying vLLM on Kubernetes using LWS. 13 | -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_triton.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_triton: 2 | 3 | Deploying with NVIDIA Triton 4 | ============================ 5 | 6 | The `Triton Inference Server `_ hosts a tutorial demonstrating how to quickly deploy a simple `facebook/opt-125m `_ model using vLLM. Please see `Deploying a vLLM model in Triton `_ for more details. 7 | -------------------------------------------------------------------------------- /docs/source/serving/env_vars.rst: -------------------------------------------------------------------------------- 1 | Environment Variables 2 | ======================== 3 | 4 | vLLM uses the following environment variables to configure the system: 5 | 6 | .. warning:: 7 | Please note that ``VLLM_PORT`` and ``VLLM_HOST_IP`` set the port and IP address for vLLM's **internal usage**. They are not the port and IP address of the API server. If you use ``--host $VLLM_HOST_IP`` and ``--port $VLLM_PORT`` to start the API server, it will not work. 8 | 9 | All environment variables used by vLLM are prefixed with ``VLLM_``. **Special care should be taken for Kubernetes users**: please do not name the service ``vllm``, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because `Kubernetes sets environment variables for each service with the capitalized service name as the prefix `_. 10 | 11 | .. literalinclude:: ../../../vllm/envs.py 12 | :language: python 13 | :start-after: begin-env-vars-definition 14 | :end-before: end-env-vars-definition 15 | -------------------------------------------------------------------------------- /docs/source/serving/integrations.rst: -------------------------------------------------------------------------------- 1 | Integrations 2 | ------------ 3 | 4 | ..
toctree:: 5 | :maxdepth: 1 6 | 7 | run_on_sky 8 | deploying_with_kserve 9 | deploying_with_triton 10 | deploying_with_bentoml 11 | deploying_with_cerebrium 12 | deploying_with_lws 13 | deploying_with_dstack 14 | serving_with_langchain 15 | serving_with_llamaindex 16 | serving_with_llamastack 17 | -------------------------------------------------------------------------------- /docs/source/serving/serving_with_langchain.rst: -------------------------------------------------------------------------------- 1 | .. _run_on_langchain: 2 | 3 | Serving with Langchain 4 | ============================ 5 | 6 | vLLM is also available via `Langchain `_. 7 | 8 | To install langchain, run 9 | 10 | .. code-block:: console 11 | 12 | $ pip install langchain langchain_community -q 13 | 14 | To run inference on a single GPU or multiple GPUs, use the ``VLLM`` class from ``langchain``. 15 | 16 | .. code-block:: python 17 | 18 | from langchain_community.llms import VLLM 19 | 20 | llm = VLLM(model="mosaicml/mpt-7b", 21 | trust_remote_code=True, # mandatory for hf models 22 | max_new_tokens=128, 23 | top_k=10, 24 | top_p=0.95, 25 | temperature=0.8, 26 | # tensor_parallel_size=... # for distributed inference 27 | ) 28 | 29 | print(llm("What is the capital of France ?")) 30 | 31 | Please refer to this `Tutorial `_ for more details. 32 | -------------------------------------------------------------------------------- /docs/source/serving/serving_with_llamaindex.rst: -------------------------------------------------------------------------------- 1 | .. _run_on_llamaindex: 2 | 3 | Serving with llama_index 4 | ============================ 5 | 6 | vLLM is also available via `llama_index `_. 7 | 8 | To install llamaindex, run 9 | 10 | .. code-block:: console 11 | 12 | $ pip install llama-index-llms-vllm -q 13 | 14 | To run inference on a single GPU or multiple GPUs, use the ``Vllm`` class from ``llamaindex``. 15 | 16 | .. code-block:: python 17 | 18 | from llama_index.llms.vllm import Vllm 19 | 20 | llm = Vllm( 21 | model="microsoft/Orca-2-7b", 22 | tensor_parallel_size=4, 23 | max_new_tokens=100, 24 | vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5}, 25 | ) 26 | 27 | Please refer to this `Tutorial `_ for more details. 28 | -------------------------------------------------------------------------------- /docs/source/serving/tensorizer.rst: -------------------------------------------------------------------------------- 1 | .. _tensorizer: 2 | 3 | Loading Models with CoreWeave's Tensorizer 4 | ========================================== 5 | vLLM supports loading models with `CoreWeave's Tensorizer `_. 6 | vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or an S3 endpoint can be deserialized 7 | at runtime extremely quickly, directly to the GPU, resulting in significantly 8 | shorter Pod startup times and lower CPU memory usage. Tensor encryption is also supported. 9 | 10 | For more information on CoreWeave's Tensorizer, please refer to 11 | `CoreWeave's Tensorizer documentation `_. For more information on serializing a vLLM model, as well as a general usage guide to using Tensorizer with vLLM, see 12 | the `vLLM example script `_. 13 | 14 | .. note:: 15 | Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`.
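As a minimal loading sketch (assuming a model has already been serialized with Tensorizer; the S3 URI below is a hypothetical placeholder, and ``TensorizerConfig`` follows the usage shown in vLLM's tensorizer example script):

.. code-block:: python

   from vllm import LLM
   from vllm.model_executor.model_loader.tensorizer import TensorizerConfig

   # Hypothetical location of previously serialized model tensors.
   tensorizer_uri = "s3://my-bucket/vllm/facebook/opt-125m/v1.0.0/model.tensors"

   llm = LLM(
       model="facebook/opt-125m",
       load_format="tensorizer",
       model_loader_extra_config=TensorizerConfig(tensorizer_uri=tensorizer_uri),
   )
   print(llm.generate("Hello, my name is")[0].outputs[0].text)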
16 | -------------------------------------------------------------------------------- /examples/cpu_offload.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | # Sample prompts. 4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | # Create a sampling params object. 11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 12 | 13 | # Create an LLM. 14 | llm = LLM(model="meta-llama/Llama-2-13b-chat-hf", cpu_offload_gb=10) 15 | # Generate texts from the prompts. The output is a list of RequestOutput objects 16 | # that contain the prompt, generated text, and other information. 17 | outputs = llm.generate(prompts, sampling_params) 18 | # Print the outputs. 19 | for output in outputs: 20 | prompt = output.prompt 21 | generated_text = output.outputs[0].text 22 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 23 | -------------------------------------------------------------------------------- /examples/offline_inference.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | # Sample prompts. 4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | # Create a sampling params object. 11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 12 | 13 | # Create an LLM. 14 | llm = LLM(model="facebook/opt-125m") 15 | # Generate texts from the prompts. The output is a list of RequestOutput objects 16 | # that contain the prompt, generated text, and other information. 17 | outputs = llm.generate(prompts, sampling_params) 18 | # Print the outputs. 19 | for output in outputs: 20 | prompt = output.prompt 21 | generated_text = output.outputs[0].text 22 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") -------------------------------------------------------------------------------- /examples/offline_inference_arctic.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | # Sample prompts. 4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | # Create a sampling params object. 11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 12 | 13 | # Create an LLM. 14 | llm = LLM(model="snowflake/snowflake-arctic-instruct", 15 | quantization="deepspeedfp", 16 | tensor_parallel_size=8, 17 | trust_remote_code=True) 18 | # Generate texts from the prompts. The output is a list of RequestOutput objects 19 | # that contain the prompt, generated text, and other information. 20 | 21 | outputs = llm.generate(prompts, sampling_params) 22 | # Print the outputs. 23 | for output in outputs: 24 | prompt = output.prompt 25 | generated_text = output.outputs[0].text 26 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 27 | -------------------------------------------------------------------------------- /examples/offline_inference_embedding.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM 2 | 3 | # Sample prompts. 
4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | 11 | # Create an LLM. 12 | model = LLM(model="intfloat/e5-mistral-7b-instruct", enforce_eager=True) 13 | # Generate embeddings. The output is a list of PoolingRequestOutputs. 14 | outputs = model.encode(prompts) 15 | # Print the outputs. 16 | for output in outputs: 17 | print(output.outputs.embedding) # list of 4096 floats 18 | -------------------------------------------------------------------------------- /examples/offline_inference_tpu.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | prompts = [ 4 | "A robot may not injure a human being", 5 | "It is only with the heart that one can see rightly;", 6 | "The greatest glory in living lies not in never falling,", 7 | ] 8 | answers = [ 9 | " or, through inaction, allow a human being to come to harm.", 10 | " what is essential is invisible to the eye.", 11 | " but in rising every time we fall.", 12 | ] 13 | N = 1 14 | # Currently, top-p sampling is disabled. `top_p` should be 1.0. 15 | sampling_params = SamplingParams(temperature=0.7, 16 | top_p=1.0, 17 | n=N, 18 | max_tokens=16) 19 | 20 | # Set `enforce_eager=True` to avoid ahead-of-time compilation. 21 | # In real workloads, `enforce_eager` should be `False`. 22 | llm = LLM(model="google/gemma-2b", enforce_eager=True) 23 | outputs = llm.generate(prompts, sampling_params) 24 | for output, answer in zip(outputs, answers): 25 | prompt = output.prompt 26 | generated_text = output.outputs[0].text 27 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 28 | assert generated_text.startswith(answer) 29 | -------------------------------------------------------------------------------- /examples/offline_inference_with_profiler.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from vllm import LLM, SamplingParams 4 | 5 | # Enable the torch profiler; this can also be set on the command line. 6 | os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile" 7 | 8 | # Sample prompts. 9 | prompts = [ 10 | "Hello, my name is", 11 | "The president of the United States is", 12 | "The capital of France is", 13 | "The future of AI is", 14 | ] 15 | # Create a sampling params object. 16 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 17 | 18 | # Create an LLM. 19 | llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1) 20 | 21 | llm.start_profile() 22 | 23 | # Generate texts from the prompts. The output is a list of RequestOutput objects 24 | # that contain the prompt, generated text, and other information. 25 | outputs = llm.generate(prompts, sampling_params) 26 | 27 | llm.stop_profile() 28 | 29 | # Print the outputs. 30 | for output in outputs: 31 | prompt = output.prompt 32 | generated_text = output.outputs[0].text 33 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 34 | -------------------------------------------------------------------------------- /examples/openai_chat_completion_client.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server.
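# This example assumes an OpenAI-compatible vLLM server is already running
# locally on port 8000, e.g. started with (the model name is a placeholder):
#   vllm serve <your-model> --port 8000
# or equivalently:
#   python -m vllm.entrypoints.openai.api_server --model <your-model> --port 8000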
4 | openai_api_key = "EMPTY" 5 | openai_api_base = "http://localhost:8000/v1" 6 | 7 | client = OpenAI( 8 | # defaults to os.environ.get("OPENAI_API_KEY") 9 | api_key=openai_api_key, 10 | base_url=openai_api_base, 11 | ) 12 | 13 | models = client.models.list() 14 | model = models.data[0].id 15 | 16 | chat_completion = client.chat.completions.create( 17 | messages=[{ 18 | "role": "system", 19 | "content": "You are a helpful assistant." 20 | }, { 21 | "role": "user", 22 | "content": "Who won the world series in 2020?" 23 | }, { 24 | "role": 25 | "assistant", 26 | "content": 27 | "The Los Angeles Dodgers won the World Series in 2020." 28 | }, { 29 | "role": "user", 30 | "content": "Where was it played?" 31 | }], 32 | model=model, 33 | ) 34 | 35 | print("Chat completion results:") 36 | print(chat_completion) 37 | -------------------------------------------------------------------------------- /examples/openai_completion_client.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 4 | openai_api_key = "EMPTY" 5 | openai_api_base = "http://localhost:8000/v1" 6 | 7 | client = OpenAI( 8 | # defaults to os.environ.get("OPENAI_API_KEY") 9 | api_key=openai_api_key, 10 | base_url=openai_api_base, 11 | ) 12 | 13 | models = client.models.list() 14 | model = models.data[0].id 15 | 16 | # Completion API 17 | stream = False 18 | completion = client.completions.create( 19 | model=model, 20 | prompt="A robot may not injure a human being", 21 | echo=False, 22 | n=2, 23 | stream=stream, 24 | logprobs=3) 25 | 26 | print("Completion results:") 27 | if stream: 28 | for c in completion: 29 | print(c) 30 | else: 31 | print(completion) 32 | -------------------------------------------------------------------------------- /examples/openai_embedding_client.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 
4 | openai_api_key = "EMPTY" 5 | openai_api_base = "http://localhost:8000/v1" 6 | 7 | client = OpenAI( 8 | # defaults to os.environ.get("OPENAI_API_KEY") 9 | api_key=openai_api_key, 10 | base_url=openai_api_base, 11 | ) 12 | 13 | models = client.models.list() 14 | model = models.data[0].id 15 | 16 | responses = client.embeddings.create( 17 | input=[ 18 | "Hello my name is", 19 | "The best thing about vLLM is that it supports many different models" 20 | ], 21 | model=model, 22 | ) 23 | 24 | for data in responses.data: 25 | print(data.embedding) # list of float of len 4096 26 | -------------------------------------------------------------------------------- /examples/openai_example_batch.jsonl: -------------------------------------------------------------------------------- 1 | {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} 2 | {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} 3 | -------------------------------------------------------------------------------- /examples/production_monitoring/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # docker-compose.yaml 2 | version: "3" 3 | 4 | services: 5 | prometheus: 6 | image: prom/prometheus:latest 7 | extra_hosts: 8 | - "host.docker.internal:host-gateway" # allow a direct connection from container to the local machine 9 | ports: 10 | - "9090:9090" # the default port used by Prometheus 11 | volumes: 12 | - ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml # mount Prometheus config file 13 | 14 | grafana: 15 | image: grafana/grafana:latest 16 | depends_on: 17 | - prometheus 18 | ports: 19 | - "3000:3000" # the default port used by Grafana 20 | -------------------------------------------------------------------------------- /examples/production_monitoring/prometheus.yaml: -------------------------------------------------------------------------------- 1 | # prometheus.yaml 2 | global: 3 | scrape_interval: 5s 4 | evaluation_interval: 30s 5 | 6 | scrape_configs: 7 | - job_name: vllm 8 | static_configs: 9 | - targets: 10 | - 'host.docker.internal:8000' 11 | -------------------------------------------------------------------------------- /examples/template_alpaca.jinja: -------------------------------------------------------------------------------- 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 2 | 3 | {% for message in messages %} 4 | {% if message['role'] == 'user' %} 5 | ### Instruction: 6 | {{ message['content']|trim -}} 7 | {% if not loop.last %} 8 | 9 | 10 | {% endif %} 11 | {% elif message['role'] == 'assistant' %} 12 | ### Response: 13 | {{ message['content']|trim -}} 14 | {% if not loop.last %} 15 | 16 | 17 | {% endif %} 18 | {% elif message['role'] == 'user_context' %} 19 | ### Input: 20 | {{ message['content']|trim -}} 21 | {% if not loop.last %} 22 | 23 | 24 | {% endif %} 25 | {% endif %} 26 | {% endfor %} 27 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} 28 | ### Response: 29 | {% 
endif %} -------------------------------------------------------------------------------- /examples/template_baichuan.jinja: -------------------------------------------------------------------------------- 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 2 | 3 | {%- for message in messages -%} 4 | {%- if message['role'] == 'user' -%} 5 | {{- '' + message['content'] -}} 6 | {%- elif message['role'] == 'assistant' -%} 7 | {{- '' + message['content'] -}} 8 | {%- endif -%} 9 | {%- endfor -%} 10 | 11 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 12 | {{- '' -}} 13 | {% endif %} -------------------------------------------------------------------------------- /examples/template_blip2.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {%- if message['role'] == 'user' -%} 3 | {{- 'Question: ' + message['content'] + ' ' -}} 4 | {%- elif message['role'] == 'assistant' -%} 5 | {{- 'Answer: ' + message['content'] + ' ' -}} 6 | {%- endif -%} 7 | {%- endfor -%} 8 | 9 | {%- if add_generation_prompt -%} 10 | {{- 'Answer:' -}} 11 | {% endif %} 12 | -------------------------------------------------------------------------------- /examples/template_chatglm.jinja: -------------------------------------------------------------------------------- 1 | {%- set counter = namespace(index=0) -%} 2 | {%- for message in messages -%} 3 | {%- if message['role'] == 'user' -%} 4 | {{- '[Round ' + counter.index|string + ']\n问:' + message['content'] -}} 5 | {%- set counter.index = counter.index + 1 -%} 6 | {%- endif -%} 7 | {%- if message['role'] == 'assistant' -%} 8 | {{- '\n答:' + message['content'] -}} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n' -}} 11 | {%- endif -%} 12 | {%- endif -%} 13 | {%- endfor -%} 14 | 15 | 16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 17 | {{- '\n答:' -}} 18 | {%- endif -%} -------------------------------------------------------------------------------- /examples/template_chatglm2.jinja: -------------------------------------------------------------------------------- 1 | {%- set counter = namespace(index=1) -%} 2 | {%- for message in messages -%} 3 | {%- if message['role'] == 'user' -%} 4 | {{- '[Round ' + counter.index|string + ']\n\n问:' + message['content'] -}} 5 | {%- set counter.index = counter.index + 1 -%} 6 | {%- endif -%} 7 | {%- if message['role'] == 'assistant' -%} 8 | {{- '\n\n答:' + message['content'] -}} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n\n' -}} 11 | {%- endif -%} 12 | {%- endif -%} 13 | {%- endfor -%} 14 | 15 | 16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 17 | {{- '\n\n答:' -}} 18 | {%- endif -%} -------------------------------------------------------------------------------- /examples/template_chatml.jinja: -------------------------------------------------------------------------------- 1 | {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} 2 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} -------------------------------------------------------------------------------- /examples/template_dse_qwen2_vl.jinja: 
-------------------------------------------------------------------------------- 1 | {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}{% raw %}<|im_start|>system 2 | You are a helpful assistant.<|im_end|> 3 | {% endraw %}{% endif %}<|im_start|>{{ message['role'] }}{% raw %} 4 | {% endraw %}{% if message['content'] is string %}{{ message['content'] }}<|im_end|>{% raw %} 5 | {% endraw %}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>{% raw %} 6 | {% endraw %}{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant{% raw %} 7 | {% endraw %}{% endif %}<|endoftext|> -------------------------------------------------------------------------------- /examples/template_falcon.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {%- if message['role'] == 'user' -%} 3 | {{- 'User: ' + message['content'] -}} 4 | {%- elif message['role'] == 'assistant' -%} 5 | {{- 'Assistant: ' + message['content'] -}} 6 | {%- endif -%} 7 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 8 | {{- '\n' -}} 9 | {%- endif -%} 10 | {%- endfor -%} 11 | 12 | 13 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 14 | {{- 'Assistant:' -}} 15 | {% endif %} -------------------------------------------------------------------------------- /examples/template_falcon_180b.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {%- if message['role'] == 'system' -%} 3 | {{- 'System: ' + message['content'] -}} 4 | {%- elif message['role'] == 'user' -%} 5 | {{- 'User: ' + message['content'] -}} 6 | {%- elif message['role'] == 'assistant' -%} 7 | {{- 'Falcon: ' + message['content'] -}} 8 | {%- endif -%} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n' -}} 11 | {%- endif -%} 12 | {%- endfor -%} 13 | 14 | 15 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 16 | {{- 'Falcon:' -}} 17 | {% endif %} -------------------------------------------------------------------------------- /examples/template_inkbot.jinja: -------------------------------------------------------------------------------- 1 | <#meta#> 2 | - Date: {{ (messages|selectattr('role', 'equalto', 'meta-current_date')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-current_date')|list) else '' }} 3 | - Task: {{ (messages|selectattr('role', 'equalto', 'meta-task_name')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-task_name')|list) else '' }} 4 | <#system#> 5 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 6 | <#chat#> 7 | {% for message in messages %} 8 | {% if message['role'] == 'user' %} 9 | <#user#> 10 | {{ 
message['content']|trim -}} 11 | {% if not loop.last %} 12 | 13 | {% endif %} 14 | {% elif message['role'] == 'assistant' %} 15 | <#bot#> 16 | {{ message['content']|trim -}} 17 | {% if not loop.last %} 18 | 19 | {% endif %} 20 | {% elif message['role'] == 'user_context' %} 21 | <#user_context#> 22 | {{ message['content']|trim -}} 23 | {% if not loop.last %} 24 | 25 | {% endif %} 26 | {% endif %} 27 | {% endfor %} 28 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} 29 | <#bot#> 30 | {% endif %} -------------------------------------------------------------------------------- /examples/template_llava.jinja: -------------------------------------------------------------------------------- 1 | {%- if messages[0]['role'] == 'system' -%} 2 | {%- set system_message = messages[0]['content'] -%} 3 | {%- set messages = messages[1:] -%} 4 | {%- else -%} 5 | {% set system_message = '' -%} 6 | {%- endif -%} 7 | 8 | {{ bos_token + system_message }} 9 | {%- for message in messages -%} 10 | {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%} 11 | {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} 12 | {%- endif -%} 13 | 14 | {%- if message['role'] == 'user' -%} 15 | {{ 'USER: ' + message['content'] + '\n' }} 16 | {%- elif message['role'] == 'assistant' -%} 17 | {{ 'ASSISTANT: ' + message['content'] + eos_token + '\n' }} 18 | {%- endif -%} 19 | {%- endfor -%} 20 | 21 | {%- if add_generation_prompt -%} 22 | {{ 'ASSISTANT:' }} 23 | {% endif %} 24 | -------------------------------------------------------------------------------- /examples/template_vlm2vec.jinja: -------------------------------------------------------------------------------- 1 | {%- if messages | length > 1 -%} 2 | {{ raise_exception('Embedding models should only embed one message at a time') }} 3 | {%- endif -%} 4 | 5 | {% set vars = namespace(parts=[], next_image_id=1) %} 6 | {%- for message in messages -%} 7 | {%- for content in message['content'] -%} 8 | {%- if content['type'] == 'text' -%} 9 | {%- set vars.parts = vars.parts + [content['text']] %} 10 | {%- elif content['type'] == 'image' -%} 11 | {%- set vars.parts = vars.parts + ['<|image_{i:d}|>'.format(i=vars.next_image_id)] %} 12 | {%- set vars.next_image_id = vars.next_image_id + 1 %} 13 | {%- endif -%} 14 | {%- endfor -%} 15 | {%- endfor -%} 16 | {{ vars.parts | join(' ') }} 17 | -------------------------------------------------------------------------------- /find_cuda_init.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import traceback 3 | from typing import Callable 4 | from unittest.mock import patch 5 | 6 | 7 | def find_cuda_init(fn: Callable[[], object]) -> None: 8 | """ 9 | Helper function to debug CUDA re-initialization errors. 10 | 11 | If `fn` initializes CUDA, prints the stack trace of how this happens. 
12 | """ 13 | from torch.cuda import _lazy_init 14 | 15 | stack = None 16 | 17 | def wrapper(): 18 | nonlocal stack 19 | stack = traceback.extract_stack() 20 | return _lazy_init() 21 | 22 | with patch("torch.cuda._lazy_init", wrapper): 23 | fn() 24 | 25 | if stack is not None: 26 | print("==== CUDA Initialized ====") 27 | print("".join(traceback.format_list(stack)).strip()) 28 | print("==========================") 29 | 30 | 31 | if __name__ == "__main__": 32 | find_cuda_init( 33 | lambda: importlib.import_module("vllm.model_executor.models.llava")) 34 | -------------------------------------------------------------------------------- /requirements-build.txt: -------------------------------------------------------------------------------- 1 | # Should be mirrored in pyproject.toml 2 | cmake>=3.26 3 | ninja 4 | packaging 5 | setuptools>=61 6 | setuptools-scm>=8 7 | torch==2.5.1 8 | wheel 9 | jinja2 10 | -------------------------------------------------------------------------------- /requirements-cpu.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for CPUs 5 | torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" 6 | torch==2.5.1; platform_machine == "aarch64" 7 | torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch -------------------------------------------------------------------------------- /requirements-cuda.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for NVIDIA GPUs 5 | ray >= 2.9 6 | nvidia-ml-py >= 12.560.30 # for pynvml package 7 | torch == 2.5.1 8 | # These must be updated alongside torch 9 | torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version 10 | xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1 11 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements-lint.txt 2 | -r requirements-test.txt 3 | 4 | # Avoid adding requirements directly to this file. 5 | # Instead, modify the two files referenced above. 
6 | -------------------------------------------------------------------------------- /requirements-hpu.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for HPU code 5 | ray 6 | triton 7 | pandas 8 | tabulate 9 | setuptools>=61 10 | setuptools-scm>=8 11 | vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@fd7f2e6 12 | -------------------------------------------------------------------------------- /requirements-lint.txt: -------------------------------------------------------------------------------- 1 | # formatting 2 | yapf==0.32.0 3 | toml==0.10.2 4 | tomli==2.0.2 5 | ruff==0.6.5 6 | codespell==2.3.0 7 | isort==5.13.2 8 | clang-format==18.1.5 9 | sphinx-lint==1.0.0 10 | 11 | # type checking 12 | mypy==1.11.1 13 | types-PyYAML 14 | types-requests 15 | types-setuptools 16 | -------------------------------------------------------------------------------- /requirements-neuron.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for Neuron devices 5 | transformers-neuronx >= 0.12.0 6 | torch-neuronx >= 2.1.2 7 | neuronx-cc 8 | -------------------------------------------------------------------------------- /requirements-openvino.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | torch == 2.5.1 # should be aligned with "common" vLLM torch version 5 | openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention 6 | 7 | optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version 8 | optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git@main # latest optimum-intel is used to support latest transformers version 9 | -------------------------------------------------------------------------------- /requirements-rocm.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for AMD GPUs 5 | awscli 6 | boto3 7 | botocore 8 | ray >= 2.10.0 9 | peft 10 | pytest-asyncio 11 | tensorizer>=2.9.0 -------------------------------------------------------------------------------- /requirements-test.in: -------------------------------------------------------------------------------- 1 | # testing 2 | pytest 3 | tensorizer>=2.9.0 4 | pytest-forked 5 | pytest-asyncio 6 | pytest-rerunfailures 7 | pytest-shard 8 | 9 | # testing utils 10 | awscli 11 | decord # required for video tests 12 | einops # required for MPT, qwen-vl and Mamba 13 | httpx 14 | librosa # required for audio tests 15 | peft 16 | ray[adag]==2.35 17 | sentence-transformers # required for embedding tests 18 | soundfile # required for audio tests 19 | timm # required for internvl test 20 | torch==2.5.1 21 | transformers_stream_generator # required for qwen-vl test 22 | matplotlib # required for qwen-vl test 23 | mistral_common[opencv] >= 1.4.4 # required for pixtral test 24 | datamodel_code_generator # required for minicpm3 test 25 | lm-eval[api]==0.4.4 # required for model evaluation test 26 | 27 | # TODO: Add this after fully implementing llava(mantis) 28 | # git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test 29 | 30 | # quantization 31 | bitsandbytes>=0.44.0 32 | 
buildkite-test-collector==0.1.9 33 | 34 | numpy < 2.0.0 35 | -------------------------------------------------------------------------------- /requirements-tpu.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for TPU 5 | cmake>=3.26 6 | ninja 7 | packaging 8 | setuptools-scm>=8 9 | wheel 10 | jinja2 11 | ray[default] 12 | 13 | # Install torch_xla 14 | --pre 15 | --extra-index-url https://download.pytorch.org/whl/nightly/cpu 16 | --find-links https://storage.googleapis.com/libtpu-releases/index.html 17 | --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html 18 | --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html 19 | torch==2.6.0.dev20241126+cpu 20 | torchvision==0.20.0.dev20241126+cpu 21 | torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp310-cp310-linux_x86_64.whl 22 | jaxlib==0.4.36.dev20241122 23 | jax==0.4.36.dev20241122 24 | -------------------------------------------------------------------------------- /requirements-xpu.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | ray >= 2.9 5 | cmake>=3.26 6 | ninja 7 | packaging 8 | setuptools-scm>=8 9 | wheel 10 | jinja2 11 | 12 | torch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp310-cp310-linux_x86_64.whl 13 | intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.5.10%2Bgit9d489a8-cp310-cp310-linux_x86_64.whl 14 | oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp310-cp310-linux_x86_64.whl 15 | 16 | triton-xpu == 3.0.0b1 17 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/__init__.py -------------------------------------------------------------------------------- /tests/async_engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/async_engine/__init__.py -------------------------------------------------------------------------------- /tests/basic_correctness/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/basic_correctness/__init__.py -------------------------------------------------------------------------------- /tests/basic_correctness/test_cpu_offload.py: -------------------------------------------------------------------------------- 1 | from ..utils import compare_two_settings 2 | 3 | 4 | def test_cpu_offload(): 5 | compare_two_settings("meta-llama/Llama-3.2-1B", [], 6 | ["--cpu-offload-gb", "1"]) 7 | -------------------------------------------------------------------------------- /tests/compile/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/compile/__init__.py -------------------------------------------------------------------------------- /tests/compile/piecewise/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/compile/piecewise/__init__.py -------------------------------------------------------------------------------- /tests/compile/test_full_graph.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.config import CompilationLevel 4 | 5 | from ..utils import fork_new_process_for_each_test 6 | from .utils import TEST_MODELS, check_full_graph_support 7 | 8 | 9 | @pytest.mark.parametrize("model_info", TEST_MODELS) 10 | @pytest.mark.parametrize( 11 | "optimization_level", 12 | [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE]) 13 | @fork_new_process_for_each_test 14 | def test_full_graph(model_info, optimization_level): 15 | model = model_info[0] 16 | model_kwargs = model_info[1] 17 | check_full_graph_support(model, 18 | model_kwargs, 19 | optimization_level, 20 | tp_size=1) 21 | -------------------------------------------------------------------------------- /tests/compile/test_pass_manager.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | import pytest 4 | import torch 5 | from torch._inductor.codecache import BypassFxGraphCache 6 | 7 | from vllm.compilation.config import CompilationConfig 8 | from vllm.compilation.inductor_pass import (CallableInductorPass, 9 | as_inductor_pass) 10 | from vllm.compilation.pass_manager import PostGradPassManager 11 | 12 | 13 | def simple_callable(graph: torch.fx.Graph): 14 | pass 15 | 16 | 17 | @as_inductor_pass(files=(__file__, )) 18 | def callable_decorated(graph: torch.fx.Graph): 19 | pass 20 | 21 | 22 | @pytest.mark.parametrize( 23 | "works, callable", 24 | [(False, simple_callable), (True, callable_decorated), 25 | (True, CallableInductorPass(simple_callable, "simple_callable"))]) 26 | def test_pass_manager(works: bool, callable): 27 | config = CompilationConfig().pass_config 28 | pass_manager = PostGradPassManager([callable]) 29 | pass_manager.configure(config) # Adds default passes 30 | 31 | if works: 32 | pickle.dumps(pass_manager) 33 | else: 34 | with pytest.raises(BypassFxGraphCache): 35 | pickle.dumps(pass_manager) 36 | -------------------------------------------------------------------------------- /tests/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/core/__init__.py -------------------------------------------------------------------------------- /tests/core/block/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/core/block/__init__.py -------------------------------------------------------------------------------- /tests/core/block/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture() 5 | def should_do_global_cleanup_after_test() -> bool: 6 | """Disable the global cleanup fixture for tests in this directory. 
This 7 | provides a ~10x speedup for unit tests that don't load a model to GPU. 8 | 9 | This requires that tests in this directory clean up after themselves if they 10 | use the GPU. 11 | """ 12 | return False 13 | -------------------------------------------------------------------------------- /tests/core/block/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/core/block/e2e/__init__.py -------------------------------------------------------------------------------- /tests/data/test_config.yaml: -------------------------------------------------------------------------------- 1 | port: 12312 2 | served_model_name: mymodel 3 | tensor_parallel_size: 2 4 | trust_remote_code: true 5 | multi_step_stream_outputs: false 6 | -------------------------------------------------------------------------------- /tests/distributed/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/distributed/__init__.py -------------------------------------------------------------------------------- /tests/distributed/test_distributed_oot.py: -------------------------------------------------------------------------------- 1 | from ..entrypoints.openai.test_oot_registration import ( 2 | run_and_test_dummy_opt_api_server) 3 | 4 | 5 | def test_distributed_oot(dummy_opt_path: str): 6 | run_and_test_dummy_opt_api_server(dummy_opt_path, tp=2) 7 | -------------------------------------------------------------------------------- /tests/distributed/test_pp_cudagraph.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from ..utils import compare_two_settings, fork_new_process_for_each_test 6 | 7 | 8 | @pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [ 9 | (2, "JackFram/llama-160m"), 10 | ]) 11 | @pytest.mark.parametrize("ATTN_BACKEND", [ 12 | "FLASH_ATTN", 13 | "FLASHINFER", 14 | ]) 15 | @fork_new_process_for_each_test 16 | def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND): 17 | cudagraph_args = [ 18 | # use half precision for speed and memory savings in CI environment 19 | "--dtype", 20 | "float16", 21 | "--pipeline-parallel-size", 22 | str(PP_SIZE), 23 | "--distributed-executor-backend", 24 | "mp", 25 | ] 26 | os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND 27 | 28 | eager_args = cudagraph_args + ["--enforce-eager"] 29 | 30 | compare_two_settings(MODEL_NAME, eager_args, cudagraph_args) 31 | -------------------------------------------------------------------------------- /tests/distributed/test_same_node.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch.distributed as dist 4 | 5 | from vllm.distributed.parallel_state import in_the_same_node_as 6 | 7 | if __name__ == "__main__": 8 | dist.init_process_group(backend="gloo") 9 | test_result = all(in_the_same_node_as(dist.group.WORLD, source_rank=0)) 10 | 11 | expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1" 12 | assert test_result == expected, f"Expected {expected}, got {test_result}" 13 | print("Same node test passed!") 14 | -------------------------------------------------------------------------------- /tests/encoder_decoder/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/encoder_decoder/__init__.py -------------------------------------------------------------------------------- /tests/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/engine/__init__.py -------------------------------------------------------------------------------- /tests/engine/output_processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/engine/output_processor/__init__.py -------------------------------------------------------------------------------- /tests/engine/test_short_mm_context.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from ..conftest import IMAGE_ASSETS 4 | 5 | HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ 6 | "stop_sign": 7 | "USER: \nWhat's the content of the image?\nASSISTANT:", 8 | "cherry_blossom": 9 | "USER: \nWhat is the season?\nASSISTANT:", 10 | }) 11 | 12 | models = ["llava-hf/llava-1.5-7b-hf"] 13 | 14 | 15 | @pytest.mark.parametrize("model", models) 16 | def test_context_length_too_short(vllm_runner, image_assets, model): 17 | images = [asset.pil_image for asset in image_assets] 18 | 19 | with pytest.raises(ValueError, match="too long to fit into the model"): 20 | vllm_model = vllm_runner( 21 | model, 22 | max_model_len=128, # LLaVA has a feature size of 576 23 | enforce_eager=True, 24 | ) 25 | 26 | with vllm_model: 27 | vllm_model.generate_greedy([HF_IMAGE_PROMPTS[0]], 28 | max_tokens=1, 29 | images=[images[0]]) 30 | -------------------------------------------------------------------------------- /tests/engine/test_skip_tokenizer_init.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.entrypoints.llm import LLM 4 | from vllm.sampling_params import SamplingParams 5 | 6 | 7 | @pytest.mark.parametrize("model", ["facebook/opt-125m"]) 8 | def test_skip_tokenizer_initialization(model: str): 9 | # This test checks if the flag skip_tokenizer_init skips the initialization 10 | # of tokenizer and detokenizer. The generated output is expected to contain 11 | # token ids. 
12 | llm = LLM(model=model, skip_tokenizer_init=True) 13 | sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) 14 | 15 | with pytest.raises(ValueError, match="cannot pass text prompts when"): 16 | llm.generate("abc", sampling_params) 17 | 18 | outputs = llm.generate({"prompt_token_ids": [1, 2, 3]}, 19 | sampling_params=sampling_params) 20 | assert len(outputs) > 0 21 | completions = outputs[0].outputs 22 | assert len(completions) > 0 23 | assert completions[0].text == "" 24 | assert completions[0].token_ids 25 | -------------------------------------------------------------------------------- /tests/entrypoints/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/entrypoints/__init__.py -------------------------------------------------------------------------------- /tests/entrypoints/llm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/entrypoints/llm/__init__.py -------------------------------------------------------------------------------- /tests/entrypoints/llm/test_init.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm import LLM 4 | 5 | from ...utils import error_on_warning 6 | 7 | MODEL_NAME = "facebook/opt-125m" 8 | 9 | 10 | def test_pos_args_deprecated(): 11 | with error_on_warning(DeprecationWarning): 12 | LLM(model=MODEL_NAME, tokenizer=MODEL_NAME) 13 | 14 | with error_on_warning(DeprecationWarning): 15 | LLM(MODEL_NAME, tokenizer=MODEL_NAME) 16 | 17 | with pytest.warns(DeprecationWarning, match="'tokenizer'"): 18 | LLM(MODEL_NAME, MODEL_NAME) 19 | 20 | with pytest.warns(DeprecationWarning, 21 | match="'tokenizer', 'tokenizer_mode'"): 22 | LLM(MODEL_NAME, MODEL_NAME, "auto") 23 | -------------------------------------------------------------------------------- /tests/entrypoints/llm/test_prompt_validation.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm import LLM 4 | 5 | 6 | @pytest.fixture(autouse=True) 7 | def v1(run_with_both_engines): 8 | # Simple autouse wrapper to run both engines for each test 9 | # This can be promoted up to conftest.py to run for every 10 | # test in a package 11 | pass 12 | 13 | 14 | def test_empty_prompt(): 15 | llm = LLM(model="gpt2", enforce_eager=True) 16 | with pytest.raises(ValueError, match='Prompt cannot be empty'): 17 | llm.generate([""]) 18 | 19 | 20 | @pytest.mark.skip_v1 21 | def test_out_of_vocab_token(): 22 | llm = LLM(model="gpt2", enforce_eager=True) 23 | with pytest.raises(ValueError, match='out of vocabulary'): 24 | llm.generate({"prompt_token_ids": [999999]}) 25 | -------------------------------------------------------------------------------- /tests/entrypoints/offline_mode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/entrypoints/offline_mode/__init__.py -------------------------------------------------------------------------------- /tests/entrypoints/openai/__init__.py: -------------------------------------------------------------------------------- 
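Aside: tests/entrypoints/llm/test_init.py above imports an error_on_warning helper from the shared test utils. Below is a minimal sketch of what such a context manager might look like, assuming it only needs to escalate one warning category to an error for the enclosed block; the real tests/utils.py implementation may differ.

```python
# Hypothetical sketch of an error_on_warning-style helper. Assumes it only has
# to turn the given warning category into an exception inside the block; the
# actual helper in tests/utils.py may do more.
import warnings
from contextlib import contextmanager
from typing import Iterator, Type


@contextmanager
def error_on_warning(category: Type[Warning] = Warning) -> Iterator[None]:
    """Raise if any warning of the given category is emitted inside the block."""
    with warnings.catch_warnings():
        warnings.simplefilter("error", category=category)
        yield


# Usage mirroring the deprecation test above: only DeprecationWarning is fatal.
with error_on_warning(DeprecationWarning):
    warnings.warn("unrelated", UserWarning)  # allowed: different category
```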
https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/entrypoints/openai/__init__.py -------------------------------------------------------------------------------- /tests/entrypoints/openai/tool_parsers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/entrypoints/openai/tool_parsers/__init__.py -------------------------------------------------------------------------------- /tests/kernels/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/kernels/__init__.py -------------------------------------------------------------------------------- /tests/kernels/allclose_default.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | # Reference default values of atol and rtol are from 4 | # https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67 5 | default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5} 6 | default_rtol = { 7 | torch.float16: 1e-3, 8 | torch.bfloat16: 1.6e-2, 9 | torch.float: 1.3e-6 10 | } 11 | 12 | 13 | def get_default_atol(output) -> float: 14 | return default_atol[output.dtype] 15 | 16 | 17 | def get_default_rtol(output) -> float: 18 | return default_rtol[output.dtype] 19 | -------------------------------------------------------------------------------- /tests/kernels/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.utils import (create_kv_caches_with_random, 4 | create_kv_caches_with_random_flash) 5 | 6 | 7 | @pytest.fixture() 8 | def kv_cache_factory(): 9 | return create_kv_caches_with_random 10 | 11 | 12 | @pytest.fixture() 13 | def kv_cache_factory_flashinfer(): 14 | return create_kv_caches_with_random_flash 15 | -------------------------------------------------------------------------------- /tests/kernels/test_ggml.py: -------------------------------------------------------------------------------- 1 | import gguf 2 | import pytest 3 | import torch 4 | 5 | from tests.kernels.utils import opcheck 6 | from vllm import _custom_ops as ops # noqa: F401 7 | 8 | 9 | @pytest.mark.parametrize("quant_type", [12]) 10 | def test_ggml_opcheck(quant_type): 11 | block_size, type_size = gguf.GGML_QUANT_SIZES[quant_type] 12 | shape = [256, 1152] 13 | qweight = torch.randint(0, 100, shape, device='cuda', dtype=torch.uint8) 14 | m = qweight.shape[0] 15 | n = qweight.shape[1] // type_size * block_size 16 | opcheck(torch.ops._C.ggml_dequantize, (qweight, quant_type, m, n)) 17 | 18 | x = torch.rand((m, 512), device='cuda', dtype=torch.float16) 19 | opcheck(torch.ops._C.ggml_mul_mat_a8, 20 | (qweight, x, quant_type, qweight.shape[0])) 21 | opcheck(torch.ops._C.ggml_mul_mat_vec_a8, 22 | (qweight, x, quant_type, qweight.shape[0])) 23 | -------------------------------------------------------------------------------- /tests/kernels/test_gptq.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from tests.kernels.utils import opcheck 4 | from vllm import _custom_ops as ops # noqa: F401 5 | 6 | 7 | def test_gptq_shuffle_opcheck(): 8 | weight = torch.randint(-2000000, 9 | 2000000, (1792, 4096), 10 | device='cuda', 
11 | dtype=torch.int32) 12 | perm = torch.empty((0, ), device='cuda', dtype=torch.int32) 13 | bit = 4 14 | opcheck(torch.ops._C.gptq_shuffle, (weight, perm, bit)) 15 | 16 | 17 | def test_gptq_gemm_opcheck(): 18 | a = torch.rand((240, 4096), device='cuda', dtype=torch.float16) 19 | weight = torch.randint(-2000000, 20 | 2000000, (512, 6144), 21 | device='cuda', 22 | dtype=torch.int32) 23 | zeros = torch.zeros((32, 768), device='cuda', dtype=torch.int32) 24 | scales = torch.rand((32, 6144), device='cuda', dtype=torch.float16) 25 | idx = torch.empty((0, ), device='cuda', dtype=torch.int32) 26 | use_exllama = True 27 | bit = 4 28 | opcheck(torch.ops._C.gptq_gemm, 29 | (a, weight, zeros, scales, idx, use_exllama, bit)) 30 | -------------------------------------------------------------------------------- /tests/kernels/test_permute_cols.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from tests.kernels.utils import opcheck 5 | from vllm._custom_ops import permute_cols 6 | 7 | 8 | @pytest.mark.parametrize('shape', [(1, 512), (544, 4096), (67, 8192)]) 9 | @pytest.mark.parametrize('dtype', [torch.bfloat16, torch.float16]) 10 | def test_permute_cols(shape, dtype): 11 | x = torch.randn(shape, dtype=dtype).cuda() 12 | perm = torch.randperm(x.shape[1]).to(torch.int).cuda() 13 | opcheck(torch.ops._C.permute_cols, (x, perm)) 14 | y = permute_cols(x, perm) 15 | torch.testing.assert_close(y, x[:, perm]) -------------------------------------------------------------------------------- /tests/kernels/test_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for miscellaneous utilities 3 | """ 4 | 5 | import pytest 6 | import torch 7 | 8 | from tests.kernels.utils import opcheck 9 | from vllm.platforms import current_platform 10 | 11 | 12 | def test_convert_fp8_opcheck(): 13 | data = torch.randn((256, 256), dtype=torch.float32, device="cuda") 14 | result = torch.empty_like(data, dtype=torch.float8_e4m3fn) 15 | opcheck(torch.ops._C_cache_ops.convert_fp8, (result, data, 1.0, "fp8")) 16 | 17 | 18 | @pytest.mark.skipif(not current_platform.is_cuda(), 19 | reason="Only supported for CUDA") 20 | def test_cuda_utils_opcheck(): 21 | opcheck(torch.ops._C_cuda_utils.get_device_attribute, (0, 0)) 22 | opcheck( 23 | torch.ops._C_cuda_utils. 
24 | get_max_shared_memory_per_block_device_attribute, (0, )) 25 | -------------------------------------------------------------------------------- /tests/kv_transfer/test_lookup_buffer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | RANK=0 python test_lookup_buffer.py & 3 | RANK=1 python test_lookup_buffer.py & -------------------------------------------------------------------------------- /tests/kv_transfer/test_send_recv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | RANK=0 python3 test_send_recv.py & 3 | RANK=1 python3 test_send_recv.py & -------------------------------------------------------------------------------- /tests/lora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/lora/__init__.py -------------------------------------------------------------------------------- /tests/lora/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/lora/data/__init__.py -------------------------------------------------------------------------------- /tests/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/metrics/__init__.py -------------------------------------------------------------------------------- /tests/model_executor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/model_executor/__init__.py -------------------------------------------------------------------------------- /tests/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/models/__init__.py -------------------------------------------------------------------------------- /tests/models/decoder_only/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/models/decoder_only/__init__.py -------------------------------------------------------------------------------- /tests/models/decoder_only/audio_language/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/models/decoder_only/audio_language/__init__.py -------------------------------------------------------------------------------- /tests/models/decoder_only/language/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/models/decoder_only/language/__init__.py -------------------------------------------------------------------------------- /tests/models/decoder_only/vision_language/__init__.py: -------------------------------------------------------------------------------- 
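Aside: the kernel tests above validate registered custom ops through an opcheck helper from tests/kernels/utils. A plausible minimal form of that helper is sketched below, assuming it is a thin wrapper around torch.library.opcheck (available in torch >= 2.4); the keyword choices are illustrative, not the repo's exact implementation.

```python
# Sketch only: a minimal opcheck wrapper in the spirit of tests/kernels/utils.
# torch.library.opcheck validates a custom op's schema, fake-tensor registration
# and autograd support against its eager implementation.
from typing import Any, Dict, Optional, Tuple

import torch


def opcheck(op, args: Tuple[Any, ...],
            kwargs: Optional[Dict[str, Any]] = None) -> None:
    """Run torch.library.opcheck and fail loudly on any mismatch."""
    torch.library.opcheck(op, args, kwargs or {}, raise_exception=True)


# Usage (as in the tests above; requires vLLM's compiled _C extensions):
# opcheck(torch.ops._C_cache_ops.convert_fp8, (result, data, 1.0, "fp8"))
```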
https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/models/decoder_only/vision_language/__init__.py -------------------------------------------------------------------------------- /tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py -------------------------------------------------------------------------------- /tests/models/decoder_only/vision_language/vlm_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/models/decoder_only/vision_language/vlm_utils/__init__.py -------------------------------------------------------------------------------- /tests/models/embedding/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/models/embedding/__init__.py -------------------------------------------------------------------------------- /tests/models/embedding/language/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/models/embedding/language/__init__.py -------------------------------------------------------------------------------- /tests/models/embedding/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Sequence 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | 7 | def check_embeddings_close( 8 | *, 9 | embeddings_0_lst: Sequence[List[float]], 10 | embeddings_1_lst: Sequence[List[float]], 11 | name_0: str, 12 | name_1: str, 13 | tol: float = 1e-3, 14 | ) -> None: 15 | assert len(embeddings_0_lst) == len(embeddings_1_lst) 16 | 17 | for prompt_idx, (embeddings_0, embeddings_1) in enumerate( 18 | zip(embeddings_0_lst, embeddings_1_lst)): 19 | assert len(embeddings_0) == len(embeddings_1), ( 20 | f"Length mismatch: {len(embeddings_0)} vs. 
{len(embeddings_1)}") 21 | 22 | sim = F.cosine_similarity(torch.tensor(embeddings_0), 23 | torch.tensor(embeddings_1), 24 | dim=0) 25 | 26 | fail_msg = (f"Test{prompt_idx}:" 27 | f"\n{name_0}:\t{embeddings_0[:16]!r}" 28 | f"\n{name_1}:\t{embeddings_1[:16]!r}") 29 | 30 | assert sim >= 1 - tol, fail_msg 31 | -------------------------------------------------------------------------------- /tests/models/embedding/vision_language/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/models/embedding/vision_language/__init__.py -------------------------------------------------------------------------------- /tests/models/encoder_decoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/models/encoder_decoder/__init__.py -------------------------------------------------------------------------------- /tests/models/encoder_decoder/language/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/models/encoder_decoder/language/__init__.py -------------------------------------------------------------------------------- /tests/models/encoder_decoder/vision_language/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/models/encoder_decoder/vision_language/__init__.py -------------------------------------------------------------------------------- /tests/models/encoder_decoder/vision_language/test_broadcast.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from ....utils import multi_gpu_test 4 | 5 | 6 | @multi_gpu_test(num_gpus=2) 7 | @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) 8 | @pytest.mark.parametrize("model", [ 9 | "meta-llama/Llama-3.2-11B-Vision-Instruct", 10 | ]) 11 | def test_models(hf_runner, vllm_runner, image_assets, 12 | distributed_executor_backend, model) -> None: 13 | 14 | dtype = "half" 15 | max_tokens = 5 16 | num_logprobs = 5 17 | tensor_parallel_size = 2 18 | 19 | if model.startswith("meta-llama/Llama-3.2-11B-Vision-Instruct"): 20 | from .test_mllama import models, run_test 21 | else: 22 | raise NotImplementedError(f"Unsupported model: {model}") 23 | 24 | run_test( 25 | hf_runner, 26 | vllm_runner, 27 | image_assets, 28 | model=models[0], 29 | size_factors=[0.25, 0.5, 1.0], 30 | dtype=dtype, 31 | max_tokens=max_tokens, 32 | num_logprobs=num_logprobs, 33 | tensor_parallel_size=tensor_parallel_size, 34 | distributed_executor_backend=distributed_executor_backend, 35 | ) 36 | -------------------------------------------------------------------------------- /tests/mq_llm_engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/mq_llm_engine/__init__.py -------------------------------------------------------------------------------- /tests/multi_step/__init__.py: -------------------------------------------------------------------------------- 
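Aside: a small usage sketch for check_embeddings_close from tests/models/embedding/utils.py, using hand-made numbers and assuming the repo root is importable (the kernel tests import tests.kernels.utils the same way).

```python
# Illustrative call of check_embeddings_close with two nearly identical,
# hand-made embedding batches; the values are made up purely for demonstration.
from tests.models.embedding.utils import check_embeddings_close

embeddings_hf = [[0.1, 0.2, 0.3, 0.4], [0.5, 0.5, 0.0, 0.1]]
embeddings_vllm = [[0.1001, 0.2001, 0.3, 0.4], [0.5, 0.4999, 0.0, 0.1]]

# Passes: the cosine similarity of each pair stays within 1 - tol.
check_embeddings_close(
    embeddings_0_lst=embeddings_hf,
    embeddings_1_lst=embeddings_vllm,
    name_0="hf",
    name_1="vllm",
    tol=1e-3,
)
```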
https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/multi_step/__init__.py -------------------------------------------------------------------------------- /tests/multimodal/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/multimodal/__init__.py -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_model/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name='vllm_add_dummy_model', 4 | version='0.1', 5 | packages=['vllm_add_dummy_model'], 6 | entry_points={ 7 | 'vllm.general_plugins': 8 | ["register_dummy_model = vllm_add_dummy_model:register"] 9 | }) 10 | -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm import ModelRegistry 2 | 3 | 4 | def register(): 5 | # Test directly passing the model 6 | from .my_opt import MyOPTForCausalLM 7 | 8 | if "MyOPTForCausalLM" not in ModelRegistry.get_supported_archs(): 9 | ModelRegistry.register_model("MyOPTForCausalLM", MyOPTForCausalLM) 10 | 11 | # Test passing lazy model 12 | if "MyGemma2Embedding" not in ModelRegistry.get_supported_archs(): 13 | ModelRegistry.register_model( 14 | "MyGemma2Embedding", 15 | "vllm_add_dummy_model.my_gemma_embedding:MyGemma2Embedding", 16 | ) 17 | 18 | if "MyLlava" not in ModelRegistry.get_supported_archs(): 19 | ModelRegistry.register_model("MyLlava", 20 | "vllm_add_dummy_model.my_llava:MyLlava") 21 | -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import torch 4 | 5 | from vllm.model_executor.models.opt import OPTForCausalLM 6 | from vllm.model_executor.sampling_metadata import SamplingMetadata 7 | 8 | 9 | class MyOPTForCausalLM(OPTForCausalLM): 10 | 11 | def compute_logits( 12 | self, hidden_states: torch.Tensor, 13 | sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]: 14 | # this dummy model always predicts the first token 15 | logits = super().compute_logits(hidden_states, sampling_metadata) 16 | if logits is not None: 17 | logits.zero_() 18 | logits[:, 0] += 1.0 19 | return logits 20 | -------------------------------------------------------------------------------- /tests/prefix_caching/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/prefix_caching/__init__.py -------------------------------------------------------------------------------- /tests/prompts/example.txt: -------------------------------------------------------------------------------- 1 | vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. 2 | Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020. 3 | Compare and contrast artificial intelligence with human intelligence in terms of processing information. 4 | Describe the basic components of a neural network and how it can be trained. 
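Aside: the vllm_add_dummy_model plugin above exposes its register() function through the 'vllm.general_plugins' entry-point group. The sketch below illustrates the discovery side of that pattern with importlib.metadata; the function name and error handling are assumptions, not vLLM's actual plugin loader.

```python
# Sketch of entry-point based plugin discovery, assuming Python >= 3.10 where
# importlib.metadata.entry_points accepts a `group=` keyword. This mirrors the
# general idea behind the 'vllm.general_plugins' group used by the dummy plugin
# above; vLLM's own loader may apply extra filtering and error handling.
from importlib.metadata import entry_points


def load_general_plugins(group: str = "vllm.general_plugins") -> None:
    for ep in entry_points(group=group):
        register_fn = ep.load()   # e.g. vllm_add_dummy_model:register
        register_fn()             # registers models with ModelRegistry


if __name__ == "__main__":
    load_general_plugins()
```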
5 | Write a short story about a robot that dreams for the first time. 6 | Analyze the impact of the COVID-19 pandemic on global economic structures and future business models. 7 | Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies. 8 | Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.' 9 | -------------------------------------------------------------------------------- /tests/quantization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/quantization/__init__.py -------------------------------------------------------------------------------- /tests/quantization/test_experts_int8.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | """Tests experts_int8 quantization startup and generation, 3 | doesn't test correctness 4 | """ 5 | import pytest 6 | 7 | from tests.quantization.utils import is_quant_method_supported 8 | 9 | MODELS = ["ai21labs/Jamba-tiny-random"] 10 | 11 | 12 | @pytest.mark.skipif(not is_quant_method_supported("experts_int8"), 13 | reason="ExpertsInt8 is not supported on this GPU type.") 14 | @pytest.mark.parametrize("model", MODELS) 15 | @pytest.mark.parametrize("dtype", ["bfloat16"]) 16 | @pytest.mark.parametrize("max_tokens", [10]) 17 | def test_model_experts_int8_startup( 18 | hf_runner, 19 | vllm_runner, 20 | example_prompts, 21 | model: str, 22 | dtype: str, 23 | max_tokens: int, 24 | ) -> None: 25 | 26 | with vllm_runner(model, dtype=dtype, 27 | quantization="experts_int8") as vllm_model: 28 | vllm_model.generate_greedy(example_prompts, max_tokens) 29 | -------------------------------------------------------------------------------- /tests/quantization/test_ipex_quant.py: -------------------------------------------------------------------------------- 1 | """Test model set-up and inference for quantized HF models supported 2 | on the CPU/GPU backend using IPEX (including AWQ/GPTQ). 3 | 4 | Validating the configuration and printing results for manual checking. 5 | 6 | Run `pytest tests/quantization/test_ipex_quant.py`. 
7 | """ 8 | 9 | import pytest 10 | 11 | from vllm.platforms import current_platform 12 | 13 | MODELS = [ 14 | "AMead10/Llama-3.2-1B-Instruct-AWQ", 15 | "shuyuej/Llama-3.2-1B-Instruct-GPTQ", # with g_idx 16 | ] 17 | DTYPE = ["bfloat16"] 18 | 19 | 20 | @pytest.mark.skipif(not current_platform.is_cpu() 21 | and not current_platform.is_xpu(), 22 | reason="only supports Intel CPU/XPU backend.") 23 | @pytest.mark.parametrize("model", MODELS) 24 | @pytest.mark.parametrize("dtype", DTYPE) 25 | def test_ipex_quant(vllm_runner, model, dtype): 26 | with vllm_runner(model, dtype=dtype) as llm: 27 | output = llm.generate_greedy(["The capital of France is"], 28 | max_tokens=32) 29 | assert output 30 | print(output) 31 | -------------------------------------------------------------------------------- /tests/quantization/utils.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.layers.quantization import get_quantization_config 2 | from vllm.platforms import current_platform 3 | 4 | 5 | def is_quant_method_supported(quant_method: str) -> bool: 6 | # Currently, all quantization methods require Nvidia or AMD GPUs 7 | if not (current_platform.is_cuda() or current_platform.is_rocm()): 8 | return False 9 | 10 | capability = current_platform.get_device_capability() 11 | assert capability is not None 12 | 13 | min_capability = get_quantization_config(quant_method).get_min_capability() 14 | 15 | return capability.to_int() >= min_capability 16 | -------------------------------------------------------------------------------- /tests/samplers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/samplers/__init__.py -------------------------------------------------------------------------------- /tests/samplers/test_ignore_eos.py: -------------------------------------------------------------------------------- 1 | """Make sure ignore_eos works. 2 | 3 | Run `pytest tests/samplers/test_ignore_eos.py`. 4 | """ 5 | 6 | import pytest 7 | 8 | from vllm import SamplingParams 9 | 10 | # We also test with llama because it has generation_config to specify EOS 11 | # (past regression). 
12 | MODELS = ["facebook/opt-125m", "meta-llama/Llama-2-7b-hf"] 13 | 14 | 15 | @pytest.mark.parametrize("model", MODELS) 16 | @pytest.mark.parametrize("dtype", ["half"]) 17 | @pytest.mark.parametrize("max_tokens", [512]) 18 | def test_ignore_eos( 19 | vllm_runner, 20 | example_prompts, 21 | model: str, 22 | dtype: str, 23 | max_tokens: int, 24 | ) -> None: 25 | with vllm_runner(model, dtype=dtype) as vllm_model: 26 | sampling_params = SamplingParams(max_tokens=max_tokens, 27 | ignore_eos=True) 28 | 29 | for prompt in example_prompts: 30 | ignore_eos_output = vllm_model.model.generate( 31 | prompt, sampling_params=sampling_params) 32 | output_length = len(ignore_eos_output[0].outputs[0].token_ids) 33 | assert output_length == max_tokens 34 | -------------------------------------------------------------------------------- /tests/spec_decode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/spec_decode/__init__.py -------------------------------------------------------------------------------- /tests/spec_decode/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/spec_decode/e2e/__init__.py -------------------------------------------------------------------------------- /tests/tensorizer_loader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/tensorizer_loader/__init__.py -------------------------------------------------------------------------------- /tests/test_embedded_commit.py: -------------------------------------------------------------------------------- 1 | import vllm 2 | 3 | 4 | def test_embedded_commit_defined(): 5 | assert hasattr(vllm, "__version__") 6 | assert hasattr(vllm, "__version_tuple__") 7 | assert vllm.__version__ != "dev" 8 | assert vllm.__version_tuple__ != (0, 0, "dev") 9 | -------------------------------------------------------------------------------- /tests/test_lazy_torch_compile.py: -------------------------------------------------------------------------------- 1 | # Description: Test the lazy import module 2 | # The utility function cannot be placed in `vllm.utils` 3 | # this needs to be a standalone script 4 | import sys 5 | from contextlib import nullcontext 6 | 7 | from vllm_test_utils import BlameResult, blame 8 | 9 | module_name = "torch._inductor.async_compile" 10 | 11 | # In CI, we only check finally if the module is imported. 12 | # If it is indeed imported, we can rerun the test with `use_blame=True`, 13 | # which will trace every function call to find the first import location, 14 | # and help find the root cause. 15 | # We don't run it in CI by default because it is slow. 16 | use_blame = False 17 | context = blame( 18 | lambda: module_name in sys.modules) if use_blame else nullcontext() 19 | with context as result: 20 | import vllm # noqa 21 | 22 | if use_blame: 23 | assert isinstance(result, BlameResult) 24 | print(f"the first import location is:\n{result.trace_stack}") 25 | 26 | assert module_name not in sys.modules, ( 27 | f"Module {module_name} is imported. 
To see the first" 28 | f" import location, run the test with `use_blame=True`.") 29 | -------------------------------------------------------------------------------- /tests/test_sampling_params.py: -------------------------------------------------------------------------------- 1 | """Tests for the SamplingParams class. 2 | """ 3 | from vllm import SamplingParams 4 | 5 | 6 | def test_max_tokens_none(): 7 | """max_tokens=None should be allowed""" 8 | SamplingParams(temperature=0.01, top_p=0.1, max_tokens=None) 9 | 10 | 11 | if __name__ == "__main__": 12 | import pytest 13 | pytest.main([__file__]) 14 | -------------------------------------------------------------------------------- /tests/tokenization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/tokenization/__init__.py -------------------------------------------------------------------------------- /tests/tokenization/test_cached_tokenizer.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | from transformers import AutoTokenizer 4 | 5 | from vllm.transformers_utils.tokenizer import get_cached_tokenizer 6 | 7 | 8 | def test_cached_tokenizer(): 9 | reference_tokenizer = AutoTokenizer.from_pretrained("gpt2") 10 | reference_tokenizer.add_special_tokens({"cls_token": ""}) 11 | reference_tokenizer.add_special_tokens( 12 | {"additional_special_tokens": [""]}) 13 | cached_tokenizer = get_cached_tokenizer(deepcopy(reference_tokenizer)) 14 | 15 | assert reference_tokenizer.encode("prompt") == cached_tokenizer.encode( 16 | "prompt") 17 | assert set(reference_tokenizer.all_special_ids) == set( 18 | cached_tokenizer.all_special_ids) 19 | assert set(reference_tokenizer.all_special_tokens) == set( 20 | cached_tokenizer.all_special_tokens) 21 | assert set(reference_tokenizer.all_special_tokens_extended) == set( 22 | cached_tokenizer.all_special_tokens_extended) 23 | -------------------------------------------------------------------------------- /tests/tokenization/test_get_eos.py: -------------------------------------------------------------------------------- 1 | """ 2 | This test file includes some cases where it is inappropriate to 3 | only get the `eos_token_id` from the tokenizer as defined by 4 | :meth:`vllm.LLMEngine._get_eos_token_id`. 
5 | """ 6 | from vllm.transformers_utils.config import try_get_generation_config 7 | from vllm.transformers_utils.tokenizer import get_tokenizer 8 | 9 | 10 | def test_get_llama3_eos_token(): 11 | model_name = "meta-llama/Meta-Llama-3-8B-Instruct" 12 | 13 | tokenizer = get_tokenizer(model_name) 14 | assert tokenizer.eos_token_id == 128009 15 | 16 | generation_config = try_get_generation_config(model_name, 17 | trust_remote_code=False) 18 | assert generation_config is not None 19 | assert generation_config.eos_token_id == [128001, 128009] 20 | 21 | 22 | def test_get_blip2_eos_token(): 23 | model_name = "Salesforce/blip2-opt-2.7b" 24 | 25 | tokenizer = get_tokenizer(model_name) 26 | assert tokenizer.eos_token_id == 2 27 | 28 | generation_config = try_get_generation_config(model_name, 29 | trust_remote_code=False) 30 | assert generation_config is not None 31 | assert generation_config.eos_token_id == 50118 32 | -------------------------------------------------------------------------------- /tests/tokenization/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from transformers import PreTrainedTokenizerBase 3 | 4 | from vllm.transformers_utils.tokenizer import get_tokenizer 5 | 6 | TOKENIZER_NAMES = [ 7 | "facebook/opt-125m", 8 | "gpt2", 9 | ] 10 | 11 | 12 | @pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES) 13 | def test_tokenizer_revision(tokenizer_name: str): 14 | # Assume that "main" branch always exists 15 | tokenizer = get_tokenizer(tokenizer_name, revision="main") 16 | assert isinstance(tokenizer, PreTrainedTokenizerBase) 17 | 18 | # Assume that "never" branch always does not exist 19 | with pytest.raises(OSError, match='not a valid git identifier'): 20 | get_tokenizer(tokenizer_name, revision="never") 21 | -------------------------------------------------------------------------------- /tests/tool_use/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/tool_use/__init__.py -------------------------------------------------------------------------------- /tests/tpu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/tpu/__init__.py -------------------------------------------------------------------------------- /tests/tpu/test_custom_dispatcher.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from vllm.config import CompilationLevel 4 | 5 | from ..utils import compare_two_settings 6 | 7 | # --enforce-eager on TPU causes graph compilation 8 | # this times out default Health Check in the MQLLMEngine, 9 | # so we set the timeout here to 30s 10 | os.environ["VLLM_RPC_TIMEOUT"] = "30000" 11 | 12 | 13 | def test_custom_dispatcher(): 14 | compare_two_settings( 15 | "google/gemma-2b", 16 | arg1=[ 17 | "--enforce-eager", 18 | f"-O{CompilationLevel.DYNAMO_ONCE}", 19 | ], 20 | arg2=["--enforce-eager", f"-O{CompilationLevel.DYNAMO_AS_IS}"], 21 | env1={}, 22 | env2={}) 23 | -------------------------------------------------------------------------------- /tests/tracing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/tracing/__init__.py 
-------------------------------------------------------------------------------- /tests/v1/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/v1/__init__.py -------------------------------------------------------------------------------- /tests/v1/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/v1/engine/__init__.py -------------------------------------------------------------------------------- /tests/vllm_test_utils/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='vllm_test_utils', 5 | version='0.1', 6 | packages=['vllm_test_utils'], 7 | ) 8 | -------------------------------------------------------------------------------- /tests/vllm_test_utils/vllm_test_utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | vllm_utils is a package for vLLM testing utilities. 3 | It does not import any vLLM modules. 4 | """ 5 | 6 | from .blame import BlameResult, blame 7 | 8 | __all__ = ["blame", "BlameResult"] 9 | -------------------------------------------------------------------------------- /tests/weight_loading/models-large.txt: -------------------------------------------------------------------------------- 1 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main 2 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main 3 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W8A16-quantized, main 4 | gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, main 5 | awq_marlin, casperhansen/deepseek-coder-v2-instruct-awq, main -------------------------------------------------------------------------------- /tests/weight_loading/run_model_weight_loading_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SUCCESS=0 3 | 4 | while getopts "c:" OPT; do 5 | case ${OPT} in 6 | c ) 7 | CONFIG="$OPTARG" 8 | ;; 9 | \? ) 10 | usage 11 | exit 1 12 | ;; 13 | esac 14 | done 15 | 16 | 17 | IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG" 18 | 19 | for MODEL_CONFIG in "${MODEL_CONFIGS[@]}" 20 | do 21 | LOCAL_SUCCESS=0 22 | IFS=', ' read -r -a array <<< "$MODEL_CONFIG" 23 | 24 | echo "=== RUNNING MODEL: $MODEL_CONFIG ===" 25 | 26 | export QUANTIZATION=${array[0]} 27 | export MODEL_NAME=${array[1]} 28 | export REVISION=${array[2]} 29 | pytest -s weight_loading/test_weight_loading.py || LOCAL_SUCCESS=$? 
30 | 31 | if [[ $LOCAL_SUCCESS == 0 ]]; then 32 | echo "=== PASSED MODEL: ${MODEL_CONFIG} ===" 33 | else 34 | echo "=== FAILED MODEL: ${MODEL_CONFIG} ===" 35 | fi 36 | 37 | SUCCESS=$((SUCCESS + LOCAL_SUCCESS)) 38 | 39 | done 40 | 41 | if [ "${SUCCESS}" -eq "0" ]; then 42 | exit 0 43 | else 44 | exit 1 45 | fi 46 | -------------------------------------------------------------------------------- /tests/weight_loading/test_weight_loading.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | 5 | MAX_MODEL_LEN = 1024 6 | MODEL_NAME = os.environ.get("MODEL_NAME", 7 | "robertgshaw2/zephyr-7b-beta-channelwise-gptq") 8 | REVISION = os.environ.get("REVISION", "main") 9 | QUANTIZATION = os.environ.get("QUANTIZATION", "gptq_marlin") 10 | 11 | 12 | def test_weight_loading(vllm_runner): 13 | """ 14 | Test parameter weight loading with tp>1. 15 | """ 16 | with vllm_runner(model_name=MODEL_NAME, 17 | revision=REVISION, 18 | dtype=torch.half if QUANTIZATION == "gptq" else "auto", 19 | quantization=QUANTIZATION, 20 | max_model_len=MAX_MODEL_LEN, 21 | tensor_parallel_size=2) as model: 22 | 23 | output = model.generate_greedy("Hello world!", max_tokens=20) 24 | print(output) 25 | assert output 26 | -------------------------------------------------------------------------------- /tests/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/worker/__init__.py -------------------------------------------------------------------------------- /tools/actionlint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if command -v actionlint &> /dev/null; then 4 | actionlint "$@" 5 | exit 0 6 | elif [ -x ./actionlint ]; then 7 | ./actionlint "$@" 8 | exit 0 9 | fi 10 | 11 | # download a binary to the current directory - v1.7.3 12 | bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash) 13 | ./actionlint "$@" 14 | -------------------------------------------------------------------------------- /tools/check_repo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Checks whether the repo is clean and whether tags are available (necessary to correctly produce vllm version at build time) 3 | 4 | if ! git diff --quiet; then 5 | echo "Repo is dirty" >&2 6 | 7 | exit 1 8 | fi 9 | 10 | if ! git describe --tags; then 11 | echo "No tags are present. Is this a shallow clone? 
git fetch --unshallow --tags" >&2 12 | 13 | exit 1 14 | fi 15 | -------------------------------------------------------------------------------- /tools/mypy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CI=${1:-0} 4 | PYTHON_VERSION=${2:-3.9} 5 | 6 | if [ "$CI" -eq 1 ]; then 7 | set -e 8 | fi 9 | 10 | run_mypy() { 11 | echo "Running mypy on $1" 12 | if [ "$CI" -eq 1 ] && [ -z "$1" ]; then 13 | mypy --python-version "${PYTHON_VERSION}" "$@" 14 | return 15 | fi 16 | mypy --follow-imports skip --python-version "${PYTHON_VERSION}" "$@" 17 | } 18 | 19 | run_mypy # Note that this is less strict than CI 20 | run_mypy tests 21 | run_mypy vllm/attention 22 | run_mypy vllm/compilation 23 | run_mypy vllm/distributed 24 | run_mypy vllm/engine 25 | run_mypy vllm/executor 26 | run_mypy vllm/lora 27 | run_mypy vllm/model_executor 28 | run_mypy vllm/plugins 29 | run_mypy vllm/prompt_adapter 30 | run_mypy vllm/spec_decode 31 | run_mypy vllm/worker 32 | -------------------------------------------------------------------------------- /tools/png-lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Ensure that *.excalidraw.png files have the excalidraw metadata 4 | # embedded in them. This ensures they can be loaded back into 5 | # the tool and edited in the future. 6 | 7 | find . -iname '*.excalidraw.png' | while read -r file; do 8 | if git check-ignore -q "$file"; then 9 | continue 10 | fi 11 | if ! grep -q "excalidraw+json" "$file"; then 12 | echo "$file was not exported from excalidraw with 'Embed Scene' enabled." 13 | exit 1 14 | fi 15 | done 16 | -------------------------------------------------------------------------------- /tools/shellcheck.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | scversion="stable" 5 | 6 | if [ -d "shellcheck-${scversion}" ]; then 7 | export PATH="$PATH:$(pwd)/shellcheck-${scversion}" 8 | fi 9 | 10 | if ! [ -x "$(command -v shellcheck)" ]; then 11 | if [ "$(uname -s)" != "Linux" ] || [ "$(uname -m)" != "x86_64" ]; then 12 | echo "Please install shellcheck: https://github.com/koalaman/shellcheck?tab=readme-ov-file#installing" 13 | exit 1 14 | fi 15 | 16 | # automatic local install if linux x86_64 17 | wget -qO- "https://github.com/koalaman/shellcheck/releases/download/${scversion?}/shellcheck-${scversion?}.linux.x86_64.tar.xz" | tar -xJv 18 | export PATH="$PATH:$(pwd)/shellcheck-${scversion}" 19 | fi 20 | 21 | # TODO - fix warnings in .buildkite/run-amd-test.sh 22 | find . 
-name "*.sh" -not -path "./.buildkite/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck "{}"' 23 | -------------------------------------------------------------------------------- /tools/sphinx-lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sphinx-lint --disable trailing-whitespace,missing-final-newline docs 4 | -------------------------------------------------------------------------------- /use_existing_torch.py: -------------------------------------------------------------------------------- 1 | import glob 2 | 3 | requires_files = glob.glob('requirements*.txt') 4 | requires_files += ["pyproject.toml"] 5 | for file in requires_files: 6 | print(f">>> cleaning {file}") 7 | with open(file) as f: 8 | lines = f.readlines() 9 | if "torch" in "".join(lines).lower(): 10 | print("removed:") 11 | with open(file, 'w') as f: 12 | for line in lines: 13 | if 'torch' not in line.lower(): 14 | f.write(line) 15 | else: 16 | print(line.strip()) 17 | print(f"<<< done cleaning {file}") 18 | print() 19 | -------------------------------------------------------------------------------- /vllm/adapter_commons/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/adapter_commons/__init__.py -------------------------------------------------------------------------------- /vllm/adapter_commons/layers.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Tuple 3 | 4 | 5 | @dataclass 6 | class AdapterMapping: 7 | # Per every token in input_ids: 8 | index_mapping: Tuple[int, ...] 9 | # Per sampled token: 10 | prompt_mapping: Tuple[int, ...] 11 | 12 | def __post_init__(self): 13 | self.index_mapping = tuple(self.index_mapping) 14 | self.prompt_mapping = tuple(self.prompt_mapping) -------------------------------------------------------------------------------- /vllm/adapter_commons/request.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class AdapterRequest(ABC): 5 | """ 6 | Base class for adapter requests. 
7 | """ 8 | 9 | @property 10 | @abstractmethod 11 | def adapter_id(self) -> int: 12 | raise NotImplementedError 13 | 14 | def __post_init__(self) -> None: 15 | if self.adapter_id < 1: 16 | raise ValueError(f"id must be > 0, got {self.adapter_id}") 17 | 18 | def __eq__(self, value: object) -> bool: 19 | return isinstance( 20 | value, self.__class__) and self.adapter_id == value.adapter_id 21 | 22 | def __hash__(self) -> int: 23 | return hash(self.adapter_id) 24 | -------------------------------------------------------------------------------- /vllm/adapter_commons/worker_manager.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Optional, Set 3 | 4 | import torch 5 | 6 | 7 | class AbstractWorkerManager(ABC): 8 | 9 | def __init__(self, device: torch.device): 10 | self.device = device 11 | 12 | @property 13 | @abstractmethod 14 | def is_enabled(self) -> bool: 15 | raise NotImplementedError 16 | 17 | @abstractmethod 18 | def set_active_adapters(self, requests: Set[Any], 19 | mapping: Optional[Any]) -> None: 20 | raise NotImplementedError 21 | 22 | @abstractmethod 23 | def add_adapter(self, adapter_request: Any) -> bool: 24 | raise NotImplementedError 25 | 26 | @abstractmethod 27 | def remove_adapter(self, adapter_id: int) -> bool: 28 | raise NotImplementedError 29 | 30 | @abstractmethod 31 | def remove_all_adapters(self) -> None: 32 | raise NotImplementedError 33 | 34 | @abstractmethod 35 | def list_adapters(self) -> Set[int]: 36 | raise NotImplementedError 37 | -------------------------------------------------------------------------------- /vllm/assets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/assets/__init__.py -------------------------------------------------------------------------------- /vllm/assets/audio.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Literal, Tuple 3 | from urllib.parse import urljoin 4 | 5 | import librosa 6 | import numpy as np 7 | 8 | from vllm.assets.base import get_vllm_public_assets, vLLM_S3_BUCKET_URL 9 | 10 | ASSET_DIR = "multimodal_asset" 11 | 12 | 13 | @dataclass(frozen=True) 14 | class AudioAsset: 15 | name: Literal["winning_call", "mary_had_lamb"] 16 | 17 | @property 18 | def audio_and_sample_rate(self) -> Tuple[np.ndarray, int]: 19 | 20 | audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg", 21 | s3_prefix=ASSET_DIR) 22 | y, sr = librosa.load(audio_path, sr=None) 23 | assert isinstance(sr, int) 24 | return y, sr 25 | 26 | @property 27 | def url(self) -> str: 28 | return urljoin(vLLM_S3_BUCKET_URL, f"{ASSET_DIR}/{self.name}.ogg") 29 | -------------------------------------------------------------------------------- /vllm/assets/image.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Literal 3 | 4 | import torch 5 | from PIL import Image 6 | 7 | from vllm.assets.base import get_vllm_public_assets 8 | 9 | VLM_IMAGES_DIR = "vision_model_images" 10 | 11 | 12 | @dataclass(frozen=True) 13 | class ImageAsset: 14 | name: Literal["stop_sign", "cherry_blossom"] 15 | 16 | @property 17 | def pil_image(self) -> Image.Image: 18 | 19 | image_path = get_vllm_public_assets(filename=f"{self.name}.jpg", 20 | 
s3_prefix=VLM_IMAGES_DIR) 21 | return Image.open(image_path) 22 | 23 | @property 24 | def image_embeds(self) -> torch.Tensor: 25 | """ 26 | Image embeddings, only used for testing purposes with llava 1.5. 27 | """ 28 | image_path = get_vllm_public_assets(filename=f"{self.name}.pt", 29 | s3_prefix=VLM_IMAGES_DIR) 30 | return torch.load(image_path, map_location="cpu") 31 | -------------------------------------------------------------------------------- /vllm/attention/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.attention.backends.abstract import (AttentionBackend, 2 | AttentionMetadata, 3 | AttentionMetadataBuilder, 4 | AttentionState, AttentionType) 5 | from vllm.attention.layer import Attention 6 | from vllm.attention.selector import get_attn_backend 7 | 8 | __all__ = [ 9 | "Attention", 10 | "AttentionBackend", 11 | "AttentionMetadata", 12 | "AttentionType", 13 | "AttentionMetadataBuilder", 14 | "Attention", 15 | "AttentionState", 16 | "get_attn_backend", 17 | ] 18 | -------------------------------------------------------------------------------- /vllm/attention/backends/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/attention/backends/__init__.py -------------------------------------------------------------------------------- /vllm/attention/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/attention/ops/__init__.py -------------------------------------------------------------------------------- /vllm/attention/ops/blocksparse_attention/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/attention/ops/blocksparse_attention/__init__.py -------------------------------------------------------------------------------- /vllm/compilation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/compilation/__init__.py -------------------------------------------------------------------------------- /vllm/compilation/compile_context.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | from typing import Any 3 | 4 | _compile_context: Any = None 5 | 6 | 7 | def get_compile_context() -> Any: 8 | """Get the current compile context.""" 9 | return _compile_context 10 | 11 | 12 | @contextmanager 13 | def set_compile_context(context: Any): 14 | """A context manager that stores the current compile context, 15 | usually it is a list of sizes to specialize. 
16 | """ 17 | global _compile_context 18 | prev_context = _compile_context 19 | _compile_context = context 20 | try: 21 | yield 22 | finally: 23 | _compile_context = prev_context 24 | -------------------------------------------------------------------------------- /vllm/compilation/counter.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import dataclasses 3 | from contextlib import contextmanager 4 | 5 | 6 | @dataclasses.dataclass 7 | class CompilationCounter: 8 | num_models_seen: int = 0 9 | num_graphs_seen: int = 0 10 | # including the splitting ops 11 | num_piecewise_graphs_seen: int = 0 12 | # not including the splitting ops 13 | num_piecewise_capturable_graphs_seen: int = 0 14 | num_inductor_compilations: int = 0 15 | num_cudagraph_caputured: int = 0 16 | 17 | def clone(self) -> "CompilationCounter": 18 | return copy.deepcopy(self) 19 | 20 | @contextmanager 21 | def expect(self, **kwargs): 22 | old = self.clone() 23 | yield 24 | for k, v in kwargs.items(): 25 | assert getattr(self, k) - getattr(old, k) == v, ( 26 | f"{k} not as expected, before it is {getattr(old, k)}" 27 | f", after it is {getattr(self, k)}, " 28 | f"expected diff is {v}") 29 | 30 | 31 | compilation_counter = CompilationCounter() 32 | -------------------------------------------------------------------------------- /vllm/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/core/__init__.py -------------------------------------------------------------------------------- /vllm/core/block/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/core/block/__init__.py -------------------------------------------------------------------------------- /vllm/core/block/utils.py: -------------------------------------------------------------------------------- 1 | """Block manager utils.""" 2 | from vllm.sequence import SequenceGroup 3 | from vllm.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, 4 | STR_NOT_IMPL_ENC_DEC_SWA) 5 | 6 | 7 | def check_no_caching_or_swa_for_blockmgr_encdec( 8 | block_mgr, seq_group: SequenceGroup) -> None: 9 | ''' 10 | Enforce that prefix caching & sliding-window attention (SWA) 11 | are currently unsupported *specifically* for encoder/decoder models. 12 | 13 | Raises NotImplementedError if unsupported scenario is detected. 
14 | 15 | Arguments: 16 | 17 | * block_mgr: BlockSpaceManager instance 18 | * seq_group: SequenceGroup passed to block_mgr 19 | ''' 20 | 21 | if seq_group.is_encoder_decoder(): 22 | if block_mgr.max_block_sliding_window is not None: 23 | raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) 24 | 25 | if block_mgr.enable_caching: 26 | raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) 27 | -------------------------------------------------------------------------------- /vllm/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | from .communication_op import * 2 | from .parallel_state import * 3 | from .utils import * 4 | -------------------------------------------------------------------------------- /vllm/distributed/device_communicators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/distributed/device_communicators/__init__.py -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/distributed/kv_transfer/__init__.py -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/kv_connector/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/distributed/kv_transfer/kv_connector/__init__.py -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/kv_connector/factory.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from .base import KVConnectorBase 4 | 5 | if TYPE_CHECKING: 6 | from vllm.config import VllmConfig 7 | 8 | 9 | class KVConnectorFactory: 10 | 11 | @staticmethod 12 | def create_connector(rank: int, local_rank: int, 13 | config: "VllmConfig") -> KVConnectorBase: 14 | if config.kv_transfer_config.kv_connector == 'PyNcclConnector': 15 | from .simple_connector import SimpleConnector 16 | return SimpleConnector(rank, local_rank, config) 17 | else: 18 | raise ValueError(f"Unsupported connector type: " 19 | f"{config.kv_connector}") 20 | -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/kv_pipe/__init__.py: -------------------------------------------------------------------------------- 
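Aside: KVConnectorFactory above dispatches on a single hard-coded connector name behind an if/else. A registry-keyed factory is a common alternative shape for this kind of dispatch; the sketch below illustrates that general pattern only and is not a proposal for vLLM's factory.

```python
# Generic registry-based factory sketch (illustration of the pattern only).
# Connector classes register themselves under a string key and are looked up
# at creation time instead of being hard-coded in an if/else chain.
from typing import Callable, Dict, Type


class ConnectorRegistry:
    _registry: Dict[str, Type] = {}

    @classmethod
    def register(cls, name: str) -> Callable[[Type], Type]:
        def wrap(connector_cls: Type) -> Type:
            cls._registry[name] = connector_cls
            return connector_cls
        return wrap

    @classmethod
    def create(cls, name: str, *args, **kwargs):
        try:
            return cls._registry[name](*args, **kwargs)
        except KeyError:
            raise ValueError(f"Unsupported connector type: {name}") from None


@ConnectorRegistry.register("PyNcclConnector")
class DummyConnector:
    def __init__(self, rank: int, local_rank: int):
        self.rank, self.local_rank = rank, local_rank


connector = ConnectorRegistry.create("PyNcclConnector", rank=0, local_rank=0)
assert isinstance(connector, DummyConnector)
```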
https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/distributed/kv_transfer/kv_pipe/__init__.py -------------------------------------------------------------------------------- /vllm/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/engine/__init__.py -------------------------------------------------------------------------------- /vllm/engine/output_processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/engine/output_processor/__init__.py -------------------------------------------------------------------------------- /vllm/engine/output_processor/util.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from typing import Sequence as GenericSequence 3 | from typing import cast 4 | 5 | from vllm.model_executor.layers.sampler import SamplerOutput 6 | from vllm.sequence import CompletionSequenceGroupOutput, SequenceGroupOutput 7 | 8 | 9 | def create_output_by_sequence_group( 10 | outputs: GenericSequence[SamplerOutput], 11 | num_seq_groups: int) -> List[List[SequenceGroupOutput]]: 12 | """Helper method which transforms a 2d list organized by 13 | [step][sequence group] into [sequence group][step]. 14 | """ 15 | output_by_sequence_group: List[List[CompletionSequenceGroupOutput]] = [ 16 | [] for _ in range(num_seq_groups) 17 | ] 18 | for step in outputs: 19 | sequence_group_output: CompletionSequenceGroupOutput 20 | for i, sequence_group_output in enumerate(step): 21 | output_by_sequence_group[i].append(sequence_group_output) 22 | 23 | # Cast to the more generic type that CompletionSequenceGroupOutput 24 | # inherits from. 
25 | return cast(List[List[SequenceGroupOutput]], output_by_sequence_group) 26 | -------------------------------------------------------------------------------- /vllm/entrypoints/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/entrypoints/__init__.py -------------------------------------------------------------------------------- /vllm/entrypoints/openai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/entrypoints/openai/__init__.py -------------------------------------------------------------------------------- /vllm/entrypoints/openai/tool_parsers/__init__.py: -------------------------------------------------------------------------------- 1 | from .abstract_tool_parser import ToolParser, ToolParserManager 2 | from .granite_20b_fc_tool_parser import Granite20bFCToolParser 3 | from .granite_tool_parser import GraniteToolParser 4 | from .hermes_tool_parser import Hermes2ProToolParser 5 | from .internlm2_tool_parser import Internlm2ToolParser 6 | from .jamba_tool_parser import JambaToolParser 7 | from .llama_tool_parser import Llama3JsonToolParser 8 | from .mistral_tool_parser import MistralToolParser 9 | from .pythonic_tool_parser import PythonicToolParser 10 | 11 | __all__ = [ 12 | "ToolParser", "ToolParserManager", "Granite20bFCToolParser", 13 | "GraniteToolParser", "Hermes2ProToolParser", "MistralToolParser", 14 | "Internlm2ToolParser", "Llama3JsonToolParser", "JambaToolParser", 15 | "PythonicToolParser" 16 | ] 17 | -------------------------------------------------------------------------------- /vllm/executor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/executor/__init__.py -------------------------------------------------------------------------------- /vllm/executor/msgspec_utils.py: -------------------------------------------------------------------------------- 1 | from array import array 2 | from typing import Any, Type 3 | 4 | from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE 5 | 6 | 7 | def encode_hook(obj: Any) -> Any: 8 | """Custom msgspec enc hook that supports array types. 9 | 10 | See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder 11 | """ 12 | if isinstance(obj, array): 13 | assert obj.typecode == VLLM_TOKEN_ID_ARRAY_TYPE, ( 14 | f"vLLM array type should use '{VLLM_TOKEN_ID_ARRAY_TYPE}' type. " 15 | f"Given array has a type code of {obj.typecode}.") 16 | return obj.tobytes() 17 | 18 | 19 | def decode_hook(type: Type, obj: Any) -> Any: 20 | """Custom msgspec dec hook that supports array types. 
21 | 22 | See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder 23 | """ 24 | if type is array: 25 | deserialized = array(VLLM_TOKEN_ID_ARRAY_TYPE) 26 | deserialized.frombytes(obj) 27 | return deserialized 28 | -------------------------------------------------------------------------------- /vllm/executor/multiproc_xpu_executor.py: -------------------------------------------------------------------------------- 1 | import vllm.envs as envs 2 | from vllm.executor.multiproc_gpu_executor import ( 3 | MultiprocessingGPUExecutor, MultiprocessingGPUExecutorAsync) 4 | from vllm.executor.xpu_executor import XPUExecutor 5 | from vllm.logger import init_logger 6 | from vllm.utils import make_async 7 | 8 | logger = init_logger(__name__) 9 | 10 | 11 | class MultiprocessingXPUExecutor(MultiprocessingGPUExecutor, XPUExecutor): 12 | """Python multiprocessing-based multi-XPU executor""" 13 | 14 | def _check_executor_parameters(self): 15 | mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD 16 | if mp_method != "spawn": 17 | raise RuntimeError( 18 | "XPU multiprocess executor only support spawn as mp method") 19 | 20 | 21 | class MultiprocessingXPUExecutorAsync(MultiprocessingXPUExecutor, 22 | MultiprocessingGPUExecutorAsync): 23 | 24 | def __init__(self, *args, **kwargs): 25 | super().__init__(*args, **kwargs) 26 | self.driver_exec_model = make_async(self.driver_worker.execute_model) 27 | -------------------------------------------------------------------------------- /vllm/forward_context.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | from dataclasses import dataclass 3 | from typing import Any, Dict, Optional 4 | 5 | from vllm.config import VllmConfig 6 | 7 | 8 | @dataclass 9 | class ForwardContext: 10 | static_forward_context: Dict[str, Any] 11 | # TODO: extend to support per-layer dynamic forward context 12 | dynamic_forward_context: Any 13 | 14 | 15 | _forward_context: Optional[ForwardContext] = None 16 | 17 | 18 | def get_forward_context() -> ForwardContext: 19 | """Get the current forward context.""" 20 | assert _forward_context is not None, ( 21 | "Forward context is not set. " 22 | "Please use `set_forward_context` to set the forward context.") 23 | return _forward_context 24 | 25 | 26 | @contextmanager 27 | def set_forward_context(context: Any, vllm_config: VllmConfig): 28 | """A context manager that stores the current forward context, 29 | can be attention metadata, etc.""" 30 | global _forward_context 31 | prev_context = _forward_context 32 | _forward_context = ForwardContext( 33 | static_forward_context=vllm_config.compilation_config. 
34 | static_forward_context, 35 | dynamic_forward_context=context) 36 | try: 37 | yield 38 | finally: 39 | _forward_context = prev_context 40 | -------------------------------------------------------------------------------- /vllm/logging_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.logging_utils.formatter import NewLineFormatter 2 | 3 | __all__ = [ 4 | "NewLineFormatter", 5 | ] 6 | -------------------------------------------------------------------------------- /vllm/logging_utils/formatter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | class NewLineFormatter(logging.Formatter): 5 | """Adds logging prefix to newlines to align multi-line messages.""" 6 | 7 | def __init__(self, fmt, datefmt=None, style="%"): 8 | logging.Formatter.__init__(self, fmt, datefmt, style) 9 | 10 | def format(self, record): 11 | msg = logging.Formatter.format(self, record) 12 | if record.message != "": 13 | parts = msg.split(record.message) 14 | msg = msg.replace("\n", "\r\n" + parts[0]) 15 | return msg 16 | -------------------------------------------------------------------------------- /vllm/lora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/lora/__init__.py -------------------------------------------------------------------------------- /vllm/lora/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/lora/ops/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.parameter import (BasevLLMParameter, 2 | PackedvLLMParameter) 3 | from vllm.model_executor.sampling_metadata import (SamplingMetadata, 4 | SamplingMetadataCache) 5 | from vllm.model_executor.utils import set_random_seed 6 | 7 | __all__ = [ 8 | "SamplingMetadata", 9 | "SamplingMetadataCache", 10 | "set_random_seed", 11 | "BasevLLMParameter", 12 | "PackedvLLMParameter", 13 | ] 14 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/model_executor/layers/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/fused_moe/configs/README: -------------------------------------------------------------------------------- 1 | This directory contains tuned configurations for different settings of the fused_moe kernel. 2 | For different settings of 3 | - E (number of experts) 4 | - N (intermediate size) 5 | - device_name (torch.cuda.get_device_name()) 6 | the JSON file contains a mapping from M (batch size) to the chosen configuration. 7 | 8 | The example configurations provided are for the Mixtral model for TP2 on H100 9 | and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have 10 | N = 7168 and for TP4 we have N = 3584. 
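As a concrete illustration of how a consumer of these tuned configurations might resolve one at runtime, here is a minimal Python sketch. The file-name pattern, the JSON keys, and the nearest-M selection shown are assumptions made for illustration only, not the exact schema or lookup logic used by the fused_moe kernel.

# Illustrative sketch only: real file names, keys, and selection logic may differ.
import json
import os
from typing import Any, Dict

def load_fused_moe_config(E: int, N: int, device_name: str,
                          config_dir: str) -> Dict[int, Any]:
    # Hypothetical file-name pattern: one JSON file per (E, N, device) tuple.
    fname = f"E={E},N={N},device_name={device_name}.json"
    with open(os.path.join(config_dir, fname)) as f:
        # The file maps batch size M (string keys) to a chosen kernel config.
        return {int(m): cfg for m, cfg in json.load(f).items()}

def pick_config(configs: Dict[int, Any], M: int) -> Any:
    # Pick the tuned entry whose batch size is closest to the requested M.
    return configs[min(configs, key=lambda m: abs(m - M))]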
11 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/mamba/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/model_executor/layers/mamba/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/mamba/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/model_executor/layers/mamba/ops/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/compressed_tensors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py: -------------------------------------------------------------------------------- 1 | from .compressed_tensors_scheme import CompressedTensorsScheme 2 | from .compressed_tensors_w4a16_24 import (W4A16SPARSE24_SUPPORTED_BITS, 3 | CompressedTensorsW4A16Sparse24) 4 | from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8 5 | from .compressed_tensors_w8a8_int8 import CompressedTensorsW8A8Int8 6 | from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8 7 | from .compressed_tensors_wNa16 import (WNA16_SUPPORTED_BITS, 8 | CompressedTensorsWNA16) 9 | 10 | __all__ = [ 11 | "CompressedTensorsScheme", 12 | "CompressedTensorsWNA16", 13 | "CompressedTensorsW8A16Fp8", 14 | "CompressedTensorsW4A16Sparse24", 15 | "CompressedTensorsW8A8Int8", 16 | "CompressedTensorsW8A8Fp8", 17 | "WNA16_SUPPORTED_BITS", 18 | "W4A16SPARSE24_SUPPORTED_BITS", 19 | ] 20 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .layer_utils import replace_parameter, update_tensor_inplace 2 | 3 | __all__ = ['update_tensor_inplace', 'replace_parameter'] 4 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/machete_utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple 2 | 3 | import torch 4 | 5 | from vllm.scalar_type import ScalarType, scalar_types 6 | 7 | MACHETE_SUPPORTED_GROUP_SIZES = [-1, 128] 8 | MACHETE_PREPACKED_BLOCK_SHAPE = [64, 128] 9 | 10 | 11 | def query_machete_supported_quant_types(zero_points: bool) -> List[ScalarType]: 12 | if zero_points: 13 | return [scalar_types.uint4, scalar_types.uint8] 14 | else: 15 | return [scalar_types.uint4b8, scalar_types.uint8b128] 16 | 17 | 18 | def query_machete_supported_act_types(zero_points: bool) -> List[ScalarType]: 19 | return [torch.float16, torch.bfloat16] 20 | 21 | 22 | def check_machete_supports_shape(in_features: int, out_featrues: int) \ 23 | -> Tuple[bool, Optional[str]]: 24 | if in_features % MACHETE_PREPACKED_BLOCK_SHAPE[0] != 0: 25 | return False, "Input features size must be 
divisible by "\ 26 | f"{MACHETE_PREPACKED_BLOCK_SHAPE[0]}" 27 | if out_featrues % MACHETE_PREPACKED_BLOCK_SHAPE[1] != 0: 28 | return False, "Output features size must be divisible by "\ 29 | f"{MACHETE_PREPACKED_BLOCK_SHAPE[1]}" 30 | return True, None 31 | -------------------------------------------------------------------------------- /vllm/model_executor/model_loader/__init__.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from vllm.config import VllmConfig 4 | from vllm.model_executor.model_loader.loader import (BaseModelLoader, 5 | get_model_loader) 6 | from vllm.model_executor.model_loader.utils import ( 7 | get_architecture_class_name, get_model_architecture) 8 | 9 | 10 | def get_model(*, vllm_config: VllmConfig) -> nn.Module: 11 | loader = get_model_loader(vllm_config.load_config) 12 | return loader.load_model(vllm_config=vllm_config) 13 | 14 | 15 | __all__ = [ 16 | "get_model", "get_model_loader", "BaseModelLoader", 17 | "get_architecture_class_name", "get_model_architecture" 18 | ] 19 | -------------------------------------------------------------------------------- /vllm/model_executor/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .interfaces import (HasInnerState, SupportsLoRA, SupportsMultiModal, 2 | SupportsPP, has_inner_state, supports_lora, 3 | supports_multimodal, supports_pp) 4 | from .interfaces_base import (VllmModelForPooling, VllmModelForTextGeneration, 5 | is_pooling_model, is_text_generation_model) 6 | from .registry import ModelRegistry 7 | 8 | __all__ = [ 9 | "ModelRegistry", 10 | "VllmModelForPooling", 11 | "is_pooling_model", 12 | "VllmModelForTextGeneration", 13 | "is_text_generation_model", 14 | "HasInnerState", 15 | "has_inner_state", 16 | "SupportsLoRA", 17 | "supports_lora", 18 | "SupportsMultiModal", 19 | "supports_multimodal", 20 | "SupportsPP", 21 | "supports_pp", 22 | ] 23 | -------------------------------------------------------------------------------- /vllm/model_executor/models/glm.py: -------------------------------------------------------------------------------- 1 | """Inference-only HF format GLM-4 model compatible with THUDM weights.""" 2 | from vllm.config import VllmConfig 3 | from vllm.model_executor.models.llama import LlamaForCausalLM 4 | 5 | from .utils import PPMissingLayer 6 | 7 | 8 | class GlmForCausalLM(LlamaForCausalLM): 9 | 10 | def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): 11 | super().__init__(vllm_config=vllm_config, prefix=prefix) 12 | # Hack Llama model to fit HF format GLM implementation 13 | # Attention difference between GLM and Llama: 14 | # 1. Half partial rotary_dim and no Neox style. 15 | # 2. 
There is no bias for o_proj in attention 16 | for layer in self.model.layers: 17 | if not isinstance(layer, PPMissingLayer): 18 | layer.self_attn.rotary_emb.rotary_dim //= 2 19 | layer.self_attn.rotary_emb.is_neox_style = False 20 | layer.self_attn.o_proj.bias = None 21 | layer.self_attn.o_proj.skip_bias_add = True 22 | -------------------------------------------------------------------------------- /vllm/model_executor/models/phi3.py: -------------------------------------------------------------------------------- 1 | # Adapted from llama.py 2 | """Inference-only Phi3 model code inherit from Llama.py""" 3 | 4 | from vllm.model_executor.models.llama import LlamaForCausalLM 5 | 6 | 7 | class Phi3ForCausalLM(LlamaForCausalLM): 8 | 9 | packed_modules_mapping = { 10 | "qkv_proj": [ 11 | "qkv_proj", 12 | ], 13 | "gate_up_proj": [ 14 | "gate_up_proj", 15 | ], 16 | } 17 | 18 | # BitandBytes specific attributes 19 | # Initialize an empty dict when there is no stacked parameter mapping. 20 | bitsandbytes_stacked_params_mapping = {} 21 | -------------------------------------------------------------------------------- /vllm/multimodal/audio.py: -------------------------------------------------------------------------------- 1 | from vllm.inputs.registry import InputContext 2 | 3 | from .base import MultiModalPlugin 4 | from .inputs import AudioItem, MultiModalData, MultiModalKwargs 5 | 6 | 7 | class AudioPlugin(MultiModalPlugin): 8 | """Plugin for audio data.""" 9 | 10 | def get_data_key(self) -> str: 11 | return "audio" 12 | 13 | def _default_input_mapper( 14 | self, 15 | ctx: InputContext, 16 | data: MultiModalData[AudioItem], 17 | **mm_processor_kwargs, 18 | ) -> MultiModalKwargs: 19 | raise NotImplementedError("There is no default audio input mapper") 20 | 21 | def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: 22 | raise NotImplementedError( 23 | "There is no default maximum multimodal tokens") 24 | -------------------------------------------------------------------------------- /vllm/platforms/neuron.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from .interface import Platform, PlatformEnum 4 | 5 | if TYPE_CHECKING: 6 | from vllm.config import VllmConfig 7 | else: 8 | VllmConfig = None 9 | 10 | 11 | class NeuronPlatform(Platform): 12 | _enum = PlatformEnum.NEURON 13 | device_name: str = "neuron" 14 | device_type: str = "neuron" 15 | supported_quantization: list[str] = ["neuron_quant"] 16 | 17 | @classmethod 18 | def get_device_name(cls, device_id: int = 0) -> str: 19 | return "neuron" 20 | 21 | @classmethod 22 | def check_and_update_config(cls, vllm_config: VllmConfig) -> None: 23 | parallel_config = vllm_config.parallel_config 24 | if parallel_config.worker_cls == "auto": 25 | parallel_config.worker_cls = \ 26 | "vllm.worker.neuron_worker.NeuronWorker" 27 | -------------------------------------------------------------------------------- /vllm/pooling_params.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional 2 | 3 | import msgspec 4 | 5 | 6 | class PoolingParams( 7 | msgspec.Struct, 8 | omit_defaults=True, # type: ignore[call-arg] 9 | array_like=True): # type: ignore[call-arg] 10 | """Pooling parameters for embeddings API. 11 | 12 | Attributes: 13 | additional_data: Any additional data needed for pooling. 
14 | """ 15 | additional_data: Optional[Any] = None 16 | 17 | def clone(self) -> "PoolingParams": 18 | """Returns a deep copy of the PoolingParams instance.""" 19 | return PoolingParams(additional_data=self.additional_data) 20 | 21 | def __repr__(self) -> str: 22 | return (f"PoolingParams(" 23 | f"additional_metadata={self.additional_data})") 24 | -------------------------------------------------------------------------------- /vllm/profiler/__init__.py: -------------------------------------------------------------------------------- 1 | from .layerwise_profile import layerwise_profile 2 | 3 | __all__ = [ 4 | "layerwise_profile", 5 | ] 6 | -------------------------------------------------------------------------------- /vllm/prompt_adapter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/prompt_adapter/__init__.py -------------------------------------------------------------------------------- /vllm/prompt_adapter/request.py: -------------------------------------------------------------------------------- 1 | import msgspec 2 | 3 | from vllm.adapter_commons.request import AdapterRequest 4 | 5 | 6 | class PromptAdapterRequest( 7 | msgspec.Struct, 8 | array_like=True, # type: ignore[call-arg] 9 | omit_defaults=True, # type: ignore[call-arg] 10 | frozen=True): # type: ignore[call-arg] 11 | """ 12 | Request for a Prompt adapter. 13 | """ 14 | __metaclass__ = AdapterRequest 15 | 16 | prompt_adapter_name: str 17 | prompt_adapter_id: int 18 | prompt_adapter_local_path: str 19 | prompt_adapter_num_virtual_tokens: int 20 | 21 | def __hash__(self): 22 | return super().__hash__() 23 | 24 | @property 25 | def adapter_id(self): 26 | return self.prompt_adapter_id 27 | 28 | @property 29 | def name(self): 30 | return self.prompt_adapter_name 31 | 32 | @property 33 | def local_path(self): 34 | return self.prompt_adapter_local_path 35 | -------------------------------------------------------------------------------- /vllm/py.typed: -------------------------------------------------------------------------------- 1 | # Marker file for PEP 561. 2 | # The vllm package uses inline types. 3 | -------------------------------------------------------------------------------- /vllm/spec_decode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/spec_decode/__init__.py -------------------------------------------------------------------------------- /vllm/transformers_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.envs import VLLM_USE_MODELSCOPE 2 | 3 | if VLLM_USE_MODELSCOPE: 4 | # Patch here, before each import happens 5 | import modelscope 6 | from packaging import version 7 | 8 | # patch_hub begins from modelscope>=1.18.1 9 | if version.parse(modelscope.__version__) <= version.parse('1.18.0'): 10 | raise ImportError( 11 | 'Using vLLM with ModelScope needs modelscope>=1.18.1, please ' 12 | 'install by `pip install modelscope -U`') 13 | 14 | from modelscope.utils.hf_util import patch_hub 15 | 16 | # Patch hub to download models from modelscope to speed up. 
17 | patch_hub() 18 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/h2ovl.py: -------------------------------------------------------------------------------- 1 | # Adapted from 2 | # https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/configuration_h2ovl_chat.py 3 | # -------------------------------------------------------- 4 | # H2OVL-Mississippi 5 | # Copyright (c) 2024 H2O.AI 6 | # Licensed under Apache 2.0 License [see LICENSE for details] 7 | # -------------------------------------------------------- 8 | 9 | from .internvl import InternVLChatConfig 10 | 11 | 12 | class H2OVLChatConfig(InternVLChatConfig): 13 | model_type = "h2ovl_chat" 14 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/mllama.py: -------------------------------------------------------------------------------- 1 | from transformers.models.mllama import configuration_mllama as mllama_hf_config 2 | 3 | 4 | class MllamaTextConfig(mllama_hf_config.MllamaTextConfig): 5 | ''' 6 | Use this class to override is_encoder_decoder: 7 | - transformers regards mllama as is_encoder_decoder=False 8 | - vllm needs is_encoder_decoder=True to enable cross-attention 9 | ''' 10 | 11 | def __init__( 12 | self, 13 | **kwargs, 14 | ): 15 | super().__init__(**kwargs) 16 | self.is_encoder_decoder = True 17 | 18 | 19 | class MllamaConfig(mllama_hf_config.MllamaConfig): 20 | 21 | def __init__( 22 | self, 23 | text_config=None, 24 | **kwargs, 25 | ): 26 | if isinstance(text_config, dict): 27 | text_config = MllamaTextConfig(**text_config) 28 | super().__init__(text_config=text_config, **kwargs) 29 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/nvlm_d.py: -------------------------------------------------------------------------------- 1 | # Adapted from 2 | # https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py 3 | # -------------------------------------------------------- 4 | # NVLM-D 5 | # Copyright (c) 2024 NVIDIA 6 | # Licensed under Apache 2.0 License [see LICENSE for details] 7 | # -------------------------------------------------------- 8 | from .internvl import InternVLChatConfig 9 | 10 | 11 | class NVLM_D_Config(InternVLChatConfig): 12 | model_type = 'NVLM_D' 13 | -------------------------------------------------------------------------------- /vllm/transformers_utils/tokenizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .mistral import MistralTokenizer, maybe_serialize_tool_calls 2 | 3 | __all__ = ["MistralTokenizer", "maybe_serialize_tool_calls"] 4 | -------------------------------------------------------------------------------- /vllm/transformers_utils/utils.py: -------------------------------------------------------------------------------- 1 | from os import PathLike 2 | from pathlib import Path 3 | from typing import Union 4 | 5 | 6 | def check_gguf_file(model: Union[str, PathLike]) -> bool: 7 | """Check if the file is a GGUF model.""" 8 | model = Path(model) 9 | if not model.is_file(): 10 | return False 11 | elif model.suffix == ".gguf": 12 | return True 13 | 14 | with open(model, "rb") as f: 15 | header = f.read(4) 16 | return header == b"GGUF" 17 | -------------------------------------------------------------------------------- /vllm/triton_utils/__init__.py: -------------------------------------------------------------------------------- 
1 | from vllm.triton_utils.importing import HAS_TRITON 2 | 3 | __all__ = ["HAS_TRITON"] 4 | 5 | if HAS_TRITON: 6 | 7 | from vllm.triton_utils.custom_cache_manager import ( 8 | maybe_set_triton_cache_manager) 9 | 10 | __all__ += ["maybe_set_triton_cache_manager"] 11 | -------------------------------------------------------------------------------- /vllm/triton_utils/importing.py: -------------------------------------------------------------------------------- 1 | from importlib.util import find_spec 2 | 3 | from vllm.logger import init_logger 4 | from vllm.platforms import current_platform 5 | 6 | logger = init_logger(__name__) 7 | 8 | HAS_TRITON = ( 9 | find_spec("triton") is not None 10 | and not current_platform.is_xpu() # Not compatible 11 | and not current_platform.is_neuron() # neuron has too old torch 12 | ) 13 | 14 | if not HAS_TRITON: 15 | logger.info("Triton not installed or not compatible; certain GPU-related" 16 | " functions will not be available.") 17 | -------------------------------------------------------------------------------- /vllm/usage/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/usage/__init__.py -------------------------------------------------------------------------------- /vllm/v1/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/v1/__init__.py -------------------------------------------------------------------------------- /vllm/v1/attention/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/v1/attention/__init__.py -------------------------------------------------------------------------------- /vllm/v1/attention/backends/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/v1/attention/backends/__init__.py -------------------------------------------------------------------------------- /vllm/v1/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/v1/core/__init__.py -------------------------------------------------------------------------------- /vllm/v1/executor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/v1/executor/__init__.py -------------------------------------------------------------------------------- /vllm/v1/outputs.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Dict, List, Optional 3 | 4 | import torch 5 | 6 | 7 | @dataclass 8 | class SamplerOutput: 9 | 10 | # [num_reqs] 11 | sampled_token_ids: torch.Tensor 12 | 13 | # [num_reqs, max_num_logprobs + 1] 14 | logprob_token_ids: Optional[torch.Tensor] 15 | # [num_reqs, max_num_logprobs + 1] 16 | logprobs: Optional[torch.Tensor] 17 | 18 | # TODO: Support prompt logprobs. 
19 | prompt_logprob_token_ids: Optional[torch.Tensor] 20 | prompt_logprobs: Optional[torch.Tensor] 21 | 22 | 23 | @dataclass 24 | class ModelRunnerOutput: 25 | 26 | # [num_reqs] 27 | req_ids: List[str] 28 | # req_id -> index 29 | req_id_to_index: Dict[str, int] 30 | 31 | # [num_reqs] 32 | sampled_token_ids_cpu: torch.Tensor 33 | 34 | # [num_reqs, max_num_logprobs + 1] 35 | logprob_token_ids_cpu: Optional[torch.Tensor] 36 | # [num_reqs, max_num_logprobs + 1] 37 | logprobs_cpu: Optional[torch.Tensor] 38 | -------------------------------------------------------------------------------- /vllm/v1/sample/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/v1/sample/__init__.py -------------------------------------------------------------------------------- /vllm/v1/sample/metadata.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Dict 3 | 4 | import torch 5 | 6 | 7 | @dataclass 8 | class SamplingMetadata: 9 | 10 | temperature: torch.Tensor 11 | all_greedy: bool 12 | all_random: bool 13 | 14 | top_p: torch.Tensor 15 | top_k: torch.Tensor 16 | no_top_p: bool 17 | no_top_k: bool 18 | 19 | generators: Dict[int, torch.Generator] 20 | 21 | max_num_logprobs: int 22 | -------------------------------------------------------------------------------- /vllm/v1/serial_utils.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | 4 | class PickleEncoder: 5 | 6 | def encode(self, obj): 7 | return pickle.dumps(obj) 8 | 9 | def decode(self, data): 10 | return pickle.loads(data) 11 | -------------------------------------------------------------------------------- /vllm/v1/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/v1/worker/__init__.py -------------------------------------------------------------------------------- /vllm/version.py: -------------------------------------------------------------------------------- 1 | try: 2 | from ._version import __version__, __version_tuple__ 3 | except Exception as e: 4 | import warnings 5 | 6 | warnings.warn(f"Failed to read commit hash:\n{e}", 7 | RuntimeWarning, 8 | stacklevel=2) 9 | 10 | __version__ = "dev" 11 | __version_tuple__ = (0, 0, __version__) 12 | -------------------------------------------------------------------------------- /vllm/vllm_flash_attn/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/vllm_flash_attn/.gitkeep -------------------------------------------------------------------------------- /vllm/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/worker/__init__.py --------------------------------------------------------------------------------
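To make the intent of the small PickleEncoder helper in vllm/v1/serial_utils.py above concrete, here is a usage sketch; the import path and method signatures come from the file shown earlier, while the payload is an arbitrary example.

# Round-trip sketch for PickleEncoder; the payload dict is made up for illustration.
from vllm.v1.serial_utils import PickleEncoder

encoder = PickleEncoder()
payload = {"req_id": "demo", "token_ids": [1, 2, 3]}
data = encoder.encode(payload)          # bytes produced by pickle.dumps
assert encoder.decode(data) == payload  # pickle.loads restores the original object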