├── .buildkite
    ├── check-wheel-size.py
    ├── lm-eval-harness
    │   ├── configs
    │   │   ├── DeepSeek-V2-Lite-Chat.yaml
    │   │   ├── Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
    │   │   ├── Meta-Llama-3-70B-Instruct.yaml
    │   │   ├── Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
    │   │   ├── Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml
    │   │   ├── Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
    │   │   ├── Meta-Llama-3-8B-Instruct-FP8.yaml
    │   │   ├── Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
    │   │   ├── Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
    │   │   ├── Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
    │   │   ├── Meta-Llama-3-8B-Instruct.yaml
    │   │   ├── Meta-Llama-3-8B-QQQ.yaml
    │   │   ├── Minitron-4B-Base-FP8.yaml
    │   │   ├── Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml
    │   │   ├── Mixtral-8x7B-Instruct-v0.1-FP8.yaml
    │   │   ├── Mixtral-8x7B-Instruct-v0.1.yaml
    │   │   ├── Qwen2-1.5B-Instruct-FP8W8.yaml
    │   │   ├── Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
    │   │   ├── Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
    │   │   ├── Qwen2-57B-A14-Instruct.yaml
    │   │   ├── models-large.txt
    │   │   └── models-small.txt
    │   ├── run-lm-eval-gsm-hf-baseline.sh
    │   ├── run-lm-eval-gsm-vllm-baseline.sh
    │   ├── run-tests.sh
    │   └── test_lm_eval_correctness.py
    ├── nightly-benchmarks
    │   ├── README.md
    │   ├── benchmark-pipeline.yaml
    │   ├── nightly-annotation.md
    │   ├── nightly-descriptions.md
    │   ├── nightly-pipeline.yaml
    │   ├── performance-benchmarks-descriptions.md
    │   ├── scripts
    │   │   ├── convert-results-json-to-markdown.py
    │   │   ├── download-tokenizer.py
    │   │   ├── generate-nightly-markdown.py
    │   │   ├── get-lmdeploy-modelname.py
    │   │   ├── launch-server.sh
    │   │   ├── nightly-annotate.sh
    │   │   ├── run-nightly-benchmarks.sh
    │   │   ├── run-performance-benchmarks.sh
    │   │   ├── summary-nightly-results.py
    │   │   └── wait-for-image.sh
    │   └── tests
    │   │   ├── latency-tests.json
    │   │   ├── nightly-tests.json
    │   │   ├── serving-tests.json
    │   │   └── throughput-tests.json
    ├── release-pipeline.yaml
    ├── run-amd-test.sh
    ├── run-benchmarks.sh
    ├── run-cpu-test-ppc64le.sh
    ├── run-cpu-test.sh
    ├── run-multi-node-test.sh
    ├── run-neuron-test.sh
    ├── run-openvino-test.sh
    ├── run-tpu-test.sh
    ├── run-xpu-test.sh
    └── test-pipeline.yaml
├── .clang-format
├── .dockerignore
├── .github
    ├── CODEOWNERS
    ├── FUNDING.yml
    ├── ISSUE_TEMPLATE
    │   ├── 100-documentation.yml
    │   ├── 200-installation.yml
    │   ├── 300-usage.yml
    │   ├── 400-bug report.yml
    │   ├── 500-feature request.yml
    │   ├── 600-new model.yml
    │   ├── 700-performance discussion.yml
    │   ├── 750-RFC.yml
    │   ├── 800-misc discussion.yml
    │   └── config.yml
    ├── PULL_REQUEST_TEMPLATE.md
    ├── dependabot.yml
    └── workflows
    │   ├── actionlint.yml
    │   ├── add_label_automerge.yml
    │   ├── clang-format.yml
    │   ├── matchers
    │       └── actionlint.json
    │   ├── mypy.yaml
    │   ├── publish.yml
    │   ├── reminder_comment.yml
    │   ├── ruff.yml
    │   ├── scripts
    │       ├── build.sh
    │       ├── create_release.js
    │       ├── cuda-install.sh
    │       ├── env.sh
    │       └── pytorch-install.sh
    │   └── yapf.yml
├── .gitignore
├── .readthedocs.yaml
├── .yapfignore
├── CMakeLists.txt
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── Dockerfile
├── Dockerfile.cpu
├── Dockerfile.neuron
├── Dockerfile.openvino
├── Dockerfile.ppc64le
├── Dockerfile.rocm
├── Dockerfile.tpu
├── Dockerfile.xpu
├── LICENSE
├── MANIFEST.in
├── README.md
├── SECURITY.md
├── benchmarks
    ├── README.md
    ├── backend_request_func.py
    ├── benchmark_latency.py
    ├── benchmark_prefix_caching.py
    ├── benchmark_prioritization.py
    ├── benchmark_serving.py
    ├── benchmark_throughput.py
    ├── cutlass_benchmarks
    │   ├── w8a8_benchmarks.py
    │   └── weight_shapes.py
    ├── kernels
    │   ├── benchmark_aqlm.py
    │   ├── benchmark_layernorm.py
    │   ├── benchmark_machete.py
    │   ├── benchmark_marlin.py
    │   ├── benchmark_moe.py
    │   ├── benchmark_paged_attention.py
    │   ├── benchmark_quant.py
    │   ├── benchmark_rope.py
    │   ├── benchmark_shapes.py
    │   ├── graph_machete_bench.py
    │   ├── requirements.txt
    │   └── weight_shapes.py
    ├── launch_tgi_server.sh
    ├── overheads
    │   └── benchmark_hashing.py
    └── sonnet.txt
├── cmake
    ├── cpu_extension.cmake
    ├── hipify.py
    └── utils.cmake
├── collect_env.py
├── csrc
    ├── activation_kernels.cu
    ├── attention
    │   ├── attention_dtypes.h
    │   ├── attention_generic.cuh
    │   ├── attention_kernels.cu
    │   ├── attention_utils.cuh
    │   ├── dtype_bfloat16.cuh
    │   ├── dtype_float16.cuh
    │   ├── dtype_float32.cuh
    │   └── dtype_fp8.cuh
    ├── cache.h
    ├── cache_kernels.cu
    ├── core
    │   ├── exception.hpp
    │   ├── registration.h
    │   ├── scalar_type.hpp
    │   └── torch_bindings.cpp
    ├── cpu
    │   ├── activation.cpp
    │   ├── attention.cpp
    │   ├── cache.cpp
    │   ├── cpu_types.hpp
    │   ├── cpu_types_vsx.hpp
    │   ├── cpu_types_x86.hpp
    │   ├── dnnl_helper.hpp
    │   ├── layernorm.cpp
    │   ├── pos_encoding.cpp
    │   ├── quant.cpp
    │   ├── torch_bindings.cpp
    │   └── utils.cpp
    ├── cuda_compat.h
    ├── cuda_utils.h
    ├── cuda_utils_kernels.cu
    ├── custom_all_reduce.cu
    ├── custom_all_reduce.cuh
    ├── custom_all_reduce_test.cu
    ├── cutlass_extensions
    │   ├── cute_utils.cuh
    │   ├── torch_utils.hpp
    │   ├── vllm_collective_builder.cuh
    │   ├── vllm_custom_types.cuh
    │   ├── vllm_cutlass_library_extension.py
    │   └── vllm_numeric_conversion.cuh
    ├── dispatch_utils.h
    ├── layernorm_kernels.cu
    ├── mamba
    │   ├── causal_conv1d
    │   │   ├── causal_conv1d.cu
    │   │   ├── causal_conv1d.h
    │   │   └── static_switch.h
    │   └── mamba_ssm
    │   │   ├── selective_scan.h
    │   │   ├── selective_scan_fwd.cu
    │   │   └── static_switch.h
    ├── moe
    │   ├── marlin_kernels
    │   │   ├── marlin_moe_kernel.h
    │   │   ├── marlin_moe_kernel_ku4.cu
    │   │   ├── marlin_moe_kernel_ku4.h
    │   │   ├── marlin_moe_kernel_ku4b8.cu
    │   │   ├── marlin_moe_kernel_ku4b8.h
    │   │   ├── marlin_moe_kernel_ku8b128.cu
    │   │   └── marlin_moe_kernel_ku8b128.h
    │   ├── marlin_moe_ops.cu
    │   ├── moe_ops.h
    │   ├── topk_softmax_kernels.cu
    │   └── torch_bindings.cpp
    ├── moe_align_block_size_kernels.cu
    ├── ops.h
    ├── permute_cols.cu
    ├── pos_encoding_kernels.cu
    ├── prepare_inputs
    │   ├── advance_step.cu
    │   └── advance_step.cuh
    ├── quantization
    │   ├── aqlm
    │   │   └── gemm_kernels.cu
    │   ├── awq
    │   │   ├── dequantize.cuh
    │   │   └── gemm_kernels.cu
    │   ├── compressed_tensors
    │   │   └── int8_quant_kernels.cu
    │   ├── cutlass_w8a8
    │   │   ├── Epilogues.md
    │   │   ├── broadcast_load_epilogue_c2x.hpp
    │   │   ├── broadcast_load_epilogue_c3x.hpp
    │   │   ├── common.hpp
    │   │   ├── scaled_mm_c2x.cu
    │   │   ├── scaled_mm_c2x.cuh
    │   │   ├── scaled_mm_c2x_sm75_dispatch.cuh
    │   │   ├── scaled_mm_c2x_sm80_dispatch.cuh
    │   │   ├── scaled_mm_c2x_sm89_fp8_dispatch.cuh
    │   │   ├── scaled_mm_c2x_sm89_int8_dispatch.cuh
    │   │   ├── scaled_mm_c3x.cu
    │   │   └── scaled_mm_entry.cu
    │   ├── fp8
    │   │   ├── amd
    │   │   │   ├── hip_float8.h
    │   │   │   ├── hip_float8_impl.h
    │   │   │   └── quant_utils.cuh
    │   │   ├── common.cu
    │   │   ├── fp8_marlin.cu
    │   │   └── nvidia
    │   │   │   └── quant_utils.cuh
    │   ├── gguf
    │   │   ├── dequantize.cuh
    │   │   ├── ggml-common.h
    │   │   ├── gguf_kernel.cu
    │   │   ├── mmq.cuh
    │   │   ├── mmvq.cuh
    │   │   └── vecdotq.cuh
    │   ├── gptq
    │   │   ├── compat.cuh
    │   │   ├── matrix_view.cuh
    │   │   ├── q_gemm.cu
    │   │   ├── qdq_2.cuh
    │   │   ├── qdq_3.cuh
    │   │   ├── qdq_4.cuh
    │   │   ├── qdq_8.cuh
    │   │   └── qdq_util.cuh
    │   ├── gptq_marlin
    │   │   ├── awq_marlin_repack.cu
    │   │   ├── gptq_marlin.cu
    │   │   ├── gptq_marlin_repack.cu
    │   │   ├── marlin.cuh
    │   │   └── marlin_dtypes.cuh
    │   ├── machete
    │   │   ├── Readme.md
    │   │   ├── generate.py
    │   │   ├── machete_collective_builder.cuh
    │   │   ├── machete_interleaving_utils.cuh
    │   │   ├── machete_mainloop.cuh
    │   │   ├── machete_mm_kernel.cuh
    │   │   ├── machete_mm_launcher.cuh
    │   │   ├── machete_prepack_kernel.cuh
    │   │   ├── machete_prepack_launcher.cuh
    │   │   ├── machete_prepacked_layout.cuh
    │   │   └── machete_pytorch.cu
    │   └── marlin
    │   │   ├── dense
    │   │       ├── LICENSE
    │   │       ├── common
    │   │       │   ├── base.h
    │   │       │   └── mem.h
    │   │       └── marlin_cuda_kernel.cu
    │   │   ├── qqq
    │   │       └── marlin_qqq_gemm_kernel.cu
    │   │   └── sparse
    │   │       ├── LICENSE
    │   │       ├── common
    │   │           ├── base.h
    │   │           ├── mem.h
    │   │           └── mma.h
    │   │       └── marlin_24_cuda_kernel.cu
    ├── rocm
    │   ├── attention.cu
    │   ├── ops.h
    │   └── torch_bindings.cpp
    └── torch_bindings.cpp
├── docs
    ├── Makefile
    ├── README.md
    ├── make.bat
    ├── requirements-docs.txt
    └── source
    │   ├── _static
    │       └── custom.js
    │   ├── _templates
    │       └── sections
    │       │   └── header.html
    │   ├── assets
    │       ├── dev
    │       │   └── dockerfile-stages-dependency.png
    │       ├── kernel
    │       │   ├── k_vecs.png
    │       │   ├── key.png
    │       │   ├── logits_vec.png
    │       │   ├── q_vecs.png
    │       │   ├── query.png
    │       │   ├── v_vec.png
    │       │   └── value.png
    │       └── logos
    │       │   ├── vllm-logo-only-light.png
    │       │   ├── vllm-logo-text-dark.png
    │       │   └── vllm-logo-text-light.png
    │   ├── automatic_prefix_caching
    │       ├── apc.rst
    │       └── details.md
    │   ├── community
    │       ├── meetups.rst
    │       └── sponsors.md
    │   ├── conf.py
    │   ├── dev
    │       ├── dockerfile
    │       │   └── dockerfile.rst
    │       ├── engine
    │       │   ├── async_llm_engine.rst
    │       │   ├── engine_index.rst
    │       │   └── llm_engine.rst
    │       ├── input_processing
    │       │   ├── input_processing_pipeline.rst
    │       │   └── model_inputs_index.rst
    │       ├── kernel
    │       │   └── paged_attention.rst
    │       ├── multimodal
    │       │   ├── adding_multimodal_plugin.rst
    │       │   └── multimodal_index.rst
    │       ├── offline_inference
    │       │   ├── llm.rst
    │       │   ├── llm_inputs.rst
    │       │   └── offline_index.rst
    │       ├── profiling
    │       │   └── profiling_index.rst
    │       └── sampling_params.rst
    │   ├── generate_examples.py
    │   ├── getting_started
    │       ├── amd-installation.rst
    │       ├── cpu-installation.rst
    │       ├── debugging.rst
    │       ├── examples
    │       │   └── examples_index.template.rst
    │       ├── installation.rst
    │       ├── neuron-installation.rst
    │       ├── openvino-installation.rst
    │       ├── quickstart.rst
    │       ├── tpu-installation.rst
    │       └── xpu-installation.rst
    │   ├── index.rst
    │   ├── models
    │       ├── adding_model.rst
    │       ├── enabling_multimodal_inputs.rst
    │       ├── engine_args.rst
    │       ├── lora.rst
    │       ├── performance.rst
    │       ├── spec_decode.rst
    │       ├── supported_models.rst
    │       └── vlm.rst
    │   ├── performance_benchmark
    │       └── benchmarks.rst
    │   ├── quantization
    │       ├── auto_awq.rst
    │       ├── bnb.rst
    │       ├── fp8.rst
    │       ├── fp8_e4m3_kvcache.rst
    │       ├── fp8_e5m2_kvcache.rst
    │       ├── gguf.rst
    │       ├── int8.rst
    │       └── supported_hardware.rst
    │   └── serving
    │       ├── compatibility_matrix.rst
    │       ├── deploying_with_bentoml.rst
    │       ├── deploying_with_cerebrium.rst
    │       ├── deploying_with_docker.rst
    │       ├── deploying_with_dstack.rst
    │       ├── deploying_with_k8s.rst
    │       ├── deploying_with_kserve.rst
    │       ├── deploying_with_lws.rst
    │       ├── deploying_with_triton.rst
    │       ├── distributed_serving.rst
    │       ├── env_vars.rst
    │       ├── faq.rst
    │       ├── integrations.rst
    │       ├── metrics.rst
    │       ├── openai_compatible_server.md
    │       ├── run_on_sky.rst
    │       ├── serving_with_langchain.rst
    │       ├── serving_with_llamaindex.rst
    │       ├── tensorizer.rst
    │       └── usage_stats.md
├── examples
    ├── api_client.py
    ├── aqlm_example.py
    ├── cpu_offload.py
    ├── fp8
    │   ├── README.md
    │   ├── extract_scales.py
    │   └── quantizer
    │   │   ├── README.md
    │   │   └── quantize.py
    ├── gguf_inference.py
    ├── gradio_openai_chatbot_webserver.py
    ├── gradio_webserver.py
    ├── llm_engine_example.py
    ├── logging_configuration.md
    ├── lora_with_quantization_inference.py
    ├── multilora_inference.py
    ├── offline_chat_with_tools.py
    ├── offline_inference.py
    ├── offline_inference_arctic.py
    ├── offline_inference_audio_language.py
    ├── offline_inference_chat.py
    ├── offline_inference_distributed.py
    ├── offline_inference_embedding.py
    ├── offline_inference_encoder_decoder.py
    ├── offline_inference_mlpspeculator.py
    ├── offline_inference_neuron.py
    ├── offline_inference_neuron_int8_quantization.py
    ├── offline_inference_openai.md
    ├── offline_inference_pixtral.py
    ├── offline_inference_tpu.py
    ├── offline_inference_vision_language.py
    ├── offline_inference_vision_language_multi_image.py
    ├── offline_inference_with_prefix.py
    ├── offline_inference_with_profiler.py
    ├── openai_audio_api_client.py
    ├── openai_chat_completion_client.py
    ├── openai_chat_completion_client_with_tools.py
    ├── openai_completion_client.py
    ├── openai_embedding_client.py
    ├── openai_example_batch.jsonl
    ├── openai_vision_api_client.py
    ├── production_monitoring
    │   ├── Otel.md
    │   ├── README.md
    │   ├── docker-compose.yaml
    │   ├── dummy_client.py
    │   ├── grafana.json
    │   └── prometheus.yaml
    ├── run_cluster.sh
    ├── save_sharded_state.py
    ├── template_alpaca.jinja
    ├── template_baichuan.jinja
    ├── template_blip2.jinja
    ├── template_chatglm.jinja
    ├── template_chatglm2.jinja
    ├── template_chatml.jinja
    ├── template_falcon.jinja
    ├── template_falcon_180b.jinja
    ├── template_inkbot.jinja
    ├── template_llava.jinja
    ├── tensorize_vllm_model.py
    ├── tool_chat_template_hermes.jinja
    ├── tool_chat_template_internlm2_tool.jinja
    ├── tool_chat_template_llama3.1_json.jinja
    ├── tool_chat_template_llama3.2_json.jinja
    ├── tool_chat_template_mistral.jinja
    └── tool_chat_template_mistral_parallel.jinja
├── find_cuda_init.py
├── format.sh
├── pyproject.toml
├── python_only_dev.py
├── requirements-build.txt
├── requirements-common.txt
├── requirements-cpu.txt
├── requirements-cuda.txt
├── requirements-dev.txt
├── requirements-lint.txt
├── requirements-neuron.txt
├── requirements-openvino.txt
├── requirements-rocm.txt
├── requirements-test.txt
├── requirements-tpu.txt
├── requirements-xpu.txt
├── setup.py
├── tests
    ├── __init__.py
    ├── async_engine
    │   ├── __init__.py
    │   ├── api_server_async_engine.py
    │   ├── test_api_server.py
    │   ├── test_async_llm_engine.py
    │   └── test_request_tracker.py
    ├── basic_correctness
    │   ├── __init__.py
    │   ├── test_basic_correctness.py
    │   ├── test_chunked_prefill.py
    │   ├── test_cpu_offload.py
    │   └── test_preemption.py
    ├── compile
    │   ├── __init__.py
    │   ├── test_basic_correctness.py
    │   ├── test_full_graph.py
    │   ├── test_wrapper.py
    │   └── utils.py
    ├── conftest.py
    ├── core
    │   ├── __init__.py
    │   ├── block
    │   │   ├── __init__.py
    │   │   ├── conftest.py
    │   │   ├── e2e
    │   │   │   ├── __init__.py
    │   │   │   ├── conftest.py
    │   │   │   ├── test_correctness.py
    │   │   │   └── test_correctness_sliding_window.py
    │   │   ├── test_block_manager_v2.py
    │   │   ├── test_block_table.py
    │   │   ├── test_common.py
    │   │   ├── test_cpu_gpu_block_allocator.py
    │   │   ├── test_naive_block.py
    │   │   └── test_prefix_caching_block.py
    │   ├── test_block_manager.py
    │   ├── test_chunked_prefill_scheduler.py
    │   ├── test_num_computed_tokens_update.py
    │   ├── test_scheduler.py
    │   ├── test_scheduler_encoder_decoder.py
    │   ├── test_serialization.py
    │   └── utils.py
    ├── data
    │   └── test_config.yaml
    ├── distributed
    │   ├── __init__.py
    │   ├── test_comm_ops.py
    │   ├── test_custom_all_reduce.py
    │   ├── test_distributed_oot.py
    │   ├── test_multi_node_assignment.py
    │   ├── test_pipeline_parallel.py
    │   ├── test_pipeline_partition.py
    │   ├── test_pp_cudagraph.py
    │   ├── test_pynccl.py
    │   ├── test_same_node.py
    │   ├── test_shm_broadcast.py
    │   └── test_utils.py
    ├── encoder_decoder
    │   ├── __init__.py
    │   └── test_e2e_correctness.py
    ├── engine
    │   ├── __init__.py
    │   ├── output_processor
    │   │   ├── __init__.py
    │   │   ├── test_multi_step.py
    │   │   └── test_stop_checker.py
    │   ├── test_arg_utils.py
    │   ├── test_computed_prefix_blocks.py
    │   ├── test_custom_executor.py
    │   ├── test_detokenization.py
    │   ├── test_multiproc_workers.py
    │   ├── test_skip_tokenizer_init.py
    │   ├── test_stop_reason.py
    │   └── test_stop_strings.py
    ├── entrypoints
    │   ├── __init__.py
    │   ├── conftest.py
    │   ├── llm
    │   │   ├── __init__.py
    │   │   ├── test_encode.py
    │   │   ├── test_generate.py
    │   │   ├── test_generate_multiple_loras.py
    │   │   ├── test_guided_generate.py
    │   │   ├── test_lazy_outlines.py
    │   │   └── test_prompt_validation.py
    │   ├── offline_mode
    │   │   ├── __init__.py
    │   │   └── test_offline_mode.py
    │   ├── openai
    │   │   ├── __init__.py
    │   │   ├── test_accuracy.py
    │   │   ├── test_audio.py
    │   │   ├── test_basic.py
    │   │   ├── test_chat.py
    │   │   ├── test_chat_template.py
    │   │   ├── test_cli_args.py
    │   │   ├── test_completion.py
    │   │   ├── test_embedding.py
    │   │   ├── test_encoder_decoder.py
    │   │   ├── test_lora_lineage.py
    │   │   ├── test_metrics.py
    │   │   ├── test_models.py
    │   │   ├── test_oot_registration.py
    │   │   ├── test_prompt_validation.py
    │   │   ├── test_return_tokens_as_ids.py
    │   │   ├── test_run_batch.py
    │   │   ├── test_serving_chat.py
    │   │   ├── test_serving_engine.py
    │   │   ├── test_shutdown.py
    │   │   ├── test_tokenization.py
    │   │   └── test_vision.py
    │   └── test_chat_utils.py
    ├── fp8_kv
    │   ├── llama2-70b-fp8-kv
    │   │   └── kv_cache_scales.json
    │   └── llama2-7b-fp8-kv
    │   │   └── kv_cache_scales.json
    ├── kernels
    │   ├── __init__.py
    │   ├── allclose_default.py
    │   ├── conftest.py
    │   ├── quant_utils.py
    │   ├── test_activation.py
    │   ├── test_aqlm.py
    │   ├── test_attention.py
    │   ├── test_attention_selector.py
    │   ├── test_awq.py
    │   ├── test_awq_marlin.py
    │   ├── test_awq_triton.py
    │   ├── test_blocksparse_attention.py
    │   ├── test_cache.py
    │   ├── test_causal_conv1d.py
    │   ├── test_cutlass.py
    │   ├── test_encoder_decoder_attn.py
    │   ├── test_flash_attn.py
    │   ├── test_flashinfer.py
    │   ├── test_fp8_quant.py
    │   ├── test_ggml.py
    │   ├── test_gguf.py
    │   ├── test_gptq.py
    │   ├── test_int8_quant.py
    │   ├── test_layernorm.py
    │   ├── test_machete_gemm.py
    │   ├── test_mamba_ssm.py
    │   ├── test_marlin_gemm.py
    │   ├── test_moe.py
    │   ├── test_permute_cols.py
    │   ├── test_pos_encoding.py
    │   ├── test_prefix_prefill.py
    │   ├── test_rotary_embedding.py
    │   ├── test_utils.py
    │   └── utils.py
    ├── lora
    │   ├── __init__.py
    │   ├── conftest.py
    │   ├── data
    │   │   ├── __init__.py
    │   │   └── long_context_test_data.py
    │   ├── test_baichuan.py
    │   ├── test_chatglm3.py
    │   ├── test_gemma.py
    │   ├── test_layers.py
    │   ├── test_llama.py
    │   ├── test_long_context.py
    │   ├── test_lora_checkpoints.py
    │   ├── test_lora_huggingface.py
    │   ├── test_lora_manager.py
    │   ├── test_minicpmv.py
    │   ├── test_minicpmv_tp.py
    │   ├── test_mixtral.py
    │   ├── test_phi.py
    │   ├── test_punica_sizes.py
    │   ├── test_punica_variation.py
    │   ├── test_quant_model.py
    │   ├── test_tokenizer_group.py
    │   ├── test_utils.py
    │   ├── test_worker.py
    │   └── utils.py
    ├── metrics
    │   ├── __init__.py
    │   └── test_metrics.py
    ├── model_executor
    │   ├── __init__.py
    │   ├── conftest.py
    │   ├── test_guided_processors.py
    │   └── weight_utils.py
    ├── models
    │   ├── __init__.py
    │   ├── decoder_only
    │   │   ├── __init__.py
    │   │   ├── audio_language
    │   │   │   ├── __init__.py
    │   │   │   └── test_ultravox.py
    │   │   ├── language
    │   │   │   ├── __init__.py
    │   │   │   ├── test_aqlm.py
    │   │   │   ├── test_big_models.py
    │   │   │   ├── test_danube3_4b.py
    │   │   │   ├── test_fp8.py
    │   │   │   ├── test_gguf.py
    │   │   │   ├── test_gptq_marlin.py
    │   │   │   ├── test_gptq_marlin_24.py
    │   │   │   ├── test_granite.py
    │   │   │   ├── test_granitemoe.py
    │   │   │   ├── test_jamba.py
    │   │   │   ├── test_mamba.py
    │   │   │   ├── test_marlin.py
    │   │   │   ├── test_mistral.py
    │   │   │   ├── test_modelopt.py
    │   │   │   ├── test_models.py
    │   │   │   └── test_phimoe.py
    │   │   └── vision_language
    │   │   │   ├── __init__.py
    │   │   │   ├── test_blip2.py
    │   │   │   ├── test_broadcast.py
    │   │   │   ├── test_chameleon.py
    │   │   │   ├── test_fuyu.py
    │   │   │   ├── test_glm4.py
    │   │   │   ├── test_intern_vit.py
    │   │   │   ├── test_internvl.py
    │   │   │   ├── test_llava.py
    │   │   │   ├── test_llava_image_embeds.py
    │   │   │   ├── test_llava_next.py
    │   │   │   ├── test_llava_next_video.py
    │   │   │   ├── test_llava_onevision.py
    │   │   │   ├── test_minicpmv.py
    │   │   │   ├── test_paligemma.py
    │   │   │   ├── test_phi3v.py
    │   │   │   ├── test_pixtral.py
    │   │   │   └── test_qwen.py
    │   ├── embedding
    │   │   ├── __init__.py
    │   │   └── language
    │   │   │   ├── __init__.py
    │   │   │   └── test_embedding.py
    │   ├── encoder_decoder
    │   │   ├── __init__.py
    │   │   ├── language
    │   │   │   ├── __init__.py
    │   │   │   └── test_bart.py
    │   │   └── vision_language
    │   │   │   ├── __init__.py
    │   │   │   ├── test_broadcast.py
    │   │   │   └── test_mllama.py
    │   ├── fixtures
    │   │   ├── pixtral_chat.json
    │   │   └── pixtral_chat_engine.json
    │   ├── test_oot_registration.py
    │   ├── test_registry.py
    │   └── utils.py
    ├── mq_llm_engine
    │   ├── __init__.py
    │   ├── test_abort.py
    │   ├── test_error_handling.py
    │   ├── test_load.py
    │   └── utils.py
    ├── multi_step
    │   ├── __init__.py
    │   ├── test_correctness_async_llm.py
    │   └── test_correctness_llm.py
    ├── multimodal
    │   ├── __init__.py
    │   ├── test_base.py
    │   ├── test_mapper.py
    │   ├── test_processor_kwargs.py
    │   └── test_utils.py
    ├── plugins
    │   └── vllm_add_dummy_model
    │   │   ├── setup.py
    │   │   └── vllm_add_dummy_model
    │   │       ├── __init__.py
    │   │       ├── my_gemma_embedding.py
    │   │       ├── my_llava.py
    │   │       └── my_opt.py
    ├── prefix_caching
    │   ├── __init__.py
    │   ├── test_disable_sliding_window.py
    │   └── test_prefix_caching.py
    ├── prompt_adapter
    │   ├── test_bloom.py
    │   ├── test_multi_adapter_inference.py
    │   └── test_pa_lora.py
    ├── prompts
    │   ├── example.txt
    │   └── summary.txt
    ├── quantization
    │   ├── __init__.py
    │   ├── test_bitsandbytes.py
    │   ├── test_compressed_tensors.py
    │   ├── test_configs.py
    │   ├── test_cpu_offload.py
    │   ├── test_experts_int8.py
    │   ├── test_fp8.py
    │   ├── test_ipex_quant.py
    │   ├── test_lm_head.py
    │   └── utils.py
    ├── samplers
    │   ├── __init__.py
    │   ├── test_beam_search.py
    │   ├── test_ignore_eos.py
    │   ├── test_logits_processor.py
    │   ├── test_logprobs.py
    │   ├── test_ranks.py
    │   ├── test_rejection_sampler.py
    │   ├── test_sampler.py
    │   ├── test_seeded_generate.py
    │   └── test_typical_acceptance_sampler.py
    ├── spec_decode
    │   ├── __init__.py
    │   ├── e2e
    │   │   ├── __init__.py
    │   │   ├── conftest.py
    │   │   ├── test_compatibility.py
    │   │   ├── test_eagle_correctness.py
    │   │   ├── test_integration.py
    │   │   ├── test_integration_dist_tp2.py
    │   │   ├── test_integration_dist_tp4.py
    │   │   ├── test_logprobs.py
    │   │   ├── test_medusa_correctness.py
    │   │   ├── test_mlp_correctness.py
    │   │   ├── test_multistep_correctness.py
    │   │   ├── test_ngram_correctness.py
    │   │   └── test_seed.py
    │   ├── test_batch_expansion.py
    │   ├── test_dynamic_spec_decode.py
    │   ├── test_metrics.py
    │   ├── test_multi_step_worker.py
    │   ├── test_ngram_worker.py
    │   ├── test_scorer.py
    │   ├── test_spec_decode_worker.py
    │   ├── test_utils.py
    │   └── utils.py
    ├── tensorizer_loader
    │   ├── __init__.py
    │   ├── conftest.py
    │   └── test_tensorizer.py
    ├── test_cache_block_hashing.py
    ├── test_config.py
    ├── test_embedded_commit.py
    ├── test_inputs.py
    ├── test_logger.py
    ├── test_logits_processor.py
    ├── test_regression.py
    ├── test_sampling_params.py
    ├── test_scalartype.py
    ├── test_sequence.py
    ├── test_sharded_state_loader.py
    ├── test_utils.py
    ├── tokenization
    │   ├── __init__.py
    │   ├── test_cached_tokenizer.py
    │   ├── test_detokenize.py
    │   ├── test_get_eos.py
    │   ├── test_tokenizer.py
    │   └── test_tokenizer_group.py
    ├── tool_use
    │   ├── __init__.py
    │   ├── conftest.py
    │   ├── test_chat_completion_request_validations.py
    │   ├── test_chat_completions.py
    │   ├── test_parallel_tool_calls.py
    │   ├── test_tool_calls.py
    │   └── utils.py
    ├── tpu
    │   ├── __init__.py
    │   ├── test_compilation.py
    │   └── test_custom_dispatcher.py
    ├── tracing
    │   ├── __init__.py
    │   └── test_tracing.py
    ├── utils.py
    ├── weight_loading
    │   ├── models-large.txt
    │   ├── models.txt
    │   ├── run_model_weight_loading_test.sh
    │   └── test_weight_loading.py
    └── worker
    │   ├── __init__.py
    │   ├── test_encoder_decoder_model_runner.py
    │   ├── test_model_input.py
    │   ├── test_model_runner.py
    │   └── test_swap.py
├── tools
    ├── actionlint.sh
    ├── mypy.sh
    └── report_build_time_ninja.py
├── use_existing_torch.py
└── vllm
    ├── __init__.py
    ├── _core_ext.py
    ├── _custom_ops.py
    ├── _ipex_ops.py
    ├── adapter_commons
        ├── __init__.py
        ├── layers.py
        ├── models.py
        ├── request.py
        ├── utils.py
        └── worker_manager.py
    ├── assets
        ├── __init__.py
        ├── audio.py
        ├── base.py
        ├── image.py
        └── video.py
    ├── attention
        ├── __init__.py
        ├── backends
        │   ├── __init__.py
        │   ├── abstract.py
        │   ├── blocksparse_attn.py
        │   ├── flash_attn.py
        │   ├── flashinfer.py
        │   ├── ipex_attn.py
        │   ├── openvino.py
        │   ├── pallas.py
        │   ├── placeholder_attn.py
        │   ├── rocm_flash_attn.py
        │   ├── torch_sdpa.py
        │   ├── utils.py
        │   └── xformers.py
        ├── layer.py
        ├── ops
        │   ├── __init__.py
        │   ├── blocksparse_attention
        │   │   ├── __init__.py
        │   │   ├── blocksparse_attention_kernel.py
        │   │   ├── interface.py
        │   │   └── utils.py
        │   ├── ipex_attn.py
        │   ├── paged_attn.py
        │   ├── prefix_prefill.py
        │   └── triton_flash_attention.py
        └── selector.py
    ├── beam_search.py
    ├── block.py
    ├── compilation
        ├── __init__.py
        ├── backends.py
        ├── compile_context.py
        ├── decorators.py
        ├── levels.py
        └── wrapper.py
    ├── config.py
    ├── connections.py
    ├── core
        ├── __init__.py
        ├── block
        │   ├── __init__.py
        │   ├── block_table.py
        │   ├── common.py
        │   ├── cpu_gpu_block_allocator.py
        │   ├── interfaces.py
        │   ├── naive_block.py
        │   ├── prefix_caching_block.py
        │   └── utils.py
        ├── block_manager_v1.py
        ├── block_manager_v2.py
        ├── evictor_v1.py
        ├── evictor_v2.py
        ├── interfaces.py
        ├── placeholder_block_space_manager.py
        └── scheduler.py
    ├── distributed
        ├── __init__.py
        ├── communication_op.py
        ├── device_communicators
        │   ├── __init__.py
        │   ├── cuda_wrapper.py
        │   ├── custom_all_reduce.py
        │   ├── custom_all_reduce_utils.py
        │   ├── pynccl.py
        │   ├── pynccl_wrapper.py
        │   ├── shm_broadcast.py
        │   └── tpu_communicator.py
        ├── parallel_state.py
        └── utils.py
    ├── engine
        ├── __init__.py
        ├── arg_utils.py
        ├── async_llm_engine.py
        ├── async_timeout.py
        ├── llm_engine.py
        ├── metrics.py
        ├── metrics_types.py
        ├── multiprocessing
        │   ├── __init__.py
        │   ├── client.py
        │   └── engine.py
        ├── output_processor
        │   ├── __init__.py
        │   ├── interfaces.py
        │   ├── multi_step.py
        │   ├── single_step.py
        │   ├── stop_checker.py
        │   └── util.py
        └── protocol.py
    ├── entrypoints
        ├── __init__.py
        ├── api_server.py
        ├── chat_utils.py
        ├── launcher.py
        ├── llm.py
        ├── logger.py
        └── openai
        │   ├── __init__.py
        │   ├── api_server.py
        │   ├── cli_args.py
        │   ├── logits_processors.py
        │   ├── protocol.py
        │   ├── run_batch.py
        │   ├── serving_chat.py
        │   ├── serving_completion.py
        │   ├── serving_embedding.py
        │   ├── serving_engine.py
        │   ├── serving_tokenization.py
        │   └── tool_parsers
        │       ├── __init__.py
        │       ├── abstract_tool_parser.py
        │       ├── hermes_tool_parser.py
        │       ├── internlm2_tool_parser.py
        │       ├── llama_tool_parser.py
        │       ├── mistral_tool_parser.py
        │       └── utils.py
    ├── envs.py
    ├── executor
        ├── __init__.py
        ├── cpu_executor.py
        ├── distributed_gpu_executor.py
        ├── executor_base.py
        ├── gpu_executor.py
        ├── msgspec_utils.py
        ├── multiproc_gpu_executor.py
        ├── multiproc_worker_utils.py
        ├── multiproc_xpu_executor.py
        ├── neuron_executor.py
        ├── openvino_executor.py
        ├── ray_gpu_executor.py
        ├── ray_tpu_executor.py
        ├── ray_utils.py
        ├── ray_xpu_executor.py
        ├── tpu_executor.py
        └── xpu_executor.py
    ├── forward_context.py
    ├── inputs
        ├── __init__.py
        ├── data.py
        ├── parse.py
        ├── preprocess.py
        └── registry.py
    ├── logger.py
    ├── logging
        ├── __init__.py
        └── formatter.py
    ├── lora
        ├── __init__.py
        ├── fully_sharded_layers.py
        ├── layers.py
        ├── lora.py
        ├── models.py
        ├── ops
        │   ├── __init__.py
        │   ├── bgmv_expand.py
        │   ├── bgmv_expand_slice.py
        │   ├── bgmv_shrink.py
        │   ├── sgmv_expand.py
        │   ├── sgmv_expand_slice.py
        │   ├── sgmv_shrink.py
        │   └── utils.py
        ├── punica.py
        ├── request.py
        ├── utils.py
        └── worker_manager.py
    ├── model_executor
        ├── __init__.py
        ├── custom_op.py
        ├── guided_decoding
        │   ├── __init__.py
        │   ├── guided_fields.py
        │   ├── lm_format_enforcer_decoding.py
        │   ├── outlines_decoding.py
        │   └── outlines_logits_processors.py
        ├── layers
        │   ├── __init__.py
        │   ├── activation.py
        │   ├── fused_moe
        │   │   ├── __init__.py
        │   │   ├── configs
        │   │   │   ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
        │   │   │   ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
        │   │   │   ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
        │   │   │   ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
        │   │   │   ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json
        │   │   │   ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
        │   │   │   ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
        │   │   │   ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json
        │   │   │   ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
        │   │   │   ├── E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
        │   │   │   ├── E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
        │   │   │   ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
        │   │   │   ├── E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
        │   │   │   ├── E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
        │   │   │   ├── E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
        │   │   │   ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
        │   │   │   ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
        │   │   │   ├── E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
        │   │   │   ├── E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
        │   │   │   ├── E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
        │   │   │   ├── E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json
        │   │   │   ├── E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json
        │   │   │   ├── E=8,N=14336,device_name=AMD_Instinct_MI300X.json
        │   │   │   ├── E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
        │   │   │   ├── E=8,N=1792,device_name=AMD_Instinct_MI300X.json
        │   │   │   ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json
        │   │   │   ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json
        │   │   │   ├── E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
        │   │   │   ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
        │   │   │   ├── E=8,N=3584,device_name=AMD_Instinct_MI300X.json
        │   │   │   ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json
        │   │   │   ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
        │   │   │   ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json
        │   │   │   ├── E=8,N=3584,device_name=NVIDIA_L40S.json
        │   │   │   ├── E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
        │   │   │   ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json
        │   │   │   ├── E=8,N=7168,device_name=AMD_Instinct_MI300X.json
        │   │   │   ├── E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
        │   │   │   ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
        │   │   │   ├── E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
        │   │   │   └── README
        │   │   ├── fused_marlin_moe.py
        │   │   ├── fused_moe.py
        │   │   ├── layer.py
        │   │   └── moe_pallas.py
        │   ├── layernorm.py
        │   ├── linear.py
        │   ├── logits_processor.py
        │   ├── mamba
        │   │   ├── __init__.py
        │   │   └── ops
        │   │   │   ├── __init__.py
        │   │   │   ├── causal_conv1d.py
        │   │   │   └── mamba_ssm.py
        │   ├── pooler.py
        │   ├── quantization
        │   │   ├── __init__.py
        │   │   ├── aqlm.py
        │   │   ├── awq.py
        │   │   ├── awq_marlin.py
        │   │   ├── awq_triton.py
        │   │   ├── base_config.py
        │   │   ├── bitsandbytes.py
        │   │   ├── compressed_tensors
        │   │   │   ├── __init__.py
        │   │   │   ├── compressed_tensors.py
        │   │   │   ├── compressed_tensors_moe.py
        │   │   │   ├── schemes
        │   │   │   │   ├── __init__.py
        │   │   │   │   ├── compressed_tensors_scheme.py
        │   │   │   │   ├── compressed_tensors_w4a16_24.py
        │   │   │   │   ├── compressed_tensors_w8a16_fp8.py
        │   │   │   │   ├── compressed_tensors_w8a8_fp8.py
        │   │   │   │   ├── compressed_tensors_w8a8_int8.py
        │   │   │   │   └── compressed_tensors_wNa16.py
        │   │   │   └── utils.py
        │   │   ├── deepspeedfp.py
        │   │   ├── experts_int8.py
        │   │   ├── fbgemm_fp8.py
        │   │   ├── fp8.py
        │   │   ├── gguf.py
        │   │   ├── gptq.py
        │   │   ├── gptq_marlin.py
        │   │   ├── gptq_marlin_24.py
        │   │   ├── ipex_quant.py
        │   │   ├── kernels
        │   │   │   ├── MPLinearKernel.py
        │   │   │   ├── __init__.py
        │   │   │   ├── machete.py
        │   │   │   └── marlin.py
        │   │   ├── kv_cache.py
        │   │   ├── marlin.py
        │   │   ├── modelopt.py
        │   │   ├── neuron_quant.py
        │   │   ├── qqq.py
        │   │   ├── schema.py
        │   │   ├── tpu_int8.py
        │   │   └── utils
        │   │   │   ├── __init__.py
        │   │   │   ├── layer_utils.py
        │   │   │   ├── machete_utils.py
        │   │   │   ├── marlin_utils.py
        │   │   │   ├── marlin_utils_fp8.py
        │   │   │   ├── marlin_utils_test.py
        │   │   │   ├── marlin_utils_test_24.py
        │   │   │   ├── marlin_utils_test_qqq.py
        │   │   │   ├── quant_utils.py
        │   │   │   └── w8a8_utils.py
        │   ├── rejection_sampler.py
        │   ├── resampler.py
        │   ├── rotary_embedding.py
        │   ├── sampler.py
        │   ├── spec_decode_base_sampler.py
        │   ├── typical_acceptance_sampler.py
        │   └── vocab_parallel_embedding.py
        ├── model_loader
        │   ├── __init__.py
        │   ├── loader.py
        │   ├── neuron.py
        │   ├── openvino.py
        │   ├── tensorizer.py
        │   ├── utils.py
        │   └── weight_utils.py
        ├── models
        │   ├── __init__.py
        │   ├── arctic.py
        │   ├── baichuan.py
        │   ├── bart.py
        │   ├── blip.py
        │   ├── blip2.py
        │   ├── bloom.py
        │   ├── chameleon.py
        │   ├── chatglm.py
        │   ├── clip.py
        │   ├── commandr.py
        │   ├── dbrx.py
        │   ├── decilm.py
        │   ├── deepseek.py
        │   ├── deepseek_v2.py
        │   ├── eagle.py
        │   ├── exaone.py
        │   ├── falcon.py
        │   ├── fuyu.py
        │   ├── gemma.py
        │   ├── gemma2.py
        │   ├── gemma2_embedding.py
        │   ├── glm4_vision_encoder.py
        │   ├── gpt2.py
        │   ├── gpt_bigcode.py
        │   ├── gpt_j.py
        │   ├── gpt_neox.py
        │   ├── granite.py
        │   ├── granitemoe.py
        │   ├── idefics2_vision_model.py
        │   ├── interfaces.py
        │   ├── interfaces_base.py
        │   ├── intern_vit.py
        │   ├── internlm2.py
        │   ├── internvl.py
        │   ├── jais.py
        │   ├── jamba.py
        │   ├── llama.py
        │   ├── llama_embedding.py
        │   ├── llava.py
        │   ├── llava_next.py
        │   ├── llava_next_video.py
        │   ├── llava_onevision.py
        │   ├── mamba.py
        │   ├── mamba_cache.py
        │   ├── medusa.py
        │   ├── minicpm.py
        │   ├── minicpm3.py
        │   ├── minicpmv.py
        │   ├── mixtral.py
        │   ├── mixtral_quant.py
        │   ├── mllama.py
        │   ├── mlp_speculator.py
        │   ├── module_mapping.py
        │   ├── molmo.py
        │   ├── mpt.py
        │   ├── nemotron.py
        │   ├── nvlm_d.py
        │   ├── olmo.py
        │   ├── olmoe.py
        │   ├── opt.py
        │   ├── orion.py
        │   ├── paligemma.py
        │   ├── persimmon.py
        │   ├── phi.py
        │   ├── phi3.py
        │   ├── phi3_small.py
        │   ├── phi3v.py
        │   ├── phimoe.py
        │   ├── pixtral.py
        │   ├── qwen.py
        │   ├── qwen2.py
        │   ├── qwen2_moe.py
        │   ├── qwen2_rm.py
        │   ├── qwen2_vl.py
        │   ├── registry.py
        │   ├── siglip.py
        │   ├── solar.py
        │   ├── stablelm.py
        │   ├── starcoder2.py
        │   ├── ultravox.py
        │   ├── utils.py
        │   └── xverse.py
        ├── parameter.py
        ├── pooling_metadata.py
        ├── sampling_metadata.py
        └── utils.py
    ├── multimodal
        ├── __init__.py
        ├── audio.py
        ├── base.py
        ├── image.py
        ├── registry.py
        ├── utils.py
        └── video.py
    ├── outputs.py
    ├── platforms
        ├── __init__.py
        ├── cpu.py
        ├── cuda.py
        ├── interface.py
        ├── rocm.py
        ├── tpu.py
        └── xpu.py
    ├── plugins
        └── __init__.py
    ├── pooling_params.py
    ├── prompt_adapter
        ├── __init__.py
        ├── layers.py
        ├── models.py
        ├── request.py
        ├── utils.py
        └── worker_manager.py
    ├── py.typed
    ├── sampling_params.py
    ├── scalar_type.py
    ├── scripts.py
    ├── sequence.py
    ├── spec_decode
        ├── __init__.py
        ├── batch_expansion.py
        ├── draft_model_runner.py
        ├── interfaces.py
        ├── medusa_worker.py
        ├── metrics.py
        ├── mlp_speculator_worker.py
        ├── mqa_scorer.py
        ├── multi_step_worker.py
        ├── ngram_worker.py
        ├── proposer_worker_base.py
        ├── smaller_tp_proposer_worker.py
        ├── spec_decode_worker.py
        ├── target_model_runner.py
        ├── top1_proposer.py
        └── util.py
    ├── tracing.py
    ├── transformers_utils
        ├── __init__.py
        ├── config.py
        ├── configs
        │   ├── __init__.py
        │   ├── arctic.py
        │   ├── chatglm.py
        │   ├── dbrx.py
        │   ├── eagle.py
        │   ├── exaone.py
        │   ├── falcon.py
        │   ├── internvl.py
        │   ├── jais.py
        │   ├── medusa.py
        │   ├── mllama.py
        │   ├── mlp_speculator.py
        │   ├── mpt.py
        │   ├── nemotron.py
        │   ├── nvlm_d.py
        │   ├── qwen2vl.py
        │   ├── solar.py
        │   └── ultravox.py
        ├── detokenizer.py
        ├── processor.py
        ├── tokenizer.py
        ├── tokenizer_group
        │   ├── __init__.py
        │   ├── base_tokenizer_group.py
        │   ├── ray_tokenizer_group.py
        │   └── tokenizer_group.py
        ├── tokenizers
        │   ├── __init__.py
        │   └── mistral.py
        └── utils.py
    ├── triton_utils
        ├── __init__.py
        ├── custom_cache_manager.py
        ├── importing.py
        └── libentry.py
    ├── usage
        ├── __init__.py
        └── usage_lib.py
    ├── utils.py
    ├── version.py
    ├── vllm_flash_attn
        └── .gitkeep
    └── worker
        ├── __init__.py
        ├── cache_engine.py
        ├── cpu_enc_dec_model_runner.py
        ├── cpu_model_runner.py
        ├── cpu_worker.py
        ├── embedding_model_runner.py
        ├── enc_dec_model_runner.py
        ├── model_runner.py
        ├── model_runner_base.py
        ├── multi_step_model_runner.py
        ├── multi_step_tpu_worker.py
        ├── multi_step_worker.py
        ├── neuron_model_runner.py
        ├── neuron_worker.py
        ├── openvino_model_runner.py
        ├── openvino_worker.py
        ├── tpu_model_runner.py
        ├── tpu_worker.py
        ├── utils.py
        ├── worker.py
        ├── worker_base.py
        ├── xpu_model_runner.py
        └── xpu_worker.py


/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml:
--------------------------------------------------------------------------------
 1 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
 2 | model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
 3 | tasks:
 4 | - name: "gsm8k"
 5 |   metrics:
 6 |   - name: "exact_match,strict-match"
 7 |     value: 0.671
 8 |   - name: "exact_match,flexible-extract"
 9 |     value: 0.664
10 | limit: 1000
11 | num_fewshot: 5
12 | trust_remote_code: True


--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml:
--------------------------------------------------------------------------------
 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
 2 | model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
 3 | tasks:
 4 | - name: "gsm8k"
 5 |   metrics:
 6 |   - name: "exact_match,strict-match"
 7 |     value: 0.905
 8 |   - name: "exact_match,flexible-extract"
 9 |     value: 0.905
10 | limit: 1000
11 | num_fewshot: 5
12 | 


--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml:
--------------------------------------------------------------------------------
 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
 2 | model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
 3 | tasks:
 4 | - name: "gsm8k"
 5 |   metrics:
 6 |   - name: "exact_match,strict-match"
 7 |     value: 0.892
 8 |   - name: "exact_match,flexible-extract"
 9 |     value: 0.892
10 | limit: 250
11 | num_fewshot: 5
12 | 


--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml:
--------------------------------------------------------------------------------
 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
 2 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
 3 | tasks:
 4 | - name: "gsm8k"
 5 |   metrics:
 6 |   - name: "exact_match,strict-match"
 7 |     value: 0.752
 8 |   - name: "exact_match,flexible-extract"
 9 |     value: 0.754
10 | limit: 1000
11 | num_fewshot: 5
12 | 


--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml:
--------------------------------------------------------------------------------
 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
 2 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
 3 | tasks:
 4 | - name: "gsm8k"
 5 |   metrics:
 6 |   - name: "exact_match,strict-match"
 7 |     value: 0.753
 8 |   - name: "exact_match,flexible-extract"
 9 |     value: 0.753
10 | limit: 1000
11 | num_fewshot: 5
12 | 


--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml:
--------------------------------------------------------------------------------
 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
 2 | model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
 3 | tasks:
 4 | - name: "gsm8k"
 5 |   metrics:
 6 |   - name: "exact_match,strict-match"
 7 |     value: 0.755
 8 |   - name: "exact_match,flexible-extract"
 9 |     value: 0.755
10 | limit: 1000
11 | num_fewshot: 5
12 | 


--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml:
--------------------------------------------------------------------------------
 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
 2 | model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
 3 | tasks:
 4 | - name: "gsm8k"
 5 |   metrics:
 6 |   - name: "exact_match,strict-match"
 7 |     value: 0.753
 8 |   - name: "exact_match,flexible-extract"
 9 |     value: 0.753
10 | limit: 1000
11 | num_fewshot: 5
12 | 


--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml:
--------------------------------------------------------------------------------
 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 2 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
 3 | tasks:
 4 | - name: "gsm8k"
 5 |   metrics:
 6 |   - name: "exact_match,strict-match"
 7 |     value: 0.764
 8 |   - name: "exact_match,flexible-extract"
 9 |     value: 0.764
10 | limit: 250
11 | num_fewshot: 5
12 | 


--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml:
--------------------------------------------------------------------------------
 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 2 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
 3 | tasks:
 4 | - name: "gsm8k"
 5 |   metrics:
 6 |   - name: "exact_match,strict-match"
 7 |     value: 0.728
 8 |   - name: "exact_match,flexible-extract"
 9 |     value: 0.728
10 | limit: 250
11 | num_fewshot: 5
12 | 


--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml:
--------------------------------------------------------------------------------
 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
 2 | model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
 3 | tasks:
 4 | - name: "gsm8k"
 5 |   metrics:
 6 |   - name: "exact_match,strict-match"
 7 |     value: 0.758
 8 |   - name: "exact_match,flexible-extract"
 9 |     value: 0.759
10 | limit: 1000
11 | num_fewshot: 5
12 | 


--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml:
--------------------------------------------------------------------------------
 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
 2 | model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
 3 | tasks:
 4 | - name: "gsm8k"
 5 |   metrics:
 6 |   - name: "exact_match,strict-match"
 7 |     value: 0.756
 8 |   - name: "exact_match,flexible-extract"
 9 |     value: 0.752
10 | limit: 250
11 | num_fewshot: 5
12 | 


--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml:
--------------------------------------------------------------------------------
 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
 2 | model_name: "HandH1998/QQQ-Llama-3-8b-g128"
 3 | tasks:
 4 | - name: "gsm8k"
 5 |   metrics:
 6 |   - name: "exact_match,strict-match"
 7 |     value: 0.419
 8 |   - name: "exact_match,flexible-extract"
 9 |     value: 0.416
10 | limit: 1000
11 | num_fewshot: 5
12 | 


--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml:
--------------------------------------------------------------------------------
 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
 2 | model_name: "mgoin/Minitron-4B-Base-FP8"
 3 | tasks:
 4 | - name: "gsm8k"
 5 |   metrics:
 6 |   - name: "exact_match,strict-match"
 7 |     value: 0.233
 8 |   - name: "exact_match,flexible-extract"
 9 |     value: 0.236
10 | limit: 1000
11 | num_fewshot: 5
12 | 


--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml:
--------------------------------------------------------------------------------
 1 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
 2 | model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
 3 | tasks:
 4 | - name: "gsm8k"
 5 |   metrics:
 6 |   - name: "exact_match,strict-match"
 7 |     value: 0.86
 8 |   - name: "exact_match,flexible-extract"
 9 |     value: 0.86
10 | limit: 250
11 | num_fewshot: 5
12 | 


--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml:
--------------------------------------------------------------------------------
 1 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
 2 | model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
 3 | tasks:
 4 | - name: "gsm8k"
 5 |   metrics:
 6 |   - name: "exact_match,strict-match"
 7 |     value: 0.624
 8 |   - name: "exact_match,flexible-extract"
 9 |     value: 0.624
10 | limit: 250
11 | num_fewshot: 5
12 | 


--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml:
--------------------------------------------------------------------------------
 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
 2 | model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
 3 | tasks:
 4 | - name: "gsm8k"
 5 |   metrics:
 6 |   - name: "exact_match,strict-match"
 7 |     value: 0.616
 8 |   - name: "exact_match,flexible-extract"
 9 |     value: 0.632
10 | limit: 250
11 | num_fewshot: 5
12 | 


--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml:
--------------------------------------------------------------------------------
 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
 2 | model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
 3 | tasks:
 4 | - name: "gsm8k"
 5 |   metrics:
 6 |   - name: "exact_match,strict-match"
 7 |     value: 0.578
 8 |   - name: "exact_match,flexible-extract"
 9 |     value: 0.585
10 | limit: 1000
11 | num_fewshot: 5
12 | 


--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml:
--------------------------------------------------------------------------------
 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 2 | model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
 3 | tasks:
 4 | - name: "gsm8k"
 5 |   metrics:
 6 |   - name: "exact_match,strict-match"
 7 |     value: 0.593
 8 |   - name: "exact_match,flexible-extract"
 9 |     value: 0.588
10 | limit: 1000
11 | num_fewshot: 5
12 | 


--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml:
--------------------------------------------------------------------------------
 1 | # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
 2 | model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
 3 | tasks:
 4 | - name: "gsm8k"
 5 |   metrics:
 6 |   - name: "exact_match,strict-match"
 7 |     value: 0.595
 8 |   - name: "exact_match,flexible-extract"
 9 |     value: 0.582
10 | limit: 1000
11 | num_fewshot: 5
12 | 


--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml:
--------------------------------------------------------------------------------
 1 | # bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
 2 | model_name: "Qwen/Qwen2-57B-A14B-Instruct"
 3 | tasks:
 4 | - name: "gsm8k"
 5 |   metrics:
 6 |   - name: "exact_match,strict-match"
 7 |     value: 0.792
 8 |   - name: "exact_match,flexible-extract"
 9 |     value: 0.824
10 | limit: 250
11 | num_fewshot: 5
12 | 


--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/models-large.txt:
--------------------------------------------------------------------------------
1 | Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
2 | Meta-Llama-3-70B-Instruct.yaml
3 | Mixtral-8x7B-Instruct-v0.1.yaml
4 | Qwen2-57B-A14-Instruct.yaml
5 | DeepSeek-V2-Lite-Chat.yaml
6 | 


--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/models-small.txt:
--------------------------------------------------------------------------------
 1 | Meta-Llama-3-8B-Instruct.yaml
 2 | Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 3 | Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
 4 | Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 5 | Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
 6 | Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
 7 | Minitron-4B-Base-FP8.yaml
 8 | Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
 9 | Qwen2-1.5B-Instruct-FP8W8.yaml
10 | Meta-Llama-3-8B-QQQ.yaml
11 | 


--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # We can use this script to compute baseline accuracy on GSM for transformers.
 3 | #
 4 | # Make sure you have lm-eval-harness installed:
 5 | #   pip install lm-eval==0.4.4
 6 | 
 7 | usage() {
 8 |     echo``
 9 |     echo "Runs lm eval harness on GSM8k using huggingface transformers."
10 |     echo "This pathway is intended to be used to create baselines for "
11 |     echo "our automated nm-test-accuracy workflow"
12 |     echo
13 |     echo "usage: ${0} <options>"
14 |     echo
15 |     echo "  -m    - huggingface stub or local directory of the model"
16 |     echo "  -b    - batch size to run the evaluation at"
17 |     echo "  -l    - limit number of samples to run"
18 |     echo "  -f    - number of fewshot samples to use"
19 |     echo
20 | }
21 | 
22 | while getopts "m:b:l:f:" OPT; do
23 |   case ${OPT} in
24 |     m ) 
25 |         MODEL="$OPTARG"
26 |         ;;
27 |     b ) 
28 |         BATCH_SIZE="$OPTARG"
29 |         ;;
30 |     l ) 
31 |         LIMIT="$OPTARG"
32 |         ;;
33 |     f ) 
34 |         FEWSHOT="$OPTARG"
35 |         ;;
36 |     \? ) 
37 |         usage
38 |         exit 1
39 |         ;;
40 |   esac
41 | done
42 | 
43 | lm_eval --model hf \
44 |   --model_args pretrained=$MODEL,parallelize=True \
45 |   --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
46 |   --batch_size $BATCH_SIZE
47 | 


--------------------------------------------------------------------------------
/.buildkite/nightly-benchmarks/nightly-annotation.md:
--------------------------------------------------------------------------------
 1 | 
 2 | ## Description
 3 | 
 4 | This file contains the downloading link for benchmarking results.
 5 | 
 6 | - [benchmarking pipeline](artifact://nightly-pipeline.yaml)
 7 | - [benchmarking results](artifact://results.zip)
 8 | - [benchmarking code](artifact://nightly-benchmarks.zip)
 9 | 
10 | Please download the visualization scripts in the post
11 | 
12 | 
13 | ## Results reproduction
14 | 
15 | - Find the docker we use in `benchmarking pipeline`
16 | - Deploy the docker, and inside the docker:
17 |   - Download `nightly-benchmarks.zip`. 
18 |   - In the same folder, run the following code
19 | ```
20 | export HF_TOKEN=<your HF token>
21 | apt update
22 | apt install -y git
23 | unzip nightly-benchmarks.zip
24 | VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
25 | ```
26 | 
27 | And the results will be inside `./benchmarks/results`.
28 | 
29 | 


--------------------------------------------------------------------------------
/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | 
 3 | from transformers import AutoTokenizer
 4 | 
 5 | 
 6 | def main(model, cachedir):
 7 |     # Load the tokenizer and save it to the specified directory
 8 |     tokenizer = AutoTokenizer.from_pretrained(model)
 9 |     tokenizer.save_pretrained(cachedir)
10 |     print(f"Tokenizer saved to {cachedir}")
11 | 
12 | 
13 | if __name__ == "__main__":
14 |     parser = argparse.ArgumentParser(
15 |         description="Download and save Hugging Face tokenizer")
16 |     parser.add_argument("--model",
17 |                         type=str,
18 |                         required=True,
19 |                         help="Name of the model")
20 |     parser.add_argument("--cachedir",
21 |                         type=str,
22 |                         required=True,
23 |                         help="Directory to save the tokenizer")
24 | 
25 |     args = parser.parse_args()
26 |     main(args.model, args.cachedir)
27 | 


--------------------------------------------------------------------------------
/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py:
--------------------------------------------------------------------------------
1 | from lmdeploy.serve.openai.api_client import APIClient
2 | 
3 | api_client = APIClient("http://localhost:8000")
4 | model_name = api_client.available_models[0]
5 | 
6 | print(model_name)
7 | 


--------------------------------------------------------------------------------
/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
 3 | URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
 4 | 
 5 | TIMEOUT_SECONDS=10
 6 | 
 7 | retries=0
 8 | while [ $retries -lt 1000 ]; do
 9 |     if [ $(curl -s --max-time $TIMEOUT_SECONDS -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
10 |         exit 0
11 |     fi
12 | 
13 |     echo "Waiting for image to be available..."
14 | 
15 |     retries=$((retries + 1))
16 |     sleep 5
17 | done
18 | 
19 | exit 1


--------------------------------------------------------------------------------
/.buildkite/nightly-benchmarks/tests/latency-tests.json:
--------------------------------------------------------------------------------
 1 | [
 2 |     {
 3 |         "test_name": "latency_llama8B_tp1",
 4 |         "parameters": {
 5 |             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
 6 |             "tensor_parallel_size": 1,
 7 |             "load_format": "dummy",
 8 |             "num_iters_warmup": 5,
 9 |             "num_iters": 15
10 |         }
11 |     },
12 |     {
13 |         "test_name": "latency_llama70B_tp4",
14 |         "parameters": {
15 |             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
16 |             "tensor_parallel_size": 4,
17 |             "load_format": "dummy",
18 |             "num-iters-warmup": 5,
19 |             "num-iters": 15
20 |         }
21 |     },
22 |     {
23 |         "test_name": "latency_mixtral8x7B_tp2",
24 |         "parameters": {
25 |             "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
26 |             "tensor_parallel_size": 2,
27 |             "load_format": "dummy",
28 |             "num-iters-warmup": 5,
29 |             "num-iters": 15
30 |         }
31 |     }
32 | ]


--------------------------------------------------------------------------------
/.buildkite/nightly-benchmarks/tests/throughput-tests.json:
--------------------------------------------------------------------------------
 1 | [
 2 |     {
 3 |         "test_name": "throughput_llama8B_tp1",
 4 |         "parameters": {
 5 |             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
 6 |             "tensor_parallel_size": 1,
 7 |             "load_format": "dummy",
 8 |             "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 9 |             "num_prompts": 200,
10 |             "backend": "vllm"
11 |         }
12 |     },
13 |     {
14 |         "test_name": "throughput_llama70B_tp4",
15 |         "parameters": {
16 |             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
17 |             "tensor_parallel_size": 4,
18 |             "load_format": "dummy",
19 |             "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
20 |             "num_prompts": 200,
21 |             "backend": "vllm"
22 |         }
23 |     },
24 |     {
25 |         "test_name": "throughput_mixtral8x7B_tp2",
26 |         "parameters": {
27 |             "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
28 |             "tensor_parallel_size": 2,
29 |             "load_format": "dummy",
30 |             "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
31 |             "num_prompts": 200,
32 |             "backend": "vllm"
33 |         }
34 |     }
35 | ]


--------------------------------------------------------------------------------
/.buildkite/run-openvino-test.sh:
--------------------------------------------------------------------------------
 1 | # This script build the OpenVINO docker image and run the offline inference inside the container.
 2 | # It serves a sanity check for compilation and basic model usage.
 3 | set -ex
 4 | 
 5 | # Try building the docker image
 6 | docker build -t openvino-test -f Dockerfile.openvino .
 7 | 
 8 | # Setup cleanup
 9 | remove_docker_container() { docker rm -f openvino-test || true; }
10 | trap remove_docker_container EXIT
11 | remove_docker_container
12 | 
13 | # Run the image and launch offline inference
14 | docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py
15 | 


--------------------------------------------------------------------------------
/.buildkite/run-tpu-test.sh:
--------------------------------------------------------------------------------
 1 | set -e
 2 | 
 3 | # Build the docker image.
 4 | docker build -f Dockerfile.tpu -t vllm-tpu .
 5 | 
 6 | # Set up cleanup.
 7 | remove_docker_container() { docker rm -f tpu-test || true; }
 8 | trap remove_docker_container EXIT
 9 | # Remove the container that might not be cleaned up in the previous run.
10 | remove_docker_container
11 | 
12 | # For HF_TOKEN.
13 | source /etc/environment
14 | # Run a simple end-to-end example.
15 | docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest  && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
16 | 


--------------------------------------------------------------------------------
/.buildkite/run-xpu-test.sh:
--------------------------------------------------------------------------------
 1 | # This script build the CPU docker image and run the offline inference inside the container.
 2 | # It serves a sanity check for compilation and basic model usage.
 3 | set -ex
 4 | 
 5 | # Try building the docker image
 6 | docker build -t xpu-test -f Dockerfile.xpu .
 7 | 
 8 | # Setup cleanup
 9 | remove_docker_container() { docker rm -f xpu-test || true; }
10 | trap remove_docker_container EXIT
11 | remove_docker_container
12 | 
13 | # Run the image and launch offline inference
14 | docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py
15 | 


--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
 1 | BasedOnStyle: Google
 2 | UseTab: Never
 3 | IndentWidth: 2
 4 | ColumnLimit: 80
 5 | 
 6 | # Force pointers to the type for C++.
 7 | DerivePointerAlignment: false
 8 | PointerAlignment: Left
 9 | 
10 | # Reordering #include statements can (and currently will) introduce errors
11 | SortIncludes: false
12 | 
13 | # Style choices
14 | AlignConsecutiveAssignments: false
15 | AlignConsecutiveDeclarations: false
16 | IndentPPDirectives: BeforeHash
17 | 
18 | IncludeCategories:
19 |   - Regex:           '^<'
20 |     Priority:        4
21 |   - Regex:           '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/'
22 |     Priority:        3
23 |   - Regex:           '^"(qoda|\.\.)/'
24 |     Priority:        2
25 |   - Regex:           '.*'
26 |     Priority:        1
27 | 


--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
 1 | /.github/
 2 | /.venv
 3 | /build
 4 | dist
 5 | vllm/*.so
 6 | 
 7 | # Byte-compiled / optimized / DLL files
 8 | __pycache__/
 9 | *.py[cod]
10 | *$py.class
11 | 
12 | .mypy_cache
13 | 
14 | # Distribution / packaging
15 | .Python
16 | /build/
17 | cmake-build-*/
18 | CMakeUserPresets.json
19 | develop-eggs/
20 | /dist/
21 | downloads/
22 | eggs/
23 | .eggs/
24 | lib/
25 | lib64/
26 | parts/
27 | sdist/
28 | var/
29 | wheels/
30 | share/python-wheels/
31 | *.egg-info/
32 | .installed.cfg
33 | *.egg
34 | MANIFEST
35 | 


--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: [vllm-project]
2 | open_collective: [vllm]
3 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/100-documentation.yml:
--------------------------------------------------------------------------------
 1 | name: 📚 Documentation
 2 | description: Report an issue related to https://docs.vllm.ai/
 3 | title: "[Doc]: "
 4 | labels: ["documentation"]
 5 | 
 6 | body:
 7 | - type: textarea
 8 |   attributes:
 9 |     label: 📚 The doc issue
10 |     description: >
11 |       A clear and concise description of what content in https://docs.vllm.ai/ is an issue.
12 |   validations:
13 |     required: true
14 | - type: textarea
15 |   attributes:
16 |     label: Suggest a potential alternative/fix
17 |     description: >
18 |       Tell us how we could improve the documentation in this regard.
19 | - type: markdown
20 |   attributes:
21 |     value: >
22 |       Thanks for contributing 🎉!
23 | - type: checkboxes
24 |   id: askllm
25 |   attributes:
26 |     label: Before submitting a new issue...
27 |     options:
28 |       - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
29 |         required: true
30 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/800-misc discussion.yml:
--------------------------------------------------------------------------------
 1 | name: 🎲 Misc/random discussions that do not fit into the above categories.
 2 | description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
 3 | title: "[Misc]: "
 4 | labels: ["misc"]
 5 | 
 6 | body:
 7 | - type: markdown
 8 |   attributes:
 9 |     value: >
10 |       #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
11 | - type: textarea
12 |   attributes:
13 |     label: Anything you want to discuss about vllm.
14 |     description: >
15 |       Anything you want to discuss about vllm.
16 |   validations:
17 |     required: true
18 | - type: markdown
19 |   attributes:
20 |     value: >
21 |       Thanks for contributing 🎉!
22 | - type: checkboxes
23 |   id: askllm
24 |   attributes:
25 |     label: Before submitting a new issue...
26 |     options:
27 |       - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
28 |         required: true
29 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 |   # Maintain dependencies for GitHub Actions
4 |   - package-ecosystem: "github-actions"
5 |     directory: "/"
6 |     schedule:
7 |       interval: "weekly"
8 | 


--------------------------------------------------------------------------------
/.github/workflows/actionlint.yml:
--------------------------------------------------------------------------------
 1 | name: Lint GitHub Actions workflows
 2 | on:
 3 |   push:
 4 |     branches:
 5 |       - "main"
 6 |     paths:
 7 |       - '.github/workflows/*.ya?ml'
 8 |       - '.github/workflows/actionlint.*'
 9 |   pull_request:
10 |     branches:
11 |       - "main"
12 |     paths:
13 |       - '.github/workflows/*.ya?ml'
14 |       - '.github/workflows/actionlint.*'
15 | 
16 | env:
17 |   LC_ALL: en_US.UTF-8
18 | 
19 | defaults:
20 |   run:
21 |     shell: bash
22 | 
23 | permissions:
24 |   contents: read
25 | 
26 | jobs:
27 |   actionlint:
28 |     runs-on: ubuntu-latest
29 |     steps:
30 |       - name: "Checkout"
31 |         uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
32 |         with:
33 |           fetch-depth: 0
34 | 
35 |       - name: "Run actionlint"
36 |         run: |
37 |           tools/actionlint.sh -color
38 | 


--------------------------------------------------------------------------------
/.github/workflows/add_label_automerge.yml:
--------------------------------------------------------------------------------
 1 | name: Add label on auto-merge enabled
 2 | on:
 3 |     pull_request_target:
 4 |         types:
 5 |             - auto_merge_enabled
 6 | jobs:
 7 |     add-label-on-auto-merge:
 8 |         runs-on: ubuntu-latest
 9 |         steps:
10 |             -   name: Add label
11 |                 uses: actions/github-script@v7
12 |                 with:
13 |                     script: |
14 |                         github.rest.issues.addLabels({
15 |                             owner: context.repo.owner,
16 |                             repo: context.repo.repo,
17 |                             issue_number: context.issue.number,
18 |                             labels: ['ready']
19 |                         })
20 |                 env:
21 |                     GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
22 | 


--------------------------------------------------------------------------------
/.github/workflows/clang-format.yml:
--------------------------------------------------------------------------------
 1 | name: clang-format
 2 | 
 3 | on:
 4 |   # Trigger the workflow on push or pull request,
 5 |   # but only for the main branch
 6 |   push:
 7 |     branches:
 8 |       - main
 9 |   pull_request:
10 |     branches:
11 |       - main
12 | 
13 | jobs:
14 |   clang-format:
15 |     runs-on: ubuntu-latest
16 |     strategy:
17 |       matrix:
18 |         python-version: ["3.11"]
19 |     steps:
20 |     - uses: actions/checkout@v4
21 |     - name: Set up Python ${{ matrix.python-version }}
22 |       uses: actions/setup-python@v5
23 |       with:
24 |         python-version: ${{ matrix.python-version }}
25 |     - name: Install dependencies
26 |       run: |
27 |         python -m pip install --upgrade pip
28 |         pip install clang-format==18.1.5
29 |     - name: Running clang-format
30 |       run: |
31 |         EXCLUDES=(
32 |             'csrc/moe/topk_softmax_kernels.cu'
33 |             'csrc/quantization/gguf/ggml-common.h'
34 |             'csrc/quantization/gguf/dequantize.cuh'
35 |             'csrc/quantization/gguf/vecdotq.cuh'
36 |             'csrc/quantization/gguf/mmq.cuh'
37 |             'csrc/quantization/gguf/mmvq.cuh'
38 |         )
39 |         find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
40 |             | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
41 |             | xargs clang-format --dry-run --Werror


--------------------------------------------------------------------------------
/.github/workflows/matchers/actionlint.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "problemMatcher": [
 3 |     {
 4 |       "owner": "actionlint",
 5 |       "pattern": [
 6 |         {
 7 |           "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$",
 8 |           "file": 1,
 9 |           "line": 2,
10 |           "column": 3,
11 |           "message": 4,
12 |           "code": 5
13 |         }
14 |       ]
15 |     }
16 |   ]
17 | }
18 | 


--------------------------------------------------------------------------------
/.github/workflows/mypy.yaml:
--------------------------------------------------------------------------------
 1 | name: mypy
 2 | 
 3 | on:
 4 |   # Trigger the workflow on push or pull request,
 5 |   # but only for the main branch
 6 |   push:
 7 |     branches:
 8 |       - main
 9 |   pull_request:
10 |     branches:
11 |       - main
12 | 
13 | jobs:
14 |   mypy:
15 |     runs-on: ubuntu-latest
16 |     strategy:
17 |       matrix:
18 |         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
19 |     steps:
20 |     - uses: actions/checkout@v4
21 |     - name: Set up Python ${{ matrix.python-version }}
22 |       uses: actions/setup-python@v5
23 |       with:
24 |         python-version: ${{ matrix.python-version }}
25 |     - name: Install dependencies
26 |       run: |
27 |         python -m pip install --upgrade pip
28 |         pip install mypy==1.11.1
29 |         pip install types-setuptools
30 |         pip install types-PyYAML
31 |         pip install types-requests
32 |         pip install types-setuptools
33 |     - name: Mypy
34 |       run: |
35 |         tools/mypy.sh
36 | 


--------------------------------------------------------------------------------
/.github/workflows/reminder_comment.yml:
--------------------------------------------------------------------------------
 1 | name: PR Reminder Comment Bot
 2 | on:
 3 |   pull_request_target:
 4 |     types: [opened]
 5 | 
 6 | jobs:
 7 |   pr_reminder:
 8 |     runs-on: ubuntu-latest
 9 |     steps:
10 |       - name: Remind to run full CI on PR
11 |         uses: actions/github-script@v7
12 |         with:
13 |           script: |
14 |             github.rest.issues.createComment({
15 |               owner: context.repo.owner,
16 |               repo: context.repo.repo,
17 |               issue_number: context.issue.number,
18 |               body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. \n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
19 |             })
20 |         env:
21 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
22 | 


--------------------------------------------------------------------------------
/.github/workflows/ruff.yml:
--------------------------------------------------------------------------------
 1 | name: ruff
 2 | 
 3 | on:
 4 |   # Trigger the workflow on push or pull request,
 5 |   # but only for the main branch
 6 |   push:
 7 |     branches:
 8 |       - main
 9 |   pull_request:
10 |     branches:
11 |       - main
12 | 
13 | jobs:
14 |   ruff:
15 |     runs-on: ubuntu-latest
16 |     strategy:
17 |       matrix:
18 |         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
19 |     steps:
20 |     - uses: actions/checkout@v4
21 |     - name: Set up Python ${{ matrix.python-version }}
22 |       uses: actions/setup-python@v5
23 |       with:
24 |         python-version: ${{ matrix.python-version }}
25 |     - name: Install dependencies
26 |       run: |
27 |         python -m pip install --upgrade pip
28 |         pip install -r requirements-lint.txt
29 |     - name: Analysing the code with ruff
30 |       run: |
31 |         ruff check .
32 |     - name: Spelling check with codespell
33 |       run: |
34 |         codespell --toml pyproject.toml
35 |     - name: Run isort
36 |       run: |
37 |         isort . --check-only
38 | 


--------------------------------------------------------------------------------
/.github/workflows/scripts/build.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | python_executable=python$1
 4 | cuda_home=/usr/local/cuda-$2
 5 | 
 6 | # Update paths
 7 | PATH=${cuda_home}/bin:$PATH
 8 | LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
 9 | 
10 | # Install requirements
11 | $python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt
12 | 
13 | # Limit the number of parallel jobs to avoid OOM
14 | export MAX_JOBS=1
15 | # Make sure release wheels are built for the following architectures
16 | export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
17 | export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
18 | # Build
19 | $python_executable setup.py bdist_wheel --dist-dir=dist
20 | 


--------------------------------------------------------------------------------
/.github/workflows/scripts/create_release.js:
--------------------------------------------------------------------------------
 1 | // Uses Github's API to create the release and wait for result.
 2 | // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately.
 3 | 
 4 | module.exports = async (github, context, core) => {
 5 | 	try {
 6 | 		const response = await github.rest.repos.createRelease({
 7 | 			draft: false,
 8 | 			generate_release_notes: true,
 9 | 			name: process.env.RELEASE_TAG,
10 | 			owner: context.repo.owner,
11 | 			prerelease: true,
12 | 			repo: context.repo.repo,
13 | 			tag_name: process.env.RELEASE_TAG,
14 | 		});
15 | 
16 | 		core.setOutput('upload_url', response.data.upload_url);
17 | 	} catch (error) {
18 | 		core.setFailed(error.message);
19 | 	}
20 | }


--------------------------------------------------------------------------------
/.github/workflows/scripts/cuda-install.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Replace '.' with '-' ex: 11.8 -> 11-8
 4 | cuda_version=$(echo $1 | tr "." "-")
 5 | # Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004
 6 | OS=$(echo $2 | tr -d ".\-")
 7 | 
 8 | # Installs CUDA
 9 | wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb
10 | sudo dpkg -i cuda-keyring_1.1-1_all.deb
11 | rm cuda-keyring_1.1-1_all.deb
12 | sudo apt -qq update
13 | sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version}
14 | sudo apt clean
15 | 
16 | # Test nvcc
17 | PATH=/usr/local/cuda-$1/bin:${PATH}
18 | nvcc --version
19 | 
20 | # Log gcc, g++, c++ versions
21 | gcc --version
22 | g++ --version
23 | c++ --version
24 | 


--------------------------------------------------------------------------------
/.github/workflows/scripts/env.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # This file installs common linux environment tools
 4 | 
 5 | export LANG C.UTF-8
 6 | 
 7 | # python_version=$1
 8 | 
 9 | sudo    apt-get update && \
10 | sudo    apt-get install -y --no-install-recommends \
11 |         software-properties-common \
12 | 
13 | sudo    apt-get install -y --no-install-recommends \
14 |         build-essential \
15 |         apt-utils \
16 |         ca-certificates \
17 |         wget \
18 |         git \
19 |         vim \
20 |         libssl-dev \
21 |         curl \
22 |         unzip \
23 |         unrar \
24 |         cmake \
25 |         net-tools \
26 |         sudo \
27 |         autotools-dev \
28 |         rsync \
29 |         jq \
30 |         openssh-server \
31 |         tmux \
32 |         screen \
33 |         htop \
34 |         pdsh \
35 |         openssh-client \
36 |         lshw \
37 |         dmidecode \
38 |         util-linux \
39 |         automake \
40 |         autoconf \
41 |         libtool \
42 |         net-tools \
43 |         pciutils \
44 |         libpci-dev \
45 |         libaio-dev \
46 |         libcap2 \
47 |         libtinfo5 \
48 |         fakeroot \
49 |         devscripts \
50 |         debhelper \
51 |         nfs-common
52 | 
53 | # Remove github bloat files to free up disk space
54 | sudo rm -rf "/usr/local/share/boost"
55 | sudo rm -rf "$AGENT_TOOLSDIRECTORY"
56 | sudo rm -rf "/usr/share/dotnet"
57 | 


--------------------------------------------------------------------------------
/.github/workflows/scripts/pytorch-install.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | python_executable=python$1
 4 | pytorch_version=$2
 5 | cuda_version=$3
 6 | 
 7 | # Install torch
 8 | $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya
 9 | $python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./}
10 | 
11 | # Print version information
12 | $python_executable --version
13 | $python_executable -c "import torch; print('PyTorch:', torch.__version__)"
14 | $python_executable -c "import torch; print('CUDA:', torch.version.cuda)"
15 | $python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)"
16 | 


--------------------------------------------------------------------------------
/.github/workflows/yapf.yml:
--------------------------------------------------------------------------------
 1 | name: yapf
 2 | 
 3 | on:
 4 |   # Trigger the workflow on push or pull request,
 5 |   # but only for the main branch
 6 |   push:
 7 |     branches:
 8 |       - main
 9 |   pull_request:
10 |     branches:
11 |       - main
12 | jobs:
13 |   yapf:
14 |     runs-on: ubuntu-latest
15 |     strategy:
16 |       matrix:
17 |         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
18 |     steps:
19 |     - uses: actions/checkout@v4
20 |     - name: Set up Python ${{ matrix.python-version }}
21 |       uses: actions/setup-python@v5
22 |       with:
23 |         python-version: ${{ matrix.python-version }}
24 |     - name: Install dependencies
25 |       run: |
26 |         python -m pip install --upgrade pip
27 |         pip install yapf==0.32.0
28 |         pip install toml==0.10.2
29 |     - name: Running yapf
30 |       run: |
31 |         yapf --diff --recursive .
32 | 


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | # Read the Docs configuration file
 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 3 | 
 4 | version: 2
 5 | 
 6 | build:
 7 |   os: ubuntu-22.04
 8 |   tools:
 9 |     python: "3.8"
10 | 
11 | sphinx:
12 |    configuration: docs/source/conf.py
13 |    fail_on_warning: true
14 | 
15 | # If using Sphinx, optionally build your docs in additional formats such as PDF
16 | formats: []
17 | 
18 | # Optionally declare the Python requirements required to build your docs
19 | python:
20 |    install:
21 |    - requirements: docs/requirements-docs.txt
22 | 
23 | 


--------------------------------------------------------------------------------
/.yapfignore:
--------------------------------------------------------------------------------
1 | collect_env.py
2 | 


--------------------------------------------------------------------------------
/Dockerfile.openvino:
--------------------------------------------------------------------------------
 1 | # The vLLM Dockerfile is used to construct vLLM image that can be directly used
 2 | # to run the OpenAI compatible server.
 3 | 
 4 | FROM ubuntu:22.04 AS dev
 5 | 
 6 | RUN apt-get update -y && \
 7 |     apt-get install -y \
 8 |         git python3-pip \
 9 |         ffmpeg libsm6 libxext6 libgl1
10 | WORKDIR /workspace
11 | 
12 | COPY . .
13 | 
14 | # install build requirements
15 | RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
16 | # build vLLM with OpenVINO backend
17 | RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
18 | 
19 | COPY examples/ /workspace/vllm/examples
20 | COPY benchmarks/ /workspace/vllm/benchmarks
21 | 
22 | CMD ["/bin/bash"]
23 | 


--------------------------------------------------------------------------------
/Dockerfile.tpu:
--------------------------------------------------------------------------------
 1 | ARG NIGHTLY_DATE="20240828"
 2 | ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
 3 | 
 4 | FROM $BASE_IMAGE
 5 | WORKDIR /workspace
 6 | 
 7 | # Install some basic utilities
 8 | RUN apt-get update && apt-get install -y \
 9 |     git \
10 |     ffmpeg libsm6 libxext6 libgl1
11 | 
12 | # Install the TPU and Pallas dependencies.
13 | RUN --mount=type=cache,target=/root/.cache/pip \
14 |     python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
15 | RUN --mount=type=cache,target=/root/.cache/pip \
16 |     python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
17 | 
18 | # Build vLLM.
19 | COPY . /workspace/vllm
20 | ENV VLLM_TARGET_DEVICE="tpu"
21 | RUN --mount=type=cache,target=/root/.cache/pip \
22 |     --mount=type=bind,source=.git,target=.git \
23 |      cd /workspace/vllm && \
24 |     python3 -m pip install \
25 |         cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
26 |         -r requirements-tpu.txt
27 | RUN cd /workspace/vllm && python3 setup.py develop
28 | 
29 | CMD ["/bin/bash"]
30 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | include LICENSE
 2 | include requirements-common.txt
 3 | include requirements-cuda.txt
 4 | include requirements-rocm.txt
 5 | include requirements-neuron.txt
 6 | include requirements-cpu.txt
 7 | include CMakeLists.txt
 8 | 
 9 | recursive-include cmake *
10 | recursive-include csrc *
11 | 


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | # Security Policy
 2 | 
 3 | ## Reporting a Vulnerability
 4 | 
 5 | If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.
 6 | 
 7 | Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).
 8 | 
 9 | ---
10 | 
11 | Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
12 | 


--------------------------------------------------------------------------------
/benchmarks/README.md:
--------------------------------------------------------------------------------
1 | # Benchmarking vLLM
2 | 
3 | ## Downloading the ShareGPT dataset
4 | 
5 | You can download the dataset by running:
6 | ```bash
7 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
8 | ```
9 | 


--------------------------------------------------------------------------------
/benchmarks/cutlass_benchmarks/weight_shapes.py:
--------------------------------------------------------------------------------
 1 | # Weight Shapes are in the format
 2 | # ([K, N], TP_SPLIT_DIM)
 3 | # Example:
 4 | #  A shape of ([14336, 4096], 0) indicates the following GEMM shape,
 5 | #   - TP1 : K = 14336, N = 4096
 6 | #   - TP2 : K = 7168, N = 4096
 7 | #  A shape of ([4096, 6144], 1) indicates the following GEMM shape,
 8 | #   - TP1 : K = 4096, N = 6144
 9 | #   - TP4 : K = 4096, N = 1536
10 | 
11 | # TP1 shapes
12 | WEIGHT_SHAPES = {
13 |     "mistralai/Mistral-7B-v0.1": [
14 |         ([4096, 6144], 1),
15 |         ([4096, 4096], 0),
16 |         ([4096, 28672], 1),
17 |         ([14336, 4096], 0),
18 |     ],
19 |     "meta-llama/Llama-2-7b-hf": [
20 |         ([4096, 12288], 1),
21 |         ([4096, 4096], 0),
22 |         ([4096, 22016], 1),
23 |         ([11008, 4096], 0),
24 |     ],
25 |     "meta-llama/Llama-3-8b": [
26 |         ([4096, 6144], 1),
27 |         ([4096, 4096], 0),
28 |         ([4096, 28672], 1),
29 |         ([14336, 4096], 0),
30 |     ],
31 |     "meta-llama/Llama-2-13b-hf": [
32 |         ([5120, 15360], 1),
33 |         ([5120, 5120], 0),
34 |         ([5120, 27648], 1),
35 |         ([13824, 5120], 0),
36 |     ],
37 |     "meta-llama/Llama-2-70b-hf": [
38 |         ([8192, 10240], 1),
39 |         ([8192, 8192], 0),
40 |         ([8192, 57344], 1),
41 |         ([28672, 8192], 0),
42 |     ],
43 | }
44 | 


--------------------------------------------------------------------------------
/benchmarks/kernels/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas


--------------------------------------------------------------------------------
/benchmarks/kernels/weight_shapes.py:
--------------------------------------------------------------------------------
 1 | # Weight Shapes are in the format
 2 | # ([K, N], TP_SPLIT_DIM)
 3 | # Example:
 4 | #  A shape of ([14336, 4096], 0) indicates the following GEMM shape,
 5 | #   - TP1 : K = 14336, N = 4096
 6 | #   - TP2 : K = 7168, N = 4096
 7 | #  A shape of ([4096, 6144], 1) indicates the following GEMM shape,
 8 | #   - TP1 : K = 4096, N = 6144
 9 | #   - TP4 : K = 4096, N = 1536
10 | 
11 | # TP1 shapes
12 | WEIGHT_SHAPES = {
13 |     "mistralai/Mistral-7B-v0.1": [
14 |         ([4096, 6144], 1),
15 |         ([4096, 4096], 0),
16 |         ([4096, 28672], 1),
17 |         ([14336, 4096], 0),
18 |     ],
19 |     "meta-llama/Llama-2-7b-hf": [
20 |         ([4096, 12288], 1),
21 |         ([4096, 4096], 0),
22 |         ([4096, 22016], 1),
23 |         ([11008, 4096], 0),
24 |     ],
25 |     "meta-llama/Llama-3-8b": [
26 |         ([4096, 6144], 1),
27 |         ([4096, 4096], 0),
28 |         ([4096, 28672], 1),
29 |         ([14336, 4096], 0),
30 |     ],
31 |     "meta-llama/Llama-2-13b-hf": [
32 |         ([5120, 15360], 1),
33 |         ([5120, 5120], 0),
34 |         ([5120, 27648], 1),
35 |         ([13824, 5120], 0),
36 |     ],
37 |     "meta-llama/Llama-2-70b-hf": [
38 |         ([8192, 10240], 1),
39 |         ([8192, 8192], 0),
40 |         ([8192, 57344], 1),
41 |         ([28672, 8192], 0),
42 |     ],
43 | }
44 | 


--------------------------------------------------------------------------------
/benchmarks/launch_tgi_server.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | PORT=8000
 4 | MODEL=$1
 5 | TOKENS=$2
 6 | 
 7 | docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \
 8 |            -v $PWD/data:/data \
 9 |            ghcr.io/huggingface/text-generation-inference:2.2.0 \
10 |            --model-id $MODEL \
11 |            --sharded false  \
12 |            --max-input-length 1024 \
13 |            --max-total-tokens 2048 \
14 |            --max-best-of 5 \
15 |            --max-concurrent-requests 5000 \
16 |            --max-batch-total-tokens $TOKENS
17 | 


--------------------------------------------------------------------------------
/csrc/attention/attention_dtypes.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "attention_generic.cuh"
4 | #include "dtype_float16.cuh"
5 | #include "dtype_float32.cuh"
6 | #include "dtype_bfloat16.cuh"
7 | #include "dtype_fp8.cuh"
8 | 


--------------------------------------------------------------------------------
/csrc/attention/dtype_fp8.cuh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "attention_generic.cuh"
 4 | 
 5 | #include <stdint.h>
 6 | #ifdef ENABLE_FP8
 7 |   #ifndef USE_ROCM
 8 |     #include <cuda_fp8.h>
 9 |   #endif  // USE_ROCM
10 | #endif    // ENABLE_FP8
11 | 
12 | namespace vllm {
13 | 
14 | enum class Fp8KVCacheDataType {
15 |   kAuto = 0,
16 |   kFp8E4M3 = 1,
17 |   kFp8E5M2 = 2,
18 | };
19 | 
20 | // fp8 vector types for quantization of kv cache
21 | template <>
22 | struct Vec<uint8_t, 1> {
23 |   using Type = uint8_t;
24 | };
25 | 
26 | template <>
27 | struct Vec<uint8_t, 2> {
28 |   using Type = uint16_t;
29 | };
30 | 
31 | template <>
32 | struct Vec<uint8_t, 4> {
33 |   using Type = uint32_t;
34 | };
35 | 
36 | template <>
37 | struct Vec<uint8_t, 8> {
38 |   using Type = uint2;
39 | };
40 | 
41 | }  // namespace vllm
42 | 


--------------------------------------------------------------------------------
/csrc/core/exception.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #define VLLM_IMPLIES(p, q) (!(p) || (q))
4 | 


--------------------------------------------------------------------------------
/csrc/core/registration.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <Python.h>
 4 | 
 5 | #define _CONCAT(A, B) A##B
 6 | #define CONCAT(A, B) _CONCAT(A, B)
 7 | 
 8 | #define _STRINGIFY(A) #A
 9 | #define STRINGIFY(A) _STRINGIFY(A)
10 | 
11 | // A version of the TORCH_LIBRARY macro that expands the NAME, i.e. so NAME
12 | // could be a macro instead of a literal token.
13 | #define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE)
14 | 
15 | // A version of the TORCH_LIBRARY_IMPL macro that expands the NAME, i.e. so NAME
16 | // could be a macro instead of a literal token.
17 | #define TORCH_LIBRARY_IMPL_EXPAND(NAME, DEVICE, MODULE) \
18 |   TORCH_LIBRARY_IMPL(NAME, DEVICE, MODULE)
19 | 
20 | // REGISTER_EXTENSION allows the shared library to be loaded and initialized
21 | // via python's import statement.
22 | #define REGISTER_EXTENSION(NAME)                                               \
23 |   PyMODINIT_FUNC CONCAT(PyInit_, NAME)() {                                     \
24 |     static struct PyModuleDef module = {PyModuleDef_HEAD_INIT,                 \
25 |                                         STRINGIFY(NAME), nullptr, 0, nullptr}; \
26 |     return PyModule_Create(&module);                                           \
27 |   }
28 | 


--------------------------------------------------------------------------------
/csrc/core/torch_bindings.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/library.h>
 2 | 
 3 | #include "scalar_type.hpp"
 4 | #include "registration.h"
 5 | 
 6 | // Note the CORE exstension will be built for (almost) all hardware targets so
 7 | // new additions must account for this. (currently not built for TPU and Neuron)
 8 | 
 9 | TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, lib) {
10 |   // ScalarType, a custom class for representing data types that supports
11 |   // quantized types, declared here so it can be used when creating interfaces
12 |   // for custom ops.
13 |   vllm::ScalarTypeTorch::bind_class(lib);
14 | }
15 | 
16 | REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
17 | 


--------------------------------------------------------------------------------
/csrc/cpu/cpu_types.hpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef CPU_TYPES_HPP
 3 | #define CPU_TYPES_HPP
 4 | 
 5 | #if defined(__x86_64__)
 6 |   //x86 implementation
 7 |   #include "cpu_types_x86.hpp"
 8 | #elif defined(__POWER9_VECTOR__)
 9 |   //ppc implementation
10 |   #include "cpu_types_vsx.hpp"
11 | #else
12 |   #warning "unsupported vLLM cpu implementation"
13 | #endif
14 | 
15 | #endif
16 | 


--------------------------------------------------------------------------------
/csrc/cuda_utils.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #if defined(__CUDACC__) || defined(_NVHPC_CUDA)
 4 |   #define HOST_DEVICE_INLINE __forceinline__ __host__ __device__
 5 |   #define DEVICE_INLINE __forceinline__ __device__
 6 |   #define HOST_INLINE __forceinline__ __host__
 7 | #else
 8 |   #define HOST_DEVICE_INLINE inline
 9 |   #define DEVICE_INLINE inline
10 |   #define HOST_INLINE inline
11 | #endif
12 | 
13 | int64_t get_device_attribute(int64_t attribute, int64_t device_id);
14 | 
15 | int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id);
16 | 


--------------------------------------------------------------------------------
/csrc/cuda_utils_kernels.cu:
--------------------------------------------------------------------------------
 1 | #ifdef USE_ROCM
 2 |   #include <hip/hip_runtime.h>
 3 |   #include <hip/hip_runtime_api.h>
 4 | #endif
 5 | int64_t get_device_attribute(int64_t attribute, int64_t device_id) {
 6 |   int device, value;
 7 |   if (device_id < 0) {
 8 |     cudaGetDevice(&device);
 9 |   } else {
10 |     device = device_id;
11 |   }
12 |   cudaDeviceGetAttribute(&value, static_cast<cudaDeviceAttr>(attribute),
13 |                          device);
14 |   return value;
15 | }
16 | 
17 | int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id) {
18 |   int64_t attribute;
19 |   // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html
20 |   // cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74
21 | 
22 | #ifdef USE_ROCM
23 |   attribute = hipDeviceAttributeMaxSharedMemoryPerBlock;
24 | #else
25 |   attribute = cudaDevAttrMaxSharedMemoryPerBlockOptin;
26 | #endif
27 | 
28 |   return get_device_attribute(attribute, device_id);
29 | }
30 | 


--------------------------------------------------------------------------------
/csrc/mamba/mamba_ssm/static_switch.h:
--------------------------------------------------------------------------------
 1 | // Inspired by
 2 | // https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h
 3 | // and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h
 4 | 
 5 | // clang-format off
 6 | // adapted from https://github.com/state-spaces/mamba/blob/main/csrc/selective_scan/static_switch.h
 7 | #pragma once
 8 | 
 9 | /// @param COND       - a boolean expression to switch by
10 | /// @param CONST_NAME - a name given for the constexpr bool variable.
11 | /// @param ...       - code to execute for true and false
12 | ///
13 | /// Usage:
14 | /// ```
15 | /// BOOL_SWITCH(flag, BoolConst, [&] {
16 | ///     some_function<BoolConst>(...);
17 | /// });
18 | /// ```
19 | #define BOOL_SWITCH(COND, CONST_NAME, ...) \
20 |   [&] {                                    \
21 |     if (COND) {                            \
22 |       constexpr bool CONST_NAME = true;    \
23 |       return __VA_ARGS__();                \
24 |     } else {                               \
25 |       constexpr bool CONST_NAME = false;   \
26 |       return __VA_ARGS__();                \
27 |     }                                      \
28 |   }()
29 | 


--------------------------------------------------------------------------------
/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu:
--------------------------------------------------------------------------------
 1 | #include "marlin_moe_kernel_ku4.h"
 2 | 
 3 | namespace marlin_moe {
 4 | 
 5 | // We return bool so we can create these different kernel calls as a sequence
 6 | // of if-elseif's.
 7 | bool call_marlin_moe_kernel_ku4(
 8 |     vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
 9 |     bool has_act_order, int group_blocks, int num_threads, int blocks,
10 |     int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
11 |     const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
12 |     const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
13 |     const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
14 |     int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
15 |     int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
16 |     int m_block, int max_par, int cfg_max_m_blocks) {
17 |   bool has_zp = true;
18 | 
19 |   if (false) {
20 |   }
21 |   AWQ_CALL_IF_MOE(vllm::kU4, 16, 4, 256)
22 |   AWQ_CALL_IF_MOE(vllm::kU4, 8, 8, 256)
23 |   AWQ_CALL_IF_MOE(vllm::kU4, 8, 4, 128)
24 |   AWQ_CALL_IF_MOE(vllm::kU4, 4, 8, 128)
25 |   else {
26 |     return false;
27 |   }
28 |   return true;
29 | }
30 | 
31 | }  // namespace marlin_moe
32 | 


--------------------------------------------------------------------------------
/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "marlin_moe_kernel.h"
 4 | 
 5 | namespace marlin_moe {
 6 | 
 7 | // We return bool so we can create these different kernel calls as a sequence
 8 | // of if-elseif's.
 9 | bool call_marlin_moe_kernel_ku4(
10 |     vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
11 |     bool has_act_order, int group_blocks, int num_threads, int blocks,
12 |     int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
13 |     const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
14 |     const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
15 |     const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
16 |     int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
17 |     int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
18 |     int m_block, int max_par, int cfg_max_m_blocks);
19 | 
20 | }  // namespace marlin_moe
21 | 


--------------------------------------------------------------------------------
/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu:
--------------------------------------------------------------------------------
 1 | #include "marlin_moe_kernel_ku4b8.h"
 2 | 
 3 | namespace marlin_moe {
 4 | 
 5 | // We return bool so we can create these different kernel calls as a sequence
 6 | // of if-elseif's.
 7 | bool call_marlin_moe_kernel_ku4b8(
 8 |     vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
 9 |     bool has_act_order, int group_blocks, int num_threads, int blocks,
10 |     int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
11 |     const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
12 |     const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
13 |     const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
14 |     int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
15 |     int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
16 |     int m_block, int max_par, int cfg_max_m_blocks) {
17 |   bool has_zp = false;
18 | 
19 |   if (false) {
20 |   }
21 |   GPTQ_CALL_IF_MOE(vllm::kU4B8, 16, 4, 256)
22 |   GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 8, 256)
23 |   GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 4, 128)
24 |   GPTQ_CALL_IF_MOE(vllm::kU4B8, 4, 8, 128)
25 |   else {
26 |     return false;
27 |   }
28 |   return true;
29 | }
30 | 
31 | }  // namespace marlin_moe
32 | 


--------------------------------------------------------------------------------
/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "marlin_moe_kernel.h"
 4 | 
 5 | namespace marlin_moe {
 6 | 
 7 | // We return bool so we can create these different kernel calls as a sequence
 8 | // of if-elseif's.
 9 | bool call_marlin_moe_kernel_ku4b8(
10 |     vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
11 |     bool has_act_order, int group_blocks, int num_threads, int blocks,
12 |     int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
13 |     const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
14 |     const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
15 |     const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
16 |     int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
17 |     int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
18 |     int m_block, int max_par, int cfg_max_m_blocks);
19 | 
20 | }  // namespace marlin_moe
21 | 


--------------------------------------------------------------------------------
/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu:
--------------------------------------------------------------------------------
 1 | #include "marlin_moe_kernel_ku8b128.h"
 2 | 
 3 | namespace marlin_moe {
 4 | 
 5 | // We return bool so we can create these different kernel calls as a sequence
 6 | // of if-elseif's.
 7 | bool call_marlin_moe_kernel_ku8b128(
 8 |     vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
 9 |     bool has_act_order, int group_blocks, int num_threads, int blocks,
10 |     int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
11 |     const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
12 |     const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
13 |     const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
14 |     int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
15 |     int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
16 |     int m_block, int max_par, int cfg_max_m_blocks) {
17 |   bool has_zp = false;
18 | 
19 |   if (false) {
20 |   }
21 |   GPTQ_CALL_IF_MOE(vllm::kU8B128, 16, 4, 256)
22 |   GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 8, 256)
23 |   GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 4, 128)
24 |   GPTQ_CALL_IF_MOE(vllm::kU8B128, 4, 8, 128)
25 |   else {
26 |     return false;
27 |   }
28 |   return true;
29 | }
30 | 
31 | }  // namespace marlin_moe
32 | 


--------------------------------------------------------------------------------
/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "marlin_moe_kernel.h"
 4 | 
 5 | namespace marlin_moe {
 6 | 
 7 | bool call_marlin_moe_kernel_ku8b128(
 8 |     vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
 9 |     bool has_act_order, int group_blocks, int num_threads, int blocks,
10 |     int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
11 |     const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
12 |     const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
13 |     const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
14 |     int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
15 |     int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
16 |     int m_block, int max_par, int cfg_max_m_blocks);
17 | 
18 | }
19 | 


--------------------------------------------------------------------------------
/csrc/moe/moe_ops.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <torch/all.h>
4 | 
5 | void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
6 |                   torch::Tensor& token_expert_indices,
7 |                   torch::Tensor& gating_output);
8 | 


--------------------------------------------------------------------------------
/csrc/moe/torch_bindings.cpp:
--------------------------------------------------------------------------------
 1 | #include "core/registration.h"
 2 | #include "moe_ops.h"
 3 | 
 4 | TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
 5 |   // Apply topk softmax to the gating outputs.
 6 |   m.def(
 7 |       "topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
 8 |       "token_expert_indices, Tensor gating_output) -> ()");
 9 |   m.impl("topk_softmax", torch::kCUDA, &topk_softmax);
10 | 
11 | #ifndef USE_ROCM
12 |   m.def(
13 |       "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
14 |       "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! "
15 |       "b_zeros, Tensor! g_idx, Tensor! perm, Tensor! workspace, "
16 |       "__torch__.torch.classes._core_C.ScalarType b_q_type, int size_m, "
17 |       "int size_n, int size_k, bool is_k_full, int num_experts, int topk, "
18 |       "int moe_block_size, bool replicate_input, bool apply_weights)"
19 |       " -> Tensor");
20 |   // conditionally compiled so impl registration is in source file
21 | #endif
22 | }
23 | 
24 | REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
25 | 


--------------------------------------------------------------------------------
/csrc/prepare_inputs/advance_step.cuh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <torch/all.h>
 4 | 
 5 | #include <ATen/cuda/CUDAContext.h>
 6 | #include <c10/cuda/CUDAGuard.h>
 7 | #include <cuda.h>
 8 | #include <cuda_fp16.h>
 9 | #include <cuda_runtime.h>
10 | #include <iostream>
11 | 
12 | namespace prepare_inputs {
13 | 
14 | static constexpr int max_threads = 256;
15 | static constexpr bool logging = false;
16 | 
17 | constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }
18 | 
19 | }  // namespace prepare_inputs
20 | 


--------------------------------------------------------------------------------
/csrc/quantization/cutlass_w8a8/common.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "cutlass/cutlass.h"
 4 | #include <climits>
 5 | 
 6 | /**
 7 |  * Helper function for checking CUTLASS errors
 8 |  */
 9 | #define CUTLASS_CHECK(status)                        \
10 |   {                                                  \
11 |     TORCH_CHECK(status == cutlass::Status::kSuccess, \
12 |                 cutlassGetStatusString(status))      \
13 |   }
14 | 
15 | inline uint32_t next_pow_2(uint32_t const num) {
16 |   if (num <= 1) return num;
17 |   return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
18 | }
19 | 
20 | inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
21 |   int max_shared_mem_per_block_opt_in = 0;
22 |   cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,
23 |                         cudaDevAttrMaxSharedMemoryPerBlockOptin,
24 |                         device);
25 |   return max_shared_mem_per_block_opt_in;
26 | }
27 | 
28 | 


--------------------------------------------------------------------------------
/csrc/quantization/gptq/qdq_8.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copied from https://github.com/turboderp/exllamav2
 3 | */
 4 | 
 5 | #ifndef _qdq_8_cuh
 6 | #define _qdq_8_cuh
 7 | 
 8 | #include "qdq_util.cuh"
 9 | 
10 | namespace vllm {
11 | namespace gptq {
12 | 
13 | __forceinline__ __device__ void shuffle_8bit_4(uint32_t* q, int stride) {}
14 | 
15 | __forceinline__ __device__ void dequant_8bit_8(const uint32_t q_0,
16 |                                                const uint32_t q_1,
17 |                                                half2 (&dq)[4], int stride,
18 |                                                const uint32_t zero) {
19 |   half dqh[8];
20 |   for (int i = 0; i < 4; i++) dqh[i] = dq_ns(exb(q_0, i * 8, 0xff), zero);
21 |   for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), zero);
22 | 
23 |   for (int i = 0; i < 4; i++)
24 |     dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]);
25 | }
26 | 
27 | }  // namespace gptq
28 | }  // namespace vllm
29 | 
30 | #endif
31 | 


--------------------------------------------------------------------------------
/csrc/quantization/marlin/dense/common/base.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Modified by HandH1998
 3 |  * Modified by Neural Magic
 4 |  * Copyright (C) Marlin.2024 Elias Frantar
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License");
 7 |  * you may not use this file except in compliance with the License.
 8 |  * You may obtain a copy of the License at
 9 |  *
10 |  *         http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | #pragma once
20 | 
21 | constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; }
22 | 
23 | // Instances of `Vec` are used to organize groups of >>registers<<, as needed
24 | // for instance as inputs to tensor core operations. Consequently, all
25 | // corresponding index accesses must be compile-time constants, which is why we
26 | // extensively use `#pragma unroll` throughout the kernel code to guarantee
27 | // this.
28 | template <typename T, int n>
29 | struct Vec {
30 |   T elems[n];
31 |   __device__ T& operator[](int i) { return elems[i]; }
32 | };
33 | 


--------------------------------------------------------------------------------
/csrc/rocm/ops.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <torch/all.h>
 4 | 
 5 | void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums,
 6 |                      torch::Tensor& max_logits, torch::Tensor& tmp_out,
 7 |                      torch::Tensor& query, torch::Tensor& key_cache,
 8 |                      torch::Tensor& value_cache, int64_t num_kv_heads,
 9 |                      double scale, torch::Tensor& block_tables,
10 |                      torch::Tensor& context_lens, int64_t block_size,
11 |                      int64_t max_context_len,
12 |                      const c10::optional<torch::Tensor>& alibi_slopes,
13 |                      const std::string& kv_cache_dtype, double k_scale,
14 |                      double v_scale);
15 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
 1 | # vLLM documents
 2 | 
 3 | ## Build the docs
 4 | 
 5 | ```bash
 6 | # Install dependencies.
 7 | pip install -r requirements-docs.txt
 8 | 
 9 | # Build the docs.
10 | make clean
11 | make html
12 | ```
13 | 
14 | ## Open the docs with your browser
15 | 
16 | ```bash
17 | python -m http.server -d build/html/
18 | ```
19 | Launch your browser and open localhost:8000.
20 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | 
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | 	echo.
16 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | 	echo.installed, then set the SPHINXBUILD environment variable to point
18 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | 	echo.may add the Sphinx directory to PATH.
20 | 	echo.
21 | 	echo.If you don't have Sphinx installed, grab it from
22 | 	echo.https://www.sphinx-doc.org/
23 | 	exit /b 1
24 | )
25 | 
26 | if "%1" == "" goto help
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/docs/requirements-docs.txt:
--------------------------------------------------------------------------------
 1 | sphinx==6.2.1
 2 | sphinx-book-theme==1.0.1
 3 | sphinx-copybutton==0.5.2
 4 | myst-parser==2.0.0
 5 | sphinx-argparse==0.4.0
 6 | msgspec
 7 | cloudpickle
 8 | 
 9 | # packages to install to build the documentation
10 | pydantic >= 2.8
11 | -f https://download.pytorch.org/whl/cpu
12 | torch
13 | py-cpuinfo
14 | transformers
15 | mistral_common >= 1.3.4
16 | openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
17 | partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args


--------------------------------------------------------------------------------
/docs/source/_static/custom.js:
--------------------------------------------------------------------------------
 1 | document.addEventListener("DOMContentLoaded", function () {
 2 |     var script = document.createElement("script");
 3 |     script.type = "module";
 4 |     script.id = "runllm-widget-script"
 5 |   
 6 |     script.src = "https://widget.runllm.com";
 7 |   
 8 |     script.setAttribute("version", "stable");
 9 |     script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget.
10 |     script.setAttribute("runllm-name", "vLLM");
11 |     script.setAttribute("runllm-position", "BOTTOM_RIGHT");
12 |     script.setAttribute("runllm-assistant-id", "207");
13 |   
14 |     script.async = true;
15 |     document.head.appendChild(script);
16 |   });


--------------------------------------------------------------------------------
/docs/source/_templates/sections/header.html:
--------------------------------------------------------------------------------
 1 | <style>
 2 |   .notification-bar {
 3 |     width: 100vw;
 4 |     display: flex;
 5 |     justify-content: center;
 6 |     align-items: center;
 7 |     font-size: 16px;
 8 |     padding: 0 6px 0 6px;
 9 |   }
10 |   .notification-bar p {
11 |     margin: 0;
12 |   }
13 |   .notification-bar a {
14 |     font-weight: bold;
15 |     text-decoration: none;
16 |   }
17 | 
18 |   /* Light mode styles (default) */
19 |   .notification-bar {
20 |     background-color: #fff3cd;
21 |     color: #856404;
22 |   }
23 |   .notification-bar a {
24 |     color: #d97706;
25 |   }
26 | 
27 |   /* Dark mode styles */
28 |   html[data-theme=dark] .notification-bar {
29 |     background-color: #333;
30 |     color: #ddd;
31 |   }
32 |   html[data-theme=dark] .notification-bar a {
33 |     color: #ffa500; /* Brighter color for visibility */
34 |   }
35 | </style>
36 | 
37 | <div class="notification-bar">
38 |   <p>You are viewing the latest developer preview docs. <a href="https://docs.vllm.ai/en/stable/">Click here</a> to view docs for the latest stable release.</p>
39 | </div>
40 | 


--------------------------------------------------------------------------------
/docs/source/assets/dev/dockerfile-stages-dependency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/docs/source/assets/dev/dockerfile-stages-dependency.png


--------------------------------------------------------------------------------
/docs/source/assets/kernel/k_vecs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/docs/source/assets/kernel/k_vecs.png


--------------------------------------------------------------------------------
/docs/source/assets/kernel/key.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/docs/source/assets/kernel/key.png


--------------------------------------------------------------------------------
/docs/source/assets/kernel/logits_vec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/docs/source/assets/kernel/logits_vec.png


--------------------------------------------------------------------------------
/docs/source/assets/kernel/q_vecs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/docs/source/assets/kernel/q_vecs.png


--------------------------------------------------------------------------------
/docs/source/assets/kernel/query.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/docs/source/assets/kernel/query.png


--------------------------------------------------------------------------------
/docs/source/assets/kernel/v_vec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/docs/source/assets/kernel/v_vec.png


--------------------------------------------------------------------------------
/docs/source/assets/kernel/value.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/docs/source/assets/kernel/value.png


--------------------------------------------------------------------------------
/docs/source/assets/logos/vllm-logo-only-light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/docs/source/assets/logos/vllm-logo-only-light.png


--------------------------------------------------------------------------------
/docs/source/assets/logos/vllm-logo-text-dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/docs/source/assets/logos/vllm-logo-text-dark.png


--------------------------------------------------------------------------------
/docs/source/assets/logos/vllm-logo-text-light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/docs/source/assets/logos/vllm-logo-text-light.png


--------------------------------------------------------------------------------
/docs/source/community/sponsors.md:
--------------------------------------------------------------------------------
 1 | # Sponsors
 2 | 
 3 | vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
 4 | 
 5 | <!-- Note: Please sort them in alphabetical order. -->
 6 | <!-- Note: Please keep these consistent with README.md. -->
 7 | 
 8 | - a16z
 9 | - AMD
10 | - Anyscale
11 | - AWS
12 | - Crusoe Cloud
13 | - Databricks
14 | - DeepInfra
15 | - Dropbox
16 | - Google Cloud
17 | - Lambda Lab
18 | - NVIDIA
19 | - Replicate
20 | - Roblox
21 | - RunPod
22 | - Sequoia Capital
23 | - Skywork AI
24 | - Trainy
25 | - UC Berkeley
26 | - UC San Diego
27 | - ZhenFund
28 | 
29 | We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
30 | 


--------------------------------------------------------------------------------
/docs/source/dev/engine/async_llm_engine.rst:
--------------------------------------------------------------------------------
1 | AsyncLLMEngine
2 | =================================
3 | 
4 | .. autoclass:: vllm.AsyncLLMEngine
5 |     :members:
6 |     :show-inheritance:
7 | 


--------------------------------------------------------------------------------
/docs/source/dev/engine/engine_index.rst:
--------------------------------------------------------------------------------
 1 | vLLM Engine
 2 | =================================
 3 | 
 4 | .. automodule:: vllm.engine
 5 | .. currentmodule:: vllm.engine
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 2
 9 |    :caption: Engines
10 | 
11 |    llm_engine
12 |    async_llm_engine
13 | 
14 | 


--------------------------------------------------------------------------------
/docs/source/dev/engine/llm_engine.rst:
--------------------------------------------------------------------------------
1 | LLMEngine
2 | =================================
3 | 
4 | .. autoclass:: vllm.LLMEngine
5 |     :members:
6 |     :show-inheritance:
7 | 


--------------------------------------------------------------------------------
/docs/source/dev/input_processing/input_processing_pipeline.rst:
--------------------------------------------------------------------------------
 1 | .. _input_processing_pipeline:
 2 | 
 3 | Input Processing Pipeline
 4 | =========================
 5 | 
 6 | 1. Input data is passed to :class:`~vllm.LLMEngine` (or :class:`~vllm.AsyncLLMEngine`).
 7 | 
 8 | 2. Tokenize the data if necessary.
 9 | 
10 | 3. Process the inputs using :meth:`INPUT_REGISTRY.process_input <vllm.inputs.registry.InputRegistry.process_input>`.
11 | 
12 |    - For example, add placeholder tokens to reserve KV cache for multi-modal embeddings.
13 | 
14 | 4. Send the processed inputs to :class:`~vllm.executor.executor_base.ExecutorBase`.
15 | 
16 | 5. Distribute the inputs via :class:`~vllm.worker.worker_base.WorkerBase` to :class:`~vllm.worker.model_runner_base.ModelRunnerBase`.
17 | 
18 | 6. If the data contains multi-modal data, convert it into keyword arguments using :meth:`MULTIMODAL_REGISTRY.map_input <vllm.multimodal.MultiModalRegistry.map_input>`.
19 | 
20 |    - For example, convert a :class:`PIL.Image.Image` input to its pixel values for a vision model.
21 | 


--------------------------------------------------------------------------------
/docs/source/dev/input_processing/model_inputs_index.rst:
--------------------------------------------------------------------------------
 1 | .. _input_processing:
 2 | 
 3 | Input Processing
 4 | ================
 5 | 
 6 | .. currentmodule:: vllm.inputs
 7 | 
 8 | Each model can override parts of vLLM's :ref:`input processing pipeline <input_processing_pipeline>` via
 9 | :data:`~vllm.inputs.INPUT_REGISTRY` and :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
10 | 
11 | Currently, this mechanism is only utilized in :ref:`multi-modal <multi_modality>` models for preprocessing multi-modal input 
12 | data in addition to input prompt, but it can be extended to text-only language models when needed.
13 | 
14 | Guides
15 | ++++++
16 | 
17 | .. toctree::
18 |    :maxdepth: 1
19 | 
20 |    input_processing_pipeline
21 | 
22 | Module Contents
23 | +++++++++++++++
24 | 
25 | LLM Engine Inputs
26 | -----------------
27 | 
28 | .. autoclass:: vllm.inputs.LLMInputs
29 |     :members:
30 |     :show-inheritance:
31 | 
32 | Registry
33 | --------
34 | 
35 | .. autodata:: vllm.inputs.INPUT_REGISTRY
36 | 
37 | .. automodule:: vllm.inputs.registry
38 |     :members:
39 |     :show-inheritance:
40 | 


--------------------------------------------------------------------------------
/docs/source/dev/multimodal/adding_multimodal_plugin.rst:
--------------------------------------------------------------------------------
 1 | .. _adding_multimodal_plugin:
 2 | 
 3 | Adding a Multimodal Plugin
 4 | ==========================
 5 | 
 6 | This document teaches you how to add a new modality to vLLM.
 7 | 
 8 | Each modality in vLLM is represented by a :class:`~vllm.multimodal.MultiModalPlugin` and registered to :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
 9 | For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to :meth:`~vllm.multimodal.MultiModalRegistry.register_plugin`.
10 | 
11 | The remainder of this document details how to define custom :class:`~vllm.multimodal.MultiModalPlugin` s.
12 | 
13 | .. note::
14 |   This article is a work in progress.
15 | 
16 | ..
17 |   TODO: Add more instructions on how to add new plugins once embeddings is in.
18 | 


--------------------------------------------------------------------------------
/docs/source/dev/offline_inference/llm.rst:
--------------------------------------------------------------------------------
1 | LLM Class
2 | =========
3 | 
4 | .. autoclass:: vllm.LLM
5 |     :members:
6 |     :show-inheritance:
7 | 


--------------------------------------------------------------------------------
/docs/source/dev/offline_inference/llm_inputs.rst:
--------------------------------------------------------------------------------
 1 | LLM Inputs
 2 | ==========
 3 | 
 4 | .. autodata:: vllm.inputs.PromptType
 5 | 
 6 | .. autoclass:: vllm.inputs.TextPrompt
 7 |     :show-inheritance:
 8 |     :members:
 9 |     :member-order: bysource
10 | 
11 | .. autoclass:: vllm.inputs.TokensPrompt
12 |     :show-inheritance:
13 |     :members:
14 |     :member-order: bysource
15 | 


--------------------------------------------------------------------------------
/docs/source/dev/offline_inference/offline_index.rst:
--------------------------------------------------------------------------------
1 | Offline Inference
2 | =================================
3 | 
4 | .. toctree::
5 |    :maxdepth: 1
6 | 
7 |    llm
8 |    llm_inputs
9 | 


--------------------------------------------------------------------------------
/docs/source/dev/sampling_params.rst:
--------------------------------------------------------------------------------
1 | Sampling Parameters
2 | ===================
3 | 
4 | .. autoclass:: vllm.SamplingParams
5 |     :members:
6 | 


--------------------------------------------------------------------------------
/docs/source/getting_started/examples/examples_index.template.rst:
--------------------------------------------------------------------------------
1 | Examples
2 | =================================
3 | 
4 | .. toctree::
5 |    :maxdepth: 1
6 |    :caption: Scripts
7 | 
8 |    %EXAMPLE_DOCS%
9 | 


--------------------------------------------------------------------------------
/docs/source/models/engine_args.rst:
--------------------------------------------------------------------------------
 1 | .. _engine_args:
 2 | 
 3 | Engine Arguments
 4 | ================
 5 | 
 6 | Below, you can find an explanation of every engine argument for vLLM:
 7 | 
 8 | .. argparse::
 9 |     :module: vllm.engine.arg_utils
10 |     :func: _engine_args_parser
11 |     :prog: vllm serve
12 |     :nodefaultconst:
13 | 
14 | Async Engine Arguments
15 | ----------------------
16 | 
17 | Below are the additional arguments related to the asynchronous engine:
18 | 
19 | .. argparse::
20 |     :module: vllm.engine.arg_utils
21 |     :func: _async_engine_args_parser
22 |     :prog: vllm serve
23 |     :nodefaultconst:


--------------------------------------------------------------------------------
/docs/source/performance_benchmark/benchmarks.rst:
--------------------------------------------------------------------------------
 1 | .. _benchmarks:
 2 | 
 3 | Benchmark suites of vLLM
 4 | ========================
 5 | 
 6 | 
 7 | 
 8 | vLLM contains two sets of benchmarks:
 9 | 
10 | + **Performance benchmarks**: benchmark vLLM's performance under various workloads at a high frequency (when a pull request (PR for short) of vLLM is being merged). See `vLLM performance dashboard <https://perf.vllm.ai>`_ for the latest performance results.
11 | 
12 | + **Nightly benchmarks**: compare vLLM's performance against alternatives (tgi, trt-llm, and lmdeploy) when there are major updates of vLLM (e.g., bumping up to a new version). The latest results are available in the `vLLM GitHub README <https://github.com/vllm-project/vllm/blob/main/README.md>`_.
13 | 
14 | 
15 | Trigger a benchmark
16 | -------------------
17 | 
18 | The performance benchmarks and nightly benchmarks can be triggered by submitting a PR to vLLM, and label the PR with `perf-benchmarks` and `nightly-benchmarks`.
19 | 
20 | 
21 | .. note::
22 | 
23 |    Please refer to `vLLM performance benchmark descriptions <https://github.com/vllm-project/vllm/blob/main/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md>`_ and `vLLM nightly benchmark descriptions <https://github.com/vllm-project/vllm/blob/main/.buildkite/nightly-benchmarks/nightly-descriptions.md>`_ for detailed descriptions on benchmark environment, workload and metrics.
24 | 


--------------------------------------------------------------------------------
/docs/source/quantization/fp8_e5m2_kvcache.rst:
--------------------------------------------------------------------------------
 1 | .. _fp8_kv_cache:
 2 | 
 3 | FP8 E5M2 KV Cache
 4 | ==================
 5 | 
 6 | The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits.
 7 | The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bflaot16 and fp8 to each other.
 8 | 
 9 | Here is an example of how to enable this feature:
10 | 
11 | .. code-block:: python
12 | 
13 |     from vllm import LLM, SamplingParams
14 |     # Sample prompts.
15 |     prompts = [
16 |         "Hello, my name is",
17 |         "The president of the United States is",
18 |         "The capital of France is",
19 |         "The future of AI is",
20 |     ]
21 |     # Create a sampling params object.
22 |     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
23 |     # Create an LLM.
24 |     llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8")
25 |     # Generate texts from the prompts. The output is a list of RequestOutput objects
26 |     # that contain the prompt, generated text, and other information.
27 |     outputs = llm.generate(prompts, sampling_params)
28 |     # Print the outputs.
29 |     for output in outputs:
30 |         prompt = output.prompt
31 |         generated_text = output.outputs[0].text
32 |         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
33 | 
34 | 
35 | 


--------------------------------------------------------------------------------
/docs/source/serving/deploying_with_bentoml.rst:
--------------------------------------------------------------------------------
1 | .. _deploying_with_bentoml:
2 | 
3 | Deploying with BentoML
4 | ======================
5 | 
6 | `BentoML <https://github.com/bentoml/BentoML>`_ allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-complicant image and deploy it on Kubernetes.
7 | 
8 | For details, see the tutorial `vLLM inference in the BentoML documentation <https://docs.bentoml.com/en/latest/use-cases/large-language-models/vllm.html>`_.


--------------------------------------------------------------------------------
/docs/source/serving/deploying_with_kserve.rst:
--------------------------------------------------------------------------------
1 | .. _deploying_with_kserve:
2 | 
3 | Deploying with KServe
4 | ============================
5 | 
6 | vLLM can be deployed with `KServe <https://github.com/kserve/kserve>`_ on Kubernetes for highly scalable distributed model serving.
7 | 
8 | Please see `this guide <https://kserve.github.io/website/latest/modelserving/v1beta1/llm/huggingface/>`_ for more details on using vLLM with KServe.
9 | 


--------------------------------------------------------------------------------
/docs/source/serving/deploying_with_lws.rst:
--------------------------------------------------------------------------------
 1 | .. _deploying_with_lws:
 2 | 
 3 | Deploying with LWS
 4 | ============================
 5 | 
 6 | LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads.
 7 | A major use case is for multi-host/multi-node distributed inference.
 8 | 
 9 | vLLM can be deployed with `LWS <https://github.com/kubernetes-sigs/lws>`_ on Kubernetes for distributed model serving.
10 | 
11 | Please see `this guide <https://github.com/kubernetes-sigs/lws/tree/main/docs/examples/vllm>`_ for more details on
12 | deploying vLLM on Kubernetes using LWS.
13 | 


--------------------------------------------------------------------------------
/docs/source/serving/deploying_with_triton.rst:
--------------------------------------------------------------------------------
1 | .. _deploying_with_triton:
2 | 
3 | Deploying with NVIDIA Triton
4 | ============================
5 | 
6 | The `Triton Inference Server <https://github.com/triton-inference-server>`_ hosts a tutorial demonstrating how to quickly deploy a simple `facebook/opt-125m <https://huggingface.co/facebook/opt-125m>`_ model using vLLM. Please see `Deploying a vLLM model in Triton <https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton>`_ for more details.
7 | 


--------------------------------------------------------------------------------
/docs/source/serving/env_vars.rst:
--------------------------------------------------------------------------------
 1 | Environment Variables
 2 | ========================
 3 | 
 4 | vLLM uses the following environment variables to configure the system:
 5 | 
 6 | .. warning::
 7 |     Please note that ``VLLM_PORT`` and ``VLLM_HOST_IP`` set the port and ip for vLLM's **internal usage**. It is not the port and ip for the API server. If you use ``--host $VLLM_HOST_IP`` and ``--port $VLLM_PORT`` to start the API server, it will not work.
 8 | 
 9 |     All environment variables used by vLLM are prefixed with ``VLLM_``. **Special care should be taken for Kubernetes users**: please do not name the service as ``vllm``, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because `Kubernetes sets environment variables for each service with the capitalized service name as the prefix <https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables>`_.
10 | 
11 | .. literalinclude:: ../../../vllm/envs.py
12 |     :language: python
13 |     :start-after: begin-env-vars-definition
14 |     :end-before: end-env-vars-definition
15 | 


--------------------------------------------------------------------------------
/docs/source/serving/integrations.rst:
--------------------------------------------------------------------------------
 1 | Integrations
 2 | ------------
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 1
 6 | 
 7 |    run_on_sky
 8 |    deploying_with_kserve
 9 |    deploying_with_triton
10 |    deploying_with_bentoml
11 |    deploying_with_cerebrium
12 |    deploying_with_lws
13 |    deploying_with_dstack
14 |    serving_with_langchain
15 |    serving_with_llamaindex
16 | 


--------------------------------------------------------------------------------
/docs/source/serving/metrics.rst:
--------------------------------------------------------------------------------
 1 | Production Metrics
 2 | ==================
 3 | 
 4 | vLLM exposes a number of metrics that can be used to monitor the health of the
 5 | system. These metrics are exposed via the `/metrics` endpoint on the vLLM
 6 | OpenAI compatible API server.
 7 | 
 8 | The following metrics are exposed:
 9 | 
10 | .. literalinclude:: ../../../vllm/engine/metrics.py
11 |     :language: python
12 |     :start-after: begin-metrics-definitions
13 |     :end-before: end-metrics-definitions
14 | 


--------------------------------------------------------------------------------
/docs/source/serving/serving_with_langchain.rst:
--------------------------------------------------------------------------------
 1 | .. _run_on_langchain:
 2 | 
 3 | Serving with Langchain
 4 | ============================
 5 | 
 6 | vLLM is also available via `Langchain <https://github.com/langchain-ai/langchain>`_ .
 7 | 
 8 | To install langchain, run
 9 | 
10 | .. code-block:: console
11 | 
12 |     $ pip install langchain langchain_community -q
13 | 
14 | To run inference on a single or multiple GPUs, use ``VLLM`` class from ``langchain``.
15 | 
16 | .. code-block:: python
17 | 
18 |     from langchain_community.llms import VLLM
19 | 
20 |     llm = VLLM(model="mosaicml/mpt-7b",
21 |                trust_remote_code=True,  # mandatory for hf models
22 |                max_new_tokens=128,
23 |                top_k=10,
24 |                top_p=0.95,
25 |                temperature=0.8,
26 |                # tensor_parallel_size=... # for distributed inference
27 |     )
28 | 
29 |     print(llm("What is the capital of France ?"))
30 | 
31 | Please refer to this `Tutorial <https://python.langchain.com/docs/integrations/llms/vllm>`_ for more details.
32 | 


--------------------------------------------------------------------------------
/docs/source/serving/serving_with_llamaindex.rst:
--------------------------------------------------------------------------------
 1 | .. _run_on_llamaindex:
 2 | 
 3 | Serving with llama_index
 4 | ============================
 5 | 
 6 | vLLM is also available via `llama_index <https://github.com/run-llama/llama_index>`_ .
 7 | 
 8 | To install llamaindex, run
 9 | 
10 | .. code-block:: console
11 | 
12 |     $ pip install llama-index-llms-vllm -q
13 | 
14 | To run inference on a single or multiple GPUs, use ``Vllm`` class from ``llamaindex``.
15 | 
16 | .. code-block:: python
17 | 
18 |     from llama_index.llms.vllm import Vllm
19 | 
20 |     llm = Vllm(
21 |         model="microsoft/Orca-2-7b",
22 |         tensor_parallel_size=4,
23 |         max_new_tokens=100,
24 |         vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5},
25 |     )
26 | 
27 | Please refer to this `Tutorial <https://docs.llamaindex.ai/en/latest/examples/llm/vllm/>`_ for more details.
28 | 


--------------------------------------------------------------------------------
/docs/source/serving/tensorizer.rst:
--------------------------------------------------------------------------------
 1 | .. _tensorizer:
 2 | 
 3 | Loading Models with CoreWeave's Tensorizer
 4 | ==========================================
 5 | vLLM supports loading models with `CoreWeave's Tensorizer <https://docs.coreweave.com/coreweave-machine-learning-and-ai/inference/tensorizer>`_.
 6 | vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or S3 endpoint can be deserialized
 7 | at runtime extremely quickly directly to the GPU, resulting in significantly
 8 | shorter Pod startup times and CPU memory usage. Tensor encryption is also supported.
 9 | 
10 | For more information on CoreWeave's Tensorizer, please refer to
11 | `CoreWeave's Tensorizer documentation <https://github.com/coreweave/tensorizer>`_. For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see
12 | the `vLLM example script <https://docs.vllm.ai/en/stable/getting_started/examples/tensorize_vllm_model.html>`_.


--------------------------------------------------------------------------------
/examples/cpu_offload.py:
--------------------------------------------------------------------------------
 1 | from vllm import LLM, SamplingParams
 2 | 
 3 | # Sample prompts.
 4 | prompts = [
 5 |     "Hello, my name is",
 6 |     "The president of the United States is",
 7 |     "The capital of France is",
 8 |     "The future of AI is",
 9 | ]
10 | # Create a sampling params object.
11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
12 | 
13 | # Create an LLM.
14 | llm = LLM(model="meta-llama/Llama-2-13b-chat-hf", cpu_offload_gb=10)
15 | # Generate texts from the prompts. The output is a list of RequestOutput objects
16 | # that contain the prompt, generated text, and other information.
17 | outputs = llm.generate(prompts, sampling_params)
18 | # Print the outputs.
19 | for output in outputs:
20 |     prompt = output.prompt
21 |     generated_text = output.outputs[0].text
22 |     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
23 | 


--------------------------------------------------------------------------------
/examples/offline_inference.py:
--------------------------------------------------------------------------------
 1 | from vllm import LLM, SamplingParams
 2 | 
 3 | # Sample prompts.
 4 | prompts = [
 5 |     "Hello, my name is",
 6 |     "The president of the United States is",
 7 |     "The capital of France is",
 8 |     "The future of AI is",
 9 | ]
10 | # Create a sampling params object.
11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
12 | 
13 | # Create an LLM.
14 | llm = LLM(model="facebook/opt-125m")
15 | # Generate texts from the prompts. The output is a list of RequestOutput objects
16 | # that contain the prompt, generated text, and other information.
17 | outputs = llm.generate(prompts, sampling_params)
18 | # Print the outputs.
19 | for output in outputs:
20 |     prompt = output.prompt
21 |     generated_text = output.outputs[0].text
22 |     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
23 | 


--------------------------------------------------------------------------------
/examples/offline_inference_arctic.py:
--------------------------------------------------------------------------------
 1 | from vllm import LLM, SamplingParams
 2 | 
 3 | # Sample prompts.
 4 | prompts = [
 5 |     "Hello, my name is",
 6 |     "The president of the United States is",
 7 |     "The capital of France is",
 8 |     "The future of AI is",
 9 | ]
10 | # Create a sampling params object.
11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
12 | 
13 | # Create an LLM.
14 | llm = LLM(model="snowflake/snowflake-arctic-instruct",
15 |           quantization="deepspeedfp",
16 |           tensor_parallel_size=8,
17 |           trust_remote_code=True)
18 | # Generate texts from the prompts. The output is a list of RequestOutput objects
19 | # that contain the prompt, generated text, and other information.
20 | 
21 | outputs = llm.generate(prompts, sampling_params)
22 | # Print the outputs.
23 | for output in outputs:
24 |     prompt = output.prompt
25 |     generated_text = output.outputs[0].text
26 |     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
27 | 


--------------------------------------------------------------------------------
/examples/offline_inference_embedding.py:
--------------------------------------------------------------------------------
 1 | from vllm import LLM
 2 | 
 3 | # Sample prompts.
 4 | prompts = [
 5 |     "Hello, my name is",
 6 |     "The president of the United States is",
 7 |     "The capital of France is",
 8 |     "The future of AI is",
 9 | ]
10 | 
11 | # Create an LLM.
12 | model = LLM(model="intfloat/e5-mistral-7b-instruct", enforce_eager=True)
13 | # Generate embedding. The output is a list of EmbeddingRequestOutputs.
14 | outputs = model.encode(prompts)
15 | # Print the outputs.
16 | for output in outputs:
17 |     print(output.outputs.embedding)  # list of 4096 floats
18 | 


--------------------------------------------------------------------------------
/examples/offline_inference_tpu.py:
--------------------------------------------------------------------------------
 1 | from vllm import LLM, SamplingParams
 2 | 
 3 | prompts = [
 4 |     "A robot may not injure a human being",
 5 |     "It is only with the heart that one can see rightly;",
 6 |     "The greatest glory in living lies not in never falling,",
 7 | ]
 8 | answers = [
 9 |     " or, through inaction, allow a human being to come to harm.",
10 |     " what is essential is invisible to the eye.",
11 |     " but in rising every time we fall.",
12 | ]
13 | N = 1
14 | # Currently, top-p sampling is disabled. `top_p` should be 1.0.
15 | sampling_params = SamplingParams(temperature=0.7,
16 |                                  top_p=1.0,
17 |                                  n=N,
18 |                                  max_tokens=16)
19 | 
20 | # Set `enforce_eager=True` to avoid ahead-of-time compilation.
21 | # In real workloads, `enforace_eager` should be `False`.
22 | llm = LLM(model="google/gemma-2b", enforce_eager=True)
23 | outputs = llm.generate(prompts, sampling_params)
24 | for output, answer in zip(outputs, answers):
25 |     prompt = output.prompt
26 |     generated_text = output.outputs[0].text
27 |     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
28 |     assert generated_text.startswith(answer)
29 | 


--------------------------------------------------------------------------------
/examples/offline_inference_with_profiler.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from vllm import LLM, SamplingParams
 4 | 
 5 | # enable torch profiler, can also be set on cmd line
 6 | os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile"
 7 | 
 8 | # Sample prompts.
 9 | prompts = [
10 |     "Hello, my name is",
11 |     "The president of the United States is",
12 |     "The capital of France is",
13 |     "The future of AI is",
14 | ]
15 | # Create a sampling params object.
16 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
17 | 
18 | # Create an LLM.
19 | llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)
20 | 
21 | llm.start_profile()
22 | 
23 | # Generate texts from the prompts. The output is a list of RequestOutput objects
24 | # that contain the prompt, generated text, and other information.
25 | outputs = llm.generate(prompts, sampling_params)
26 | 
27 | llm.stop_profile()
28 | 
29 | # Print the outputs.
30 | for output in outputs:
31 |     prompt = output.prompt
32 |     generated_text = output.outputs[0].text
33 |     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
34 | 


--------------------------------------------------------------------------------
/examples/openai_chat_completion_client.py:
--------------------------------------------------------------------------------
 1 | from openai import OpenAI
 2 | 
 3 | # Modify OpenAI's API key and API base to use vLLM's API server.
 4 | openai_api_key = "EMPTY"
 5 | openai_api_base = "http://localhost:8000/v1"
 6 | 
 7 | client = OpenAI(
 8 |     # defaults to os.environ.get("OPENAI_API_KEY")
 9 |     api_key=openai_api_key,
10 |     base_url=openai_api_base,
11 | )
12 | 
13 | models = client.models.list()
14 | model = models.data[0].id
15 | 
16 | chat_completion = client.chat.completions.create(
17 |     messages=[{
18 |         "role": "system",
19 |         "content": "You are a helpful assistant."
20 |     }, {
21 |         "role": "user",
22 |         "content": "Who won the world series in 2020?"
23 |     }, {
24 |         "role":
25 |         "assistant",
26 |         "content":
27 |         "The Los Angeles Dodgers won the World Series in 2020."
28 |     }, {
29 |         "role": "user",
30 |         "content": "Where was it played?"
31 |     }],
32 |     model=model,
33 | )
34 | 
35 | print("Chat completion results:")
36 | print(chat_completion)
37 | 


--------------------------------------------------------------------------------
/examples/openai_completion_client.py:
--------------------------------------------------------------------------------
 1 | from openai import OpenAI
 2 | 
 3 | # Modify OpenAI's API key and API base to use vLLM's API server.
 4 | openai_api_key = "EMPTY"
 5 | openai_api_base = "http://localhost:8000/v1"
 6 | 
 7 | client = OpenAI(
 8 |     # defaults to os.environ.get("OPENAI_API_KEY")
 9 |     api_key=openai_api_key,
10 |     base_url=openai_api_base,
11 | )
12 | 
13 | models = client.models.list()
14 | model = models.data[0].id
15 | 
16 | # Completion API
17 | stream = False
18 | completion = client.completions.create(
19 |     model=model,
20 |     prompt="A robot may not injure a human being",
21 |     echo=False,
22 |     n=2,
23 |     stream=stream,
24 |     logprobs=3)
25 | 
26 | print("Completion results:")
27 | if stream:
28 |     for c in completion:
29 |         print(c)
30 | else:
31 |     print(completion)
32 | 


--------------------------------------------------------------------------------
/examples/openai_embedding_client.py:
--------------------------------------------------------------------------------
 1 | from openai import OpenAI
 2 | 
 3 | # Modify OpenAI's API key and API base to use vLLM's API server.
 4 | openai_api_key = "EMPTY"
 5 | openai_api_base = "http://localhost:8000/v1"
 6 | 
 7 | client = OpenAI(
 8 |     # defaults to os.environ.get("OPENAI_API_KEY")
 9 |     api_key=openai_api_key,
10 |     base_url=openai_api_base,
11 | )
12 | 
13 | models = client.models.list()
14 | model = models.data[0].id
15 | 
16 | responses = client.embeddings.create(
17 |     input=[
18 |         "Hello my name is",
19 |         "The best thing about vLLM is that it supports many different models"
20 |     ],
21 |     model=model,
22 | )
23 | 
24 | for data in responses.data:
25 |     print(data.embedding)  # list of float of len 4096
26 | 


--------------------------------------------------------------------------------
/examples/openai_example_batch.jsonl:
--------------------------------------------------------------------------------
1 | {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
2 | {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
3 | 


--------------------------------------------------------------------------------
/examples/production_monitoring/docker-compose.yaml:
--------------------------------------------------------------------------------
 1 | # docker-compose.yaml
 2 | version: "3"
 3 | 
 4 | services:
 5 |   prometheus:
 6 |     image: prom/prometheus:latest
 7 |     extra_hosts:
 8 |       - "host.docker.internal:host-gateway"     # allow a direct connection from container to the local machine
 9 |     ports:
10 |       - "9090:9090"   # the default port used by Prometheus
11 |     volumes:
12 |       - ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml # mount Prometheus config file
13 | 
14 |   grafana:
15 |     image: grafana/grafana:latest
16 |     depends_on:
17 |       - prometheus
18 |     ports:
19 |       - "3000:3000" # the default port used by Grafana
20 | 


--------------------------------------------------------------------------------
/examples/production_monitoring/prometheus.yaml:
--------------------------------------------------------------------------------
 1 | # prometheus.yaml
 2 | global:
 3 |   scrape_interval: 5s
 4 |   evaluation_interval: 30s
 5 | 
 6 | scrape_configs:
 7 |   - job_name: vllm
 8 |     static_configs:
 9 |       - targets:
10 |           - 'host.docker.internal:8000'
11 | 


--------------------------------------------------------------------------------
/examples/template_alpaca.jinja:
--------------------------------------------------------------------------------
 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
 2 | 
 3 | {% for message in messages %}
 4 | {% if message['role'] == 'user' %}
 5 | ### Instruction:
 6 | {{ message['content']|trim -}}
 7 | {% if not loop.last %}
 8 | 
 9 | 
10 | {% endif %}
11 | {% elif message['role'] == 'assistant' %}
12 | ### Response:
13 | {{ message['content']|trim -}}
14 | {% if not loop.last %}
15 | 
16 | 
17 | {% endif %}
18 | {% elif message['role'] == 'user_context' %}
19 | ### Input:
20 | {{ message['content']|trim -}}
21 | {% if not loop.last %}
22 | 
23 | 
24 | {% endif %}
25 | {% endif %}
26 | {% endfor %}
27 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
28 | ### Response:
29 | {% endif %}


--------------------------------------------------------------------------------
/examples/template_baichuan.jinja:
--------------------------------------------------------------------------------
 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
 2 | 
 3 | {%- for message in messages -%}
 4 |     {%- if message['role'] == 'user' -%}
 5 |         {{- '<reserved_106>' + message['content'] -}}
 6 |     {%- elif message['role'] == 'assistant' -%}
 7 |         {{- '<reserved_107>' + message['content'] -}}
 8 |     {%- endif -%}
 9 | {%- endfor -%}
10 | 
11 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
12 |     {{- '<reserved_107>' -}}
13 | {% endif %}


--------------------------------------------------------------------------------
/examples/template_blip2.jinja:
--------------------------------------------------------------------------------
 1 | {%- for message in messages -%}
 2 |     {%- if message['role'] == 'user' -%}
 3 |         {{- 'Question: ' + message['content'] + ' ' -}}
 4 |     {%- elif message['role'] == 'assistant' -%}
 5 |         {{- 'Answer: ' + message['content'] + ' ' -}}
 6 |     {%- endif -%}
 7 | {%- endfor -%}
 8 | 
 9 | {%- if add_generation_prompt -%}
10 |     {{- 'Answer:' -}}
11 | {% endif %}
12 | 


--------------------------------------------------------------------------------
/examples/template_chatglm.jinja:
--------------------------------------------------------------------------------
 1 | {%- set counter = namespace(index=0) -%}
 2 | {%- for message in messages -%}
 3 |     {%- if message['role'] == 'user' -%}
 4 |         {{- '[Round ' + counter.index|string + ']\n问：' + message['content'] -}}
 5 |         {%- set counter.index = counter.index + 1 -%}
 6 |     {%- endif -%}
 7 |     {%- if message['role'] == 'assistant' -%}
 8 |         {{- '\n答：' + message['content'] -}}
 9 |         {%- if (loop.last and add_generation_prompt) or not loop.last -%}
10 |             {{- '\n' -}}
11 |         {%- endif -%}
12 |     {%- endif -%}
13 | {%- endfor -%}
14 | 
15 | 
16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
17 |     {{- '\n答：' -}}
18 | {%- endif -%}


--------------------------------------------------------------------------------
/examples/template_chatglm2.jinja:
--------------------------------------------------------------------------------
 1 | {%- set counter = namespace(index=1) -%}
 2 | {%- for message in messages -%}
 3 |     {%- if message['role'] == 'user' -%}
 4 |         {{- '[Round ' + counter.index|string + ']\n\n问：' + message['content'] -}}
 5 |         {%- set counter.index = counter.index + 1 -%}
 6 |     {%- endif -%}
 7 |     {%- if message['role'] == 'assistant' -%}
 8 |         {{- '\n\n答：' + message['content'] -}}
 9 |         {%- if (loop.last and add_generation_prompt) or not loop.last -%}
10 |             {{- '\n\n' -}}
11 |         {%- endif -%}
12 |     {%- endif -%}
13 | {%- endfor -%}
14 | 
15 | 
16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
17 |     {{- '\n\n答：' -}}
18 | {%- endif -%}


--------------------------------------------------------------------------------
/examples/template_chatml.jinja:
--------------------------------------------------------------------------------
1 | {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
2 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}


--------------------------------------------------------------------------------
/examples/template_falcon.jinja:
--------------------------------------------------------------------------------
 1 | {%- for message in messages -%}
 2 |     {%- if message['role'] == 'user' -%}
 3 |         {{- 'User: ' + message['content'] -}}
 4 |     {%- elif message['role'] == 'assistant' -%}
 5 |         {{- 'Assistant: ' + message['content'] -}}
 6 |     {%- endif -%}
 7 |     {%- if (loop.last and add_generation_prompt) or not loop.last -%}
 8 |         {{- '\n' -}}
 9 |     {%- endif -%}
10 | {%- endfor -%}
11 | 
12 | 
13 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
14 |     {{- 'Assistant:' -}}
15 | {% endif %}


--------------------------------------------------------------------------------
/examples/template_falcon_180b.jinja:
--------------------------------------------------------------------------------
 1 | {%- for message in messages -%}
 2 |     {%- if message['role'] == 'system' -%}
 3 |         {{- 'System: ' + message['content'] -}}
 4 |     {%- elif message['role'] == 'user' -%}
 5 |         {{- 'User: ' + message['content'] -}}
 6 |     {%- elif message['role'] == 'assistant' -%}
 7 |         {{- 'Falcon: ' + message['content'] -}}
 8 |     {%- endif -%}
 9 |     {%- if (loop.last and add_generation_prompt) or not loop.last -%}
10 |         {{- '\n' -}}
11 |     {%- endif -%}
12 | {%- endfor -%}
13 | 
14 | 
15 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
16 |     {{- 'Falcon:' -}}
17 | {% endif %}


--------------------------------------------------------------------------------
/examples/template_inkbot.jinja:
--------------------------------------------------------------------------------
 1 | <#meta#>
 2 | - Date: {{ (messages|selectattr('role', 'equalto', 'meta-current_date')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-current_date')|list) else '' }}
 3 | - Task: {{ (messages|selectattr('role', 'equalto', 'meta-task_name')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-task_name')|list) else '' }}
 4 | <#system#>
 5 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
 6 | <#chat#>
 7 | {% for message in messages %}
 8 | {% if message['role'] == 'user' %}
 9 | <#user#>
10 | {{ message['content']|trim -}}
11 | {% if not loop.last %}
12 | 
13 | {% endif %}
14 | {% elif message['role'] == 'assistant' %}
15 | <#bot#>
16 | {{ message['content']|trim -}}
17 | {% if not loop.last %}
18 | 
19 | {% endif %}
20 | {% elif message['role'] == 'user_context' %}
21 | <#user_context#>
22 | {{ message['content']|trim -}}
23 | {% if not loop.last %}
24 | 
25 | {% endif %}
26 | {% endif %}
27 | {% endfor %}
28 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
29 | <#bot#>
30 | {% endif %}


--------------------------------------------------------------------------------
/examples/template_llava.jinja:
--------------------------------------------------------------------------------
 1 | {%- if messages[0]['role'] == 'system' -%}
 2 |     {%- set system_message = messages[0]['content'] -%}
 3 |     {%- set messages = messages[1:] -%}
 4 | {%- else -%}
 5 |     {% set system_message = '' -%}
 6 | {%- endif -%}
 7 | 
 8 | {{ bos_token + system_message }}
 9 | {%- for message in messages -%}
10 |     {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
11 |         {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
12 |     {%- endif -%}
13 | 
14 |     {%- if message['role'] == 'user' -%}
15 |         {{ 'USER: ' + message['content'] + '\n' }}
16 |     {%- elif message['role'] == 'assistant' -%}
17 |         {{ 'ASSISTANT: ' + message['content'] + eos_token + '\n' }}
18 |     {%- endif -%}
19 | {%- endfor -%}
20 | 
21 | {%- if add_generation_prompt -%}
22 |     {{ 'ASSISTANT:' }}
23 | {% endif %}
24 | 


--------------------------------------------------------------------------------
/find_cuda_init.py:
--------------------------------------------------------------------------------
 1 | import importlib
 2 | import traceback
 3 | from typing import Callable
 4 | from unittest.mock import patch
 5 | 
 6 | 
 7 | def find_cuda_init(fn: Callable[[], object]) -> None:
 8 |     """
 9 |     Helper function to debug CUDA re-initialization errors.
10 | 
11 |     If `fn` initializes CUDA, prints the stack trace of how this happens.
12 |     """
13 |     from torch.cuda import _lazy_init
14 | 
15 |     stack = None
16 | 
17 |     def wrapper():
18 |         nonlocal stack
19 |         stack = traceback.extract_stack()
20 |         return _lazy_init()
21 | 
22 |     with patch("torch.cuda._lazy_init", wrapper):
23 |         fn()
24 | 
25 |     if stack is not None:
26 |         print("==== CUDA Initialized ====")
27 |         print("".join(traceback.format_list(stack)).strip())
28 |         print("==========================")
29 | 
30 | 
31 | if __name__ == "__main__":
32 |     find_cuda_init(
33 |         lambda: importlib.import_module("vllm.model_executor.models.llava"))
34 | 


--------------------------------------------------------------------------------
/requirements-build.txt:
--------------------------------------------------------------------------------
 1 | # Should be mirrored in pyproject.toml
 2 | cmake>=3.26
 3 | ninja
 4 | packaging
 5 | setuptools>=61
 6 | setuptools-scm>=8
 7 | torch==2.4.0
 8 | wheel
 9 | jinja2
10 | 


--------------------------------------------------------------------------------
/requirements-cpu.txt:
--------------------------------------------------------------------------------
1 | # Common dependencies
2 | -r requirements-common.txt
3 | 
4 | # Dependencies for x86_64 CPUs
5 | torch == 2.4.0+cpu; platform_machine != "ppc64le"
6 | torchvision; platform_machine != "ppc64le"   # required for the image processor of phi3v, this must be updated alongside torch
7 | 


--------------------------------------------------------------------------------
/requirements-cuda.txt:
--------------------------------------------------------------------------------
 1 | # Common dependencies
 2 | -r requirements-common.txt
 3 | 
 4 | # Dependencies for NVIDIA GPUs
 5 | ray >= 2.9
 6 | nvidia-ml-py # for pynvml package
 7 | torch == 2.4.0
 8 | # These must be updated alongside torch
 9 | torchvision == 0.19   # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
10 | xformers == 0.0.27.post2; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.4.0
11 | 


--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | -r requirements-lint.txt
2 | -r requirements-test.txt
3 | 
4 | # Avoid adding requirements directly to this file.
5 | # Instead, modify the two files referenced above.
6 | 


--------------------------------------------------------------------------------
/requirements-lint.txt:
--------------------------------------------------------------------------------
 1 | # formatting
 2 | yapf==0.32.0
 3 | toml==0.10.2
 4 | tomli==2.0.1
 5 | ruff==0.6.5
 6 | codespell==2.3.0
 7 | isort==5.13.2
 8 | clang-format==18.1.5
 9 | 
10 | # type checking
11 | mypy==1.11.1
12 | types-PyYAML
13 | types-requests
14 | types-setuptools
15 | 


--------------------------------------------------------------------------------
/requirements-neuron.txt:
--------------------------------------------------------------------------------
1 | # Common dependencies
2 | -r requirements-common.txt
3 | 
4 | # Dependencies for Neuron devices
5 | transformers-neuronx >= 0.12.0
6 | torch-neuronx >= 2.1.2
7 | neuronx-cc
8 | 


--------------------------------------------------------------------------------
/requirements-openvino.txt:
--------------------------------------------------------------------------------
1 | # Common dependencies
2 | -r requirements-common.txt
3 | 
4 | torch == 2.4.0 #  should be aligned with "common" vLLM torch version
5 | openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention
6 | 
7 | optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version
8 | optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git@main # latest optimum-intel is used to support latest transformers version
9 | 


--------------------------------------------------------------------------------
/requirements-rocm.txt:
--------------------------------------------------------------------------------
 1 | # Common dependencies
 2 | -r requirements-common.txt
 3 | 
 4 | # Dependencies for AMD GPUs
 5 | awscli
 6 | boto3
 7 | botocore
 8 | ray >= 2.10.0
 9 | peft
10 | pytest-asyncio
11 | tensorizer>=2.9.0


--------------------------------------------------------------------------------
/requirements-test.txt:
--------------------------------------------------------------------------------
 1 | # testing
 2 | pytest
 3 | tensorizer>=2.9.0
 4 | pytest-forked
 5 | pytest-asyncio
 6 | pytest-rerunfailures
 7 | pytest-shard
 8 | 
 9 | # testing utils
10 | awscli
11 | einops # required for MPT, qwen-vl and Mamba
12 | httpx
13 | librosa # required for audio tests
14 | opencv-python # required for video tests
15 | peft
16 | requests
17 | ray[adag]==2.35
18 | sentence-transformers # required for embedding
19 | soundfile # required for audio test
20 | compressed-tensors==0.4.0 # required for compressed-tensors
21 | timm # required for internvl test
22 | transformers_stream_generator # required for qwen-vl test
23 | matplotlib # required for qwen-vl test
24 | datamodel_code_generator # required for minicpm3 test
25 | lm-eval[api]==0.4.4 # required for model evaluation test
26 | 
27 | # TODO: Add this after fully implementing llava(mantis)
28 | # git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test
29 | 
30 | # Benchmarking
31 | aiohttp
32 | 
33 | # quantization
34 | bitsandbytes>=0.44.0
35 | buildkite-test-collector==0.1.8
36 | 


--------------------------------------------------------------------------------
/requirements-tpu.txt:
--------------------------------------------------------------------------------
1 | # Common dependencies
2 | -r requirements-common.txt
3 | 
4 | # Dependencies for TPU
5 | # Currently, the TPU backend uses a nightly version of PyTorch XLA.
6 | # You can install the dependencies in Dockerfile.tpu.
7 | ray[default]
8 | 


--------------------------------------------------------------------------------
/requirements-xpu.txt:
--------------------------------------------------------------------------------
 1 | # Common dependencies
 2 | -r requirements-common.txt
 3 | 
 4 | ray >= 2.9
 5 | cmake>=3.26
 6 | ninja
 7 | packaging
 8 | setuptools-scm>=8
 9 | wheel
10 | jinja2
11 | # Following pkgs retrieved from https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
12 | torch == 2.3.1+cxx11.abi
13 | intel-extension-for-pytorch == 2.3.110+xpu
14 | oneccl_bind_pt == 2.3.100+xpu
15 | 
16 | triton-xpu == 3.0.0b2
17 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/__init__.py


--------------------------------------------------------------------------------
/tests/async_engine/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/async_engine/__init__.py


--------------------------------------------------------------------------------
/tests/basic_correctness/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/basic_correctness/__init__.py


--------------------------------------------------------------------------------
/tests/basic_correctness/test_cpu_offload.py:
--------------------------------------------------------------------------------
1 | from ..utils import compare_two_settings
2 | 
3 | 
4 | def test_cpu_offload():
5 |     compare_two_settings("meta-llama/Llama-2-7b-hf", [],
6 |                          ["--cpu-offload-gb", "4"])
7 | 


--------------------------------------------------------------------------------
/tests/compile/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/compile/__init__.py


--------------------------------------------------------------------------------
/tests/compile/test_full_graph.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from vllm.compilation.levels import CompilationLevel
 4 | 
 5 | from ..utils import fork_new_process_for_each_test
 6 | from .utils import TEST_MODELS, check_full_graph_support
 7 | 
 8 | 
 9 | @pytest.mark.parametrize("model_info", TEST_MODELS)
10 | @pytest.mark.parametrize(
11 |     "optimization_level",
12 |     [CompilationLevel.DYNAMO_ONCE, CompilationLevel.INDUCTOR])
13 | @fork_new_process_for_each_test
14 | def test_full_graph(model_info, optimization_level):
15 |     model = model_info[0]
16 |     model_kwargs = model_info[1]
17 |     check_full_graph_support(model,
18 |                              model_kwargs,
19 |                              optimization_level,
20 |                              tp_size=1)
21 | 


--------------------------------------------------------------------------------
/tests/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/core/__init__.py


--------------------------------------------------------------------------------
/tests/core/block/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/core/block/__init__.py


--------------------------------------------------------------------------------
/tests/core/block/conftest.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | 
 4 | @pytest.fixture()
 5 | def should_do_global_cleanup_after_test() -> bool:
 6 |     """Disable the global cleanup fixture for tests in this directory. This
 7 |     provides a ~10x speedup for unit tests that don't load a model to GPU.
 8 | 
 9 |     This requires that tests in this directory clean up after themselves if they
10 |     use the GPU.
11 |     """
12 |     return False
13 | 


--------------------------------------------------------------------------------
/tests/core/block/e2e/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/core/block/e2e/__init__.py


--------------------------------------------------------------------------------
/tests/core/test_serialization.py:
--------------------------------------------------------------------------------
 1 | import msgspec
 2 | 
 3 | from vllm.executor.msgspec_utils import decode_hook, encode_hook
 4 | from vllm.sequence import ExecuteModelRequest
 5 | 
 6 | from ..spec_decode.utils import create_batch
 7 | 
 8 | 
 9 | def test_msgspec_serialization():
10 |     num_lookahead_slots = 4
11 |     seq_group_metadata_list, _, _ = create_batch(16, num_lookahead_slots)
12 |     execute_model_req = ExecuteModelRequest(
13 |         seq_group_metadata_list=seq_group_metadata_list,
14 |         num_lookahead_slots=num_lookahead_slots,
15 |         running_queue_size=4)
16 | 
17 |     encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
18 |     decoder = msgspec.msgpack.Decoder(ExecuteModelRequest,
19 |                                       dec_hook=decode_hook)
20 |     req = decoder.decode(encoder.encode(execute_model_req))
21 |     expected = execute_model_req.seq_group_metadata_list
22 |     actual = req.seq_group_metadata_list
23 |     assert (len(expected) == len(actual))
24 |     expected = expected[0]
25 |     actual = actual[0]
26 | 
27 |     assert expected.block_tables == actual.block_tables
28 |     assert expected.is_prompt == actual.is_prompt
29 |     assert expected.request_id == actual.request_id
30 |     assert (expected.seq_data[0].prompt_token_ids ==
31 |             actual.seq_data[0].prompt_token_ids)
32 |     assert (expected.seq_data[0].output_token_ids ==
33 |             actual.seq_data[0].output_token_ids)
34 | 


--------------------------------------------------------------------------------
/tests/data/test_config.yaml:
--------------------------------------------------------------------------------
1 | port: 12312
2 | served_model_name: mymodel
3 | tensor_parallel_size: 2
4 | 


--------------------------------------------------------------------------------
/tests/distributed/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/distributed/__init__.py


--------------------------------------------------------------------------------
/tests/distributed/test_distributed_oot.py:
--------------------------------------------------------------------------------
1 | from ..entrypoints.openai.test_oot_registration import (
2 |     run_and_test_dummy_opt_api_server)
3 | 
4 | 
5 | def test_distributed_oot(dummy_opt_path: str):
6 |     run_and_test_dummy_opt_api_server(dummy_opt_path, tp=2)
7 | 


--------------------------------------------------------------------------------
/tests/distributed/test_pp_cudagraph.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import pytest
 4 | 
 5 | from ..utils import compare_two_settings, fork_new_process_for_each_test
 6 | 
 7 | 
 8 | @pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
 9 |     (2, "JackFram/llama-160m"),
10 | ])
11 | @pytest.mark.parametrize("ATTN_BACKEND", [
12 |     "FLASH_ATTN",
13 |     "FLASHINFER",
14 | ])
15 | @fork_new_process_for_each_test
16 | def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
17 |     cudagraph_args = [
18 |         # use half precision for speed and memory savings in CI environment
19 |         "--dtype",
20 |         "float16",
21 |         "--pipeline-parallel-size",
22 |         str(PP_SIZE),
23 |         "--distributed-executor-backend",
24 |         "mp",
25 |     ]
26 |     os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND
27 | 
28 |     eager_args = cudagraph_args + ["--enforce-eager"]
29 | 
30 |     compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
31 | 


--------------------------------------------------------------------------------
/tests/distributed/test_same_node.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import torch.distributed as dist
 4 | 
 5 | from vllm.distributed.parallel_state import in_the_same_node_as
 6 | 
 7 | if __name__ == "__main__":
 8 |     dist.init_process_group(backend="gloo")
 9 |     test_result = all(in_the_same_node_as(dist.group.WORLD, source_rank=0))
10 | 
11 |     expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
12 |     assert test_result == expected, f"Expected {expected}, got {test_result}"
13 |     print("Same node test passed!")
14 | 


--------------------------------------------------------------------------------
/tests/distributed/test_utils.py:
--------------------------------------------------------------------------------
 1 | import ray
 2 | 
 3 | import vllm.envs as envs
 4 | from vllm.utils import (cuda_device_count_stateless,
 5 |                         update_environment_variables)
 6 | 
 7 | 
 8 | @ray.remote
 9 | class _CUDADeviceCountStatelessTestActor:
10 | 
11 |     def get_count(self):
12 |         return cuda_device_count_stateless()
13 | 
14 |     def set_cuda_visible_devices(self, cuda_visible_devices: str):
15 |         update_environment_variables(
16 |             {"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
17 | 
18 |     def get_cuda_visible_devices(self):
19 |         return envs.CUDA_VISIBLE_DEVICES
20 | 
21 | 
22 | def test_cuda_device_count_stateless():
23 |     """Test that cuda_device_count_stateless changes return value if
24 |     CUDA_VISIBLE_DEVICES is changed."""
25 |     actor = _CUDADeviceCountStatelessTestActor.options(  # type: ignore
26 |         num_gpus=2).remote()
27 |     assert sorted(ray.get(
28 |         actor.get_cuda_visible_devices.remote()).split(",")) == ["0", "1"]
29 |     assert ray.get(actor.get_count.remote()) == 2
30 |     ray.get(actor.set_cuda_visible_devices.remote("0"))
31 |     assert ray.get(actor.get_count.remote()) == 1
32 |     ray.get(actor.set_cuda_visible_devices.remote(""))
33 |     assert ray.get(actor.get_count.remote()) == 0
34 | 


--------------------------------------------------------------------------------
/tests/encoder_decoder/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/encoder_decoder/__init__.py


--------------------------------------------------------------------------------
/tests/engine/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/engine/__init__.py


--------------------------------------------------------------------------------
/tests/engine/output_processor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/engine/output_processor/__init__.py


--------------------------------------------------------------------------------
/tests/engine/test_skip_tokenizer_init.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from vllm.entrypoints.llm import LLM
 4 | from vllm.sampling_params import SamplingParams
 5 | 
 6 | 
 7 | @pytest.mark.parametrize("model", ["facebook/opt-125m"])
 8 | def test_skip_tokenizer_initialization(model: str):
 9 |     # This test checks if the flag skip_tokenizer_init skips the initialization
10 |     # of tokenizer and detokenizer. The generated output is expected to contain
11 |     # token ids.
12 |     llm = LLM(model=model, skip_tokenizer_init=True)
13 |     sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
14 | 
15 |     with pytest.raises(ValueError, match="cannot pass text prompts when"):
16 |         llm.generate("abc", sampling_params)
17 | 
18 |     outputs = llm.generate({"prompt_token_ids": [1, 2, 3]},
19 |                            sampling_params=sampling_params)
20 |     assert len(outputs) > 0
21 |     completions = outputs[0].outputs
22 |     assert len(completions) > 0
23 |     assert completions[0].text == ""
24 |     assert completions[0].token_ids
25 | 


--------------------------------------------------------------------------------
/tests/entrypoints/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/entrypoints/__init__.py


--------------------------------------------------------------------------------
/tests/entrypoints/llm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/entrypoints/llm/__init__.py


--------------------------------------------------------------------------------
/tests/entrypoints/llm/test_prompt_validation.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from vllm import LLM
 4 | 
 5 | 
 6 | def test_empty_prompt():
 7 |     llm = LLM(model="gpt2")
 8 |     with pytest.raises(ValueError, match='Prompt cannot be empty'):
 9 |         llm.generate([""])
10 | 


--------------------------------------------------------------------------------
/tests/entrypoints/offline_mode/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/entrypoints/offline_mode/__init__.py


--------------------------------------------------------------------------------
/tests/entrypoints/openai/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/entrypoints/openai/__init__.py


--------------------------------------------------------------------------------
/tests/entrypoints/openai/test_prompt_validation.py:
--------------------------------------------------------------------------------
 1 | # imports for guided decoding tests
 2 | import re
 3 | 
 4 | import openai
 5 | import pytest
 6 | 
 7 | from ...utils import RemoteOpenAIServer
 8 | 
 9 | 
10 | @pytest.mark.asyncio
11 | async def test_empty_prompt():
12 |     model_name = "gpt2"
13 |     server_args = ["--enforce-eager"]
14 |     with RemoteOpenAIServer(model_name, server_args) as remote_server:
15 |         client = remote_server.get_async_client()
16 | 
17 |         with pytest.raises(openai.BadRequestError,
18 |                            match=re.compile('.+Prompt cannot be empty.+')):
19 |             await client.completions.create(model=model_name,
20 |                                             prompt="",
21 |                                             max_tokens=5,
22 |                                             temperature=0.0)
23 | 


--------------------------------------------------------------------------------
/tests/kernels/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/kernels/__init__.py


--------------------------------------------------------------------------------
/tests/kernels/allclose_default.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | # Reference default values of atol and rtol are from
 4 | # https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67
 5 | default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5}
 6 | default_rtol = {
 7 |     torch.float16: 1e-3,
 8 |     torch.bfloat16: 1.6e-2,
 9 |     torch.float: 1.3e-6
10 | }
11 | 
12 | 
13 | def get_default_atol(output) -> float:
14 |     return default_atol[output.dtype]
15 | 
16 | 
17 | def get_default_rtol(output) -> float:
18 |     return default_rtol[output.dtype]
19 | 


--------------------------------------------------------------------------------
/tests/kernels/conftest.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from vllm.utils import (create_kv_caches_with_random,
 4 |                         create_kv_caches_with_random_flash)
 5 | 
 6 | 
 7 | @pytest.fixture()
 8 | def kv_cache_factory():
 9 |     return create_kv_caches_with_random
10 | 
11 | 
12 | @pytest.fixture()
13 | def kv_cache_factory_flashinfer():
14 |     return create_kv_caches_with_random_flash
15 | 


--------------------------------------------------------------------------------
/tests/kernels/test_ggml.py:
--------------------------------------------------------------------------------
 1 | import gguf
 2 | import pytest
 3 | import torch
 4 | 
 5 | from tests.kernels.utils import opcheck
 6 | from vllm import _custom_ops as ops  # noqa: F401
 7 | 
 8 | 
 9 | @pytest.mark.parametrize("quant_type", [12])
10 | def test_ggml_opcheck(quant_type):
11 |     block_size, type_size = gguf.GGML_QUANT_SIZES[quant_type]
12 |     shape = [256, 1152]
13 |     qweight = torch.randint(0, 100, shape, device='cuda', dtype=torch.uint8)
14 |     m = qweight.shape[0]
15 |     n = qweight.shape[1] // type_size * block_size
16 |     opcheck(torch.ops._C.ggml_dequantize, (qweight, quant_type, m, n))
17 | 
18 |     x = torch.rand((m, 512), device='cuda', dtype=torch.float16)
19 |     opcheck(torch.ops._C.ggml_mul_mat_a8,
20 |             (qweight, x, quant_type, qweight.shape[0]))
21 |     opcheck(torch.ops._C.ggml_mul_mat_vec_a8,
22 |             (qweight, x, quant_type, qweight.shape[0]))
23 | 


--------------------------------------------------------------------------------
/tests/kernels/test_gptq.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | from tests.kernels.utils import opcheck
 4 | from vllm import _custom_ops as ops  # noqa: F401
 5 | 
 6 | 
 7 | def test_gptq_shuffle_opcheck():
 8 |     weight = torch.randint(-2000000,
 9 |                            2000000, (1792, 4096),
10 |                            device='cuda',
11 |                            dtype=torch.int32)
12 |     perm = torch.empty((0, ), device='cuda', dtype=torch.int32)
13 |     bit = 4
14 |     opcheck(torch.ops._C.gptq_shuffle, (weight, perm, bit))
15 | 
16 | 
17 | def test_gptq_gemm_opcheck():
18 |     a = torch.rand((240, 4096), device='cuda', dtype=torch.float16)
19 |     weight = torch.randint(-2000000,
20 |                            2000000, (512, 6144),
21 |                            device='cuda',
22 |                            dtype=torch.int32)
23 |     zeros = torch.zeros((32, 768), device='cuda', dtype=torch.int32)
24 |     scales = torch.rand((32, 6144), device='cuda', dtype=torch.float16)
25 |     idx = torch.empty((0, ), device='cuda', dtype=torch.int32)
26 |     use_exllama = True
27 |     bit = 4
28 |     opcheck(torch.ops._C.gptq_gemm,
29 |             (a, weight, zeros, scales, idx, use_exllama, bit))
30 | 


--------------------------------------------------------------------------------
/tests/kernels/test_permute_cols.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import torch
 3 | 
 4 | from tests.kernels.utils import opcheck
 5 | from vllm._custom_ops import permute_cols
 6 | 
 7 | 
 8 | @pytest.mark.parametrize('shape', [(1, 512), (544, 4096), (67, 8192)])
 9 | @pytest.mark.parametrize('dtype', [torch.bfloat16, torch.float16])
10 | def test_permute_cols(shape, dtype):
11 |     x = torch.randn(shape, dtype=dtype).cuda()
12 |     perm = torch.randperm(x.shape[1]).to(torch.int).cuda()
13 |     opcheck(torch.ops._C.permute_cols, (x, perm))
14 |     y = permute_cols(x, perm)
15 |     torch.testing.assert_close(y, x[:, perm])


--------------------------------------------------------------------------------
/tests/kernels/test_utils.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Tests for miscellaneous utilities
 3 | """
 4 | 
 5 | import pytest
 6 | import torch
 7 | 
 8 | from tests.kernels.utils import opcheck
 9 | from vllm.platforms import current_platform
10 | 
11 | 
12 | def test_convert_fp8_opcheck():
13 |     data = torch.randn((256, 256), dtype=torch.float32, device="cuda")
14 |     result = torch.empty_like(data, dtype=torch.float8_e4m3fn)
15 |     opcheck(torch.ops._C_cache_ops.convert_fp8, (result, data, 1.0, "fp8"))
16 | 
17 | 
18 | @pytest.mark.skipif(not current_platform.is_cuda(),
19 |                     reason="Only supported for CUDA")
20 | def test_cuda_utils_opcheck():
21 |     opcheck(torch.ops._C_cuda_utils.get_device_attribute, (0, 0))
22 |     opcheck(
23 |         torch.ops._C_cuda_utils.
24 |         get_max_shared_memory_per_block_device_attribute, (0, ))
25 | 


--------------------------------------------------------------------------------
/tests/lora/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/lora/__init__.py


--------------------------------------------------------------------------------
/tests/lora/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/lora/data/__init__.py


--------------------------------------------------------------------------------
/tests/metrics/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/metrics/__init__.py


--------------------------------------------------------------------------------
/tests/model_executor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/model_executor/__init__.py


--------------------------------------------------------------------------------
/tests/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/models/__init__.py


--------------------------------------------------------------------------------
/tests/models/decoder_only/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/models/decoder_only/__init__.py


--------------------------------------------------------------------------------
/tests/models/decoder_only/audio_language/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/models/decoder_only/audio_language/__init__.py


--------------------------------------------------------------------------------
/tests/models/decoder_only/language/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/models/decoder_only/language/__init__.py


--------------------------------------------------------------------------------
/tests/models/decoder_only/language/test_granite.py:
--------------------------------------------------------------------------------
 1 | """Compare the outputs of HF and vLLM for Granite models using greedy sampling.
 2 | 
 3 | Run `pytest tests/models/test_granite.py`.
 4 | """
 5 | import pytest
 6 | 
 7 | from ...utils import check_logprobs_close
 8 | 
 9 | MODELS = [
10 |     "ibm/PowerLM-3b",
11 | ]
12 | 
13 | 
14 | @pytest.mark.parametrize("model", MODELS)
15 | @pytest.mark.parametrize("dtype", ["bfloat16"])
16 | @pytest.mark.parametrize("max_tokens", [64])
17 | @pytest.mark.parametrize("num_logprobs", [5])
18 | def test_models(
19 |     hf_runner,
20 |     vllm_runner,
21 |     example_prompts,
22 |     model: str,
23 |     dtype: str,
24 |     max_tokens: int,
25 |     num_logprobs: int,
26 | ) -> None:
27 |     # TODO(sang): Sliding window should be tested separately.
28 |     with hf_runner(model, dtype=dtype) as hf_model:
29 |         hf_outputs = hf_model.generate_greedy_logprobs_limit(
30 |             example_prompts, max_tokens, num_logprobs)
31 | 
32 |     with vllm_runner(model, dtype=dtype) as vllm_model:
33 |         vllm_outputs = vllm_model.generate_greedy_logprobs(
34 |             example_prompts, max_tokens, num_logprobs)
35 |     check_logprobs_close(
36 |         outputs_0_lst=hf_outputs,
37 |         outputs_1_lst=vllm_outputs,
38 |         name_0="hf",
39 |         name_1="vllm",
40 |     )
41 | 


--------------------------------------------------------------------------------
/tests/models/decoder_only/language/test_granitemoe.py:
--------------------------------------------------------------------------------
 1 | """Compare the outputs of HF and vLLM for Granite models using greedy sampling.
 2 | 
 3 | Run `pytest tests/models/test_granite.py`.
 4 | """
 5 | import pytest
 6 | 
 7 | from ...utils import check_logprobs_close
 8 | 
 9 | MODELS = [
10 |     "ibm/PowerMoE-3b",
11 | ]
12 | 
13 | 
14 | @pytest.mark.parametrize("model", MODELS)
15 | @pytest.mark.parametrize("dtype", ["bfloat16"])
16 | @pytest.mark.parametrize("max_tokens", [64])
17 | @pytest.mark.parametrize("num_logprobs", [5])
18 | def test_models(
19 |     hf_runner,
20 |     vllm_runner,
21 |     example_prompts,
22 |     model: str,
23 |     dtype: str,
24 |     max_tokens: int,
25 |     num_logprobs: int,
26 | ) -> None:
27 |     with hf_runner(model, dtype=dtype) as hf_model:
28 |         hf_outputs = hf_model.generate_greedy_logprobs_limit(
29 |             example_prompts, max_tokens, num_logprobs)
30 | 
31 |     with vllm_runner(model, dtype=dtype) as vllm_model:
32 |         vllm_outputs = vllm_model.generate_greedy_logprobs(
33 |             example_prompts, max_tokens, num_logprobs)
34 |     check_logprobs_close(
35 |         outputs_0_lst=hf_outputs,
36 |         outputs_1_lst=vllm_outputs,
37 |         name_0="hf",
38 |         name_1="vllm",
39 |     )
40 | 


--------------------------------------------------------------------------------
/tests/models/decoder_only/vision_language/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/models/decoder_only/vision_language/__init__.py


--------------------------------------------------------------------------------
/tests/models/embedding/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/models/embedding/__init__.py


--------------------------------------------------------------------------------
/tests/models/embedding/language/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/models/embedding/language/__init__.py


--------------------------------------------------------------------------------
/tests/models/encoder_decoder/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/models/encoder_decoder/__init__.py


--------------------------------------------------------------------------------
/tests/models/encoder_decoder/language/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/models/encoder_decoder/language/__init__.py


--------------------------------------------------------------------------------
/tests/models/encoder_decoder/vision_language/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/models/encoder_decoder/vision_language/__init__.py


--------------------------------------------------------------------------------
/tests/models/encoder_decoder/vision_language/test_broadcast.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from ....utils import multi_gpu_test
 4 | 
 5 | 
 6 | @multi_gpu_test(num_gpus=2)
 7 | @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
 8 | @pytest.mark.parametrize("model", [
 9 |     "meta-llama/Llama-3.2-11B-Vision-Instruct",
10 | ])
11 | def test_models(hf_runner, vllm_runner, image_assets,
12 |                 distributed_executor_backend, model) -> None:
13 | 
14 |     dtype = "half"
15 |     max_tokens = 5
16 |     num_logprobs = 5
17 |     tensor_parallel_size = 2
18 | 
19 |     if model.startswith("meta-llama/Llama-3.2-11B-Vision-Instruct"):
20 |         from .test_mllama import models, run_test
21 |     else:
22 |         raise NotImplementedError(f"Unsupported model: {model}")
23 | 
24 |     run_test(
25 |         hf_runner,
26 |         vllm_runner,
27 |         image_assets,
28 |         model=models[0],
29 |         size_factors=[0.25, 0.5, 1.0],
30 |         dtype=dtype,
31 |         max_tokens=max_tokens,
32 |         num_logprobs=num_logprobs,
33 |         tensor_parallel_size=tensor_parallel_size,
34 |         distributed_executor_backend=distributed_executor_backend,
35 |     )
36 | 


--------------------------------------------------------------------------------
/tests/mq_llm_engine/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/mq_llm_engine/__init__.py


--------------------------------------------------------------------------------
/tests/multi_step/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/multi_step/__init__.py


--------------------------------------------------------------------------------
/tests/multimodal/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/multimodal/__init__.py


--------------------------------------------------------------------------------
/tests/plugins/vllm_add_dummy_model/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | setup(name='vllm_add_dummy_model',
 4 |       version='0.1',
 5 |       packages=['vllm_add_dummy_model'],
 6 |       entry_points={
 7 |           'vllm.general_plugins':
 8 |           ["register_dummy_model = vllm_add_dummy_model:register"]
 9 |       })
10 | 


--------------------------------------------------------------------------------
/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py:
--------------------------------------------------------------------------------
 1 | from vllm import ModelRegistry
 2 | 
 3 | 
 4 | def register():
 5 |     # Test directly passing the model
 6 |     from .my_opt import MyOPTForCausalLM
 7 | 
 8 |     if "MyOPTForCausalLM" not in ModelRegistry.get_supported_archs():
 9 |         ModelRegistry.register_model("MyOPTForCausalLM", MyOPTForCausalLM)
10 | 
11 |     # Test passing lazy model
12 |     if "MyGemma2Embedding" not in ModelRegistry.get_supported_archs():
13 |         ModelRegistry.register_model(
14 |             "MyGemma2Embedding",
15 |             "vllm_add_dummy_model.my_gemma_embedding:MyGemma2Embedding",
16 |         )
17 | 
18 |     if "MyLlava" not in ModelRegistry.get_supported_archs():
19 |         ModelRegistry.register_model("MyLlava",
20 |                                      "vllm_add_dummy_model.my_llava:MyLlava")
21 | 


--------------------------------------------------------------------------------
/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Optional, Union
 2 | 
 3 | import torch
 4 | 
 5 | from vllm.attention import AttentionMetadata
 6 | from vllm.model_executor.models.gemma2_embedding import Gemma2EmbeddingModel
 7 | from vllm.sequence import IntermediateTensors
 8 | 
 9 | 
10 | class MyGemma2Embedding(Gemma2EmbeddingModel):
11 | 
12 |     def forward(
13 |         self,
14 |         input_ids: torch.Tensor,
15 |         positions: torch.Tensor,
16 |         kv_caches: List[torch.Tensor],
17 |         attn_metadata: AttentionMetadata,
18 |         intermediate_tensors: Optional[IntermediateTensors] = None,
19 |         inputs_embeds: Optional[torch.Tensor] = None,
20 |     ) -> Union[torch.Tensor, IntermediateTensors]:
21 |         hidden_states = super().forward(
22 |             input_ids,
23 |             positions,
24 |             kv_caches,
25 |             attn_metadata,
26 |             intermediate_tensors=intermediate_tensors,
27 |             inputs_embeds=inputs_embeds,
28 |         )
29 | 
30 |         if isinstance(hidden_states, IntermediateTensors):
31 |             return hidden_states
32 | 
33 |         # Return all-zero embeddings
34 |         return torch.zeros_like(hidden_states)
35 | 


--------------------------------------------------------------------------------
/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional
 2 | 
 3 | import torch
 4 | 
 5 | from vllm.inputs import INPUT_REGISTRY
 6 | from vllm.model_executor.models.llava import (LlavaForConditionalGeneration,
 7 |                                               dummy_data_for_llava,
 8 |                                               get_max_llava_image_tokens,
 9 |                                               input_processor_for_llava)
10 | from vllm.model_executor.sampling_metadata import SamplingMetadata
11 | from vllm.multimodal import MULTIMODAL_REGISTRY
12 | 
13 | 
14 | @MULTIMODAL_REGISTRY.register_image_input_mapper()
15 | @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens)
16 | @INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava)
17 | @INPUT_REGISTRY.register_input_processor(input_processor_for_llava)
18 | class MyLlava(LlavaForConditionalGeneration):
19 | 
20 |     def compute_logits(
21 |             self, hidden_states: torch.Tensor,
22 |             sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
23 |         # this dummy model always predicts the first token
24 |         logits = super().compute_logits(hidden_states, sampling_metadata)
25 |         if logits is not None:
26 |             logits.zero_()
27 |             logits[:, 0] += 1.0
28 |         return logits
29 | 


--------------------------------------------------------------------------------
/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional
 2 | 
 3 | import torch
 4 | 
 5 | from vllm.model_executor.models.opt import OPTForCausalLM
 6 | from vllm.model_executor.sampling_metadata import SamplingMetadata
 7 | 
 8 | 
 9 | class MyOPTForCausalLM(OPTForCausalLM):
10 | 
11 |     def compute_logits(
12 |             self, hidden_states: torch.Tensor,
13 |             sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
14 |         # this dummy model always predicts the first token
15 |         logits = super().compute_logits(hidden_states, sampling_metadata)
16 |         if logits is not None:
17 |             logits.zero_()
18 |             logits[:, 0] += 1.0
19 |         return logits
20 | 


--------------------------------------------------------------------------------
/tests/prefix_caching/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/prefix_caching/__init__.py


--------------------------------------------------------------------------------
/tests/prompts/example.txt:
--------------------------------------------------------------------------------
1 | vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.
2 | Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.
3 | Compare and contrast artificial intelligence with human intelligence in terms of processing information.
4 | Describe the basic components of a neural network and how it can be trained.
5 | Write a short story about a robot that dreams for the first time.
6 | Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.
7 | Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.
8 | Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.'
9 | 


--------------------------------------------------------------------------------
/tests/quantization/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/quantization/__init__.py


--------------------------------------------------------------------------------
/tests/quantization/test_experts_int8.py:
--------------------------------------------------------------------------------
 1 | # flake8: noqa
 2 | """Tests experts_int8 quantization startup and generation, 
 3 | doesn't test correctness
 4 | """
 5 | import pytest
 6 | 
 7 | from tests.quantization.utils import is_quant_method_supported
 8 | 
 9 | MODELS = ["ai21labs/Jamba-tiny-random"]
10 | 
11 | 
12 | @pytest.mark.skipif(not is_quant_method_supported("experts_int8"),
13 |                     reason="ExpertsInt8 is not supported on this GPU type.")
14 | @pytest.mark.parametrize("model", MODELS)
15 | @pytest.mark.parametrize("dtype", ["bfloat16"])
16 | @pytest.mark.parametrize("max_tokens", [10])
17 | def test_model_experts_int8_startup(
18 |     hf_runner,
19 |     vllm_runner,
20 |     example_prompts,
21 |     model: str,
22 |     dtype: str,
23 |     max_tokens: int,
24 | ) -> None:
25 | 
26 |     with vllm_runner(model, dtype=dtype,
27 |                      quantization="experts_int8") as vllm_model:
28 |         vllm_model.generate_greedy(example_prompts, max_tokens)
29 | 


--------------------------------------------------------------------------------
/tests/quantization/test_ipex_quant.py:
--------------------------------------------------------------------------------
 1 | """Test model set-up and inference for quantized HF models supported
 2 |  on the CPU backend using IPEX (including AWQ).
 3 |  
 4 |  Validating the configuration and printing results for manual checking.
 5 | 
 6 |  Run `pytest tests/quantization/test_ipex_quant.py`.
 7 | """
 8 | 
 9 | import pytest
10 | 
11 | from vllm.platforms import current_platform
12 | 
13 | MODELS = [
14 |     "casperhansen/llama-3-8b-instruct-awq",
15 | ]
16 | DTYPE = ["bfloat16"]
17 | 
18 | 
19 | @pytest.mark.skipif(not current_platform.is_cpu(),
20 |                     reason="only supports the CPU backend.")
21 | @pytest.mark.parametrize("model", MODELS)
22 | @pytest.mark.parametrize("dtype", DTYPE)
23 | def test_ipex_quant(vllm_runner, model, dtype):
24 |     with vllm_runner(model, dtype=dtype) as llm:
25 |         output = llm.generate_greedy(["The capital of France is"],
26 |                                      max_tokens=32)
27 |     assert output
28 |     print(output)
29 | 


--------------------------------------------------------------------------------
/tests/quantization/utils.py:
--------------------------------------------------------------------------------
 1 | from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 2 | from vllm.platforms import current_platform
 3 | 
 4 | 
 5 | def is_quant_method_supported(quant_method: str) -> bool:
 6 |     # Currently, all quantization methods require Nvidia or AMD GPUs
 7 |     if not (current_platform.is_cuda() or current_platform.is_rocm()):
 8 |         return False
 9 | 
10 |     capability = current_platform.get_device_capability()
11 |     assert capability is not None
12 | 
13 |     min_capability = QUANTIZATION_METHODS[quant_method].get_min_capability()
14 | 
15 |     return capability.to_int() >= min_capability
16 | 


--------------------------------------------------------------------------------
/tests/samplers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/samplers/__init__.py


--------------------------------------------------------------------------------
/tests/samplers/test_ignore_eos.py:
--------------------------------------------------------------------------------
 1 | """Make sure ignore_eos works.
 2 | 
 3 | Run `pytest tests/samplers/test_ignore_eos.py`.
 4 | """
 5 | 
 6 | import pytest
 7 | 
 8 | from vllm import SamplingParams
 9 | 
10 | # We also test with llama because it has generation_config to specify EOS
11 | # (past regression).
12 | MODELS = ["facebook/opt-125m", "meta-llama/Llama-2-7b-hf"]
13 | 
14 | 
15 | @pytest.mark.parametrize("model", MODELS)
16 | @pytest.mark.parametrize("dtype", ["half"])
17 | @pytest.mark.parametrize("max_tokens", [512])
18 | def test_ignore_eos(
19 |     vllm_runner,
20 |     example_prompts,
21 |     model: str,
22 |     dtype: str,
23 |     max_tokens: int,
24 | ) -> None:
25 |     with vllm_runner(model, dtype=dtype) as vllm_model:
26 |         sampling_params = SamplingParams(max_tokens=max_tokens,
27 |                                          ignore_eos=True)
28 | 
29 |         for prompt in example_prompts:
30 |             ignore_eos_output = vllm_model.model.generate(
31 |                 prompt, sampling_params=sampling_params)
32 |             output_length = len(ignore_eos_output[0].outputs[0].token_ids)
33 |             assert output_length == max_tokens
34 | 


--------------------------------------------------------------------------------
/tests/spec_decode/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/spec_decode/__init__.py


--------------------------------------------------------------------------------
/tests/spec_decode/e2e/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/spec_decode/e2e/__init__.py


--------------------------------------------------------------------------------
/tests/tensorizer_loader/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/tensorizer_loader/__init__.py


--------------------------------------------------------------------------------
/tests/test_embedded_commit.py:
--------------------------------------------------------------------------------
1 | import vllm
2 | 
3 | 
4 | def test_embedded_commit_defined():
5 |     assert hasattr(vllm, "__version__")
6 |     assert hasattr(vllm, "__version_tuple__")
7 |     assert vllm.__version__ != "dev"
8 |     assert vllm.__version_tuple__ != (0, 0, "dev")
9 | 


--------------------------------------------------------------------------------
/tests/test_sampling_params.py:
--------------------------------------------------------------------------------
 1 | """Tests for the SamplingParams class.
 2 | """
 3 | from vllm import SamplingParams
 4 | 
 5 | 
 6 | def test_max_tokens_none():
 7 |     """max_tokens=None should be allowed"""
 8 |     SamplingParams(temperature=0.01, top_p=0.1, max_tokens=None)
 9 | 
10 | 
11 | if __name__ == "__main__":
12 |     import pytest
13 |     pytest.main([__file__])
14 | 


--------------------------------------------------------------------------------
/tests/test_scalartype.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import torch
 3 | 
 4 | from vllm.scalar_type import scalar_types
 5 | 
 6 | 
 7 | @pytest.mark.parametrize("type_tuple", (
 8 |     (-8, 7, scalar_types.int4),
 9 |     (0, 15, scalar_types.uint4),
10 |     (-8, 7, scalar_types.uint4b8),
11 |     (-128, 127, scalar_types.uint8b128),
12 |     (-28., 28., scalar_types.float6_e3m2f),
13 |     (torch.int8, scalar_types.int8),
14 |     (torch.uint8, scalar_types.uint8),
15 |     (torch.float8_e5m2, scalar_types.float8_e5m2),
16 |     (torch.float8_e4m3fn, scalar_types.float8_e4m3fn),
17 |     (torch.bfloat16, scalar_types.float16_e8m7),
18 |     (torch.float16, scalar_types.float16_e5m10),
19 | ),
20 |                          ids=lambda x: str(x))
21 | def test_scalar_type_min_max(type_tuple):
22 |     print(type_tuple)
23 |     if len(type_tuple) == 3:
24 |         min, max, t = type_tuple
25 |     else:
26 |         torch_type, t = type_tuple
27 |         if torch_type.is_floating_point:
28 |             min = torch.finfo(torch_type).min
29 |             max = torch.finfo(torch_type).max
30 |         else:
31 |             min = torch.iinfo(torch_type).min
32 |             max = torch.iinfo(torch_type).max
33 | 
34 |     print(t, min, max, t.min(), t.max())
35 |     assert min == t.min()
36 |     assert max == t.max()
37 | 


--------------------------------------------------------------------------------
/tests/tokenization/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/tokenization/__init__.py


--------------------------------------------------------------------------------
/tests/tokenization/test_cached_tokenizer.py:
--------------------------------------------------------------------------------
 1 | from copy import deepcopy
 2 | 
 3 | from transformers import AutoTokenizer
 4 | 
 5 | from vllm.transformers_utils.tokenizer import get_cached_tokenizer
 6 | 
 7 | 
 8 | def test_cached_tokenizer():
 9 |     reference_tokenizer = AutoTokenizer.from_pretrained("gpt2")
10 |     reference_tokenizer.add_special_tokens({"cls_token": "<CLS>"})
11 |     reference_tokenizer.add_special_tokens(
12 |         {"additional_special_tokens": ["<SEP>"]})
13 |     cached_tokenizer = get_cached_tokenizer(deepcopy(reference_tokenizer))
14 | 
15 |     assert reference_tokenizer.encode("prompt") == cached_tokenizer.encode(
16 |         "prompt")
17 |     assert set(reference_tokenizer.all_special_ids) == set(
18 |         cached_tokenizer.all_special_ids)
19 |     assert set(reference_tokenizer.all_special_tokens) == set(
20 |         cached_tokenizer.all_special_tokens)
21 |     assert set(reference_tokenizer.all_special_tokens_extended) == set(
22 |         cached_tokenizer.all_special_tokens_extended)
23 | 


--------------------------------------------------------------------------------
/tests/tokenization/test_get_eos.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This test file includes some cases where it is inappropriate to
 3 | only get the `eos_token_id` from the tokenizer as defined by
 4 | :meth:`vllm.LLMEngine._get_eos_token_id`.
 5 | """
 6 | from vllm.transformers_utils.config import try_get_generation_config
 7 | from vllm.transformers_utils.tokenizer import get_tokenizer
 8 | 
 9 | 
10 | def test_get_llama3_eos_token():
11 |     model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
12 | 
13 |     tokenizer = get_tokenizer(model_name)
14 |     assert tokenizer.eos_token_id == 128009
15 | 
16 |     generation_config = try_get_generation_config(model_name,
17 |                                                   trust_remote_code=False)
18 |     assert generation_config is not None
19 |     assert generation_config.eos_token_id == [128001, 128009]
20 | 
21 | 
22 | def test_get_blip2_eos_token():
23 |     model_name = "Salesforce/blip2-opt-2.7b"
24 | 
25 |     tokenizer = get_tokenizer(model_name)
26 |     assert tokenizer.eos_token_id == 2
27 | 
28 |     generation_config = try_get_generation_config(model_name,
29 |                                                   trust_remote_code=False)
30 |     assert generation_config is not None
31 |     assert generation_config.eos_token_id == 50118
32 | 


--------------------------------------------------------------------------------
/tests/tokenization/test_tokenizer.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from transformers import PreTrainedTokenizerBase
 3 | 
 4 | from vllm.transformers_utils.tokenizer import get_tokenizer
 5 | 
 6 | TOKENIZER_NAMES = [
 7 |     "facebook/opt-125m",
 8 |     "gpt2",
 9 | ]
10 | 
11 | 
12 | @pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES)
13 | def test_tokenizer_revision(tokenizer_name: str):
14 |     # Assume that "main" branch always exists
15 |     tokenizer = get_tokenizer(tokenizer_name, revision="main")
16 |     assert isinstance(tokenizer, PreTrainedTokenizerBase)
17 | 
18 |     # Assume that "never" branch always does not exist
19 |     with pytest.raises(OSError, match='not a valid git identifier'):
20 |         get_tokenizer(tokenizer_name, revision="never")
21 | 


--------------------------------------------------------------------------------
/tests/tool_use/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/tool_use/__init__.py


--------------------------------------------------------------------------------
/tests/tool_use/conftest.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import pytest_asyncio
 3 | from huggingface_hub import snapshot_download
 4 | 
 5 | from tests.utils import RemoteOpenAIServer
 6 | 
 7 | from .utils import ARGS, CONFIGS, ServerConfig
 8 | 
 9 | 
10 | # for each server config, download the model and return the config
11 | @pytest.fixture(scope="session", params=CONFIGS.keys())
12 | def server_config(request):
13 |     config = CONFIGS[request.param]
14 |     # download model and tokenizer using transformers
15 |     snapshot_download(config["model"])
16 |     yield CONFIGS[request.param]
17 | 
18 | 
19 | # run this for each server config
20 | @pytest.fixture(scope="session")
21 | def server(request, server_config: ServerConfig):
22 |     model = server_config["model"]
23 |     args_for_model = server_config["arguments"]
24 |     with RemoteOpenAIServer(model, ARGS + args_for_model,
25 |                             max_wait_seconds=480) as server:
26 |         yield server
27 | 
28 | 
29 | @pytest_asyncio.fixture
30 | async def client(server: RemoteOpenAIServer):
31 |     async with server.get_async_client() as async_client:
32 |         yield async_client
33 | 


--------------------------------------------------------------------------------
/tests/tpu/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/tpu/__init__.py


--------------------------------------------------------------------------------
/tests/tpu/test_custom_dispatcher.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from vllm.compilation.levels import CompilationLevel
 4 | 
 5 | from ..utils import compare_two_settings
 6 | 
 7 | # --enforce-eager on TPU causes graph compilation
 8 | # this times out default Health Check in the MQLLMEngine,
 9 | # so we set the timeout here to 30s
10 | os.environ["VLLM_RPC_TIMEOUT"] = "30000"
11 | 
12 | 
13 | def test_custom_dispatcher():
14 |     compare_two_settings(
15 |         "google/gemma-2b",
16 |         arg1=["--enforce-eager"],
17 |         arg2=["--enforce-eager"],
18 |         env1={"VLLM_TORCH_COMPILE_LEVEL": str(CompilationLevel.DYNAMO_ONCE)},
19 |         env2={"VLLM_TORCH_COMPILE_LEVEL": str(CompilationLevel.DYNAMO_AS_IS)})
20 | 


--------------------------------------------------------------------------------
/tests/tracing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/tracing/__init__.py


--------------------------------------------------------------------------------
/tests/weight_loading/models-large.txt:
--------------------------------------------------------------------------------
1 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main
2 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main
3 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W8A16-quantized, main
4 | gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, main
5 | awq_marlin, casperhansen/deepseek-coder-v2-instruct-awq, main


--------------------------------------------------------------------------------
/tests/weight_loading/run_model_weight_loading_test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | SUCCESS=0
 3 | 
 4 | while getopts "c:" OPT; do
 5 |   case ${OPT} in
 6 |     c ) 
 7 |         CONFIG="$OPTARG"
 8 |         ;;
 9 |     \? )
10 |         usage
11 |         exit 1
12 |         ;;
13 |   esac
14 | done
15 | 
16 | 
17 | IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG
18 | 
19 | for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
20 | do
21 |     LOCAL_SUCCESS=0
22 |     IFS=', ' read -r -a array <<< "$MODEL_CONFIG"
23 |     
24 |     echo "=== RUNNING MODEL: $MODEL_CONFIG ==="
25 | 
26 |     export QUANTIZATION=${array[0]}
27 |     export MODEL_NAME=${array[1]}
28 |     export REVISION=${array[2]}
29 |     pytest -s weight_loading/test_weight_loading.py || LOCAL_SUCCESS=$?
30 | 
31 |     if [[ $LOCAL_SUCCESS == 0 ]]; then
32 |         echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
33 |     else
34 |         echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
35 |     fi
36 | 
37 |     SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
38 | 
39 | done
40 | 
41 | if [ "${SUCCESS}" -eq "0" ]; then
42 |     exit 0
43 | else
44 |     exit 1
45 | fi
46 | 


--------------------------------------------------------------------------------
/tests/weight_loading/test_weight_loading.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import torch
 4 | 
 5 | MAX_MODEL_LEN = 1024
 6 | MODEL_NAME = os.environ.get("MODEL_NAME",
 7 |                             "robertgshaw2/zephyr-7b-beta-channelwise-gptq")
 8 | REVISION = os.environ.get("REVISION", "main")
 9 | QUANTIZATION = os.environ.get("QUANTIZATION", "gptq_marlin")
10 | 
11 | 
12 | def test_weight_loading(vllm_runner):
13 |     """
14 |     Test parameter weight loading with tp>1.
15 |     """
16 |     with vllm_runner(model_name=MODEL_NAME,
17 |                      revision=REVISION,
18 |                      dtype=torch.half if QUANTIZATION == "gptq" else "auto",
19 |                      quantization=QUANTIZATION,
20 |                      max_model_len=MAX_MODEL_LEN,
21 |                      tensor_parallel_size=2) as model:
22 | 
23 |         output = model.generate_greedy("Hello world!", max_tokens=20)
24 |         print(output)
25 |         assert output
26 | 


--------------------------------------------------------------------------------
/tests/worker/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/tests/worker/__init__.py


--------------------------------------------------------------------------------
/tools/actionlint.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if command -v actionlint &> /dev/null; then
 4 |     actionlint "$@"
 5 |     exit 0
 6 | elif [ -x ./actionlint ]; then
 7 |     ./actionlint "$@"
 8 |     exit 0
 9 | fi
10 | 
11 | # download a binary to the current directory - v1.7.3
12 | bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash)
13 | ./actionlint "$@"
14 | 


--------------------------------------------------------------------------------
/tools/mypy.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | CI=${1:-0}
 4 | 
 5 | run_mypy() {
 6 |     echo "Running mypy on $1"
 7 |     if [ $CI -eq 1 ] && [ -z "$1" ]; then
 8 |         mypy "$@"
 9 |         return
10 |     fi
11 |     mypy --follow-imports skip "$@"
12 | }
13 | 
14 | run_mypy # Note that this is less strict than CI
15 | run_mypy tests
16 | run_mypy vllm/assets
17 | run_mypy vllm/attention
18 | #run_mypy vllm/compilation
19 | #run_mypy vllm/core
20 | run_mypy vllm/distributed
21 | run_mypy vllm/engine
22 | run_mypy vllm/entrypoints
23 | run_mypy vllm/executor
24 | #run_mypy vllm/inputs
25 | run_mypy vllm/logging
26 | run_mypy vllm/lora
27 | run_mypy vllm/model_executor
28 | run_mypy vllm/multimodal
29 | run_mypy vllm/platforms
30 | run_mypy vllm/plugins
31 | run_mypy vllm/prompt_adapter
32 | run_mypy vllm/spec_decode
33 | run_mypy vllm/transformers_utils
34 | run_mypy vllm/usage
35 | #run_mypy vllm/vllm_flash_attn
36 | run_mypy vllm/worker
37 | 


--------------------------------------------------------------------------------
/use_existing_torch.py:
--------------------------------------------------------------------------------
 1 | import glob
 2 | 
 3 | requires_files = glob.glob('requirements*.txt')
 4 | requires_files += ["pyproject.toml"]
 5 | for file in requires_files:
 6 |     print(f">>> cleaning {file}")
 7 |     with open(file, 'r') as f:
 8 |         lines = f.readlines()
 9 |     if "torch" in "".join(lines).lower():
10 |         print("removed:")
11 |         with open(file, 'w') as f:
12 |             for line in lines:
13 |                 if 'torch' not in line.lower():
14 |                     f.write(line)
15 |                 else:
16 |                     print(line.strip())
17 |     print(f"<<< done cleaning {file}")
18 |     print()
19 | 


--------------------------------------------------------------------------------
/vllm/__init__.py:
--------------------------------------------------------------------------------
 1 | """vLLM: a high-throughput and memory-efficient inference engine for LLMs"""
 2 | 
 3 | from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 4 | from vllm.engine.async_llm_engine import AsyncLLMEngine
 5 | from vllm.engine.llm_engine import LLMEngine
 6 | from vllm.entrypoints.llm import LLM
 7 | from vllm.executor.ray_utils import initialize_ray_cluster
 8 | from vllm.inputs import PromptType, TextPrompt, TokensPrompt
 9 | from vllm.model_executor.models import ModelRegistry
10 | from vllm.outputs import (CompletionOutput, EmbeddingOutput,
11 |                           EmbeddingRequestOutput, RequestOutput)
12 | from vllm.pooling_params import PoolingParams
13 | from vllm.sampling_params import SamplingParams
14 | 
15 | from .version import __version__, __version_tuple__
16 | 
17 | __all__ = [
18 |     "__version__",
19 |     "__version_tuple__",
20 |     "LLM",
21 |     "ModelRegistry",
22 |     "PromptType",
23 |     "TextPrompt",
24 |     "TokensPrompt",
25 |     "SamplingParams",
26 |     "RequestOutput",
27 |     "CompletionOutput",
28 |     "EmbeddingOutput",
29 |     "EmbeddingRequestOutput",
30 |     "LLMEngine",
31 |     "EngineArgs",
32 |     "AsyncLLMEngine",
33 |     "AsyncEngineArgs",
34 |     "initialize_ray_cluster",
35 |     "PoolingParams",
36 | ]
37 | 


--------------------------------------------------------------------------------
/vllm/adapter_commons/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/adapter_commons/__init__.py


--------------------------------------------------------------------------------
/vllm/adapter_commons/layers.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | from typing import Tuple
 3 | 
 4 | 
 5 | @dataclass
 6 | class AdapterMapping:
 7 |     # Per every token in input_ids:
 8 |     index_mapping: Tuple[int, ...]
 9 |     # Per sampled token:
10 |     prompt_mapping: Tuple[int, ...]
11 | 
12 |     def __post_init__(self):
13 |         self.index_mapping = tuple(self.index_mapping)
14 |         self.prompt_mapping = tuple(self.prompt_mapping)


--------------------------------------------------------------------------------
/vllm/adapter_commons/request.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC, abstractmethod
 2 | 
 3 | 
 4 | class AdapterRequest(ABC):
 5 |     """
 6 |     Base class for adapter requests.
 7 |     """
 8 | 
 9 |     @property
10 |     @abstractmethod
11 |     def adapter_id(self) -> int:
12 |         raise NotImplementedError
13 | 
14 |     def __post_init__(self) -> None:
15 |         if self.adapter_id < 1:
16 |             raise ValueError(f"id must be > 0, got {self.adapter_id}")
17 | 
18 |     def __eq__(self, value: object) -> bool:
19 |         return isinstance(
20 |             value, self.__class__) and self.adapter_id == value.adapter_id
21 | 
22 |     def __hash__(self) -> int:
23 |         return hash(self.adapter_id)
24 | 


--------------------------------------------------------------------------------
/vllm/adapter_commons/worker_manager.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC, abstractmethod
 2 | from typing import Any, Optional, Set
 3 | 
 4 | import torch
 5 | 
 6 | 
 7 | class AbstractWorkerManager(ABC):
 8 | 
 9 |     def __init__(self, device: torch.device):
10 |         self.device = device
11 | 
12 |     @property
13 |     @abstractmethod
14 |     def is_enabled(self) -> bool:
15 |         raise NotImplementedError
16 | 
17 |     @abstractmethod
18 |     def set_active_adapters(self, requests: Set[Any],
19 |                             mapping: Optional[Any]) -> None:
20 |         raise NotImplementedError
21 | 
22 |     @abstractmethod
23 |     def add_adapter(self, adapter_request: Any) -> bool:
24 |         raise NotImplementedError
25 | 
26 |     @abstractmethod
27 |     def remove_adapter(self, adapter_id: int) -> bool:
28 |         raise NotImplementedError
29 | 
30 |     @abstractmethod
31 |     def remove_all_adapters(self) -> None:
32 |         raise NotImplementedError
33 | 
34 |     @abstractmethod
35 |     def list_adapters(self) -> Set[int]:
36 |         raise NotImplementedError
37 | 


--------------------------------------------------------------------------------
/vllm/assets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/assets/__init__.py


--------------------------------------------------------------------------------
/vllm/assets/audio.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | from typing import Literal, Tuple
 3 | from urllib.parse import urljoin
 4 | 
 5 | import librosa
 6 | import numpy as np
 7 | 
 8 | from vllm.assets.base import get_vllm_public_assets, vLLM_S3_BUCKET_URL
 9 | 
10 | ASSET_DIR = "multimodal_asset"
11 | 
12 | 
13 | @dataclass(frozen=True)
14 | class AudioAsset:
15 |     name: Literal["winning_call", "mary_had_lamb"]
16 | 
17 |     @property
18 |     def audio_and_sample_rate(self) -> Tuple[np.ndarray, int]:
19 | 
20 |         audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg",
21 |                                             s3_prefix=ASSET_DIR)
22 |         y, sr = librosa.load(audio_path, sr=None)
23 |         assert isinstance(sr, int)
24 |         return y, sr
25 | 
26 |     @property
27 |     def url(self) -> str:
28 |         return urljoin(vLLM_S3_BUCKET_URL, f"{ASSET_DIR}/{self.name}.ogg")
29 | 


--------------------------------------------------------------------------------
/vllm/assets/base.py:
--------------------------------------------------------------------------------
 1 | from functools import lru_cache
 2 | from pathlib import Path
 3 | from typing import Optional
 4 | 
 5 | import vllm.envs as envs
 6 | from vllm.connections import global_http_connection
 7 | from vllm.envs import VLLM_IMAGE_FETCH_TIMEOUT
 8 | 
 9 | vLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com"
10 | 
11 | 
12 | def get_cache_dir() -> Path:
13 |     """Get the path to the cache for storing downloaded assets."""
14 |     path = Path(envs.VLLM_ASSETS_CACHE)
15 |     path.mkdir(parents=True, exist_ok=True)
16 | 
17 |     return path
18 | 
19 | 
20 | @lru_cache
21 | def get_vllm_public_assets(filename: str,
22 |                            s3_prefix: Optional[str] = None) -> Path:
23 |     """
24 |     Download an asset file from ``s3://vllm-public-assets``
25 |     and return the path to the downloaded file.
26 |     """
27 |     asset_directory = get_cache_dir() / "vllm_public_assets"
28 |     asset_directory.mkdir(parents=True, exist_ok=True)
29 | 
30 |     asset_path = asset_directory / filename
31 |     if not asset_path.exists():
32 |         if s3_prefix is not None:
33 |             filename = s3_prefix + "/" + filename
34 |         global_http_connection.download_file(
35 |             f"{vLLM_S3_BUCKET_URL}/{filename}",
36 |             asset_path,
37 |             timeout=VLLM_IMAGE_FETCH_TIMEOUT)
38 | 
39 |     return asset_path
40 | 


--------------------------------------------------------------------------------
/vllm/assets/image.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | from typing import Literal
 3 | 
 4 | import torch
 5 | from PIL import Image
 6 | 
 7 | from vllm.assets.base import get_vllm_public_assets
 8 | 
 9 | VLM_IMAGES_DIR = "vision_model_images"
10 | 
11 | 
12 | @dataclass(frozen=True)
13 | class ImageAsset:
14 |     name: Literal["stop_sign", "cherry_blossom"]
15 | 
16 |     @property
17 |     def pil_image(self) -> Image.Image:
18 | 
19 |         image_path = get_vllm_public_assets(filename=f"{self.name}.jpg",
20 |                                             s3_prefix=VLM_IMAGES_DIR)
21 |         return Image.open(image_path)
22 | 
23 |     @property
24 |     def image_embeds(self) -> torch.Tensor:
25 |         """
26 |         Image embeddings, only used for testing purposes with llava 1.5.
27 |         """
28 |         image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
29 |                                             s3_prefix=VLM_IMAGES_DIR)
30 |         return torch.load(image_path)
31 | 


--------------------------------------------------------------------------------
/vllm/attention/__init__.py:
--------------------------------------------------------------------------------
 1 | from vllm.attention.backends.abstract import (AttentionBackend,
 2 |                                               AttentionMetadata,
 3 |                                               AttentionMetadataBuilder,
 4 |                                               AttentionState, AttentionType)
 5 | from vllm.attention.layer import Attention
 6 | from vllm.attention.selector import get_attn_backend
 7 | 
 8 | __all__ = [
 9 |     "Attention",
10 |     "AttentionBackend",
11 |     "AttentionMetadata",
12 |     "AttentionType",
13 |     "AttentionMetadataBuilder",
14 |     "Attention",
15 |     "AttentionState",
16 |     "get_attn_backend",
17 | ]
18 | 


--------------------------------------------------------------------------------
/vllm/attention/backends/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/attention/backends/__init__.py


--------------------------------------------------------------------------------
/vllm/attention/ops/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/attention/ops/__init__.py


--------------------------------------------------------------------------------
/vllm/attention/ops/blocksparse_attention/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/attention/ops/blocksparse_attention/__init__.py


--------------------------------------------------------------------------------
/vllm/compilation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/compilation/__init__.py


--------------------------------------------------------------------------------
/vllm/compilation/compile_context.py:
--------------------------------------------------------------------------------
 1 | from contextlib import contextmanager
 2 | from typing import Any
 3 | 
 4 | _compile_context: Any = None
 5 | 
 6 | 
 7 | def get_compile_context() -> Any:
 8 |     """Get the current compile context."""
 9 |     return _compile_context
10 | 
11 | 
12 | @contextmanager
13 | def set_compile_context(context: Any):
14 |     """A context manager that stores the current compile context,
15 |     usually it is a list of sizes to specialize.
16 |     """
17 |     global _compile_context
18 |     prev_context = _compile_context
19 |     _compile_context = context
20 |     try:
21 |         yield
22 |     finally:
23 |         _compile_context = prev_context
24 | 


--------------------------------------------------------------------------------
/vllm/compilation/levels.py:
--------------------------------------------------------------------------------
 1 | # constants for the levels of the compilation process
 2 | 
 3 | 
 4 | class CompilationLevel:
 5 |     NO_COMPILATION = 0
 6 |     DYNAMO_AS_IS = 1
 7 |     DYNAMO_ONCE = 2
 8 |     INDUCTOR = 3
 9 |     INDUCTOR_MAX_AUTOTUNE = 4
10 | 


--------------------------------------------------------------------------------
/vllm/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/core/__init__.py


--------------------------------------------------------------------------------
/vllm/core/block/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/core/block/__init__.py


--------------------------------------------------------------------------------
/vllm/distributed/__init__.py:
--------------------------------------------------------------------------------
1 | from .communication_op import *
2 | from .parallel_state import *
3 | from .utils import *
4 | 


--------------------------------------------------------------------------------
/vllm/distributed/communication_op.py:
--------------------------------------------------------------------------------
 1 | from typing import Any, Dict, Optional, Union
 2 | 
 3 | import torch
 4 | import torch.distributed
 5 | 
 6 | from .parallel_state import get_tp_group
 7 | 
 8 | 
 9 | def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor:
10 |     """All-reduce the input tensor across model parallel group."""
11 |     return get_tp_group().all_reduce(input_)
12 | 
13 | 
14 | def tensor_model_parallel_all_gather(input_: torch.Tensor,
15 |                                      dim: int = -1) -> torch.Tensor:
16 |     """All-gather the input tensor across model parallel group."""
17 |     return get_tp_group().all_gather(input_, dim)
18 | 
19 | 
20 | def tensor_model_parallel_gather(input_: torch.Tensor,
21 |                                  dst: int = 0,
22 |                                  dim: int = -1) -> Optional[torch.Tensor]:
23 |     """Gather the input tensor across model parallel group."""
24 |     return get_tp_group().gather(input_, dst, dim)
25 | 
26 | 
27 | def broadcast_tensor_dict(tensor_dict: Optional[Dict[Any, Union[torch.Tensor,
28 |                                                                 Any]]] = None,
29 |                           src: int = 0):
30 |     if not torch.distributed.is_initialized():
31 |         return tensor_dict
32 |     return get_tp_group().broadcast_tensor_dict(tensor_dict, src)
33 | 


--------------------------------------------------------------------------------
/vllm/distributed/device_communicators/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/distributed/device_communicators/__init__.py


--------------------------------------------------------------------------------
/vllm/engine/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/engine/__init__.py


--------------------------------------------------------------------------------
/vllm/engine/output_processor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/engine/output_processor/__init__.py


--------------------------------------------------------------------------------
/vllm/engine/output_processor/util.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | from typing import Sequence as GenericSequence
 3 | from typing import Union
 4 | 
 5 | from vllm.model_executor.layers.sampler import SamplerOutput
 6 | from vllm.sequence import PoolerOutput, SequenceGroupOutput
 7 | 
 8 | 
 9 | def create_output_by_sequence_group(
10 |         outputs: GenericSequence[Union[SamplerOutput, PoolerOutput]],
11 |         num_seq_groups: int) -> List[List[SequenceGroupOutput]]:
12 |     """Helper method which transforms a 2d list organized by
13 |     [step][sequence group] into [sequence group][step].
14 |     """
15 |     output_by_sequence_group: List[List[SequenceGroupOutput]] = [
16 |         [] for _ in range(num_seq_groups)
17 |     ]
18 |     for step in outputs:
19 |         for i, sequence_group_output in enumerate(step):
20 |             output_by_sequence_group[i].append(sequence_group_output)
21 | 
22 |     return output_by_sequence_group
23 | 


--------------------------------------------------------------------------------
/vllm/entrypoints/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/entrypoints/__init__.py


--------------------------------------------------------------------------------
/vllm/entrypoints/openai/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/entrypoints/openai/__init__.py


--------------------------------------------------------------------------------
/vllm/entrypoints/openai/tool_parsers/__init__.py:
--------------------------------------------------------------------------------
 1 | from .abstract_tool_parser import ToolParser, ToolParserManager
 2 | from .hermes_tool_parser import Hermes2ProToolParser
 3 | from .internlm2_tool_parser import Internlm2ToolParser
 4 | from .llama_tool_parser import Llama3JsonToolParser
 5 | from .mistral_tool_parser import MistralToolParser
 6 | 
 7 | __all__ = [
 8 |     "ToolParser", "ToolParserManager", "Hermes2ProToolParser",
 9 |     "MistralToolParser", "Internlm2ToolParser", "Llama3JsonToolParser"
10 | ]
11 | 


--------------------------------------------------------------------------------
/vllm/executor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/executor/__init__.py


--------------------------------------------------------------------------------
/vllm/executor/msgspec_utils.py:
--------------------------------------------------------------------------------
 1 | from array import array
 2 | from typing import Any, Type
 3 | 
 4 | from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE
 5 | 
 6 | 
 7 | def encode_hook(obj: Any) -> Any:
 8 |     """Custom msgspec enc hook that supports array types.
 9 | 
10 |     See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder
11 |     """
12 |     if isinstance(obj, array):
13 |         assert obj.typecode == VLLM_TOKEN_ID_ARRAY_TYPE, (
14 |             f"vLLM array type should use '{VLLM_TOKEN_ID_ARRAY_TYPE}' type. "
15 |             f"Given array has a type code of {obj.typecode}.")
16 |         return obj.tobytes()
17 | 
18 | 
19 | def decode_hook(type: Type, obj: Any) -> Any:
20 |     """Custom msgspec dec hook that supports array types.
21 | 
22 |     See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder
23 |     """
24 |     if type is array:
25 |         deserialized = array(VLLM_TOKEN_ID_ARRAY_TYPE)
26 |         deserialized.frombytes(obj)
27 |         return deserialized
28 | 


--------------------------------------------------------------------------------
/vllm/executor/multiproc_xpu_executor.py:
--------------------------------------------------------------------------------
 1 | import vllm.envs as envs
 2 | from vllm.executor.multiproc_gpu_executor import (
 3 |     MultiprocessingGPUExecutor, MultiprocessingGPUExecutorAsync)
 4 | from vllm.executor.xpu_executor import XPUExecutor
 5 | from vllm.logger import init_logger
 6 | from vllm.utils import make_async
 7 | 
 8 | logger = init_logger(__name__)
 9 | 
10 | 
11 | class MultiprocessingXPUExecutor(MultiprocessingGPUExecutor, XPUExecutor):
12 |     """Python multiprocessing-based multi-XPU executor"""
13 | 
14 |     def _check_executor_parameters(self):
15 |         mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD
16 |         if mp_method != "spawn":
17 |             raise RuntimeError(
18 |                 "XPU multiprocess executor only support spawn as mp method")
19 | 
20 | 
21 | class MultiprocessingXPUExecutorAsync(MultiprocessingXPUExecutor,
22 |                                       MultiprocessingGPUExecutorAsync):
23 | 
24 |     def __init__(self, *args, **kwargs):
25 |         super().__init__(*args, **kwargs)
26 |         self.driver_exec_model = make_async(self.driver_worker.execute_model)
27 | 


--------------------------------------------------------------------------------
/vllm/forward_context.py:
--------------------------------------------------------------------------------
 1 | from contextlib import contextmanager
 2 | from typing import Any
 3 | 
 4 | _forward_context: Any = None
 5 | 
 6 | 
 7 | def get_forward_context() -> Any:
 8 |     """Get the current forward context."""
 9 |     return _forward_context
10 | 
11 | 
12 | @contextmanager
13 | def set_forward_context(context: Any):
14 |     """A context manager that stores the current forward context,
15 |     can be attention metadata, etc."""
16 |     global _forward_context
17 |     prev_context = _forward_context
18 |     _forward_context = context
19 |     try:
20 |         yield
21 |     finally:
22 |         _forward_context = prev_context
23 | 


--------------------------------------------------------------------------------
/vllm/logging/__init__.py:
--------------------------------------------------------------------------------
1 | from vllm.logging.formatter import NewLineFormatter
2 | 
3 | __all__ = [
4 |     "NewLineFormatter",
5 | ]
6 | 


--------------------------------------------------------------------------------
/vllm/logging/formatter.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | 
 4 | class NewLineFormatter(logging.Formatter):
 5 |     """Adds logging prefix to newlines to align multi-line messages."""
 6 | 
 7 |     def __init__(self, fmt, datefmt=None, style="%"):
 8 |         logging.Formatter.__init__(self, fmt, datefmt, style)
 9 | 
10 |     def format(self, record):
11 |         msg = logging.Formatter.format(self, record)
12 |         if record.message != "":
13 |             parts = msg.split(record.message)
14 |             msg = msg.replace("\n", "\r\n" + parts[0])
15 |         return msg
16 | 


--------------------------------------------------------------------------------
/vllm/lora/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/lora/__init__.py


--------------------------------------------------------------------------------
/vllm/lora/ops/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/lora/ops/__init__.py


--------------------------------------------------------------------------------
/vllm/model_executor/__init__.py:
--------------------------------------------------------------------------------
 1 | from vllm.model_executor.parameter import (BasevLLMParameter,
 2 |                                            PackedvLLMParameter)
 3 | from vllm.model_executor.sampling_metadata import (SamplingMetadata,
 4 |                                                    SamplingMetadataCache)
 5 | from vllm.model_executor.utils import set_random_seed
 6 | 
 7 | __all__ = [
 8 |     "SamplingMetadata",
 9 |     "SamplingMetadataCache",
10 |     "set_random_seed",
11 |     "BasevLLMParameter",
12 |     "PackedvLLMParameter",
13 | ]
14 | 


--------------------------------------------------------------------------------
/vllm/model_executor/layers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/model_executor/layers/__init__.py


--------------------------------------------------------------------------------
/vllm/model_executor/layers/fused_moe/__init__.py:
--------------------------------------------------------------------------------
 1 | from vllm.model_executor.layers.fused_moe.layer import (
 2 |     FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
 3 | from vllm.triton_utils import HAS_TRITON
 4 | 
 5 | __all__ = [
 6 |     "FusedMoE",
 7 |     "FusedMoEMethodBase",
 8 |     "FusedMoeWeightScaleSupported",
 9 | ]
10 | 
11 | if HAS_TRITON:
12 |     from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
13 |         fused_marlin_moe, single_marlin_moe)
14 |     from vllm.model_executor.layers.fused_moe.fused_moe import (
15 |         fused_experts, fused_moe, fused_topk, get_config_file_name,
16 |         grouped_topk)
17 | 
18 |     __all__ += [
19 |         "fused_marlin_moe",
20 |         "single_marlin_moe",
21 |         "fused_moe",
22 |         "fused_topk",
23 |         "fused_experts",
24 |         "get_config_file_name",
25 |         "grouped_topk",
26 |     ]
27 | 


--------------------------------------------------------------------------------
/vllm/model_executor/layers/fused_moe/configs/README:
--------------------------------------------------------------------------------
 1 | This directory contains tuned configurations for different settings of the fused_moe kernel.
 2 | For different settings of
 3 | - E (number of experts)
 4 | - N (intermediate size)
 5 | - device_name (torch.cuda.get_device_name())
 6 | the JSON file contains a mapping from M (batch size) to the chosen configuration.
 7 | 
 8 | The example configurations provided are for the Mixtral model for TP2 on H100
 9 | and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have
10 | N = 7168 and for TP4 we have N = 3584.
11 | 


--------------------------------------------------------------------------------
/vllm/model_executor/layers/mamba/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/model_executor/layers/mamba/__init__.py


--------------------------------------------------------------------------------
/vllm/model_executor/layers/mamba/ops/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/model_executor/layers/mamba/ops/__init__.py


--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py


--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py:
--------------------------------------------------------------------------------
 1 | from .compressed_tensors_scheme import CompressedTensorsScheme
 2 | from .compressed_tensors_w4a16_24 import (W4A16SPARSE24_SUPPORTED_BITS,
 3 |                                           CompressedTensorsW4A16Sparse24)
 4 | from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8
 5 | from .compressed_tensors_w8a8_int8 import CompressedTensorsW8A8Int8
 6 | from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8
 7 | from .compressed_tensors_wNa16 import (WNA16_SUPPORTED_BITS,
 8 |                                        CompressedTensorsWNA16)
 9 | 
10 | __all__ = [
11 |     "CompressedTensorsScheme",
12 |     "CompressedTensorsWNA16",
13 |     "CompressedTensorsW8A16Fp8",
14 |     "CompressedTensorsW4A16Sparse24",
15 |     "CompressedTensorsW8A8Int8",
16 |     "CompressedTensorsW8A8Fp8",
17 |     "WNA16_SUPPORTED_BITS",
18 |     "W4A16SPARSE24_SUPPORTED_BITS",
19 | ]
20 | 


--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .layer_utils import replace_parameter, update_tensor_inplace
2 | 
3 | __all__ = ['update_tensor_inplace', 'replace_parameter']
4 | 


--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/machete_utils.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Optional, Tuple
 2 | 
 3 | import torch
 4 | 
 5 | from vllm.scalar_type import ScalarType, scalar_types
 6 | 
 7 | MACHETE_SUPPORTED_GROUP_SIZES = [-1, 128]
 8 | MACHETE_PREPACKED_BLOCK_SHAPE = [64, 128]
 9 | 
10 | 
11 | def query_machete_supported_quant_types(zero_points: bool) -> List[ScalarType]:
12 |     if zero_points:
13 |         return [scalar_types.uint4, scalar_types.uint8]
14 |     else:
15 |         return [scalar_types.uint4b8, scalar_types.uint8b128]
16 | 
17 | 
18 | def query_machete_supported_act_types(zero_points: bool) -> List[ScalarType]:
19 |     return [torch.float16, torch.bfloat16]
20 | 
21 | 
22 | def check_machete_supports_shape(in_features: int, out_featrues: int) \
23 |     -> Tuple[bool, Optional[str]]:
24 |     if in_features % MACHETE_PREPACKED_BLOCK_SHAPE[0] != 0:
25 |         return False, "Input features size must be divisible by "\
26 |             f"{MACHETE_PREPACKED_BLOCK_SHAPE[0]}"
27 |     if out_featrues % MACHETE_PREPACKED_BLOCK_SHAPE[1] != 0:
28 |         return False, "Output features size must be divisible by "\
29 |             f"{MACHETE_PREPACKED_BLOCK_SHAPE[1]}"
30 |     return True, None
31 | 


--------------------------------------------------------------------------------
/vllm/model_executor/model_loader/__init__.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional
 2 | 
 3 | from torch import nn
 4 | 
 5 | from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
 6 |                          ModelConfig, ParallelConfig, SchedulerConfig)
 7 | from vllm.model_executor.model_loader.loader import (BaseModelLoader,
 8 |                                                      get_model_loader)
 9 | from vllm.model_executor.model_loader.utils import (
10 |     get_architecture_class_name, get_model_architecture)
11 | 
12 | 
13 | def get_model(*, model_config: ModelConfig, load_config: LoadConfig,
14 |               device_config: DeviceConfig, parallel_config: ParallelConfig,
15 |               scheduler_config: SchedulerConfig,
16 |               lora_config: Optional[LoRAConfig],
17 |               cache_config: CacheConfig) -> nn.Module:
18 |     loader = get_model_loader(load_config)
19 |     return loader.load_model(model_config=model_config,
20 |                              device_config=device_config,
21 |                              lora_config=lora_config,
22 |                              parallel_config=parallel_config,
23 |                              scheduler_config=scheduler_config,
24 |                              cache_config=cache_config)
25 | 
26 | 
27 | __all__ = [
28 |     "get_model", "get_model_loader", "BaseModelLoader",
29 |     "get_architecture_class_name", "get_model_architecture"
30 | ]
31 | 


--------------------------------------------------------------------------------
/vllm/model_executor/models/__init__.py:
--------------------------------------------------------------------------------
 1 | from .interfaces import (HasInnerState, SupportsLoRA, SupportsMultiModal,
 2 |                          SupportsPP, has_inner_state, supports_lora,
 3 |                          supports_multimodal, supports_pp)
 4 | from .interfaces_base import (VllmModelForEmbedding,
 5 |                               VllmModelForTextGeneration, is_embedding_model,
 6 |                               is_text_generation_model)
 7 | from .registry import ModelRegistry
 8 | 
 9 | __all__ = [
10 |     "ModelRegistry",
11 |     "VllmModelForEmbedding",
12 |     "is_embedding_model",
13 |     "VllmModelForTextGeneration",
14 |     "is_text_generation_model",
15 |     "HasInnerState",
16 |     "has_inner_state",
17 |     "SupportsLoRA",
18 |     "supports_lora",
19 |     "SupportsMultiModal",
20 |     "supports_multimodal",
21 |     "SupportsPP",
22 |     "supports_pp",
23 | ]


--------------------------------------------------------------------------------
/vllm/model_executor/models/phi3.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Adapted from llama.py
 3 | """Inference-only Phi3 model code inherit from Llama.py"""
 4 | 
 5 | from vllm.model_executor.models.llama import LlamaForCausalLM
 6 | 
 7 | 
 8 | class Phi3ForCausalLM(LlamaForCausalLM):
 9 | 
10 |     packed_modules_mapping = {
11 |         "qkv_proj": [
12 |             "qkv_proj",
13 |         ],
14 |         "gate_up_proj": [
15 |             "gate_up_proj",
16 |         ],
17 |     }
18 | 


--------------------------------------------------------------------------------
/vllm/model_executor/utils.py:
--------------------------------------------------------------------------------
 1 | """Utils for model executor."""
 2 | from typing import Any, Dict, Optional
 3 | 
 4 | import torch
 5 | 
 6 | from vllm.utils import seed_everything
 7 | 
 8 | 
 9 | def set_random_seed(seed: int) -> None:
10 |     seed_everything(seed)
11 | 
12 | 
13 | def set_weight_attrs(
14 |     weight: torch.Tensor,
15 |     weight_attrs: Optional[Dict[str, Any]],
16 | ):
17 |     """Set attributes on a weight tensor.
18 | 
19 |     This method is used to set attributes on a weight tensor. This method
20 |     will not overwrite existing attributes.
21 | 
22 |     Args:
23 |         weight: The weight tensor.
24 |         weight_attrs: A dictionary of attributes to set on the weight tensor.
25 |     """
26 |     if weight_attrs is None:
27 |         return
28 |     for key, value in weight_attrs.items():
29 |         assert not hasattr(
30 |             weight, key), (f"Overwriting existing tensor attribute: {key}")
31 |         setattr(weight, key, value)
32 | 


--------------------------------------------------------------------------------
/vllm/multimodal/__init__.py:
--------------------------------------------------------------------------------
 1 | from .base import (BatchedTensorInputs, MultiModalDataBuiltins,
 2 |                    MultiModalDataDict, MultiModalInputs, MultiModalPlugin,
 3 |                    NestedTensors)
 4 | from .registry import MultiModalRegistry
 5 | 
 6 | MULTIMODAL_REGISTRY = MultiModalRegistry()
 7 | """
 8 | The global :class:`~MultiModalRegistry` is used by model runners to
 9 | dispatch data processing according to its modality and the target model.
10 | 
11 | See also:
12 |     :ref:`input_processing_pipeline`
13 | """
14 | 
15 | __all__ = [
16 |     "BatchedTensorInputs",
17 |     "MultiModalDataBuiltins",
18 |     "MultiModalDataDict",
19 |     "MultiModalInputs",
20 |     "MultiModalPlugin",
21 |     "NestedTensors",
22 |     "MULTIMODAL_REGISTRY",
23 |     "MultiModalRegistry",
24 | ]
25 | 


--------------------------------------------------------------------------------
/vllm/multimodal/audio.py:
--------------------------------------------------------------------------------
 1 | from vllm.inputs.registry import InputContext
 2 | from vllm.multimodal.base import MultiModalInputs, MultiModalPlugin
 3 | 
 4 | 
 5 | class AudioPlugin(MultiModalPlugin):
 6 |     """Plugin for audio data."""
 7 | 
 8 |     def get_data_key(self) -> str:
 9 |         return "audio"
10 | 
11 |     def _default_input_mapper(self, ctx: InputContext, data: object,
12 |                               **mm_processor_kwargs) -> MultiModalInputs:
13 |         raise NotImplementedError("There is no default audio input mapper")
14 | 
15 |     def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
16 |         raise NotImplementedError(
17 |             "There is no default maximum multimodal tokens")
18 | 


--------------------------------------------------------------------------------
/vllm/platforms/cpu.py:
--------------------------------------------------------------------------------
 1 | import psutil
 2 | import torch
 3 | 
 4 | from .interface import Platform, PlatformEnum
 5 | 
 6 | 
 7 | class CpuPlatform(Platform):
 8 |     _enum = PlatformEnum.CPU
 9 | 
10 |     @classmethod
11 |     def get_device_name(cls, device_id: int = 0) -> str:
12 |         return "cpu"
13 | 
14 |     @classmethod
15 |     def get_device_total_memory(cls, device_id: int = 0) -> int:
16 |         return psutil.virtual_memory().total
17 | 
18 |     @classmethod
19 |     def inference_mode(cls):
20 |         return torch.no_grad()
21 | 


--------------------------------------------------------------------------------
/vllm/platforms/rocm.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from functools import lru_cache
 3 | 
 4 | import torch
 5 | 
 6 | from vllm.logger import init_logger
 7 | 
 8 | from .interface import DeviceCapability, Platform, PlatformEnum
 9 | 
10 | logger = init_logger(__name__)
11 | 
12 | if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", None) in ["fork", None]:
13 |     logger.warning("`fork` method is not supported by ROCm. "
14 |                    "VLLM_WORKER_MULTIPROC_METHOD is overridden to"
15 |                    " `spawn` instead.")
16 |     os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
17 | 
18 | 
19 | class RocmPlatform(Platform):
20 |     _enum = PlatformEnum.ROCM
21 | 
22 |     @classmethod
23 |     @lru_cache(maxsize=8)
24 |     def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
25 |         major, minor = torch.cuda.get_device_capability(device_id)
26 |         return DeviceCapability(major=major, minor=minor)
27 | 
28 |     @classmethod
29 |     @lru_cache(maxsize=8)
30 |     def get_device_name(cls, device_id: int = 0) -> str:
31 |         return torch.cuda.get_device_name(device_id)
32 | 
33 |     @classmethod
34 |     def get_device_total_memory(cls, device_id: int = 0) -> int:
35 |         device_props = torch.cuda.get_device_properties(device_id)
36 |         return device_props.total_memory
37 | 


--------------------------------------------------------------------------------
/vllm/platforms/tpu.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import torch
 4 | 
 5 | import vllm.envs as envs
 6 | from vllm.compilation.levels import CompilationLevel
 7 | from vllm.plugins import set_torch_compile_backend
 8 | 
 9 | from .interface import Platform, PlatformEnum
10 | 
11 | if "VLLM_TORCH_COMPILE_LEVEL" not in os.environ:
12 |     os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.DYNAMO_ONCE)
13 | 
14 | assert envs.VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.INDUCTOR,\
15 |      "TPU does not support Inductor."
16 | 
17 | set_torch_compile_backend("openxla")
18 | 
19 | 
20 | class TpuPlatform(Platform):
21 |     _enum = PlatformEnum.TPU
22 | 
23 |     @classmethod
24 |     def get_device_name(cls, device_id: int = 0) -> str:
25 |         raise NotImplementedError
26 | 
27 |     @classmethod
28 |     def get_device_total_memory(cls, device_id: int = 0) -> int:
29 |         raise NotImplementedError
30 | 
31 |     @classmethod
32 |     def inference_mode(cls):
33 |         return torch.no_grad()
34 | 


--------------------------------------------------------------------------------
/vllm/platforms/xpu.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | from .interface import DeviceCapability, Platform, PlatformEnum
 4 | 
 5 | 
 6 | class XPUPlatform(Platform):
 7 |     _enum = PlatformEnum.XPU
 8 | 
 9 |     @staticmethod
10 |     def get_device_capability(device_id: int = 0) -> DeviceCapability:
11 |         major, minor, *_ = torch.xpu.get_device_capability(
12 |             device_id)['version'].split('.')
13 |         return DeviceCapability(major=int(major), minor=int(minor))
14 | 
15 |     @staticmethod
16 |     def get_device_name(device_id: int = 0) -> str:
17 |         return torch.xpu.get_device_name(device_id)
18 | 
19 |     @classmethod
20 |     def get_device_total_memory(cls, device_id: int = 0) -> int:
21 |         device_props = torch.xpu.get_device_properties(device_id)
22 |         return device_props.total_memory
23 | 


--------------------------------------------------------------------------------
/vllm/pooling_params.py:
--------------------------------------------------------------------------------
 1 | from typing import Any, Optional
 2 | 
 3 | import msgspec
 4 | 
 5 | 
 6 | class PoolingParams(
 7 |         msgspec.Struct,
 8 |         omit_defaults=True,  # type: ignore[call-arg]
 9 |         array_like=True):  # type: ignore[call-arg]
10 |     """Pooling parameters for pooling.
11 | 
12 |     Attributes:
13 |         additional_data: Any additional data needed for pooling.
14 |     """
15 |     additional_data: Optional[Any] = None
16 | 
17 |     def clone(self) -> "PoolingParams":
18 |         """Returns a deep copy of the PoolingParams instance."""
19 |         return PoolingParams(additional_data=self.additional_data, )
20 | 
21 |     def __repr__(self) -> str:
22 |         return (f"PoolingParams("
23 |                 f"additional_metadata={self.additional_data})")
24 | 


--------------------------------------------------------------------------------
/vllm/prompt_adapter/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/prompt_adapter/__init__.py


--------------------------------------------------------------------------------
/vllm/prompt_adapter/request.py:
--------------------------------------------------------------------------------
 1 | import msgspec
 2 | 
 3 | from vllm.adapter_commons.request import AdapterRequest
 4 | 
 5 | 
 6 | class PromptAdapterRequest(
 7 |         msgspec.Struct,
 8 |         array_like=True,  # type: ignore[call-arg]
 9 |         omit_defaults=True,  # type: ignore[call-arg]
10 |         frozen=True):  # type: ignore[call-arg]
11 |     """
12 |     Request for a Prompt adapter.
13 |     """
14 |     __metaclass__ = AdapterRequest
15 | 
16 |     prompt_adapter_name: str
17 |     prompt_adapter_id: int
18 |     prompt_adapter_local_path: str
19 |     prompt_adapter_num_virtual_tokens: int
20 | 
21 |     def __hash__(self):
22 |         return super().__hash__()
23 | 
24 |     @property
25 |     def adapter_id(self):
26 |         return self.prompt_adapter_id
27 | 
28 |     @property
29 |     def name(self):
30 |         return self.prompt_adapter_name
31 | 
32 |     @property
33 |     def local_path(self):
34 |         return self.prompt_adapter_local_path
35 | 


--------------------------------------------------------------------------------
/vllm/py.typed:
--------------------------------------------------------------------------------
1 | # Marker file for PEP 561.
2 | # The vllm package uses inline types.
3 | 


--------------------------------------------------------------------------------
/vllm/scalar_type.py:
--------------------------------------------------------------------------------
 1 | from ._core_ext import NanRepr, ScalarType
 2 | 
 3 | # naming generally follows: https://github.com/jax-ml/ml_dtypes
 4 | # for floating point types (leading f) the scheme is:
 5 | #  `float<size_bits>_e<exponent_bits>m<mantissa_bits>[flags]`
 6 | #  flags:
 7 | #  - no-flags: means it follows IEEE 754 conventions
 8 | #  - f: means finite values only (no infinities)
 9 | #  - n: means nans are supported (non-standard encoding)
10 | # for integer types the scheme is:
11 | #  `[u]int<size_bits>[b<bias>]`
12 | #  - if bias is not present it means its zero
13 | 
14 | 
15 | class scalar_types:
16 |     int4 = ScalarType.int_(4, None)
17 |     uint4 = ScalarType.uint(4, None)
18 |     int8 = ScalarType.int_(8, None)
19 |     uint8 = ScalarType.uint(8, None)
20 |     float8_e4m3fn = ScalarType.float_(4, 3, True,
21 |                                       NanRepr.EXTD_RANGE_MAX_MIN.value)
22 |     float8_e5m2 = ScalarType.float_IEEE754(5, 2)
23 |     float16_e8m7 = ScalarType.float_IEEE754(8, 7)
24 |     float16_e5m10 = ScalarType.float_IEEE754(5, 10)
25 | 
26 |     # fp6, https://github.com/usyd-fsalab/fp6_llm/tree/main
27 |     float6_e3m2f = ScalarType.float_(3, 2, True, NanRepr.NONE.value)
28 | 
29 |     # "gptq" types
30 |     uint4b8 = ScalarType.uint(4, 8)
31 |     uint8b128 = ScalarType.uint(8, 128)
32 | 
33 |     # colloquial names
34 |     bfloat16 = float16_e8m7
35 |     float16 = float16_e5m10
36 | 


--------------------------------------------------------------------------------
/vllm/spec_decode/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/spec_decode/__init__.py


--------------------------------------------------------------------------------
/vllm/transformers_utils/__init__.py:
--------------------------------------------------------------------------------
 1 | from vllm.envs import VLLM_USE_MODELSCOPE
 2 | 
 3 | if VLLM_USE_MODELSCOPE:
 4 |     # Patch here, before each import happens
 5 |     import modelscope
 6 |     from packaging import version
 7 | 
 8 |     # patch_hub begins from modelscope>=1.18.1
 9 |     if version.parse(modelscope.__version__) <= version.parse('1.18.0'):
10 |         raise ImportError(
11 |             'Using vLLM with ModelScope needs modelscope>=1.18.1, please '
12 |             'install by `pip install modelscope>=1.18.1`')
13 | 
14 |     from modelscope.utils.hf_util import patch_hub
15 | 
16 |     # Patch hub to download models from modelscope to speed up.
17 |     patch_hub()
18 | 


--------------------------------------------------------------------------------
/vllm/transformers_utils/configs/mllama.py:
--------------------------------------------------------------------------------
 1 | from transformers.models.mllama import configuration_mllama as mllama_hf_config
 2 | 
 3 | 
 4 | class MllamaTextConfig(mllama_hf_config.MllamaTextConfig):
 5 |     '''
 6 |     Use this class to override is_encoder_decoder:
 7 |     - transformers regards mllama as is_encoder_decoder=False
 8 |     - vllm needs is_encoder_decoder=True to enable cross-attention
 9 |     '''
10 | 
11 |     def __init__(
12 |         self,
13 |         **kwargs,
14 |     ):
15 |         super().__init__(**kwargs)
16 |         self.is_encoder_decoder = True
17 | 
18 | 
19 | class MllamaConfig(mllama_hf_config.MllamaConfig):
20 | 
21 |     def __init__(
22 |         self,
23 |         text_config=None,
24 |         **kwargs,
25 |     ):
26 |         if isinstance(text_config, dict):
27 |             text_config = MllamaTextConfig(**text_config)
28 |         super().__init__(text_config=text_config, **kwargs)
29 | 


--------------------------------------------------------------------------------
/vllm/transformers_utils/configs/nvlm_d.py:
--------------------------------------------------------------------------------
 1 | # Adapted from
 2 | # https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py
 3 | # --------------------------------------------------------
 4 | # NVLM-D
 5 | # Copyright (c) 2024 NVIDIA
 6 | # Licensed under Apache 2.0 License [see LICENSE for details]
 7 | # --------------------------------------------------------
 8 | from .internvl import InternVLChatConfig
 9 | 
10 | 
11 | class NVLM_D_Config(InternVLChatConfig):
12 |     model_type = 'NVLM_D'
13 | 


--------------------------------------------------------------------------------
/vllm/transformers_utils/tokenizers/__init__.py:
--------------------------------------------------------------------------------
1 | from .mistral import MistralTokenizer
2 | 
3 | __all__ = ["MistralTokenizer"]
4 | 


--------------------------------------------------------------------------------
/vllm/transformers_utils/utils.py:
--------------------------------------------------------------------------------
 1 | from os import PathLike
 2 | from pathlib import Path
 3 | from typing import Union
 4 | 
 5 | 
 6 | def check_gguf_file(model: Union[str, PathLike]) -> bool:
 7 |     """Check if the file is a GGUF model."""
 8 |     model = Path(model)
 9 |     if not model.is_file():
10 |         return False
11 |     elif model.suffix == ".gguf":
12 |         return True
13 | 
14 |     with open(model, "rb") as f:
15 |         header = f.read(4)
16 |     return header == b"GGUF"
17 | 


--------------------------------------------------------------------------------
/vllm/triton_utils/__init__.py:
--------------------------------------------------------------------------------
 1 | from vllm.triton_utils.importing import HAS_TRITON
 2 | 
 3 | __all__ = ["HAS_TRITON"]
 4 | 
 5 | if HAS_TRITON:
 6 | 
 7 |     from vllm.triton_utils.custom_cache_manager import (
 8 |         maybe_set_triton_cache_manager)
 9 |     from vllm.triton_utils.libentry import libentry
10 | 
11 |     __all__ += ["maybe_set_triton_cache_manager", "libentry"]
12 | 


--------------------------------------------------------------------------------
/vllm/triton_utils/importing.py:
--------------------------------------------------------------------------------
 1 | from importlib.util import find_spec
 2 | 
 3 | from vllm.logger import init_logger
 4 | 
 5 | logger = init_logger(__name__)
 6 | 
 7 | HAS_TRITON = find_spec("triton") is not None
 8 | 
 9 | if not HAS_TRITON:
10 |     logger.info("Triton not installed; certain GPU-related functions"
11 |                 " will not be available.")
12 | 


--------------------------------------------------------------------------------
/vllm/usage/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/usage/__init__.py


--------------------------------------------------------------------------------
/vllm/version.py:
--------------------------------------------------------------------------------
 1 | try:
 2 |     from ._version import __version__, __version_tuple__
 3 | except Exception as e:
 4 |     import warnings
 5 | 
 6 |     warnings.warn(f"Failed to read commit hash:\n{e}",
 7 |                   RuntimeWarning,
 8 |                   stacklevel=2)
 9 | 
10 |     __version__ = "dev"
11 |     __version_tuple__ = (0, 0, __version__)
12 | 


--------------------------------------------------------------------------------
/vllm/vllm_flash_attn/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/vllm_flash_attn/.gitkeep


--------------------------------------------------------------------------------
/vllm/worker/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/vllm/fd47e57f4b0d5f7920903490bce13bc9e49d8dba/vllm/worker/__init__.py


--------------------------------------------------------------------------------