├── .buildkite ├── check-wheel-size.py ├── lm-eval-harness │ ├── configs │ │ ├── DeepSeek-V2-Lite-Chat.yaml │ │ ├── Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml │ │ ├── Meta-Llama-3-70B-Instruct.yaml │ │ ├── Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml │ │ ├── Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct-FP8.yaml │ │ ├── Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml │ │ ├── Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct.yaml │ │ ├── Meta-Llama-3-8B-QQQ.yaml │ │ ├── Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml │ │ ├── Minitron-4B-Base-FP8.yaml │ │ ├── Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml │ │ ├── Mixtral-8x7B-Instruct-v0.1-FP8.yaml │ │ ├── Mixtral-8x7B-Instruct-v0.1.yaml │ │ ├── Qwen2-1.5B-Instruct-FP8W8.yaml │ │ ├── Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml │ │ ├── Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml │ │ ├── Qwen2-57B-A14-Instruct.yaml │ │ ├── models-large.txt │ │ └── models-small.txt │ ├── run-lm-eval-gsm-hf-baseline.sh │ ├── run-lm-eval-gsm-vllm-baseline.sh │ ├── run-tests.sh │ └── test_lm_eval_correctness.py ├── nightly-benchmarks │ ├── README.md │ ├── benchmark-pipeline.yaml │ ├── nightly-annotation.md │ ├── nightly-descriptions.md │ ├── nightly-pipeline.yaml │ ├── performance-benchmarks-descriptions.md │ ├── scripts │ │ ├── convert-results-json-to-markdown.py │ │ ├── download-tokenizer.py │ │ ├── generate-nightly-markdown.py │ │ ├── get-lmdeploy-modelname.py │ │ ├── launch-server.sh │ │ ├── nightly-annotate.sh │ │ ├── run-nightly-benchmarks.sh │ │ ├── run-performance-benchmarks.sh │ │ ├── summary-nightly-results.py │ │ └── wait-for-image.sh │ └── tests │ │ ├── latency-tests.json │ │ ├── nightly-tests.json │ │ ├── serving-tests.json │ │ └── throughput-tests.json ├── release-pipeline.yaml ├── run-amd-test.sh ├── run-benchmarks.sh ├── run-cpu-test-ppc64le.sh ├── run-cpu-test.sh ├── run-hpu-test.sh ├── run-multi-node-test.sh ├── run-neuron-test.sh ├── run-openvino-test.sh ├── run-tpu-test.sh ├── run-xpu-test.sh ├── test-pipeline.yaml └── upload-wheels.sh ├── .clang-format ├── .dockerignore ├── .github ├── CODEOWNERS ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── 100-documentation.yml │ ├── 200-installation.yml │ ├── 300-usage.yml │ ├── 400-bug report.yml │ ├── 500-feature request.yml │ ├── 600-new model.yml │ ├── 700-performance discussion.yml │ ├── 750-RFC.yml │ ├── 800-misc discussion.yml │ └── config.yml ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml ├── mergify.yml ├── scripts │ └── cleanup_pr_body.sh └── workflows │ ├── actionlint.yml │ ├── add_label_automerge.yml │ ├── clang-format.yml │ ├── cleanup_pr_body.yml │ ├── codespell.yml │ ├── matchers │ ├── actionlint.json │ ├── mypy.json │ └── ruff.json │ ├── mypy.yaml │ ├── png-lint.yml │ ├── publish.yml │ ├── reminder_comment.yml │ ├── ruff.yml │ ├── scripts │ ├── build.sh │ ├── create_release.js │ ├── cuda-install.sh │ ├── env.sh │ └── pytorch-install.sh │ ├── shellcheck.yml │ ├── sphinx-lint.yml │ ├── stale.yml │ └── yapf.yml ├── .gitignore ├── .readthedocs.yaml ├── .shellcheckrc ├── .yapfignore ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── DCO ├── Dockerfile ├── Dockerfile.arm ├── Dockerfile.cpu ├── Dockerfile.hpu ├── Dockerfile.neuron ├── Dockerfile.openvino ├── Dockerfile.ppc64le ├── Dockerfile.rocm ├── Dockerfile.tpu ├── Dockerfile.xpu ├── 
LICENSE ├── MANIFEST.in ├── README.md ├── SECURITY.md ├── benchmarks ├── README.md ├── backend_request_func.py ├── benchmark_latency.py ├── benchmark_prefix_caching.py ├── benchmark_prioritization.py ├── benchmark_serving.py ├── benchmark_throughput.py ├── cutlass_benchmarks │ ├── w8a8_benchmarks.py │ └── weight_shapes.py ├── disagg_benchmarks │ ├── disagg_overhead_benchmark.sh │ ├── disagg_performance_benchmark.sh │ ├── disagg_prefill_proxy_server.py │ ├── round_robin_proxy.py │ └── visualize_benchmark_results.py ├── kernels │ ├── benchmark_aqlm.py │ ├── benchmark_layernorm.py │ ├── benchmark_machete.py │ ├── benchmark_marlin.py │ ├── benchmark_moe.py │ ├── benchmark_paged_attention.py │ ├── benchmark_quant.py │ ├── benchmark_rope.py │ ├── benchmark_shapes.py │ ├── graph_machete_bench.py │ ├── requirements.txt │ └── weight_shapes.py ├── launch_tgi_server.sh ├── overheads │ └── benchmark_hashing.py └── sonnet.txt ├── cmake ├── cpu_extension.cmake ├── hipify.py └── utils.cmake ├── collect_env.py ├── csrc ├── activation_kernels.cu ├── attention │ ├── attention_dtypes.h │ ├── attention_generic.cuh │ ├── attention_kernels.cuh │ ├── attention_utils.cuh │ ├── dtype_bfloat16.cuh │ ├── dtype_float16.cuh │ ├── dtype_float32.cuh │ ├── dtype_fp8.cuh │ ├── paged_attention_v1.cu │ └── paged_attention_v2.cu ├── cache.h ├── cache_kernels.cu ├── core │ ├── exception.hpp │ ├── registration.h │ └── scalar_type.hpp ├── cpu │ ├── activation.cpp │ ├── attention.cpp │ ├── cache.cpp │ ├── cpu_types.hpp │ ├── cpu_types_arm.hpp │ ├── cpu_types_vsx.hpp │ ├── cpu_types_x86.hpp │ ├── dnnl_helper.hpp │ ├── layernorm.cpp │ ├── pos_encoding.cpp │ ├── quant.cpp │ ├── torch_bindings.cpp │ └── utils.cpp ├── cuda_compat.h ├── cuda_utils.h ├── cuda_utils_kernels.cu ├── custom_all_reduce.cu ├── custom_all_reduce.cuh ├── custom_all_reduce_test.cu ├── cutlass_extensions │ ├── cute_utils.cuh │ ├── epilogue │ │ ├── broadcast_load_epilogue_c2x.hpp │ │ ├── broadcast_load_epilogue_c3x.hpp │ │ ├── scaled_mm_epilogues_c2x.hpp │ │ └── scaled_mm_epilogues_c3x.hpp │ ├── torch_utils.hpp │ ├── vllm_collective_builder.cuh │ ├── vllm_custom_types.cuh │ ├── vllm_cutlass_library_extension.py │ ├── vllm_numeric_conversion.cuh │ └── vllm_type_utils.cuh ├── dispatch_utils.h ├── layernorm_kernels.cu ├── layernorm_quant_kernels.cu ├── mamba │ ├── causal_conv1d │ │ ├── causal_conv1d.cu │ │ ├── causal_conv1d.h │ │ └── static_switch.h │ └── mamba_ssm │ │ ├── selective_scan.h │ │ ├── selective_scan_fwd.cu │ │ └── static_switch.h ├── moe │ ├── marlin_kernels │ │ ├── marlin_moe_kernel.h │ │ ├── marlin_moe_kernel_ku4.cu │ │ ├── marlin_moe_kernel_ku4.h │ │ ├── marlin_moe_kernel_ku4b8.cu │ │ ├── marlin_moe_kernel_ku4b8.h │ │ ├── marlin_moe_kernel_ku8b128.cu │ │ └── marlin_moe_kernel_ku8b128.h │ ├── marlin_moe_ops.cu │ ├── moe_align_sum_kernels.cu │ ├── moe_ops.h │ ├── topk_softmax_kernels.cu │ └── torch_bindings.cpp ├── ops.h ├── permute_cols.cu ├── pos_encoding_kernels.cu ├── prepare_inputs │ ├── advance_step.cu │ └── advance_step.cuh ├── quantization │ ├── aqlm │ │ └── gemm_kernels.cu │ ├── awq │ │ ├── dequantize.cuh │ │ └── gemm_kernels.cu │ ├── compressed_tensors │ │ └── int8_quant_kernels.cu │ ├── cutlass_w8a8 │ │ ├── Epilogues.md │ │ ├── common.hpp │ │ ├── scaled_mm_c2x.cu │ │ ├── scaled_mm_c2x.cuh │ │ ├── scaled_mm_c2x_sm75_dispatch.cuh │ │ ├── scaled_mm_c2x_sm80_dispatch.cuh │ │ ├── scaled_mm_c2x_sm89_fp8_dispatch.cuh │ │ ├── scaled_mm_c2x_sm89_int8_dispatch.cuh │ │ ├── scaled_mm_c3x.cu │ │ └── scaled_mm_entry.cu │ ├── fp8 │ │ ├── amd │ │ │ ├── 
hip_float8.h │ │ │ ├── hip_float8_impl.h │ │ │ └── quant_utils.cuh │ │ ├── common.cu │ │ ├── common.cuh │ │ ├── fp8_marlin.cu │ │ └── nvidia │ │ │ └── quant_utils.cuh │ ├── gguf │ │ ├── dequantize.cuh │ │ ├── ggml-common.h │ │ ├── gguf_kernel.cu │ │ ├── mmq.cuh │ │ ├── mmvq.cuh │ │ └── vecdotq.cuh │ ├── gptq │ │ ├── compat.cuh │ │ ├── matrix_view.cuh │ │ ├── q_gemm.cu │ │ ├── qdq_2.cuh │ │ ├── qdq_3.cuh │ │ ├── qdq_4.cuh │ │ ├── qdq_8.cuh │ │ └── qdq_util.cuh │ ├── gptq_marlin │ │ ├── awq_marlin_repack.cu │ │ ├── gptq_marlin.cu │ │ ├── gptq_marlin_repack.cu │ │ ├── marlin.cuh │ │ └── marlin_dtypes.cuh │ ├── machete │ │ ├── Readme.md │ │ ├── generate.py │ │ ├── machete_collective_builder.cuh │ │ ├── machete_interleaving_utils.cuh │ │ ├── machete_mainloop.cuh │ │ ├── machete_mm_kernel.cuh │ │ ├── machete_mm_launcher.cuh │ │ ├── machete_prepack_kernel.cuh │ │ ├── machete_prepack_launcher.cuh │ │ ├── machete_prepacked_layout.cuh │ │ └── machete_pytorch.cu │ └── marlin │ │ ├── dense │ │ ├── LICENSE │ │ ├── common │ │ │ ├── base.h │ │ │ └── mem.h │ │ └── marlin_cuda_kernel.cu │ │ ├── qqq │ │ └── marlin_qqq_gemm_kernel.cu │ │ └── sparse │ │ ├── LICENSE │ │ ├── common │ │ ├── base.h │ │ ├── mem.h │ │ └── mma.h │ │ └── marlin_24_cuda_kernel.cu ├── rocm │ ├── attention.cu │ ├── ops.h │ └── torch_bindings.cpp ├── torch_bindings.cpp └── type_convert.cuh ├── docs ├── Makefile ├── README.md ├── make.bat ├── requirements-docs.txt └── source │ ├── _static │ └── custom.js │ ├── _templates │ └── sections │ │ └── header.html │ ├── assets │ ├── design │ │ ├── arch_overview │ │ │ ├── entrypoints.excalidraw.png │ │ │ └── llm_engine.excalidraw.png │ │ └── hierarchy.png │ ├── dev │ │ └── dockerfile-stages-dependency.png │ ├── kernel │ │ ├── k_vecs.png │ │ ├── key.png │ │ ├── logits_vec.png │ │ ├── q_vecs.png │ │ ├── query.png │ │ ├── v_vec.png │ │ └── value.png │ └── logos │ │ ├── vllm-logo-only-light.png │ │ ├── vllm-logo-text-dark.png │ │ └── vllm-logo-text-light.png │ ├── automatic_prefix_caching │ ├── apc.rst │ └── details.md │ ├── community │ ├── meetups.rst │ └── sponsors.md │ ├── conf.py │ ├── contributing │ ├── dockerfile │ │ └── dockerfile.rst │ ├── overview.rst │ └── profiling │ │ └── profiling_index.rst │ ├── design │ ├── arch_overview.rst │ ├── huggingface_integration.rst │ ├── input_processing │ │ ├── input_processing_pipeline.rst │ │ └── model_inputs_index.rst │ ├── kernel │ │ └── paged_attention.rst │ ├── multimodal │ │ ├── adding_multimodal_plugin.rst │ │ └── multimodal_index.rst │ └── plugin_system.rst │ ├── dev │ ├── engine │ │ ├── async_llm_engine.rst │ │ ├── engine_index.rst │ │ └── llm_engine.rst │ ├── offline_inference │ │ ├── llm.rst │ │ ├── llm_inputs.rst │ │ └── offline_index.rst │ ├── pooling_params.rst │ └── sampling_params.rst │ ├── generate_examples.py │ ├── getting_started │ ├── amd-installation.rst │ ├── arm-installation.rst │ ├── cpu-installation.rst │ ├── debugging.rst │ ├── examples │ │ └── examples_index.template.rst │ ├── gaudi-installation.rst │ ├── installation.rst │ ├── neuron-installation.rst │ ├── openvino-installation.rst │ ├── quickstart.rst │ ├── tpu-installation.rst │ └── xpu-installation.rst │ ├── index.rst │ ├── models │ ├── adding_model.rst │ ├── enabling_multimodal_inputs.rst │ ├── engine_args.rst │ ├── lora.rst │ ├── performance.rst │ ├── spec_decode.rst │ ├── structured_outputs.rst │ ├── supported_models.rst │ └── vlm.rst │ ├── performance │ └── benchmarks.rst │ ├── quantization │ ├── auto_awq.rst │ ├── bnb.rst │ ├── fp8.rst │ ├── fp8_e4m3_kvcache.rst │ ├── 
fp8_e5m2_kvcache.rst │ ├── gguf.rst │ ├── int8.rst │ └── supported_hardware.rst │ └── serving │ ├── compatibility_matrix.rst │ ├── deploying_with_bentoml.rst │ ├── deploying_with_cerebrium.rst │ ├── deploying_with_docker.rst │ ├── deploying_with_dstack.rst │ ├── deploying_with_k8s.rst │ ├── deploying_with_kserve.rst │ ├── deploying_with_lws.rst │ ├── deploying_with_nginx.rst │ ├── deploying_with_triton.rst │ ├── distributed_serving.rst │ ├── env_vars.rst │ ├── faq.rst │ ├── integrations.rst │ ├── metrics.rst │ ├── openai_compatible_server.md │ ├── run_on_sky.rst │ ├── serving_with_langchain.rst │ ├── serving_with_llamaindex.rst │ ├── serving_with_llamastack.rst │ ├── tensorizer.rst │ └── usage_stats.md ├── examples ├── api_client.py ├── aqlm_example.py ├── cpu_offload.py ├── disaggregated_prefill.sh ├── florence2_inference.py ├── fp8 │ ├── README.md │ ├── extract_scales.py │ └── quantizer │ │ ├── README.md │ │ └── quantize.py ├── gguf_inference.py ├── gradio_openai_chatbot_webserver.py ├── gradio_webserver.py ├── llm_engine_example.py ├── logging_configuration.md ├── lora_with_quantization_inference.py ├── multilora_inference.py ├── offline_chat_with_tools.py ├── offline_inference.py ├── offline_inference_arctic.py ├── offline_inference_audio_language.py ├── offline_inference_chat.py ├── offline_inference_cli.py ├── offline_inference_distributed.py ├── offline_inference_embedding.py ├── offline_inference_encoder_decoder.py ├── offline_inference_mlpspeculator.py ├── offline_inference_neuron.py ├── offline_inference_neuron_int8_quantization.py ├── offline_inference_openai.md ├── offline_inference_pixtral.py ├── offline_inference_structured_outputs.py ├── offline_inference_tpu.py ├── offline_inference_vision_language.py ├── offline_inference_vision_language_embedding.py ├── offline_inference_vision_language_multi_image.py ├── offline_inference_with_prefix.py ├── offline_inference_with_profiler.py ├── offline_profile.py ├── openai_chat_completion_client.py ├── openai_chat_completion_client_for_multimodal.py ├── openai_chat_completion_client_with_tools.py ├── openai_chat_completion_structured_outputs.py ├── openai_chat_embedding_client_for_multimodal.py ├── openai_completion_client.py ├── openai_cross_encoder_score.py ├── openai_embedding_client.py ├── openai_example_batch.jsonl ├── production_monitoring │ ├── Otel.md │ ├── README.md │ ├── docker-compose.yaml │ ├── dummy_client.py │ ├── grafana.json │ └── prometheus.yaml ├── run_cluster.sh ├── save_sharded_state.py ├── template_alpaca.jinja ├── template_baichuan.jinja ├── template_blip2.jinja ├── template_chatglm.jinja ├── template_chatglm2.jinja ├── template_chatml.jinja ├── template_dse_qwen2_vl.jinja ├── template_falcon.jinja ├── template_falcon_180b.jinja ├── template_inkbot.jinja ├── template_llava.jinja ├── template_vlm2vec.jinja ├── tensorize_vllm_model.py ├── tool_chat_template_granite.jinja ├── tool_chat_template_granite_20b_fc.jinja ├── tool_chat_template_hermes.jinja ├── tool_chat_template_internlm2_tool.jinja ├── tool_chat_template_llama3.1_json.jinja ├── tool_chat_template_llama3.2_json.jinja ├── tool_chat_template_llama3.2_pythonic.jinja ├── tool_chat_template_mistral.jinja ├── tool_chat_template_mistral_parallel.jinja └── tool_chat_template_toolace.jinja ├── find_cuda_init.py ├── format.sh ├── pyproject.toml ├── python_only_dev.py ├── requirements-build.txt ├── requirements-common.txt ├── requirements-cpu.txt ├── requirements-cuda.txt ├── requirements-dev.txt ├── requirements-hpu.txt ├── requirements-lint.txt ├── 
requirements-neuron.txt ├── requirements-openvino.txt ├── requirements-rocm.txt ├── requirements-test.in ├── requirements-test.txt ├── requirements-tpu.txt ├── requirements-xpu.txt ├── setup.py ├── tests ├── __init__.py ├── async_engine │ ├── __init__.py │ ├── api_server_async_engine.py │ ├── test_api_server.py │ ├── test_async_llm_engine.py │ └── test_request_tracker.py ├── basic_correctness │ ├── __init__.py │ ├── test_basic_correctness.py │ ├── test_chunked_prefill.py │ ├── test_cpu_offload.py │ └── test_preemption.py ├── compile │ ├── __init__.py │ ├── backend.py │ ├── piecewise │ │ ├── __init__.py │ │ ├── test_simple.py │ │ └── test_toy_llama.py │ ├── test_basic_correctness.py │ ├── test_full_graph.py │ ├── test_functionalization.py │ ├── test_fusion.py │ ├── test_pass_manager.py │ ├── test_wrapper.py │ └── utils.py ├── conftest.py ├── core │ ├── __init__.py │ ├── block │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── e2e │ │ │ ├── __init__.py │ │ │ ├── conftest.py │ │ │ ├── test_correctness.py │ │ │ └── test_correctness_sliding_window.py │ │ ├── test_block_manager.py │ │ ├── test_block_table.py │ │ ├── test_common.py │ │ ├── test_cpu_gpu_block_allocator.py │ │ ├── test_naive_block.py │ │ └── test_prefix_caching_block.py │ ├── test_chunked_prefill_scheduler.py │ ├── test_num_computed_tokens_update.py │ ├── test_scheduler.py │ ├── test_scheduler_encoder_decoder.py │ ├── test_serialization.py │ └── utils.py ├── data │ └── test_config.yaml ├── distributed │ ├── __init__.py │ ├── test_ca_buffer_sharing.py │ ├── test_comm_ops.py │ ├── test_custom_all_reduce.py │ ├── test_distributed_oot.py │ ├── test_multi_node_assignment.py │ ├── test_pipeline_parallel.py │ ├── test_pipeline_partition.py │ ├── test_pp_cudagraph.py │ ├── test_pynccl.py │ ├── test_same_node.py │ ├── test_shm_broadcast.py │ └── test_utils.py ├── encoder_decoder │ ├── __init__.py │ └── test_e2e_correctness.py ├── engine │ ├── __init__.py │ ├── output_processor │ │ ├── __init__.py │ │ ├── test_multi_step.py │ │ └── test_stop_checker.py │ ├── test_arg_utils.py │ ├── test_computed_prefix_blocks.py │ ├── test_custom_executor.py │ ├── test_detokenization.py │ ├── test_multiproc_workers.py │ ├── test_short_mm_context.py │ ├── test_skip_tokenizer_init.py │ ├── test_stop_reason.py │ └── test_stop_strings.py ├── entrypoints │ ├── __init__.py │ ├── conftest.py │ ├── llm │ │ ├── __init__.py │ │ ├── test_accuracy.py │ │ ├── test_chat.py │ │ ├── test_encode.py │ │ ├── test_generate.py │ │ ├── test_generate_multiple_loras.py │ │ ├── test_guided_generate.py │ │ ├── test_init.py │ │ ├── test_lazy_outlines.py │ │ └── test_prompt_validation.py │ ├── offline_mode │ │ ├── __init__.py │ │ └── test_offline_mode.py │ ├── openai │ │ ├── __init__.py │ │ ├── test_accuracy.py │ │ ├── test_async_tokenization.py │ │ ├── test_audio.py │ │ ├── test_basic.py │ │ ├── test_chat.py │ │ ├── test_chat_echo.py │ │ ├── test_chat_template.py │ │ ├── test_chunked_prompt.py │ │ ├── test_cli_args.py │ │ ├── test_completion.py │ │ ├── test_embedding.py │ │ ├── test_encoder_decoder.py │ │ ├── test_lora_lineage.py │ │ ├── test_metrics.py │ │ ├── test_models.py │ │ ├── test_oot_registration.py │ │ ├── test_prompt_validation.py │ │ ├── test_return_tokens_as_ids.py │ │ ├── test_root_path.py │ │ ├── test_run_batch.py │ │ ├── test_score.py │ │ ├── test_serving_chat.py │ │ ├── test_serving_engine.py │ │ ├── test_shutdown.py │ │ ├── test_tokenization.py │ │ ├── test_video.py │ │ ├── test_vision.py │ │ ├── test_vision_embedding.py │ │ └── tool_parsers │ │ │ ├── __init__.py │ │ │ 
├── test_pythonic_tool_parser.py │ │ │ └── utils.py │ └── test_chat_utils.py ├── fp8_kv │ ├── llama2-70b-fp8-kv │ │ └── kv_cache_scales.json │ └── llama2-7b-fp8-kv │ │ └── kv_cache_scales.json ├── kernels │ ├── __init__.py │ ├── allclose_default.py │ ├── conftest.py │ ├── quant_utils.py │ ├── test_activation.py │ ├── test_aqlm.py │ ├── test_attention.py │ ├── test_attention_selector.py │ ├── test_awq.py │ ├── test_awq_marlin.py │ ├── test_awq_triton.py │ ├── test_blocksparse_attention.py │ ├── test_cache.py │ ├── test_causal_conv1d.py │ ├── test_cutlass.py │ ├── test_encoder_decoder_attn.py │ ├── test_flash_attn.py │ ├── test_flashinfer.py │ ├── test_fp8_quant.py │ ├── test_ggml.py │ ├── test_gguf.py │ ├── test_gptq.py │ ├── test_int8_quant.py │ ├── test_layernorm.py │ ├── test_machete_mm.py │ ├── test_mamba_ssm.py │ ├── test_marlin_gemm.py │ ├── test_moe.py │ ├── test_permute_cols.py │ ├── test_pos_encoding.py │ ├── test_prefix_prefill.py │ ├── test_rotary_embedding.py │ ├── test_triton_scaled_mm.py │ ├── test_utils.py │ └── utils.py ├── kv_transfer │ ├── disagg_test.py │ ├── module_test.py │ ├── test_lookup_buffer.py │ ├── test_lookup_buffer.sh │ ├── test_send_recv.py │ └── test_send_recv.sh ├── lora │ ├── __init__.py │ ├── conftest.py │ ├── data │ │ ├── __init__.py │ │ └── long_context_test_data.py │ ├── test_baichuan.py │ ├── test_chatglm3_tp.py │ ├── test_gemma.py │ ├── test_layers.py │ ├── test_llama_tp.py │ ├── test_long_context.py │ ├── test_lora_bias_e2e.py │ ├── test_lora_checkpoints.py │ ├── test_lora_huggingface.py │ ├── test_lora_manager.py │ ├── test_minicpmv.py │ ├── test_minicpmv_tp.py │ ├── test_mixtral.py │ ├── test_phi.py │ ├── test_punica_sizes.py │ ├── test_punica_variation.py │ ├── test_quant_model.py │ ├── test_tokenizer_group.py │ ├── test_utils.py │ ├── test_worker.py │ └── utils.py ├── metrics │ ├── __init__.py │ └── test_metrics.py ├── model_executor │ ├── __init__.py │ ├── conftest.py │ ├── test_enabled_custom_ops.py │ ├── test_guided_processors.py │ ├── test_model_load_with_params.py │ └── weight_utils.py ├── models │ ├── __init__.py │ ├── decoder_only │ │ ├── __init__.py │ │ ├── audio_language │ │ │ ├── __init__.py │ │ │ └── test_ultravox.py │ │ ├── language │ │ │ ├── __init__.py │ │ │ ├── test_aqlm.py │ │ │ ├── test_fp8.py │ │ │ ├── test_gguf.py │ │ │ ├── test_gptq_marlin.py │ │ │ ├── test_gptq_marlin_24.py │ │ │ ├── test_granite.py │ │ │ ├── test_jamba.py │ │ │ ├── test_mamba.py │ │ │ ├── test_mistral.py │ │ │ ├── test_modelopt.py │ │ │ ├── test_models.py │ │ │ └── test_phimoe.py │ │ └── vision_language │ │ │ ├── __init__.py │ │ │ ├── mm_processor_kwargs │ │ │ ├── __init__.py │ │ │ ├── test_idefics3.py │ │ │ ├── test_internvl.py │ │ │ ├── test_llava_next.py │ │ │ ├── test_phi3v.py │ │ │ ├── test_qwen.py │ │ │ └── test_qwen2_vl.py │ │ │ ├── test_awq.py │ │ │ ├── test_h2ovl.py │ │ │ ├── test_intern_vit.py │ │ │ ├── test_models.py │ │ │ ├── test_phi3v.py │ │ │ ├── test_pixtral.py │ │ │ ├── test_qwen2_vl.py │ │ │ └── vlm_utils │ │ │ ├── __init__.py │ │ │ ├── builders.py │ │ │ ├── case_filtering.py │ │ │ ├── core.py │ │ │ ├── custom_inputs.py │ │ │ ├── model_utils.py │ │ │ ├── runners.py │ │ │ └── types.py │ ├── embedding │ │ ├── __init__.py │ │ ├── language │ │ │ ├── __init__.py │ │ │ ├── test_cls_models.py │ │ │ ├── test_embedding.py │ │ │ └── test_scoring.py │ │ ├── utils.py │ │ └── vision_language │ │ │ ├── __init__.py │ │ │ ├── test_dse_qwen2_vl.py │ │ │ ├── test_llava_next.py │ │ │ └── test_phi3v.py │ ├── encoder_decoder │ │ ├── __init__.py │ │ ├── language 
│ │ │ ├── __init__.py │ │ │ └── test_bart.py │ │ └── vision_language │ │ │ ├── __init__.py │ │ │ ├── test_broadcast.py │ │ │ ├── test_florence2.py │ │ │ └── test_mllama.py │ ├── fixtures │ │ ├── pixtral_chat.json │ │ └── pixtral_chat_engine.json │ ├── registry.py │ ├── test_initialization.py │ ├── test_oot_registration.py │ ├── test_registry.py │ └── utils.py ├── mq_llm_engine │ ├── __init__.py │ ├── test_abort.py │ ├── test_error_handling.py │ ├── test_load.py │ └── utils.py ├── multi_step │ ├── __init__.py │ ├── test_correctness_async_llm.py │ └── test_correctness_llm.py ├── multimodal │ ├── __init__.py │ ├── test_inputs.py │ ├── test_mapper.py │ ├── test_processing.py │ ├── test_processor_kwargs.py │ └── test_utils.py ├── plugins │ └── vllm_add_dummy_model │ │ ├── setup.py │ │ └── vllm_add_dummy_model │ │ ├── __init__.py │ │ ├── my_gemma_embedding.py │ │ ├── my_llava.py │ │ └── my_opt.py ├── prefix_caching │ ├── __init__.py │ ├── test_disable_sliding_window.py │ └── test_prefix_caching.py ├── prompt_adapter │ ├── test_bloom.py │ ├── test_multi_adapter_inference.py │ └── test_pa_lora.py ├── prompts │ ├── example.txt │ └── summary.txt ├── quantization │ ├── __init__.py │ ├── test_bitsandbytes.py │ ├── test_compressed_tensors.py │ ├── test_configs.py │ ├── test_cpu_offload.py │ ├── test_experts_int8.py │ ├── test_fp8.py │ ├── test_ipex_quant.py │ ├── test_lm_head.py │ └── utils.py ├── samplers │ ├── __init__.py │ ├── test_beam_search.py │ ├── test_ignore_eos.py │ ├── test_logits_processor.py │ ├── test_logprobs.py │ ├── test_no_bad_words.py │ ├── test_ranks.py │ ├── test_rejection_sampler.py │ ├── test_sampler.py │ ├── test_seeded_generate.py │ └── test_typical_acceptance_sampler.py ├── spec_decode │ ├── __init__.py │ ├── e2e │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── test_compatibility.py │ │ ├── test_eagle_correctness.py │ │ ├── test_integration.py │ │ ├── test_integration_dist_tp2.py │ │ ├── test_integration_dist_tp4.py │ │ ├── test_logprobs.py │ │ ├── test_medusa_correctness.py │ │ ├── test_mlp_correctness.py │ │ ├── test_multistep_correctness.py │ │ ├── test_ngram_correctness.py │ │ └── test_seed.py │ ├── test_batch_expansion.py │ ├── test_dynamic_spec_decode.py │ ├── test_metrics.py │ ├── test_multi_step_worker.py │ ├── test_ngram_worker.py │ ├── test_scorer.py │ ├── test_spec_decode_worker.py │ ├── test_utils.py │ └── utils.py ├── tensorizer_loader │ ├── __init__.py │ ├── conftest.py │ └── test_tensorizer.py ├── test_cache_block_hashing.py ├── test_config.py ├── test_embedded_commit.py ├── test_inputs.py ├── test_lazy_torch_compile.py ├── test_logger.py ├── test_logits_processor.py ├── test_regression.py ├── test_sampling_params.py ├── test_scalartype.py ├── test_sequence.py ├── test_sharded_state_loader.py ├── test_utils.py ├── tokenization │ ├── __init__.py │ ├── test_cached_tokenizer.py │ ├── test_detokenize.py │ ├── test_get_eos.py │ ├── test_tokenizer.py │ └── test_tokenizer_group.py ├── tool_use │ ├── __init__.py │ ├── conftest.py │ ├── test_chat_completion_request_validations.py │ ├── test_chat_completions.py │ ├── test_jamba_tool_parser.py │ ├── test_parallel_tool_calls.py │ ├── test_tool_calls.py │ └── utils.py ├── tpu │ ├── __init__.py │ ├── test_compilation.py │ └── test_custom_dispatcher.py ├── tracing │ ├── __init__.py │ └── test_tracing.py ├── utils.py ├── v1 │ ├── __init__.py │ ├── core │ │ └── test_prefix_caching.py │ └── engine │ │ ├── __init__.py │ │ ├── test_async_llm.py │ │ ├── test_detokenizer.py │ │ ├── test_engine_args.py │ │ ├── test_engine_core.py │ │ 
└── test_engine_core_client.py ├── vllm_test_utils │ ├── setup.py │ └── vllm_test_utils │ │ ├── __init__.py │ │ └── blame.py ├── weight_loading │ ├── models-large.txt │ ├── models.txt │ ├── run_model_weight_loading_test.sh │ └── test_weight_loading.py └── worker │ ├── __init__.py │ ├── test_encoder_decoder_model_runner.py │ ├── test_model_input.py │ ├── test_model_runner.py │ ├── test_profile.py │ └── test_swap.py ├── tools ├── actionlint.sh ├── check_repo.sh ├── mypy.sh ├── png-lint.sh ├── profiler │ ├── print_layerwise_table.py │ └── visualize_layerwise_profile.py ├── report_build_time_ninja.py ├── shellcheck.sh └── sphinx-lint.sh ├── use_existing_torch.py └── vllm ├── __init__.py ├── _custom_ops.py ├── _ipex_ops.py ├── adapter_commons ├── __init__.py ├── layers.py ├── models.py ├── request.py ├── utils.py └── worker_manager.py ├── assets ├── __init__.py ├── audio.py ├── base.py ├── image.py └── video.py ├── attention ├── __init__.py ├── backends │ ├── __init__.py │ ├── abstract.py │ ├── blocksparse_attn.py │ ├── flash_attn.py │ ├── flashinfer.py │ ├── hpu_attn.py │ ├── ipex_attn.py │ ├── openvino.py │ ├── pallas.py │ ├── placeholder_attn.py │ ├── rocm_flash_attn.py │ ├── torch_sdpa.py │ ├── utils.py │ └── xformers.py ├── layer.py ├── ops │ ├── __init__.py │ ├── blocksparse_attention │ │ ├── __init__.py │ │ ├── blocksparse_attention_kernel.py │ │ ├── interface.py │ │ └── utils.py │ ├── hpu_paged_attn.py │ ├── ipex_attn.py │ ├── paged_attn.py │ ├── prefix_prefill.py │ └── triton_flash_attention.py └── selector.py ├── beam_search.py ├── block.py ├── compilation ├── __init__.py ├── backends.py ├── compile_context.py ├── counter.py ├── decorators.py ├── fix_functionalization.py ├── fusion.py ├── inductor_pass.py ├── pass_manager.py ├── reshapes.py ├── vllm_inductor_pass.py └── wrapper.py ├── config.py ├── connections.py ├── core ├── __init__.py ├── block │ ├── __init__.py │ ├── block_table.py │ ├── common.py │ ├── cpu_gpu_block_allocator.py │ ├── interfaces.py │ ├── naive_block.py │ ├── prefix_caching_block.py │ └── utils.py ├── block_manager.py ├── evictor.py ├── interfaces.py ├── placeholder_block_space_manager.py └── scheduler.py ├── distributed ├── __init__.py ├── communication_op.py ├── device_communicators │ ├── __init__.py │ ├── cuda_wrapper.py │ ├── custom_all_reduce.py │ ├── custom_all_reduce_utils.py │ ├── hpu_communicator.py │ ├── pynccl.py │ ├── pynccl_wrapper.py │ ├── shm_broadcast.py │ ├── tpu_communicator.py │ └── xpu_communicator.py ├── kv_transfer │ ├── README.md │ ├── __init__.py │ ├── disagg_prefill_workflow.jpg │ ├── kv_connector │ │ ├── __init__.py │ │ ├── base.py │ │ ├── factory.py │ │ └── simple_connector.py │ ├── kv_lookup_buffer │ │ ├── __init__.py │ │ ├── base.py │ │ └── simple_buffer.py │ ├── kv_pipe │ │ ├── __init__.py │ │ ├── base.py │ │ └── pynccl_pipe.py │ └── kv_transfer_agent.py ├── parallel_state.py └── utils.py ├── engine ├── __init__.py ├── arg_utils.py ├── async_llm_engine.py ├── async_timeout.py ├── llm_engine.py ├── metrics.py ├── metrics_types.py ├── multiprocessing │ ├── __init__.py │ ├── client.py │ └── engine.py ├── output_processor │ ├── __init__.py │ ├── interfaces.py │ ├── multi_step.py │ ├── single_step.py │ ├── stop_checker.py │ └── util.py └── protocol.py ├── entrypoints ├── __init__.py ├── api_server.py ├── chat_utils.py ├── launcher.py ├── llm.py ├── logger.py └── openai │ ├── __init__.py │ ├── api_server.py │ ├── cli_args.py │ ├── logits_processors.py │ ├── protocol.py │ ├── run_batch.py │ ├── serving_chat.py │ ├── serving_completion.py │ 
├── serving_embedding.py │ ├── serving_engine.py │ ├── serving_score.py │ ├── serving_tokenization.py │ └── tool_parsers │ ├── __init__.py │ ├── abstract_tool_parser.py │ ├── granite_20b_fc_tool_parser.py │ ├── granite_tool_parser.py │ ├── hermes_tool_parser.py │ ├── internlm2_tool_parser.py │ ├── jamba_tool_parser.py │ ├── llama_tool_parser.py │ ├── mistral_tool_parser.py │ ├── pythonic_tool_parser.py │ └── utils.py ├── envs.py ├── executor ├── __init__.py ├── cpu_executor.py ├── distributed_gpu_executor.py ├── executor_base.py ├── gpu_executor.py ├── hpu_executor.py ├── msgspec_utils.py ├── multiproc_gpu_executor.py ├── multiproc_worker_utils.py ├── multiproc_xpu_executor.py ├── neuron_executor.py ├── openvino_executor.py ├── ray_gpu_executor.py ├── ray_hpu_executor.py ├── ray_tpu_executor.py ├── ray_utils.py ├── ray_xpu_executor.py ├── tpu_executor.py └── xpu_executor.py ├── forward_context.py ├── inputs ├── __init__.py ├── data.py ├── parse.py ├── preprocess.py └── registry.py ├── logger.py ├── logging_utils ├── __init__.py └── formatter.py ├── logits_process.py ├── lora ├── __init__.py ├── fully_sharded_layers.py ├── layers.py ├── lora.py ├── models.py ├── ops │ ├── __init__.py │ ├── bgmv_expand.py │ ├── bgmv_expand_slice.py │ ├── bgmv_shrink.py │ ├── sgmv_expand.py │ ├── sgmv_expand_slice.py │ ├── sgmv_shrink.py │ └── utils.py ├── punica.py ├── request.py ├── utils.py └── worker_manager.py ├── model_executor ├── __init__.py ├── custom_op.py ├── guided_decoding │ ├── __init__.py │ ├── guided_fields.py │ ├── lm_format_enforcer_decoding.py │ ├── outlines_decoding.py │ └── outlines_logits_processors.py ├── layers │ ├── __init__.py │ ├── activation.py │ ├── fused_moe │ │ ├── __init__.py │ │ ├── configs │ │ │ ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json │ │ │ ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json │ │ │ ├── E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── 
E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json │ │ │ ├── E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=14336,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=1792,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=3584,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_L40S.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=7168,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ └── README │ │ ├── fused_marlin_moe.py │ │ ├── fused_moe.py │ │ ├── layer.py │ │ └── moe_pallas.py │ ├── layernorm.py │ ├── linear.py │ ├── logits_processor.py │ ├── mamba │ │ ├── __init__.py │ │ ├── mamba_mixer.py │ │ └── ops │ │ │ ├── __init__.py │ │ │ ├── causal_conv1d.py │ │ │ └── mamba_ssm.py │ ├── pooler.py │ ├── quantization │ │ ├── __init__.py │ │ ├── aqlm.py │ │ ├── awq.py │ │ ├── awq_marlin.py │ │ ├── awq_triton.py │ │ ├── base_config.py │ │ ├── bitsandbytes.py │ │ ├── compressed_tensors │ │ │ ├── __init__.py │ │ │ ├── compressed_tensors.py │ │ │ ├── compressed_tensors_moe.py │ │ │ ├── schemes │ │ │ │ ├── __init__.py │ │ │ │ ├── compressed_tensors_scheme.py │ │ │ │ ├── compressed_tensors_w4a16_24.py │ │ │ │ ├── compressed_tensors_w8a16_fp8.py │ │ │ │ ├── compressed_tensors_w8a8_fp8.py │ │ │ │ ├── compressed_tensors_w8a8_int8.py │ │ │ │ └── compressed_tensors_wNa16.py │ │ │ ├── triton_scaled_mm.py │ │ │ └── utils.py │ │ ├── deepspeedfp.py │ │ ├── experts_int8.py │ │ ├── fbgemm_fp8.py │ │ ├── fp8.py │ │ ├── gguf.py │ │ ├── gptq.py │ │ ├── gptq_marlin.py │ │ ├── gptq_marlin_24.py │ │ ├── hqq_marlin.py │ │ ├── ipex_quant.py │ │ ├── kernels │ │ │ ├── MPLinearKernel.py │ │ │ ├── __init__.py │ │ │ ├── exllama.py │ │ │ ├── machete.py │ │ │ └── marlin.py │ │ ├── kv_cache.py │ │ ├── marlin.py │ │ ├── modelopt.py │ │ ├── neuron_quant.py │ │ ├── qqq.py │ │ ├── schema.py │ │ ├── tpu_int8.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── layer_utils.py │ │ │ ├── machete_utils.py │ │ │ ├── marlin_utils.py │ │ │ ├── marlin_utils_fp8.py │ │ │ ├── 
marlin_utils_test.py │ │ │ ├── marlin_utils_test_24.py │ │ │ ├── marlin_utils_test_qqq.py │ │ │ ├── quant_utils.py │ │ │ └── w8a8_utils.py │ ├── rejection_sampler.py │ ├── resampler.py │ ├── rotary_embedding.py │ ├── sampler.py │ ├── spec_decode_base_sampler.py │ ├── typical_acceptance_sampler.py │ └── vocab_parallel_embedding.py ├── model_loader │ ├── __init__.py │ ├── loader.py │ ├── neuron.py │ ├── openvino.py │ ├── tensorizer.py │ ├── utils.py │ └── weight_utils.py ├── models │ ├── __init__.py │ ├── adapters.py │ ├── arctic.py │ ├── aria.py │ ├── baichuan.py │ ├── bart.py │ ├── bert.py │ ├── blip.py │ ├── blip2.py │ ├── bloom.py │ ├── chameleon.py │ ├── chatglm.py │ ├── clip.py │ ├── commandr.py │ ├── dbrx.py │ ├── decilm.py │ ├── deepseek.py │ ├── deepseek_v2.py │ ├── eagle.py │ ├── exaone.py │ ├── falcon.py │ ├── florence2.py │ ├── fuyu.py │ ├── gemma.py │ ├── gemma2.py │ ├── glm.py │ ├── glm4_vision_encoder.py │ ├── gpt2.py │ ├── gpt_bigcode.py │ ├── gpt_j.py │ ├── gpt_neox.py │ ├── granite.py │ ├── granitemoe.py │ ├── h2ovl.py │ ├── idefics2_vision_model.py │ ├── idefics3.py │ ├── interfaces.py │ ├── interfaces_base.py │ ├── intern_vit.py │ ├── internlm2.py │ ├── internlm2_ve.py │ ├── internvl.py │ ├── jais.py │ ├── jamba.py │ ├── llama.py │ ├── llava.py │ ├── llava_next.py │ ├── llava_next_video.py │ ├── llava_onevision.py │ ├── mamba.py │ ├── mamba_cache.py │ ├── medusa.py │ ├── minicpm.py │ ├── minicpm3.py │ ├── minicpmv.py │ ├── mixtral.py │ ├── mixtral_quant.py │ ├── mllama.py │ ├── mlp_speculator.py │ ├── module_mapping.py │ ├── molmo.py │ ├── mpt.py │ ├── nemotron.py │ ├── nvlm_d.py │ ├── olmo.py │ ├── olmo2.py │ ├── olmoe.py │ ├── opt.py │ ├── orion.py │ ├── paligemma.py │ ├── persimmon.py │ ├── phi.py │ ├── phi3.py │ ├── phi3_small.py │ ├── phi3v.py │ ├── phimoe.py │ ├── pixtral.py │ ├── qwen.py │ ├── qwen2.py │ ├── qwen2_audio.py │ ├── qwen2_cls.py │ ├── qwen2_moe.py │ ├── qwen2_rm.py │ ├── qwen2_vl.py │ ├── registry.py │ ├── roberta.py │ ├── siglip.py │ ├── solar.py │ ├── stablelm.py │ ├── starcoder2.py │ ├── telechat2.py │ ├── ultravox.py │ └── utils.py ├── parameter.py ├── pooling_metadata.py ├── sampling_metadata.py └── utils.py ├── multimodal ├── __init__.py ├── audio.py ├── base.py ├── image.py ├── inputs.py ├── processing.py ├── registry.py ├── utils.py └── video.py ├── outputs.py ├── platforms ├── __init__.py ├── cpu.py ├── cuda.py ├── hpu.py ├── interface.py ├── neuron.py ├── openvino.py ├── rocm.py ├── tpu.py └── xpu.py ├── plugins └── __init__.py ├── pooling_params.py ├── profiler ├── __init__.py ├── layerwise_profile.py └── utils.py ├── prompt_adapter ├── __init__.py ├── layers.py ├── models.py ├── request.py ├── utils.py └── worker_manager.py ├── py.typed ├── sampling_params.py ├── scalar_type.py ├── scripts.py ├── sequence.py ├── spec_decode ├── __init__.py ├── batch_expansion.py ├── draft_model_runner.py ├── interfaces.py ├── medusa_worker.py ├── metrics.py ├── mlp_speculator_worker.py ├── mqa_scorer.py ├── multi_step_worker.py ├── ngram_worker.py ├── proposer_worker_base.py ├── smaller_tp_proposer_worker.py ├── spec_decode_worker.py ├── target_model_runner.py ├── top1_proposer.py └── util.py ├── tracing.py ├── transformers_utils ├── __init__.py ├── config.py ├── configs │ ├── __init__.py │ ├── arctic.py │ ├── aria.py │ ├── chatglm.py │ ├── dbrx.py │ ├── eagle.py │ ├── exaone.py │ ├── falcon.py │ ├── h2ovl.py │ ├── internvl.py │ ├── jais.py │ ├── medusa.py │ ├── mllama.py │ ├── mlp_speculator.py │ ├── mpt.py │ ├── nemotron.py │ ├── nvlm_d.py │ ├── olmo2.py 
│ ├── solar.py │ ├── telechat2.py │ └── ultravox.py ├── detokenizer.py ├── detokenizer_utils.py ├── processor.py ├── tokenizer.py ├── tokenizer_group │ ├── __init__.py │ ├── base_tokenizer_group.py │ ├── ray_tokenizer_group.py │ └── tokenizer_group.py ├── tokenizers │ ├── __init__.py │ └── mistral.py └── utils.py ├── triton_utils ├── __init__.py ├── custom_cache_manager.py └── importing.py ├── usage ├── __init__.py └── usage_lib.py ├── utils.py ├── v1 ├── __init__.py ├── attention │ ├── __init__.py │ └── backends │ │ ├── __init__.py │ │ └── flash_attn.py ├── core │ ├── __init__.py │ ├── encoder_cache_manager.py │ ├── kv_cache_manager.py │ ├── kv_cache_utils.py │ └── scheduler.py ├── engine │ ├── __init__.py │ ├── async_llm.py │ ├── async_stream.py │ ├── core.py │ ├── core_client.py │ ├── detokenizer.py │ ├── llm_engine.py │ ├── mm_input_mapper.py │ └── processor.py ├── executor │ ├── __init__.py │ └── gpu_executor.py ├── outputs.py ├── request.py ├── sample │ ├── __init__.py │ ├── metadata.py │ └── sampler.py ├── serial_utils.py ├── utils.py └── worker │ ├── __init__.py │ ├── gpu_model_runner.py │ └── gpu_worker.py ├── version.py ├── vllm_flash_attn └── .gitkeep └── worker ├── __init__.py ├── cache_engine.py ├── cpu_enc_dec_model_runner.py ├── cpu_model_runner.py ├── cpu_pooling_model_runner.py ├── cpu_worker.py ├── enc_dec_model_runner.py ├── hpu_model_runner.py ├── hpu_worker.py ├── model_runner.py ├── model_runner_base.py ├── multi_step_model_runner.py ├── multi_step_tpu_worker.py ├── multi_step_worker.py ├── neuron_model_runner.py ├── neuron_worker.py ├── openvino_model_runner.py ├── openvino_worker.py ├── pooling_model_runner.py ├── tpu_model_runner.py ├── tpu_worker.py ├── utils.py ├── worker.py ├── worker_base.py ├── xpu_model_runner.py └── xpu_worker.py
/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml:
--------------------------------------------------------------------------------
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.671
  - name: "exact_match,flexible-extract"
    value: 0.664
limit: 1000
num_fewshot: 5
trust_remote_code: True
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.905
  - name: "exact_match,flexible-extract"
    value: 0.905
limit: 1000
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.892
  - name: "exact_match,flexible-extract"
    value: 0.892
limit: 250
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.752
  - name: "exact_match,flexible-extract"
    value: 0.754
limit: 1000
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.753
  - name: "exact_match,flexible-extract"
    value: 0.753
limit: 1000
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.755
  - name: "exact_match,flexible-extract"
    value: 0.755
limit: 1000
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.753
  - name: "exact_match,flexible-extract"
    value: 0.753
limit: 1000
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.764
  - name: "exact_match,flexible-extract"
    value: 0.764
limit: 250
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.728
  - name: "exact_match,flexible-extract"
    value: 0.728
limit: 250
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.758
  - name: "exact_match,flexible-extract"
    value: 0.759
limit: 1000
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.756
  - name: "exact_match,flexible-extract"
    value: 0.752
limit: 250
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.419
  - name: "exact_match,flexible-extract"
    value: 0.416
limit: 1000
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.356
  - name: "exact_match,flexible-extract"
    value: 0.358
limit: 1000
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
model_name: "mgoin/Minitron-4B-Base-FP8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.233
  - name: "exact_match,flexible-extract"
    value: 0.236
limit: 1000
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml:
--------------------------------------------------------------------------------
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.86
  - name: "exact_match,flexible-extract"
    value: 0.86
limit: 250
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml:
--------------------------------------------------------------------------------
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.624
  - name: "exact_match,flexible-extract"
    value: 0.624
limit: 250
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.616
  - name: "exact_match,flexible-extract"
    value: 0.632
limit: 250
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.578
  - name: "exact_match,flexible-extract"
    value: 0.585
limit: 1000
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.593
  - name: "exact_match,flexible-extract"
    value: 0.588
limit: 1000
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml:
--------------------------------------------------------------------------------
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.595
  - name: "exact_match,flexible-extract"
    value: 0.582
limit: 1000
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml:
--------------------------------------------------------------------------------
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.792
  - name: "exact_match,flexible-extract"
    value: 0.824
limit: 250
num_fewshot: 5
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/models-large.txt:
--------------------------------------------------------------------------------
Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml
--------------------------------------------------------------------------------
/.buildkite/lm-eval-harness/configs/models-small.txt:
--------------------------------------------------------------------------------
Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
Minitron-4B-Base-FP8.yaml
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
Qwen2-1.5B-Instruct-FP8W8.yaml
Meta-Llama-3-8B-QQQ.yaml
--------------------------------------------------------------------------------
/.buildkite/nightly-benchmarks/nightly-annotation.md:
--------------------------------------------------------------------------------

## Description

This file contains the download links for the benchmarking results.

- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
- [benchmarking results](artifact://results.zip)
- [benchmarking code](artifact://nightly-benchmarks.zip)

Please download the visualization scripts in the post.


## Results reproduction

- Find the docker image we use in the `benchmarking pipeline`.
- Deploy the docker image, and inside the container:
  - Download `nightly-benchmarks.zip`.
  - In the same folder, run the following code:
```
export HF_TOKEN=
apt update
apt install -y git
unzip nightly-benchmarks.zip
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
```

The results will be inside `./benchmarks/results`.

--------------------------------------------------------------------------------
/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py:
--------------------------------------------------------------------------------
import argparse

from transformers import AutoTokenizer


def main(model, cachedir):
    # Load the tokenizer and save it to the specified directory
    tokenizer = AutoTokenizer.from_pretrained(model)
    tokenizer.save_pretrained(cachedir)
    print(f"Tokenizer saved to {cachedir}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Download and save Hugging Face tokenizer")
    parser.add_argument("--model",
                        type=str,
                        required=True,
                        help="Name of the model")
    parser.add_argument("--cachedir",
                        type=str,
                        required=True,
                        help="Directory to save the tokenizer")

    args = parser.parse_args()
    main(args.model, args.cachedir)
--------------------------------------------------------------------------------
/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py:
--------------------------------------------------------------------------------
from lmdeploy.serve.openai.api_client import APIClient

api_client = APIClient("http://localhost:8000")
model_name = api_client.available_models[0]

print(model_name)
--------------------------------------------------------------------------------
/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh:
--------------------------------------------------------------------------------
#!/bin/sh
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"

TIMEOUT_SECONDS=10

retries=0
while [ $retries -lt 1000 ]; do
    if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
        exit 0
    fi

    echo "Waiting for image to be available..."
14 | 15 | retries=$((retries + 1)) 16 | sleep 5 17 | done 18 | 19 | exit 1 20 | -------------------------------------------------------------------------------- /.buildkite/nightly-benchmarks/tests/latency-tests.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "test_name": "latency_llama8B_tp1", 4 | "parameters": { 5 | "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", 6 | "tensor_parallel_size": 1, 7 | "load_format": "dummy", 8 | "num_iters_warmup": 5, 9 | "num_iters": 15 10 | } 11 | }, 12 | { 13 | "test_name": "latency_llama70B_tp4", 14 | "parameters": { 15 | "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", 16 | "tensor_parallel_size": 4, 17 | "load_format": "dummy", 18 | "num-iters-warmup": 5, 19 | "num-iters": 15 20 | } 21 | }, 22 | { 23 | "test_name": "latency_mixtral8x7B_tp2", 24 | "parameters": { 25 | "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", 26 | "tensor_parallel_size": 2, 27 | "load_format": "dummy", 28 | "num-iters-warmup": 5, 29 | "num-iters": 15 30 | } 31 | } 32 | ] -------------------------------------------------------------------------------- /.buildkite/nightly-benchmarks/tests/throughput-tests.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "test_name": "throughput_llama8B_tp1", 4 | "parameters": { 5 | "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", 6 | "tensor_parallel_size": 1, 7 | "load_format": "dummy", 8 | "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", 9 | "num_prompts": 200, 10 | "backend": "vllm" 11 | } 12 | }, 13 | { 14 | "test_name": "throughput_llama70B_tp4", 15 | "parameters": { 16 | "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", 17 | "tensor_parallel_size": 4, 18 | "load_format": "dummy", 19 | "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", 20 | "num_prompts": 200, 21 | "backend": "vllm" 22 | } 23 | }, 24 | { 25 | "test_name": "throughput_mixtral8x7B_tp2", 26 | "parameters": { 27 | "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", 28 | "tensor_parallel_size": 2, 29 | "load_format": "dummy", 30 | "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", 31 | "num_prompts": 200, 32 | "backend": "vllm" 33 | } 34 | } 35 | ] -------------------------------------------------------------------------------- /.buildkite/run-cpu-test-ppc64le.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script builds the CPU docker image and runs the offline inference inside the container. 4 | # It serves as a sanity check for compilation and basic model usage. 5 | set -ex 6 | 7 | # Setup cleanup 8 | remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; } 9 | trap remove_docker_container EXIT 10 | remove_docker_container 11 | 12 | # Try building the docker image 13 | docker build -t cpu-test -f Dockerfile.ppc64le . 14 | 15 | -------------------------------------------------------------------------------- /.buildkite/run-hpu-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script builds the HPU docker image and runs the offline inference inside the container. 4 | # It serves as a sanity check for compilation and basic model usage. 5 | set -ex 6 | 7 | # Try building the docker image 8 | docker build -t hpu-test-env -f Dockerfile.hpu .
9 | 10 | # Setup cleanup 11 | remove_docker_container() { docker rm -f hpu-test || true; } 12 | trap remove_docker_container EXIT 13 | remove_docker_container 14 | 15 | # Run the image and launch offline inference 16 | docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py -------------------------------------------------------------------------------- /.buildkite/run-openvino-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script builds the OpenVINO docker image and runs the offline inference inside the container. 4 | # It serves as a sanity check for compilation and basic model usage. 5 | set -ex 6 | 7 | # Try building the docker image 8 | docker build -t openvino-test -f Dockerfile.openvino . 9 | 10 | # Setup cleanup 11 | remove_docker_container() { docker rm -f openvino-test || true; } 12 | trap remove_docker_container EXIT 13 | remove_docker_container 14 | 15 | # Run the image and launch offline inference 16 | docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py 17 | -------------------------------------------------------------------------------- /.buildkite/run-tpu-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | # Build the docker image. 6 | docker build -f Dockerfile.tpu -t vllm-tpu . 7 | 8 | # Set up cleanup. 9 | remove_docker_container() { docker rm -f tpu-test || true; } 10 | trap remove_docker_container EXIT 11 | # Remove the container that might not be cleaned up in the previous run. 12 | remove_docker_container 13 | 14 | # For HF_TOKEN. 15 | source /etc/environment 16 | # Run a simple end-to-end example. 17 | docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py" 18 | -------------------------------------------------------------------------------- /.buildkite/run-xpu-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script builds the XPU docker image and runs the offline inference inside the container. 4 | # It serves as a sanity check for compilation and basic model usage. 5 | set -ex 6 | 7 | # Try building the docker image 8 | docker build -t xpu-test -f Dockerfile.xpu .
9 | 10 | # Setup cleanup 11 | remove_docker_container() { docker rm -f xpu-test || true; } 12 | trap remove_docker_container EXIT 13 | remove_docker_container 14 | 15 | # Run the image and launch offline inference 16 | docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py 17 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | UseTab: Never 3 | IndentWidth: 2 4 | ColumnLimit: 80 5 | 6 | # Force pointers to the type for C++. 7 | DerivePointerAlignment: false 8 | PointerAlignment: Left 9 | 10 | # Reordering #include statements can (and currently will) introduce errors 11 | SortIncludes: false 12 | 13 | # Style choices 14 | AlignConsecutiveAssignments: false 15 | AlignConsecutiveDeclarations: false 16 | IndentPPDirectives: BeforeHash 17 | 18 | IncludeCategories: 19 | - Regex: '^<' 20 | Priority: 4 21 | - Regex: '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/' 22 | Priority: 3 23 | - Regex: '^"(qoda|\.\.)/' 24 | Priority: 2 25 | - Regex: '.*' 26 | Priority: 1 27 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | /.venv 2 | /build 3 | dist 4 | vllm/*.so 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | .mypy_cache 12 | 13 | # Distribution / packaging 14 | .Python 15 | /build/ 16 | cmake-build-*/ 17 | CMakeUserPresets.json 18 | develop-eggs/ 19 | /dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [vllm-project] 2 | open_collective: vllm 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/100-documentation.yml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation 2 | description: Report an issue related to https://docs.vllm.ai/ 3 | title: "[Doc]: " 4 | labels: ["documentation"] 5 | 6 | body: 7 | - type: textarea 8 | attributes: 9 | label: 📚 The doc issue 10 | description: > 11 | A clear and concise description of what content in https://docs.vllm.ai/ is an issue. 12 | validations: 13 | required: true 14 | - type: textarea 15 | attributes: 16 | label: Suggest a potential alternative/fix 17 | description: > 18 | Tell us how we could improve the documentation in this regard. 19 | - type: markdown 20 | attributes: 21 | value: > 22 | Thanks for contributing 🎉! 23 | - type: checkboxes 24 | id: askllm 25 | attributes: 26 | label: Before submitting a new issue... 27 | options: 28 | - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. 
29 | required: true 30 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/800-misc discussion.yml: -------------------------------------------------------------------------------- 1 | name: 🎲 Misc/random discussions that do not fit into the above categories. 2 | description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues. 3 | title: "[Misc]: " 4 | labels: ["misc"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | - type: textarea 12 | attributes: 13 | label: Anything you want to discuss about vllm. 14 | description: > 15 | Anything you want to discuss about vllm. 16 | validations: 17 | required: true 18 | - type: markdown 19 | attributes: 20 | value: > 21 | Thanks for contributing 🎉! 22 | - type: checkboxes 23 | id: askllm 24 | attributes: 25 | label: Before submitting a new issue... 26 | options: 27 | - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. 28 | required: true 29 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | FILL IN THE PR DESCRIPTION HERE 2 | 3 | FIX #xxxx (*link existing issues this PR will resolve*) 4 | 5 | **BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html ** 6 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | # Maintain dependencies for GitHub Actions 4 | - package-ecosystem: "github-actions" 5 | directory: "/" 6 | schedule: 7 | interval: "weekly" 8 | - package-ecosystem: "pip" 9 | directory: "/" 10 | schedule: 11 | interval: "weekly" 12 | labels: ["dependencies"] 13 | open-pull-requests-limit: 5 14 | reviewers: ["khluu", "simon-mo"] 15 | allow: 16 | - dependency-type: "all" 17 | ignore: 18 | - dependency-name: "*" 19 | update-types: ["version-update:semver-patch"] 20 | - dependency-name: "torch" 21 | - dependency-name: "torchvision" 22 | - dependency-name: "xformers" 23 | - dependency-name: "lm-format-enforcer" 24 | - dependency-name: "gguf" 25 | - dependency-name: "compressed-tensors" 26 | - dependency-name: "ray[adag]" 27 | - dependency-name: "lm-eval" 28 | groups: 29 | minor-update: 30 | applies-to: version-updates 31 | update-types: ["minor"] 32 | -------------------------------------------------------------------------------- /.github/workflows/actionlint.yml: -------------------------------------------------------------------------------- 1 | name: Lint GitHub Actions workflows 2 | on: 3 | push: 4 | branches: 5 | - "main" 6 | paths: 7 | - '.github/workflows/*.ya?ml' 8 | - '.github/workflows/actionlint.*' 9 | - 
'.github/workflows/matchers/actionlint.json' 10 | pull_request: 11 | branches: 12 | - "main" 13 | paths: 14 | - '.github/workflows/*.ya?ml' 15 | - '.github/workflows/actionlint.*' 16 | - '.github/workflows/matchers/actionlint.json' 17 | 18 | env: 19 | LC_ALL: en_US.UTF-8 20 | 21 | defaults: 22 | run: 23 | shell: bash 24 | 25 | permissions: 26 | contents: read 27 | 28 | jobs: 29 | actionlint: 30 | runs-on: ubuntu-latest 31 | steps: 32 | - name: "Checkout" 33 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 34 | with: 35 | fetch-depth: 0 36 | 37 | - name: "Run actionlint" 38 | run: | 39 | echo "::add-matcher::.github/workflows/matchers/actionlint.json" 40 | tools/actionlint.sh -color 41 | -------------------------------------------------------------------------------- /.github/workflows/add_label_automerge.yml: -------------------------------------------------------------------------------- 1 | name: Add label on auto-merge enabled 2 | on: 3 | pull_request_target: 4 | types: 5 | - auto_merge_enabled 6 | jobs: 7 | add-label-on-auto-merge: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Add label 11 | uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 12 | with: 13 | script: | 14 | github.rest.issues.addLabels({ 15 | owner: context.repo.owner, 16 | repo: context.repo.repo, 17 | issue_number: context.issue.number, 18 | labels: ['ready'] 19 | }) 20 | env: 21 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 22 | -------------------------------------------------------------------------------- /.github/workflows/cleanup_pr_body.yml: -------------------------------------------------------------------------------- 1 | name: Cleanup PR Body 2 | 3 | on: 4 | pull_request_target: 5 | types: [opened, reopened, edited] 6 | 7 | permissions: 8 | pull-requests: write 9 | 10 | jobs: 11 | update-description: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - name: Checkout repository 16 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 17 | 18 | - name: Set up Python 19 | uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 20 | with: 21 | python-version: '3.12' 22 | 23 | - name: Update PR description 24 | env: 25 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 26 | run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}" 27 | -------------------------------------------------------------------------------- /.github/workflows/matchers/actionlint.json: -------------------------------------------------------------------------------- 1 | { 2 | "problemMatcher": [ 3 | { 4 | "owner": "actionlint", 5 | "pattern": [ 6 | { 7 | "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$", 8 | "file": 1, 9 | "line": 2, 10 | "column": 3, 11 | "message": 4, 12 | "code": 5 13 | } 14 | ] 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /.github/workflows/matchers/mypy.json: -------------------------------------------------------------------------------- 1 | { 2 | "problemMatcher": [ 3 | { 4 | "owner": "mypy", 5 | "pattern": [ 6 | { 7 | "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$", 8 | "file": 1, 9 | "line": 2, 10 | "severity": 3, 11 | "message": 4 12 | } 13 | ] 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /.github/workflows/matchers/ruff.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "problemMatcher": [ 3 | { 4 | "owner": "ruff", 5 | "pattern": [ 6 | { 7 | "regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$", 8 | "file": 1, 9 | "line": 2, 10 | "column": 3, 11 | "code": 4, 12 | "message": 5 13 | } 14 | ] 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /.github/workflows/png-lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint PNG exports from excalidraw 2 | on: 3 | push: 4 | branches: 5 | - "main" 6 | paths: 7 | - '*.excalidraw.png' 8 | - '.github/workflows/png-lint.yml' 9 | pull_request: 10 | branches: 11 | - "main" 12 | paths: 13 | - '*.excalidraw.png' 14 | - '.github/workflows/png-lint.yml' 15 | 16 | env: 17 | LC_ALL: en_US.UTF-8 18 | 19 | defaults: 20 | run: 21 | shell: bash 22 | 23 | permissions: 24 | contents: read 25 | 26 | jobs: 27 | actionlint: 28 | runs-on: ubuntu-latest 29 | steps: 30 | - name: "Checkout" 31 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 32 | with: 33 | fetch-depth: 0 34 | 35 | - name: "Run png-lint.sh to check excalidraw exported images" 36 | run: | 37 | tools/png-lint.sh 38 | -------------------------------------------------------------------------------- /.github/workflows/scripts/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | 4 | python_executable=python$1 5 | cuda_home=/usr/local/cuda-$2 6 | 7 | # Update paths 8 | PATH=${cuda_home}/bin:$PATH 9 | LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH 10 | 11 | # Install requirements 12 | $python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt 13 | 14 | # Limit the number of parallel jobs to avoid OOM 15 | export MAX_JOBS=1 16 | # Make sure release wheels are built for the following architectures 17 | export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" 18 | export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real" 19 | 20 | bash tools/check_repo.sh 21 | 22 | # Build 23 | $python_executable setup.py bdist_wheel --dist-dir=dist 24 | -------------------------------------------------------------------------------- /.github/workflows/scripts/create_release.js: -------------------------------------------------------------------------------- 1 | // Uses Github's API to create the release and wait for result. 2 | // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately. 3 | 4 | module.exports = async (github, context, core) => { 5 | try { 6 | const response = await github.rest.repos.createRelease({ 7 | draft: false, 8 | generate_release_notes: true, 9 | name: process.env.RELEASE_TAG, 10 | owner: context.repo.owner, 11 | prerelease: true, 12 | repo: context.repo.repo, 13 | tag_name: process.env.RELEASE_TAG, 14 | }); 15 | 16 | core.setOutput('upload_url', response.data.upload_url); 17 | } catch (error) { 18 | core.setFailed(error.message); 19 | } 20 | } -------------------------------------------------------------------------------- /.github/workflows/scripts/cuda-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Replace '.' with '-' ex: 11.8 -> 11-8 4 | cuda_version=$(echo "$1" | tr "." "-") 5 | # Removes '-' and '.' 
ex: ubuntu-20.04 -> ubuntu2004 6 | OS=$(echo "$2" | tr -d ".\-") 7 | 8 | # Installs CUDA 9 | wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb" 10 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 11 | rm cuda-keyring_1.1-1_all.deb 12 | sudo apt -qq update 13 | sudo apt -y install "cuda-${cuda_version}" "cuda-nvcc-${cuda_version}" "cuda-libraries-dev-${cuda_version}" 14 | sudo apt clean 15 | 16 | # Test nvcc 17 | PATH=/usr/local/cuda-$1/bin:${PATH} 18 | nvcc --version 19 | 20 | # Log gcc, g++, c++ versions 21 | gcc --version 22 | g++ --version 23 | c++ --version 24 | -------------------------------------------------------------------------------- /.github/workflows/scripts/pytorch-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python_executable=python$1 4 | pytorch_version=$2 5 | cuda_version=$3 6 | 7 | # Install torch 8 | $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya 9 | $python_executable -m pip install torch=="${pytorch_version}+cu${cuda_version//./}" --extra-index-url "https://download.pytorch.org/whl/cu${cuda_version//./}" 10 | 11 | # Print version information 12 | $python_executable --version 13 | $python_executable -c "import torch; print('PyTorch:', torch.__version__)" 14 | $python_executable -c "import torch; print('CUDA:', torch.version.cuda)" 15 | $python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)" 16 | -------------------------------------------------------------------------------- /.github/workflows/shellcheck.yml: -------------------------------------------------------------------------------- 1 | name: Lint shell scripts 2 | on: 3 | push: 4 | branches: 5 | - "main" 6 | paths: 7 | - '**/*.sh' 8 | - '.github/workflows/shellcheck.yml' 9 | pull_request: 10 | branches: 11 | - "main" 12 | paths: 13 | - '**/*.sh' 14 | - '.github/workflows/shellcheck.yml' 15 | 16 | env: 17 | LC_ALL: en_US.UTF-8 18 | 19 | defaults: 20 | run: 21 | shell: bash 22 | 23 | permissions: 24 | contents: read 25 | 26 | jobs: 27 | shellcheck: 28 | runs-on: ubuntu-latest 29 | steps: 30 | - name: "Checkout" 31 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 32 | with: 33 | fetch-depth: 0 34 | 35 | - name: "Check shell scripts" 36 | run: | 37 | tools/shellcheck.sh 38 | -------------------------------------------------------------------------------- /.github/workflows/sphinx-lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths: 8 | - "docs/**" 9 | pull_request: 10 | branches: 11 | - main 12 | paths: 13 | - "docs/**" 14 | 15 | jobs: 16 | sphinx-lint: 17 | runs-on: ubuntu-latest 18 | strategy: 19 | matrix: 20 | python-version: ["3.12"] 21 | steps: 22 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | pip install -r requirements-lint.txt 31 | - name: Linting docs 32 | run: tools/sphinx-lint.sh 33 | -------------------------------------------------------------------------------- 
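The lint workflows above (actionlint, shellcheck, png-lint, sphinx-lint) all defer to helper scripts under `tools/`. A rough sketch of running the same checks locally, assuming a full checkout in which the `tools/` scripts are executable and `requirements-lint.txt` supplies the Python-side linters:

```bash
# Local mirror of the CI lint steps shown in the workflows above
# (a sketch, not an official contributor workflow).
python -m pip install --upgrade pip
pip install -r requirements-lint.txt

tools/actionlint.sh -color   # lint GitHub Actions workflows
tools/shellcheck.sh          # lint shell scripts
tools/png-lint.sh            # check excalidraw PNG exports
tools/sphinx-lint.sh         # lint the Sphinx documentation
```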
/.github/workflows/yapf.yml: -------------------------------------------------------------------------------- 1 | name: yapf 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - main 9 | paths: 10 | - "**/*.py" 11 | - .github/workflows/yapf.yml 12 | pull_request: 13 | branches: 14 | - main 15 | paths: 16 | - "**/*.py" 17 | - .github/workflows/yapf.yml 18 | 19 | jobs: 20 | yapf: 21 | runs-on: ubuntu-latest 22 | strategy: 23 | matrix: 24 | python-version: ["3.12"] 25 | steps: 26 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 27 | - name: Set up Python ${{ matrix.python-version }} 28 | uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 29 | with: 30 | python-version: ${{ matrix.python-version }} 31 | - name: Install dependencies 32 | run: | 33 | python -m pip install --upgrade pip 34 | pip install yapf==0.32.0 35 | pip install toml==0.10.2 36 | - name: Running yapf 37 | run: | 38 | yapf --diff --recursive . 39 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | 6 | build: 7 | os: ubuntu-22.04 8 | tools: 9 | python: "3.12" 10 | 11 | sphinx: 12 | configuration: docs/source/conf.py 13 | fail_on_warning: true 14 | 15 | # If using Sphinx, optionally build your docs in additional formats such as PDF 16 | formats: [] 17 | 18 | # Optionally declare the Python requirements required to build your docs 19 | python: 20 | install: 21 | - requirements: docs/requirements-docs.txt 22 | -------------------------------------------------------------------------------- /.shellcheckrc: -------------------------------------------------------------------------------- 1 | # rules currently disabled: 2 | # 3 | # SC1091 (info): Not following: was not specified as input (see shellcheck -x) 4 | # SC2004 (style): $/${} is unnecessary on arithmetic variables. 5 | # SC2129 (style): Consider using { cmd1; cmd2; } >> file instead of individual redirects. 6 | # SC2155 (warning): Declare and assign separately to avoid masking return values. 7 | # SC2164 (warning): Use 'cd ... || exit' or 'cd ... || return' in case cd fails. 8 | # 9 | disable=SC1091,SC2004,SC2129,SC2155,SC2164 10 | -------------------------------------------------------------------------------- /.yapfignore: -------------------------------------------------------------------------------- 1 | collect_env.py 2 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to vLLM 2 | 3 | You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html). 
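As a local counterpart to the `yapf` workflow above (versions pinned as in that workflow's steps; a sketch, not official contributor guidance):

```bash
# Run the same formatting check the yapf workflow performs in CI.
pip install yapf==0.32.0 toml==0.10.2
yapf --diff --recursive .   # prints a diff for any file that needs reformatting
```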
4 | -------------------------------------------------------------------------------- /Dockerfile.hpu: -------------------------------------------------------------------------------- 1 | FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest 2 | 3 | COPY ./ /workspace/vllm 4 | 5 | WORKDIR /workspace/vllm 6 | 7 | RUN pip install -v -r requirements-hpu.txt 8 | 9 | ENV no_proxy=localhost,127.0.0.1 10 | ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true 11 | 12 | RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install 13 | 14 | # install development dependencies (for testing) 15 | RUN python3 -m pip install -e tests/vllm_test_utils 16 | 17 | WORKDIR /workspace/ 18 | 19 | RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks 20 | 21 | ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] 22 | -------------------------------------------------------------------------------- /Dockerfile.openvino: -------------------------------------------------------------------------------- 1 | # The vLLM Dockerfile is used to construct vLLM image that can be directly used 2 | # to run the OpenAI compatible server. 3 | 4 | FROM ubuntu:22.04 AS dev 5 | 6 | RUN apt-get update -y && \ 7 | apt-get install -y \ 8 | git python3-pip \ 9 | ffmpeg libsm6 libxext6 libgl1 10 | WORKDIR /workspace 11 | 12 | COPY . . 13 | ARG GIT_REPO_CHECK=0 14 | RUN --mount=type=bind,source=.git,target=.git \ 15 | if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi 16 | 17 | # install build requirements 18 | RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt 19 | # build vLLM with OpenVINO backend 20 | RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace 21 | 22 | COPY examples/ /workspace/examples 23 | COPY benchmarks/ /workspace/benchmarks 24 | 25 | # install development dependencies (for testing) 26 | RUN python3 -m pip install -e tests/vllm_test_utils 27 | 28 | CMD ["/bin/bash"] 29 | -------------------------------------------------------------------------------- /Dockerfile.tpu: -------------------------------------------------------------------------------- 1 | ARG NIGHTLY_DATE="20241017" 2 | ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE" 3 | 4 | FROM $BASE_IMAGE 5 | WORKDIR /workspace/vllm 6 | 7 | # Install some basic utilities 8 | RUN apt-get update && apt-get install -y \ 9 | git \ 10 | ffmpeg libsm6 libxext6 libgl1 11 | 12 | # Build vLLM. 13 | COPY . . 
14 | ARG GIT_REPO_CHECK=0 15 | RUN --mount=type=bind,source=.git,target=.git \ 16 | if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi 17 | 18 | ENV VLLM_TARGET_DEVICE="tpu" 19 | RUN --mount=type=cache,target=/root/.cache/pip \ 20 | --mount=type=bind,source=.git,target=.git \ 21 | python3 -m pip install \ 22 | -r requirements-tpu.txt 23 | RUN python3 setup.py develop 24 | 25 | # install development dependencies (for testing) 26 | RUN python3 -m pip install -e tests/vllm_test_utils 27 | 28 | CMD ["/bin/bash"] 29 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include requirements-common.txt 3 | include requirements-cuda.txt 4 | include requirements-rocm.txt 5 | include requirements-neuron.txt 6 | include requirements-cpu.txt 7 | include CMakeLists.txt 8 | 9 | recursive-include cmake * 10 | recursive-include csrc * 11 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Reporting a Vulnerability 4 | 5 | If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. 6 | 7 | Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). 8 | 9 | --- 10 | 11 | Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models. 12 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking vLLM 2 | 3 | ## Downloading the ShareGPT dataset 4 | 5 | You can download the dataset by running: 6 | ```bash 7 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json 8 | ``` 9 | 10 | ## Downloading the ShareGPT4V dataset 11 | 12 | The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts 13 | will ignore a datapoint if the referred image is missing. 
14 | ```bash 15 | wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json 16 | mkdir coco -p 17 | wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip 18 | unzip coco/train2017.zip -d coco/ 19 | ``` 20 | -------------------------------------------------------------------------------- /benchmarks/kernels/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas -------------------------------------------------------------------------------- /benchmarks/launch_tgi_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PORT=8000 4 | MODEL=$1 5 | TOKENS=$2 6 | 7 | docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \ 8 | -v "$PWD/data:/data" \ 9 | ghcr.io/huggingface/text-generation-inference:2.2.0 \ 10 | --model-id "$MODEL" \ 11 | --sharded false \ 12 | --max-input-length 1024 \ 13 | --max-total-tokens 2048 \ 14 | --max-best-of 5 \ 15 | --max-concurrent-requests 5000 \ 16 | --max-batch-total-tokens "$TOKENS" 17 | -------------------------------------------------------------------------------- /csrc/attention/attention_dtypes.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.cuh" 4 | #include "dtype_float16.cuh" 5 | #include "dtype_float32.cuh" 6 | #include "dtype_bfloat16.cuh" 7 | #include "dtype_fp8.cuh" 8 | -------------------------------------------------------------------------------- /csrc/attention/dtype_fp8.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.cuh" 4 | 5 | #include 6 | #ifdef ENABLE_FP8 7 | #ifndef USE_ROCM 8 | #include 9 | #endif // USE_ROCM 10 | #endif // ENABLE_FP8 11 | 12 | namespace vllm { 13 | 14 | enum class Fp8KVCacheDataType { 15 | kAuto = 0, 16 | kFp8E4M3 = 1, 17 | kFp8E5M2 = 2, 18 | }; 19 | 20 | // fp8 vector types for quantization of kv cache 21 | template <> 22 | struct Vec { 23 | using Type = uint8_t; 24 | }; 25 | 26 | template <> 27 | struct Vec { 28 | using Type = uint16_t; 29 | }; 30 | 31 | template <> 32 | struct Vec { 33 | using Type = uint32_t; 34 | }; 35 | 36 | template <> 37 | struct Vec { 38 | using Type = uint2; 39 | }; 40 | 41 | } // namespace vllm 42 | -------------------------------------------------------------------------------- /csrc/core/exception.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define VLLM_IMPLIES(p, q) (!(p) || (q)) 4 | -------------------------------------------------------------------------------- /csrc/core/registration.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #define _CONCAT(A, B) A##B 6 | #define CONCAT(A, B) _CONCAT(A, B) 7 | 8 | #define _STRINGIFY(A) #A 9 | #define STRINGIFY(A) _STRINGIFY(A) 10 | 11 | // A version of the TORCH_LIBRARY macro that expands the NAME, i.e. so NAME 12 | // could be a macro instead of a literal token. 13 | #define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE) 14 | 15 | // A version of the TORCH_LIBRARY_IMPL macro that expands the NAME, i.e. so NAME 16 | // could be a macro instead of a literal token. 
17 | #define TORCH_LIBRARY_IMPL_EXPAND(NAME, DEVICE, MODULE) \ 18 | TORCH_LIBRARY_IMPL(NAME, DEVICE, MODULE) 19 | 20 | // REGISTER_EXTENSION allows the shared library to be loaded and initialized 21 | // via python's import statement. 22 | #define REGISTER_EXTENSION(NAME) \ 23 | PyMODINIT_FUNC CONCAT(PyInit_, NAME)() { \ 24 | static struct PyModuleDef module = {PyModuleDef_HEAD_INIT, \ 25 | STRINGIFY(NAME), nullptr, 0, nullptr}; \ 26 | return PyModule_Create(&module); \ 27 | } 28 | -------------------------------------------------------------------------------- /csrc/cpu/cpu_types.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CPU_TYPES_HPP 2 | #define CPU_TYPES_HPP 3 | 4 | #if defined(__x86_64__) 5 | //x86 implementation 6 | #include "cpu_types_x86.hpp" 7 | #elif defined(__POWER9_VECTOR__) 8 | //ppc implementation 9 | #include "cpu_types_vsx.hpp" 10 | #elif defined(__aarch64__) 11 | //arm implementation 12 | #include "cpu_types_arm.hpp" 13 | #else 14 | #warning "unsupported vLLM cpu implementation" 15 | #endif 16 | 17 | #endif -------------------------------------------------------------------------------- /csrc/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if defined(__CUDACC__) || defined(_NVHPC_CUDA) 4 | #define HOST_DEVICE_INLINE __forceinline__ __host__ __device__ 5 | #define DEVICE_INLINE __forceinline__ __device__ 6 | #define HOST_INLINE __forceinline__ __host__ 7 | #else 8 | #define HOST_DEVICE_INLINE inline 9 | #define DEVICE_INLINE inline 10 | #define HOST_INLINE inline 11 | #endif 12 | 13 | int64_t get_device_attribute(int64_t attribute, int64_t device_id); 14 | 15 | int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id); 16 | -------------------------------------------------------------------------------- /csrc/cuda_utils_kernels.cu: -------------------------------------------------------------------------------- 1 | #ifdef USE_ROCM 2 | #include 3 | #include 4 | #endif 5 | int64_t get_device_attribute(int64_t attribute, int64_t device_id) { 6 | int device, value; 7 | if (device_id < 0) { 8 | cudaGetDevice(&device); 9 | } else { 10 | device = device_id; 11 | } 12 | cudaDeviceGetAttribute(&value, static_cast(attribute), 13 | device); 14 | return value; 15 | } 16 | 17 | int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id) { 18 | int64_t attribute; 19 | // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html 20 | // cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74 21 | 22 | #ifdef USE_ROCM 23 | attribute = hipDeviceAttributeMaxSharedMemoryPerBlock; 24 | #else 25 | attribute = cudaDevAttrMaxSharedMemoryPerBlockOptin; 26 | #endif 27 | 28 | return get_device_attribute(attribute, device_id); 29 | } 30 | -------------------------------------------------------------------------------- /csrc/cutlass_extensions/vllm_type_utils.cuh: -------------------------------------------------------------------------------- 1 | #include "cutlass/bfloat16.h" 2 | #include "cutlass/half.h" 3 | #include "cuda_bf16.h" 4 | 5 | #include "cutlass_extensions/vllm_custom_types.cuh" 6 | 7 | namespace cutlass { 8 | 9 | template 10 | struct nameof { 11 | static constexpr char const* value = "unknown"; 12 | }; 13 | 14 | template 15 | inline constexpr auto nameof_v = nameof::value; 16 | 17 | #define NAMEOF_TYPE(T) \ 18 | template <> \ 19 | struct nameof { \ 20 | static constexpr char const* value = #T; \ 21 | 
}; 22 | 23 | NAMEOF_TYPE(float_e4m3_t) 24 | NAMEOF_TYPE(float_e5m2_t) 25 | NAMEOF_TYPE(half_t) 26 | NAMEOF_TYPE(nv_bfloat16) 27 | NAMEOF_TYPE(bfloat16_t) 28 | NAMEOF_TYPE(float) 29 | 30 | NAMEOF_TYPE(int4b_t) 31 | NAMEOF_TYPE(int8_t) 32 | NAMEOF_TYPE(int32_t) 33 | NAMEOF_TYPE(int64_t) 34 | 35 | NAMEOF_TYPE(vllm_uint4b8_t) 36 | NAMEOF_TYPE(uint4b_t) 37 | NAMEOF_TYPE(uint8_t) 38 | NAMEOF_TYPE(vllm_uint8b128_t) 39 | NAMEOF_TYPE(uint32_t) 40 | NAMEOF_TYPE(uint64_t) 41 | 42 | }; // namespace cutlass -------------------------------------------------------------------------------- /csrc/mamba/mamba_ssm/static_switch.h: -------------------------------------------------------------------------------- 1 | // Inspired by 2 | // https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h 3 | // and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h 4 | 5 | // clang-format off 6 | // adapted from https://github.com/state-spaces/mamba/blob/main/csrc/selective_scan/static_switch.h 7 | #pragma once 8 | 9 | /// @param COND - a boolean expression to switch by 10 | /// @param CONST_NAME - a name given for the constexpr bool variable. 11 | /// @param ... - code to execute for true and false 12 | /// 13 | /// Usage: 14 | /// ``` 15 | /// BOOL_SWITCH(flag, BoolConst, [&] { 16 | /// some_function(...); 17 | /// }); 18 | /// ``` 19 | #define BOOL_SWITCH(COND, CONST_NAME, ...) \ 20 | [&] { \ 21 | if (COND) { \ 22 | constexpr bool CONST_NAME = true; \ 23 | return __VA_ARGS__(); \ 24 | } else { \ 25 | constexpr bool CONST_NAME = false; \ 26 | return __VA_ARGS__(); \ 27 | } \ 28 | }() 29 | -------------------------------------------------------------------------------- /csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu: -------------------------------------------------------------------------------- 1 | #include "marlin_moe_kernel_ku4.h" 2 | 3 | namespace marlin_moe { 4 | 5 | // We return bool so we can create these different kernel calls as a sequence 6 | // of if-elseif's. 7 | bool call_marlin_moe_kernel_ku4( 8 | vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, 9 | bool has_act_order, int group_blocks, int num_threads, int blocks, 10 | int max_shared_mem, cudaStream_t stream, const int4* A_ptr, 11 | const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, 12 | const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, 13 | const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, 14 | int expert_idx, int num_experts, int topk, int prob_m, int prob_n, 15 | int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, 16 | int m_block, int max_par, int cfg_max_m_blocks) { 17 | bool has_zp = true; 18 | 19 | if (false) { 20 | } 21 | AWQ_CALL_IF_MOE(vllm::kU4, 16, 4, 256) 22 | AWQ_CALL_IF_MOE(vllm::kU4, 8, 8, 256) 23 | AWQ_CALL_IF_MOE(vllm::kU4, 8, 4, 128) 24 | AWQ_CALL_IF_MOE(vllm::kU4, 4, 8, 128) 25 | else { 26 | return false; 27 | } 28 | return true; 29 | } 30 | 31 | } // namespace marlin_moe 32 | -------------------------------------------------------------------------------- /csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "marlin_moe_kernel.h" 4 | 5 | namespace marlin_moe { 6 | 7 | // We return bool so we can create these different kernel calls as a sequence 8 | // of if-elseif's. 
9 | bool call_marlin_moe_kernel_ku4( 10 | vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, 11 | bool has_act_order, int group_blocks, int num_threads, int blocks, 12 | int max_shared_mem, cudaStream_t stream, const int4* A_ptr, 13 | const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, 14 | const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, 15 | const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, 16 | int expert_idx, int num_experts, int topk, int prob_m, int prob_n, 17 | int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, 18 | int m_block, int max_par, int cfg_max_m_blocks); 19 | 20 | } // namespace marlin_moe 21 | -------------------------------------------------------------------------------- /csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu: -------------------------------------------------------------------------------- 1 | #include "marlin_moe_kernel_ku4b8.h" 2 | 3 | namespace marlin_moe { 4 | 5 | // We return bool so we can create these different kernel calls as a sequence 6 | // of if-elseif's. 7 | bool call_marlin_moe_kernel_ku4b8( 8 | vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, 9 | bool has_act_order, int group_blocks, int num_threads, int blocks, 10 | int max_shared_mem, cudaStream_t stream, const int4* A_ptr, 11 | const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, 12 | const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, 13 | const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, 14 | int expert_idx, int num_experts, int topk, int prob_m, int prob_n, 15 | int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, 16 | int m_block, int max_par, int cfg_max_m_blocks) { 17 | bool has_zp = false; 18 | 19 | if (false) { 20 | } 21 | GPTQ_CALL_IF_MOE(vllm::kU4B8, 16, 4, 256) 22 | GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 8, 256) 23 | GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 4, 128) 24 | GPTQ_CALL_IF_MOE(vllm::kU4B8, 4, 8, 128) 25 | else { 26 | return false; 27 | } 28 | return true; 29 | } 30 | 31 | } // namespace marlin_moe 32 | -------------------------------------------------------------------------------- /csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "marlin_moe_kernel.h" 4 | 5 | namespace marlin_moe { 6 | 7 | // We return bool so we can create these different kernel calls as a sequence 8 | // of if-elseif's. 
9 | bool call_marlin_moe_kernel_ku4b8( 10 | vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, 11 | bool has_act_order, int group_blocks, int num_threads, int blocks, 12 | int max_shared_mem, cudaStream_t stream, const int4* A_ptr, 13 | const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, 14 | const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, 15 | const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, 16 | int expert_idx, int num_experts, int topk, int prob_m, int prob_n, 17 | int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, 18 | int m_block, int max_par, int cfg_max_m_blocks); 19 | 20 | } // namespace marlin_moe 21 | -------------------------------------------------------------------------------- /csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu: -------------------------------------------------------------------------------- 1 | #include "marlin_moe_kernel_ku8b128.h" 2 | 3 | namespace marlin_moe { 4 | 5 | // We return bool so we can create these different kernel calls as a sequence 6 | // of if-elseif's. 7 | bool call_marlin_moe_kernel_ku8b128( 8 | vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, 9 | bool has_act_order, int group_blocks, int num_threads, int blocks, 10 | int max_shared_mem, cudaStream_t stream, const int4* A_ptr, 11 | const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, 12 | const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, 13 | const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, 14 | int expert_idx, int num_experts, int topk, int prob_m, int prob_n, 15 | int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, 16 | int m_block, int max_par, int cfg_max_m_blocks) { 17 | bool has_zp = false; 18 | 19 | if (false) { 20 | } 21 | GPTQ_CALL_IF_MOE(vllm::kU8B128, 16, 4, 256) 22 | GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 8, 256) 23 | GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 4, 128) 24 | GPTQ_CALL_IF_MOE(vllm::kU8B128, 4, 8, 128) 25 | else { 26 | return false; 27 | } 28 | return true; 29 | } 30 | 31 | } // namespace marlin_moe 32 | -------------------------------------------------------------------------------- /csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "marlin_moe_kernel.h" 4 | 5 | namespace marlin_moe { 6 | 7 | bool call_marlin_moe_kernel_ku8b128( 8 | vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, 9 | bool has_act_order, int group_blocks, int num_threads, int blocks, 10 | int max_shared_mem, cudaStream_t stream, const int4* A_ptr, 11 | const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, 12 | const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, 13 | const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, 14 | int expert_idx, int num_experts, int topk, int prob_m, int prob_n, 15 | int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, 16 | int m_block, int max_par, int cfg_max_m_blocks); 17 | 18 | } 19 | -------------------------------------------------------------------------------- /csrc/moe/moe_ops.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices, 6 | torch::Tensor& token_expert_indices, 7 | torch::Tensor& gating_output); 8 | 9 | void moe_sum(torch::Tensor& input, 
torch::Tensor& output); 10 | 11 | void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, 12 | int64_t block_size, torch::Tensor sorted_token_ids, 13 | torch::Tensor experts_ids, 14 | torch::Tensor num_tokens_post_pad); 15 | -------------------------------------------------------------------------------- /csrc/prepare_inputs/advance_step.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace prepare_inputs { 13 | 14 | static constexpr int max_threads = 256; 15 | static constexpr bool logging = false; 16 | 17 | constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; } 18 | 19 | } // namespace prepare_inputs 20 | -------------------------------------------------------------------------------- /csrc/quantization/cutlass_w8a8/common.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cutlass/cutlass.h" 4 | #include 5 | 6 | /** 7 | * Helper function for checking CUTLASS errors 8 | */ 9 | #define CUTLASS_CHECK(status) \ 10 | { \ 11 | TORCH_CHECK(status == cutlass::Status::kSuccess, \ 12 | cutlassGetStatusString(status)) \ 13 | } 14 | 15 | inline uint32_t next_pow_2(uint32_t const num) { 16 | if (num <= 1) return num; 17 | return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); 18 | } 19 | 20 | inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { 21 | int max_shared_mem_per_block_opt_in = 0; 22 | cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in, 23 | cudaDevAttrMaxSharedMemoryPerBlockOptin, 24 | device); 25 | return max_shared_mem_per_block_opt_in; 26 | } 27 | 28 | -------------------------------------------------------------------------------- /csrc/quantization/gptq/qdq_8.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Copied from https://github.com/turboderp/exllamav2 3 | */ 4 | 5 | #ifndef _qdq_8_cuh 6 | #define _qdq_8_cuh 7 | 8 | #include "qdq_util.cuh" 9 | 10 | namespace vllm { 11 | namespace gptq { 12 | 13 | __forceinline__ __device__ void shuffle_8bit_4(uint32_t* q, int stride) {} 14 | 15 | __forceinline__ __device__ void dequant_8bit_8(const uint32_t q_0, 16 | const uint32_t q_1, 17 | half2 (&dq)[4], int stride, 18 | const uint32_t zero) { 19 | half dqh[8]; 20 | for (int i = 0; i < 4; i++) dqh[i] = dq_ns(exb(q_0, i * 8, 0xff), zero); 21 | for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), zero); 22 | 23 | for (int i = 0; i < 4; i++) 24 | dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); 25 | } 26 | 27 | } // namespace gptq 28 | } // namespace vllm 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /csrc/rocm/ops.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums, 6 | torch::Tensor& max_logits, torch::Tensor& tmp_out, 7 | torch::Tensor& query, torch::Tensor& key_cache, 8 | torch::Tensor& value_cache, int64_t num_kv_heads, 9 | double scale, torch::Tensor& block_tables, 10 | torch::Tensor& context_lens, int64_t block_size, 11 | int64_t max_context_len, 12 | const c10::optional& alibi_slopes, 13 | const std::string& kv_cache_dtype, double k_scale, 14 | double v_scale); 15 | 
-------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # vLLM documents 2 | 3 | ## Build the docs 4 | 5 | ```bash 6 | # Install dependencies. 7 | pip install -r requirements-docs.txt 8 | 9 | # Build the docs. 10 | make clean 11 | make html 12 | ``` 13 | 14 | ## Open the docs with your browser 15 | 16 | ```bash 17 | python -m http.server -d build/html/ 18 | ``` 19 | Launch your browser and open localhost:8000. 20 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 
21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | sphinx==6.2.1 2 | sphinx-book-theme==1.0.1 3 | sphinx-copybutton==0.5.2 4 | myst-parser==2.0.0 5 | sphinx-argparse==0.4.0 6 | msgspec 7 | cloudpickle 8 | 9 | # packages to install to build the documentation 10 | pydantic >= 2.8 11 | -f https://download.pytorch.org/whl/cpu 12 | torch 13 | py-cpuinfo 14 | transformers 15 | mistral_common >= 1.3.4 16 | aiohttp 17 | starlette 18 | openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args 19 | partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args -------------------------------------------------------------------------------- /docs/source/_static/custom.js: -------------------------------------------------------------------------------- 1 | document.addEventListener("DOMContentLoaded", function () { 2 | var script = document.createElement("script"); 3 | script.type = "module"; 4 | script.id = "runllm-widget-script" 5 | 6 | script.src = "https://widget.runllm.com"; 7 | 8 | script.setAttribute("version", "stable"); 9 | script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget. 10 | script.setAttribute("runllm-name", "vLLM"); 11 | script.setAttribute("runllm-position", "BOTTOM_RIGHT"); 12 | script.setAttribute("runllm-position-y", "20%"); 13 | script.setAttribute("runllm-position-x", "3%"); 14 | script.setAttribute("runllm-assistant-id", "207"); 15 | 16 | script.async = true; 17 | document.head.appendChild(script); 18 | }); -------------------------------------------------------------------------------- /docs/source/_templates/sections/header.html: -------------------------------------------------------------------------------- 1 | 36 | 37 |
38 | You are viewing the latest developer preview docs. Click here to view docs for the latest stable release. 39 |
40 | -------------------------------------------------------------------------------- /docs/source/assets/design/arch_overview/entrypoints.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/design/arch_overview/entrypoints.excalidraw.png -------------------------------------------------------------------------------- /docs/source/assets/design/arch_overview/llm_engine.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/design/arch_overview/llm_engine.excalidraw.png -------------------------------------------------------------------------------- /docs/source/assets/design/hierarchy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/design/hierarchy.png -------------------------------------------------------------------------------- /docs/source/assets/dev/dockerfile-stages-dependency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/dev/dockerfile-stages-dependency.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/k_vecs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/kernel/k_vecs.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/key.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/kernel/key.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/logits_vec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/kernel/logits_vec.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/q_vecs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/kernel/q_vecs.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/query.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/kernel/query.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/v_vec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/kernel/v_vec.png -------------------------------------------------------------------------------- 
/docs/source/assets/kernel/value.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/kernel/value.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-only-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/logos/vllm-logo-only-light.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-text-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/logos/vllm-logo-text-dark.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-text-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/docs/source/assets/logos/vllm-logo-text-light.png -------------------------------------------------------------------------------- /docs/source/community/sponsors.md: -------------------------------------------------------------------------------- 1 | # Sponsors 2 | 3 | vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support! 4 | 5 | 6 | 7 | 8 | - a16z 9 | - AMD 10 | - Anyscale 11 | - AWS 12 | - Crusoe Cloud 13 | - Databricks 14 | - DeepInfra 15 | - Dropbox 16 | - Google Cloud 17 | - Lambda Lab 18 | - Nebius 19 | - NVIDIA 20 | - Replicate 21 | - Roblox 22 | - RunPod 23 | - Sequoia Capital 24 | - Skywork AI 25 | - Trainy 26 | - UC Berkeley 27 | - UC San Diego 28 | - ZhenFund 29 | 30 | We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM. 31 | -------------------------------------------------------------------------------- /docs/source/design/input_processing/input_processing_pipeline.rst: -------------------------------------------------------------------------------- 1 | .. _input_processing_pipeline: 2 | 3 | Input Processing Pipeline 4 | ========================= 5 | 6 | 1. Input data is passed to :class:`~vllm.LLMEngine` (or :class:`~vllm.AsyncLLMEngine`). 7 | 8 | 2. Tokenize the data if necessary. 9 | 10 | 3. Process the inputs using :meth:`INPUT_REGISTRY.process_input `. 11 | 12 | - For example, add placeholder tokens to reserve KV cache for multi-modal embeddings. 13 | 14 | 4. Send the processed inputs to :class:`~vllm.executor.executor_base.ExecutorBase`. 15 | 16 | 5. Distribute the inputs via :class:`~vllm.worker.worker_base.WorkerBase` to :class:`~vllm.worker.model_runner_base.ModelRunnerBase`. 17 | 18 | 6. If the data contains multi-modal data, convert it into keyword arguments using :meth:`MULTIMODAL_REGISTRY.map_input `. 19 | 20 | - For example, convert a :class:`PIL.Image.Image` input to its pixel values for a vision model. 
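The pipeline above runs automatically whenever a prompt is submitted to the engine. As a minimal end-to-end sketch from the user side (assuming a LLaVA-style vision-language model; the model name, prompt format, and image path below are illustrative placeholders based on vLLM's multi-modal examples):

.. code-block:: python

   from PIL import Image

   from vllm import LLM, SamplingParams

   # Assumed model; any vLLM-supported vision-language model works similarly.
   llm = LLM(model="llava-hf/llava-1.5-7b-hf")
   image = Image.open("example.jpg")  # placeholder image path

   # Step 1: this dict is the input data handed to the LLMEngine.
   # Steps 2-3 tokenize the prompt and reserve placeholder tokens for the image;
   # step 6 later maps the PIL image to pixel values for the vision encoder.
   outputs = llm.generate(
       {
           "prompt": "USER: <image>\nWhat is shown in this image?\nASSISTANT:",
           "multi_modal_data": {"image": image},
       },
       SamplingParams(temperature=0.0, max_tokens=64),
   )
   print(outputs[0].outputs[0].text)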
21 | -------------------------------------------------------------------------------- /docs/source/design/input_processing/model_inputs_index.rst: -------------------------------------------------------------------------------- 1 | .. _input_processing: 2 | 3 | Input Processing 4 | ================ 5 | 6 | .. currentmodule:: vllm.inputs 7 | 8 | Each model can override parts of vLLM's :ref:`input processing pipeline ` via 9 | :data:`~vllm.inputs.INPUT_REGISTRY` and :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`. 10 | 11 | Currently, this mechanism is only utilized in :ref:`multi-modal ` models for preprocessing multi-modal input 12 | data in addition to input prompt, but it can be extended to text-only language models when needed. 13 | 14 | Guides 15 | ++++++ 16 | 17 | .. toctree:: 18 | :maxdepth: 1 19 | 20 | input_processing_pipeline 21 | 22 | Module Contents 23 | +++++++++++++++ 24 | 25 | LLM Engine Inputs 26 | ----------------- 27 | 28 | .. autoclass:: vllm.inputs.DecoderOnlyInputs 29 | :members: 30 | :show-inheritance: 31 | 32 | Registry 33 | -------- 34 | 35 | .. autodata:: vllm.inputs.INPUT_REGISTRY 36 | 37 | .. automodule:: vllm.inputs.registry 38 | :members: 39 | :show-inheritance: 40 | -------------------------------------------------------------------------------- /docs/source/design/multimodal/adding_multimodal_plugin.rst: -------------------------------------------------------------------------------- 1 | .. _adding_multimodal_plugin: 2 | 3 | Adding a Multimodal Plugin 4 | ========================== 5 | 6 | This document teaches you how to add a new modality to vLLM. 7 | 8 | Each modality in vLLM is represented by a :class:`~vllm.multimodal.MultiModalPlugin` and registered to :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`. 9 | For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to :meth:`~vllm.multimodal.MultiModalRegistry.register_plugin`. 10 | 11 | The remainder of this document details how to define custom :class:`~vllm.multimodal.MultiModalPlugin` s. 12 | 13 | .. note:: 14 | This article is a work in progress. 15 | 16 | .. 17 | TODO: Add more instructions on how to add new plugins once embeddings is in. 18 | -------------------------------------------------------------------------------- /docs/source/dev/engine/async_llm_engine.rst: -------------------------------------------------------------------------------- 1 | AsyncLLMEngine 2 | ================================= 3 | 4 | .. autoclass:: vllm.AsyncLLMEngine 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/source/dev/engine/engine_index.rst: -------------------------------------------------------------------------------- 1 | vLLM Engine 2 | ================================= 3 | 4 | .. automodule:: vllm.engine 5 | .. currentmodule:: vllm.engine 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | :caption: Engines 10 | 11 | llm_engine 12 | async_llm_engine 13 | 14 | -------------------------------------------------------------------------------- /docs/source/dev/engine/llm_engine.rst: -------------------------------------------------------------------------------- 1 | LLMEngine 2 | ================================= 3 | 4 | .. 
autoclass:: vllm.LLMEngine 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/source/dev/offline_inference/llm.rst: -------------------------------------------------------------------------------- 1 | LLM Class 2 | ========= 3 | 4 | .. autoclass:: vllm.LLM 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/source/dev/offline_inference/llm_inputs.rst: -------------------------------------------------------------------------------- 1 | LLM Inputs 2 | ========== 3 | 4 | .. autodata:: vllm.inputs.PromptType 5 | 6 | .. autoclass:: vllm.inputs.TextPrompt 7 | :show-inheritance: 8 | :members: 9 | :member-order: bysource 10 | 11 | .. autoclass:: vllm.inputs.TokensPrompt 12 | :show-inheritance: 13 | :members: 14 | :member-order: bysource 15 | -------------------------------------------------------------------------------- /docs/source/dev/offline_inference/offline_index.rst: -------------------------------------------------------------------------------- 1 | Offline Inference 2 | ================================= 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | 7 | llm 8 | llm_inputs 9 | -------------------------------------------------------------------------------- /docs/source/dev/pooling_params.rst: -------------------------------------------------------------------------------- 1 | Pooling Parameters 2 | ================== 3 | 4 | .. autoclass:: vllm.PoolingParams 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/dev/sampling_params.rst: -------------------------------------------------------------------------------- 1 | Sampling Parameters 2 | =================== 3 | 4 | .. autoclass:: vllm.SamplingParams 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/getting_started/examples/examples_index.template.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | ================================= 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | :caption: Scripts 7 | 8 | %EXAMPLE_DOCS% 9 | -------------------------------------------------------------------------------- /docs/source/models/engine_args.rst: -------------------------------------------------------------------------------- 1 | .. _engine_args: 2 | 3 | Engine Arguments 4 | ================ 5 | 6 | Below, you can find an explanation of every engine argument for vLLM: 7 | 8 | .. argparse:: 9 | :module: vllm.engine.arg_utils 10 | :func: _engine_args_parser 11 | :prog: vllm serve 12 | :nodefaultconst: 13 | 14 | Async Engine Arguments 15 | ---------------------- 16 | 17 | Below are the additional arguments related to the asynchronous engine: 18 | 19 | .. argparse:: 20 | :module: vllm.engine.arg_utils 21 | :func: _async_engine_args_parser 22 | :prog: vllm serve 23 | :nodefaultconst: -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_bentoml.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_bentoml: 2 | 3 | Deploying with BentoML 4 | ====================== 5 | 6 | `BentoML `_ allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes.
7 | 8 | For details, see the tutorial `vLLM inference in the BentoML documentation `_. -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_kserve.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_kserve: 2 | 3 | Deploying with KServe 4 | ============================ 5 | 6 | vLLM can be deployed with `KServe `_ on Kubernetes for highly scalable distributed model serving. 7 | 8 | Please see `this guide `_ for more details on using vLLM with KServe. 9 | -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_lws.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_lws: 2 | 3 | Deploying with LWS 4 | ============================ 5 | 6 | LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads. 7 | A major use case is for multi-host/multi-node distributed inference. 8 | 9 | vLLM can be deployed with `LWS `_ on Kubernetes for distributed model serving. 10 | 11 | Please see `this guide `_ for more details on 12 | deploying vLLM on Kubernetes using LWS. 13 | -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_triton.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_triton: 2 | 3 | Deploying with NVIDIA Triton 4 | ============================ 5 | 6 | The `Triton Inference Server `_ hosts a tutorial demonstrating how to quickly deploy a simple `facebook/opt-125m `_ model using vLLM. Please see `Deploying a vLLM model in Triton `_ for more details. 7 | -------------------------------------------------------------------------------- /docs/source/serving/env_vars.rst: -------------------------------------------------------------------------------- 1 | Environment Variables 2 | ======================== 3 | 4 | vLLM uses the following environment variables to configure the system: 5 | 6 | .. warning:: 7 | Please note that ``VLLM_PORT`` and ``VLLM_HOST_IP`` set the port and IP address for vLLM's **internal usage**. They are not the port and IP address of the API server. If you use ``--host $VLLM_HOST_IP`` and ``--port $VLLM_PORT`` to start the API server, it will not work. 8 | 9 | All environment variables used by vLLM are prefixed with ``VLLM_``. **Special care should be taken for Kubernetes users**: please do not name the service ``vllm``, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because `Kubernetes sets environment variables for each service with the capitalized service name as the prefix `_. 10 | 11 | .. literalinclude:: ../../../vllm/envs.py 12 | :language: python 13 | :start-after: begin-env-vars-definition 14 | :end-before: end-env-vars-definition 15 | -------------------------------------------------------------------------------- /docs/source/serving/integrations.rst: -------------------------------------------------------------------------------- 1 | Integrations 2 | ------------ 3 | 4 | ..
toctree:: 5 | :maxdepth: 1 6 | 7 | run_on_sky 8 | deploying_with_kserve 9 | deploying_with_triton 10 | deploying_with_bentoml 11 | deploying_with_cerebrium 12 | deploying_with_lws 13 | deploying_with_dstack 14 | serving_with_langchain 15 | serving_with_llamaindex 16 | serving_with_llamastack 17 | -------------------------------------------------------------------------------- /docs/source/serving/serving_with_langchain.rst: -------------------------------------------------------------------------------- 1 | .. _run_on_langchain: 2 | 3 | Serving with Langchain 4 | ============================ 5 | 6 | vLLM is also available via `Langchain `_. 7 | 8 | To install langchain, run 9 | 10 | .. code-block:: console 11 | 12 | $ pip install langchain langchain_community -q 13 | 14 | To run inference on a single GPU or multiple GPUs, use the ``VLLM`` class from ``langchain``. 15 | 16 | .. code-block:: python 17 | 18 | from langchain_community.llms import VLLM 19 | 20 | llm = VLLM(model="mosaicml/mpt-7b", 21 | trust_remote_code=True, # mandatory for hf models 22 | max_new_tokens=128, 23 | top_k=10, 24 | top_p=0.95, 25 | temperature=0.8, 26 | # tensor_parallel_size=... # for distributed inference 27 | ) 28 | 29 | print(llm("What is the capital of France ?")) 30 | 31 | Please refer to this `Tutorial `_ for more details. 32 | -------------------------------------------------------------------------------- /docs/source/serving/serving_with_llamaindex.rst: -------------------------------------------------------------------------------- 1 | .. _run_on_llamaindex: 2 | 3 | Serving with llama_index 4 | ============================ 5 | 6 | vLLM is also available via `llama_index `_. 7 | 8 | To install llamaindex, run 9 | 10 | .. code-block:: console 11 | 12 | $ pip install llama-index-llms-vllm -q 13 | 14 | To run inference on a single GPU or multiple GPUs, use the ``Vllm`` class from ``llamaindex``. 15 | 16 | .. code-block:: python 17 | 18 | from llama_index.llms.vllm import Vllm 19 | 20 | llm = Vllm( 21 | model="microsoft/Orca-2-7b", 22 | tensor_parallel_size=4, 23 | max_new_tokens=100, 24 | vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5}, 25 | ) 26 | 27 | Please refer to this `Tutorial `_ for more details. 28 | -------------------------------------------------------------------------------- /docs/source/serving/tensorizer.rst: -------------------------------------------------------------------------------- 1 | .. _tensorizer: 2 | 3 | Loading Models with CoreWeave's Tensorizer 4 | ========================================== 5 | vLLM supports loading models with `CoreWeave's Tensorizer `_. 6 | vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or an S3 endpoint can be deserialized 7 | at runtime extremely quickly, directly to the GPU, resulting in significantly 8 | shorter Pod startup times and lower CPU memory usage. Tensor encryption is also supported. 9 | 10 | For more information on CoreWeave's Tensorizer, please refer to 11 | `CoreWeave's Tensorizer documentation `_. For more information on serializing a vLLM model, as well as a general usage guide to using Tensorizer with vLLM, see 12 | the `vLLM example script `_. 13 | 14 | .. note:: 15 | Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`.
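As a minimal loading sketch (assuming a model has already been serialized with Tensorizer; the S3 URI below is a hypothetical placeholder, and ``TensorizerConfig`` follows the usage shown in vLLM's tensorizer example script):

.. code-block:: python

   from vllm import LLM
   from vllm.model_executor.model_loader.tensorizer import TensorizerConfig

   # Hypothetical location of previously serialized model tensors.
   tensorizer_uri = "s3://my-bucket/vllm/facebook/opt-125m/v1.0.0/model.tensors"

   llm = LLM(
       model="facebook/opt-125m",
       load_format="tensorizer",
       model_loader_extra_config=TensorizerConfig(tensorizer_uri=tensorizer_uri),
   )
   print(llm.generate("Hello, my name is")[0].outputs[0].text)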
16 | -------------------------------------------------------------------------------- /examples/cpu_offload.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | # Sample prompts. 4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | # Create a sampling params object. 11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 12 | 13 | # Create an LLM. 14 | llm = LLM(model="meta-llama/Llama-2-13b-chat-hf", cpu_offload_gb=10) 15 | # Generate texts from the prompts. The output is a list of RequestOutput objects 16 | # that contain the prompt, generated text, and other information. 17 | outputs = llm.generate(prompts, sampling_params) 18 | # Print the outputs. 19 | for output in outputs: 20 | prompt = output.prompt 21 | generated_text = output.outputs[0].text 22 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 23 | -------------------------------------------------------------------------------- /examples/offline_inference.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | # Sample prompts. 4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | # Create a sampling params object. 11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 12 | 13 | # Create an LLM. 14 | llm = LLM(model="facebook/opt-125m") 15 | # Generate texts from the prompts. The output is a list of RequestOutput objects 16 | # that contain the prompt, generated text, and other information. 17 | outputs = llm.generate(prompts, sampling_params) 18 | # Print the outputs. 19 | for output in outputs: 20 | prompt = output.prompt 21 | generated_text = output.outputs[0].text 22 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") -------------------------------------------------------------------------------- /examples/offline_inference_arctic.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | # Sample prompts. 4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | # Create a sampling params object. 11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 12 | 13 | # Create an LLM. 14 | llm = LLM(model="snowflake/snowflake-arctic-instruct", 15 | quantization="deepspeedfp", 16 | tensor_parallel_size=8, 17 | trust_remote_code=True) 18 | # Generate texts from the prompts. The output is a list of RequestOutput objects 19 | # that contain the prompt, generated text, and other information. 20 | 21 | outputs = llm.generate(prompts, sampling_params) 22 | # Print the outputs. 23 | for output in outputs: 24 | prompt = output.prompt 25 | generated_text = output.outputs[0].text 26 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 27 | -------------------------------------------------------------------------------- /examples/offline_inference_embedding.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM 2 | 3 | # Sample prompts. 
4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | 11 | # Create an LLM. 12 | model = LLM(model="intfloat/e5-mistral-7b-instruct", enforce_eager=True) 13 | # Generate embeddings. The output is a list of PoolingRequestOutputs. 14 | outputs = model.encode(prompts) 15 | # Print the outputs. 16 | for output in outputs: 17 | print(output.outputs.embedding) # list of 4096 floats 18 | -------------------------------------------------------------------------------- /examples/offline_inference_tpu.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | prompts = [ 4 | "A robot may not injure a human being", 5 | "It is only with the heart that one can see rightly;", 6 | "The greatest glory in living lies not in never falling,", 7 | ] 8 | answers = [ 9 | " or, through inaction, allow a human being to come to harm.", 10 | " what is essential is invisible to the eye.", 11 | " but in rising every time we fall.", 12 | ] 13 | N = 1 14 | # Currently, top-p sampling is disabled. `top_p` should be 1.0. 15 | sampling_params = SamplingParams(temperature=0.7, 16 | top_p=1.0, 17 | n=N, 18 | max_tokens=16) 19 | 20 | # Set `enforce_eager=True` to avoid ahead-of-time compilation. 21 | # In real workloads, `enforce_eager` should be `False`. 22 | llm = LLM(model="google/gemma-2b", enforce_eager=True) 23 | outputs = llm.generate(prompts, sampling_params) 24 | for output, answer in zip(outputs, answers): 25 | prompt = output.prompt 26 | generated_text = output.outputs[0].text 27 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 28 | assert generated_text.startswith(answer) 29 | -------------------------------------------------------------------------------- /examples/offline_inference_with_profiler.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from vllm import LLM, SamplingParams 4 | 5 | # Enable the torch profiler; this can also be set on the command line. 6 | os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile" 7 | 8 | # Sample prompts. 9 | prompts = [ 10 | "Hello, my name is", 11 | "The president of the United States is", 12 | "The capital of France is", 13 | "The future of AI is", 14 | ] 15 | # Create a sampling params object. 16 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 17 | 18 | # Create an LLM. 19 | llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1) 20 | 21 | llm.start_profile() 22 | 23 | # Generate texts from the prompts. The output is a list of RequestOutput objects 24 | # that contain the prompt, generated text, and other information. 25 | outputs = llm.generate(prompts, sampling_params) 26 | 27 | llm.stop_profile() 28 | 29 | # Print the outputs. 30 | for output in outputs: 31 | prompt = output.prompt 32 | generated_text = output.outputs[0].text 33 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 34 | -------------------------------------------------------------------------------- /examples/openai_chat_completion_client.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server.
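# This example assumes an OpenAI-compatible vLLM server is already running
# locally on port 8000, e.g. started with (the model name is a placeholder):
#   vllm serve <your-model> --port 8000
# or equivalently:
#   python -m vllm.entrypoints.openai.api_server --model <your-model> --port 8000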
4 | openai_api_key = "EMPTY" 5 | openai_api_base = "http://localhost:8000/v1" 6 | 7 | client = OpenAI( 8 | # defaults to os.environ.get("OPENAI_API_KEY") 9 | api_key=openai_api_key, 10 | base_url=openai_api_base, 11 | ) 12 | 13 | models = client.models.list() 14 | model = models.data[0].id 15 | 16 | chat_completion = client.chat.completions.create( 17 | messages=[{ 18 | "role": "system", 19 | "content": "You are a helpful assistant." 20 | }, { 21 | "role": "user", 22 | "content": "Who won the world series in 2020?" 23 | }, { 24 | "role": 25 | "assistant", 26 | "content": 27 | "The Los Angeles Dodgers won the World Series in 2020." 28 | }, { 29 | "role": "user", 30 | "content": "Where was it played?" 31 | }], 32 | model=model, 33 | ) 34 | 35 | print("Chat completion results:") 36 | print(chat_completion) 37 | -------------------------------------------------------------------------------- /examples/openai_completion_client.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 4 | openai_api_key = "EMPTY" 5 | openai_api_base = "http://localhost:8000/v1" 6 | 7 | client = OpenAI( 8 | # defaults to os.environ.get("OPENAI_API_KEY") 9 | api_key=openai_api_key, 10 | base_url=openai_api_base, 11 | ) 12 | 13 | models = client.models.list() 14 | model = models.data[0].id 15 | 16 | # Completion API 17 | stream = False 18 | completion = client.completions.create( 19 | model=model, 20 | prompt="A robot may not injure a human being", 21 | echo=False, 22 | n=2, 23 | stream=stream, 24 | logprobs=3) 25 | 26 | print("Completion results:") 27 | if stream: 28 | for c in completion: 29 | print(c) 30 | else: 31 | print(completion) 32 | -------------------------------------------------------------------------------- /examples/openai_embedding_client.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 
4 | openai_api_key = "EMPTY" 5 | openai_api_base = "http://localhost:8000/v1" 6 | 7 | client = OpenAI( 8 | # defaults to os.environ.get("OPENAI_API_KEY") 9 | api_key=openai_api_key, 10 | base_url=openai_api_base, 11 | ) 12 | 13 | models = client.models.list() 14 | model = models.data[0].id 15 | 16 | responses = client.embeddings.create( 17 | input=[ 18 | "Hello my name is", 19 | "The best thing about vLLM is that it supports many different models" 20 | ], 21 | model=model, 22 | ) 23 | 24 | for data in responses.data: 25 | print(data.embedding) # list of float of len 4096 26 | -------------------------------------------------------------------------------- /examples/openai_example_batch.jsonl: -------------------------------------------------------------------------------- 1 | {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} 2 | {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} 3 | -------------------------------------------------------------------------------- /examples/production_monitoring/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # docker-compose.yaml 2 | version: "3" 3 | 4 | services: 5 | prometheus: 6 | image: prom/prometheus:latest 7 | extra_hosts: 8 | - "host.docker.internal:host-gateway" # allow a direct connection from container to the local machine 9 | ports: 10 | - "9090:9090" # the default port used by Prometheus 11 | volumes: 12 | - ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml # mount Prometheus config file 13 | 14 | grafana: 15 | image: grafana/grafana:latest 16 | depends_on: 17 | - prometheus 18 | ports: 19 | - "3000:3000" # the default port used by Grafana 20 | -------------------------------------------------------------------------------- /examples/production_monitoring/prometheus.yaml: -------------------------------------------------------------------------------- 1 | # prometheus.yaml 2 | global: 3 | scrape_interval: 5s 4 | evaluation_interval: 30s 5 | 6 | scrape_configs: 7 | - job_name: vllm 8 | static_configs: 9 | - targets: 10 | - 'host.docker.internal:8000' 11 | -------------------------------------------------------------------------------- /examples/template_alpaca.jinja: -------------------------------------------------------------------------------- 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 2 | 3 | {% for message in messages %} 4 | {% if message['role'] == 'user' %} 5 | ### Instruction: 6 | {{ message['content']|trim -}} 7 | {% if not loop.last %} 8 | 9 | 10 | {% endif %} 11 | {% elif message['role'] == 'assistant' %} 12 | ### Response: 13 | {{ message['content']|trim -}} 14 | {% if not loop.last %} 15 | 16 | 17 | {% endif %} 18 | {% elif message['role'] == 'user_context' %} 19 | ### Input: 20 | {{ message['content']|trim -}} 21 | {% if not loop.last %} 22 | 23 | 24 | {% endif %} 25 | {% endif %} 26 | {% endfor %} 27 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} 28 | ### Response: 29 | {% 
endif %} -------------------------------------------------------------------------------- /examples/template_baichuan.jinja: -------------------------------------------------------------------------------- 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 2 | 3 | {%- for message in messages -%} 4 | {%- if message['role'] == 'user' -%} 5 | {{- '' + message['content'] -}} 6 | {%- elif message['role'] == 'assistant' -%} 7 | {{- '' + message['content'] -}} 8 | {%- endif -%} 9 | {%- endfor -%} 10 | 11 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 12 | {{- '' -}} 13 | {% endif %} -------------------------------------------------------------------------------- /examples/template_blip2.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {%- if message['role'] == 'user' -%} 3 | {{- 'Question: ' + message['content'] + ' ' -}} 4 | {%- elif message['role'] == 'assistant' -%} 5 | {{- 'Answer: ' + message['content'] + ' ' -}} 6 | {%- endif -%} 7 | {%- endfor -%} 8 | 9 | {%- if add_generation_prompt -%} 10 | {{- 'Answer:' -}} 11 | {% endif %} 12 | -------------------------------------------------------------------------------- /examples/template_chatglm.jinja: -------------------------------------------------------------------------------- 1 | {%- set counter = namespace(index=0) -%} 2 | {%- for message in messages -%} 3 | {%- if message['role'] == 'user' -%} 4 | {{- '[Round ' + counter.index|string + ']\n问:' + message['content'] -}} 5 | {%- set counter.index = counter.index + 1 -%} 6 | {%- endif -%} 7 | {%- if message['role'] == 'assistant' -%} 8 | {{- '\n答:' + message['content'] -}} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n' -}} 11 | {%- endif -%} 12 | {%- endif -%} 13 | {%- endfor -%} 14 | 15 | 16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 17 | {{- '\n答:' -}} 18 | {%- endif -%} -------------------------------------------------------------------------------- /examples/template_chatglm2.jinja: -------------------------------------------------------------------------------- 1 | {%- set counter = namespace(index=1) -%} 2 | {%- for message in messages -%} 3 | {%- if message['role'] == 'user' -%} 4 | {{- '[Round ' + counter.index|string + ']\n\n问:' + message['content'] -}} 5 | {%- set counter.index = counter.index + 1 -%} 6 | {%- endif -%} 7 | {%- if message['role'] == 'assistant' -%} 8 | {{- '\n\n答:' + message['content'] -}} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n\n' -}} 11 | {%- endif -%} 12 | {%- endif -%} 13 | {%- endfor -%} 14 | 15 | 16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 17 | {{- '\n\n答:' -}} 18 | {%- endif -%} -------------------------------------------------------------------------------- /examples/template_chatml.jinja: -------------------------------------------------------------------------------- 1 | {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} 2 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} -------------------------------------------------------------------------------- /examples/template_dse_qwen2_vl.jinja: 
-------------------------------------------------------------------------------- 1 | {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}{% raw %}<|im_start|>system 2 | You are a helpful assistant.<|im_end|> 3 | {% endraw %}{% endif %}<|im_start|>{{ message['role'] }}{% raw %} 4 | {% endraw %}{% if message['content'] is string %}{{ message['content'] }}<|im_end|>{% raw %} 5 | {% endraw %}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>{% raw %} 6 | {% endraw %}{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant{% raw %} 7 | {% endraw %}{% endif %}<|endoftext|> -------------------------------------------------------------------------------- /examples/template_falcon.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {%- if message['role'] == 'user' -%} 3 | {{- 'User: ' + message['content'] -}} 4 | {%- elif message['role'] == 'assistant' -%} 5 | {{- 'Assistant: ' + message['content'] -}} 6 | {%- endif -%} 7 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 8 | {{- '\n' -}} 9 | {%- endif -%} 10 | {%- endfor -%} 11 | 12 | 13 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 14 | {{- 'Assistant:' -}} 15 | {% endif %} -------------------------------------------------------------------------------- /examples/template_falcon_180b.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {%- if message['role'] == 'system' -%} 3 | {{- 'System: ' + message['content'] -}} 4 | {%- elif message['role'] == 'user' -%} 5 | {{- 'User: ' + message['content'] -}} 6 | {%- elif message['role'] == 'assistant' -%} 7 | {{- 'Falcon: ' + message['content'] -}} 8 | {%- endif -%} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n' -}} 11 | {%- endif -%} 12 | {%- endfor -%} 13 | 14 | 15 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 16 | {{- 'Falcon:' -}} 17 | {% endif %} -------------------------------------------------------------------------------- /examples/template_inkbot.jinja: -------------------------------------------------------------------------------- 1 | <#meta#> 2 | - Date: {{ (messages|selectattr('role', 'equalto', 'meta-current_date')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-current_date')|list) else '' }} 3 | - Task: {{ (messages|selectattr('role', 'equalto', 'meta-task_name')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-task_name')|list) else '' }} 4 | <#system#> 5 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 6 | <#chat#> 7 | {% for message in messages %} 8 | {% if message['role'] == 'user' %} 9 | <#user#> 10 | {{ 
message['content']|trim -}} 11 | {% if not loop.last %} 12 | 13 | {% endif %} 14 | {% elif message['role'] == 'assistant' %} 15 | <#bot#> 16 | {{ message['content']|trim -}} 17 | {% if not loop.last %} 18 | 19 | {% endif %} 20 | {% elif message['role'] == 'user_context' %} 21 | <#user_context#> 22 | {{ message['content']|trim -}} 23 | {% if not loop.last %} 24 | 25 | {% endif %} 26 | {% endif %} 27 | {% endfor %} 28 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} 29 | <#bot#> 30 | {% endif %} -------------------------------------------------------------------------------- /examples/template_llava.jinja: -------------------------------------------------------------------------------- 1 | {%- if messages[0]['role'] == 'system' -%} 2 | {%- set system_message = messages[0]['content'] -%} 3 | {%- set messages = messages[1:] -%} 4 | {%- else -%} 5 | {% set system_message = '' -%} 6 | {%- endif -%} 7 | 8 | {{ bos_token + system_message }} 9 | {%- for message in messages -%} 10 | {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%} 11 | {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} 12 | {%- endif -%} 13 | 14 | {%- if message['role'] == 'user' -%} 15 | {{ 'USER: ' + message['content'] + '\n' }} 16 | {%- elif message['role'] == 'assistant' -%} 17 | {{ 'ASSISTANT: ' + message['content'] + eos_token + '\n' }} 18 | {%- endif -%} 19 | {%- endfor -%} 20 | 21 | {%- if add_generation_prompt -%} 22 | {{ 'ASSISTANT:' }} 23 | {% endif %} 24 | -------------------------------------------------------------------------------- /examples/template_vlm2vec.jinja: -------------------------------------------------------------------------------- 1 | {%- if messages | length > 1 -%} 2 | {{ raise_exception('Embedding models should only embed one message at a time') }} 3 | {%- endif -%} 4 | 5 | {% set vars = namespace(parts=[], next_image_id=1) %} 6 | {%- for message in messages -%} 7 | {%- for content in message['content'] -%} 8 | {%- if content['type'] == 'text' -%} 9 | {%- set vars.parts = vars.parts + [content['text']] %} 10 | {%- elif content['type'] == 'image' -%} 11 | {%- set vars.parts = vars.parts + ['<|image_{i:d}|>'.format(i=vars.next_image_id)] %} 12 | {%- set vars.next_image_id = vars.next_image_id + 1 %} 13 | {%- endif -%} 14 | {%- endfor -%} 15 | {%- endfor -%} 16 | {{ vars.parts | join(' ') }} 17 | -------------------------------------------------------------------------------- /find_cuda_init.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import traceback 3 | from typing import Callable 4 | from unittest.mock import patch 5 | 6 | 7 | def find_cuda_init(fn: Callable[[], object]) -> None: 8 | """ 9 | Helper function to debug CUDA re-initialization errors. 10 | 11 | If `fn` initializes CUDA, prints the stack trace of how this happens. 
12 | """ 13 | from torch.cuda import _lazy_init 14 | 15 | stack = None 16 | 17 | def wrapper(): 18 | nonlocal stack 19 | stack = traceback.extract_stack() 20 | return _lazy_init() 21 | 22 | with patch("torch.cuda._lazy_init", wrapper): 23 | fn() 24 | 25 | if stack is not None: 26 | print("==== CUDA Initialized ====") 27 | print("".join(traceback.format_list(stack)).strip()) 28 | print("==========================") 29 | 30 | 31 | if __name__ == "__main__": 32 | find_cuda_init( 33 | lambda: importlib.import_module("vllm.model_executor.models.llava")) 34 | -------------------------------------------------------------------------------- /requirements-build.txt: -------------------------------------------------------------------------------- 1 | # Should be mirrored in pyproject.toml 2 | cmake>=3.26 3 | ninja 4 | packaging 5 | setuptools>=61 6 | setuptools-scm>=8 7 | torch==2.5.1 8 | wheel 9 | jinja2 10 | -------------------------------------------------------------------------------- /requirements-cpu.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for CPUs 5 | torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" 6 | torch==2.5.1; platform_machine == "aarch64" 7 | torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch -------------------------------------------------------------------------------- /requirements-cuda.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for NVIDIA GPUs 5 | ray >= 2.9 6 | nvidia-ml-py >= 12.560.30 # for pynvml package 7 | torch == 2.5.1 8 | # These must be updated alongside torch 9 | torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version 10 | xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1 11 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements-lint.txt 2 | -r requirements-test.txt 3 | 4 | # Avoid adding requirements directly to this file. 5 | # Instead, modify the two files referenced above. 
6 | -------------------------------------------------------------------------------- /requirements-hpu.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for HPU code 5 | ray 6 | triton 7 | pandas 8 | tabulate 9 | setuptools>=61 10 | setuptools-scm>=8 11 | vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@fd7f2e6 12 | -------------------------------------------------------------------------------- /requirements-lint.txt: -------------------------------------------------------------------------------- 1 | # formatting 2 | yapf==0.32.0 3 | toml==0.10.2 4 | tomli==2.0.2 5 | ruff==0.6.5 6 | codespell==2.3.0 7 | isort==5.13.2 8 | clang-format==18.1.5 9 | sphinx-lint==1.0.0 10 | 11 | # type checking 12 | mypy==1.11.1 13 | types-PyYAML 14 | types-requests 15 | types-setuptools 16 | -------------------------------------------------------------------------------- /requirements-neuron.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for Neuron devices 5 | transformers-neuronx >= 0.12.0 6 | torch-neuronx >= 2.1.2 7 | neuronx-cc 8 | -------------------------------------------------------------------------------- /requirements-openvino.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | torch == 2.5.1 # should be aligned with "common" vLLM torch version 5 | openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention 6 | 7 | optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version 8 | optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git@main # latest optimum-intel is used to support latest transformers version 9 | -------------------------------------------------------------------------------- /requirements-rocm.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for AMD GPUs 5 | awscli 6 | boto3 7 | botocore 8 | ray >= 2.10.0 9 | peft 10 | pytest-asyncio 11 | tensorizer>=2.9.0 -------------------------------------------------------------------------------- /requirements-test.in: -------------------------------------------------------------------------------- 1 | # testing 2 | pytest 3 | tensorizer>=2.9.0 4 | pytest-forked 5 | pytest-asyncio 6 | pytest-rerunfailures 7 | pytest-shard 8 | 9 | # testing utils 10 | awscli 11 | decord # required for video tests 12 | einops # required for MPT, qwen-vl and Mamba 13 | httpx 14 | librosa # required for audio tests 15 | peft 16 | ray[adag]==2.35 17 | sentence-transformers # required for embedding tests 18 | soundfile # required for audio tests 19 | timm # required for internvl test 20 | torch==2.5.1 21 | transformers_stream_generator # required for qwen-vl test 22 | matplotlib # required for qwen-vl test 23 | mistral_common[opencv] >= 1.4.4 # required for pixtral test 24 | datamodel_code_generator # required for minicpm3 test 25 | lm-eval[api]==0.4.4 # required for model evaluation test 26 | 27 | # TODO: Add this after fully implementing llava(mantis) 28 | # git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test 29 | 30 | # quantization 31 | bitsandbytes>=0.44.0 32 | 
buildkite-test-collector==0.1.9 33 | 34 | numpy < 2.0.0 35 | -------------------------------------------------------------------------------- /requirements-tpu.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for TPU 5 | cmake>=3.26 6 | ninja 7 | packaging 8 | setuptools-scm>=8 9 | wheel 10 | jinja2 11 | ray[default] 12 | 13 | # Install torch_xla 14 | --pre 15 | --extra-index-url https://download.pytorch.org/whl/nightly/cpu 16 | --find-links https://storage.googleapis.com/libtpu-releases/index.html 17 | --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html 18 | --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html 19 | torch==2.6.0.dev20241126+cpu 20 | torchvision==0.20.0.dev20241126+cpu 21 | torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp310-cp310-linux_x86_64.whl 22 | jaxlib==0.4.36.dev20241122 23 | jax==0.4.36.dev20241122 24 | -------------------------------------------------------------------------------- /requirements-xpu.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | ray >= 2.9 5 | cmake>=3.26 6 | ninja 7 | packaging 8 | setuptools-scm>=8 9 | wheel 10 | jinja2 11 | 12 | torch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp310-cp310-linux_x86_64.whl 13 | intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.5.10%2Bgit9d489a8-cp310-cp310-linux_x86_64.whl 14 | oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp310-cp310-linux_x86_64.whl 15 | 16 | triton-xpu == 3.0.0b1 17 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/__init__.py -------------------------------------------------------------------------------- /tests/async_engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/async_engine/__init__.py -------------------------------------------------------------------------------- /tests/basic_correctness/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/basic_correctness/__init__.py -------------------------------------------------------------------------------- /tests/basic_correctness/test_cpu_offload.py: -------------------------------------------------------------------------------- 1 | from ..utils import compare_two_settings 2 | 3 | 4 | def test_cpu_offload(): 5 | compare_two_settings("meta-llama/Llama-3.2-1B", [], 6 | ["--cpu-offload-gb", "1"]) 7 | -------------------------------------------------------------------------------- /tests/compile/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/compile/__init__.py -------------------------------------------------------------------------------- /tests/compile/piecewise/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/compile/piecewise/__init__.py -------------------------------------------------------------------------------- /tests/compile/test_full_graph.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.config import CompilationLevel 4 | 5 | from ..utils import fork_new_process_for_each_test 6 | from .utils import TEST_MODELS, check_full_graph_support 7 | 8 | 9 | @pytest.mark.parametrize("model_info", TEST_MODELS) 10 | @pytest.mark.parametrize( 11 | "optimization_level", 12 | [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE]) 13 | @fork_new_process_for_each_test 14 | def test_full_graph(model_info, optimization_level): 15 | model = model_info[0] 16 | model_kwargs = model_info[1] 17 | check_full_graph_support(model, 18 | model_kwargs, 19 | optimization_level, 20 | tp_size=1) 21 | -------------------------------------------------------------------------------- /tests/compile/test_pass_manager.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | import pytest 4 | import torch 5 | from torch._inductor.codecache import BypassFxGraphCache 6 | 7 | from vllm.compilation.config import CompilationConfig 8 | from vllm.compilation.inductor_pass import (CallableInductorPass, 9 | as_inductor_pass) 10 | from vllm.compilation.pass_manager import PostGradPassManager 11 | 12 | 13 | def simple_callable(graph: torch.fx.Graph): 14 | pass 15 | 16 | 17 | @as_inductor_pass(files=(__file__, )) 18 | def callable_decorated(graph: torch.fx.Graph): 19 | pass 20 | 21 | 22 | @pytest.mark.parametrize( 23 | "works, callable", 24 | [(False, simple_callable), (True, callable_decorated), 25 | (True, CallableInductorPass(simple_callable, "simple_callable"))]) 26 | def test_pass_manager(works: bool, callable): 27 | config = CompilationConfig().pass_config 28 | pass_manager = PostGradPassManager([callable]) 29 | pass_manager.configure(config) # Adds default passes 30 | 31 | if works: 32 | pickle.dumps(pass_manager) 33 | else: 34 | with pytest.raises(BypassFxGraphCache): 35 | pickle.dumps(pass_manager) 36 | -------------------------------------------------------------------------------- /tests/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/core/__init__.py -------------------------------------------------------------------------------- /tests/core/block/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/core/block/__init__.py -------------------------------------------------------------------------------- /tests/core/block/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture() 5 | def should_do_global_cleanup_after_test() -> bool: 6 | """Disable the global cleanup fixture for tests in this directory. 
This 7 | provides a ~10x speedup for unit tests that don't load a model to GPU. 8 | 9 | This requires that tests in this directory clean up after themselves if they 10 | use the GPU. 11 | """ 12 | return False 13 | -------------------------------------------------------------------------------- /tests/core/block/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/core/block/e2e/__init__.py -------------------------------------------------------------------------------- /tests/data/test_config.yaml: -------------------------------------------------------------------------------- 1 | port: 12312 2 | served_model_name: mymodel 3 | tensor_parallel_size: 2 4 | trust_remote_code: true 5 | multi_step_stream_outputs: false 6 | -------------------------------------------------------------------------------- /tests/distributed/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/distributed/__init__.py -------------------------------------------------------------------------------- /tests/distributed/test_distributed_oot.py: -------------------------------------------------------------------------------- 1 | from ..entrypoints.openai.test_oot_registration import ( 2 | run_and_test_dummy_opt_api_server) 3 | 4 | 5 | def test_distributed_oot(dummy_opt_path: str): 6 | run_and_test_dummy_opt_api_server(dummy_opt_path, tp=2) 7 | -------------------------------------------------------------------------------- /tests/distributed/test_pp_cudagraph.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from ..utils import compare_two_settings, fork_new_process_for_each_test 6 | 7 | 8 | @pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [ 9 | (2, "JackFram/llama-160m"), 10 | ]) 11 | @pytest.mark.parametrize("ATTN_BACKEND", [ 12 | "FLASH_ATTN", 13 | "FLASHINFER", 14 | ]) 15 | @fork_new_process_for_each_test 16 | def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND): 17 | cudagraph_args = [ 18 | # use half precision for speed and memory savings in CI environment 19 | "--dtype", 20 | "float16", 21 | "--pipeline-parallel-size", 22 | str(PP_SIZE), 23 | "--distributed-executor-backend", 24 | "mp", 25 | ] 26 | os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND 27 | 28 | eager_args = cudagraph_args + ["--enforce-eager"] 29 | 30 | compare_two_settings(MODEL_NAME, eager_args, cudagraph_args) 31 | -------------------------------------------------------------------------------- /tests/distributed/test_same_node.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch.distributed as dist 4 | 5 | from vllm.distributed.parallel_state import in_the_same_node_as 6 | 7 | if __name__ == "__main__": 8 | dist.init_process_group(backend="gloo") 9 | test_result = all(in_the_same_node_as(dist.group.WORLD, source_rank=0)) 10 | 11 | expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1" 12 | assert test_result == expected, f"Expected {expected}, got {test_result}" 13 | print("Same node test passed!") 14 | -------------------------------------------------------------------------------- /tests/encoder_decoder/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/encoder_decoder/__init__.py -------------------------------------------------------------------------------- /tests/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/engine/__init__.py -------------------------------------------------------------------------------- /tests/engine/output_processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/engine/output_processor/__init__.py -------------------------------------------------------------------------------- /tests/engine/test_short_mm_context.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from ..conftest import IMAGE_ASSETS 4 | 5 | HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ 6 | "stop_sign": 7 | "USER: \nWhat's the content of the image?\nASSISTANT:", 8 | "cherry_blossom": 9 | "USER: \nWhat is the season?\nASSISTANT:", 10 | }) 11 | 12 | models = ["llava-hf/llava-1.5-7b-hf"] 13 | 14 | 15 | @pytest.mark.parametrize("model", models) 16 | def test_context_length_too_short(vllm_runner, image_assets, model): 17 | images = [asset.pil_image for asset in image_assets] 18 | 19 | with pytest.raises(ValueError, match="too long to fit into the model"): 20 | vllm_model = vllm_runner( 21 | model, 22 | max_model_len=128, # LLaVA has a feature size of 576 23 | enforce_eager=True, 24 | ) 25 | 26 | with vllm_model: 27 | vllm_model.generate_greedy([HF_IMAGE_PROMPTS[0]], 28 | max_tokens=1, 29 | images=[images[0]]) 30 | -------------------------------------------------------------------------------- /tests/engine/test_skip_tokenizer_init.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.entrypoints.llm import LLM 4 | from vllm.sampling_params import SamplingParams 5 | 6 | 7 | @pytest.mark.parametrize("model", ["facebook/opt-125m"]) 8 | def test_skip_tokenizer_initialization(model: str): 9 | # This test checks if the flag skip_tokenizer_init skips the initialization 10 | # of tokenizer and detokenizer. The generated output is expected to contain 11 | # token ids. 
12 | llm = LLM(model=model, skip_tokenizer_init=True) 13 | sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) 14 | 15 | with pytest.raises(ValueError, match="cannot pass text prompts when"): 16 | llm.generate("abc", sampling_params) 17 | 18 | outputs = llm.generate({"prompt_token_ids": [1, 2, 3]}, 19 | sampling_params=sampling_params) 20 | assert len(outputs) > 0 21 | completions = outputs[0].outputs 22 | assert len(completions) > 0 23 | assert completions[0].text == "" 24 | assert completions[0].token_ids 25 | -------------------------------------------------------------------------------- /tests/entrypoints/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/entrypoints/__init__.py -------------------------------------------------------------------------------- /tests/entrypoints/llm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/entrypoints/llm/__init__.py -------------------------------------------------------------------------------- /tests/entrypoints/llm/test_init.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm import LLM 4 | 5 | from ...utils import error_on_warning 6 | 7 | MODEL_NAME = "facebook/opt-125m" 8 | 9 | 10 | def test_pos_args_deprecated(): 11 | with error_on_warning(DeprecationWarning): 12 | LLM(model=MODEL_NAME, tokenizer=MODEL_NAME) 13 | 14 | with error_on_warning(DeprecationWarning): 15 | LLM(MODEL_NAME, tokenizer=MODEL_NAME) 16 | 17 | with pytest.warns(DeprecationWarning, match="'tokenizer'"): 18 | LLM(MODEL_NAME, MODEL_NAME) 19 | 20 | with pytest.warns(DeprecationWarning, 21 | match="'tokenizer', 'tokenizer_mode'"): 22 | LLM(MODEL_NAME, MODEL_NAME, "auto") 23 | -------------------------------------------------------------------------------- /tests/entrypoints/llm/test_prompt_validation.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm import LLM 4 | 5 | 6 | @pytest.fixture(autouse=True) 7 | def v1(run_with_both_engines): 8 | # Simple autouse wrapper to run both engines for each test 9 | # This can be promoted up to conftest.py to run for every 10 | # test in a package 11 | pass 12 | 13 | 14 | def test_empty_prompt(): 15 | llm = LLM(model="gpt2", enforce_eager=True) 16 | with pytest.raises(ValueError, match='Prompt cannot be empty'): 17 | llm.generate([""]) 18 | 19 | 20 | @pytest.mark.skip_v1 21 | def test_out_of_vocab_token(): 22 | llm = LLM(model="gpt2", enforce_eager=True) 23 | with pytest.raises(ValueError, match='out of vocabulary'): 24 | llm.generate({"prompt_token_ids": [999999]}) 25 | -------------------------------------------------------------------------------- /tests/entrypoints/offline_mode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/entrypoints/offline_mode/__init__.py -------------------------------------------------------------------------------- /tests/entrypoints/openai/__init__.py: -------------------------------------------------------------------------------- 
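Aside: tests/entrypoints/llm/test_init.py above imports an error_on_warning helper from the shared test utils. Below is a minimal sketch of what such a context manager might look like, assuming it only needs to escalate one warning category to an error for the enclosed block; the real tests/utils.py implementation may differ.

```python
# Hypothetical sketch of an error_on_warning-style helper. Assumes it only has
# to turn the given warning category into an exception inside the block; the
# actual helper in tests/utils.py may do more.
import warnings
from contextlib import contextmanager
from typing import Iterator, Type


@contextmanager
def error_on_warning(category: Type[Warning] = Warning) -> Iterator[None]:
    """Raise if any warning of the given category is emitted inside the block."""
    with warnings.catch_warnings():
        warnings.simplefilter("error", category=category)
        yield


# Usage mirroring the deprecation test above: only DeprecationWarning is fatal.
with error_on_warning(DeprecationWarning):
    warnings.warn("unrelated", UserWarning)  # allowed: different category
```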
https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/entrypoints/openai/__init__.py -------------------------------------------------------------------------------- /tests/entrypoints/openai/tool_parsers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/entrypoints/openai/tool_parsers/__init__.py -------------------------------------------------------------------------------- /tests/kernels/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/kernels/__init__.py -------------------------------------------------------------------------------- /tests/kernels/allclose_default.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | # Reference default values of atol and rtol are from 4 | # https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67 5 | default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5} 6 | default_rtol = { 7 | torch.float16: 1e-3, 8 | torch.bfloat16: 1.6e-2, 9 | torch.float: 1.3e-6 10 | } 11 | 12 | 13 | def get_default_atol(output) -> float: 14 | return default_atol[output.dtype] 15 | 16 | 17 | def get_default_rtol(output) -> float: 18 | return default_rtol[output.dtype] 19 | -------------------------------------------------------------------------------- /tests/kernels/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.utils import (create_kv_caches_with_random, 4 | create_kv_caches_with_random_flash) 5 | 6 | 7 | @pytest.fixture() 8 | def kv_cache_factory(): 9 | return create_kv_caches_with_random 10 | 11 | 12 | @pytest.fixture() 13 | def kv_cache_factory_flashinfer(): 14 | return create_kv_caches_with_random_flash 15 | -------------------------------------------------------------------------------- /tests/kernels/test_ggml.py: -------------------------------------------------------------------------------- 1 | import gguf 2 | import pytest 3 | import torch 4 | 5 | from tests.kernels.utils import opcheck 6 | from vllm import _custom_ops as ops # noqa: F401 7 | 8 | 9 | @pytest.mark.parametrize("quant_type", [12]) 10 | def test_ggml_opcheck(quant_type): 11 | block_size, type_size = gguf.GGML_QUANT_SIZES[quant_type] 12 | shape = [256, 1152] 13 | qweight = torch.randint(0, 100, shape, device='cuda', dtype=torch.uint8) 14 | m = qweight.shape[0] 15 | n = qweight.shape[1] // type_size * block_size 16 | opcheck(torch.ops._C.ggml_dequantize, (qweight, quant_type, m, n)) 17 | 18 | x = torch.rand((m, 512), device='cuda', dtype=torch.float16) 19 | opcheck(torch.ops._C.ggml_mul_mat_a8, 20 | (qweight, x, quant_type, qweight.shape[0])) 21 | opcheck(torch.ops._C.ggml_mul_mat_vec_a8, 22 | (qweight, x, quant_type, qweight.shape[0])) 23 | -------------------------------------------------------------------------------- /tests/kernels/test_gptq.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from tests.kernels.utils import opcheck 4 | from vllm import _custom_ops as ops # noqa: F401 5 | 6 | 7 | def test_gptq_shuffle_opcheck(): 8 | weight = torch.randint(-2000000, 9 | 2000000, (1792, 4096), 10 | device='cuda', 
11 | dtype=torch.int32) 12 | perm = torch.empty((0, ), device='cuda', dtype=torch.int32) 13 | bit = 4 14 | opcheck(torch.ops._C.gptq_shuffle, (weight, perm, bit)) 15 | 16 | 17 | def test_gptq_gemm_opcheck(): 18 | a = torch.rand((240, 4096), device='cuda', dtype=torch.float16) 19 | weight = torch.randint(-2000000, 20 | 2000000, (512, 6144), 21 | device='cuda', 22 | dtype=torch.int32) 23 | zeros = torch.zeros((32, 768), device='cuda', dtype=torch.int32) 24 | scales = torch.rand((32, 6144), device='cuda', dtype=torch.float16) 25 | idx = torch.empty((0, ), device='cuda', dtype=torch.int32) 26 | use_exllama = True 27 | bit = 4 28 | opcheck(torch.ops._C.gptq_gemm, 29 | (a, weight, zeros, scales, idx, use_exllama, bit)) 30 | -------------------------------------------------------------------------------- /tests/kernels/test_permute_cols.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from tests.kernels.utils import opcheck 5 | from vllm._custom_ops import permute_cols 6 | 7 | 8 | @pytest.mark.parametrize('shape', [(1, 512), (544, 4096), (67, 8192)]) 9 | @pytest.mark.parametrize('dtype', [torch.bfloat16, torch.float16]) 10 | def test_permute_cols(shape, dtype): 11 | x = torch.randn(shape, dtype=dtype).cuda() 12 | perm = torch.randperm(x.shape[1]).to(torch.int).cuda() 13 | opcheck(torch.ops._C.permute_cols, (x, perm)) 14 | y = permute_cols(x, perm) 15 | torch.testing.assert_close(y, x[:, perm]) -------------------------------------------------------------------------------- /tests/kernels/test_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for miscellaneous utilities 3 | """ 4 | 5 | import pytest 6 | import torch 7 | 8 | from tests.kernels.utils import opcheck 9 | from vllm.platforms import current_platform 10 | 11 | 12 | def test_convert_fp8_opcheck(): 13 | data = torch.randn((256, 256), dtype=torch.float32, device="cuda") 14 | result = torch.empty_like(data, dtype=torch.float8_e4m3fn) 15 | opcheck(torch.ops._C_cache_ops.convert_fp8, (result, data, 1.0, "fp8")) 16 | 17 | 18 | @pytest.mark.skipif(not current_platform.is_cuda(), 19 | reason="Only supported for CUDA") 20 | def test_cuda_utils_opcheck(): 21 | opcheck(torch.ops._C_cuda_utils.get_device_attribute, (0, 0)) 22 | opcheck( 23 | torch.ops._C_cuda_utils. 
24 | get_max_shared_memory_per_block_device_attribute, (0, )) 25 | -------------------------------------------------------------------------------- /tests/kv_transfer/test_lookup_buffer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | RANK=0 python test_lookup_buffer.py & 3 | RANK=1 python test_lookup_buffer.py & -------------------------------------------------------------------------------- /tests/kv_transfer/test_send_recv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | RANK=0 python3 test_send_recv.py & 3 | RANK=1 python3 test_send_recv.py & -------------------------------------------------------------------------------- /tests/lora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/lora/__init__.py -------------------------------------------------------------------------------- /tests/lora/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/lora/data/__init__.py -------------------------------------------------------------------------------- /tests/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/metrics/__init__.py -------------------------------------------------------------------------------- /tests/model_executor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/model_executor/__init__.py -------------------------------------------------------------------------------- /tests/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/models/__init__.py -------------------------------------------------------------------------------- /tests/models/decoder_only/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/models/decoder_only/__init__.py -------------------------------------------------------------------------------- /tests/models/decoder_only/audio_language/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/models/decoder_only/audio_language/__init__.py -------------------------------------------------------------------------------- /tests/models/decoder_only/language/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/models/decoder_only/language/__init__.py -------------------------------------------------------------------------------- /tests/models/decoder_only/vision_language/__init__.py: -------------------------------------------------------------------------------- 
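Aside: the kernel tests above validate registered custom ops through an opcheck helper from tests/kernels/utils. A plausible minimal form of that helper is sketched below, assuming it is a thin wrapper around torch.library.opcheck (available in torch >= 2.4); the keyword choices are illustrative, not the repo's exact implementation.

```python
# Sketch only: a minimal opcheck wrapper in the spirit of tests/kernels/utils.
# torch.library.opcheck validates a custom op's schema, fake-tensor registration
# and autograd support against its eager implementation.
from typing import Any, Dict, Optional, Tuple

import torch


def opcheck(op, args: Tuple[Any, ...],
            kwargs: Optional[Dict[str, Any]] = None) -> None:
    """Run torch.library.opcheck and fail loudly on any mismatch."""
    torch.library.opcheck(op, args, kwargs or {}, raise_exception=True)


# Usage (as in the tests above; requires vLLM's compiled _C extensions):
# opcheck(torch.ops._C_cache_ops.convert_fp8, (result, data, 1.0, "fp8"))
```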
https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/models/decoder_only/vision_language/__init__.py -------------------------------------------------------------------------------- /tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py -------------------------------------------------------------------------------- /tests/models/decoder_only/vision_language/vlm_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/models/decoder_only/vision_language/vlm_utils/__init__.py -------------------------------------------------------------------------------- /tests/models/embedding/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/models/embedding/__init__.py -------------------------------------------------------------------------------- /tests/models/embedding/language/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/models/embedding/language/__init__.py -------------------------------------------------------------------------------- /tests/models/embedding/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Sequence 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | 7 | def check_embeddings_close( 8 | *, 9 | embeddings_0_lst: Sequence[List[float]], 10 | embeddings_1_lst: Sequence[List[float]], 11 | name_0: str, 12 | name_1: str, 13 | tol: float = 1e-3, 14 | ) -> None: 15 | assert len(embeddings_0_lst) == len(embeddings_1_lst) 16 | 17 | for prompt_idx, (embeddings_0, embeddings_1) in enumerate( 18 | zip(embeddings_0_lst, embeddings_1_lst)): 19 | assert len(embeddings_0) == len(embeddings_1), ( 20 | f"Length mismatch: {len(embeddings_0)} vs. 
{len(embeddings_1)}") 21 | 22 | sim = F.cosine_similarity(torch.tensor(embeddings_0), 23 | torch.tensor(embeddings_1), 24 | dim=0) 25 | 26 | fail_msg = (f"Test{prompt_idx}:" 27 | f"\n{name_0}:\t{embeddings_0[:16]!r}" 28 | f"\n{name_1}:\t{embeddings_1[:16]!r}") 29 | 30 | assert sim >= 1 - tol, fail_msg 31 | -------------------------------------------------------------------------------- /tests/models/embedding/vision_language/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/models/embedding/vision_language/__init__.py -------------------------------------------------------------------------------- /tests/models/encoder_decoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/models/encoder_decoder/__init__.py -------------------------------------------------------------------------------- /tests/models/encoder_decoder/language/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/models/encoder_decoder/language/__init__.py -------------------------------------------------------------------------------- /tests/models/encoder_decoder/vision_language/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/models/encoder_decoder/vision_language/__init__.py -------------------------------------------------------------------------------- /tests/models/encoder_decoder/vision_language/test_broadcast.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from ....utils import multi_gpu_test 4 | 5 | 6 | @multi_gpu_test(num_gpus=2) 7 | @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) 8 | @pytest.mark.parametrize("model", [ 9 | "meta-llama/Llama-3.2-11B-Vision-Instruct", 10 | ]) 11 | def test_models(hf_runner, vllm_runner, image_assets, 12 | distributed_executor_backend, model) -> None: 13 | 14 | dtype = "half" 15 | max_tokens = 5 16 | num_logprobs = 5 17 | tensor_parallel_size = 2 18 | 19 | if model.startswith("meta-llama/Llama-3.2-11B-Vision-Instruct"): 20 | from .test_mllama import models, run_test 21 | else: 22 | raise NotImplementedError(f"Unsupported model: {model}") 23 | 24 | run_test( 25 | hf_runner, 26 | vllm_runner, 27 | image_assets, 28 | model=models[0], 29 | size_factors=[0.25, 0.5, 1.0], 30 | dtype=dtype, 31 | max_tokens=max_tokens, 32 | num_logprobs=num_logprobs, 33 | tensor_parallel_size=tensor_parallel_size, 34 | distributed_executor_backend=distributed_executor_backend, 35 | ) 36 | -------------------------------------------------------------------------------- /tests/mq_llm_engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/mq_llm_engine/__init__.py -------------------------------------------------------------------------------- /tests/multi_step/__init__.py: -------------------------------------------------------------------------------- 
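Aside: a small usage sketch for check_embeddings_close from tests/models/embedding/utils.py, using hand-made numbers and assuming the repo root is importable (the kernel tests import tests.kernels.utils the same way).

```python
# Illustrative call of check_embeddings_close with two nearly identical,
# hand-made embedding batches; the values are made up purely for demonstration.
from tests.models.embedding.utils import check_embeddings_close

embeddings_hf = [[0.1, 0.2, 0.3, 0.4], [0.5, 0.5, 0.0, 0.1]]
embeddings_vllm = [[0.1001, 0.2001, 0.3, 0.4], [0.5, 0.4999, 0.0, 0.1]]

# Passes: the cosine similarity of each pair stays within 1 - tol.
check_embeddings_close(
    embeddings_0_lst=embeddings_hf,
    embeddings_1_lst=embeddings_vllm,
    name_0="hf",
    name_1="vllm",
    tol=1e-3,
)
```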
https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/multi_step/__init__.py -------------------------------------------------------------------------------- /tests/multimodal/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/multimodal/__init__.py -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_model/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name='vllm_add_dummy_model', 4 | version='0.1', 5 | packages=['vllm_add_dummy_model'], 6 | entry_points={ 7 | 'vllm.general_plugins': 8 | ["register_dummy_model = vllm_add_dummy_model:register"] 9 | }) 10 | -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm import ModelRegistry 2 | 3 | 4 | def register(): 5 | # Test directly passing the model 6 | from .my_opt import MyOPTForCausalLM 7 | 8 | if "MyOPTForCausalLM" not in ModelRegistry.get_supported_archs(): 9 | ModelRegistry.register_model("MyOPTForCausalLM", MyOPTForCausalLM) 10 | 11 | # Test passing lazy model 12 | if "MyGemma2Embedding" not in ModelRegistry.get_supported_archs(): 13 | ModelRegistry.register_model( 14 | "MyGemma2Embedding", 15 | "vllm_add_dummy_model.my_gemma_embedding:MyGemma2Embedding", 16 | ) 17 | 18 | if "MyLlava" not in ModelRegistry.get_supported_archs(): 19 | ModelRegistry.register_model("MyLlava", 20 | "vllm_add_dummy_model.my_llava:MyLlava") 21 | -------------------------------------------------------------------------------- /tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import torch 4 | 5 | from vllm.model_executor.models.opt import OPTForCausalLM 6 | from vllm.model_executor.sampling_metadata import SamplingMetadata 7 | 8 | 9 | class MyOPTForCausalLM(OPTForCausalLM): 10 | 11 | def compute_logits( 12 | self, hidden_states: torch.Tensor, 13 | sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]: 14 | # this dummy model always predicts the first token 15 | logits = super().compute_logits(hidden_states, sampling_metadata) 16 | if logits is not None: 17 | logits.zero_() 18 | logits[:, 0] += 1.0 19 | return logits 20 | -------------------------------------------------------------------------------- /tests/prefix_caching/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/prefix_caching/__init__.py -------------------------------------------------------------------------------- /tests/prompts/example.txt: -------------------------------------------------------------------------------- 1 | vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. 2 | Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020. 3 | Compare and contrast artificial intelligence with human intelligence in terms of processing information. 4 | Describe the basic components of a neural network and how it can be trained. 
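Aside: the vllm_add_dummy_model plugin above exposes its register() function through the 'vllm.general_plugins' entry-point group. The sketch below illustrates the discovery side of that pattern with importlib.metadata; the function name and error handling are assumptions, not vLLM's actual plugin loader.

```python
# Sketch of entry-point based plugin discovery, assuming Python >= 3.10 where
# importlib.metadata.entry_points accepts a `group=` keyword. This mirrors the
# general idea behind the 'vllm.general_plugins' group used by the dummy plugin
# above; vLLM's own loader may apply extra filtering and error handling.
from importlib.metadata import entry_points


def load_general_plugins(group: str = "vllm.general_plugins") -> None:
    for ep in entry_points(group=group):
        register_fn = ep.load()   # e.g. vllm_add_dummy_model:register
        register_fn()             # registers models with ModelRegistry


if __name__ == "__main__":
    load_general_plugins()
```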
5 | Write a short story about a robot that dreams for the first time. 6 | Analyze the impact of the COVID-19 pandemic on global economic structures and future business models. 7 | Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies. 8 | Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.' 9 | -------------------------------------------------------------------------------- /tests/quantization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/quantization/__init__.py -------------------------------------------------------------------------------- /tests/quantization/test_experts_int8.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | """Tests experts_int8 quantization startup and generation, 3 | doesn't test correctness 4 | """ 5 | import pytest 6 | 7 | from tests.quantization.utils import is_quant_method_supported 8 | 9 | MODELS = ["ai21labs/Jamba-tiny-random"] 10 | 11 | 12 | @pytest.mark.skipif(not is_quant_method_supported("experts_int8"), 13 | reason="ExpertsInt8 is not supported on this GPU type.") 14 | @pytest.mark.parametrize("model", MODELS) 15 | @pytest.mark.parametrize("dtype", ["bfloat16"]) 16 | @pytest.mark.parametrize("max_tokens", [10]) 17 | def test_model_experts_int8_startup( 18 | hf_runner, 19 | vllm_runner, 20 | example_prompts, 21 | model: str, 22 | dtype: str, 23 | max_tokens: int, 24 | ) -> None: 25 | 26 | with vllm_runner(model, dtype=dtype, 27 | quantization="experts_int8") as vllm_model: 28 | vllm_model.generate_greedy(example_prompts, max_tokens) 29 | -------------------------------------------------------------------------------- /tests/quantization/test_ipex_quant.py: -------------------------------------------------------------------------------- 1 | """Test model set-up and inference for quantized HF models supported 2 | on the CPU/GPU backend using IPEX (including AWQ/GPTQ). 3 | 4 | Validating the configuration and printing results for manual checking. 5 | 6 | Run `pytest tests/quantization/test_ipex_quant.py`. 
7 | """ 8 | 9 | import pytest 10 | 11 | from vllm.platforms import current_platform 12 | 13 | MODELS = [ 14 | "AMead10/Llama-3.2-1B-Instruct-AWQ", 15 | "shuyuej/Llama-3.2-1B-Instruct-GPTQ", # with g_idx 16 | ] 17 | DTYPE = ["bfloat16"] 18 | 19 | 20 | @pytest.mark.skipif(not current_platform.is_cpu() 21 | and not current_platform.is_xpu(), 22 | reason="only supports Intel CPU/XPU backend.") 23 | @pytest.mark.parametrize("model", MODELS) 24 | @pytest.mark.parametrize("dtype", DTYPE) 25 | def test_ipex_quant(vllm_runner, model, dtype): 26 | with vllm_runner(model, dtype=dtype) as llm: 27 | output = llm.generate_greedy(["The capital of France is"], 28 | max_tokens=32) 29 | assert output 30 | print(output) 31 | -------------------------------------------------------------------------------- /tests/quantization/utils.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.layers.quantization import get_quantization_config 2 | from vllm.platforms import current_platform 3 | 4 | 5 | def is_quant_method_supported(quant_method: str) -> bool: 6 | # Currently, all quantization methods require Nvidia or AMD GPUs 7 | if not (current_platform.is_cuda() or current_platform.is_rocm()): 8 | return False 9 | 10 | capability = current_platform.get_device_capability() 11 | assert capability is not None 12 | 13 | min_capability = get_quantization_config(quant_method).get_min_capability() 14 | 15 | return capability.to_int() >= min_capability 16 | -------------------------------------------------------------------------------- /tests/samplers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/samplers/__init__.py -------------------------------------------------------------------------------- /tests/samplers/test_ignore_eos.py: -------------------------------------------------------------------------------- 1 | """Make sure ignore_eos works. 2 | 3 | Run `pytest tests/samplers/test_ignore_eos.py`. 4 | """ 5 | 6 | import pytest 7 | 8 | from vllm import SamplingParams 9 | 10 | # We also test with llama because it has generation_config to specify EOS 11 | # (past regression). 
12 | MODELS = ["facebook/opt-125m", "meta-llama/Llama-2-7b-hf"] 13 | 14 | 15 | @pytest.mark.parametrize("model", MODELS) 16 | @pytest.mark.parametrize("dtype", ["half"]) 17 | @pytest.mark.parametrize("max_tokens", [512]) 18 | def test_ignore_eos( 19 | vllm_runner, 20 | example_prompts, 21 | model: str, 22 | dtype: str, 23 | max_tokens: int, 24 | ) -> None: 25 | with vllm_runner(model, dtype=dtype) as vllm_model: 26 | sampling_params = SamplingParams(max_tokens=max_tokens, 27 | ignore_eos=True) 28 | 29 | for prompt in example_prompts: 30 | ignore_eos_output = vllm_model.model.generate( 31 | prompt, sampling_params=sampling_params) 32 | output_length = len(ignore_eos_output[0].outputs[0].token_ids) 33 | assert output_length == max_tokens 34 | -------------------------------------------------------------------------------- /tests/spec_decode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/spec_decode/__init__.py -------------------------------------------------------------------------------- /tests/spec_decode/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/spec_decode/e2e/__init__.py -------------------------------------------------------------------------------- /tests/tensorizer_loader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/tensorizer_loader/__init__.py -------------------------------------------------------------------------------- /tests/test_embedded_commit.py: -------------------------------------------------------------------------------- 1 | import vllm 2 | 3 | 4 | def test_embedded_commit_defined(): 5 | assert hasattr(vllm, "__version__") 6 | assert hasattr(vllm, "__version_tuple__") 7 | assert vllm.__version__ != "dev" 8 | assert vllm.__version_tuple__ != (0, 0, "dev") 9 | -------------------------------------------------------------------------------- /tests/test_lazy_torch_compile.py: -------------------------------------------------------------------------------- 1 | # Description: Test the lazy import module 2 | # The utility function cannot be placed in `vllm.utils` 3 | # this needs to be a standalone script 4 | import sys 5 | from contextlib import nullcontext 6 | 7 | from vllm_test_utils import BlameResult, blame 8 | 9 | module_name = "torch._inductor.async_compile" 10 | 11 | # In CI, we only check finally if the module is imported. 12 | # If it is indeed imported, we can rerun the test with `use_blame=True`, 13 | # which will trace every function call to find the first import location, 14 | # and help find the root cause. 15 | # We don't run it in CI by default because it is slow. 16 | use_blame = False 17 | context = blame( 18 | lambda: module_name in sys.modules) if use_blame else nullcontext() 19 | with context as result: 20 | import vllm # noqa 21 | 22 | if use_blame: 23 | assert isinstance(result, BlameResult) 24 | print(f"the first import location is:\n{result.trace_stack}") 25 | 26 | assert module_name not in sys.modules, ( 27 | f"Module {module_name} is imported. 
To see the first" 28 | f" import location, run the test with `use_blame=True`.") 29 | -------------------------------------------------------------------------------- /tests/test_sampling_params.py: -------------------------------------------------------------------------------- 1 | """Tests for the SamplingParams class. 2 | """ 3 | from vllm import SamplingParams 4 | 5 | 6 | def test_max_tokens_none(): 7 | """max_tokens=None should be allowed""" 8 | SamplingParams(temperature=0.01, top_p=0.1, max_tokens=None) 9 | 10 | 11 | if __name__ == "__main__": 12 | import pytest 13 | pytest.main([__file__]) 14 | -------------------------------------------------------------------------------- /tests/tokenization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/tokenization/__init__.py -------------------------------------------------------------------------------- /tests/tokenization/test_cached_tokenizer.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | from transformers import AutoTokenizer 4 | 5 | from vllm.transformers_utils.tokenizer import get_cached_tokenizer 6 | 7 | 8 | def test_cached_tokenizer(): 9 | reference_tokenizer = AutoTokenizer.from_pretrained("gpt2") 10 | reference_tokenizer.add_special_tokens({"cls_token": ""}) 11 | reference_tokenizer.add_special_tokens( 12 | {"additional_special_tokens": [""]}) 13 | cached_tokenizer = get_cached_tokenizer(deepcopy(reference_tokenizer)) 14 | 15 | assert reference_tokenizer.encode("prompt") == cached_tokenizer.encode( 16 | "prompt") 17 | assert set(reference_tokenizer.all_special_ids) == set( 18 | cached_tokenizer.all_special_ids) 19 | assert set(reference_tokenizer.all_special_tokens) == set( 20 | cached_tokenizer.all_special_tokens) 21 | assert set(reference_tokenizer.all_special_tokens_extended) == set( 22 | cached_tokenizer.all_special_tokens_extended) 23 | -------------------------------------------------------------------------------- /tests/tokenization/test_get_eos.py: -------------------------------------------------------------------------------- 1 | """ 2 | This test file includes some cases where it is inappropriate to 3 | only get the `eos_token_id` from the tokenizer as defined by 4 | :meth:`vllm.LLMEngine._get_eos_token_id`. 
5 | """ 6 | from vllm.transformers_utils.config import try_get_generation_config 7 | from vllm.transformers_utils.tokenizer import get_tokenizer 8 | 9 | 10 | def test_get_llama3_eos_token(): 11 | model_name = "meta-llama/Meta-Llama-3-8B-Instruct" 12 | 13 | tokenizer = get_tokenizer(model_name) 14 | assert tokenizer.eos_token_id == 128009 15 | 16 | generation_config = try_get_generation_config(model_name, 17 | trust_remote_code=False) 18 | assert generation_config is not None 19 | assert generation_config.eos_token_id == [128001, 128009] 20 | 21 | 22 | def test_get_blip2_eos_token(): 23 | model_name = "Salesforce/blip2-opt-2.7b" 24 | 25 | tokenizer = get_tokenizer(model_name) 26 | assert tokenizer.eos_token_id == 2 27 | 28 | generation_config = try_get_generation_config(model_name, 29 | trust_remote_code=False) 30 | assert generation_config is not None 31 | assert generation_config.eos_token_id == 50118 32 | -------------------------------------------------------------------------------- /tests/tokenization/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from transformers import PreTrainedTokenizerBase 3 | 4 | from vllm.transformers_utils.tokenizer import get_tokenizer 5 | 6 | TOKENIZER_NAMES = [ 7 | "facebook/opt-125m", 8 | "gpt2", 9 | ] 10 | 11 | 12 | @pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES) 13 | def test_tokenizer_revision(tokenizer_name: str): 14 | # Assume that "main" branch always exists 15 | tokenizer = get_tokenizer(tokenizer_name, revision="main") 16 | assert isinstance(tokenizer, PreTrainedTokenizerBase) 17 | 18 | # Assume that "never" branch always does not exist 19 | with pytest.raises(OSError, match='not a valid git identifier'): 20 | get_tokenizer(tokenizer_name, revision="never") 21 | -------------------------------------------------------------------------------- /tests/tool_use/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/tool_use/__init__.py -------------------------------------------------------------------------------- /tests/tpu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/tpu/__init__.py -------------------------------------------------------------------------------- /tests/tpu/test_custom_dispatcher.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from vllm.config import CompilationLevel 4 | 5 | from ..utils import compare_two_settings 6 | 7 | # --enforce-eager on TPU causes graph compilation 8 | # this times out default Health Check in the MQLLMEngine, 9 | # so we set the timeout here to 30s 10 | os.environ["VLLM_RPC_TIMEOUT"] = "30000" 11 | 12 | 13 | def test_custom_dispatcher(): 14 | compare_two_settings( 15 | "google/gemma-2b", 16 | arg1=[ 17 | "--enforce-eager", 18 | f"-O{CompilationLevel.DYNAMO_ONCE}", 19 | ], 20 | arg2=["--enforce-eager", f"-O{CompilationLevel.DYNAMO_AS_IS}"], 21 | env1={}, 22 | env2={}) 23 | -------------------------------------------------------------------------------- /tests/tracing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/tracing/__init__.py 
-------------------------------------------------------------------------------- /tests/v1/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/v1/__init__.py -------------------------------------------------------------------------------- /tests/v1/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/v1/engine/__init__.py -------------------------------------------------------------------------------- /tests/vllm_test_utils/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='vllm_test_utils', 5 | version='0.1', 6 | packages=['vllm_test_utils'], 7 | ) 8 | -------------------------------------------------------------------------------- /tests/vllm_test_utils/vllm_test_utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | vllm_utils is a package for vLLM testing utilities. 3 | It does not import any vLLM modules. 4 | """ 5 | 6 | from .blame import BlameResult, blame 7 | 8 | __all__ = ["blame", "BlameResult"] 9 | -------------------------------------------------------------------------------- /tests/weight_loading/models-large.txt: -------------------------------------------------------------------------------- 1 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main 2 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main 3 | compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W8A16-quantized, main 4 | gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, main 5 | awq_marlin, casperhansen/deepseek-coder-v2-instruct-awq, main -------------------------------------------------------------------------------- /tests/weight_loading/run_model_weight_loading_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SUCCESS=0 3 | 4 | while getopts "c:" OPT; do 5 | case ${OPT} in 6 | c ) 7 | CONFIG="$OPTARG" 8 | ;; 9 | \? ) 10 | usage 11 | exit 1 12 | ;; 13 | esac 14 | done 15 | 16 | 17 | IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG" 18 | 19 | for MODEL_CONFIG in "${MODEL_CONFIGS[@]}" 20 | do 21 | LOCAL_SUCCESS=0 22 | IFS=', ' read -r -a array <<< "$MODEL_CONFIG" 23 | 24 | echo "=== RUNNING MODEL: $MODEL_CONFIG ===" 25 | 26 | export QUANTIZATION=${array[0]} 27 | export MODEL_NAME=${array[1]} 28 | export REVISION=${array[2]} 29 | pytest -s weight_loading/test_weight_loading.py || LOCAL_SUCCESS=$? 
30 | 31 | if [[ $LOCAL_SUCCESS == 0 ]]; then 32 | echo "=== PASSED MODEL: ${MODEL_CONFIG} ===" 33 | else 34 | echo "=== FAILED MODEL: ${MODEL_CONFIG} ===" 35 | fi 36 | 37 | SUCCESS=$((SUCCESS + LOCAL_SUCCESS)) 38 | 39 | done 40 | 41 | if [ "${SUCCESS}" -eq "0" ]; then 42 | exit 0 43 | else 44 | exit 1 45 | fi 46 | -------------------------------------------------------------------------------- /tests/weight_loading/test_weight_loading.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | 5 | MAX_MODEL_LEN = 1024 6 | MODEL_NAME = os.environ.get("MODEL_NAME", 7 | "robertgshaw2/zephyr-7b-beta-channelwise-gptq") 8 | REVISION = os.environ.get("REVISION", "main") 9 | QUANTIZATION = os.environ.get("QUANTIZATION", "gptq_marlin") 10 | 11 | 12 | def test_weight_loading(vllm_runner): 13 | """ 14 | Test parameter weight loading with tp>1. 15 | """ 16 | with vllm_runner(model_name=MODEL_NAME, 17 | revision=REVISION, 18 | dtype=torch.half if QUANTIZATION == "gptq" else "auto", 19 | quantization=QUANTIZATION, 20 | max_model_len=MAX_MODEL_LEN, 21 | tensor_parallel_size=2) as model: 22 | 23 | output = model.generate_greedy("Hello world!", max_tokens=20) 24 | print(output) 25 | assert output 26 | -------------------------------------------------------------------------------- /tests/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/tests/worker/__init__.py -------------------------------------------------------------------------------- /tools/actionlint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if command -v actionlint &> /dev/null; then 4 | actionlint "$@" 5 | exit 0 6 | elif [ -x ./actionlint ]; then 7 | ./actionlint "$@" 8 | exit 0 9 | fi 10 | 11 | # download a binary to the current directory - v1.7.3 12 | bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash) 13 | ./actionlint "$@" 14 | -------------------------------------------------------------------------------- /tools/check_repo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Checks whether the repo is clean and whether tags are available (necessary to correctly produce vllm version at build time) 3 | 4 | if ! git diff --quiet; then 5 | echo "Repo is dirty" >&2 6 | 7 | exit 1 8 | fi 9 | 10 | if ! git describe --tags; then 11 | echo "No tags are present. Is this a shallow clone? 
git fetch --unshallow --tags" >&2 12 | 13 | exit 1 14 | fi 15 | -------------------------------------------------------------------------------- /tools/mypy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CI=${1:-0} 4 | PYTHON_VERSION=${2:-3.9} 5 | 6 | if [ "$CI" -eq 1 ]; then 7 | set -e 8 | fi 9 | 10 | run_mypy() { 11 | echo "Running mypy on $1" 12 | if [ "$CI" -eq 1 ] && [ -z "$1" ]; then 13 | mypy --python-version "${PYTHON_VERSION}" "$@" 14 | return 15 | fi 16 | mypy --follow-imports skip --python-version "${PYTHON_VERSION}" "$@" 17 | } 18 | 19 | run_mypy # Note that this is less strict than CI 20 | run_mypy tests 21 | run_mypy vllm/attention 22 | run_mypy vllm/compilation 23 | run_mypy vllm/distributed 24 | run_mypy vllm/engine 25 | run_mypy vllm/executor 26 | run_mypy vllm/lora 27 | run_mypy vllm/model_executor 28 | run_mypy vllm/plugins 29 | run_mypy vllm/prompt_adapter 30 | run_mypy vllm/spec_decode 31 | run_mypy vllm/worker 32 | -------------------------------------------------------------------------------- /tools/png-lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Ensure that *.excalidraw.png files have the excalidraw metadata 4 | # embedded in them. This ensures they can be loaded back into 5 | # the tool and edited in the future. 6 | 7 | find . -iname '*.excalidraw.png' | while read -r file; do 8 | if git check-ignore -q "$file"; then 9 | continue 10 | fi 11 | if ! grep -q "excalidraw+json" "$file"; then 12 | echo "$file was not exported from excalidraw with 'Embed Scene' enabled." 13 | exit 1 14 | fi 15 | done 16 | -------------------------------------------------------------------------------- /tools/shellcheck.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | scversion="stable" 5 | 6 | if [ -d "shellcheck-${scversion}" ]; then 7 | export PATH="$PATH:$(pwd)/shellcheck-${scversion}" 8 | fi 9 | 10 | if ! [ -x "$(command -v shellcheck)" ]; then 11 | if [ "$(uname -s)" != "Linux" ] || [ "$(uname -m)" != "x86_64" ]; then 12 | echo "Please install shellcheck: https://github.com/koalaman/shellcheck?tab=readme-ov-file#installing" 13 | exit 1 14 | fi 15 | 16 | # automatic local install if linux x86_64 17 | wget -qO- "https://github.com/koalaman/shellcheck/releases/download/${scversion?}/shellcheck-${scversion?}.linux.x86_64.tar.xz" | tar -xJv 18 | export PATH="$PATH:$(pwd)/shellcheck-${scversion}" 19 | fi 20 | 21 | # TODO - fix warnings in .buildkite/run-amd-test.sh 22 | find . 
-name "*.sh" -not -path "./.buildkite/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck "{}"' 23 | -------------------------------------------------------------------------------- /tools/sphinx-lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sphinx-lint --disable trailing-whitespace,missing-final-newline docs 4 | -------------------------------------------------------------------------------- /use_existing_torch.py: -------------------------------------------------------------------------------- 1 | import glob 2 | 3 | requires_files = glob.glob('requirements*.txt') 4 | requires_files += ["pyproject.toml"] 5 | for file in requires_files: 6 | print(f">>> cleaning {file}") 7 | with open(file) as f: 8 | lines = f.readlines() 9 | if "torch" in "".join(lines).lower(): 10 | print("removed:") 11 | with open(file, 'w') as f: 12 | for line in lines: 13 | if 'torch' not in line.lower(): 14 | f.write(line) 15 | else: 16 | print(line.strip()) 17 | print(f"<<< done cleaning {file}") 18 | print() 19 | -------------------------------------------------------------------------------- /vllm/adapter_commons/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/adapter_commons/__init__.py -------------------------------------------------------------------------------- /vllm/adapter_commons/layers.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Tuple 3 | 4 | 5 | @dataclass 6 | class AdapterMapping: 7 | # Per every token in input_ids: 8 | index_mapping: Tuple[int, ...] 9 | # Per sampled token: 10 | prompt_mapping: Tuple[int, ...] 11 | 12 | def __post_init__(self): 13 | self.index_mapping = tuple(self.index_mapping) 14 | self.prompt_mapping = tuple(self.prompt_mapping) -------------------------------------------------------------------------------- /vllm/adapter_commons/request.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class AdapterRequest(ABC): 5 | """ 6 | Base class for adapter requests. 
7 | """ 8 | 9 | @property 10 | @abstractmethod 11 | def adapter_id(self) -> int: 12 | raise NotImplementedError 13 | 14 | def __post_init__(self) -> None: 15 | if self.adapter_id < 1: 16 | raise ValueError(f"id must be > 0, got {self.adapter_id}") 17 | 18 | def __eq__(self, value: object) -> bool: 19 | return isinstance( 20 | value, self.__class__) and self.adapter_id == value.adapter_id 21 | 22 | def __hash__(self) -> int: 23 | return hash(self.adapter_id) 24 | -------------------------------------------------------------------------------- /vllm/adapter_commons/worker_manager.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Optional, Set 3 | 4 | import torch 5 | 6 | 7 | class AbstractWorkerManager(ABC): 8 | 9 | def __init__(self, device: torch.device): 10 | self.device = device 11 | 12 | @property 13 | @abstractmethod 14 | def is_enabled(self) -> bool: 15 | raise NotImplementedError 16 | 17 | @abstractmethod 18 | def set_active_adapters(self, requests: Set[Any], 19 | mapping: Optional[Any]) -> None: 20 | raise NotImplementedError 21 | 22 | @abstractmethod 23 | def add_adapter(self, adapter_request: Any) -> bool: 24 | raise NotImplementedError 25 | 26 | @abstractmethod 27 | def remove_adapter(self, adapter_id: int) -> bool: 28 | raise NotImplementedError 29 | 30 | @abstractmethod 31 | def remove_all_adapters(self) -> None: 32 | raise NotImplementedError 33 | 34 | @abstractmethod 35 | def list_adapters(self) -> Set[int]: 36 | raise NotImplementedError 37 | -------------------------------------------------------------------------------- /vllm/assets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/assets/__init__.py -------------------------------------------------------------------------------- /vllm/assets/audio.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Literal, Tuple 3 | from urllib.parse import urljoin 4 | 5 | import librosa 6 | import numpy as np 7 | 8 | from vllm.assets.base import get_vllm_public_assets, vLLM_S3_BUCKET_URL 9 | 10 | ASSET_DIR = "multimodal_asset" 11 | 12 | 13 | @dataclass(frozen=True) 14 | class AudioAsset: 15 | name: Literal["winning_call", "mary_had_lamb"] 16 | 17 | @property 18 | def audio_and_sample_rate(self) -> Tuple[np.ndarray, int]: 19 | 20 | audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg", 21 | s3_prefix=ASSET_DIR) 22 | y, sr = librosa.load(audio_path, sr=None) 23 | assert isinstance(sr, int) 24 | return y, sr 25 | 26 | @property 27 | def url(self) -> str: 28 | return urljoin(vLLM_S3_BUCKET_URL, f"{ASSET_DIR}/{self.name}.ogg") 29 | -------------------------------------------------------------------------------- /vllm/assets/image.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Literal 3 | 4 | import torch 5 | from PIL import Image 6 | 7 | from vllm.assets.base import get_vllm_public_assets 8 | 9 | VLM_IMAGES_DIR = "vision_model_images" 10 | 11 | 12 | @dataclass(frozen=True) 13 | class ImageAsset: 14 | name: Literal["stop_sign", "cherry_blossom"] 15 | 16 | @property 17 | def pil_image(self) -> Image.Image: 18 | 19 | image_path = get_vllm_public_assets(filename=f"{self.name}.jpg", 20 | 
s3_prefix=VLM_IMAGES_DIR) 21 | return Image.open(image_path) 22 | 23 | @property 24 | def image_embeds(self) -> torch.Tensor: 25 | """ 26 | Image embeddings, only used for testing purposes with llava 1.5. 27 | """ 28 | image_path = get_vllm_public_assets(filename=f"{self.name}.pt", 29 | s3_prefix=VLM_IMAGES_DIR) 30 | return torch.load(image_path, map_location="cpu") 31 | -------------------------------------------------------------------------------- /vllm/attention/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.attention.backends.abstract import (AttentionBackend, 2 | AttentionMetadata, 3 | AttentionMetadataBuilder, 4 | AttentionState, AttentionType) 5 | from vllm.attention.layer import Attention 6 | from vllm.attention.selector import get_attn_backend 7 | 8 | __all__ = [ 9 | "Attention", 10 | "AttentionBackend", 11 | "AttentionMetadata", 12 | "AttentionType", 13 | "AttentionMetadataBuilder", 14 | "Attention", 15 | "AttentionState", 16 | "get_attn_backend", 17 | ] 18 | -------------------------------------------------------------------------------- /vllm/attention/backends/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/attention/backends/__init__.py -------------------------------------------------------------------------------- /vllm/attention/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/attention/ops/__init__.py -------------------------------------------------------------------------------- /vllm/attention/ops/blocksparse_attention/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/attention/ops/blocksparse_attention/__init__.py -------------------------------------------------------------------------------- /vllm/compilation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/compilation/__init__.py -------------------------------------------------------------------------------- /vllm/compilation/compile_context.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | from typing import Any 3 | 4 | _compile_context: Any = None 5 | 6 | 7 | def get_compile_context() -> Any: 8 | """Get the current compile context.""" 9 | return _compile_context 10 | 11 | 12 | @contextmanager 13 | def set_compile_context(context: Any): 14 | """A context manager that stores the current compile context, 15 | usually it is a list of sizes to specialize. 
16 | """ 17 | global _compile_context 18 | prev_context = _compile_context 19 | _compile_context = context 20 | try: 21 | yield 22 | finally: 23 | _compile_context = prev_context 24 | -------------------------------------------------------------------------------- /vllm/compilation/counter.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import dataclasses 3 | from contextlib import contextmanager 4 | 5 | 6 | @dataclasses.dataclass 7 | class CompilationCounter: 8 | num_models_seen: int = 0 9 | num_graphs_seen: int = 0 10 | # including the splitting ops 11 | num_piecewise_graphs_seen: int = 0 12 | # not including the splitting ops 13 | num_piecewise_capturable_graphs_seen: int = 0 14 | num_inductor_compilations: int = 0 15 | num_cudagraph_caputured: int = 0 16 | 17 | def clone(self) -> "CompilationCounter": 18 | return copy.deepcopy(self) 19 | 20 | @contextmanager 21 | def expect(self, **kwargs): 22 | old = self.clone() 23 | yield 24 | for k, v in kwargs.items(): 25 | assert getattr(self, k) - getattr(old, k) == v, ( 26 | f"{k} not as expected, before it is {getattr(old, k)}" 27 | f", after it is {getattr(self, k)}, " 28 | f"expected diff is {v}") 29 | 30 | 31 | compilation_counter = CompilationCounter() 32 | -------------------------------------------------------------------------------- /vllm/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/core/__init__.py -------------------------------------------------------------------------------- /vllm/core/block/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/core/block/__init__.py -------------------------------------------------------------------------------- /vllm/core/block/utils.py: -------------------------------------------------------------------------------- 1 | """Block manager utils.""" 2 | from vllm.sequence import SequenceGroup 3 | from vllm.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, 4 | STR_NOT_IMPL_ENC_DEC_SWA) 5 | 6 | 7 | def check_no_caching_or_swa_for_blockmgr_encdec( 8 | block_mgr, seq_group: SequenceGroup) -> None: 9 | ''' 10 | Enforce that prefix caching & sliding-window attention (SWA) 11 | are currently unsupported *specifically* for encoder/decoder models. 12 | 13 | Raises NotImplementedError if unsupported scenario is detected. 
14 | 15 | Arguments: 16 | 17 | * block_mgr: BlockSpaceManager instance 18 | * seq_group: SequenceGroup passed to block_mgr 19 | ''' 20 | 21 | if seq_group.is_encoder_decoder(): 22 | if block_mgr.max_block_sliding_window is not None: 23 | raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) 24 | 25 | if block_mgr.enable_caching: 26 | raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) 27 | -------------------------------------------------------------------------------- /vllm/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | from .communication_op import * 2 | from .parallel_state import * 3 | from .utils import * 4 | -------------------------------------------------------------------------------- /vllm/distributed/device_communicators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/distributed/device_communicators/__init__.py -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/distributed/kv_transfer/__init__.py -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/kv_connector/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/distributed/kv_transfer/kv_connector/__init__.py -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/kv_connector/factory.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from .base import KVConnectorBase 4 | 5 | if TYPE_CHECKING: 6 | from vllm.config import VllmConfig 7 | 8 | 9 | class KVConnectorFactory: 10 | 11 | @staticmethod 12 | def create_connector(rank: int, local_rank: int, 13 | config: "VllmConfig") -> KVConnectorBase: 14 | if config.kv_transfer_config.kv_connector == 'PyNcclConnector': 15 | from .simple_connector import SimpleConnector 16 | return SimpleConnector(rank, local_rank, config) 17 | else: 18 | raise ValueError(f"Unsupported connector type: " 19 | f"{config.kv_connector}") 20 | -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/kv_pipe/__init__.py: -------------------------------------------------------------------------------- 
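Aside: KVConnectorFactory above dispatches on a single hard-coded connector name behind an if/else. A registry-keyed factory is a common alternative shape for this kind of dispatch; the sketch below illustrates that general pattern only and is not a proposal for vLLM's factory.

```python
# Generic registry-based factory sketch (illustration of the pattern only).
# Connector classes register themselves under a string key and are looked up
# at creation time instead of being hard-coded in an if/else chain.
from typing import Callable, Dict, Type


class ConnectorRegistry:
    _registry: Dict[str, Type] = {}

    @classmethod
    def register(cls, name: str) -> Callable[[Type], Type]:
        def wrap(connector_cls: Type) -> Type:
            cls._registry[name] = connector_cls
            return connector_cls
        return wrap

    @classmethod
    def create(cls, name: str, *args, **kwargs):
        try:
            return cls._registry[name](*args, **kwargs)
        except KeyError:
            raise ValueError(f"Unsupported connector type: {name}") from None


@ConnectorRegistry.register("PyNcclConnector")
class DummyConnector:
    def __init__(self, rank: int, local_rank: int):
        self.rank, self.local_rank = rank, local_rank


connector = ConnectorRegistry.create("PyNcclConnector", rank=0, local_rank=0)
assert isinstance(connector, DummyConnector)
```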
https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/distributed/kv_transfer/kv_pipe/__init__.py -------------------------------------------------------------------------------- /vllm/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/engine/__init__.py -------------------------------------------------------------------------------- /vllm/engine/output_processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/engine/output_processor/__init__.py -------------------------------------------------------------------------------- /vllm/engine/output_processor/util.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from typing import Sequence as GenericSequence 3 | from typing import cast 4 | 5 | from vllm.model_executor.layers.sampler import SamplerOutput 6 | from vllm.sequence import CompletionSequenceGroupOutput, SequenceGroupOutput 7 | 8 | 9 | def create_output_by_sequence_group( 10 | outputs: GenericSequence[SamplerOutput], 11 | num_seq_groups: int) -> List[List[SequenceGroupOutput]]: 12 | """Helper method which transforms a 2d list organized by 13 | [step][sequence group] into [sequence group][step]. 14 | """ 15 | output_by_sequence_group: List[List[CompletionSequenceGroupOutput]] = [ 16 | [] for _ in range(num_seq_groups) 17 | ] 18 | for step in outputs: 19 | sequence_group_output: CompletionSequenceGroupOutput 20 | for i, sequence_group_output in enumerate(step): 21 | output_by_sequence_group[i].append(sequence_group_output) 22 | 23 | # Cast to the more generic type that CompletionSequenceGroupOutput 24 | # inherits from. 
25 | return cast(List[List[SequenceGroupOutput]], output_by_sequence_group) 26 | -------------------------------------------------------------------------------- /vllm/entrypoints/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/entrypoints/__init__.py -------------------------------------------------------------------------------- /vllm/entrypoints/openai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/entrypoints/openai/__init__.py -------------------------------------------------------------------------------- /vllm/entrypoints/openai/tool_parsers/__init__.py: -------------------------------------------------------------------------------- 1 | from .abstract_tool_parser import ToolParser, ToolParserManager 2 | from .granite_20b_fc_tool_parser import Granite20bFCToolParser 3 | from .granite_tool_parser import GraniteToolParser 4 | from .hermes_tool_parser import Hermes2ProToolParser 5 | from .internlm2_tool_parser import Internlm2ToolParser 6 | from .jamba_tool_parser import JambaToolParser 7 | from .llama_tool_parser import Llama3JsonToolParser 8 | from .mistral_tool_parser import MistralToolParser 9 | from .pythonic_tool_parser import PythonicToolParser 10 | 11 | __all__ = [ 12 | "ToolParser", "ToolParserManager", "Granite20bFCToolParser", 13 | "GraniteToolParser", "Hermes2ProToolParser", "MistralToolParser", 14 | "Internlm2ToolParser", "Llama3JsonToolParser", "JambaToolParser", 15 | "PythonicToolParser" 16 | ] 17 | -------------------------------------------------------------------------------- /vllm/executor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/executor/__init__.py -------------------------------------------------------------------------------- /vllm/executor/msgspec_utils.py: -------------------------------------------------------------------------------- 1 | from array import array 2 | from typing import Any, Type 3 | 4 | from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE 5 | 6 | 7 | def encode_hook(obj: Any) -> Any: 8 | """Custom msgspec enc hook that supports array types. 9 | 10 | See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder 11 | """ 12 | if isinstance(obj, array): 13 | assert obj.typecode == VLLM_TOKEN_ID_ARRAY_TYPE, ( 14 | f"vLLM array type should use '{VLLM_TOKEN_ID_ARRAY_TYPE}' type. " 15 | f"Given array has a type code of {obj.typecode}.") 16 | return obj.tobytes() 17 | 18 | 19 | def decode_hook(type: Type, obj: Any) -> Any: 20 | """Custom msgspec dec hook that supports array types. 
21 | 22 | See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder 23 | """ 24 | if type is array: 25 | deserialized = array(VLLM_TOKEN_ID_ARRAY_TYPE) 26 | deserialized.frombytes(obj) 27 | return deserialized 28 | -------------------------------------------------------------------------------- /vllm/executor/multiproc_xpu_executor.py: -------------------------------------------------------------------------------- 1 | import vllm.envs as envs 2 | from vllm.executor.multiproc_gpu_executor import ( 3 | MultiprocessingGPUExecutor, MultiprocessingGPUExecutorAsync) 4 | from vllm.executor.xpu_executor import XPUExecutor 5 | from vllm.logger import init_logger 6 | from vllm.utils import make_async 7 | 8 | logger = init_logger(__name__) 9 | 10 | 11 | class MultiprocessingXPUExecutor(MultiprocessingGPUExecutor, XPUExecutor): 12 | """Python multiprocessing-based multi-XPU executor""" 13 | 14 | def _check_executor_parameters(self): 15 | mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD 16 | if mp_method != "spawn": 17 | raise RuntimeError( 18 | "XPU multiprocess executor only support spawn as mp method") 19 | 20 | 21 | class MultiprocessingXPUExecutorAsync(MultiprocessingXPUExecutor, 22 | MultiprocessingGPUExecutorAsync): 23 | 24 | def __init__(self, *args, **kwargs): 25 | super().__init__(*args, **kwargs) 26 | self.driver_exec_model = make_async(self.driver_worker.execute_model) 27 | -------------------------------------------------------------------------------- /vllm/forward_context.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | from dataclasses import dataclass 3 | from typing import Any, Dict, Optional 4 | 5 | from vllm.config import VllmConfig 6 | 7 | 8 | @dataclass 9 | class ForwardContext: 10 | static_forward_context: Dict[str, Any] 11 | # TODO: extend to support per-layer dynamic forward context 12 | dynamic_forward_context: Any 13 | 14 | 15 | _forward_context: Optional[ForwardContext] = None 16 | 17 | 18 | def get_forward_context() -> ForwardContext: 19 | """Get the current forward context.""" 20 | assert _forward_context is not None, ( 21 | "Forward context is not set. " 22 | "Please use `set_forward_context` to set the forward context.") 23 | return _forward_context 24 | 25 | 26 | @contextmanager 27 | def set_forward_context(context: Any, vllm_config: VllmConfig): 28 | """A context manager that stores the current forward context, 29 | can be attention metadata, etc.""" 30 | global _forward_context 31 | prev_context = _forward_context 32 | _forward_context = ForwardContext( 33 | static_forward_context=vllm_config.compilation_config. 
34 | static_forward_context, 35 | dynamic_forward_context=context) 36 | try: 37 | yield 38 | finally: 39 | _forward_context = prev_context 40 | -------------------------------------------------------------------------------- /vllm/logging_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.logging_utils.formatter import NewLineFormatter 2 | 3 | __all__ = [ 4 | "NewLineFormatter", 5 | ] 6 | -------------------------------------------------------------------------------- /vllm/logging_utils/formatter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | class NewLineFormatter(logging.Formatter): 5 | """Adds logging prefix to newlines to align multi-line messages.""" 6 | 7 | def __init__(self, fmt, datefmt=None, style="%"): 8 | logging.Formatter.__init__(self, fmt, datefmt, style) 9 | 10 | def format(self, record): 11 | msg = logging.Formatter.format(self, record) 12 | if record.message != "": 13 | parts = msg.split(record.message) 14 | msg = msg.replace("\n", "\r\n" + parts[0]) 15 | return msg 16 | -------------------------------------------------------------------------------- /vllm/lora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/lora/__init__.py -------------------------------------------------------------------------------- /vllm/lora/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/lora/ops/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.parameter import (BasevLLMParameter, 2 | PackedvLLMParameter) 3 | from vllm.model_executor.sampling_metadata import (SamplingMetadata, 4 | SamplingMetadataCache) 5 | from vllm.model_executor.utils import set_random_seed 6 | 7 | __all__ = [ 8 | "SamplingMetadata", 9 | "SamplingMetadataCache", 10 | "set_random_seed", 11 | "BasevLLMParameter", 12 | "PackedvLLMParameter", 13 | ] 14 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/model_executor/layers/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/fused_moe/configs/README: -------------------------------------------------------------------------------- 1 | This directory contains tuned configurations for different settings of the fused_moe kernel. 2 | For different settings of 3 | - E (number of experts) 4 | - N (intermediate size) 5 | - device_name (torch.cuda.get_device_name()) 6 | the JSON file contains a mapping from M (batch size) to the chosen configuration. 7 | 8 | The example configurations provided are for the Mixtral model for TP2 on H100 9 | and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have 10 | N = 7168 and for TP4 we have N = 3584. 
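As a concrete illustration of how a consumer of these tuned configurations might resolve one at runtime, here is a minimal Python sketch. The file-name pattern, the JSON keys, and the nearest-M selection shown are assumptions made for illustration only, not the exact schema or lookup logic used by the fused_moe kernel.

# Illustrative sketch only: real file names, keys, and selection logic may differ.
import json
import os
from typing import Any, Dict

def load_fused_moe_config(E: int, N: int, device_name: str,
                          config_dir: str) -> Dict[int, Any]:
    # Hypothetical file-name pattern: one JSON file per (E, N, device) tuple.
    fname = f"E={E},N={N},device_name={device_name}.json"
    with open(os.path.join(config_dir, fname)) as f:
        # The file maps batch size M (string keys) to a chosen kernel config.
        return {int(m): cfg for m, cfg in json.load(f).items()}

def pick_config(configs: Dict[int, Any], M: int) -> Any:
    # Pick the tuned entry whose batch size is closest to the requested M.
    return configs[min(configs, key=lambda m: abs(m - M))]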
11 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/mamba/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/model_executor/layers/mamba/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/mamba/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/model_executor/layers/mamba/ops/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/compressed_tensors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py: -------------------------------------------------------------------------------- 1 | from .compressed_tensors_scheme import CompressedTensorsScheme 2 | from .compressed_tensors_w4a16_24 import (W4A16SPARSE24_SUPPORTED_BITS, 3 | CompressedTensorsW4A16Sparse24) 4 | from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8 5 | from .compressed_tensors_w8a8_int8 import CompressedTensorsW8A8Int8 6 | from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8 7 | from .compressed_tensors_wNa16 import (WNA16_SUPPORTED_BITS, 8 | CompressedTensorsWNA16) 9 | 10 | __all__ = [ 11 | "CompressedTensorsScheme", 12 | "CompressedTensorsWNA16", 13 | "CompressedTensorsW8A16Fp8", 14 | "CompressedTensorsW4A16Sparse24", 15 | "CompressedTensorsW8A8Int8", 16 | "CompressedTensorsW8A8Fp8", 17 | "WNA16_SUPPORTED_BITS", 18 | "W4A16SPARSE24_SUPPORTED_BITS", 19 | ] 20 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .layer_utils import replace_parameter, update_tensor_inplace 2 | 3 | __all__ = ['update_tensor_inplace', 'replace_parameter'] 4 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/machete_utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple 2 | 3 | import torch 4 | 5 | from vllm.scalar_type import ScalarType, scalar_types 6 | 7 | MACHETE_SUPPORTED_GROUP_SIZES = [-1, 128] 8 | MACHETE_PREPACKED_BLOCK_SHAPE = [64, 128] 9 | 10 | 11 | def query_machete_supported_quant_types(zero_points: bool) -> List[ScalarType]: 12 | if zero_points: 13 | return [scalar_types.uint4, scalar_types.uint8] 14 | else: 15 | return [scalar_types.uint4b8, scalar_types.uint8b128] 16 | 17 | 18 | def query_machete_supported_act_types(zero_points: bool) -> List[ScalarType]: 19 | return [torch.float16, torch.bfloat16] 20 | 21 | 22 | def check_machete_supports_shape(in_features: int, out_featrues: int) \ 23 | -> Tuple[bool, Optional[str]]: 24 | if in_features % MACHETE_PREPACKED_BLOCK_SHAPE[0] != 0: 25 | return False, "Input features size must be 
divisible by "\ 26 | f"{MACHETE_PREPACKED_BLOCK_SHAPE[0]}" 27 | if out_featrues % MACHETE_PREPACKED_BLOCK_SHAPE[1] != 0: 28 | return False, "Output features size must be divisible by "\ 29 | f"{MACHETE_PREPACKED_BLOCK_SHAPE[1]}" 30 | return True, None 31 | -------------------------------------------------------------------------------- /vllm/model_executor/model_loader/__init__.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from vllm.config import VllmConfig 4 | from vllm.model_executor.model_loader.loader import (BaseModelLoader, 5 | get_model_loader) 6 | from vllm.model_executor.model_loader.utils import ( 7 | get_architecture_class_name, get_model_architecture) 8 | 9 | 10 | def get_model(*, vllm_config: VllmConfig) -> nn.Module: 11 | loader = get_model_loader(vllm_config.load_config) 12 | return loader.load_model(vllm_config=vllm_config) 13 | 14 | 15 | __all__ = [ 16 | "get_model", "get_model_loader", "BaseModelLoader", 17 | "get_architecture_class_name", "get_model_architecture" 18 | ] 19 | -------------------------------------------------------------------------------- /vllm/model_executor/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .interfaces import (HasInnerState, SupportsLoRA, SupportsMultiModal, 2 | SupportsPP, has_inner_state, supports_lora, 3 | supports_multimodal, supports_pp) 4 | from .interfaces_base import (VllmModelForPooling, VllmModelForTextGeneration, 5 | is_pooling_model, is_text_generation_model) 6 | from .registry import ModelRegistry 7 | 8 | __all__ = [ 9 | "ModelRegistry", 10 | "VllmModelForPooling", 11 | "is_pooling_model", 12 | "VllmModelForTextGeneration", 13 | "is_text_generation_model", 14 | "HasInnerState", 15 | "has_inner_state", 16 | "SupportsLoRA", 17 | "supports_lora", 18 | "SupportsMultiModal", 19 | "supports_multimodal", 20 | "SupportsPP", 21 | "supports_pp", 22 | ] 23 | -------------------------------------------------------------------------------- /vllm/model_executor/models/glm.py: -------------------------------------------------------------------------------- 1 | """Inference-only HF format GLM-4 model compatible with THUDM weights.""" 2 | from vllm.config import VllmConfig 3 | from vllm.model_executor.models.llama import LlamaForCausalLM 4 | 5 | from .utils import PPMissingLayer 6 | 7 | 8 | class GlmForCausalLM(LlamaForCausalLM): 9 | 10 | def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): 11 | super().__init__(vllm_config=vllm_config, prefix=prefix) 12 | # Hack Llama model to fit HF format GLM implementation 13 | # Attention difference between GLM and Llama: 14 | # 1. Half partial rotary_dim and no Neox style. 15 | # 2. 
There is no bias for o_proj in attention 16 | for layer in self.model.layers: 17 | if not isinstance(layer, PPMissingLayer): 18 | layer.self_attn.rotary_emb.rotary_dim //= 2 19 | layer.self_attn.rotary_emb.is_neox_style = False 20 | layer.self_attn.o_proj.bias = None 21 | layer.self_attn.o_proj.skip_bias_add = True 22 | -------------------------------------------------------------------------------- /vllm/model_executor/models/phi3.py: -------------------------------------------------------------------------------- 1 | # Adapted from llama.py 2 | """Inference-only Phi3 model code inherit from Llama.py""" 3 | 4 | from vllm.model_executor.models.llama import LlamaForCausalLM 5 | 6 | 7 | class Phi3ForCausalLM(LlamaForCausalLM): 8 | 9 | packed_modules_mapping = { 10 | "qkv_proj": [ 11 | "qkv_proj", 12 | ], 13 | "gate_up_proj": [ 14 | "gate_up_proj", 15 | ], 16 | } 17 | 18 | # BitandBytes specific attributes 19 | # Initialize an empty dict when there is no stacked parameter mapping. 20 | bitsandbytes_stacked_params_mapping = {} 21 | -------------------------------------------------------------------------------- /vllm/multimodal/audio.py: -------------------------------------------------------------------------------- 1 | from vllm.inputs.registry import InputContext 2 | 3 | from .base import MultiModalPlugin 4 | from .inputs import AudioItem, MultiModalData, MultiModalKwargs 5 | 6 | 7 | class AudioPlugin(MultiModalPlugin): 8 | """Plugin for audio data.""" 9 | 10 | def get_data_key(self) -> str: 11 | return "audio" 12 | 13 | def _default_input_mapper( 14 | self, 15 | ctx: InputContext, 16 | data: MultiModalData[AudioItem], 17 | **mm_processor_kwargs, 18 | ) -> MultiModalKwargs: 19 | raise NotImplementedError("There is no default audio input mapper") 20 | 21 | def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: 22 | raise NotImplementedError( 23 | "There is no default maximum multimodal tokens") 24 | -------------------------------------------------------------------------------- /vllm/platforms/neuron.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from .interface import Platform, PlatformEnum 4 | 5 | if TYPE_CHECKING: 6 | from vllm.config import VllmConfig 7 | else: 8 | VllmConfig = None 9 | 10 | 11 | class NeuronPlatform(Platform): 12 | _enum = PlatformEnum.NEURON 13 | device_name: str = "neuron" 14 | device_type: str = "neuron" 15 | supported_quantization: list[str] = ["neuron_quant"] 16 | 17 | @classmethod 18 | def get_device_name(cls, device_id: int = 0) -> str: 19 | return "neuron" 20 | 21 | @classmethod 22 | def check_and_update_config(cls, vllm_config: VllmConfig) -> None: 23 | parallel_config = vllm_config.parallel_config 24 | if parallel_config.worker_cls == "auto": 25 | parallel_config.worker_cls = \ 26 | "vllm.worker.neuron_worker.NeuronWorker" 27 | -------------------------------------------------------------------------------- /vllm/pooling_params.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional 2 | 3 | import msgspec 4 | 5 | 6 | class PoolingParams( 7 | msgspec.Struct, 8 | omit_defaults=True, # type: ignore[call-arg] 9 | array_like=True): # type: ignore[call-arg] 10 | """Pooling parameters for embeddings API. 11 | 12 | Attributes: 13 | additional_data: Any additional data needed for pooling. 
14 | """ 15 | additional_data: Optional[Any] = None 16 | 17 | def clone(self) -> "PoolingParams": 18 | """Returns a deep copy of the PoolingParams instance.""" 19 | return PoolingParams(additional_data=self.additional_data) 20 | 21 | def __repr__(self) -> str: 22 | return (f"PoolingParams(" 23 | f"additional_metadata={self.additional_data})") 24 | -------------------------------------------------------------------------------- /vllm/profiler/__init__.py: -------------------------------------------------------------------------------- 1 | from .layerwise_profile import layerwise_profile 2 | 3 | __all__ = [ 4 | "layerwise_profile", 5 | ] 6 | -------------------------------------------------------------------------------- /vllm/prompt_adapter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/prompt_adapter/__init__.py -------------------------------------------------------------------------------- /vllm/prompt_adapter/request.py: -------------------------------------------------------------------------------- 1 | import msgspec 2 | 3 | from vllm.adapter_commons.request import AdapterRequest 4 | 5 | 6 | class PromptAdapterRequest( 7 | msgspec.Struct, 8 | array_like=True, # type: ignore[call-arg] 9 | omit_defaults=True, # type: ignore[call-arg] 10 | frozen=True): # type: ignore[call-arg] 11 | """ 12 | Request for a Prompt adapter. 13 | """ 14 | __metaclass__ = AdapterRequest 15 | 16 | prompt_adapter_name: str 17 | prompt_adapter_id: int 18 | prompt_adapter_local_path: str 19 | prompt_adapter_num_virtual_tokens: int 20 | 21 | def __hash__(self): 22 | return super().__hash__() 23 | 24 | @property 25 | def adapter_id(self): 26 | return self.prompt_adapter_id 27 | 28 | @property 29 | def name(self): 30 | return self.prompt_adapter_name 31 | 32 | @property 33 | def local_path(self): 34 | return self.prompt_adapter_local_path 35 | -------------------------------------------------------------------------------- /vllm/py.typed: -------------------------------------------------------------------------------- 1 | # Marker file for PEP 561. 2 | # The vllm package uses inline types. 3 | -------------------------------------------------------------------------------- /vllm/spec_decode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/spec_decode/__init__.py -------------------------------------------------------------------------------- /vllm/transformers_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.envs import VLLM_USE_MODELSCOPE 2 | 3 | if VLLM_USE_MODELSCOPE: 4 | # Patch here, before each import happens 5 | import modelscope 6 | from packaging import version 7 | 8 | # patch_hub begins from modelscope>=1.18.1 9 | if version.parse(modelscope.__version__) <= version.parse('1.18.0'): 10 | raise ImportError( 11 | 'Using vLLM with ModelScope needs modelscope>=1.18.1, please ' 12 | 'install by `pip install modelscope -U`') 13 | 14 | from modelscope.utils.hf_util import patch_hub 15 | 16 | # Patch hub to download models from modelscope to speed up. 
17 | patch_hub() 18 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/h2ovl.py: -------------------------------------------------------------------------------- 1 | # Adapted from 2 | # https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/configuration_h2ovl_chat.py 3 | # -------------------------------------------------------- 4 | # H2OVL-Mississippi 5 | # Copyright (c) 2024 H2O.AI 6 | # Licensed under Apache 2.0 License [see LICENSE for details] 7 | # -------------------------------------------------------- 8 | 9 | from .internvl import InternVLChatConfig 10 | 11 | 12 | class H2OVLChatConfig(InternVLChatConfig): 13 | model_type = "h2ovl_chat" 14 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/mllama.py: -------------------------------------------------------------------------------- 1 | from transformers.models.mllama import configuration_mllama as mllama_hf_config 2 | 3 | 4 | class MllamaTextConfig(mllama_hf_config.MllamaTextConfig): 5 | ''' 6 | Use this class to override is_encoder_decoder: 7 | - transformers regards mllama as is_encoder_decoder=False 8 | - vllm needs is_encoder_decoder=True to enable cross-attention 9 | ''' 10 | 11 | def __init__( 12 | self, 13 | **kwargs, 14 | ): 15 | super().__init__(**kwargs) 16 | self.is_encoder_decoder = True 17 | 18 | 19 | class MllamaConfig(mllama_hf_config.MllamaConfig): 20 | 21 | def __init__( 22 | self, 23 | text_config=None, 24 | **kwargs, 25 | ): 26 | if isinstance(text_config, dict): 27 | text_config = MllamaTextConfig(**text_config) 28 | super().__init__(text_config=text_config, **kwargs) 29 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/nvlm_d.py: -------------------------------------------------------------------------------- 1 | # Adapted from 2 | # https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py 3 | # -------------------------------------------------------- 4 | # NVLM-D 5 | # Copyright (c) 2024 NVIDIA 6 | # Licensed under Apache 2.0 License [see LICENSE for details] 7 | # -------------------------------------------------------- 8 | from .internvl import InternVLChatConfig 9 | 10 | 11 | class NVLM_D_Config(InternVLChatConfig): 12 | model_type = 'NVLM_D' 13 | -------------------------------------------------------------------------------- /vllm/transformers_utils/tokenizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .mistral import MistralTokenizer, maybe_serialize_tool_calls 2 | 3 | __all__ = ["MistralTokenizer", "maybe_serialize_tool_calls"] 4 | -------------------------------------------------------------------------------- /vllm/transformers_utils/utils.py: -------------------------------------------------------------------------------- 1 | from os import PathLike 2 | from pathlib import Path 3 | from typing import Union 4 | 5 | 6 | def check_gguf_file(model: Union[str, PathLike]) -> bool: 7 | """Check if the file is a GGUF model.""" 8 | model = Path(model) 9 | if not model.is_file(): 10 | return False 11 | elif model.suffix == ".gguf": 12 | return True 13 | 14 | with open(model, "rb") as f: 15 | header = f.read(4) 16 | return header == b"GGUF" 17 | -------------------------------------------------------------------------------- /vllm/triton_utils/__init__.py: -------------------------------------------------------------------------------- 
1 | from vllm.triton_utils.importing import HAS_TRITON 2 | 3 | __all__ = ["HAS_TRITON"] 4 | 5 | if HAS_TRITON: 6 | 7 | from vllm.triton_utils.custom_cache_manager import ( 8 | maybe_set_triton_cache_manager) 9 | 10 | __all__ += ["maybe_set_triton_cache_manager"] 11 | -------------------------------------------------------------------------------- /vllm/triton_utils/importing.py: -------------------------------------------------------------------------------- 1 | from importlib.util import find_spec 2 | 3 | from vllm.logger import init_logger 4 | from vllm.platforms import current_platform 5 | 6 | logger = init_logger(__name__) 7 | 8 | HAS_TRITON = ( 9 | find_spec("triton") is not None 10 | and not current_platform.is_xpu() # Not compatible 11 | and not current_platform.is_neuron() # neuron has too old torch 12 | ) 13 | 14 | if not HAS_TRITON: 15 | logger.info("Triton not installed or not compatible; certain GPU-related" 16 | " functions will not be available.") 17 | -------------------------------------------------------------------------------- /vllm/usage/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/usage/__init__.py -------------------------------------------------------------------------------- /vllm/v1/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/v1/__init__.py -------------------------------------------------------------------------------- /vllm/v1/attention/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/v1/attention/__init__.py -------------------------------------------------------------------------------- /vllm/v1/attention/backends/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/v1/attention/backends/__init__.py -------------------------------------------------------------------------------- /vllm/v1/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/v1/core/__init__.py -------------------------------------------------------------------------------- /vllm/v1/executor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/v1/executor/__init__.py -------------------------------------------------------------------------------- /vllm/v1/outputs.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Dict, List, Optional 3 | 4 | import torch 5 | 6 | 7 | @dataclass 8 | class SamplerOutput: 9 | 10 | # [num_reqs] 11 | sampled_token_ids: torch.Tensor 12 | 13 | # [num_reqs, max_num_logprobs + 1] 14 | logprob_token_ids: Optional[torch.Tensor] 15 | # [num_reqs, max_num_logprobs + 1] 16 | logprobs: Optional[torch.Tensor] 17 | 18 | # TODO: Support prompt logprobs. 
19 | prompt_logprob_token_ids: Optional[torch.Tensor] 20 | prompt_logprobs: Optional[torch.Tensor] 21 | 22 | 23 | @dataclass 24 | class ModelRunnerOutput: 25 | 26 | # [num_reqs] 27 | req_ids: List[str] 28 | # req_id -> index 29 | req_id_to_index: Dict[str, int] 30 | 31 | # [num_reqs] 32 | sampled_token_ids_cpu: torch.Tensor 33 | 34 | # [num_reqs, max_num_logprobs + 1] 35 | logprob_token_ids_cpu: Optional[torch.Tensor] 36 | # [num_reqs, max_num_logprobs + 1] 37 | logprobs_cpu: Optional[torch.Tensor] 38 | -------------------------------------------------------------------------------- /vllm/v1/sample/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/v1/sample/__init__.py -------------------------------------------------------------------------------- /vllm/v1/sample/metadata.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Dict 3 | 4 | import torch 5 | 6 | 7 | @dataclass 8 | class SamplingMetadata: 9 | 10 | temperature: torch.Tensor 11 | all_greedy: bool 12 | all_random: bool 13 | 14 | top_p: torch.Tensor 15 | top_k: torch.Tensor 16 | no_top_p: bool 17 | no_top_k: bool 18 | 19 | generators: Dict[int, torch.Generator] 20 | 21 | max_num_logprobs: int 22 | -------------------------------------------------------------------------------- /vllm/v1/serial_utils.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | 4 | class PickleEncoder: 5 | 6 | def encode(self, obj): 7 | return pickle.dumps(obj) 8 | 9 | def decode(self, data): 10 | return pickle.loads(data) 11 | -------------------------------------------------------------------------------- /vllm/v1/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/v1/worker/__init__.py -------------------------------------------------------------------------------- /vllm/version.py: -------------------------------------------------------------------------------- 1 | try: 2 | from ._version import __version__, __version_tuple__ 3 | except Exception as e: 4 | import warnings 5 | 6 | warnings.warn(f"Failed to read commit hash:\n{e}", 7 | RuntimeWarning, 8 | stacklevel=2) 9 | 10 | __version__ = "dev" 11 | __version_tuple__ = (0, 0, __version__) 12 | -------------------------------------------------------------------------------- /vllm/vllm_flash_attn/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/vllm_flash_attn/.gitkeep -------------------------------------------------------------------------------- /vllm/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cduk/vllm-pascal/9caa7c6b74eaa2ad270c0944ef52133fd6779ce4/vllm/worker/__init__.py --------------------------------------------------------------------------------
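To make the intent of the small PickleEncoder helper in vllm/v1/serial_utils.py above concrete, here is a usage sketch; the import path and method signatures come from the file shown earlier, while the payload is an arbitrary example.

# Round-trip sketch for PickleEncoder; the payload dict is made up for illustration.
from vllm.v1.serial_utils import PickleEncoder

encoder = PickleEncoder()
payload = {"req_id": "demo", "token_ids": [1, 2, 3]}
data = encoder.encode(payload)          # bytes produced by pickle.dumps
assert encoder.decode(data) == payload  # pickle.loads restores the original object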