├── .buildkite ├── check-wheel-size.py ├── download-images.sh ├── run-amd-test.sh ├── run-benchmarks.sh ├── run-cpu-test.sh ├── run-neuron-test.sh ├── test-pipeline.yaml └── test-template.j2 ├── .clang-format ├── .dockerignore ├── .github ├── ISSUE_TEMPLATE │ ├── 100-documentation.yml │ ├── 200-installation.yml │ ├── 300-usage.yml │ ├── 400-bug report.yml │ ├── 500-feature request.yml │ ├── 600-new model.yml │ ├── 700-performance discussion.yml │ ├── 750-RFC.yml │ ├── 800-misc discussion.yml │ └── config.yml ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── clang-format.yml │ ├── mypy.yaml │ ├── publish.yml │ ├── ruff.yml │ ├── scripts │ ├── build.sh │ ├── create_release.js │ ├── cuda-install.sh │ ├── env.sh │ └── pytorch-install.sh │ └── yapf.yml ├── .gitignore ├── .readthedocs.yaml ├── .yapfignore ├── CMakeLists.txt ├── CONTRIBUTING.md ├── Dockerfile ├── Dockerfile.cpu ├── Dockerfile.neuron ├── Dockerfile.rocm ├── LICENSE ├── MANIFEST.in ├── README.md ├── benchmarks ├── README.md ├── backend_request_func.py ├── benchmark_latency.py ├── benchmark_prefix_caching.py ├── benchmark_serving.py ├── benchmark_throughput.py ├── cutlass_benchmarks │ ├── w8a8_benchmarks.py │ └── weight_shapes.py ├── kernels │ ├── benchmark_aqlm.py │ ├── benchmark_marlin.py │ ├── benchmark_mixtral_moe.py │ ├── benchmark_paged_attention.py │ ├── benchmark_rope.py │ └── benchmark_shapes.py ├── launch_tgi_server.sh ├── overheads │ └── benchmark_hashing.py └── sonnet.txt ├── cmake ├── cpu_extension.cmake ├── hipify.py └── utils.cmake ├── collect_env.py ├── csrc ├── activation_kernels.cu ├── attention │ ├── attention_dtypes.h │ ├── attention_generic.cuh │ ├── attention_kernels.cu │ ├── attention_utils.cuh │ ├── dtype_bfloat16.cuh │ ├── dtype_float16.cuh │ ├── dtype_float32.cuh │ └── dtype_fp8.cuh ├── cache.h ├── cache_kernels.cu ├── cpu │ ├── activation.cpp │ ├── attention.cpp │ ├── cache.cpp │ ├── cpu_types.hpp │ ├── layernorm.cpp │ ├── pos_encoding.cpp │ └── pybind.cpp ├── cuda_compat.h ├── cuda_utils.h ├── cuda_utils_kernels.cu ├── custom_all_reduce.cu ├── custom_all_reduce.cuh ├── custom_all_reduce_test.cu ├── dispatch_utils.h ├── layernorm_kernels.cu ├── moe │ ├── moe_ops.cpp │ ├── moe_ops.h │ └── topk_softmax_kernels.cu ├── moe_align_block_size_kernels.cu ├── ops.h ├── pos_encoding_kernels.cu ├── punica │ ├── LICENSE │ ├── bgmv │ │ ├── bgmv_bf16_bf16_bf16.cu │ │ ├── bgmv_bf16_fp32_bf16.cu │ │ ├── bgmv_config.h │ │ ├── bgmv_fp16_fp16_fp16.cu │ │ ├── bgmv_fp16_fp32_fp16.cu │ │ ├── bgmv_fp32_bf16_bf16.cu │ │ ├── bgmv_fp32_fp16_fp16.cu │ │ ├── bgmv_impl.cuh │ │ ├── generator.py │ │ └── vec_dtypes.cuh │ ├── punica_ops.cu │ ├── punica_ops.h │ ├── punica_pybind.cpp │ └── type_convert.h ├── pybind.cpp ├── quantization │ ├── aqlm │ │ └── gemm_kernels.cu │ ├── awq │ │ ├── dequantize.cuh │ │ └── gemm_kernels.cu │ ├── compressed_tensors │ │ └── int8_quant_kernels.cu │ ├── cutlass_w8a8 │ │ ├── broadcast_load_epilogue_c2x.hpp │ │ ├── broadcast_load_epilogue_c3x.hpp │ │ ├── common.hpp │ │ ├── scaled_mm_dq_c2x.cu │ │ ├── scaled_mm_dq_c3x.cu │ │ └── scaled_mm_dq_entry.cu │ ├── fp8 │ │ ├── amd │ │ │ ├── hip_float8.h │ │ │ ├── hip_float8_impl.h │ │ │ └── quant_utils.cuh │ │ ├── common.cu │ │ └── nvidia │ │ │ └── quant_utils.cuh │ ├── gptq │ │ ├── compat.cuh │ │ ├── matrix_view.cuh │ │ ├── q_gemm.cu │ │ ├── qdq_2.cuh │ │ ├── qdq_3.cuh │ │ ├── qdq_4.cuh │ │ ├── qdq_8.cuh │ │ └── qdq_util.cuh │ ├── gptq_marlin │ │ ├── gptq_marlin.cu │ │ ├── gptq_marlin.cuh │ │ ├── gptq_marlin_dtypes.cuh │ │ └── gptq_marlin_repack.cu │ ├── marlin │ │ ├── dense │ │ │ ├── LICENSE │ │ │ └── marlin_cuda_kernel.cu │ │ └── sparse │ │ │ ├── LICENSE │ │ │ ├── common │ │ │ ├── base.h │ │ │ ├── mem.h │ │ │ └── mma.h │ │ │ └── marlin_24_cuda_kernel.cu │ └── squeezellm │ │ └── quant_cuda_kernel.cu └── reduction_utils.cuh ├── docs ├── Makefile ├── README.md ├── make.bat ├── requirements-docs.txt └── source │ ├── assets │ ├── dev │ │ └── dockerfile-stages-dependency.png │ ├── kernel │ │ ├── k_vecs.png │ │ ├── key.png │ │ ├── logits_vec.png │ │ ├── q_vecs.png │ │ ├── query.png │ │ ├── v_vec.png │ │ └── value.png │ └── logos │ │ ├── vllm-logo-only-light.png │ │ ├── vllm-logo-text-dark.png │ │ └── vllm-logo-text-light.png │ ├── community │ ├── meetups.rst │ └── sponsors.md │ ├── conf.py │ ├── dev │ ├── dockerfile │ │ └── dockerfile.rst │ ├── engine │ │ ├── async_llm_engine.rst │ │ ├── engine_index.rst │ │ └── llm_engine.rst │ ├── kernel │ │ └── paged_attention.rst │ ├── multimodal │ │ └── multimodal_index.rst │ ├── offline_inference │ │ ├── llm.rst │ │ ├── llm_inputs.rst │ │ └── offline_index.rst │ └── sampling_params.rst │ ├── generate_examples.py │ ├── getting_started │ ├── amd-installation.rst │ ├── cpu-installation.rst │ ├── examples │ │ └── examples_index.template.rst │ ├── installation.rst │ ├── neuron-installation.rst │ └── quickstart.rst │ ├── index.rst │ ├── models │ ├── adding_model.rst │ ├── engine_args.rst │ ├── lora.rst │ ├── performance.rst │ ├── supported_models.rst │ └── vlm.rst │ ├── quantization │ ├── auto_awq.rst │ ├── fp8_e4m3_kvcache.rst │ └── fp8_e5m2_kvcache.rst │ └── serving │ ├── deploying_with_bentoml.rst │ ├── deploying_with_docker.rst │ ├── deploying_with_dstack.rst │ ├── deploying_with_kserve.rst │ ├── deploying_with_lws.rst │ ├── deploying_with_triton.rst │ ├── distributed_serving.rst │ ├── env_vars.rst │ ├── integrations.rst │ ├── metrics.rst │ ├── openai_compatible_server.md │ ├── run_on_sky.rst │ ├── serving_with_langchain.rst │ └── usage_stats.md ├── examples ├── api_client.py ├── aqlm_example.py ├── fp8 │ ├── README.md │ ├── extract_scales.py │ └── quantizer │ │ ├── README.md │ │ └── quantize.py ├── gradio_openai_chatbot_webserver.py ├── gradio_webserver.py ├── llava_example.py ├── llm_engine_example.py ├── logging_configuration.md ├── lora_with_quantization_inference.py ├── multilora_inference.py ├── offline_inference.py ├── offline_inference_arctic.py ├── offline_inference_distributed.py ├── offline_inference_embedding.py ├── offline_inference_neuron.py ├── offline_inference_openai.md ├── offline_inference_with_prefix.py ├── openai_chat_completion_client.py ├── openai_completion_client.py ├── openai_embedding_client.py ├── openai_example_batch.jsonl ├── production_monitoring │ ├── README.md │ ├── docker-compose.yaml │ ├── grafana.json │ └── prometheus.yaml ├── save_sharded_state.py ├── template_alpaca.jinja ├── template_baichuan.jinja ├── template_chatglm.jinja ├── template_chatglm2.jinja ├── template_chatml.jinja ├── template_falcon.jinja ├── template_falcon_180b.jinja ├── template_inkbot.jinja └── tensorize_vllm_model.py ├── format.sh ├── pyproject.toml ├── requirements-build.txt ├── requirements-common.txt ├── requirements-cpu.txt ├── requirements-cuda.txt ├── requirements-dev.txt ├── requirements-neuron.txt ├── requirements-rocm.txt ├── rocm_patch └── rocm_bf16.patch ├── setup.py ├── tests ├── __init__.py ├── async_engine │ ├── __init__.py │ ├── api_server_async_engine.py │ ├── test_api_server.py │ ├── test_async_llm_engine.py │ ├── test_chat_template.py │ ├── test_openapi_server_ray.py │ └── test_request_tracker.py ├── basic_correctness │ ├── __init__.py │ ├── test_basic_correctness.py │ ├── test_chunked_prefill.py │ └── test_preemption.py ├── conftest.py ├── core │ ├── __init__.py │ ├── block │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── e2e │ │ │ ├── __init__.py │ │ │ ├── conftest.py │ │ │ ├── test_correctness.py │ │ │ └── test_correctness_sliding_window.py │ │ ├── test_block_manager_v2.py │ │ ├── test_block_table.py │ │ ├── test_common.py │ │ ├── test_cpu_gpu_block_allocator.py │ │ ├── test_naive_block.py │ │ └── test_prefix_caching_block.py │ ├── test_block_manager.py │ ├── test_chunked_prefill_scheduler.py │ ├── test_scheduler.py │ └── utils.py ├── distributed │ ├── __init__.py │ ├── test_basic_distributed_correctness.py │ ├── test_chunked_prefill_distributed.py │ ├── test_comm_ops.py │ ├── test_custom_all_reduce.py │ └── test_pynccl.py ├── engine │ ├── __init__.py │ ├── output_processor │ │ ├── __init__.py │ │ ├── test_multi_step.py │ │ └── test_stop_checker.py │ ├── test_computed_prefix_blocks.py │ ├── test_detokenization.py │ ├── test_multiproc_workers.py │ ├── test_skip_tokenizer_init.py │ ├── test_stop_reason.py │ └── test_stop_strings.py ├── entrypoints │ ├── __init__.py │ ├── openai │ │ └── test_serving_chat.py │ ├── test_guided_processors.py │ ├── test_llm_encode.py │ ├── test_llm_generate.py │ ├── test_openai_run_batch.py │ ├── test_openai_server.py │ └── test_server_oot_registration.py ├── fp8_kv │ ├── llama2-70b-fp8-kv │ │ └── kv_cache_scales.json │ └── llama2-7b-fp8-kv │ │ └── kv_cache_scales.json ├── kernels │ ├── __init__.py │ ├── allclose_default.py │ ├── conftest.py │ ├── test_activation.py │ ├── test_attention.py │ ├── test_attention_selector.py │ ├── test_blocksparse_attention.py │ ├── test_cache.py │ ├── test_cutlass.py │ ├── test_flash_attn.py │ ├── test_int8_quant.py │ ├── test_layernorm.py │ ├── test_marlin_gemm.py │ ├── test_moe.py │ ├── test_pos_encoding.py │ ├── test_prefix_prefill.py │ ├── test_rand.py │ └── test_sampler.py ├── lora │ ├── __init__.py │ ├── conftest.py │ ├── data │ │ ├── __init__.py │ │ └── long_context_test_data.py │ ├── test_baichuan.py │ ├── test_chatglm3.py │ ├── test_gemma.py │ ├── test_layer_variation.py │ ├── test_layers.py │ ├── test_llama.py │ ├── test_long_context.py │ ├── test_lora.py │ ├── test_lora_checkpoints.py │ ├── test_lora_manager.py │ ├── test_mixtral.py │ ├── test_phi.py │ ├── test_punica.py │ ├── test_quant_model.py │ ├── test_tokenizer_group.py │ ├── test_utils.py │ ├── test_worker.py │ └── utils.py ├── metrics │ ├── __init__.py │ └── test_metrics.py ├── model_executor │ ├── __init__.py │ └── weight_utils.py ├── models │ ├── __init__.py │ ├── test_aqlm.py │ ├── test_big_models.py │ ├── test_embedding.py │ ├── test_fp8.py │ ├── test_gptq_marlin.py │ ├── test_gptq_marlin_24.py │ ├── test_llava.py │ ├── test_marlin.py │ ├── test_mistral.py │ ├── test_models.py │ ├── test_oot_registration.py │ ├── test_registry.py │ └── utils.py ├── multimodal │ ├── __init__.py │ └── test_processor.py ├── prefix_caching │ ├── __init__.py │ ├── test_disable_sliding_window.py │ └── test_prefix_caching.py ├── prompts │ ├── example.txt │ └── summary.txt ├── quantization │ ├── __init__.py │ ├── test_bitsandbytes.py │ ├── test_compressed_tensors.py │ ├── test_configs.py │ └── test_fp8.py ├── samplers │ ├── __init__.py │ ├── test_beam_search.py │ ├── test_ignore_eos.py │ ├── test_logits_processor.py │ ├── test_logprobs.py │ ├── test_ranks.py │ ├── test_rejection_sampler.py │ ├── test_sampler.py │ └── test_seeded_generate.py ├── spec_decode │ ├── __init__.py │ ├── e2e │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── test_compatibility.py │ │ ├── test_integration.py │ │ ├── test_integration_dist.py │ │ ├── test_logprobs.py │ │ ├── test_multistep_correctness.py │ │ └── test_ngram_correctness.py │ ├── test_batch_expansion.py │ ├── test_dynamic_spec_decode.py │ ├── test_metrics.py │ ├── test_multi_step_worker.py │ ├── test_ngram_worker.py │ ├── test_spec_decode_worker.py │ ├── test_utils.py │ └── utils.py ├── tensorizer_loader │ ├── __init__.py │ └── test_tensorizer.py ├── test_cache_block_hashing.py ├── test_config.py ├── test_inputs.py ├── test_logger.py ├── test_logits_processor.py ├── test_regression.py ├── test_sampling_params.py ├── test_sequence.py ├── test_sharded_state_loader.py ├── test_utils.py ├── tokenization │ ├── __init__.py │ ├── test_cached_tokenizer.py │ ├── test_detokenize.py │ ├── test_image_processor.py │ ├── test_tokenizer.py │ └── test_tokenizer_group.py ├── utils.py └── worker │ ├── __init__.py │ ├── test_model_runner.py │ └── test_swap.py └── vllm ├── __init__.py ├── _custom_ops.py ├── attention ├── __init__.py ├── backends │ ├── __init__.py │ ├── abstract.py │ ├── blocksparse_attn.py │ ├── flash_attn.py │ ├── flashinfer.py │ ├── rocm_flash_attn.py │ ├── torch_sdpa.py │ └── xformers.py ├── layer.py ├── ops │ ├── __init__.py │ ├── blocksparse_attention │ │ ├── __init__.py │ │ ├── blocksparse_attention_kernel.py │ │ ├── interface.py │ │ └── utils.py │ ├── paged_attn.py │ ├── prefix_prefill.py │ └── triton_flash_attention.py └── selector.py ├── block.py ├── config.py ├── core ├── __init__.py ├── block │ ├── __init__.py │ ├── block_table.py │ ├── common.py │ ├── cpu_gpu_block_allocator.py │ ├── interfaces.py │ ├── naive_block.py │ ├── prefix_caching_block.py │ └── utils.py ├── block_manager_v1.py ├── block_manager_v2.py ├── embedding_model_block_manager.py ├── evictor_v1.py ├── evictor_v2.py ├── interfaces.py ├── policy.py └── scheduler.py ├── distributed ├── __init__.py ├── communication_op.py ├── device_communicators │ ├── __init__.py │ ├── custom_all_reduce.py │ ├── custom_all_reduce_utils.py │ ├── pynccl.py │ └── pynccl_wrapper.py ├── parallel_state.py └── utils.py ├── engine ├── __init__.py ├── arg_utils.py ├── async_llm_engine.py ├── llm_engine.py ├── metrics.py └── output_processor │ ├── __init__.py │ ├── interfaces.py │ ├── multi_step.py │ ├── single_step.py │ ├── stop_checker.py │ └── util.py ├── entrypoints ├── __init__.py ├── api_server.py ├── llm.py └── openai │ ├── __init__.py │ ├── api_server.py │ ├── cli_args.py │ ├── protocol.py │ ├── run_batch.py │ ├── serving_chat.py │ ├── serving_completion.py │ ├── serving_embedding.py │ └── serving_engine.py ├── envs.py ├── executor ├── __init__.py ├── cpu_executor.py ├── distributed_gpu_executor.py ├── executor_base.py ├── gpu_executor.py ├── multiproc_gpu_executor.py ├── multiproc_worker_utils.py ├── neuron_executor.py ├── ray_gpu_executor.py └── ray_utils.py ├── inputs.py ├── logger.py ├── logging ├── __init__.py └── formatter.py ├── lora ├── __init__.py ├── fully_sharded_layers.py ├── layers.py ├── lora.py ├── models.py ├── punica.py ├── request.py ├── utils.py └── worker_manager.py ├── model_executor ├── __init__.py ├── guided_decoding │ ├── __init__.py │ ├── lm_format_enforcer_decoding.py │ ├── outlines_decoding.py │ └── outlines_logits_processors.py ├── layers │ ├── __init__.py │ ├── activation.py │ ├── fused_moe │ │ ├── __init__.py │ │ ├── configs │ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=14336,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json │ │ │ ├── E=8,N=1792,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=3584,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=7168,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json │ │ │ └── README │ │ └── fused_moe.py │ ├── layernorm.py │ ├── linear.py │ ├── logits_processor.py │ ├── ops │ │ ├── __init__.py │ │ ├── rand.py │ │ └── sample.py │ ├── pooler.py │ ├── quantization │ │ ├── __init__.py │ │ ├── aqlm.py │ │ ├── awq.py │ │ ├── base_config.py │ │ ├── bitsandbytes.py │ │ ├── compressed_tensors │ │ │ ├── __init__.py │ │ │ ├── compressed_tensors.py │ │ │ └── schemes │ │ │ │ ├── __init__.py │ │ │ │ ├── compressed_tensors_scheme.py │ │ │ │ ├── compressed_tensors_unquantized.py │ │ │ │ └── compressed_tensors_w8a8_statictensor.py │ │ ├── deepspeedfp.py │ │ ├── fp8.py │ │ ├── gptq.py │ │ ├── gptq_marlin.py │ │ ├── gptq_marlin_24.py │ │ ├── marlin.py │ │ ├── schema.py │ │ ├── squeezellm.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── format_24.py │ │ │ ├── marlin_24_perms.py │ │ │ ├── marlin_perms.py │ │ │ ├── marlin_utils.py │ │ │ └── quant_utils.py │ ├── rejection_sampler.py │ ├── rotary_embedding.py │ ├── sampler.py │ └── vocab_parallel_embedding.py ├── model_loader │ ├── __init__.py │ ├── loader.py │ ├── neuron.py │ ├── tensorizer.py │ ├── utils.py │ └── weight_utils.py ├── models │ ├── __init__.py │ ├── arctic.py │ ├── baichuan.py │ ├── bloom.py │ ├── chatglm.py │ ├── commandr.py │ ├── dbrx.py │ ├── decilm.py │ ├── deepseek.py │ ├── falcon.py │ ├── gemma.py │ ├── gpt2.py │ ├── gpt_bigcode.py │ ├── gpt_j.py │ ├── gpt_neox.py │ ├── internlm2.py │ ├── jais.py │ ├── llama.py │ ├── llama_embedding.py │ ├── llava.py │ ├── minicpm.py │ ├── mixtral.py │ ├── mixtral_quant.py │ ├── mpt.py │ ├── olmo.py │ ├── opt.py │ ├── orion.py │ ├── phi.py │ ├── phi3_small.py │ ├── qwen.py │ ├── qwen2.py │ ├── qwen2_moe.py │ ├── skywork_moe.py │ ├── skywork_moe_quant.py │ ├── stablelm.py │ ├── starcoder2.py │ ├── vlm_base.py │ └── xverse.py ├── pooling_metadata.py ├── sampling_metadata.py └── utils.py ├── multimodal ├── __init__.py ├── base.py ├── image.py └── registry.py ├── outputs.py ├── pooling_params.py ├── py.typed ├── sampling_params.py ├── sequence.py ├── spec_decode ├── __init__.py ├── batch_expansion.py ├── interfaces.py ├── metrics.py ├── multi_step_worker.py ├── ngram_worker.py ├── spec_decode_worker.py ├── top1_proposer.py └── util.py ├── transformers_utils ├── __init__.py ├── config.py ├── configs │ ├── __init__.py │ ├── arctic.py │ ├── chatglm.py │ ├── dbrx.py │ ├── falcon.py │ ├── jais.py │ └── mpt.py ├── detokenizer.py ├── image_processor.py ├── tokenizer.py ├── tokenizer_group │ ├── __init__.py │ ├── base_tokenizer_group.py │ ├── ray_tokenizer_group.py │ └── tokenizer_group.py └── tokenizers │ ├── __init__.py │ └── baichuan.py ├── usage ├── __init__.py └── usage_lib.py ├── utils.py └── worker ├── __init__.py ├── cache_engine.py ├── cpu_model_runner.py ├── cpu_worker.py ├── embedding_model_runner.py ├── model_runner.py ├── neuron_model_runner.py ├── neuron_worker.py ├── worker.py └── worker_base.py /.buildkite/check-wheel-size.py: -------------------------------------------------------------------------------- 1 | import os 2 | import zipfile 3 | 4 | MAX_SIZE_MB = 200 5 | 6 | 7 | def print_top_10_largest_files(zip_file): 8 | with zipfile.ZipFile(zip_file, 'r') as z: 9 | file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()] 10 | file_sizes.sort(key=lambda x: x[1], reverse=True) 11 | for f, size in file_sizes[:10]: 12 | print(f"{f}: {size/(1024*1024)} MBs uncompressed.") 13 | 14 | 15 | def check_wheel_size(directory): 16 | for root, _, files in os.walk(directory): 17 | for f in files: 18 | if f.endswith(".whl"): 19 | wheel_path = os.path.join(root, f) 20 | wheel_size = os.path.getsize(wheel_path) 21 | wheel_size_mb = wheel_size / (1024 * 1024) 22 | if wheel_size_mb > MAX_SIZE_MB: 23 | print( 24 | f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) " 25 | f"compare to the allowed size ({MAX_SIZE_MB} MB).") 26 | print_top_10_largest_files(wheel_path) 27 | return 1 28 | else: 29 | print(f"Wheel {wheel_path} is within the allowed size " 30 | f"({wheel_size_mb} MB).") 31 | return 0 32 | 33 | 34 | if __name__ == "__main__": 35 | import sys 36 | sys.exit(check_wheel_size(sys.argv[1])) 37 | -------------------------------------------------------------------------------- /.buildkite/download-images.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | set -o pipefail 5 | 6 | (which wget && which curl) || (apt-get update && apt-get install -y wget curl) 7 | 8 | # aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/ 9 | mkdir -p images 10 | cd images 11 | wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt 12 | wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt 13 | wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt 14 | wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt 15 | wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg 16 | wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg 17 | 18 | cd - 19 | -------------------------------------------------------------------------------- /.buildkite/run-cpu-test.sh: -------------------------------------------------------------------------------- 1 | # This script build the CPU docker image and run the offline inference inside the container. 2 | # It serves a sanity check for compilation and basic model usage. 3 | set -ex 4 | 5 | # Try building the docker image 6 | docker build -t cpu-test -f Dockerfile.cpu . 7 | 8 | # Setup cleanup 9 | remove_docker_container() { docker rm -f cpu-test || true; } 10 | trap remove_docker_container EXIT 11 | remove_docker_container 12 | 13 | # Run the image and launch offline inference 14 | docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 vllm/examples/offline_inference.py 15 | -------------------------------------------------------------------------------- /.buildkite/run-neuron-test.sh: -------------------------------------------------------------------------------- 1 | # This script build the Neuron docker image and run the API server inside the container. 2 | # It serves a sanity check for compilation and basic model usage. 3 | set -e 4 | 5 | # Try building the docker image 6 | aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com 7 | 8 | # prune old image and containers to save disk space, and only once a day 9 | # by using a timestamp file in tmp. 10 | if [ -f /tmp/neuron-docker-build-timestamp ]; then 11 | last_build=$(cat /tmp/neuron-docker-build-timestamp) 12 | current_time=$(date +%s) 13 | if [ $((current_time - last_build)) -gt 86400 ]; then 14 | docker system prune -f 15 | echo $current_time > /tmp/neuron-docker-build-timestamp 16 | fi 17 | else 18 | echo $(date +%s) > /tmp/neuron-docker-build-timestamp 19 | fi 20 | 21 | docker build -t neuron -f Dockerfile.neuron . 22 | 23 | # Setup cleanup 24 | remove_docker_container() { docker rm -f neuron || true; } 25 | trap remove_docker_container EXIT 26 | remove_docker_container 27 | 28 | # Run the image 29 | docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \ 30 | --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 & 31 | 32 | # Wait for the server to start 33 | wait_for_server_to_start() { 34 | timeout=300 35 | counter=0 36 | 37 | while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do 38 | sleep 1 39 | counter=$((counter + 1)) 40 | if [ $counter -ge $timeout ]; then 41 | echo "Timeout after $timeout seconds" 42 | break 43 | fi 44 | done 45 | } 46 | wait_for_server_to_start 47 | 48 | # Test a simple prompt 49 | curl -X POST -H "Content-Type: application/json" \ 50 | localhost:8000/generate \ 51 | -d '{"prompt": "San Francisco is a"}' 52 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | UseTab: Never 3 | IndentWidth: 2 4 | ColumnLimit: 80 5 | 6 | # Force pointers to the type for C++. 7 | DerivePointerAlignment: false 8 | PointerAlignment: Left 9 | 10 | # Reordering #include statements can (and currently will) introduce errors 11 | SortIncludes: false 12 | 13 | # Style choices 14 | AlignConsecutiveAssignments: false 15 | AlignConsecutiveDeclarations: false 16 | IndentPPDirectives: BeforeHash 17 | 18 | IncludeCategories: 19 | - Regex: '^<' 20 | Priority: 4 21 | - Regex: '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/' 22 | Priority: 3 23 | - Regex: '^"(qoda|\.\.)/' 24 | Priority: 2 25 | - Regex: '.*' 26 | Priority: 1 27 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | vllm/*.so 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/100-documentation.yml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation 2 | description: Report an issue related to https://docs.vllm.ai/ 3 | title: "[Doc]: " 4 | labels: ["documentation"] 5 | 6 | body: 7 | - type: textarea 8 | attributes: 9 | label: 📚 The doc issue 10 | description: > 11 | A clear and concise description of what content in https://docs.vllm.ai/ is an issue. 12 | validations: 13 | required: true 14 | - type: textarea 15 | attributes: 16 | label: Suggest a potential alternative/fix 17 | description: > 18 | Tell us how we could improve the documentation in this regard. 19 | - type: markdown 20 | attributes: 21 | value: > 22 | Thanks for contributing 🎉! 23 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/200-installation.yml: -------------------------------------------------------------------------------- 1 | name: 🛠️ Installation 2 | description: Report an issue here when you hit errors during installation. 3 | title: "[Installation]: " 4 | labels: ["installation"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | - type: textarea 12 | attributes: 13 | label: Your current environment 14 | description: | 15 | Please run the following and paste the output below. 16 | ```sh 17 | wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py 18 | # For security purposes, please feel free to check the contents of collect_env.py before running it. 19 | python collect_env.py 20 | ``` 21 | It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues. 22 | value: | 23 | ```text 24 | The output of `python collect_env.py` 25 | ``` 26 | validations: 27 | required: true 28 | - type: textarea 29 | attributes: 30 | label: How you are installing vllm 31 | description: | 32 | Paste the full command you are trying to execute. 33 | value: | 34 | ```sh 35 | pip install -vvv vllm 36 | ``` 37 | - type: markdown 38 | attributes: 39 | value: > 40 | Thanks for contributing 🎉! 41 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/300-usage.yml: -------------------------------------------------------------------------------- 1 | name: 💻 Usage 2 | description: Raise an issue here if you don't know how to use vllm. 3 | title: "[Usage]: " 4 | labels: ["usage"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | - type: textarea 12 | attributes: 13 | label: Your current environment 14 | description: | 15 | Please run the following and paste the output below. 16 | ```sh 17 | wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py 18 | # For security purposes, please feel free to check the contents of collect_env.py before running it. 19 | python collect_env.py 20 | ``` 21 | It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues. 22 | value: | 23 | ```text 24 | The output of `python collect_env.py` 25 | ``` 26 | validations: 27 | required: true 28 | - type: textarea 29 | attributes: 30 | label: How would you like to use vllm 31 | description: | 32 | A detailed description of how you want to use vllm. 33 | value: | 34 | I want to run inference of a [specific model](put link here). I don't know how to integrate it with vllm. 35 | - type: markdown 36 | attributes: 37 | value: > 38 | Thanks for contributing 🎉! 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/500-feature request.yml: -------------------------------------------------------------------------------- 1 | name: 🚀 Feature request 2 | description: Submit a proposal/request for a new vllm feature 3 | title: "[Feature]: " 4 | labels: ["feature request"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | - type: textarea 12 | attributes: 13 | label: 🚀 The feature, motivation and pitch 14 | description: > 15 | A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too. 16 | validations: 17 | required: true 18 | - type: textarea 19 | attributes: 20 | label: Alternatives 21 | description: > 22 | A description of any alternative solutions or features you've considered, if any. 23 | - type: textarea 24 | attributes: 25 | label: Additional context 26 | description: > 27 | Add any other context or screenshots about the feature request. 28 | - type: markdown 29 | attributes: 30 | value: > 31 | Thanks for contributing 🎉! 32 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/600-new model.yml: -------------------------------------------------------------------------------- 1 | name: 🤗 Support request for a new model from huggingface 2 | description: Submit a proposal/request for a new model from huggingface 3 | title: "[New Model]: " 4 | labels: ["new model"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | 12 | #### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model. 13 | - type: textarea 14 | attributes: 15 | label: The model to consider. 16 | description: > 17 | A huggingface url, pointing to the model, e.g. https://huggingface.co/openai-community/gpt2 . 18 | validations: 19 | required: true 20 | - type: textarea 21 | attributes: 22 | label: The closest model vllm already supports. 23 | description: > 24 | Here is the list of models already supported by vllm: https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models . Which model is the most similar to the model you want to add support for? 25 | - type: textarea 26 | attributes: 27 | label: What's your difficulty of supporting the model you want? 28 | description: > 29 | For example, any new operators or new architecture? 30 | - type: markdown 31 | attributes: 32 | value: > 33 | Thanks for contributing 🎉! 34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/700-performance discussion.yml: -------------------------------------------------------------------------------- 1 | name: ⚡ Discussion on the performance of vllm 2 | description: Submit a proposal/discussion about the performance of vllm 3 | title: "[Performance]: " 4 | labels: ["performance"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | - type: textarea 12 | attributes: 13 | label: Proposal to improve performance 14 | description: > 15 | How do you plan to improve vllm's performance? 16 | validations: 17 | required: false 18 | - type: textarea 19 | attributes: 20 | label: Report of performance regression 21 | description: > 22 | Please provide detailed description of performance comparison to confirm the regression. You may want to run the benchmark script at https://github.com/vllm-project/vllm/tree/main/benchmarks . 23 | validations: 24 | required: false 25 | - type: textarea 26 | attributes: 27 | label: Misc discussion on performance 28 | description: > 29 | Anything about the performance. 30 | validations: 31 | required: false 32 | - type: textarea 33 | attributes: 34 | label: Your current environment (if you think it is necessary) 35 | description: | 36 | Please run the following and paste the output below. 37 | ```sh 38 | wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py 39 | # For security purposes, please feel free to check the contents of collect_env.py before running it. 40 | python collect_env.py 41 | ``` 42 | It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues. 43 | value: | 44 | ```text 45 | The output of `python collect_env.py` 46 | ``` 47 | validations: 48 | required: false 49 | - type: markdown 50 | attributes: 51 | value: > 52 | Thanks for contributing 🎉! 53 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/750-RFC.yml: -------------------------------------------------------------------------------- 1 | name: 💬 Request for comments (RFC). 2 | description: Ask for feedback on major architectural changes or design choices. 3 | title: "[RFC]: " 4 | labels: ["RFC"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Please take a look at previous [RFCs](https://github.com/vllm-project/vllm/issues?q=label%3ARFC+sort%3Aupdated-desc) for reference. 11 | - type: textarea 12 | attributes: 13 | label: Motivation. 14 | description: > 15 | The motivation of the RFC. 16 | validations: 17 | required: true 18 | - type: textarea 19 | attributes: 20 | label: Proposed Change. 21 | description: > 22 | The proposed change of the RFC. 23 | validations: 24 | required: true 25 | - type: textarea 26 | attributes: 27 | label: Feedback Period. 28 | description: > 29 | The feedback period of the RFC. Usually at least one week. 30 | validations: 31 | required: false 32 | - type: textarea 33 | attributes: 34 | label: CC List. 35 | description: > 36 | The list of people you want to CC. 37 | validations: 38 | required: false 39 | - type: textarea 40 | attributes: 41 | label: Any Other Things. 42 | description: > 43 | Any other things you would like to mention. 44 | validations: 45 | required: false 46 | - type: markdown 47 | attributes: 48 | value: > 49 | Thanks for contributing 🎉! 50 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/800-misc discussion.yml: -------------------------------------------------------------------------------- 1 | name: 🎲 Misc/random discussions that do not fit into the above categories. 2 | description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues. 3 | title: "[Misc]: " 4 | labels: ["misc"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | - type: textarea 12 | attributes: 13 | label: Anything you want to discuss about vllm. 14 | description: > 15 | Anything you want to discuss about vllm. 16 | validations: 17 | required: true 18 | - type: markdown 19 | attributes: 20 | value: > 21 | Thanks for contributing 🎉! 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | -------------------------------------------------------------------------------- /.github/workflows/clang-format.yml: -------------------------------------------------------------------------------- 1 | name: clang-format 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - main 9 | pull_request: 10 | branches: 11 | - main 12 | 13 | jobs: 14 | clang-format: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: ["3.11"] 19 | steps: 20 | - uses: actions/checkout@v2 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v2 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install clang-format==18.1.5 29 | - name: Running clang-format 30 | run: | 31 | EXCLUDES=( 32 | 'csrc/moe/topk_softmax_kernels.cu' 33 | 'csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu' 34 | 'csrc/punica/bgmv/bgmv_config.h' 35 | 'csrc/punica/bgmv/bgmv_impl.cuh' 36 | 'csrc/punica/bgmv/vec_dtypes.cuh' 37 | 'csrc/punica/punica_ops.cu' 38 | 'csrc/punica/type_convert.h' 39 | ) 40 | find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \ 41 | | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \ 42 | | xargs clang-format --dry-run --Werror -------------------------------------------------------------------------------- /.github/workflows/mypy.yaml: -------------------------------------------------------------------------------- 1 | name: mypy 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - main 9 | pull_request: 10 | branches: 11 | - main 12 | 13 | jobs: 14 | ruff: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: ["3.8", "3.9", "3.10", "3.11"] 19 | steps: 20 | - uses: actions/checkout@v2 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v2 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install mypy==1.9.0 29 | pip install types-setuptools 30 | pip install types-PyYAML 31 | pip install types-requests 32 | pip install types-setuptools 33 | - name: Mypy 34 | run: | 35 | mypy vllm/attention --config-file pyproject.toml 36 | mypy vllm/core --config-file pyproject.toml 37 | mypy vllm/distributed --config-file pyproject.toml 38 | mypy vllm/entrypoints --config-file pyproject.toml 39 | mypy vllm/executor --config-file pyproject.toml 40 | mypy vllm/multimodal --config-file pyproject.toml 41 | mypy vllm/usage --config-file pyproject.toml 42 | mypy vllm/*.py --config-file pyproject.toml 43 | mypy vllm/transformers_utils --config-file pyproject.toml 44 | mypy vllm/engine --config-file pyproject.toml 45 | mypy vllm/worker --config-file pyproject.toml 46 | mypy vllm/spec_decode --config-file pyproject.toml 47 | mypy vllm/model_executor --config-file pyproject.toml 48 | mypy vllm/lora --config-file pyproject.toml 49 | mypy vllm/logging --config-file pyproject.toml 50 | mypy vllm/model_executor --config-file pyproject.toml 51 | 52 | -------------------------------------------------------------------------------- /.github/workflows/ruff.yml: -------------------------------------------------------------------------------- 1 | name: ruff 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - main 9 | pull_request: 10 | branches: 11 | - main 12 | 13 | jobs: 14 | ruff: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: ["3.8", "3.9", "3.10", "3.11"] 19 | steps: 20 | - uses: actions/checkout@v2 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v2 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2 29 | - name: Analysing the code with ruff 30 | run: | 31 | ruff . 32 | - name: Spelling check with codespell 33 | run: | 34 | codespell --toml pyproject.toml 35 | - name: Run isort 36 | run: | 37 | isort . --check-only 38 | -------------------------------------------------------------------------------- /.github/workflows/scripts/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python_executable=python$1 4 | cuda_home=/usr/local/cuda-$2 5 | 6 | # Update paths 7 | PATH=${cuda_home}/bin:$PATH 8 | LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH 9 | 10 | # Install requirements 11 | $python_executable -m pip install wheel packaging 12 | $python_executable -m pip install -r requirements-cuda.txt 13 | 14 | # Limit the number of parallel jobs to avoid OOM 15 | export MAX_JOBS=1 16 | # Make sure punica is built for the release (for LoRA) 17 | export VLLM_INSTALL_PUNICA_KERNELS=1 18 | # Make sure release wheels are built for the following architectures 19 | export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" 20 | # Build 21 | $python_executable setup.py bdist_wheel --dist-dir=dist 22 | -------------------------------------------------------------------------------- /.github/workflows/scripts/create_release.js: -------------------------------------------------------------------------------- 1 | // Uses Github's API to create the release and wait for result. 2 | // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately. 3 | 4 | module.exports = async (github, context, core) => { 5 | try { 6 | const response = await github.rest.repos.createRelease({ 7 | draft: false, 8 | generate_release_notes: true, 9 | name: process.env.RELEASE_TAG, 10 | owner: context.repo.owner, 11 | prerelease: true, 12 | repo: context.repo.repo, 13 | tag_name: process.env.RELEASE_TAG, 14 | }); 15 | 16 | core.setOutput('upload_url', response.data.upload_url); 17 | } catch (error) { 18 | core.setFailed(error.message); 19 | } 20 | } -------------------------------------------------------------------------------- /.github/workflows/scripts/cuda-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Replace '.' with '-' ex: 11.8 -> 11-8 4 | cuda_version=$(echo $1 | tr "." "-") 5 | # Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004 6 | OS=$(echo $2 | tr -d ".\-") 7 | 8 | # Installs CUDA 9 | wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb 10 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 11 | rm cuda-keyring_1.1-1_all.deb 12 | sudo apt -qq update 13 | sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version} 14 | sudo apt clean 15 | 16 | # Test nvcc 17 | PATH=/usr/local/cuda-$1/bin:${PATH} 18 | nvcc --version 19 | 20 | # Log gcc, g++, c++ versions 21 | gcc --version 22 | g++ --version 23 | c++ --version 24 | -------------------------------------------------------------------------------- /.github/workflows/scripts/env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This file installs common linux environment tools 4 | 5 | export LANG C.UTF-8 6 | 7 | # python_version=$1 8 | 9 | sudo apt-get update && \ 10 | sudo apt-get install -y --no-install-recommends \ 11 | software-properties-common \ 12 | 13 | sudo apt-get install -y --no-install-recommends \ 14 | build-essential \ 15 | apt-utils \ 16 | ca-certificates \ 17 | wget \ 18 | git \ 19 | vim \ 20 | libssl-dev \ 21 | curl \ 22 | unzip \ 23 | unrar \ 24 | cmake \ 25 | net-tools \ 26 | sudo \ 27 | autotools-dev \ 28 | rsync \ 29 | jq \ 30 | openssh-server \ 31 | tmux \ 32 | screen \ 33 | htop \ 34 | pdsh \ 35 | openssh-client \ 36 | lshw \ 37 | dmidecode \ 38 | util-linux \ 39 | automake \ 40 | autoconf \ 41 | libtool \ 42 | net-tools \ 43 | pciutils \ 44 | libpci-dev \ 45 | libaio-dev \ 46 | libcap2 \ 47 | libtinfo5 \ 48 | fakeroot \ 49 | devscripts \ 50 | debhelper \ 51 | nfs-common 52 | 53 | # Remove github bloat files to free up disk space 54 | sudo rm -rf "/usr/local/share/boost" 55 | sudo rm -rf "$AGENT_TOOLSDIRECTORY" 56 | sudo rm -rf "/usr/share/dotnet" 57 | -------------------------------------------------------------------------------- /.github/workflows/scripts/pytorch-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python_executable=python$1 4 | pytorch_version=$2 5 | cuda_version=$3 6 | 7 | # Install torch 8 | $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya 9 | $python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./} 10 | 11 | # Print version information 12 | $python_executable --version 13 | $python_executable -c "import torch; print('PyTorch:', torch.__version__)" 14 | $python_executable -c "import torch; print('CUDA:', torch.version.cuda)" 15 | $python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)" 16 | -------------------------------------------------------------------------------- /.github/workflows/yapf.yml: -------------------------------------------------------------------------------- 1 | name: yapf 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - main 9 | pull_request: 10 | branches: 11 | - main 12 | jobs: 13 | yapf: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.8", "3.9", "3.10", "3.11"] 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install yapf==0.32.0 28 | pip install toml==0.10.2 29 | - name: Running yapf 30 | run: | 31 | yapf --diff --recursive . 32 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | 6 | build: 7 | os: ubuntu-22.04 8 | tools: 9 | python: "3.8" 10 | 11 | sphinx: 12 | configuration: docs/source/conf.py 13 | 14 | # If using Sphinx, optionally build your docs in additional formats such as PDF 15 | formats: 16 | - pdf 17 | 18 | # Optionally declare the Python requirements required to build your docs 19 | python: 20 | install: 21 | - requirements: docs/requirements-docs.txt 22 | -------------------------------------------------------------------------------- /.yapfignore: -------------------------------------------------------------------------------- 1 | collect_env.py 2 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to vLLM 2 | 3 | Thank you for your interest in contributing to vLLM! 4 | Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. 5 | There are several ways you can contribute to the project: 6 | 7 | - Identify and report any issues or bugs. 8 | - Request or add a new model. 9 | - Suggest or implement new features. 10 | 11 | However, remember that contributions aren't just about code. 12 | We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions. 13 | 14 | Finally, one of the most impactful ways to support us is by raising awareness about vLLM. 15 | Talk about it in your blog posts, highlighting how it's driving your incredible projects. 16 | Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository. 17 | 18 | 19 | ## Setup for development 20 | 21 | ### Build from source 22 | 23 | ```bash 24 | pip install -e . # This may take several minutes. 25 | ``` 26 | 27 | ### Testing 28 | 29 | ```bash 30 | pip install -r requirements-dev.txt 31 | 32 | # linting and formatting 33 | bash format.sh 34 | # Static type checking 35 | mypy 36 | # Unit tests 37 | pytest tests/ 38 | ``` 39 | **Note:** Currently, the repository does not pass the mypy tests. 40 | 41 | 42 | ## Contributing Guidelines 43 | 44 | ### Issue Reporting 45 | 46 | If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it. 47 | If not, please file a new issue, providing as much relevant information as possible. 48 | 49 | ### Pull Requests & Code Reviews 50 | 51 | Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE.md) for detailed guide for contribution. 52 | 53 | ### Thank You 54 | 55 | Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. 56 | Your contributions make vLLM a great tool for everyone! 57 | -------------------------------------------------------------------------------- /Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | # This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform. 2 | 3 | FROM ubuntu:22.04 4 | 5 | RUN apt-get update -y \ 6 | && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \ 7 | && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 8 | 9 | RUN pip install --upgrade pip \ 10 | && pip install wheel packaging ninja setuptools>=49.4.0 numpy 11 | 12 | COPY ./ /workspace/vllm 13 | 14 | WORKDIR /workspace/vllm 15 | 16 | RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu 17 | 18 | RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install 19 | 20 | WORKDIR /workspace/ 21 | 22 | CMD ["/bin/bash"] 23 | -------------------------------------------------------------------------------- /Dockerfile.neuron: -------------------------------------------------------------------------------- 1 | # default base image 2 | ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04" 3 | 4 | FROM $BASE_IMAGE 5 | 6 | RUN echo "Base image is $BASE_IMAGE" 7 | 8 | # Install some basic utilities 9 | RUN apt-get update && apt-get install python3 python3-pip -y 10 | 11 | ### Mount Point ### 12 | # When launching the container, mount the code directory to /app 13 | ARG APP_MOUNT=/app 14 | VOLUME [ ${APP_MOUNT} ] 15 | WORKDIR ${APP_MOUNT} 16 | 17 | RUN python3 -m pip install --upgrade pip 18 | RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas 19 | RUN python3 -m pip install sentencepiece transformers==4.36.2 -U 20 | RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U 21 | RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U 22 | 23 | COPY ./vllm /app/vllm/vllm 24 | COPY ./setup.py /app/vllm/setup.py 25 | COPY ./requirements-common.txt /app/vllm/requirements-common.txt 26 | COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt 27 | 28 | RUN cd /app/vllm \ 29 | && python3 -m pip install -U -r requirements-neuron.txt 30 | 31 | ENV VLLM_BUILD_WITH_NEURON 1 32 | RUN cd /app/vllm \ 33 | && pip install -e . \ 34 | && cd .. 35 | 36 | CMD ["/bin/bash"] 37 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include requirements-common.txt 3 | include requirements-cuda.txt 4 | include requirements-rocm.txt 5 | include requirements-neuron.txt 6 | include requirements-cpu.txt 7 | include CMakeLists.txt 8 | 9 | recursive-include cmake * 10 | recursive-include csrc * 11 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking vLLM 2 | 3 | ## Downloading the ShareGPT dataset 4 | 5 | You can download the dataset by running: 6 | ```bash 7 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json 8 | ``` 9 | -------------------------------------------------------------------------------- /benchmarks/cutlass_benchmarks/weight_shapes.py: -------------------------------------------------------------------------------- 1 | # Weight Shapes are in the format 2 | # ([K, N], TP_SPLIT_DIM) 3 | # Example: 4 | # A shape of ([14336, 4096], 0) indicates the following GEMM shape, 5 | # - TP1 : K = 14336, N = 4096 6 | # - TP2 : K = 7168, N = 4096 7 | # A shape of ([4096, 6144], 1) indicates the following GEMM shape, 8 | # - TP1 : K = 4096, N = 6144 9 | # - TP4 : K = 4096, N = 1536 10 | 11 | # TP1 shapes 12 | WEIGHT_SHAPES = { 13 | "mistralai/Mistral-7B-v0.1": [ 14 | ([4096, 6144], 1), 15 | ([4096, 4096], 0), 16 | ([4096, 28672], 1), 17 | ([14336, 4096], 0), 18 | ], 19 | "meta-llama/Llama-2-7b-hf": [ 20 | ([4096, 12288], 1), 21 | ([4096, 4096], 0), 22 | ([4096, 22016], 1), 23 | ([11008, 4096], 0), 24 | ], 25 | "meta-llama/Llama-2-13b-hf": [ 26 | ([5120, 15360], 1), 27 | ([5120, 5120], 0), 28 | ([5120, 27648], 1), 29 | ([13824, 5120], 0), 30 | ], 31 | "meta-llama/Llama-2-70b-hf": [ 32 | ([8192, 10240], 1), 33 | ([8192, 8192], 0), 34 | ([8192, 57344], 1), 35 | ([28672, 8192], 0), 36 | ], 37 | } 38 | -------------------------------------------------------------------------------- /benchmarks/kernels/benchmark_shapes.py: -------------------------------------------------------------------------------- 1 | WEIGHT_SHAPES = { 2 | "ideal": [[4 * 256 * 32, 256 * 32]], 3 | "mistralai/Mistral-7B-v0.1/TP1": [ 4 | [4096, 6144], 5 | [4096, 4096], 6 | [4096, 28672], 7 | [14336, 4096], 8 | ], 9 | "mistralai/Mistral-7B-v0.1/TP2": [ 10 | [4096, 3072], 11 | [2048, 4096], 12 | [4096, 14336], 13 | [7168, 4096], 14 | ], 15 | "mistralai/Mistral-7B-v0.1/TP4": [ 16 | [4096, 1536], 17 | [1024, 4096], 18 | [4096, 7168], 19 | [3584, 4096], 20 | ], 21 | "meta-llama/Llama-2-7b-hf/TP1": [ 22 | [4096, 12288], 23 | [4096, 4096], 24 | [4096, 22016], 25 | [11008, 4096], 26 | ], 27 | "meta-llama/Llama-2-7b-hf/TP2": [ 28 | [4096, 6144], 29 | [2048, 4096], 30 | [4096, 11008], 31 | [5504, 4096], 32 | ], 33 | "meta-llama/Llama-2-7b-hf/TP4": [ 34 | [4096, 3072], 35 | [1024, 4096], 36 | [4096, 5504], 37 | [2752, 4096], 38 | ], 39 | "meta-llama/Llama-2-13b-hf/TP1": [ 40 | [5120, 15360], 41 | [5120, 5120], 42 | [5120, 27648], 43 | [13824, 5120], 44 | ], 45 | "meta-llama/Llama-2-13b-hf/TP2": [ 46 | [5120, 7680], 47 | [2560, 5120], 48 | [5120, 13824], 49 | [6912, 5120], 50 | ], 51 | "meta-llama/Llama-2-13b-hf/TP4": [ 52 | [5120, 3840], 53 | [1280, 5120], 54 | [5120, 6912], 55 | [3456, 5120], 56 | ], 57 | "meta-llama/Llama-2-70b-hf/TP1": [ 58 | [8192, 10240], 59 | [8192, 8192], 60 | [8192, 57344], 61 | [28672, 8192], 62 | ], 63 | "meta-llama/Llama-2-70b-hf/TP2": [ 64 | [8192, 5120], 65 | [4096, 8192], 66 | [8192, 28672], 67 | [14336, 8192], 68 | ], 69 | "meta-llama/Llama-2-70b-hf/TP4": [ 70 | [8192, 2560], 71 | [2048, 8192], 72 | [8192, 14336], 73 | [7168, 8192], 74 | ], 75 | } 76 | -------------------------------------------------------------------------------- /benchmarks/launch_tgi_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PORT=8000 4 | MODEL=$1 5 | TOKENS=$2 6 | 7 | docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \ 8 | -v $PWD/data:/data \ 9 | ghcr.io/huggingface/text-generation-inference:1.4.0 \ 10 | --model-id $MODEL \ 11 | --sharded false \ 12 | --max-input-length 1024 \ 13 | --max-total-tokens 2048 \ 14 | --max-best-of 5 \ 15 | --max-concurrent-requests 5000 \ 16 | --max-batch-total-tokens $TOKENS 17 | -------------------------------------------------------------------------------- /csrc/attention/attention_dtypes.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.cuh" 4 | #include "dtype_float16.cuh" 5 | #include "dtype_float32.cuh" 6 | #include "dtype_bfloat16.cuh" 7 | #include "dtype_fp8.cuh" 8 | -------------------------------------------------------------------------------- /csrc/attention/attention_generic.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from 3 | * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h 4 | * Copyright (c) 2023, The vLLM team. 5 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 6 | * 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | #pragma once 20 | 21 | #include 22 | 23 | namespace vllm { 24 | 25 | // A vector type to store Q, K, V elements. 26 | template 27 | struct Vec {}; 28 | 29 | // A vector type to store FP32 accumulators. 30 | template 31 | struct FloatVec {}; 32 | 33 | // Template vector operations. 34 | template 35 | inline __device__ Acc mul(A a, B b); 36 | 37 | template 38 | inline __device__ float sum(T v); 39 | 40 | template 41 | inline __device__ float dot(T a, T b) { 42 | return sum(mul(a, b)); 43 | } 44 | 45 | template 46 | inline __device__ float dot(T a, T b) { 47 | return sum(mul(a, b)); 48 | } 49 | 50 | template 51 | inline __device__ void zero(T& dst) { 52 | constexpr int WORDS = sizeof(T) / 4; 53 | union { 54 | T raw; 55 | uint32_t words[WORDS]; 56 | } tmp; 57 | 58 | #pragma unroll 59 | for (int ii = 0; ii < WORDS; ++ii) { 60 | tmp.words[ii] = 0u; 61 | } 62 | dst = tmp.raw; 63 | } 64 | 65 | } // namespace vllm 66 | -------------------------------------------------------------------------------- /csrc/attention/attention_utils.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from 3 | * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp 4 | * Copyright (c) 2023, The vLLM team. 5 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 6 | * 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | #pragma once 20 | 21 | #include "../cuda_compat.h" 22 | #include "attention_dtypes.h" 23 | 24 | #include 25 | #include 26 | 27 | namespace vllm { 28 | 29 | // Q*K^T operation. 30 | template 31 | inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) { 32 | using A_vec = typename FloatVec::Type; 33 | // Compute the parallel products for Q*K^T (treat vector lanes separately). 34 | A_vec qk_vec = mul(q[0], k[0]); 35 | #pragma unroll 36 | for (int ii = 1; ii < N; ++ii) { 37 | qk_vec = fma(q[ii], k[ii], qk_vec); 38 | } 39 | 40 | // Finalize the reduction across lanes. 41 | float qk = sum(qk_vec); 42 | #pragma unroll 43 | for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) { 44 | qk += VLLM_SHFL_XOR_SYNC(qk, mask); 45 | } 46 | return qk; 47 | } 48 | 49 | template 50 | struct Qk_dot { 51 | template 52 | static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) { 53 | return qk_dot_(q, k); 54 | } 55 | }; 56 | 57 | } // namespace vllm 58 | -------------------------------------------------------------------------------- /csrc/attention/dtype_fp8.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.cuh" 4 | 5 | #include 6 | #ifdef ENABLE_FP8 7 | #ifndef USE_ROCM 8 | #include 9 | #endif // USE_ROCM 10 | #endif // ENABLE_FP8 11 | 12 | namespace vllm { 13 | 14 | enum class Fp8KVCacheDataType { 15 | kAuto = 0, 16 | kFp8E4M3 = 1, 17 | kFp8E5M2 = 2, 18 | }; 19 | 20 | // fp8 vector types for quantization of kv cache 21 | template <> 22 | struct Vec { 23 | using Type = uint8_t; 24 | }; 25 | 26 | template <> 27 | struct Vec { 28 | using Type = uint16_t; 29 | }; 30 | 31 | template <> 32 | struct Vec { 33 | using Type = uint32_t; 34 | }; 35 | 36 | template <> 37 | struct Vec { 38 | using Type = uint2; 39 | }; 40 | 41 | } // namespace vllm 42 | -------------------------------------------------------------------------------- /csrc/cache.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | void swap_blocks(torch::Tensor& src, torch::Tensor& dst, 9 | const torch::Tensor& block_mapping); 10 | 11 | void copy_blocks(std::vector& key_caches, 12 | std::vector& value_caches, 13 | const torch::Tensor& block_mapping); 14 | 15 | void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, 16 | torch::Tensor& key_cache, torch::Tensor& value_cache, 17 | torch::Tensor& slot_mapping, 18 | const std::string& kv_cache_dtype, const float kv_scale); 19 | 20 | void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value, 21 | torch::Tensor& key_cache, 22 | torch::Tensor& value_cache, 23 | torch::Tensor& slot_mapping, 24 | const std::string& kv_cache_dtype); 25 | 26 | // Just for unittest 27 | void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache, 28 | const float scale, const std::string& kv_cache_dtype); 29 | -------------------------------------------------------------------------------- /csrc/cpu/pybind.cpp: -------------------------------------------------------------------------------- 1 | #include "cache.h" 2 | #include "cuda_utils.h" 3 | #include "ops.h" 4 | #include 5 | 6 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 7 | // vLLM custom ops 8 | pybind11::module ops = m.def_submodule("ops", "vLLM custom operators"); 9 | 10 | // Attention ops 11 | ops.def("paged_attention_v1", &paged_attention_v1, 12 | "Compute the attention between an input query and the cached " 13 | "keys/values using PagedAttention."); 14 | ops.def("paged_attention_v2", &paged_attention_v2, "PagedAttention V2."); 15 | 16 | // Activation ops 17 | ops.def("silu_and_mul", &silu_and_mul, "Activation function used in SwiGLU."); 18 | ops.def("gelu_and_mul", &gelu_and_mul, 19 | "Activation function used in GeGLU with `none` approximation."); 20 | ops.def("gelu_tanh_and_mul", &gelu_tanh_and_mul, 21 | "Activation function used in GeGLU with `tanh` approximation."); 22 | ops.def("gelu_new", &gelu_new, "GELU implementation used in GPT-2."); 23 | ops.def("gelu_fast", &gelu_fast, "Approximate GELU implementation."); 24 | 25 | // Layernorm 26 | ops.def("rms_norm", &rms_norm, 27 | "Apply Root Mean Square (RMS) Normalization to the input tensor."); 28 | 29 | ops.def("fused_add_rms_norm", &fused_add_rms_norm, 30 | "In-place fused Add and RMS Normalization"); 31 | 32 | // Rotary embedding 33 | ops.def("rotary_embedding", &rotary_embedding, 34 | "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); 35 | 36 | // Cache ops 37 | pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops"); 38 | cache_ops.def("swap_blocks", &swap_blocks, 39 | "Swap in (out) the cache blocks from src to dst"); 40 | cache_ops.def("copy_blocks", ©_blocks, 41 | "Copy the cache blocks from src to dst"); 42 | cache_ops.def("reshape_and_cache", &reshape_and_cache, 43 | "Reshape the key and value tensors and cache them"); 44 | } 45 | -------------------------------------------------------------------------------- /csrc/cuda_compat.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef USE_ROCM 4 | #include 5 | #endif 6 | 7 | #ifndef USE_ROCM 8 | #define WARP_SIZE 32 9 | #else 10 | #define WARP_SIZE warpSize 11 | #endif 12 | 13 | #ifndef USE_ROCM 14 | #define VLLM_LDG(arg) __ldg(arg) 15 | #else 16 | #define VLLM_LDG(arg) *(arg) 17 | #endif 18 | 19 | #ifndef USE_ROCM 20 | #define VLLM_SHFL_XOR_SYNC(var, lane_mask) \ 21 | __shfl_xor_sync(uint32_t(-1), var, lane_mask) 22 | #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \ 23 | __shfl_xor_sync(uint32_t(-1), var, lane_mask, width) 24 | #else 25 | #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask) 26 | #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \ 27 | __shfl_xor(var, lane_mask, width) 28 | #endif 29 | 30 | #ifndef USE_ROCM 31 | #define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane) 32 | #else 33 | #define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane) 34 | #endif 35 | 36 | #ifndef USE_ROCM 37 | #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) \ 38 | __shfl_down_sync(uint32_t(-1), var, lane_delta) 39 | #else 40 | #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) __shfl_down(var, lane_delta) 41 | #endif 42 | 43 | #ifndef USE_ROCM 44 | #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ 45 | cudaFuncSetAttribute(FUNC, cudaFuncAttributeMaxDynamicSharedMemorySize, VAL) 46 | #else 47 | #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ 48 | hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL) 49 | #endif 50 | -------------------------------------------------------------------------------- /csrc/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | int get_device_attribute(int attribute, int device_id); 6 | 7 | int get_max_shared_memory_per_block_device_attribute(int device_id); 8 | -------------------------------------------------------------------------------- /csrc/cuda_utils_kernels.cu: -------------------------------------------------------------------------------- 1 | #ifdef USE_ROCM 2 | #include 3 | #include 4 | #endif 5 | int get_device_attribute(int attribute, int device_id) { 6 | int device, value; 7 | if (device_id < 0) { 8 | cudaGetDevice(&device); 9 | } else { 10 | device = device_id; 11 | } 12 | cudaDeviceGetAttribute(&value, static_cast(attribute), 13 | device); 14 | return value; 15 | } 16 | 17 | int get_max_shared_memory_per_block_device_attribute(int device_id) { 18 | int attribute; 19 | // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html 20 | // cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74 21 | 22 | #ifdef USE_ROCM 23 | attribute = hipDeviceAttributeMaxSharedMemoryPerBlock; 24 | #else 25 | attribute = cudaDevAttrMaxSharedMemoryPerBlockOptin; 26 | #endif 27 | 28 | return get_device_attribute(attribute, device_id); 29 | } 30 | -------------------------------------------------------------------------------- /csrc/dispatch_utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from 3 | * https://github.com/pytorch/pytorch/blob/v2.0.1/aten/src/ATen/Dispatch.h 4 | */ 5 | #pragma once 6 | 7 | #include 8 | 9 | #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ 10 | AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ 11 | AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ 12 | AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) 13 | 14 | #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ 15 | AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) 16 | 17 | #define VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...) \ 18 | AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ 19 | AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ 20 | AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ 21 | AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) 22 | 23 | #define VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(TYPE, NAME, ...) \ 24 | AT_DISPATCH_SWITCH(TYPE, NAME, \ 25 | VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(__VA_ARGS__)) 26 | 27 | #define VLLM_DISPATCH_CASE_INTEGRAL_TYPES(...) \ 28 | AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \ 29 | AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \ 30 | AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \ 31 | AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \ 32 | AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) 33 | 34 | #define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \ 35 | AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__)) 36 | -------------------------------------------------------------------------------- /csrc/moe/moe_ops.cpp: -------------------------------------------------------------------------------- 1 | #include "moe_ops.h" 2 | 3 | #include 4 | 5 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 6 | m.def("topk_softmax", &topk_softmax, 7 | "Apply topk softmax to the gating outputs."); 8 | } 9 | -------------------------------------------------------------------------------- /csrc/moe/moe_ops.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices, 6 | torch::Tensor& token_expert_indices, 7 | torch::Tensor& gating_output); 8 | -------------------------------------------------------------------------------- /csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_bfloat16, nv_bfloat16) 5 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_bfloat16, nv_bfloat16, nv_bfloat16) 6 | -------------------------------------------------------------------------------- /csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, float, nv_bfloat16) 5 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_bfloat16, float, nv_bfloat16) 6 | -------------------------------------------------------------------------------- /csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_half, nv_half) 5 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_half, nv_half, nv_half) 6 | -------------------------------------------------------------------------------- /csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, float, nv_half) 5 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_half, float, nv_half) 6 | -------------------------------------------------------------------------------- /csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_bfloat16, nv_bfloat16) 5 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, float, nv_bfloat16, nv_bfloat16) 6 | -------------------------------------------------------------------------------- /csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_half, nv_half) 5 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, float, nv_half, nv_half) 6 | -------------------------------------------------------------------------------- /csrc/punica/bgmv/generator.py: -------------------------------------------------------------------------------- 1 | DTYPES = ["fp16", "bf16", "fp32"] 2 | DTYPE_MAP = { 3 | "fp16": "nv_half", 4 | "bf16": "nv_bfloat16", 5 | "fp32": "float", 6 | } 7 | 8 | TEMPLATE = """ 9 | #include "bgmv_config.h" 10 | #include "bgmv_impl.cuh" 11 | 12 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, {input_dtype}, {output_dtype}, {weight_dtype}) 13 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, {input_dtype}, {output_dtype}, {weight_dtype}) 14 | """.lstrip() # noqa: E501 15 | 16 | for input_dtype in DTYPES: 17 | for output_dtype in DTYPES: 18 | for weight_dtype in DTYPES: 19 | if weight_dtype == "fp32": 20 | # FP32 weights are not supported. 21 | continue 22 | if output_dtype == "fp32": 23 | # LoRA A matrix. 24 | if input_dtype != weight_dtype: 25 | # NOTE(woosuk): While Punica supports the case where the 26 | # input and weight dtypes are different, we only generate 27 | # the kernels the same dtypes to reduce the binary size. 28 | continue 29 | elif input_dtype == "fp32": 30 | # LoRA B matrix. 31 | if output_dtype != weight_dtype: 32 | # NOTE(woosuk): While Punica supports the case where the 33 | # output and weight dtypes are different, we only generate 34 | # the kernels the same dtypes to reduce the binary size. 35 | continue 36 | elif not (input_dtype == output_dtype == weight_dtype): 37 | # NOTE(woosuk): While Punica supports mixed data types for 38 | # input, output, and weight, we only generate the kernels with 39 | # the same data types to reduce the binary size. 40 | continue 41 | 42 | kernel_definition = TEMPLATE.format( 43 | input_dtype=DTYPE_MAP[input_dtype], 44 | output_dtype=DTYPE_MAP[output_dtype], 45 | weight_dtype=DTYPE_MAP[weight_dtype]) 46 | filename = f"bgmv_{input_dtype}_{output_dtype}_{weight_dtype}.cu" 47 | with open(filename, "w") as f: 48 | f.write(kernel_definition) 49 | -------------------------------------------------------------------------------- /csrc/punica/punica_ops.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w, 6 | torch::Tensor indicies, int64_t layer_idx, float scale); 7 | 8 | void dispatch_bgmv_low_level(torch::Tensor y, torch::Tensor x, torch::Tensor w, 9 | torch::Tensor indicies, int64_t layer_idx, 10 | float scale, int64_t h_in, int64_t h_out, 11 | int64_t y_offset); 12 | -------------------------------------------------------------------------------- /csrc/punica/punica_pybind.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "punica_ops.h" 4 | 5 | //====== pybind ====== 6 | 7 | #define DEFINE_pybind(name) m.def(#name, &name, #name); 8 | 9 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 10 | m.def("dispatch_bgmv", &dispatch_bgmv, "dispatch_bgmv"); 11 | m.def("dispatch_bgmv_low_level", &dispatch_bgmv_low_level, 12 | "dispatch_bgmv_low_level"); 13 | } 14 | -------------------------------------------------------------------------------- /csrc/punica/type_convert.h: -------------------------------------------------------------------------------- 1 | #ifndef CSRC__PUNICA__TYPE_CONVERT_H__ 2 | #define CSRC__PUNICA__TYPE_CONVERT_H__ 3 | 4 | #ifndef USE_ROCM 5 | 6 | #include 7 | #include 8 | 9 | #else 10 | 11 | #include 12 | #include 13 | 14 | #define __TYPE_CONVERT__HOST_DEVICE__ __host__ __device__ 15 | 16 | typedef __half nv_half; 17 | typedef __hip_bfloat16 nv_bfloat16; 18 | typedef __hip_bfloat162 nv_bfloat162; 19 | 20 | __TYPE_CONVERT__HOST_DEVICE__ 21 | inline __hip_bfloat162 make_bfloat162(__hip_bfloat16 val) { 22 | return __hip_bfloat162{val, val}; 23 | } 24 | 25 | __TYPE_CONVERT__HOST_DEVICE__ 26 | inline __hip_bfloat162 make_bfloat162(__hip_bfloat16 vall, __hip_bfloat16 valr) { 27 | return __hip_bfloat162{vall, valr}; 28 | } 29 | 30 | template 31 | __TYPE_CONVERT__HOST_DEVICE__ 32 | inline T_dst convert_type(T_src val) { 33 | return static_cast(val); 34 | } 35 | 36 | template <> 37 | __TYPE_CONVERT__HOST_DEVICE__ 38 | inline float convert_type<__half, float>(__half val) { 39 | return __half2float(val); 40 | } 41 | 42 | template <> 43 | __TYPE_CONVERT__HOST_DEVICE__ 44 | inline __half convert_type(float val) { 45 | return __float2half(val); 46 | } 47 | 48 | template <> 49 | __TYPE_CONVERT__HOST_DEVICE__ 50 | inline float convert_type<__hip_bfloat16, float>(__hip_bfloat16 val) { 51 | return __bfloat162float(val); 52 | } 53 | 54 | template <> 55 | __TYPE_CONVERT__HOST_DEVICE__ 56 | inline __hip_bfloat16 convert_type(float val) { 57 | return __float2bfloat16(val); 58 | } 59 | 60 | template 61 | __TYPE_CONVERT__HOST_DEVICE__ 62 | inline T vllm_add(T a, T b) { 63 | return a + b; 64 | } 65 | 66 | template <> 67 | __TYPE_CONVERT__HOST_DEVICE__ 68 | inline __half vllm_add<__half>(__half a, __half b) { 69 | return __hadd(a, b); 70 | } 71 | 72 | template <> 73 | __TYPE_CONVERT__HOST_DEVICE__ 74 | inline __hip_bfloat16 vllm_add<__hip_bfloat16>(__hip_bfloat16 a, __hip_bfloat16 b) { 75 | return __hadd(a, b); 76 | } 77 | 78 | #undef __TYPE_CONVERT__HOST_DEVICE__ 79 | 80 | #endif // USE_ROCM 81 | 82 | #endif // CSRC__PUNICA__TYPE_CONVERT_H__ 83 | -------------------------------------------------------------------------------- /csrc/quantization/cutlass_w8a8/common.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cutlass/cutlass.h" 4 | 5 | /** 6 | * Helper function for checking CUTLASS errors 7 | */ 8 | #define CUTLASS_CHECK(status) \ 9 | { \ 10 | TORCH_CHECK(status == cutlass::Status::kSuccess, \ 11 | cutlassGetStatusString(status)) \ 12 | } 13 | -------------------------------------------------------------------------------- /csrc/quantization/gptq/compat.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Copied from https://github.com/turboderp/exllamav2 3 | */ 4 | 5 | #ifndef _compat_cuh 6 | #define _compat_cuh 7 | 8 | namespace vllm { 9 | namespace gptq { 10 | // atomicAdd for half types, to support CC < 7.x 11 | 12 | __device__ __forceinline__ void atomicAdd_half(half* address, half val) { 13 | unsigned int* address_as_ui = 14 | (unsigned int*)((char*)address - ((size_t)address & 2)); 15 | unsigned int old = *address_as_ui; 16 | unsigned int assumed; 17 | 18 | do { 19 | assumed = old; 20 | __half_raw hsum; 21 | hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); 22 | half tmpres = __hadd(hsum, val); 23 | hsum = __half_raw(tmpres); 24 | old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) 25 | : (old & 0xffff0000) | hsum.x; 26 | old = atomicCAS(address_as_ui, assumed, old); 27 | } while (assumed != old); 28 | } 29 | 30 | // atomicAdd for half2 types 31 | 32 | __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val) { 33 | unsigned int* address_as_ui = (unsigned int*)address; 34 | unsigned int old = *address_as_ui; 35 | unsigned int assumed; 36 | do { 37 | assumed = old; 38 | half2 old_val = *((half2*)&old); 39 | half2 new_val = __hadd2(old_val, val); 40 | old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val)); 41 | } while (assumed != old); 42 | } 43 | 44 | // 45 | 46 | #if defined(__CUDA_ARCH__) || defined(USE_ROCM) 47 | #if __CUDA_ARCH__ < 700 || defined(USE_ROCM) 48 | 49 | __device__ __forceinline__ void atomicAdd(half* address, half val) { 50 | atomicAdd_half(address, val); 51 | } 52 | 53 | #if __CUDA_ARCH__ < 600 || defined(USE_ROCM) 54 | __device__ __forceinline__ void atomicAdd(half2* address, half2 val) { 55 | atomicAdd_half2(address, val); 56 | } 57 | #endif 58 | 59 | #endif 60 | #endif 61 | 62 | } // namespace gptq 63 | } // namespace vllm 64 | #endif 65 | -------------------------------------------------------------------------------- /csrc/quantization/gptq/qdq_8.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Copied from https://github.com/turboderp/exllamav2 3 | */ 4 | 5 | #ifndef _qdq_8_cuh 6 | #define _qdq_8_cuh 7 | 8 | #include "qdq_util.cuh" 9 | 10 | namespace vllm { 11 | namespace gptq { 12 | 13 | __forceinline__ __device__ void shuffle_8bit_4(uint32_t* q, int stride) {} 14 | 15 | __forceinline__ __device__ void dequant_8bit_8(const uint32_t q_0, 16 | const uint32_t q_1, 17 | half2 (&dq)[4], int stride, 18 | const uint32_t zero) { 19 | half dqh[8]; 20 | for (int i = 0; i < 4; i++) dqh[i] = dq_ns(exb(q_0, i * 8, 0xff), zero); 21 | for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), zero); 22 | 23 | for (int i = 0; i < 4; i++) 24 | dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); 25 | } 26 | 27 | } // namespace gptq 28 | } // namespace vllm 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /csrc/quantization/gptq/qdq_util.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Copied from https://github.com/turboderp/exllamav2 3 | */ 4 | 5 | #ifndef _qdq_util_cuh 6 | #define _qdq_util_cuh 7 | 8 | namespace vllm { 9 | namespace gptq { 10 | 11 | union half2_uint32 { 12 | uint32_t as_uint32; 13 | half2 as_half2; 14 | __device__ half2_uint32(uint32_t val) : as_uint32(val) {} 15 | __device__ half2_uint32(half2 val) : as_half2(val) {} 16 | }; 17 | 18 | union half_uint16 { 19 | uint16_t as_uint16; 20 | half as_half; 21 | __device__ half_uint16(uint16_t val) : as_uint16(val) {} 22 | __device__ half_uint16(half val) : as_half(val) {} 23 | }; 24 | 25 | // Max_scale premultiplied by 1/256 26 | 27 | __forceinline__ __device__ half dq_scale(const int qs, const half max_scale) { 28 | int qs_i = qs + 1; 29 | half qs_h = __int2half_rn(qs_i * qs_i); 30 | qs_h = __hmul(qs_h, max_scale); 31 | return qs_h; 32 | } 33 | 34 | __forceinline__ __device__ half dq(const int q, const int qzero, 35 | const half scale) { 36 | return __hmul(__int2half_rn(q - qzero), scale); 37 | } 38 | 39 | __forceinline__ __device__ half dq_ns(const int q, const int qzero) { 40 | // return __hsub(__int2half_rn(q), __int2half_rn(qzero)); 41 | return __int2half_rn(q - qzero); 42 | } 43 | 44 | __forceinline__ __device__ int exb(const uint32_t q, const int shift, 45 | const int mask) { 46 | return (int)((q >> shift) & mask); 47 | } 48 | 49 | __forceinline__ __device__ int exb(const uint32_t q1, const uint32_t q0, 50 | const int shift, const int mask) { 51 | return (int)(__funnelshift_rc(q0, q1, shift) & mask); 52 | } 53 | 54 | } // namespace gptq 55 | } // namespace vllm 56 | #endif 57 | -------------------------------------------------------------------------------- /csrc/quantization/marlin/sparse/common/base.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2024 Roberto Lopez Castro (roberto.lopez.castro@udc.es). All 3 | * Rights Reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | #pragma once 19 | 20 | namespace marlin_24 { 21 | 22 | constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; } 23 | 24 | // Instances of `Vec` are used to organize groups of >>registers<<, as needed 25 | // for instance as inputs to tensor core operations. Consequently, all 26 | // corresponding index accesses must be compile-time constants, which is why we 27 | // extensively use `#pragma unroll` throughout the kernel code to guarantee 28 | // this. 29 | template 30 | struct Vec { 31 | T elems[n]; 32 | __device__ T& operator[](int i) { return elems[i]; } 33 | }; 34 | 35 | template 36 | struct ShapeBase { 37 | static constexpr int M = M_, N = N_, K = K_; 38 | }; 39 | 40 | using I4 = Vec; 41 | 42 | // Matrix fragments for tensor core instructions; their precise layout is 43 | // documented here: 44 | // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type 45 | using FragA = Vec; 46 | using FragB = Vec; 47 | using FragM = Vec; 48 | using FragC = Vec; 49 | using FragS = Vec; // quantization scales 50 | 51 | } // namespace marlin_24 52 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # vLLM documents 2 | 3 | ## Build the docs 4 | 5 | ```bash 6 | # Install dependencies. 7 | pip install -r requirements-docs.txt 8 | 9 | # Build the docs. 10 | make clean 11 | make html 12 | ``` 13 | 14 | ## Open the docs with your browser 15 | 16 | ```bash 17 | python -m http.server -d build/html/ 18 | ``` 19 | Launch your browser and open localhost:8000. 20 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | sphinx == 6.2.1 2 | sphinx-book-theme == 1.0.1 3 | sphinx-copybutton == 0.5.2 4 | myst-parser == 2.0.0 5 | sphinx-argparse 6 | 7 | # packages to install to build the documentation 8 | pydantic 9 | -f https://download.pytorch.org/whl/cpu 10 | torch 11 | py-cpuinfo 12 | transformers 13 | openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args 14 | -------------------------------------------------------------------------------- /docs/source/assets/dev/dockerfile-stages-dependency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/docs/source/assets/dev/dockerfile-stages-dependency.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/k_vecs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/docs/source/assets/kernel/k_vecs.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/key.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/docs/source/assets/kernel/key.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/logits_vec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/docs/source/assets/kernel/logits_vec.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/q_vecs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/docs/source/assets/kernel/q_vecs.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/query.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/docs/source/assets/kernel/query.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/v_vec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/docs/source/assets/kernel/v_vec.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/value.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/docs/source/assets/kernel/value.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-only-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/docs/source/assets/logos/vllm-logo-only-light.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-text-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/docs/source/assets/logos/vllm-logo-text-dark.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-text-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/docs/source/assets/logos/vllm-logo-text-light.png -------------------------------------------------------------------------------- /docs/source/community/meetups.rst: -------------------------------------------------------------------------------- 1 | .. _meetups: 2 | 3 | vLLM Meetups 4 | ============ 5 | 6 | We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: 7 | 8 | - `The third vLLM meetup `__, with Roblox, April 2nd 2024. `[Slides] `__ 9 | - `The second vLLM meetup `__, with IBM Research, January 31st 2024. `[Slides] `__ `[Video (vLLM Update)] `__ `[Video (IBM Research & torch.compile)] `__ 10 | - `The first vLLM meetup `__, with a16z, October 5th 2023. `[Slides] `__ 11 | 12 | We are always looking for speakers and sponsors at San Francisco Bay Area and potentially other locations. If you are interested in speaking or sponsoring, please contact us at `vllm-questions@lists.berkeley.edu `__. 13 | -------------------------------------------------------------------------------- /docs/source/community/sponsors.md: -------------------------------------------------------------------------------- 1 | # Sponsors 2 | 3 | vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support! 4 | 5 | 6 | 7 | 8 | - a16z 9 | - AMD 10 | - Anyscale 11 | - AWS 12 | - Crusoe Cloud 13 | - Databricks 14 | - DeepInfra 15 | - Dropbox 16 | - Lambda Lab 17 | - NVIDIA 18 | - Replicate 19 | - Roblox 20 | - RunPod 21 | - Trainy 22 | - UC Berkeley 23 | - UC San Diego 24 | 25 | We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM. -------------------------------------------------------------------------------- /docs/source/dev/dockerfile/dockerfile.rst: -------------------------------------------------------------------------------- 1 | Dockerfile 2 | ==================== 3 | 4 | See `here `_ for the main Dockerfile to construct 5 | the image for running an OpenAI compatible server with vLLM. 6 | 7 | - Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes: 8 | 9 | - All build stages 10 | - The default build target (highlighted in grey) 11 | - External images (with dashed borders) 12 | 13 | The edges of the build graph represent: 14 | 15 | - FROM ... dependencies (with a solid line and a full arrow head) 16 | - COPY --from=... dependencies (with a dashed line and an empty arrow head) 17 | - RUN --mount=(.*)from=... dependencies (with a dotted line and an empty diamond arrow head) 18 | 19 | .. figure:: ../../assets/dev/dockerfile-stages-dependency.png 20 | :alt: query 21 | :width: 100% 22 | :align: center 23 | 24 | Made using: https://github.com/patrickhoefler/dockerfilegraph 25 | 26 | Commands to regenerate the build graph (make sure to run it **from the `root` directory of the vLLM repository** where the dockerfile is present): 27 | 28 | .. code:: bash 29 | 30 | dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile 31 | 32 | or in case you want to run it directly with the docker image: 33 | 34 | .. code:: bash 35 | 36 | docker run \ 37 | --rm \ 38 | --user "$(id -u):$(id -g)" \ 39 | --workdir /workspace \ 40 | --volume "$(pwd)":/workspace \ 41 | ghcr.io/patrickhoefler/dockerfilegraph:alpine \ 42 | --output png \ 43 | --dpi 200 \ 44 | --max-label-length 50 \ 45 | --filename Dockerfile \ 46 | --legend 47 | 48 | (To run it for a different file, you can pass in a different argument to the flag `--filename`.) 49 | 50 | -------------------------------------------------------------------------------- /docs/source/dev/engine/async_llm_engine.rst: -------------------------------------------------------------------------------- 1 | AsyncLLMEngine 2 | ================================= 3 | 4 | .. autoclass:: vllm.AsyncLLMEngine 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/source/dev/engine/engine_index.rst: -------------------------------------------------------------------------------- 1 | vLLM Engine 2 | ================================= 3 | 4 | .. automodule:: vllm.engine 5 | .. currentmodule:: vllm.engine 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | :caption: Engines 10 | 11 | llm_engine 12 | async_llm_engine 13 | 14 | -------------------------------------------------------------------------------- /docs/source/dev/engine/llm_engine.rst: -------------------------------------------------------------------------------- 1 | LLMEngine 2 | ================================= 3 | 4 | .. autoclass:: vllm.LLMEngine 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/source/dev/multimodal/multimodal_index.rst: -------------------------------------------------------------------------------- 1 | Multi-Modality 2 | ============== 3 | 4 | .. currentmodule:: vllm.multimodal 5 | 6 | vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package. 7 | 8 | :class:`vllm.inputs.PromptStrictInputs` accepts an additional attribute ``multi_modal_data`` 9 | which allows you to pass in multi-modal input alongside text and token prompts. 10 | 11 | By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model, 12 | you must decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_dummy_data `, 13 | as well as :meth:`MULTIMODAL_REGISTRY.register_input ` for each modality type to support. 14 | 15 | .. contents:: 16 | :local: 17 | :backlinks: none 18 | 19 | Module Contents 20 | +++++++++++++++ 21 | 22 | .. automodule:: vllm.multimodal 23 | 24 | Registry 25 | -------- 26 | 27 | .. data:: vllm.multimodal.MULTIMODAL_REGISTRY 28 | 29 | The global :class:`MultiModalRegistry` which is used by model runners. 30 | 31 | .. autoclass:: vllm.multimodal.MultiModalRegistry 32 | :members: 33 | :show-inheritance: 34 | 35 | Base Classes 36 | ------------ 37 | 38 | .. autoclass:: vllm.multimodal.MultiModalData 39 | :members: 40 | :show-inheritance: 41 | 42 | .. autoclass:: vllm.multimodal.MultiModalPlugin 43 | :members: 44 | :show-inheritance: 45 | 46 | Image Classes 47 | ------------- 48 | 49 | .. automodule:: vllm.multimodal.image 50 | :members: 51 | :show-inheritance: 52 | -------------------------------------------------------------------------------- /docs/source/dev/offline_inference/llm.rst: -------------------------------------------------------------------------------- 1 | LLM Class 2 | ========= 3 | 4 | .. autoclass:: vllm.LLM 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/source/dev/offline_inference/llm_inputs.rst: -------------------------------------------------------------------------------- 1 | LLM Inputs 2 | ========== 3 | 4 | .. autodata:: vllm.inputs.PromptStrictInputs 5 | 6 | .. autoclass:: vllm.inputs.TextPrompt 7 | :show-inheritance: 8 | :members: 9 | :member-order: bysource 10 | 11 | .. autoclass:: vllm.inputs.TokensPrompt 12 | :show-inheritance: 13 | :members: 14 | :member-order: bysource 15 | -------------------------------------------------------------------------------- /docs/source/dev/offline_inference/offline_index.rst: -------------------------------------------------------------------------------- 1 | Offline Inference 2 | ================================= 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | 7 | llm 8 | llm_inputs 9 | -------------------------------------------------------------------------------- /docs/source/dev/sampling_params.rst: -------------------------------------------------------------------------------- 1 | Sampling Parameters 2 | =================== 3 | 4 | .. autoclass:: vllm.SamplingParams 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/getting_started/examples/examples_index.template.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | ================================= 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | :caption: Scripts 7 | 8 | %EXAMPLE_DOCS% 9 | -------------------------------------------------------------------------------- /docs/source/models/engine_args.rst: -------------------------------------------------------------------------------- 1 | .. _engine_args: 2 | 3 | Engine Arguments 4 | ================ 5 | 6 | Below, you can find an explanation of every engine argument for vLLM: 7 | 8 | .. argparse:: 9 | :module: vllm.engine.arg_utils 10 | :func: _engine_args_parser 11 | :prog: -m vllm.entrypoints.openai.api_server 12 | :nodefaultconst: 13 | 14 | Async Engine Arguments 15 | ---------------------- 16 | 17 | Below are the additional arguments related to the asynchronous engine: 18 | 19 | .. argparse:: 20 | :module: vllm.engine.arg_utils 21 | :func: _async_engine_args_parser 22 | :prog: -m vllm.entrypoints.openai.api_server 23 | :nodefaultconst: -------------------------------------------------------------------------------- /docs/source/models/vlm.rst: -------------------------------------------------------------------------------- 1 | .. _vlm: 2 | 3 | Using VLMs 4 | ========== 5 | 6 | This document shows you how to run and serve Vision Language Models (VLMs) using vLLM. 7 | 8 | Engine Arguments 9 | ---------------- 10 | 11 | The following :ref:`engine arguments ` are specific to VLMs: 12 | 13 | .. argparse:: 14 | :module: vllm.engine.arg_utils 15 | :func: _vlm_engine_args_parser 16 | :prog: -m vllm.entrypoints.openai.api_server 17 | :nodefaultconst: 18 | 19 | Offline Batched Inference 20 | ------------------------- 21 | 22 | To initialize a VLM, the aforementioned arguments must be passed to the ``LLM`` class for instantiating the engine. 23 | 24 | .. code-block:: python 25 | 26 | llm = LLM( 27 | model="llava-hf/llava-1.5-7b-hf", 28 | image_input_type="pixel_values", 29 | image_token_id=32000, 30 | image_input_shape="1,3,336,336", 31 | image_feature_size=576, 32 | ) 33 | 34 | For now, we only support a single image per text prompt. To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`: 35 | 36 | * ``prompt``: The prompt should have a number of ```` tokens equal to ``image_feature_size``. 37 | * ``multi_modal_data``: This should be an instance of :class:`~vllm.multimodal.image.ImagePixelData` or :class:`~vllm.multimodal.image.ImageFeatureData`. 38 | 39 | .. code-block:: python 40 | 41 | prompt = "" * 576 + ( 42 | "\nUSER: What is the content of this image?\nASSISTANT:") 43 | 44 | # Load the image using PIL.Image 45 | image = ... 46 | 47 | outputs = llm.generate({ 48 | "prompt": prompt, 49 | "multi_modal_data": ImagePixelData(image), 50 | }) 51 | 52 | for o in outputs: 53 | generated_text = o.outputs[0].text 54 | print(generated_text) 55 | 56 | A code example can be found in `examples/llava_example.py `_. 57 | -------------------------------------------------------------------------------- /docs/source/quantization/fp8_e5m2_kvcache.rst: -------------------------------------------------------------------------------- 1 | .. _fp8_kv_cache: 2 | 3 | FP8 E5M2 KV Cache 4 | ================== 5 | 6 | The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits. 7 | The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bflaot16 and fp8 to each other. 8 | 9 | Here is an example of how to enable this feature: 10 | 11 | .. code-block:: python 12 | 13 | from vllm import LLM, SamplingParams 14 | # Sample prompts. 15 | prompts = [ 16 | "Hello, my name is", 17 | "The president of the United States is", 18 | "The capital of France is", 19 | "The future of AI is", 20 | ] 21 | # Create a sampling params object. 22 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 23 | # Create an LLM. 24 | llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8") 25 | # Generate texts from the prompts. The output is a list of RequestOutput objects 26 | # that contain the prompt, generated text, and other information. 27 | outputs = llm.generate(prompts, sampling_params) 28 | # Print the outputs. 29 | for output in outputs: 30 | prompt = output.prompt 31 | generated_text = output.outputs[0].text 32 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 33 | 34 | 35 | Note, current prefix caching doesn't work with FP8 KV cache enabled, forward_prefix kernel should handle different KV and cache type. 36 | 37 | -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_bentoml.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_bentoml: 2 | 3 | Deploying with BentoML 4 | ====================== 5 | 6 | `BentoML `_ allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-complicant image and deploy it on Kubernetes. 7 | 8 | For details, see the tutorial `vLLM inference in the BentoML documentation `_. -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_kserve.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_kserve: 2 | 3 | Deploying with KServe 4 | ============================ 5 | 6 | vLLM can be deployed with `KServe `_ on Kubernetes for highly scalable distributed model serving. 7 | 8 | Please see `this guide `_ for more details on using vLLM with KServe. 9 | -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_lws.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_lws: 2 | 3 | Deploying with LWS 4 | ============================ 5 | 6 | LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads. 7 | A major use case is for multi-host/multi-node distributed inference. 8 | 9 | vLLM can be deployed with `LWS `_ on Kubernetes for distributed model serving. 10 | 11 | Please see `this guide `_ for more details on 12 | deploying vLLM on Kubernetes using LWS. 13 | -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_triton.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_triton: 2 | 3 | Deploying with NVIDIA Triton 4 | ============================ 5 | 6 | The `Triton Inference Server `_ hosts a tutorial demonstrating how to quickly deploy a simple `facebook/opt-125m `_ model using vLLM. Please see `Deploying a vLLM model in Triton `_ for more details. 7 | -------------------------------------------------------------------------------- /docs/source/serving/distributed_serving.rst: -------------------------------------------------------------------------------- 1 | .. _distributed_serving: 2 | 3 | Distributed Inference and Serving 4 | ================================= 5 | 6 | vLLM supports distributed tensor-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm `_. We manage the distributed runtime with `Ray `_. To run distributed inference, install Ray with: 7 | 8 | .. code-block:: console 9 | 10 | $ pip install ray 11 | 12 | To run multi-GPU inference with the :code:`LLM` class, set the :code:`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs: 13 | 14 | .. code-block:: python 15 | 16 | from vllm import LLM 17 | llm = LLM("facebook/opt-13b", tensor_parallel_size=4) 18 | output = llm.generate("San Franciso is a") 19 | 20 | To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs: 21 | 22 | .. code-block:: console 23 | 24 | $ python -m vllm.entrypoints.api_server \ 25 | $ --model facebook/opt-13b \ 26 | $ --tensor-parallel-size 4 27 | 28 | To scale vLLM beyond a single machine, start a `Ray runtime `_ via CLI before running vLLM: 29 | 30 | .. code-block:: console 31 | 32 | $ # On head node 33 | $ ray start --head 34 | 35 | $ # On worker nodes 36 | $ ray start --address= 37 | 38 | After that, you can run inference and serving on multiple machines by launching the vLLM process on the head node by setting :code:`tensor_parallel_size` to the number of GPUs to be the total number of GPUs across all machines. -------------------------------------------------------------------------------- /docs/source/serving/env_vars.rst: -------------------------------------------------------------------------------- 1 | Environment Variables 2 | ======================== 3 | 4 | vLLM uses the following environment variables to configure the system: 5 | 6 | .. literalinclude:: ../../../vllm/envs.py 7 | :language: python 8 | :start-after: begin-env-vars-definition 9 | :end-before: end-env-vars-definition 10 | -------------------------------------------------------------------------------- /docs/source/serving/integrations.rst: -------------------------------------------------------------------------------- 1 | Integrations 2 | ------------ 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | 7 | run_on_sky 8 | deploying_with_kserve 9 | deploying_with_triton 10 | deploying_with_bentoml 11 | deploying_with_lws 12 | deploying_with_dstack 13 | serving_with_langchain 14 | -------------------------------------------------------------------------------- /docs/source/serving/metrics.rst: -------------------------------------------------------------------------------- 1 | Production Metrics 2 | ================== 3 | 4 | vLLM exposes a number of metrics that can be used to monitor the health of the 5 | system. These metrics are exposed via the `/metrics` endpoint on the vLLM 6 | OpenAI compatible API server. 7 | 8 | The following metrics are exposed: 9 | 10 | .. literalinclude:: ../../../vllm/engine/metrics.py 11 | :language: python 12 | :start-after: begin-metrics-definitions 13 | :end-before: end-metrics-definitions 14 | -------------------------------------------------------------------------------- /docs/source/serving/serving_with_langchain.rst: -------------------------------------------------------------------------------- 1 | .. _run_on_langchain: 2 | 3 | Serving with Langchain 4 | ============================ 5 | 6 | vLLM is also available via `Langchain `_ . 7 | 8 | To install langchain, run 9 | 10 | .. code-block:: console 11 | 12 | $ pip install langchain langchain_community -q 13 | 14 | To run inference on a single or multiple GPUs, use ``VLLM`` class from ``langchain``. 15 | 16 | .. code-block:: python 17 | 18 | from langchain_community.llms import VLLM 19 | 20 | llm = VLLM(model="mosaicml/mpt-7b", 21 | trust_remote_code=True, # mandatory for hf models 22 | max_new_tokens=128, 23 | top_k=10, 24 | top_p=0.95, 25 | temperature=0.8, 26 | # tensor_parallel_size=... # for distributed inference 27 | ) 28 | 29 | print(llm("What is the capital of France ?")) 30 | 31 | Please refer to this `Tutorial `_ for more details. 32 | -------------------------------------------------------------------------------- /docs/source/serving/usage_stats.md: -------------------------------------------------------------------------------- 1 | # Usage Stats Collection 2 | 3 | vLLM collects anonymous usage data by default to help the engineering team better understand which hardware and model configurations are widely used. This data allows them to prioritize their efforts on the most common workloads. The collected data is transparent, does not contain any sensitive information, and will be publicly released for the community's benefit. 4 | 5 | ## What data is collected? 6 | 7 | You can see the up to date list of data collected by vLLM in the [usage_lib.py](https://github.com/vllm-project/vllm/blob/main/vllm/usage/usage_lib.py). 8 | 9 | Here is an example as of v0.4.0: 10 | 11 | ```json 12 | { 13 | "uuid": "fbe880e9-084d-4cab-a395-8984c50f1109", 14 | "provider": "GCP", 15 | "num_cpu": 24, 16 | "cpu_type": "Intel(R) Xeon(R) CPU @ 2.20GHz", 17 | "cpu_family_model_stepping": "6,85,7", 18 | "total_memory": 101261135872, 19 | "architecture": "x86_64", 20 | "platform": "Linux-5.10.0-28-cloud-amd64-x86_64-with-glibc2.31", 21 | "gpu_count": 2, 22 | "gpu_type": "NVIDIA L4", 23 | "gpu_memory_per_device": 23580639232, 24 | "model_architecture": "OPTForCausalLM", 25 | "vllm_version": "0.3.2+cu123", 26 | "context": "LLM_CLASS", 27 | "log_time": 1711663373492490000, 28 | "source": "production", 29 | "dtype": "torch.float16", 30 | "tensor_parallel_size": 1, 31 | "block_size": 16, 32 | "gpu_memory_utilization": 0.9, 33 | "quantization": null, 34 | "kv_cache_dtype": "auto", 35 | "enable_lora": false, 36 | "enable_prefix_caching": false, 37 | "enforce_eager": false, 38 | "disable_custom_all_reduce": true 39 | } 40 | ``` 41 | 42 | You can preview the collected data by running the following command: 43 | 44 | ```bash 45 | tail ~/.config/vllm/usage_stats.json 46 | ``` 47 | 48 | ## Opt-out of Usage Stats Collection 49 | 50 | You can opt-out of usage stats collection by setting the VLLM_NO_USAGE_STATS or DO_NOT_TRACK environment variable, or by creating a ~/.config/vllm/do_not_track file: 51 | 52 | ```bash 53 | # Any of the following methods can disable usage stats collection 54 | export VLLM_NO_USAGE_STATS=1 55 | export DO_NOT_TRACK=1 56 | mkdir -p ~/.config/vllm && touch ~/.config/vllm/do_not_track 57 | ``` 58 | -------------------------------------------------------------------------------- /examples/aqlm_example.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from vllm import LLM, SamplingParams 4 | 5 | 6 | def main(): 7 | 8 | parser = argparse.ArgumentParser(description='AQLM examples') 9 | 10 | parser.add_argument('--model', 11 | '-m', 12 | type=str, 13 | default=None, 14 | help='model path, as for HF') 15 | parser.add_argument('--choice', 16 | '-c', 17 | type=int, 18 | default=0, 19 | help='known good models by index, [0-4]') 20 | parser.add_argument('--tensor_parallel_size', 21 | '-t', 22 | type=int, 23 | default=1, 24 | help='tensor parallel size') 25 | 26 | args = parser.parse_args() 27 | 28 | models = [ 29 | "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", 30 | "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf", 31 | "ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf", 32 | "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf", 33 | "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", 34 | ] 35 | 36 | model = LLM(args.model if args.model is not None else models[args.choice], 37 | tensor_parallel_size=args.tensor_parallel_size) 38 | 39 | sampling_params = SamplingParams(max_tokens=100, temperature=0) 40 | outputs = model.generate("Hello my name is", 41 | sampling_params=sampling_params) 42 | print(outputs[0].outputs[0].text) 43 | 44 | 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /examples/fp8/quantizer/README.md: -------------------------------------------------------------------------------- 1 | ### Quantizer Utilities 2 | `quantize.py`: NVIDIA Quantization utilities using AMMO, ported from TensorRT-LLM: 3 | `https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/quantize.py` 4 | 5 | ### Prerequisite 6 | 7 | #### AMMO (AlgorithMic Model Optimization) Installation: nvidia-ammo 0.7.1 or later 8 | `pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo` 9 | 10 | #### AMMO Download (code and docs) 11 | `https://developer.nvidia.com/downloads/assets/cuda/files/nvidia-ammo/nvidia_ammo-0.5.0.tar.gz` 12 | `https://developer.nvidia.com/downloads/assets/cuda/files/nvidia-ammo/nvidia_ammo-0.7.1.tar.gz` 13 | 14 | ### Usage 15 | 16 | #### Run on H100 system for speed if FP8; number of GPUs depends on the model size 17 | 18 | #### Example: quantize Llama2-7b model from HF to FP8 with FP8 KV Cache: 19 | `python quantize.py --model_dir ./ll2-7b --dtype float16 --qformat fp8 --kv_cache_dtype fp8 --output_dir ./ll2_7b_fp8 --calib_size 512 --tp_size 1` 20 | 21 | Outputs: model structure, quantized model & parameters (with scaling factors) are in JSON and Safetensors (npz is generated only for the reference) 22 | ``` 23 | # ll ./ll2_7b_fp8/ 24 | total 19998244 25 | drwxr-xr-x 2 root root 4096 Feb 7 01:08 ./ 26 | drwxrwxr-x 8 1060 1061 4096 Feb 7 01:08 ../ 27 | -rw-r--r-- 1 root root 176411 Feb 7 01:08 llama_tp1.json 28 | -rw-r--r-- 1 root root 13477087480 Feb 7 01:09 llama_tp1_rank0.npz 29 | -rw-r--r-- 1 root root 7000893272 Feb 7 01:08 rank0.safetensors 30 | # 31 | ``` 32 | 33 | -------------------------------------------------------------------------------- /examples/gradio_webserver.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import gradio as gr 5 | import requests 6 | 7 | 8 | def http_bot(prompt): 9 | headers = {"User-Agent": "vLLM Client"} 10 | pload = { 11 | "prompt": prompt, 12 | "stream": True, 13 | "max_tokens": 128, 14 | } 15 | response = requests.post(args.model_url, 16 | headers=headers, 17 | json=pload, 18 | stream=True) 19 | 20 | for chunk in response.iter_lines(chunk_size=8192, 21 | decode_unicode=False, 22 | delimiter=b"\0"): 23 | if chunk: 24 | data = json.loads(chunk.decode("utf-8")) 25 | output = data["text"][0] 26 | yield output 27 | 28 | 29 | def build_demo(): 30 | with gr.Blocks() as demo: 31 | gr.Markdown("# vLLM text completion demo\n") 32 | inputbox = gr.Textbox(label="Input", 33 | placeholder="Enter text and press ENTER") 34 | outputbox = gr.Textbox(label="Output", 35 | placeholder="Generated result from the model") 36 | inputbox.submit(http_bot, [inputbox], [outputbox]) 37 | return demo 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--host", type=str, default=None) 43 | parser.add_argument("--port", type=int, default=8001) 44 | parser.add_argument("--model-url", 45 | type=str, 46 | default="http://localhost:8000/generate") 47 | args = parser.parse_args() 48 | 49 | demo = build_demo() 50 | demo.queue().launch(server_name=args.host, 51 | server_port=args.port, 52 | share=True) 53 | -------------------------------------------------------------------------------- /examples/offline_inference.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | # Sample prompts. 4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | # Create a sampling params object. 11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 12 | 13 | # Create an LLM. 14 | llm = LLM(model="facebook/opt-125m") 15 | # Generate texts from the prompts. The output is a list of RequestOutput objects 16 | # that contain the prompt, generated text, and other information. 17 | outputs = llm.generate(prompts, sampling_params) 18 | # Print the outputs. 19 | for output in outputs: 20 | prompt = output.prompt 21 | generated_text = output.outputs[0].text 22 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 23 | -------------------------------------------------------------------------------- /examples/offline_inference_arctic.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | # Sample prompts. 4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | # Create a sampling params object. 11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 12 | 13 | # Create an LLM. 14 | llm = LLM(model="snowflake/snowflake-arctic-instruct", 15 | quantization="deepspeedfp", 16 | tensor_parallel_size=8, 17 | trust_remote_code=True) 18 | # Generate texts from the prompts. The output is a list of RequestOutput objects 19 | # that contain the prompt, generated text, and other information. 20 | 21 | outputs = llm.generate(prompts, sampling_params) 22 | # Print the outputs. 23 | for output in outputs: 24 | prompt = output.prompt 25 | generated_text = output.outputs[0].text 26 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 27 | -------------------------------------------------------------------------------- /examples/offline_inference_embedding.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM 2 | 3 | # Sample prompts. 4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | 11 | # Create an LLM. 12 | model = LLM(model="intfloat/e5-mistral-7b-instruct", enforce_eager=True) 13 | # Generate embedding. The output is a list of EmbeddingRequestOutputs. 14 | outputs = model.encode(prompts) 15 | # Print the outputs. 16 | for output in outputs: 17 | print(output.outputs.embedding) # list of 4096 floats 18 | -------------------------------------------------------------------------------- /examples/offline_inference_neuron.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | # Sample prompts. 4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | # Create a sampling params object. 11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 12 | 13 | # Create an LLM. 14 | llm = LLM( 15 | model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", 16 | max_num_seqs=8, 17 | # The max_model_len and block_size arguments are required to be same as 18 | # max sequence length when targeting neuron device. 19 | # Currently, this is a known limitation in continuous batching support 20 | # in transformers-neuronx. 21 | # TODO(liangfu): Support paged-attention in transformers-neuronx. 22 | max_model_len=128, 23 | block_size=128, 24 | # The device can be automatically detected when AWS Neuron SDK is installed. 25 | # The device argument can be either unspecified for automated detection, 26 | # or explicitly assigned. 27 | device="neuron", 28 | tensor_parallel_size=2) 29 | # Generate texts from the prompts. The output is a list of RequestOutput objects 30 | # that contain the prompt, generated text, and other information. 31 | outputs = llm.generate(prompts, sampling_params) 32 | # Print the outputs. 33 | for output in outputs: 34 | prompt = output.prompt 35 | generated_text = output.outputs[0].text 36 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 37 | -------------------------------------------------------------------------------- /examples/openai_chat_completion_client.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 4 | openai_api_key = "EMPTY" 5 | openai_api_base = "http://localhost:8000/v1" 6 | 7 | client = OpenAI( 8 | # defaults to os.environ.get("OPENAI_API_KEY") 9 | api_key=openai_api_key, 10 | base_url=openai_api_base, 11 | ) 12 | 13 | models = client.models.list() 14 | model = models.data[0].id 15 | 16 | chat_completion = client.chat.completions.create( 17 | messages=[{ 18 | "role": "system", 19 | "content": "You are a helpful assistant." 20 | }, { 21 | "role": "user", 22 | "content": "Who won the world series in 2020?" 23 | }, { 24 | "role": 25 | "assistant", 26 | "content": 27 | "The Los Angeles Dodgers won the World Series in 2020." 28 | }, { 29 | "role": "user", 30 | "content": "Where was it played?" 31 | }], 32 | model=model, 33 | ) 34 | 35 | print("Chat completion results:") 36 | print(chat_completion) 37 | -------------------------------------------------------------------------------- /examples/openai_completion_client.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 4 | openai_api_key = "EMPTY" 5 | openai_api_base = "http://localhost:8000/v1" 6 | 7 | client = OpenAI( 8 | # defaults to os.environ.get("OPENAI_API_KEY") 9 | api_key=openai_api_key, 10 | base_url=openai_api_base, 11 | ) 12 | 13 | models = client.models.list() 14 | model = models.data[0].id 15 | 16 | # Completion API 17 | stream = False 18 | completion = client.completions.create( 19 | model=model, 20 | prompt="A robot may not injure a human being", 21 | echo=False, 22 | n=2, 23 | stream=stream, 24 | logprobs=3) 25 | 26 | print("Completion results:") 27 | if stream: 28 | for c in completion: 29 | print(c) 30 | else: 31 | print(completion) 32 | -------------------------------------------------------------------------------- /examples/openai_embedding_client.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 4 | openai_api_key = "EMPTY" 5 | openai_api_base = "http://localhost:8000/v1" 6 | 7 | client = OpenAI( 8 | # defaults to os.environ.get("OPENAI_API_KEY") 9 | api_key=openai_api_key, 10 | base_url=openai_api_base, 11 | ) 12 | 13 | models = client.models.list() 14 | model = models.data[0].id 15 | 16 | responses = client.embeddings.create(input=[ 17 | "Hello my name is", 18 | "The best thing about vLLM is that it supports many different models" 19 | ], 20 | model=model) 21 | 22 | for data in responses.data: 23 | print(data.embedding) # list of float of len 4096 24 | -------------------------------------------------------------------------------- /examples/openai_example_batch.jsonl: -------------------------------------------------------------------------------- 1 | {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} 2 | {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} 3 | -------------------------------------------------------------------------------- /examples/production_monitoring/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # docker-compose.yaml 2 | version: "3" 3 | 4 | services: 5 | prometheus: 6 | image: prom/prometheus:latest 7 | extra_hosts: 8 | - "host.docker.internal:host-gateway" # allow a direct connection from container to the local machine 9 | ports: 10 | - "9090:9090" # the default port used by Prometheus 11 | volumes: 12 | - ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml # mount Prometheus config file 13 | 14 | grafana: 15 | image: grafana/grafana:latest 16 | depends_on: 17 | - prometheus 18 | ports: 19 | - "3000:3000" # the default port used by Grafana 20 | -------------------------------------------------------------------------------- /examples/production_monitoring/prometheus.yaml: -------------------------------------------------------------------------------- 1 | # prometheus.yaml 2 | global: 3 | scrape_interval: 5s 4 | evaluation_interval: 30s 5 | 6 | scrape_configs: 7 | - job_name: vllm 8 | static_configs: 9 | - targets: 10 | - 'host.docker.internal:8000' 11 | -------------------------------------------------------------------------------- /examples/template_alpaca.jinja: -------------------------------------------------------------------------------- 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 2 | 3 | {% for message in messages %} 4 | {% if message['role'] == 'user' %} 5 | ### Instruction: 6 | {{ message['content']|trim -}} 7 | {% if not loop.last %} 8 | 9 | 10 | {% endif %} 11 | {% elif message['role'] == 'assistant' %} 12 | ### Response: 13 | {{ message['content']|trim -}} 14 | {% if not loop.last %} 15 | 16 | 17 | {% endif %} 18 | {% elif message['role'] == 'user_context' %} 19 | ### Input: 20 | {{ message['content']|trim -}} 21 | {% if not loop.last %} 22 | 23 | 24 | {% endif %} 25 | {% endif %} 26 | {% endfor %} 27 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} 28 | ### Response: 29 | {% endif %} -------------------------------------------------------------------------------- /examples/template_baichuan.jinja: -------------------------------------------------------------------------------- 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 2 | 3 | {%- for message in messages -%} 4 | {%- if message['role'] == 'user' -%} 5 | {{- '' + message['content'] -}} 6 | {%- elif message['role'] == 'assistant' -%} 7 | {{- '' + message['content'] -}} 8 | {%- endif -%} 9 | {%- endfor -%} 10 | 11 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 12 | {{- '' -}} 13 | {% endif %} -------------------------------------------------------------------------------- /examples/template_chatglm.jinja: -------------------------------------------------------------------------------- 1 | {%- set counter = namespace(index=0) -%} 2 | {%- for message in messages -%} 3 | {%- if message['role'] == 'user' -%} 4 | {{- '[Round ' + counter.index|string + ']\n问:' + message['content'] -}} 5 | {%- set counter.index = counter.index + 1 -%} 6 | {%- endif -%} 7 | {%- if message['role'] == 'assistant' -%} 8 | {{- '\n答:' + message['content'] -}} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n' -}} 11 | {%- endif -%} 12 | {%- endif -%} 13 | {%- endfor -%} 14 | 15 | 16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 17 | {{- '\n答:' -}} 18 | {%- endif -%} -------------------------------------------------------------------------------- /examples/template_chatglm2.jinja: -------------------------------------------------------------------------------- 1 | {%- set counter = namespace(index=1) -%} 2 | {%- for message in messages -%} 3 | {%- if message['role'] == 'user' -%} 4 | {{- '[Round ' + counter.index|string + ']\n\n问:' + message['content'] -}} 5 | {%- set counter.index = counter.index + 1 -%} 6 | {%- endif -%} 7 | {%- if message['role'] == 'assistant' -%} 8 | {{- '\n\n答:' + message['content'] -}} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n\n' -}} 11 | {%- endif -%} 12 | {%- endif -%} 13 | {%- endfor -%} 14 | 15 | 16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 17 | {{- '\n\n答:' -}} 18 | {%- endif -%} -------------------------------------------------------------------------------- /examples/template_chatml.jinja: -------------------------------------------------------------------------------- 1 | {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} 2 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} -------------------------------------------------------------------------------- /examples/template_falcon.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {%- if message['role'] == 'user' -%} 3 | {{- 'User: ' + message['content'] -}} 4 | {%- elif message['role'] == 'assistant' -%} 5 | {{- 'Assistant: ' + message['content'] -}} 6 | {%- endif -%} 7 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 8 | {{- '\n' -}} 9 | {%- endif -%} 10 | {%- endfor -%} 11 | 12 | 13 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 14 | {{- 'Assistant:' -}} 15 | {% endif %} -------------------------------------------------------------------------------- /examples/template_falcon_180b.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {%- if message['role'] == 'system' -%} 3 | {{- 'System: ' + message['content'] -}} 4 | {%- elif message['role'] == 'user' -%} 5 | {{- 'User: ' + message['content'] -}} 6 | {%- elif message['role'] == 'assistant' -%} 7 | {{- 'Falcon: ' + message['content'] -}} 8 | {%- endif -%} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n' -}} 11 | {%- endif -%} 12 | {%- endfor -%} 13 | 14 | 15 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 16 | {{- 'Falcon:' -}} 17 | {% endif %} -------------------------------------------------------------------------------- /examples/template_inkbot.jinja: -------------------------------------------------------------------------------- 1 | <#meta#> 2 | - Date: {{ (messages|selectattr('role', 'equalto', 'meta-current_date')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-current_date')|list) else '' }} 3 | - Task: {{ (messages|selectattr('role', 'equalto', 'meta-task_name')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-task_name')|list) else '' }} 4 | <#system#> 5 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 6 | <#chat#> 7 | {% for message in messages %} 8 | {% if message['role'] == 'user' %} 9 | <#user#> 10 | {{ message['content']|trim -}} 11 | {% if not loop.last %} 12 | 13 | {% endif %} 14 | {% elif message['role'] == 'assistant' %} 15 | <#bot#> 16 | {{ message['content']|trim -}} 17 | {% if not loop.last %} 18 | 19 | {% endif %} 20 | {% elif message['role'] == 'user_context' %} 21 | <#user_context#> 22 | {{ message['content']|trim -}} 23 | {% if not loop.last %} 24 | 25 | {% endif %} 26 | {% endif %} 27 | {% endfor %} 28 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} 29 | <#bot#> 30 | {% endif %} -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | # Should be mirrored in requirements-build.txt 3 | requires = [ 4 | "cmake>=3.21", 5 | "ninja", 6 | "packaging", 7 | "setuptools >= 49.4.0", 8 | "wheel", 9 | ] 10 | build-backend = "setuptools.build_meta" 11 | 12 | [tool.ruff] 13 | # Allow lines to be as long as 80. 14 | line-length = 80 15 | exclude = [ 16 | # External file, leaving license intact 17 | "examples/fp8/quantizer/quantize.py" 18 | ] 19 | 20 | [tool.ruff.lint] 21 | select = [ 22 | # pycodestyle 23 | "E", 24 | # Pyflakes 25 | "F", 26 | # pyupgrade 27 | # "UP", 28 | # flake8-bugbear 29 | "B", 30 | # flake8-simplify 31 | "SIM", 32 | # isort 33 | # "I", 34 | "G", 35 | ] 36 | ignore = [ 37 | # star imports 38 | "F405", "F403", 39 | # lambda expression assignment 40 | "E731", 41 | # Loop control variable not used within loop body 42 | "B007", 43 | ] 44 | 45 | [tool.mypy] 46 | python_version = "3.8" 47 | 48 | ignore_missing_imports = true 49 | check_untyped_defs = true 50 | follow_imports = "skip" 51 | 52 | files = "vllm" 53 | # TODO(woosuk): Include the code from Megatron and HuggingFace. 54 | exclude = [ 55 | "vllm/model_executor/parallel_utils/|vllm/model_executor/models/", 56 | # Ignore triton kernels in ops. 57 | 'vllm/attention/ops/.*\.py$' 58 | ] 59 | 60 | [tool.codespell] 61 | ignore-words-list = "dout, te, indicies, subtile" 62 | skip = "./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build" 63 | 64 | [tool.isort] 65 | use_parentheses = true 66 | skip_gitignore = true 67 | 68 | [tool.pytest.ini_options] 69 | markers = [ 70 | "skip_global_cleanup", 71 | "llm: run tests for vLLM API only", 72 | "openai: run tests for OpenAI API only", 73 | ] 74 | -------------------------------------------------------------------------------- /requirements-build.txt: -------------------------------------------------------------------------------- 1 | # Should be mirrored in pyproject.toml 2 | cmake>=3.21 3 | ninja 4 | packaging 5 | setuptools>=49.4.0 6 | wheel 7 | -------------------------------------------------------------------------------- /requirements-common.txt: -------------------------------------------------------------------------------- 1 | cmake >= 3.21 2 | ninja # For faster builds. 3 | psutil 4 | sentencepiece # Required for LLaMA tokenizer. 5 | numpy 6 | requests 7 | py-cpuinfo 8 | transformers >= 4.40.0 # Required for StarCoder2 & Llava, Llama 3. 9 | tokenizers >= 0.19.1 # Required for Llama 3. 10 | fastapi 11 | aiohttp 12 | openai 13 | uvicorn[standard] 14 | pydantic >= 2.0 # Required for OpenAI server. 15 | pillow # Required for image processing 16 | prometheus_client >= 0.18.0 17 | prometheus-fastapi-instrumentator >= 7.0.0 18 | tiktoken >= 0.6.0 # Required for DBRX tokenizer 19 | lm-format-enforcer == 0.10.1 20 | outlines == 0.0.34 # Requires torch >= 2.1.0 21 | typing_extensions 22 | filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 23 | -------------------------------------------------------------------------------- /requirements-cpu.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for x86_64 CPUs 5 | torch == 2.3.0+cpu 6 | triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error. -------------------------------------------------------------------------------- /requirements-cuda.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for NVIDIA GPUs 5 | ray >= 2.9 6 | nvidia-ml-py # for pynvml package 7 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # formatting 2 | yapf==0.32.0 3 | toml==0.10.2 4 | tomli==2.0.1 5 | ruff==0.1.5 6 | codespell==2.2.6 7 | isort==5.13.2 8 | clang-format==18.1.5 9 | 10 | # type checking 11 | mypy==1.9.0 12 | types-PyYAML 13 | types-requests 14 | types-setuptools 15 | 16 | # testing 17 | pytest 18 | tensorizer>=2.9.0 19 | pytest-forked 20 | pytest-asyncio 21 | pytest-rerunfailures 22 | pytest-shard 23 | 24 | # testing utils 25 | awscli 26 | einops # required for MPT 27 | httpx 28 | peft 29 | requests 30 | ray 31 | sentence-transformers # required for embedding 32 | 33 | # Benchmarking 34 | aiohttp 35 | 36 | # quantization 37 | bitsandbytes==0.42.0 38 | -------------------------------------------------------------------------------- /requirements-neuron.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for Neuron devices 5 | transformers-neuronx >= 0.9.0 6 | torch-neuronx >= 2.1.0 7 | neuronx-cc 8 | -------------------------------------------------------------------------------- /requirements-rocm.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for AMD GPUs 5 | ray >= 2.10.0 6 | pytest-asyncio 7 | -------------------------------------------------------------------------------- /rocm_patch/rocm_bf16.patch: -------------------------------------------------------------------------------- 1 | --- amd_hip_bf16.h 2024-02-06 18:28:58.268699142 +0000 2 | +++ amd_hip_bf16.h.new 2024-02-06 18:28:31.988647133 +0000 3 | @@ -90,10 +90,10 @@ 4 | #include "math_fwd.h" // ocml device functions 5 | 6 | #if defined(__HIPCC_RTC__) 7 | -#define __HOST_DEVICE__ __device__ 8 | +#define __HOST_DEVICE__ __device__ static 9 | #else 10 | #include 11 | -#define __HOST_DEVICE__ __host__ __device__ 12 | +#define __HOST_DEVICE__ __host__ __device__ static inline 13 | #endif 14 | 15 | // Since we are using unsigned short to represent data in bfloat16, it can be of different sizes on 16 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/__init__.py -------------------------------------------------------------------------------- /tests/async_engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/async_engine/__init__.py -------------------------------------------------------------------------------- /tests/async_engine/api_server_async_engine.py: -------------------------------------------------------------------------------- 1 | """vllm.entrypoints.api_server with some extra logging for testing.""" 2 | import argparse 3 | from typing import Any, Dict 4 | 5 | import uvicorn 6 | from fastapi.responses import JSONResponse, Response 7 | 8 | import vllm.entrypoints.api_server 9 | from vllm.engine.arg_utils import AsyncEngineArgs 10 | from vllm.engine.async_llm_engine import AsyncLLMEngine 11 | 12 | app = vllm.entrypoints.api_server.app 13 | 14 | 15 | class AsyncLLMEngineWithStats(AsyncLLMEngine): 16 | 17 | def __init__(self, *args, **kwargs): 18 | super().__init__(*args, **kwargs) 19 | self._num_aborts = 0 20 | 21 | async def abort(self, request_id: str) -> None: 22 | await super().abort(request_id) 23 | self._num_aborts += 1 24 | 25 | def testing_stats(self) -> Dict[str, Any]: 26 | return {"num_aborted_requests": self._num_aborts} 27 | 28 | 29 | @app.get("/stats") 30 | def stats() -> Response: 31 | """Get the statistics of the engine.""" 32 | return JSONResponse(engine.testing_stats()) 33 | 34 | 35 | if __name__ == "__main__": 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument("--host", type=str, default="localhost") 38 | parser.add_argument("--port", type=int, default=8000) 39 | parser = AsyncEngineArgs.add_cli_args(parser) 40 | args = parser.parse_args() 41 | 42 | engine_args = AsyncEngineArgs.from_cli_args(args) 43 | engine = AsyncLLMEngineWithStats.from_engine_args(engine_args) 44 | vllm.entrypoints.api_server.engine = engine 45 | uvicorn.run( 46 | app, 47 | host=args.host, 48 | port=args.port, 49 | log_level="debug", 50 | timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE) 51 | -------------------------------------------------------------------------------- /tests/basic_correctness/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/basic_correctness/__init__.py -------------------------------------------------------------------------------- /tests/basic_correctness/test_basic_correctness.py: -------------------------------------------------------------------------------- 1 | """Compare the short outputs of HF and vLLM when using greedy sampling. 2 | 3 | Run `pytest tests/basic_correctness/test_basic_correctness.py`. 4 | """ 5 | import os 6 | import weakref 7 | 8 | import pytest 9 | 10 | from vllm import LLM 11 | 12 | MODELS = [ 13 | "facebook/opt-125m", 14 | "meta-llama/Llama-2-7b-hf", 15 | ] 16 | VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND" 17 | 18 | 19 | def test_vllm_gc_ed(): 20 | """Verify vllm instance is GC'ed when it is deleted""" 21 | llm = LLM("facebook/opt-125m") 22 | weak_llm = weakref.ref(llm) 23 | del llm 24 | # If there's any circular reference to vllm, this fails 25 | # because llm instance is not GC'ed. 26 | assert weak_llm() is None 27 | 28 | 29 | @pytest.mark.parametrize("model", MODELS) 30 | @pytest.mark.parametrize("dtype", ["half"]) 31 | @pytest.mark.parametrize("max_tokens", [5]) 32 | @pytest.mark.parametrize("enforce_eager", [False, True]) 33 | def test_models( 34 | hf_runner, 35 | vllm_runner, 36 | example_prompts, 37 | model: str, 38 | dtype: str, 39 | max_tokens: int, 40 | enforce_eager: bool, 41 | ) -> None: 42 | backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND) 43 | if backend_by_env_var == "FLASHINFER" and enforce_eager is False: 44 | pytest.skip("Skipping non-eager test for FlashInferBackend.") 45 | 46 | hf_model = hf_runner(model, dtype=dtype) 47 | hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) 48 | del hf_model 49 | 50 | vllm_model = vllm_runner(model, 51 | dtype=dtype, 52 | enforce_eager=enforce_eager, 53 | gpu_memory_utilization=0.7) 54 | vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) 55 | del vllm_model 56 | 57 | for i in range(len(example_prompts)): 58 | hf_output_ids, hf_output_str = hf_outputs[i] 59 | vllm_output_ids, vllm_output_str = vllm_outputs[i] 60 | assert hf_output_str == vllm_output_str, ( 61 | f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") 62 | assert hf_output_ids == vllm_output_ids, ( 63 | f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") 64 | -------------------------------------------------------------------------------- /tests/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/core/__init__.py -------------------------------------------------------------------------------- /tests/core/block/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/core/block/__init__.py -------------------------------------------------------------------------------- /tests/core/block/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture() 5 | def should_do_global_cleanup_after_test() -> bool: 6 | """Disable the global cleanup fixture for tests in this directory. This 7 | provides a ~10x speedup for unit tests that don't load a model to GPU. 8 | 9 | This requires that tests in this directory clean up after themselves if they 10 | use the GPU. 11 | """ 12 | return False 13 | -------------------------------------------------------------------------------- /tests/core/block/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/core/block/e2e/__init__.py -------------------------------------------------------------------------------- /tests/core/block/test_common.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import pytest 4 | 5 | from vllm.core.block.common import RefCounter 6 | 7 | 8 | @pytest.mark.parametrize("seed", list(range(20))) 9 | @pytest.mark.parametrize("num_incrs", [1, 100]) 10 | @pytest.mark.parametrize("num_blocks", [1024]) 11 | def test_incr(seed: int, num_incrs: int, num_blocks: int): 12 | random.seed(seed) 13 | 14 | all_block_indices = list(range(num_blocks)) 15 | counter = RefCounter(all_block_indices=all_block_indices) 16 | 17 | block_id = random.randint(0, num_blocks - 1) 18 | for i in range(num_incrs): 19 | value = counter.incr(block_id) 20 | assert value == i + 1 21 | 22 | 23 | @pytest.mark.parametrize("seed", list(range(20))) 24 | @pytest.mark.parametrize("num_incrs", [1, 100]) 25 | @pytest.mark.parametrize("num_blocks", [1024]) 26 | def test_incr_decr(seed: int, num_incrs: int, num_blocks: int): 27 | random.seed(seed) 28 | 29 | all_block_indices = list(range(num_blocks)) 30 | counter = RefCounter(all_block_indices=all_block_indices) 31 | 32 | block_id = random.randint(0, num_blocks - 1) 33 | for i in range(num_incrs): 34 | value = counter.incr(block_id) 35 | assert value == i + 1 36 | 37 | for i in range(num_incrs): 38 | value = counter.decr(block_id) 39 | assert value == num_incrs - (i + 1) 40 | 41 | with pytest.raises(AssertionError): 42 | counter.decr(block_id) 43 | -------------------------------------------------------------------------------- /tests/distributed/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/distributed/__init__.py -------------------------------------------------------------------------------- /tests/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/engine/__init__.py -------------------------------------------------------------------------------- /tests/engine/output_processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/engine/output_processor/__init__.py -------------------------------------------------------------------------------- /tests/engine/test_computed_prefix_blocks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.engine.arg_utils import EngineArgs 4 | from vllm.engine.llm_engine import LLMEngine 5 | from vllm.sampling_params import SamplingParams 6 | 7 | 8 | @pytest.mark.parametrize("model", ["facebook/opt-125m"]) 9 | @pytest.mark.parametrize("block_size", [16]) 10 | def test_computed_prefix_blocks(model: str, block_size: int): 11 | # This test checks if we are able to run the engine to completion 12 | # without triggering asserts. 13 | # We are in a scenario where all blocks from the second request's prompt 14 | # are full and already computed when the second request arrives. 15 | prompt = ( 16 | "You are a helpful assistant. How do I build a car from cardboard and " 17 | "paper clips? Is there an easy to follow video tutorial available " 18 | "online for free?") 19 | prompt2 = ( 20 | " Please recommend to me some resources where I can learn not only to " 21 | "handle technical difficulties of building a car, but also " 22 | "decoration.") 23 | 24 | engine_args = EngineArgs(model=model, 25 | block_size=block_size, 26 | enable_prefix_caching=True) 27 | 28 | engine = LLMEngine.from_engine_args(engine_args) 29 | sampling_params = SamplingParams() 30 | 31 | engine.add_request("0", prompt + prompt2, sampling_params) 32 | engine.step() 33 | engine.add_request("1", prompt, sampling_params) 34 | engine.step() 35 | -------------------------------------------------------------------------------- /tests/engine/test_detokenization.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.entrypoints.llm import LLM 4 | from vllm.sampling_params import SamplingParams 5 | 6 | 7 | @pytest.mark.parametrize("model", ["facebook/opt-125m"]) 8 | def test_computed_prefix_blocks(model: str): 9 | # This test checks if the engine generates completions both with and 10 | # without optional detokenization, that detokenization includes text 11 | # and no-detokenization doesn't, and that both completions have the same 12 | # token_ids. 13 | prompt = ( 14 | "You are a helpful assistant. How do I build a car from cardboard and " 15 | "paper clips? Is there an easy to follow video tutorial available " 16 | "online for free?") 17 | 18 | llm = LLM(model=model) 19 | sampling_params = SamplingParams(max_tokens=10, 20 | temperature=0.0, 21 | detokenize=False) 22 | 23 | outputs_no_detokenization = llm.generate(prompt, 24 | sampling_params)[0].outputs[0] 25 | sampling_params.detokenize = True 26 | outputs_with_detokenization = llm.generate(prompt, 27 | sampling_params)[0].outputs[0] 28 | 29 | assert outputs_no_detokenization.text == '' 30 | assert outputs_with_detokenization.text != '' 31 | assert outputs_no_detokenization.token_ids == \ 32 | outputs_with_detokenization.token_ids 33 | -------------------------------------------------------------------------------- /tests/engine/test_skip_tokenizer_init.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.entrypoints.llm import LLM 4 | from vllm.sampling_params import SamplingParams 5 | 6 | 7 | @pytest.mark.parametrize("model", ["facebook/opt-125m"]) 8 | def test_skip_tokenizer_initialization(model: str): 9 | # This test checks if the flag skip_tokenizer_init skips the initialization 10 | # of tokenizer and detokenizer. The generated output is expected to contain 11 | # token ids. 12 | llm = LLM(model=model, skip_tokenizer_init=True) 13 | sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) 14 | with pytest.raises(ValueError) as err: 15 | llm.generate("abc", sampling_params) 16 | assert "prompts must be None if" in str(err.value) 17 | outputs = llm.generate({"prompt_token_ids": [1, 2, 3]}, 18 | sampling_params=sampling_params) 19 | assert len(outputs) > 0 20 | completions = outputs[0].outputs 21 | assert len(completions) > 0 22 | assert completions[0].text == "" 23 | assert completions[0].token_ids 24 | -------------------------------------------------------------------------------- /tests/entrypoints/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/entrypoints/__init__.py -------------------------------------------------------------------------------- /tests/entrypoints/openai/test_serving_chat.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from dataclasses import dataclass 3 | 4 | import pytest 5 | 6 | from vllm.entrypoints.openai.serving_chat import OpenAIServingChat 7 | 8 | MODEL_NAME = "openai-community/gpt2" 9 | CHAT_TEMPLATE = "Dummy chat template for testing {}" 10 | 11 | pytestmark = pytest.mark.openai 12 | 13 | 14 | @dataclass 15 | class MockModelConfig: 16 | tokenizer = MODEL_NAME 17 | trust_remote_code = False 18 | tokenizer_mode = "auto" 19 | max_model_len = 100 20 | tokenizer_revision = None 21 | embedding_mode = False 22 | 23 | 24 | @dataclass 25 | class MockEngine: 26 | 27 | async def get_model_config(self): 28 | return MockModelConfig() 29 | 30 | 31 | async def _async_serving_chat_init(): 32 | engine = MockEngine() 33 | model_config = await engine.get_model_config() 34 | 35 | serving_completion = OpenAIServingChat(engine, 36 | model_config, 37 | served_model_names=[MODEL_NAME], 38 | response_role="assistant", 39 | chat_template=CHAT_TEMPLATE) 40 | return serving_completion 41 | 42 | 43 | def test_async_serving_chat_init(): 44 | serving_completion = asyncio.run(_async_serving_chat_init()) 45 | assert serving_completion.tokenizer is not None 46 | assert serving_completion.tokenizer.chat_template == CHAT_TEMPLATE 47 | -------------------------------------------------------------------------------- /tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "llama", 3 | "kv_cache": { 4 | "dtype": "float8_e4m3fn", 5 | "scaling_factor": { 6 | "0": { 7 | "0": 0.0152239128947258, 8 | "1": 0.0188860222697258, 9 | "2": 0.0354178324341774, 10 | "3": 0.0376674123108387, 11 | "4": 0.0418526791036129, 12 | "5": 0.0433175228536129, 13 | "6": 0.0397600457072258, 14 | "7": 0.0424455925822258, 15 | "8": 0.0415387861430645, 16 | "9": 0.0408412404358387, 17 | "10": 0.0395856611430645, 18 | "11": 0.0377371683716774, 19 | "12": 0.0400739423930645, 20 | "13": 0.040771484375, 21 | "14": 0.0393415205180645, 22 | "15": 0.0369001142680645, 23 | "16": 0.03857421875, 24 | "17": 0.0387486070394516, 25 | "18": 0.0403180830180645, 26 | "19": 0.0396205373108387, 27 | "20": 0.0375627800822258, 28 | "21": 0.0407366082072258, 29 | "22": 0.0432477705180645, 30 | "23": 0.0377022884786129, 31 | "24": 0.0399693101644516, 32 | "25": 0.0374581478536129, 33 | "26": 0.0413295216858387, 34 | "27": 0.0442243330180645, 35 | "28": 0.0424804724752903, 36 | "29": 0.0456891767680645, 37 | "30": 0.0409109964966774, 38 | "31": 0.0482352152466774 39 | } 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /tests/kernels/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/kernels/__init__.py -------------------------------------------------------------------------------- /tests/kernels/allclose_default.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | # Reference default values of atol and rtol are from 4 | # https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67 5 | default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5} 6 | default_rtol = { 7 | torch.float16: 1e-3, 8 | torch.bfloat16: 1.6e-2, 9 | torch.float: 1.3e-6 10 | } 11 | 12 | 13 | def get_default_atol(output) -> float: 14 | return default_atol[output.dtype] 15 | 16 | 17 | def get_default_rtol(output) -> float: 18 | return default_rtol[output.dtype] 19 | -------------------------------------------------------------------------------- /tests/kernels/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.utils import (create_kv_caches_with_random, 4 | create_kv_caches_with_random_flash) 5 | 6 | 7 | @pytest.fixture() 8 | def kv_cache_factory(): 9 | return create_kv_caches_with_random 10 | 11 | 12 | @pytest.fixture() 13 | def kv_cache_factory_flashinfer(): 14 | return create_kv_caches_with_random_flash 15 | -------------------------------------------------------------------------------- /tests/kernels/test_int8_quant.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from vllm._C import ops 5 | 6 | DTYPES = [torch.half, torch.bfloat16, torch.float] 7 | HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 8192] # Arbitrary values for testing 8 | NUM_TOKENS = [1, 7, 83, 4096] # Arbitrary values for testing 9 | SEEDS = [0] 10 | SCALE = [0.1, 0.5, 0.8, 1.2, 2.1] 11 | 12 | 13 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 14 | @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) 15 | @pytest.mark.parametrize("dtype", DTYPES) 16 | @pytest.mark.parametrize("seed", SEEDS) 17 | @pytest.mark.parametrize("scale", SCALE) 18 | @torch.inference_mode() 19 | def test_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, 20 | seed: int, scale: float) -> None: 21 | torch.random.manual_seed(seed) 22 | torch.cuda.manual_seed(seed) 23 | x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 24 | 25 | out1 = (x / scale).round().clamp( 26 | torch.iinfo(torch.int8).min, 27 | torch.iinfo(torch.int8).max).to(torch.int8) 28 | out2 = torch.empty_like(x, dtype=torch.int8) 29 | ops.static_scaled_int8_quant(out2, x, scale) 30 | assert torch.allclose(out1, out2, 31 | atol=1) # big atol to account for rounding errors 32 | -------------------------------------------------------------------------------- /tests/kernels/test_layernorm.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from vllm.model_executor.layers.layernorm import RMSNorm 5 | 6 | DTYPES = [torch.half, torch.bfloat16, torch.float] 7 | NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing 8 | HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, 9 | 8199] # Arbitrary values for testing 10 | ADD_RESIDUAL = [False, True] 11 | SEEDS = [0] 12 | CUDA_DEVICES = [ 13 | f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) 14 | ] 15 | 16 | 17 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 18 | @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) 19 | @pytest.mark.parametrize("add_residual", ADD_RESIDUAL) 20 | @pytest.mark.parametrize("dtype", DTYPES) 21 | @pytest.mark.parametrize("seed", SEEDS) 22 | @pytest.mark.parametrize("device", CUDA_DEVICES) 23 | @torch.inference_mode() 24 | def test_rms_norm( 25 | num_tokens: int, 26 | hidden_size: int, 27 | add_residual: bool, 28 | dtype: torch.dtype, 29 | seed: int, 30 | device: str, 31 | ) -> None: 32 | torch.random.manual_seed(seed) 33 | if torch.cuda.is_available(): 34 | torch.cuda.manual_seed(seed) 35 | torch.set_default_device(device) 36 | layer = RMSNorm(hidden_size).to(dtype=dtype) 37 | layer.weight.data.normal_(mean=1.0, std=0.1) 38 | scale = 1 / (2 * hidden_size) 39 | x = torch.randn(num_tokens, hidden_size, dtype=dtype) 40 | x *= scale 41 | residual = torch.randn_like(x) * scale if add_residual else None 42 | 43 | # NOTE(woosuk): The reference implementation should be executed first 44 | # because the custom kernel is in-place. 45 | ref_out = layer._forward(x, residual) 46 | out = layer(x, residual) 47 | # NOTE(woosuk): LayerNorm operators (including RMS) typically have larger 48 | # numerical errors than other operators because they involve reductions. 49 | # Therefore, we use a larger tolerance. 50 | if add_residual: 51 | assert torch.allclose(out[0], ref_out[0], atol=1e-2, rtol=1e-2) 52 | assert torch.allclose(out[1], ref_out[1], atol=1e-2, rtol=1e-2) 53 | else: 54 | assert torch.allclose(out, ref_out, atol=1e-2, rtol=1e-2) 55 | -------------------------------------------------------------------------------- /tests/kernels/test_rand.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import pytest 4 | import torch 5 | 6 | from vllm.model_executor.layers.ops.rand import seeded_uniform 7 | from vllm.model_executor.utils import set_random_seed 8 | 9 | 10 | @pytest.mark.parametrize("dtype", 11 | [torch.float32, torch.float16, torch.bfloat16]) 12 | @pytest.mark.parametrize("use_3d", [True, False]) 13 | def test_seeded_uniform(dtype: torch.dtype, use_3d: bool): 14 | device = "cuda" 15 | for seed in range(512): 16 | set_random_seed(seed) 17 | rows = random.randint(1, 512) 18 | cols = random.randint(1, 64000) 19 | if use_3d: 20 | third_dim = random.randint(2, 10) 21 | dims = [rows, third_dim, cols] 22 | else: 23 | dims = [rows, cols] 24 | seeds = torch.randint(torch.iinfo(torch.long).min, 25 | torch.iinfo(torch.long).max, (rows, ), 26 | device=device) 27 | 28 | # Test that the same seed produces the same output 29 | out = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device) 30 | out2 = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device) 31 | torch.testing.assert_close(out, out2) 32 | # del to save memory 33 | del out2 34 | 35 | out3 = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device) 36 | torch.testing.assert_close(out, out3) 37 | # del to save memory 38 | del out3 39 | 40 | # Initialize out tensor with garbage to ensure that it is overwritten 41 | out_with_tensor = seeded_uniform( 42 | *dims, 43 | out=torch.full( 44 | (*dims, ), 45 | -1, 46 | dtype=dtype, 47 | device=device, 48 | ), 49 | seeds=seeds, 50 | dtype=dtype, 51 | ) 52 | torch.testing.assert_close(out, out_with_tensor) 53 | -------------------------------------------------------------------------------- /tests/lora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/lora/__init__.py -------------------------------------------------------------------------------- /tests/lora/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/lora/data/__init__.py -------------------------------------------------------------------------------- /tests/lora/test_gemma.py: -------------------------------------------------------------------------------- 1 | import vllm 2 | from vllm.lora.request import LoRARequest 3 | 4 | MODEL_PATH = "google/gemma-7b" 5 | 6 | 7 | def do_sample(llm, lora_path: str, lora_id: int) -> str: 8 | prompts = [ 9 | "Quote: Imagination is", 10 | "Quote: Be yourself;", 11 | "Quote: So many books,", 12 | ] 13 | sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32) 14 | outputs = llm.generate( 15 | prompts, 16 | sampling_params, 17 | lora_request=LoRARequest(str(lora_id), lora_id, lora_path) 18 | if lora_id else None) 19 | # Print the outputs. 20 | generated_texts = [] 21 | for output in outputs: 22 | prompt = output.prompt 23 | generated_text = output.outputs[0].text.strip() 24 | generated_texts.append(generated_text) 25 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 26 | return generated_texts 27 | 28 | 29 | def test_gemma_lora(gemma_lora_files): 30 | llm = vllm.LLM(MODEL_PATH, 31 | max_model_len=1024, 32 | enable_lora=True, 33 | max_loras=4) 34 | 35 | expected_lora_output = [ 36 | "more important than knowledge.\nAuthor: Albert Einstein\n", 37 | "everyone else is already taken.\nAuthor: Oscar Wilde\n", 38 | "so little time\nAuthor: Frank Zappa\n", 39 | ] 40 | 41 | output1 = do_sample(llm, gemma_lora_files, lora_id=1) 42 | for i in range(len(expected_lora_output)): 43 | assert output1[i].startswith(expected_lora_output[i]) 44 | output2 = do_sample(llm, gemma_lora_files, lora_id=2) 45 | for i in range(len(expected_lora_output)): 46 | assert output2[i].startswith(expected_lora_output[i]) 47 | -------------------------------------------------------------------------------- /tests/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/metrics/__init__.py -------------------------------------------------------------------------------- /tests/model_executor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/model_executor/__init__.py -------------------------------------------------------------------------------- /tests/model_executor/weight_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | import huggingface_hub.constants 5 | import pytest 6 | from huggingface_hub.utils import LocalEntryNotFoundError 7 | 8 | from vllm.model_executor.model_loader.weight_utils import ( 9 | download_weights_from_hf, enable_hf_transfer) 10 | 11 | 12 | def test_hf_transfer_auto_activation(): 13 | if "HF_HUB_ENABLE_HF_TRANSFER" in os.environ: 14 | # in case it is already set, we can't test the auto activation 15 | pytest.skip( 16 | "HF_HUB_ENABLE_HF_TRANSFER is set, can't test auto activation") 17 | enable_hf_transfer() 18 | try: 19 | # enable hf hub transfer if available 20 | import hf_transfer # type: ignore # noqa 21 | HF_TRANFER_ACTIVE = True 22 | except ImportError: 23 | HF_TRANFER_ACTIVE = False 24 | assert (huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER == 25 | HF_TRANFER_ACTIVE) 26 | 27 | 28 | def test_download_weights_from_hf(): 29 | with tempfile.TemporaryDirectory() as tmpdir: 30 | # assert LocalEntryNotFoundError error is thrown 31 | # if offline is set and model is not cached 32 | huggingface_hub.constants.HF_HUB_OFFLINE = True 33 | with pytest.raises(LocalEntryNotFoundError): 34 | download_weights_from_hf("facebook/opt-125m", 35 | allow_patterns=["*.safetensors", "*.bin"], 36 | cache_dir=tmpdir) 37 | 38 | # download the model 39 | huggingface_hub.constants.HF_HUB_OFFLINE = False 40 | download_weights_from_hf("facebook/opt-125m", 41 | allow_patterns=["*.safetensors", "*.bin"], 42 | cache_dir=tmpdir) 43 | 44 | # now it should work offline 45 | huggingface_hub.constants.HF_HUB_OFFLINE = True 46 | assert download_weights_from_hf( 47 | "facebook/opt-125m", 48 | allow_patterns=["*.safetensors", "*.bin"], 49 | cache_dir=tmpdir) is not None 50 | 51 | 52 | if __name__ == "__main__": 53 | test_hf_transfer_auto_activation() 54 | test_download_weights_from_hf() 55 | -------------------------------------------------------------------------------- /tests/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/models/__init__.py -------------------------------------------------------------------------------- /tests/models/test_big_models.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM when using greedy sampling. 2 | 3 | This tests bigger models and use half precision. 4 | 5 | Run `pytest tests/models/test_big_models.py`. 6 | """ 7 | import pytest 8 | 9 | MODELS = [ 10 | "meta-llama/Llama-2-7b-hf", 11 | # "mistralai/Mistral-7B-v0.1", # Tested by test_mistral.py 12 | # "Deci/DeciLM-7b", # Broken 13 | # "tiiuae/falcon-7b", # Broken 14 | "EleutherAI/gpt-j-6b", 15 | # "mosaicml/mpt-7b", # Broken 16 | # "Qwen/Qwen1.5-0.5B" # Broken, 17 | ] 18 | 19 | 20 | @pytest.mark.parametrize("model", MODELS) 21 | @pytest.mark.parametrize("dtype", ["half"]) 22 | @pytest.mark.parametrize("max_tokens", [32]) 23 | def test_models( 24 | hf_runner, 25 | vllm_runner, 26 | example_prompts, 27 | model: str, 28 | dtype: str, 29 | max_tokens: int, 30 | ) -> None: 31 | hf_model = hf_runner(model, dtype=dtype) 32 | hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) 33 | del hf_model 34 | 35 | vllm_model = vllm_runner(model, dtype=dtype) 36 | vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) 37 | del vllm_model 38 | 39 | for i in range(len(example_prompts)): 40 | hf_output_ids, hf_output_str = hf_outputs[i] 41 | vllm_output_ids, vllm_output_str = vllm_outputs[i] 42 | assert hf_output_str == vllm_output_str, ( 43 | f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") 44 | assert hf_output_ids == vllm_output_ids, ( 45 | f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") 46 | 47 | 48 | @pytest.mark.parametrize("model", MODELS) 49 | @pytest.mark.parametrize("dtype", ["half"]) 50 | def test_model_print( 51 | vllm_runner, 52 | model: str, 53 | dtype: str, 54 | ) -> None: 55 | vllm_model = vllm_runner(model, dtype=dtype) 56 | # This test is for verifying whether the model's extra_repr 57 | # can be printed correctly. 58 | print(vllm_model.model.llm_engine.model_executor.driver_worker. 59 | model_runner.model) 60 | del vllm_model 61 | -------------------------------------------------------------------------------- /tests/models/test_embedding.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM for Mistral models using greedy sampling. 2 | 3 | Run `pytest tests/models/test_llama_embedding.py`. 4 | """ 5 | import pytest 6 | import torch 7 | import torch.nn.functional as F 8 | 9 | MODELS = [ 10 | "intfloat/e5-mistral-7b-instruct", 11 | ] 12 | 13 | 14 | def compare_embeddings(embeddings1, embeddings2): 15 | similarities = [ 16 | F.cosine_similarity(torch.tensor(e1), torch.tensor(e2), dim=0) 17 | for e1, e2 in zip(embeddings1, embeddings2) 18 | ] 19 | return similarities 20 | 21 | 22 | @pytest.mark.parametrize("model", MODELS) 23 | @pytest.mark.parametrize("dtype", ["half"]) 24 | def test_models( 25 | hf_runner, 26 | vllm_runner, 27 | example_prompts, 28 | model: str, 29 | dtype: str, 30 | ) -> None: 31 | hf_model = hf_runner(model, dtype=dtype) 32 | hf_outputs = hf_model.encode(example_prompts) 33 | del hf_model 34 | 35 | vllm_model = vllm_runner(model, dtype=dtype) 36 | vllm_outputs = vllm_model.encode(example_prompts) 37 | del vllm_model 38 | 39 | similarities = compare_embeddings(hf_outputs, vllm_outputs) 40 | all_similarities = torch.stack(similarities) 41 | tolerance = 1e-2 42 | assert torch.all((all_similarities <= 1.0 + tolerance) 43 | & (all_similarities >= 1.0 - tolerance) 44 | ), f"Not all values are within {tolerance} of 1.0" 45 | -------------------------------------------------------------------------------- /tests/models/test_mistral.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM for Mistral models using greedy sampling. 2 | 3 | Run `pytest tests/models/test_mistral.py`. 4 | """ 5 | import pytest 6 | 7 | from .utils import check_logprobs_close 8 | 9 | MODELS = [ 10 | "mistralai/Mistral-7B-Instruct-v0.1", 11 | "mistralai/Mistral-7B-Instruct-v0.3", 12 | ] 13 | 14 | 15 | @pytest.mark.parametrize("model", MODELS) 16 | @pytest.mark.parametrize("dtype", ["bfloat16"]) 17 | @pytest.mark.parametrize("max_tokens", [64]) 18 | @pytest.mark.parametrize("num_logprobs", [5]) 19 | def test_models( 20 | hf_runner, 21 | vllm_runner, 22 | example_prompts, 23 | model: str, 24 | dtype: str, 25 | max_tokens: int, 26 | num_logprobs: int, 27 | ) -> None: 28 | # TODO(sang): Sliding window should be tested separately. 29 | hf_model = hf_runner(model, dtype=dtype) 30 | hf_outputs = hf_model.generate_greedy_logprobs_limit( 31 | example_prompts, max_tokens, num_logprobs) 32 | del hf_model 33 | 34 | vllm_model = vllm_runner(model, dtype=dtype) 35 | vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts, 36 | max_tokens, 37 | num_logprobs) 38 | del vllm_model 39 | check_logprobs_close( 40 | outputs_0_lst=hf_outputs, 41 | outputs_1_lst=vllm_outputs, 42 | name_0="hf", 43 | name_1="vllm", 44 | ) 45 | -------------------------------------------------------------------------------- /tests/models/test_oot_registration.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from vllm import LLM, ModelRegistry, SamplingParams 4 | from vllm.model_executor.models.opt import OPTForCausalLM 5 | from vllm.model_executor.sampling_metadata import SamplingMetadata 6 | 7 | 8 | class MyOPTForCausalLM(OPTForCausalLM): 9 | 10 | def compute_logits(self, hidden_states: torch.Tensor, 11 | sampling_metadata: SamplingMetadata) -> torch.Tensor: 12 | # this dummy model always predicts the first token 13 | logits = super().compute_logits(hidden_states, sampling_metadata) 14 | logits.zero_() 15 | logits[:, 0] += 1.0 16 | return logits 17 | 18 | 19 | def test_oot_registration(): 20 | # register our dummy model 21 | ModelRegistry.register_model("OPTForCausalLM", MyOPTForCausalLM) 22 | prompts = ["Hello, my name is", "The text does not matter"] 23 | sampling_params = SamplingParams(temperature=0) 24 | llm = LLM(model="facebook/opt-125m") 25 | first_token = llm.get_tokenizer().decode(0) 26 | outputs = llm.generate(prompts, sampling_params) 27 | 28 | for output in outputs: 29 | generated_text = output.outputs[0].text 30 | # make sure only the first token is generated 31 | rest = generated_text.replace(first_token, "") 32 | assert rest == "" 33 | -------------------------------------------------------------------------------- /tests/models/test_registry.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.model_executor.models import _MODELS, ModelRegistry 4 | 5 | 6 | @pytest.mark.parametrize("model_cls", _MODELS) 7 | def test_registry_imports(model_cls): 8 | # Ensure all model classes can be imported successfully 9 | ModelRegistry.load_model_cls(model_cls) 10 | -------------------------------------------------------------------------------- /tests/models/utils.py: -------------------------------------------------------------------------------- 1 | def check_logprobs_close(outputs_0_lst, outputs_1_lst, name_0, name_1): 2 | """Compare the logprobs of two sequences generated by different models, 3 | which should be similar but not necessarily equal. 4 | """ 5 | # Loop through responses to each prompt. 6 | for prompt_idx, (outputs_0, 7 | outputs_1) in enumerate(zip(outputs_0_lst, 8 | outputs_1_lst)): 9 | output_ids_0, output_str_0, logprobs_0 = outputs_0 10 | output_ids_1, output_str_1, logprobs_1 = outputs_1 11 | 12 | # Loop through generated tokens. 13 | for idx, (output_id_0, 14 | output_id_1) in enumerate(zip(output_ids_0, output_ids_1)): 15 | 16 | # If generated tokens don't match, then 17 | if output_id_0 != output_id_1: 18 | # Each predicted token must be in top N logprobs of the other 19 | assert output_id_0 in logprobs_1[idx], ( 20 | f"Test{prompt_idx}:" 21 | f"\n{name_0}:\t{output_str_0!r}" 22 | f"\n{name_1}:\t{output_str_1!r}") 23 | assert output_id_1 in logprobs_0[idx], ( 24 | f"Test{prompt_idx}:" 25 | f"\n{name_0}:\t{output_str_0!r}" 26 | f"\n{name_1}:\t{output_str_1!r}") 27 | 28 | # Break out since sequences will now diverge. 29 | break 30 | -------------------------------------------------------------------------------- /tests/multimodal/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/multimodal/__init__.py -------------------------------------------------------------------------------- /tests/prefix_caching/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/prefix_caching/__init__.py -------------------------------------------------------------------------------- /tests/prefix_caching/test_disable_sliding_window.py: -------------------------------------------------------------------------------- 1 | """Compare the with and without prefix caching. 2 | 3 | Run `pytest tests/prefix_caching/test_prefix_caching.py`. 4 | """ 5 | import pytest 6 | 7 | from tests.conftest import cleanup 8 | from vllm import LLM 9 | 10 | MODEL_LEN_LEN = [ 11 | # Example models with sliding window. 12 | ("bigcode/starcoder2-3b", 4096, 16384), 13 | # ("mistralai/Mistral-7B-v0.1", 4096, 32768), << OOM in CI 14 | 15 | # Confirm model with sliding window works. 16 | # config has "use_sliding_window": false 17 | ("Qwen/Qwen1.5-0.5B-Chat", 32768, 32768), 18 | # config has no sliding window attribute. 19 | ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", 2048, 2048), 20 | ] 21 | 22 | 23 | @pytest.mark.parametrize("model_len_len", MODEL_LEN_LEN) 24 | def test_disable_sliding_window(model_len_len, ): 25 | model, sliding_len, full_len = model_len_len 26 | vllm_disabled_model = LLM(model, disable_sliding_window=True) 27 | vllm_disabled_model.generate("Hi my name is") 28 | model_config = vllm_disabled_model.llm_engine.model_config 29 | assert model_config.max_model_len == sliding_len, ( 30 | "Max len expected to equal sliding_len of %s, but got %s", sliding_len, 31 | model_config.max_model_len) 32 | 33 | del vllm_disabled_model 34 | cleanup() 35 | 36 | vllm_enabled_model = LLM(model, disable_sliding_window=False) 37 | vllm_enabled_model.generate("Hi my name is") 38 | model_config = vllm_enabled_model.llm_engine.model_config 39 | assert model_config.max_model_len == full_len, ( 40 | "Max len expected to equal full_len of %s, but got %s", full_len, 41 | model_config.max_model_len) 42 | 43 | del vllm_enabled_model 44 | cleanup() 45 | -------------------------------------------------------------------------------- /tests/prompts/example.txt: -------------------------------------------------------------------------------- 1 | vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. 2 | Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020. 3 | Compare and contrast artificial intelligence with human intelligence in terms of processing information. 4 | Describe the basic components of a neural network and how it can be trained. 5 | Write a short story about a robot that dreams for the first time. 6 | Analyze the impact of the COVID-19 pandemic on global economic structures and future business models. 7 | Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies. 8 | Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.' 9 | -------------------------------------------------------------------------------- /tests/quantization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/quantization/__init__.py -------------------------------------------------------------------------------- /tests/quantization/test_compressed_tensors.py: -------------------------------------------------------------------------------- 1 | """Test model set-up and weight loading for sparseml-quantized models. 2 | 3 | Run `pytest tests/quantization/test_compressed_tensors.py`. 4 | """ 5 | 6 | import torch 7 | 8 | from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 9 | CompressedTensorsLinearMethod, CompressedTensorsW8A8StaticTensor) 10 | 11 | 12 | def test_compressed_tensors_w8a8_static_setup(vllm_runner): 13 | model_path = "nm-testing/tinyllama-one-shot-static-quant-test-compressed" 14 | llm = vllm_runner(model_path, quantization="sparseml", enforce_eager=True) 15 | model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model 16 | layer = model.model.layers[0] 17 | 18 | qkv_proj = layer.self_attn.qkv_proj 19 | o_proj = layer.self_attn.o_proj 20 | gate_up_proj = layer.mlp.gate_up_proj 21 | down_proj = layer.mlp.down_proj 22 | 23 | assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) 24 | assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod) 25 | assert isinstance(gate_up_proj.quant_method, CompressedTensorsLinearMethod) 26 | assert isinstance(down_proj.quant_method, CompressedTensorsLinearMethod) 27 | 28 | assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8StaticTensor) 29 | 30 | assert qkv_proj.weight.dtype is torch.int8 31 | assert o_proj.weight.dtype is torch.int8 32 | assert gate_up_proj.weight.dtype is torch.int8 33 | 34 | assert qkv_proj.weight_scale.shard_splitter is not None 35 | assert qkv_proj.weight_scale.logical_widths is not None 36 | assert qkv_proj.input_scale.dtype is torch.float32 37 | -------------------------------------------------------------------------------- /tests/quantization/test_fp8.py: -------------------------------------------------------------------------------- 1 | """Tests whether FP8 computation is enabled correctly. 2 | 3 | Run `pytest tests/quantization/test_fp8.py --forked`. 4 | """ 5 | import pytest 6 | import torch 7 | 8 | from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS 9 | from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod 10 | 11 | capability = torch.cuda.get_device_capability() 12 | capability = capability[0] * 10 + capability[1] 13 | 14 | 15 | @pytest.mark.skipif( 16 | capability < QUANTIZATION_METHODS["fp8"].get_min_capability(), 17 | reason="FP8 is not supported on this GPU type.") 18 | def test_load_fp16_model(vllm_runner) -> None: 19 | llm = vllm_runner("facebook/opt-125m", quantization="fp8") 20 | 21 | model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model 22 | fc1 = model.model.decoder.layers[0].fc1 23 | assert isinstance(fc1.quant_method, Fp8LinearMethod) 24 | assert fc1.weight.dtype == torch.float8_e4m3fn 25 | -------------------------------------------------------------------------------- /tests/samplers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/samplers/__init__.py -------------------------------------------------------------------------------- /tests/samplers/test_beam_search.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM when using beam search. 2 | 3 | Run `pytest tests/samplers/test_beam_search.py`. 4 | """ 5 | import gc 6 | 7 | import pytest 8 | import torch 9 | 10 | # FIXME(zhuohan): The test can not pass if we: 11 | # 1. Increase max_tokens to 256. 12 | # 2. Increase beam_width to 8. 13 | # 3. Use the model "huggyllama/llama-7b". 14 | MAX_TOKENS = [128] 15 | BEAM_WIDTHS = [4] 16 | MODELS = ["facebook/opt-125m"] 17 | 18 | 19 | @pytest.mark.parametrize("model", MODELS) 20 | @pytest.mark.parametrize("dtype", ["half"]) 21 | @pytest.mark.parametrize("max_tokens", MAX_TOKENS) 22 | @pytest.mark.parametrize("beam_width", BEAM_WIDTHS) 23 | def test_beam_search_single_input( 24 | hf_runner, 25 | vllm_runner, 26 | example_prompts, 27 | model: str, 28 | dtype: str, 29 | max_tokens: int, 30 | beam_width: int, 31 | ) -> None: 32 | example_prompts = example_prompts[:1] 33 | hf_model = hf_runner(model, dtype=dtype) 34 | hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width, 35 | max_tokens) 36 | del hf_model 37 | 38 | vllm_model = vllm_runner(model, dtype=dtype) 39 | vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width, 40 | max_tokens) 41 | del vllm_model 42 | # NOTE(woosuk): For some reason, the following GC is required to avoid 43 | # GPU OOM errors in the following tests using `vllm_runner`. 44 | gc.collect() 45 | torch.cuda.empty_cache() 46 | 47 | for i in range(len(example_prompts)): 48 | hf_output_ids, _ = hf_outputs[i] 49 | vllm_output_ids, _ = vllm_outputs[i] 50 | assert len(hf_output_ids) == len(vllm_output_ids) 51 | for j in range(len(hf_output_ids)): 52 | assert hf_output_ids[j] == vllm_output_ids[j], ( 53 | f"Test{i} output{j}:\nHF: {hf_output_ids}\n" 54 | f"vLLM: {vllm_output_ids}") 55 | -------------------------------------------------------------------------------- /tests/samplers/test_ignore_eos.py: -------------------------------------------------------------------------------- 1 | """Make sure ignore_eos works. 2 | 3 | Run `pytest tests/samplers/test_ignore_eos.py`. 4 | """ 5 | 6 | import pytest 7 | 8 | from vllm import SamplingParams 9 | 10 | # We also test with llama because it has generation_config to specify EOS 11 | # (past regression). 12 | MODELS = ["facebook/opt-125m", "meta-llama/Llama-2-7b-hf"] 13 | 14 | 15 | @pytest.mark.parametrize("model", MODELS) 16 | @pytest.mark.parametrize("dtype", ["half"]) 17 | @pytest.mark.parametrize("max_tokens", [512]) 18 | def test_ignore_eos( 19 | vllm_runner, 20 | example_prompts, 21 | model: str, 22 | dtype: str, 23 | max_tokens: int, 24 | ) -> None: 25 | vllm_model = vllm_runner(model, dtype=dtype) 26 | sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True) 27 | 28 | for prompt in example_prompts: 29 | ignore_eos_output = vllm_model.model.generate( 30 | prompt, sampling_params=sampling_params) 31 | output_length = len(ignore_eos_output[0].outputs[0].token_ids) 32 | assert output_length == max_tokens 33 | -------------------------------------------------------------------------------- /tests/samplers/test_logits_processor.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from vllm import SamplingParams 5 | 6 | MODELS = ["facebook/opt-125m"] 7 | 8 | 9 | @pytest.mark.parametrize("model", MODELS) 10 | @pytest.mark.parametrize("dtype", ["half"]) 11 | def test_logits_processor_force_generate( 12 | vllm_runner, 13 | example_prompts, 14 | model: str, 15 | dtype: str, 16 | ) -> None: 17 | vllm_model = vllm_runner(model, dtype=dtype) 18 | tokenizer = vllm_model.model.get_tokenizer() 19 | repeat_times = 2 20 | enforced_answers = " vLLM" 21 | vllm_token_ids = tokenizer.encode(enforced_answers, 22 | add_special_tokens=False) 23 | max_tokens = len(vllm_token_ids) * repeat_times 24 | 25 | def pick_vllm(token_ids, logits): 26 | token_id = vllm_token_ids[len(token_ids) % len(vllm_token_ids)] 27 | logits[token_id] = torch.finfo(logits.dtype).max 28 | return logits 29 | 30 | params_with_logprobs = SamplingParams( 31 | logits_processors=[pick_vllm], 32 | prompt_logprobs=3, 33 | max_tokens=max_tokens, 34 | ) 35 | 36 | # test logits_processors when prompt_logprobs is not None 37 | vllm_model.model._add_request( 38 | example_prompts[0], 39 | params=params_with_logprobs, 40 | ) 41 | 42 | # test prompt_logprobs is not None 43 | vllm_model.model._add_request( 44 | example_prompts[1], 45 | params=SamplingParams( 46 | prompt_logprobs=3, 47 | max_tokens=max_tokens, 48 | ), 49 | ) 50 | 51 | # test grouped requests 52 | vllm_model.model._add_request( 53 | example_prompts[2], 54 | params=SamplingParams(max_tokens=max_tokens), 55 | ) 56 | 57 | outputs = vllm_model.model._run_engine(use_tqdm=False) 58 | 59 | assert outputs[0].outputs[0].text == enforced_answers * repeat_times 60 | -------------------------------------------------------------------------------- /tests/samplers/test_ranks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm import SamplingParams 4 | 5 | MODELS = ["facebook/opt-125m"] 6 | 7 | 8 | @pytest.mark.parametrize("model", MODELS) 9 | @pytest.mark.parametrize("dtype", ["half"]) 10 | def test_ranks( 11 | vllm_runner, 12 | model, 13 | dtype, 14 | example_prompts, 15 | ): 16 | max_tokens = 5 17 | num_top_logprobs = 5 18 | num_prompt_logprobs = 5 19 | 20 | vllm_model = vllm_runner(model, dtype=dtype, max_logprobs=num_top_logprobs) 21 | 22 | ## Test greedy logprobs ranks 23 | vllm_sampling_params = SamplingParams(temperature=0.0, 24 | top_p=1.0, 25 | max_tokens=max_tokens, 26 | logprobs=num_top_logprobs, 27 | prompt_logprobs=num_prompt_logprobs) 28 | vllm_results = vllm_model.generate_w_logprobs(example_prompts, 29 | vllm_sampling_params) 30 | for result in vllm_results: 31 | assert result[2] is not None 32 | assert len(result[2]) == len(result[0]) 33 | # check whether all chosen tokens have ranks = 1 34 | for token, logprobs in zip(result[0], result[2]): 35 | assert token in logprobs 36 | assert logprobs[token].rank == 1 37 | 38 | ## Test non-greedy logprobs ranks 39 | sampling_params = SamplingParams(temperature=1.0, 40 | top_p=1.0, 41 | max_tokens=max_tokens, 42 | logprobs=num_top_logprobs, 43 | prompt_logprobs=num_prompt_logprobs) 44 | res = vllm_model.generate_w_logprobs(example_prompts, sampling_params) 45 | for result in res: 46 | assert result[2] is not None 47 | assert len(result[2]) == len(result[0]) 48 | # check whether all chosen tokens have ranks 49 | for token, logprobs in zip(result[0], result[2]): 50 | assert logprobs[token].rank >= 1 51 | -------------------------------------------------------------------------------- /tests/spec_decode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/spec_decode/__init__.py -------------------------------------------------------------------------------- /tests/spec_decode/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/spec_decode/e2e/__init__.py -------------------------------------------------------------------------------- /tests/spec_decode/e2e/test_integration.py: -------------------------------------------------------------------------------- 1 | """Tests which cover integration of the speculative decoding framework with 2 | other features, e.g. cuda graphs. 3 | """ 4 | 5 | import pytest 6 | 7 | from .conftest import run_greedy_equality_correctness_test 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "common_llm_kwargs", 12 | [{ 13 | # Required for spec decode. 14 | "use_v2_block_manager": True, 15 | 16 | # Verify equality when cuda graphs allowed. 17 | "enforce_eager": False, 18 | "model": "JackFram/llama-68m", 19 | }]) 20 | @pytest.mark.parametrize( 21 | "per_test_common_llm_kwargs", 22 | [ 23 | { 24 | # Identical models. 25 | "speculative_model": "JackFram/llama-68m", 26 | "num_speculative_tokens": 5, 27 | }, 28 | ]) 29 | @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) 30 | @pytest.mark.parametrize("test_llm_kwargs", [{}]) 31 | @pytest.mark.parametrize("batch_size", [8]) 32 | @pytest.mark.parametrize("output_len", [32]) 33 | @pytest.mark.parametrize("seed", [1]) 34 | def test_spec_decode_cuda_graph(baseline_llm_generator, test_llm_generator, 35 | batch_size, output_len): 36 | """Verify spec decode equality when cuda graphs are enabled. 37 | """ 38 | run_greedy_equality_correctness_test( 39 | baseline_llm_generator, 40 | test_llm_generator, 41 | batch_size, 42 | max_output_len=output_len, 43 | force_output_len=True, 44 | ) 45 | -------------------------------------------------------------------------------- /tests/tensorizer_loader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/tensorizer_loader/__init__.py -------------------------------------------------------------------------------- /tests/test_inputs.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import pytest 4 | 5 | from vllm.inputs import parse_and_batch_prompt 6 | 7 | STRING_INPUTS = [ 8 | '', 9 | 'foo', 10 | 'foo bar', 11 | 'foo baz bar', 12 | 'foo bar qux baz', 13 | ] 14 | 15 | TOKEN_INPUTS = [ 16 | [-1], 17 | [1], 18 | [1, 2], 19 | [1, 3, 4], 20 | [1, 2, 4, 3], 21 | ] 22 | 23 | INPUTS_SLICES = [ 24 | slice(None, None, -1), 25 | slice(None, None, 2), 26 | slice(None, None, -2), 27 | ] 28 | 29 | 30 | def test_parse_single_batch_empty(): 31 | with pytest.raises(ValueError, match="at least one prompt"): 32 | parse_and_batch_prompt([]) 33 | 34 | with pytest.raises(ValueError, match="at least one prompt"): 35 | parse_and_batch_prompt([[]]) 36 | 37 | 38 | @pytest.mark.parametrize('string_input', STRING_INPUTS) 39 | def test_parse_single_batch_string_consistent(string_input: str): 40 | assert parse_and_batch_prompt(string_input) \ 41 | == parse_and_batch_prompt([string_input]) 42 | 43 | 44 | @pytest.mark.parametrize('token_input', TOKEN_INPUTS) 45 | def test_parse_single_batch_token_consistent(token_input: List[int]): 46 | assert parse_and_batch_prompt(token_input) \ 47 | == parse_and_batch_prompt([token_input]) 48 | 49 | 50 | @pytest.mark.parametrize('inputs_slice', INPUTS_SLICES) 51 | def test_parse_single_batch_string_slice(inputs_slice: slice): 52 | assert parse_and_batch_prompt(STRING_INPUTS)[inputs_slice] \ 53 | == parse_and_batch_prompt(STRING_INPUTS[inputs_slice]) 54 | -------------------------------------------------------------------------------- /tests/test_regression.py: -------------------------------------------------------------------------------- 1 | """Containing tests that check for regressions in vLLM's behavior. 2 | 3 | It should include tests that are reported by users and making sure they 4 | will never happen again. 5 | 6 | """ 7 | import gc 8 | 9 | import torch 10 | 11 | from vllm import LLM, SamplingParams 12 | 13 | 14 | def test_duplicated_ignored_sequence_group(): 15 | """https://github.com/vllm-project/vllm/issues/1655""" 16 | 17 | sampling_params = SamplingParams(temperature=0.01, 18 | top_p=0.1, 19 | max_tokens=256) 20 | llm = LLM(model="facebook/opt-125m", 21 | max_num_batched_tokens=4096, 22 | tensor_parallel_size=1) 23 | prompts = ["This is a short prompt", "This is a very long prompt " * 1000] 24 | outputs = llm.generate(prompts, sampling_params=sampling_params) 25 | 26 | assert len(prompts) == len(outputs) 27 | 28 | 29 | def test_max_tokens_none(): 30 | sampling_params = SamplingParams(temperature=0.01, 31 | top_p=0.1, 32 | max_tokens=None) 33 | llm = LLM(model="facebook/opt-125m", 34 | max_num_batched_tokens=4096, 35 | tensor_parallel_size=1) 36 | prompts = ["Just say hello!"] 37 | outputs = llm.generate(prompts, sampling_params=sampling_params) 38 | 39 | assert len(prompts) == len(outputs) 40 | 41 | 42 | def test_gc(): 43 | llm = LLM("facebook/opt-125m", enforce_eager=True) 44 | del llm 45 | 46 | gc.collect() 47 | torch.cuda.empty_cache() 48 | 49 | # The memory allocated for model and KV cache should be released. 50 | # The memory allocated for PyTorch and others should be less than 50MB. 51 | # Usually, it's around 10MB. 52 | allocated = torch.cuda.memory_allocated() 53 | assert allocated < 50 * 1024 * 1024 54 | 55 | 56 | if __name__ == "__main__": 57 | import pytest 58 | pytest.main([__file__]) 59 | -------------------------------------------------------------------------------- /tests/test_sampling_params.py: -------------------------------------------------------------------------------- 1 | """Tests for the SamplingParams class. 2 | """ 3 | from vllm import SamplingParams 4 | 5 | 6 | def test_max_tokens_none(): 7 | """max_tokens=None should be allowed""" 8 | SamplingParams(temperature=0.01, top_p=0.1, max_tokens=None) 9 | 10 | 11 | if __name__ == "__main__": 12 | import pytest 13 | pytest.main([__file__]) 14 | -------------------------------------------------------------------------------- /tests/tokenization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/tokenization/__init__.py -------------------------------------------------------------------------------- /tests/tokenization/test_cached_tokenizer.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | from transformers import AutoTokenizer 4 | 5 | from vllm.transformers_utils.tokenizer import get_cached_tokenizer 6 | 7 | 8 | def test_cached_tokenizer(): 9 | reference_tokenizer = AutoTokenizer.from_pretrained("gpt2") 10 | reference_tokenizer.add_special_tokens({"cls_token": ""}) 11 | reference_tokenizer.add_special_tokens( 12 | {"additional_special_tokens": [""]}) 13 | cached_tokenizer = get_cached_tokenizer(deepcopy(reference_tokenizer)) 14 | 15 | assert reference_tokenizer.encode("prompt") == cached_tokenizer.encode( 16 | "prompt") 17 | assert set(reference_tokenizer.all_special_ids) == set( 18 | cached_tokenizer.all_special_ids) 19 | assert set(reference_tokenizer.all_special_tokens) == set( 20 | cached_tokenizer.all_special_tokens) 21 | assert set(reference_tokenizer.all_special_tokens_extended) == set( 22 | cached_tokenizer.all_special_tokens_extended) 23 | -------------------------------------------------------------------------------- /tests/tokenization/test_image_processor.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from transformers.image_processing_utils import BaseImageProcessor 3 | 4 | from vllm.transformers_utils.image_processor import get_image_processor 5 | 6 | IMAGE_PROCESSOR_NAMES = [ 7 | "llava-hf/llava-1.5-7b-hf", 8 | "llava-hf/llava-v1.6-34b-hf", 9 | ] 10 | 11 | 12 | @pytest.mark.parametrize("processor_name", IMAGE_PROCESSOR_NAMES) 13 | def test_image_processor_revision(processor_name: str): 14 | # Assume that "main" branch always exists 15 | image_processor = get_image_processor(processor_name, revision="main") 16 | assert isinstance(image_processor, BaseImageProcessor) 17 | 18 | # Assume that "never" branch always does not exist 19 | with pytest.raises(OSError, match='not a valid git identifier'): 20 | get_image_processor(processor_name, revision="never") 21 | -------------------------------------------------------------------------------- /tests/tokenization/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from transformers import PreTrainedTokenizerBase 3 | 4 | from vllm.transformers_utils.tokenizer import get_tokenizer 5 | 6 | TOKENIZER_NAMES = [ 7 | "facebook/opt-125m", 8 | "gpt2", 9 | ] 10 | 11 | 12 | @pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES) 13 | def test_tokenizer_revision(tokenizer_name: str): 14 | # Assume that "main" branch always exists 15 | tokenizer = get_tokenizer(tokenizer_name, revision="main") 16 | assert isinstance(tokenizer, PreTrainedTokenizerBase) 17 | 18 | # Assume that "never" branch always does not exist 19 | with pytest.raises(OSError, match='not a valid git identifier'): 20 | get_tokenizer(tokenizer_name, revision="never") 21 | -------------------------------------------------------------------------------- /tests/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/worker/__init__.py -------------------------------------------------------------------------------- /vllm/__init__.py: -------------------------------------------------------------------------------- 1 | """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" 2 | 3 | from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs 4 | from vllm.engine.async_llm_engine import AsyncLLMEngine 5 | from vllm.engine.llm_engine import LLMEngine 6 | from vllm.entrypoints.llm import LLM 7 | from vllm.executor.ray_utils import initialize_ray_cluster 8 | from vllm.inputs import PromptStrictInputs, TextPrompt, TokensPrompt 9 | from vllm.model_executor.models import ModelRegistry 10 | from vllm.outputs import (CompletionOutput, EmbeddingOutput, 11 | EmbeddingRequestOutput, RequestOutput) 12 | from vllm.pooling_params import PoolingParams 13 | from vllm.sampling_params import SamplingParams 14 | 15 | __version__ = "0.4.3" 16 | 17 | __all__ = [ 18 | "LLM", 19 | "ModelRegistry", 20 | "PromptStrictInputs", 21 | "TextPrompt", 22 | "TokensPrompt", 23 | "SamplingParams", 24 | "RequestOutput", 25 | "CompletionOutput", 26 | "EmbeddingOutput", 27 | "EmbeddingRequestOutput", 28 | "LLMEngine", 29 | "EngineArgs", 30 | "AsyncLLMEngine", 31 | "AsyncEngineArgs", 32 | "initialize_ray_cluster", 33 | "PoolingParams", 34 | ] 35 | -------------------------------------------------------------------------------- /vllm/attention/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.attention.backends.abstract import (AttentionBackend, 2 | AttentionMetadata) 3 | from vllm.attention.layer import Attention 4 | from vllm.attention.selector import get_attn_backend 5 | 6 | __all__ = [ 7 | "Attention", 8 | "AttentionBackend", 9 | "AttentionMetadata", 10 | "Attention", 11 | "get_attn_backend", 12 | ] 13 | -------------------------------------------------------------------------------- /vllm/attention/backends/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/attention/backends/__init__.py -------------------------------------------------------------------------------- /vllm/attention/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/attention/ops/__init__.py -------------------------------------------------------------------------------- /vllm/attention/ops/blocksparse_attention/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/attention/ops/blocksparse_attention/__init__.py -------------------------------------------------------------------------------- /vllm/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/core/__init__.py -------------------------------------------------------------------------------- /vllm/core/block/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/core/block/__init__.py -------------------------------------------------------------------------------- /vllm/core/block/utils.py: -------------------------------------------------------------------------------- 1 | """Block manager utils.""" 2 | from vllm.sequence import SequenceGroup 3 | 4 | # Exception strings for non-implemented block manager enc/dec scenarios 5 | 6 | STR_NOT_IMPL_ENC_DEC_SWA = \ 7 | "Sliding window attention for encoder/decoder models " + \ 8 | "is not currently supported." 9 | 10 | STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE = \ 11 | "Prefix caching for encoder/decoder models " + \ 12 | "is not currently supported." 13 | 14 | 15 | def _get_block_mgr_sliding_window_attr(block_mgr): 16 | ''' 17 | BlockManagerV1 and BlockManagerV2 have slightly different 18 | members related to sliding window attention (SWA). This 19 | function extracts the appropriate member to use for determining 20 | whether SWA is enabled. 21 | 22 | Arguments: 23 | 24 | * block_mgr: BlockManagerV1 or BlockManagerV2 instance 25 | ''' 26 | 27 | if hasattr(block_mgr, 'block_sliding_window'): 28 | return block_mgr.block_sliding_window 29 | if hasattr(block_mgr, 'max_block_sliding_window'): 30 | return block_mgr.max_block_sliding_window 31 | 32 | raise AttributeError("Block manager instance has neither " + \ 33 | "block_sliding_window nor " + \ 34 | "max_block_sliding_window attributes.") 35 | 36 | 37 | def check_no_caching_or_swa_for_blockmgr_encdec( 38 | block_mgr, seq_group: SequenceGroup) -> None: 39 | ''' 40 | Enforce that prefix caching & sliding-window attention (SWA) 41 | are currently unsupported *specifically* for encoder/decoder models. 42 | 43 | Raises NotImplementedError if unsupported scenario is detected. 44 | 45 | Arguments: 46 | 47 | * block_mgr: BlockSpaceManager instance 48 | * seq_group: SequenceGroup passed to block_mgr 49 | ''' 50 | 51 | if seq_group.is_encoder_decoder(): 52 | if _get_block_mgr_sliding_window_attr(block_mgr) is not None: 53 | raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) 54 | 55 | if block_mgr.enable_caching: 56 | raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) 57 | -------------------------------------------------------------------------------- /vllm/core/policy.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | from typing import Deque 3 | 4 | from vllm.sequence import SequenceGroup 5 | 6 | 7 | class Policy: 8 | 9 | def get_priority( 10 | self, 11 | now: float, 12 | seq_group: SequenceGroup, 13 | ) -> float: 14 | raise NotImplementedError 15 | 16 | def sort_by_priority( 17 | self, 18 | now: float, 19 | seq_groups: Deque[SequenceGroup], 20 | ) -> Deque[SequenceGroup]: 21 | return deque( 22 | sorted( 23 | seq_groups, 24 | key=lambda seq_group: self.get_priority(now, seq_group), 25 | reverse=True, 26 | )) 27 | 28 | 29 | class FCFS(Policy): 30 | 31 | def get_priority( 32 | self, 33 | now: float, 34 | seq_group: SequenceGroup, 35 | ) -> float: 36 | return now - seq_group.metrics.arrival_time 37 | 38 | 39 | class PolicyFactory: 40 | 41 | _POLICY_REGISTRY = {'fcfs': FCFS} 42 | 43 | @classmethod 44 | def get_policy(cls, policy_name: str, **kwargs) -> Policy: 45 | return cls._POLICY_REGISTRY[policy_name](**kwargs) 46 | -------------------------------------------------------------------------------- /vllm/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | from .communication_op import * 2 | from .parallel_state import * 3 | from .utils import * 4 | -------------------------------------------------------------------------------- /vllm/distributed/device_communicators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/distributed/device_communicators/__init__.py -------------------------------------------------------------------------------- /vllm/distributed/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The vLLM team. 2 | # Adapted from 3 | # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py 4 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 5 | from typing import Sequence 6 | 7 | import torch 8 | 9 | 10 | def ensure_divisibility(numerator, denominator): 11 | """Ensure that numerator is divisible by the denominator.""" 12 | assert numerator % denominator == 0, "{} is not divisible by {}".format( 13 | numerator, denominator) 14 | 15 | 16 | def divide(numerator, denominator): 17 | """Ensure that numerator is divisible by the denominator and return 18 | the division value.""" 19 | ensure_divisibility(numerator, denominator) 20 | return numerator // denominator 21 | 22 | 23 | def split_tensor_along_last_dim( 24 | tensor: torch.Tensor, 25 | num_partitions: int, 26 | contiguous_split_chunks: bool = False, 27 | ) -> Sequence[torch.Tensor]: 28 | """ Split a tensor along its last dimension. 29 | 30 | Arguments: 31 | tensor: input tensor. 32 | num_partitions: number of partitions to split the tensor 33 | contiguous_split_chunks: If True, make each chunk contiguous 34 | in memory. 35 | 36 | Returns: 37 | A list of Tensors 38 | """ 39 | # Get the size and dimension. 40 | last_dim = tensor.dim() - 1 41 | last_dim_size = divide(tensor.size()[last_dim], num_partitions) 42 | # Split. 43 | tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) 44 | # NOTE: torch.split does not create contiguous tensors by default. 45 | if contiguous_split_chunks: 46 | return tuple(chunk.contiguous() for chunk in tensor_list) 47 | 48 | return tensor_list 49 | -------------------------------------------------------------------------------- /vllm/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/engine/__init__.py -------------------------------------------------------------------------------- /vllm/engine/output_processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/engine/output_processor/__init__.py -------------------------------------------------------------------------------- /vllm/engine/output_processor/util.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from typing import Sequence as GenericSequence 3 | from typing import Union 4 | 5 | from vllm.sequence import PoolerOutput, SamplerOutput, SequenceGroupOutput 6 | 7 | 8 | def create_output_by_sequence_group( 9 | outputs: GenericSequence[Union[SamplerOutput, PoolerOutput]], 10 | num_seq_groups: int) -> List[List[SequenceGroupOutput]]: 11 | """Helper method which transforms a 2d list organized by 12 | [step][sequence group] into [sequence group][step]. 13 | """ 14 | output_by_sequence_group: List[List[SequenceGroupOutput]] = [ 15 | [] for _ in range(num_seq_groups) 16 | ] 17 | for step in outputs: 18 | for i, sequence_group_output in enumerate(step): 19 | output_by_sequence_group[i].append(sequence_group_output) 20 | 21 | return output_by_sequence_group 22 | -------------------------------------------------------------------------------- /vllm/entrypoints/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/entrypoints/__init__.py -------------------------------------------------------------------------------- /vllm/entrypoints/openai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/entrypoints/openai/__init__.py -------------------------------------------------------------------------------- /vllm/executor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/executor/__init__.py -------------------------------------------------------------------------------- /vllm/logging/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.logging.formatter import NewLineFormatter 2 | 3 | __all__ = [ 4 | "NewLineFormatter", 5 | ] 6 | -------------------------------------------------------------------------------- /vllm/logging/formatter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | class NewLineFormatter(logging.Formatter): 5 | """Adds logging prefix to newlines to align multi-line messages.""" 6 | 7 | def __init__(self, fmt, datefmt=None, style="%"): 8 | logging.Formatter.__init__(self, fmt, datefmt, style) 9 | 10 | def format(self, record): 11 | msg = logging.Formatter.format(self, record) 12 | if record.message != "": 13 | parts = msg.split(record.message) 14 | msg = msg.replace("\n", "\r\n" + parts[0]) 15 | return msg 16 | -------------------------------------------------------------------------------- /vllm/lora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/lora/__init__.py -------------------------------------------------------------------------------- /vllm/lora/request.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Optional 3 | 4 | 5 | @dataclass 6 | class LoRARequest: 7 | """ 8 | Request for a LoRA adapter. 9 | 10 | Note that this class should be be used internally. For online 11 | serving, it is recommended to not allow users to use this class but 12 | instead provide another layer of abstraction to prevent users from 13 | accessing unauthorized LoRA adapters. 14 | 15 | lora_int_id must be globally unique for a given adapter. 16 | This is currently not enforced in vLLM. 17 | """ 18 | 19 | lora_name: str 20 | lora_int_id: int 21 | lora_local_path: str 22 | long_lora_max_len: Optional[int] = None 23 | 24 | def __post_init__(self): 25 | if self.lora_int_id < 1: 26 | raise ValueError( 27 | f"lora_int_id must be > 0, got {self.lora_int_id}") 28 | 29 | def __eq__(self, value: object) -> bool: 30 | return isinstance( 31 | value, LoRARequest) and self.lora_int_id == value.lora_int_id 32 | 33 | def __hash__(self) -> int: 34 | return self.lora_int_id 35 | -------------------------------------------------------------------------------- /vllm/model_executor/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.sampling_metadata import SamplingMetadata 2 | from vllm.model_executor.utils import set_random_seed 3 | 4 | __all__ = [ 5 | "SamplingMetadata", 6 | "set_random_seed", 7 | ] 8 | -------------------------------------------------------------------------------- /vllm/model_executor/guided_decoding/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Union 2 | 3 | from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, 4 | CompletionRequest) 5 | from vllm.model_executor.guided_decoding.lm_format_enforcer_decoding import ( 6 | get_lm_format_enforcer_guided_decoding_logits_processor) 7 | from vllm.model_executor.guided_decoding.outlines_decoding import ( 8 | get_outlines_guided_decoding_logits_processor) 9 | from vllm.sampling_params import LogitsProcessor 10 | 11 | 12 | async def get_guided_decoding_logits_processor( 13 | guided_decoding_backend: str, request: Union[CompletionRequest, 14 | ChatCompletionRequest], 15 | tokenizer) -> Optional[LogitsProcessor]: 16 | if guided_decoding_backend == 'outlines': 17 | return await get_outlines_guided_decoding_logits_processor( 18 | request, tokenizer) 19 | if guided_decoding_backend == 'lm-format-enforcer': 20 | return await get_lm_format_enforcer_guided_decoding_logits_processor( 21 | request, tokenizer) 22 | 23 | raise ValueError( 24 | f"Unknown guided decoding backend '{guided_decoding_backend}'. " 25 | "Must be one of 'outlines, 'lm-format-enforcer'") 26 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/model_executor/layers/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/fused_moe/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.layers.fused_moe.fused_moe import ( 2 | fused_experts, fused_moe, fused_topk, get_config_file_name) 3 | 4 | __all__ = [ 5 | "fused_moe", 6 | "fused_topk", 7 | "fused_experts", 8 | "get_config_file_name", 9 | ] 10 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/fused_moe/configs/README: -------------------------------------------------------------------------------- 1 | This directory contains tuned configurations for different settings of the fused_moe kernel. 2 | For different settings of 3 | - E (number of experts) 4 | - N (intermediate size) 5 | - device_name (torch.cuda.get_device_name()) 6 | the JSON file contains a mapping from M (batch size) to the chosen configuration. 7 | 8 | The example configurations provided are for the Mixtral model for TP2 on H100 9 | and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have 10 | N = 7168 and for TP4 we have N = 3584. 11 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/model_executor/layers/ops/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/pooler.py: -------------------------------------------------------------------------------- 1 | from enum import IntEnum 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from vllm.model_executor.pooling_metadata import (PoolingMetadata, 7 | PoolingTensors) 8 | from vllm.sequence import EmbeddingSequenceGroupOutput, PoolerOutput 9 | 10 | 11 | class PoolingType(IntEnum): 12 | """Enumeration for different types of pooling methods.""" 13 | LAST = 0 14 | 15 | 16 | class Pooler(nn.Module): 17 | """A layer that pools specific information from hidden states. 18 | 19 | This layer does the following: 20 | 1. Extracts specific tokens or aggregates data based on pooling method. 21 | 2. Normalizes output if specified. 22 | 3. Returns structured results as `PoolerOutput`. 23 | 24 | Attributes: 25 | pooling_type: The type of pooling to use (LAST, AVERAGE, MAX). 26 | normalize: Whether to normalize the pooled data. 27 | """ 28 | 29 | def __init__(self, pooling_type: PoolingType, normalize: bool): 30 | super().__init__() 31 | self.pooling_type = pooling_type 32 | self.normalize = normalize 33 | 34 | def forward( 35 | self, 36 | hidden_states: torch.Tensor, 37 | pooling_metadata: PoolingMetadata, 38 | ) -> PoolerOutput: 39 | """Pools specific information from hidden states based on metadata.""" 40 | prompt_lens = PoolingTensors.from_pooling_metadata( 41 | pooling_metadata, hidden_states.device).prompt_lens 42 | 43 | if self.pooling_type == PoolingType.LAST: 44 | last_token_flat_indices = torch.cumsum(prompt_lens, dim=0) - 1 45 | pooled_data = hidden_states[last_token_flat_indices] 46 | else: 47 | raise ValueError(f"Invalid pooling type: {self.pooling_type}") 48 | 49 | if self.normalize: 50 | pooled_data = nn.functional.normalize(pooled_data, p=2, dim=1) 51 | 52 | pooled_outputs = [ 53 | EmbeddingSequenceGroupOutput(data.tolist()) for data in pooled_data 54 | ] 55 | 56 | return PoolerOutput(outputs=pooled_outputs) 57 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Type 2 | 3 | from vllm.model_executor.layers.quantization.aqlm import AQLMConfig 4 | from vllm.model_executor.layers.quantization.awq import AWQConfig 5 | from vllm.model_executor.layers.quantization.base_config import ( 6 | QuantizationConfig) 7 | from vllm.model_executor.layers.quantization.bitsandbytes import ( 8 | BitsAndBytesConfig) 9 | from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 10 | CompressedTensorsConfig) 11 | from vllm.model_executor.layers.quantization.deepspeedfp import ( 12 | DeepSpeedFPConfig) 13 | from vllm.model_executor.layers.quantization.fp8 import Fp8Config 14 | from vllm.model_executor.layers.quantization.gptq import GPTQConfig 15 | from vllm.model_executor.layers.quantization.gptq_marlin import ( 16 | GPTQMarlinConfig) 17 | from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( 18 | GPTQMarlin24Config) 19 | from vllm.model_executor.layers.quantization.marlin import MarlinConfig 20 | from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig 21 | 22 | QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = { 23 | "aqlm": AQLMConfig, 24 | "awq": AWQConfig, 25 | "deepspeedfp": DeepSpeedFPConfig, 26 | "fp8": Fp8Config, 27 | # The order of gptq methods is important for config.py iteration over 28 | # override_quantization_method(..) 29 | "marlin": MarlinConfig, 30 | "gptq_marlin_24": GPTQMarlin24Config, 31 | "gptq_marlin": GPTQMarlinConfig, 32 | "gptq": GPTQConfig, 33 | "squeezellm": SqueezeLLMConfig, 34 | "sparseml": CompressedTensorsConfig, 35 | "bitsandbytes": BitsAndBytesConfig, 36 | } 37 | 38 | 39 | def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: 40 | if quantization not in QUANTIZATION_METHODS: 41 | raise ValueError(f"Invalid quantization method: {quantization}") 42 | return QUANTIZATION_METHODS[quantization] 43 | 44 | 45 | __all__ = [ 46 | "QuantizationConfig", 47 | "get_quantization_config", 48 | "QUANTIZATION_METHODS", 49 | ] 50 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/compressed_tensors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py: -------------------------------------------------------------------------------- 1 | from .compressed_tensors_scheme import CompressedTensorsScheme # noqa: F401 2 | from .compressed_tensors_unquantized import ( # noqa: F401 3 | CompressedTensorsUnquantized) 4 | from .compressed_tensors_w8a8_statictensor import ( # noqa: F401, E501 5 | CompressedTensorsW8A8StaticTensor) 6 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import torch 4 | 5 | __all__ = ["CompressedTensorsScheme"] 6 | 7 | 8 | class CompressedTensorsScheme(ABC): 9 | """ 10 | Abstract class used to describe the weight creation and forward pass 11 | of different quantization schemes supported by CompressedTensors. 12 | """ 13 | 14 | @abstractmethod 15 | def create_weights(self, *args, **kwargs): 16 | """ 17 | Weight creation for the particular scheme. Inputs to this function 18 | 19 | """ 20 | raise NotImplementedError 21 | 22 | @abstractmethod 23 | def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor): 24 | """ 25 | Run the forward pass for the particular scheme. This is where 26 | scheme-specific dequant/quant steps/kernels should be applied. 27 | 28 | :param layer: toch.nn.Module with the registered weights and 29 | other parameters relevant to the particular scheme. 30 | :param x: input to the layer 31 | 32 | """ 33 | raise NotImplementedError 34 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, List 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from torch.nn import Parameter 6 | 7 | from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( 8 | CompressedTensorsScheme) 9 | from vllm.model_executor.utils import set_weight_attrs 10 | 11 | __all__ = ["CompressedTensorsUnquantized"] 12 | 13 | 14 | class CompressedTensorsUnquantized(CompressedTensorsScheme): 15 | """ 16 | Implements the scheme for all layers which are ignored 17 | in the CompressedTensors config. The input and loaded weight are used 18 | in a linear transformation. 19 | """ 20 | 21 | def create_weights(self, layer: torch.nn.Module, 22 | output_partition_sizes: List[int], 23 | input_size_per_partition: int, 24 | params_dtype: torch.dtype, weight_loader: Callable, 25 | **kwargs): 26 | 27 | weight = Parameter(torch.empty(sum(output_partition_sizes), 28 | input_size_per_partition, 29 | device="cuda", 30 | dtype=params_dtype), 31 | requires_grad=False) 32 | 33 | set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) 34 | layer.register_parameter("weight", weight) 35 | set_weight_attrs(weight, {"weight_loader": weight_loader}) 36 | 37 | def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor): 38 | weight = layer.weight 39 | return F.linear(x, weight) 40 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/model_executor/layers/quantization/utils/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/model_loader/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from torch import nn 4 | 5 | from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, 6 | ModelConfig, ParallelConfig, SchedulerConfig, 7 | VisionLanguageConfig) 8 | from vllm.model_executor.model_loader.loader import (BaseModelLoader, 9 | get_model_loader) 10 | from vllm.model_executor.model_loader.utils import ( 11 | get_architecture_class_name, get_model_architecture) 12 | 13 | 14 | def get_model(*, model_config: ModelConfig, load_config: LoadConfig, 15 | device_config: DeviceConfig, parallel_config: ParallelConfig, 16 | scheduler_config: SchedulerConfig, 17 | lora_config: Optional[LoRAConfig], 18 | vision_language_config: Optional[VisionLanguageConfig], 19 | cache_config: CacheConfig) -> nn.Module: 20 | loader = get_model_loader(load_config) 21 | return loader.load_model(model_config=model_config, 22 | device_config=device_config, 23 | lora_config=lora_config, 24 | vision_language_config=vision_language_config, 25 | parallel_config=parallel_config, 26 | scheduler_config=scheduler_config, 27 | cache_config=cache_config) 28 | 29 | 30 | __all__ = [ 31 | "get_model", "get_model_loader", "BaseModelLoader", 32 | "get_architecture_class_name", "get_model_architecture" 33 | ] 34 | -------------------------------------------------------------------------------- /vllm/model_executor/model_loader/utils.py: -------------------------------------------------------------------------------- 1 | """Utilities for selecting and loading models.""" 2 | import contextlib 3 | from typing import Tuple, Type 4 | 5 | import torch 6 | from torch import nn 7 | 8 | from vllm.config import ModelConfig 9 | from vllm.model_executor.models import ModelRegistry 10 | 11 | 12 | @contextlib.contextmanager 13 | def set_default_torch_dtype(dtype: torch.dtype): 14 | """Sets the default torch dtype to the given dtype.""" 15 | old_dtype = torch.get_default_dtype() 16 | torch.set_default_dtype(dtype) 17 | yield 18 | torch.set_default_dtype(old_dtype) 19 | 20 | 21 | def get_model_architecture( 22 | model_config: ModelConfig) -> Tuple[Type[nn.Module], str]: 23 | architectures = getattr(model_config.hf_config, "architectures", []) 24 | # Special handling for quantized Mixtral. 25 | # FIXME(woosuk): This is a temporary hack. 26 | if (model_config.quantization is not None 27 | and model_config.quantization != "fp8" 28 | and "MixtralForCausalLM" in architectures): 29 | architectures = ["QuantMixtralForCausalLM"] 30 | 31 | for arch in architectures: 32 | model_cls = ModelRegistry.load_model_cls(arch) 33 | if model_cls is not None: 34 | return (model_cls, arch) 35 | raise ValueError( 36 | f"Model architectures {architectures} are not supported for now. " 37 | f"Supported architectures: {ModelRegistry.get_supported_archs()}") 38 | 39 | 40 | def get_architecture_class_name(model_config: ModelConfig) -> str: 41 | return get_model_architecture(model_config)[1] 42 | -------------------------------------------------------------------------------- /vllm/model_executor/models/vlm_base.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from vllm.config import VisionLanguageConfig 4 | 5 | 6 | class VisionLanguageModelBase(nn.Module): 7 | """Base class for all vision language models (VLMs).""" 8 | 9 | def __init__(self, vision_language_config: VisionLanguageConfig) -> None: 10 | super().__init__() 11 | 12 | self.vision_language_config = vision_language_config 13 | -------------------------------------------------------------------------------- /vllm/model_executor/utils.py: -------------------------------------------------------------------------------- 1 | """Utils for model executor.""" 2 | import random 3 | from typing import Any, Dict, Optional 4 | 5 | import numpy as np 6 | import torch 7 | 8 | 9 | def set_random_seed(seed: int) -> None: 10 | random.seed(seed) 11 | np.random.seed(seed) 12 | torch.manual_seed(seed) 13 | if torch.cuda.is_available(): 14 | torch.cuda.manual_seed_all(seed) 15 | 16 | 17 | def set_weight_attrs( 18 | weight: torch.Tensor, 19 | weight_attrs: Optional[Dict[str, Any]], 20 | ): 21 | """Set attributes on a weight tensor. 22 | 23 | This method is used to set attributes on a weight tensor. This method 24 | will not overwrite existing attributes. 25 | 26 | Args: 27 | weight: The weight tensor. 28 | weight_attrs: A dictionary of attributes to set on the weight tensor. 29 | """ 30 | if weight_attrs is None: 31 | return 32 | for key, value in weight_attrs.items(): 33 | assert not hasattr( 34 | weight, key), (f"Overwriting existing tensor attribute: {key}") 35 | setattr(weight, key, value) 36 | -------------------------------------------------------------------------------- /vllm/multimodal/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import MultiModalData, MultiModalPlugin 2 | from .registry import MULTIMODAL_REGISTRY, MultiModalRegistry 3 | 4 | __all__ = [ 5 | "MultiModalData", "MultiModalPlugin", "MULTIMODAL_REGISTRY", 6 | "MultiModalRegistry" 7 | ] 8 | -------------------------------------------------------------------------------- /vllm/pooling_params.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional 2 | 3 | 4 | class PoolingParams: 5 | """Pooling parameters for pooling. 6 | 7 | Attributes: 8 | additional_data: Any additional data needed for pooling. 9 | """ 10 | 11 | def __init__(self, additional_data: Optional[Any] = None): 12 | self.additional_data = additional_data 13 | 14 | def clone(self) -> "PoolingParams": 15 | """Returns a deep copy of the PoolingParams instance.""" 16 | return PoolingParams(additional_data=self.additional_data, ) 17 | 18 | def __repr__(self) -> str: 19 | return (f"PoolingParams(" 20 | f"additional_metadata={self.additional_data})") 21 | -------------------------------------------------------------------------------- /vllm/py.typed: -------------------------------------------------------------------------------- 1 | # Marker file for PEP 561. 2 | # The vllm package uses inline types. 3 | -------------------------------------------------------------------------------- /vllm/spec_decode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/spec_decode/__init__.py -------------------------------------------------------------------------------- /vllm/transformers_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/transformers_utils/__init__.py -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.transformers_utils.configs.chatglm import ChatGLMConfig 2 | from vllm.transformers_utils.configs.dbrx import DbrxConfig 3 | # RWConfig is for the original tiiuae/falcon-40b(-instruct) and 4 | # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the 5 | # `FalconConfig` class from the official HuggingFace transformers library. 6 | from vllm.transformers_utils.configs.falcon import RWConfig 7 | from vllm.transformers_utils.configs.jais import JAISConfig 8 | from vllm.transformers_utils.configs.mpt import MPTConfig 9 | 10 | __all__ = [ 11 | "ChatGLMConfig", 12 | "DbrxConfig", 13 | "MPTConfig", 14 | "RWConfig", 15 | "JAISConfig", 16 | ] 17 | -------------------------------------------------------------------------------- /vllm/transformers_utils/image_processor.py: -------------------------------------------------------------------------------- 1 | from functools import lru_cache 2 | from typing import Optional 3 | 4 | from transformers import AutoImageProcessor 5 | from transformers.image_processing_utils import BaseImageProcessor 6 | 7 | from vllm.logger import init_logger 8 | 9 | logger = init_logger(__name__) 10 | 11 | 12 | def get_image_processor( 13 | processor_name: str, 14 | *args, 15 | trust_remote_code: bool = False, 16 | revision: Optional[str] = None, 17 | **kwargs, 18 | ) -> BaseImageProcessor: 19 | """Gets an image processor for the given model name via HuggingFace.""" 20 | try: 21 | processor: BaseImageProcessor = AutoImageProcessor.from_pretrained( 22 | processor_name, 23 | *args, 24 | trust_remote_code=trust_remote_code, 25 | revision=revision, 26 | **kwargs) 27 | except ValueError as e: 28 | # If the error pertains to the processor class not existing or not 29 | # currently being imported, suggest using the --trust-remote-code flag. 30 | # Unlike AutoTokenizer, AutoImageProcessor does not separate such errors 31 | if not trust_remote_code: 32 | err_msg = ( 33 | "Failed to load the image processor. If the image processor is " 34 | "a custom processor not yet available in the HuggingFace " 35 | "transformers library, consider setting " 36 | "`trust_remote_code=True` in LLM or using the " 37 | "`--trust-remote-code` flag in the CLI.") 38 | raise RuntimeError(err_msg) from e 39 | else: 40 | raise e 41 | 42 | return processor 43 | 44 | 45 | cached_get_image_processor = lru_cache(get_image_processor) 46 | -------------------------------------------------------------------------------- /vllm/transformers_utils/tokenizer_group/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from vllm.config import TokenizerPoolConfig 4 | from vllm.executor.ray_utils import ray 5 | from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import ( 6 | BaseTokenizerGroup) 7 | from vllm.transformers_utils.tokenizer_group.tokenizer_group import ( 8 | TokenizerGroup) 9 | 10 | if ray: 11 | from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import ( 12 | RayTokenizerGroupPool) 13 | else: 14 | RayTokenizerGroupPool = None # type: ignore 15 | 16 | 17 | def get_tokenizer_group(tokenizer_pool_config: Optional[TokenizerPoolConfig], 18 | **init_kwargs) -> BaseTokenizerGroup: 19 | if tokenizer_pool_config is None: 20 | return TokenizerGroup(**init_kwargs) 21 | if tokenizer_pool_config.pool_type == "ray": 22 | if RayTokenizerGroupPool is None: 23 | raise ImportError( 24 | "RayTokenizerGroupPool is not available. Please install " 25 | "the ray package to use the Ray tokenizer group pool.") 26 | return RayTokenizerGroupPool.from_config(tokenizer_pool_config, 27 | **init_kwargs) 28 | else: 29 | raise ValueError( 30 | f"Unknown pool type: {tokenizer_pool_config.pool_type}") 31 | 32 | 33 | __all__ = ["get_tokenizer_group", "BaseTokenizerGroup"] 34 | -------------------------------------------------------------------------------- /vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List, Optional 3 | 4 | from transformers import PreTrainedTokenizer 5 | 6 | from vllm.lora.request import LoRARequest 7 | 8 | 9 | class BaseTokenizerGroup(ABC): 10 | """A group of tokenizers that can be used for LoRA adapters.""" 11 | 12 | @abstractmethod 13 | def ping(self) -> bool: 14 | """Check if the tokenizer group is alive.""" 15 | pass 16 | 17 | @abstractmethod 18 | def get_max_input_len(self, 19 | lora_request: Optional[LoRARequest] = None 20 | ) -> Optional[int]: 21 | """Get the maximum input length for the LoRA request.""" 22 | pass 23 | 24 | @abstractmethod 25 | def encode(self, 26 | prompt: str, 27 | request_id: Optional[str] = None, 28 | lora_request: Optional[LoRARequest] = None) -> List[int]: 29 | """Encode a prompt using the tokenizer group.""" 30 | pass 31 | 32 | @abstractmethod 33 | async def encode_async( 34 | self, 35 | prompt: str, 36 | request_id: Optional[str] = None, 37 | lora_request: Optional[LoRARequest] = None) -> List[int]: 38 | """Encode a prompt using the tokenizer group.""" 39 | pass 40 | 41 | @abstractmethod 42 | def get_lora_tokenizer( 43 | self, 44 | lora_request: Optional[LoRARequest] = None 45 | ) -> "PreTrainedTokenizer": 46 | """Get a tokenizer for a LoRA request.""" 47 | pass 48 | 49 | @abstractmethod 50 | async def get_lora_tokenizer_async( 51 | self, 52 | lora_request: Optional[LoRARequest] = None 53 | ) -> "PreTrainedTokenizer": 54 | """Get a tokenizer for a LoRA request.""" 55 | pass 56 | -------------------------------------------------------------------------------- /vllm/transformers_utils/tokenizers/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.transformers_utils.tokenizers.baichuan import BaichuanTokenizer 2 | 3 | __all__ = [ 4 | "BaichuanTokenizer", 5 | ] 6 | -------------------------------------------------------------------------------- /vllm/usage/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/usage/__init__.py -------------------------------------------------------------------------------- /vllm/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/worker/__init__.py --------------------------------------------------------------------------------