├── .buildkite
    ├── check-wheel-size.py
    ├── download-images.sh
    ├── run-amd-test.sh
    ├── run-benchmarks.sh
    ├── run-cpu-test.sh
    ├── run-neuron-test.sh
    ├── test-pipeline.yaml
    └── test-template.j2
├── .clang-format
├── .dockerignore
├── .github
    ├── ISSUE_TEMPLATE
    │   ├── 100-documentation.yml
    │   ├── 200-installation.yml
    │   ├── 300-usage.yml
    │   ├── 400-bug report.yml
    │   ├── 500-feature request.yml
    │   ├── 600-new model.yml
    │   ├── 700-performance discussion.yml
    │   ├── 750-RFC.yml
    │   ├── 800-misc discussion.yml
    │   └── config.yml
    ├── PULL_REQUEST_TEMPLATE.md
    └── workflows
    │   ├── clang-format.yml
    │   ├── mypy.yaml
    │   ├── publish.yml
    │   ├── ruff.yml
    │   ├── scripts
    │       ├── build.sh
    │       ├── create_release.js
    │       ├── cuda-install.sh
    │       ├── env.sh
    │       └── pytorch-install.sh
    │   └── yapf.yml
├── .gitignore
├── .readthedocs.yaml
├── .yapfignore
├── CMakeLists.txt
├── CONTRIBUTING.md
├── Dockerfile
├── Dockerfile.cpu
├── Dockerfile.neuron
├── Dockerfile.rocm
├── LICENSE
├── MANIFEST.in
├── README.md
├── benchmarks
    ├── README.md
    ├── backend_request_func.py
    ├── benchmark_latency.py
    ├── benchmark_prefix_caching.py
    ├── benchmark_serving.py
    ├── benchmark_throughput.py
    ├── cutlass_benchmarks
    │   ├── w8a8_benchmarks.py
    │   └── weight_shapes.py
    ├── kernels
    │   ├── benchmark_aqlm.py
    │   ├── benchmark_marlin.py
    │   ├── benchmark_mixtral_moe.py
    │   ├── benchmark_paged_attention.py
    │   ├── benchmark_rope.py
    │   └── benchmark_shapes.py
    ├── launch_tgi_server.sh
    ├── overheads
    │   └── benchmark_hashing.py
    └── sonnet.txt
├── cmake
    ├── cpu_extension.cmake
    ├── hipify.py
    └── utils.cmake
├── collect_env.py
├── csrc
    ├── activation_kernels.cu
    ├── attention
    │   ├── attention_dtypes.h
    │   ├── attention_generic.cuh
    │   ├── attention_kernels.cu
    │   ├── attention_utils.cuh
    │   ├── dtype_bfloat16.cuh
    │   ├── dtype_float16.cuh
    │   ├── dtype_float32.cuh
    │   └── dtype_fp8.cuh
    ├── cache.h
    ├── cache_kernels.cu
    ├── cpu
    │   ├── activation.cpp
    │   ├── attention.cpp
    │   ├── cache.cpp
    │   ├── cpu_types.hpp
    │   ├── layernorm.cpp
    │   ├── pos_encoding.cpp
    │   └── pybind.cpp
    ├── cuda_compat.h
    ├── cuda_utils.h
    ├── cuda_utils_kernels.cu
    ├── custom_all_reduce.cu
    ├── custom_all_reduce.cuh
    ├── custom_all_reduce_test.cu
    ├── dispatch_utils.h
    ├── layernorm_kernels.cu
    ├── moe
    │   ├── moe_ops.cpp
    │   ├── moe_ops.h
    │   └── topk_softmax_kernels.cu
    ├── moe_align_block_size_kernels.cu
    ├── ops.h
    ├── pos_encoding_kernels.cu
    ├── punica
    │   ├── LICENSE
    │   ├── bgmv
    │   │   ├── bgmv_bf16_bf16_bf16.cu
    │   │   ├── bgmv_bf16_fp32_bf16.cu
    │   │   ├── bgmv_config.h
    │   │   ├── bgmv_fp16_fp16_fp16.cu
    │   │   ├── bgmv_fp16_fp32_fp16.cu
    │   │   ├── bgmv_fp32_bf16_bf16.cu
    │   │   ├── bgmv_fp32_fp16_fp16.cu
    │   │   ├── bgmv_impl.cuh
    │   │   ├── generator.py
    │   │   └── vec_dtypes.cuh
    │   ├── punica_ops.cu
    │   ├── punica_ops.h
    │   ├── punica_pybind.cpp
    │   └── type_convert.h
    ├── pybind.cpp
    ├── quantization
    │   ├── aqlm
    │   │   └── gemm_kernels.cu
    │   ├── awq
    │   │   ├── dequantize.cuh
    │   │   └── gemm_kernels.cu
    │   ├── compressed_tensors
    │   │   └── int8_quant_kernels.cu
    │   ├── cutlass_w8a8
    │   │   ├── broadcast_load_epilogue_c2x.hpp
    │   │   ├── broadcast_load_epilogue_c3x.hpp
    │   │   ├── common.hpp
    │   │   ├── scaled_mm_dq_c2x.cu
    │   │   ├── scaled_mm_dq_c3x.cu
    │   │   └── scaled_mm_dq_entry.cu
    │   ├── fp8
    │   │   ├── amd
    │   │   │   ├── hip_float8.h
    │   │   │   ├── hip_float8_impl.h
    │   │   │   └── quant_utils.cuh
    │   │   ├── common.cu
    │   │   └── nvidia
    │   │   │   └── quant_utils.cuh
    │   ├── gptq
    │   │   ├── compat.cuh
    │   │   ├── matrix_view.cuh
    │   │   ├── q_gemm.cu
    │   │   ├── qdq_2.cuh
    │   │   ├── qdq_3.cuh
    │   │   ├── qdq_4.cuh
    │   │   ├── qdq_8.cuh
    │   │   └── qdq_util.cuh
    │   ├── gptq_marlin
    │   │   ├── gptq_marlin.cu
    │   │   ├── gptq_marlin.cuh
    │   │   ├── gptq_marlin_dtypes.cuh
    │   │   └── gptq_marlin_repack.cu
    │   ├── marlin
    │   │   ├── dense
    │   │   │   ├── LICENSE
    │   │   │   └── marlin_cuda_kernel.cu
    │   │   └── sparse
    │   │   │   ├── LICENSE
    │   │   │   ├── common
    │   │   │       ├── base.h
    │   │   │       ├── mem.h
    │   │   │       └── mma.h
    │   │   │   └── marlin_24_cuda_kernel.cu
    │   └── squeezellm
    │   │   └── quant_cuda_kernel.cu
    └── reduction_utils.cuh
├── docs
    ├── Makefile
    ├── README.md
    ├── make.bat
    ├── requirements-docs.txt
    └── source
    │   ├── assets
    │       ├── dev
    │       │   └── dockerfile-stages-dependency.png
    │       ├── kernel
    │       │   ├── k_vecs.png
    │       │   ├── key.png
    │       │   ├── logits_vec.png
    │       │   ├── q_vecs.png
    │       │   ├── query.png
    │       │   ├── v_vec.png
    │       │   └── value.png
    │       └── logos
    │       │   ├── vllm-logo-only-light.png
    │       │   ├── vllm-logo-text-dark.png
    │       │   └── vllm-logo-text-light.png
    │   ├── community
    │       ├── meetups.rst
    │       └── sponsors.md
    │   ├── conf.py
    │   ├── dev
    │       ├── dockerfile
    │       │   └── dockerfile.rst
    │       ├── engine
    │       │   ├── async_llm_engine.rst
    │       │   ├── engine_index.rst
    │       │   └── llm_engine.rst
    │       ├── kernel
    │       │   └── paged_attention.rst
    │       ├── multimodal
    │       │   └── multimodal_index.rst
    │       ├── offline_inference
    │       │   ├── llm.rst
    │       │   ├── llm_inputs.rst
    │       │   └── offline_index.rst
    │       └── sampling_params.rst
    │   ├── generate_examples.py
    │   ├── getting_started
    │       ├── amd-installation.rst
    │       ├── cpu-installation.rst
    │       ├── examples
    │       │   └── examples_index.template.rst
    │       ├── installation.rst
    │       ├── neuron-installation.rst
    │       └── quickstart.rst
    │   ├── index.rst
    │   ├── models
    │       ├── adding_model.rst
    │       ├── engine_args.rst
    │       ├── lora.rst
    │       ├── performance.rst
    │       ├── supported_models.rst
    │       └── vlm.rst
    │   ├── quantization
    │       ├── auto_awq.rst
    │       ├── fp8_e4m3_kvcache.rst
    │       └── fp8_e5m2_kvcache.rst
    │   └── serving
    │       ├── deploying_with_bentoml.rst
    │       ├── deploying_with_docker.rst
    │       ├── deploying_with_dstack.rst
    │       ├── deploying_with_kserve.rst
    │       ├── deploying_with_lws.rst
    │       ├── deploying_with_triton.rst
    │       ├── distributed_serving.rst
    │       ├── env_vars.rst
    │       ├── integrations.rst
    │       ├── metrics.rst
    │       ├── openai_compatible_server.md
    │       ├── run_on_sky.rst
    │       ├── serving_with_langchain.rst
    │       └── usage_stats.md
├── examples
    ├── api_client.py
    ├── aqlm_example.py
    ├── fp8
    │   ├── README.md
    │   ├── extract_scales.py
    │   └── quantizer
    │   │   ├── README.md
    │   │   └── quantize.py
    ├── gradio_openai_chatbot_webserver.py
    ├── gradio_webserver.py
    ├── llava_example.py
    ├── llm_engine_example.py
    ├── logging_configuration.md
    ├── lora_with_quantization_inference.py
    ├── multilora_inference.py
    ├── offline_inference.py
    ├── offline_inference_arctic.py
    ├── offline_inference_distributed.py
    ├── offline_inference_embedding.py
    ├── offline_inference_neuron.py
    ├── offline_inference_openai.md
    ├── offline_inference_with_prefix.py
    ├── openai_chat_completion_client.py
    ├── openai_completion_client.py
    ├── openai_embedding_client.py
    ├── openai_example_batch.jsonl
    ├── production_monitoring
    │   ├── README.md
    │   ├── docker-compose.yaml
    │   ├── grafana.json
    │   └── prometheus.yaml
    ├── save_sharded_state.py
    ├── template_alpaca.jinja
    ├── template_baichuan.jinja
    ├── template_chatglm.jinja
    ├── template_chatglm2.jinja
    ├── template_chatml.jinja
    ├── template_falcon.jinja
    ├── template_falcon_180b.jinja
    ├── template_inkbot.jinja
    └── tensorize_vllm_model.py
├── format.sh
├── pyproject.toml
├── requirements-build.txt
├── requirements-common.txt
├── requirements-cpu.txt
├── requirements-cuda.txt
├── requirements-dev.txt
├── requirements-neuron.txt
├── requirements-rocm.txt
├── rocm_patch
    └── rocm_bf16.patch
├── setup.py
├── tests
    ├── __init__.py
    ├── async_engine
    │   ├── __init__.py
    │   ├── api_server_async_engine.py
    │   ├── test_api_server.py
    │   ├── test_async_llm_engine.py
    │   ├── test_chat_template.py
    │   ├── test_openapi_server_ray.py
    │   └── test_request_tracker.py
    ├── basic_correctness
    │   ├── __init__.py
    │   ├── test_basic_correctness.py
    │   ├── test_chunked_prefill.py
    │   └── test_preemption.py
    ├── conftest.py
    ├── core
    │   ├── __init__.py
    │   ├── block
    │   │   ├── __init__.py
    │   │   ├── conftest.py
    │   │   ├── e2e
    │   │   │   ├── __init__.py
    │   │   │   ├── conftest.py
    │   │   │   ├── test_correctness.py
    │   │   │   └── test_correctness_sliding_window.py
    │   │   ├── test_block_manager_v2.py
    │   │   ├── test_block_table.py
    │   │   ├── test_common.py
    │   │   ├── test_cpu_gpu_block_allocator.py
    │   │   ├── test_naive_block.py
    │   │   └── test_prefix_caching_block.py
    │   ├── test_block_manager.py
    │   ├── test_chunked_prefill_scheduler.py
    │   ├── test_scheduler.py
    │   └── utils.py
    ├── distributed
    │   ├── __init__.py
    │   ├── test_basic_distributed_correctness.py
    │   ├── test_chunked_prefill_distributed.py
    │   ├── test_comm_ops.py
    │   ├── test_custom_all_reduce.py
    │   └── test_pynccl.py
    ├── engine
    │   ├── __init__.py
    │   ├── output_processor
    │   │   ├── __init__.py
    │   │   ├── test_multi_step.py
    │   │   └── test_stop_checker.py
    │   ├── test_computed_prefix_blocks.py
    │   ├── test_detokenization.py
    │   ├── test_multiproc_workers.py
    │   ├── test_skip_tokenizer_init.py
    │   ├── test_stop_reason.py
    │   └── test_stop_strings.py
    ├── entrypoints
    │   ├── __init__.py
    │   ├── openai
    │   │   └── test_serving_chat.py
    │   ├── test_guided_processors.py
    │   ├── test_llm_encode.py
    │   ├── test_llm_generate.py
    │   ├── test_openai_run_batch.py
    │   ├── test_openai_server.py
    │   └── test_server_oot_registration.py
    ├── fp8_kv
    │   ├── llama2-70b-fp8-kv
    │   │   └── kv_cache_scales.json
    │   └── llama2-7b-fp8-kv
    │   │   └── kv_cache_scales.json
    ├── kernels
    │   ├── __init__.py
    │   ├── allclose_default.py
    │   ├── conftest.py
    │   ├── test_activation.py
    │   ├── test_attention.py
    │   ├── test_attention_selector.py
    │   ├── test_blocksparse_attention.py
    │   ├── test_cache.py
    │   ├── test_cutlass.py
    │   ├── test_flash_attn.py
    │   ├── test_int8_quant.py
    │   ├── test_layernorm.py
    │   ├── test_marlin_gemm.py
    │   ├── test_moe.py
    │   ├── test_pos_encoding.py
    │   ├── test_prefix_prefill.py
    │   ├── test_rand.py
    │   └── test_sampler.py
    ├── lora
    │   ├── __init__.py
    │   ├── conftest.py
    │   ├── data
    │   │   ├── __init__.py
    │   │   └── long_context_test_data.py
    │   ├── test_baichuan.py
    │   ├── test_chatglm3.py
    │   ├── test_gemma.py
    │   ├── test_layer_variation.py
    │   ├── test_layers.py
    │   ├── test_llama.py
    │   ├── test_long_context.py
    │   ├── test_lora.py
    │   ├── test_lora_checkpoints.py
    │   ├── test_lora_manager.py
    │   ├── test_mixtral.py
    │   ├── test_phi.py
    │   ├── test_punica.py
    │   ├── test_quant_model.py
    │   ├── test_tokenizer_group.py
    │   ├── test_utils.py
    │   ├── test_worker.py
    │   └── utils.py
    ├── metrics
    │   ├── __init__.py
    │   └── test_metrics.py
    ├── model_executor
    │   ├── __init__.py
    │   └── weight_utils.py
    ├── models
    │   ├── __init__.py
    │   ├── test_aqlm.py
    │   ├── test_big_models.py
    │   ├── test_embedding.py
    │   ├── test_fp8.py
    │   ├── test_gptq_marlin.py
    │   ├── test_gptq_marlin_24.py
    │   ├── test_llava.py
    │   ├── test_marlin.py
    │   ├── test_mistral.py
    │   ├── test_models.py
    │   ├── test_oot_registration.py
    │   ├── test_registry.py
    │   └── utils.py
    ├── multimodal
    │   ├── __init__.py
    │   └── test_processor.py
    ├── prefix_caching
    │   ├── __init__.py
    │   ├── test_disable_sliding_window.py
    │   └── test_prefix_caching.py
    ├── prompts
    │   ├── example.txt
    │   └── summary.txt
    ├── quantization
    │   ├── __init__.py
    │   ├── test_bitsandbytes.py
    │   ├── test_compressed_tensors.py
    │   ├── test_configs.py
    │   └── test_fp8.py
    ├── samplers
    │   ├── __init__.py
    │   ├── test_beam_search.py
    │   ├── test_ignore_eos.py
    │   ├── test_logits_processor.py
    │   ├── test_logprobs.py
    │   ├── test_ranks.py
    │   ├── test_rejection_sampler.py
    │   ├── test_sampler.py
    │   └── test_seeded_generate.py
    ├── spec_decode
    │   ├── __init__.py
    │   ├── e2e
    │   │   ├── __init__.py
    │   │   ├── conftest.py
    │   │   ├── test_compatibility.py
    │   │   ├── test_integration.py
    │   │   ├── test_integration_dist.py
    │   │   ├── test_logprobs.py
    │   │   ├── test_multistep_correctness.py
    │   │   └── test_ngram_correctness.py
    │   ├── test_batch_expansion.py
    │   ├── test_dynamic_spec_decode.py
    │   ├── test_metrics.py
    │   ├── test_multi_step_worker.py
    │   ├── test_ngram_worker.py
    │   ├── test_spec_decode_worker.py
    │   ├── test_utils.py
    │   └── utils.py
    ├── tensorizer_loader
    │   ├── __init__.py
    │   └── test_tensorizer.py
    ├── test_cache_block_hashing.py
    ├── test_config.py
    ├── test_inputs.py
    ├── test_logger.py
    ├── test_logits_processor.py
    ├── test_regression.py
    ├── test_sampling_params.py
    ├── test_sequence.py
    ├── test_sharded_state_loader.py
    ├── test_utils.py
    ├── tokenization
    │   ├── __init__.py
    │   ├── test_cached_tokenizer.py
    │   ├── test_detokenize.py
    │   ├── test_image_processor.py
    │   ├── test_tokenizer.py
    │   └── test_tokenizer_group.py
    ├── utils.py
    └── worker
    │   ├── __init__.py
    │   ├── test_model_runner.py
    │   └── test_swap.py
└── vllm
    ├── __init__.py
    ├── _custom_ops.py
    ├── attention
        ├── __init__.py
        ├── backends
        │   ├── __init__.py
        │   ├── abstract.py
        │   ├── blocksparse_attn.py
        │   ├── flash_attn.py
        │   ├── flashinfer.py
        │   ├── rocm_flash_attn.py
        │   ├── torch_sdpa.py
        │   └── xformers.py
        ├── layer.py
        ├── ops
        │   ├── __init__.py
        │   ├── blocksparse_attention
        │   │   ├── __init__.py
        │   │   ├── blocksparse_attention_kernel.py
        │   │   ├── interface.py
        │   │   └── utils.py
        │   ├── paged_attn.py
        │   ├── prefix_prefill.py
        │   └── triton_flash_attention.py
        └── selector.py
    ├── block.py
    ├── config.py
    ├── core
        ├── __init__.py
        ├── block
        │   ├── __init__.py
        │   ├── block_table.py
        │   ├── common.py
        │   ├── cpu_gpu_block_allocator.py
        │   ├── interfaces.py
        │   ├── naive_block.py
        │   ├── prefix_caching_block.py
        │   └── utils.py
        ├── block_manager_v1.py
        ├── block_manager_v2.py
        ├── embedding_model_block_manager.py
        ├── evictor_v1.py
        ├── evictor_v2.py
        ├── interfaces.py
        ├── policy.py
        └── scheduler.py
    ├── distributed
        ├── __init__.py
        ├── communication_op.py
        ├── device_communicators
        │   ├── __init__.py
        │   ├── custom_all_reduce.py
        │   ├── custom_all_reduce_utils.py
        │   ├── pynccl.py
        │   └── pynccl_wrapper.py
        ├── parallel_state.py
        └── utils.py
    ├── engine
        ├── __init__.py
        ├── arg_utils.py
        ├── async_llm_engine.py
        ├── llm_engine.py
        ├── metrics.py
        └── output_processor
        │   ├── __init__.py
        │   ├── interfaces.py
        │   ├── multi_step.py
        │   ├── single_step.py
        │   ├── stop_checker.py
        │   └── util.py
    ├── entrypoints
        ├── __init__.py
        ├── api_server.py
        ├── llm.py
        └── openai
        │   ├── __init__.py
        │   ├── api_server.py
        │   ├── cli_args.py
        │   ├── protocol.py
        │   ├── run_batch.py
        │   ├── serving_chat.py
        │   ├── serving_completion.py
        │   ├── serving_embedding.py
        │   └── serving_engine.py
    ├── envs.py
    ├── executor
        ├── __init__.py
        ├── cpu_executor.py
        ├── distributed_gpu_executor.py
        ├── executor_base.py
        ├── gpu_executor.py
        ├── multiproc_gpu_executor.py
        ├── multiproc_worker_utils.py
        ├── neuron_executor.py
        ├── ray_gpu_executor.py
        └── ray_utils.py
    ├── inputs.py
    ├── logger.py
    ├── logging
        ├── __init__.py
        └── formatter.py
    ├── lora
        ├── __init__.py
        ├── fully_sharded_layers.py
        ├── layers.py
        ├── lora.py
        ├── models.py
        ├── punica.py
        ├── request.py
        ├── utils.py
        └── worker_manager.py
    ├── model_executor
        ├── __init__.py
        ├── guided_decoding
        │   ├── __init__.py
        │   ├── lm_format_enforcer_decoding.py
        │   ├── outlines_decoding.py
        │   └── outlines_logits_processors.py
        ├── layers
        │   ├── __init__.py
        │   ├── activation.py
        │   ├── fused_moe
        │   │   ├── __init__.py
        │   │   ├── configs
        │   │   │   ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json
        │   │   │   ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
        │   │   │   ├── E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
        │   │   │   ├── E=8,N=14336,device_name=AMD_Instinct_MI300X.json
        │   │   │   ├── E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
        │   │   │   ├── E=8,N=1792,device_name=AMD_Instinct_MI300X.json
        │   │   │   ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json
        │   │   │   ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json
        │   │   │   ├── E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
        │   │   │   ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
        │   │   │   ├── E=8,N=3584,device_name=AMD_Instinct_MI300X.json
        │   │   │   ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json
        │   │   │   ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
        │   │   │   ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json
        │   │   │   ├── E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
        │   │   │   ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json
        │   │   │   ├── E=8,N=7168,device_name=AMD_Instinct_MI300X.json
        │   │   │   ├── E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
        │   │   │   ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
        │   │   │   ├── E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
        │   │   │   └── README
        │   │   └── fused_moe.py
        │   ├── layernorm.py
        │   ├── linear.py
        │   ├── logits_processor.py
        │   ├── ops
        │   │   ├── __init__.py
        │   │   ├── rand.py
        │   │   └── sample.py
        │   ├── pooler.py
        │   ├── quantization
        │   │   ├── __init__.py
        │   │   ├── aqlm.py
        │   │   ├── awq.py
        │   │   ├── base_config.py
        │   │   ├── bitsandbytes.py
        │   │   ├── compressed_tensors
        │   │   │   ├── __init__.py
        │   │   │   ├── compressed_tensors.py
        │   │   │   └── schemes
        │   │   │   │   ├── __init__.py
        │   │   │   │   ├── compressed_tensors_scheme.py
        │   │   │   │   ├── compressed_tensors_unquantized.py
        │   │   │   │   └── compressed_tensors_w8a8_statictensor.py
        │   │   ├── deepspeedfp.py
        │   │   ├── fp8.py
        │   │   ├── gptq.py
        │   │   ├── gptq_marlin.py
        │   │   ├── gptq_marlin_24.py
        │   │   ├── marlin.py
        │   │   ├── schema.py
        │   │   ├── squeezellm.py
        │   │   └── utils
        │   │   │   ├── __init__.py
        │   │   │   ├── format_24.py
        │   │   │   ├── marlin_24_perms.py
        │   │   │   ├── marlin_perms.py
        │   │   │   ├── marlin_utils.py
        │   │   │   └── quant_utils.py
        │   ├── rejection_sampler.py
        │   ├── rotary_embedding.py
        │   ├── sampler.py
        │   └── vocab_parallel_embedding.py
        ├── model_loader
        │   ├── __init__.py
        │   ├── loader.py
        │   ├── neuron.py
        │   ├── tensorizer.py
        │   ├── utils.py
        │   └── weight_utils.py
        ├── models
        │   ├── __init__.py
        │   ├── arctic.py
        │   ├── baichuan.py
        │   ├── bloom.py
        │   ├── chatglm.py
        │   ├── commandr.py
        │   ├── dbrx.py
        │   ├── decilm.py
        │   ├── deepseek.py
        │   ├── falcon.py
        │   ├── gemma.py
        │   ├── gpt2.py
        │   ├── gpt_bigcode.py
        │   ├── gpt_j.py
        │   ├── gpt_neox.py
        │   ├── internlm2.py
        │   ├── jais.py
        │   ├── llama.py
        │   ├── llama_embedding.py
        │   ├── llava.py
        │   ├── minicpm.py
        │   ├── mixtral.py
        │   ├── mixtral_quant.py
        │   ├── mpt.py
        │   ├── olmo.py
        │   ├── opt.py
        │   ├── orion.py
        │   ├── phi.py
        │   ├── phi3_small.py
        │   ├── qwen.py
        │   ├── qwen2.py
        │   ├── qwen2_moe.py
        │   ├── skywork_moe.py
        │   ├── skywork_moe_quant.py
        │   ├── stablelm.py
        │   ├── starcoder2.py
        │   ├── vlm_base.py
        │   └── xverse.py
        ├── pooling_metadata.py
        ├── sampling_metadata.py
        └── utils.py
    ├── multimodal
        ├── __init__.py
        ├── base.py
        ├── image.py
        └── registry.py
    ├── outputs.py
    ├── pooling_params.py
    ├── py.typed
    ├── sampling_params.py
    ├── sequence.py
    ├── spec_decode
        ├── __init__.py
        ├── batch_expansion.py
        ├── interfaces.py
        ├── metrics.py
        ├── multi_step_worker.py
        ├── ngram_worker.py
        ├── spec_decode_worker.py
        ├── top1_proposer.py
        └── util.py
    ├── transformers_utils
        ├── __init__.py
        ├── config.py
        ├── configs
        │   ├── __init__.py
        │   ├── arctic.py
        │   ├── chatglm.py
        │   ├── dbrx.py
        │   ├── falcon.py
        │   ├── jais.py
        │   └── mpt.py
        ├── detokenizer.py
        ├── image_processor.py
        ├── tokenizer.py
        ├── tokenizer_group
        │   ├── __init__.py
        │   ├── base_tokenizer_group.py
        │   ├── ray_tokenizer_group.py
        │   └── tokenizer_group.py
        └── tokenizers
        │   ├── __init__.py
        │   └── baichuan.py
    ├── usage
        ├── __init__.py
        └── usage_lib.py
    ├── utils.py
    └── worker
        ├── __init__.py
        ├── cache_engine.py
        ├── cpu_model_runner.py
        ├── cpu_worker.py
        ├── embedding_model_runner.py
        ├── model_runner.py
        ├── neuron_model_runner.py
        ├── neuron_worker.py
        ├── worker.py
        └── worker_base.py


/.buildkite/check-wheel-size.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import zipfile
 3 | 
 4 | MAX_SIZE_MB = 200
 5 | 
 6 | 
 7 | def print_top_10_largest_files(zip_file):
 8 |     with zipfile.ZipFile(zip_file, 'r') as z:
 9 |         file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
10 |         file_sizes.sort(key=lambda x: x[1], reverse=True)
11 |         for f, size in file_sizes[:10]:
12 |             print(f"{f}: {size/(1024*1024)} MBs uncompressed.")
13 | 
14 | 
15 | def check_wheel_size(directory):
16 |     for root, _, files in os.walk(directory):
17 |         for f in files:
18 |             if f.endswith(".whl"):
19 |                 wheel_path = os.path.join(root, f)
20 |                 wheel_size = os.path.getsize(wheel_path)
21 |                 wheel_size_mb = wheel_size / (1024 * 1024)
22 |                 if wheel_size_mb > MAX_SIZE_MB:
23 |                     print(
24 |                         f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
25 |                         f"compare to the allowed size ({MAX_SIZE_MB} MB).")
26 |                     print_top_10_largest_files(wheel_path)
27 |                     return 1
28 |                 else:
29 |                     print(f"Wheel {wheel_path} is within the allowed size "
30 |                           f"({wheel_size_mb} MB).")
31 |     return 0
32 | 
33 | 
34 | if __name__ == "__main__":
35 |     import sys
36 |     sys.exit(check_wheel_size(sys.argv[1]))
37 | 


--------------------------------------------------------------------------------
/.buildkite/download-images.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -ex
 4 | set -o pipefail
 5 | 
 6 | (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
 7 | 
 8 | # aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
 9 | mkdir -p images
10 | cd images
11 | wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
12 | wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
13 | wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
14 | wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
15 | wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
16 | wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg
17 | 
18 | cd -
19 | 


--------------------------------------------------------------------------------
/.buildkite/run-cpu-test.sh:
--------------------------------------------------------------------------------
 1 | # This script build the CPU docker image and run the offline inference inside the container.
 2 | # It serves a sanity check for compilation and basic model usage.
 3 | set -ex
 4 | 
 5 | # Try building the docker image
 6 | docker build -t cpu-test -f Dockerfile.cpu .
 7 | 
 8 | # Setup cleanup
 9 | remove_docker_container() { docker rm -f cpu-test || true; }
10 | trap remove_docker_container EXIT
11 | remove_docker_container
12 | 
13 | # Run the image and launch offline inference
14 | docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 vllm/examples/offline_inference.py
15 | 


--------------------------------------------------------------------------------
/.buildkite/run-neuron-test.sh:
--------------------------------------------------------------------------------
 1 | # This script build the Neuron docker image and run the API server inside the container.
 2 | # It serves a sanity check for compilation and basic model usage.
 3 | set -e
 4 | 
 5 | # Try building the docker image
 6 | aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
 7 | 
 8 | # prune old image and containers to save disk space, and only once a day
 9 | # by using a timestamp file in tmp.
10 | if [ -f /tmp/neuron-docker-build-timestamp ]; then
11 |     last_build=$(cat /tmp/neuron-docker-build-timestamp)
12 |     current_time=$(date +%s)
13 |     if [ $((current_time - last_build)) -gt 86400 ]; then
14 |         docker system prune -f
15 |         echo $current_time > /tmp/neuron-docker-build-timestamp
16 |     fi
17 | else
18 |     echo $(date +%s) > /tmp/neuron-docker-build-timestamp
19 | fi
20 | 
21 | docker build -t neuron -f Dockerfile.neuron .
22 | 
23 | # Setup cleanup
24 | remove_docker_container() { docker rm -f neuron || true; }
25 | trap remove_docker_container EXIT
26 | remove_docker_container
27 | 
28 | # Run the image
29 | docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
30 |        --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
31 | 
32 | # Wait for the server to start
33 | wait_for_server_to_start() {
34 |     timeout=300
35 |     counter=0
36 | 
37 |     while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
38 |         sleep 1
39 |         counter=$((counter + 1))
40 |         if [ $counter -ge $timeout ]; then
41 |             echo "Timeout after $timeout seconds"
42 |             break
43 |         fi
44 |     done
45 | }
46 | wait_for_server_to_start
47 | 
48 | # Test a simple prompt
49 | curl -X POST -H "Content-Type: application/json" \
50 |     localhost:8000/generate \
51 |     -d '{"prompt": "San Francisco is a"}'
52 | 


--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
 1 | BasedOnStyle: Google
 2 | UseTab: Never
 3 | IndentWidth: 2
 4 | ColumnLimit: 80
 5 | 
 6 | # Force pointers to the type for C++.
 7 | DerivePointerAlignment: false
 8 | PointerAlignment: Left
 9 | 
10 | # Reordering #include statements can (and currently will) introduce errors
11 | SortIncludes: false
12 | 
13 | # Style choices
14 | AlignConsecutiveAssignments: false
15 | AlignConsecutiveDeclarations: false
16 | IndentPPDirectives: BeforeHash
17 | 
18 | IncludeCategories:
19 |   - Regex:           '^<'
20 |     Priority:        4
21 |   - Regex:           '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/'
22 |     Priority:        3
23 |   - Regex:           '^"(qoda|\.\.)/'
24 |     Priority:        2
25 |   - Regex:           '.*'
26 |     Priority:        1
27 | 


--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | vllm/*.so
2 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/100-documentation.yml:
--------------------------------------------------------------------------------
 1 | name: 📚 Documentation
 2 | description: Report an issue related to https://docs.vllm.ai/
 3 | title: "[Doc]: "
 4 | labels: ["documentation"]
 5 | 
 6 | body:
 7 | - type: textarea
 8 |   attributes:
 9 |     label: 📚 The doc issue
10 |     description: >
11 |       A clear and concise description of what content in https://docs.vllm.ai/ is an issue.
12 |   validations:
13 |     required: true
14 | - type: textarea
15 |   attributes:
16 |     label: Suggest a potential alternative/fix
17 |     description: >
18 |       Tell us how we could improve the documentation in this regard.
19 | - type: markdown
20 |   attributes:
21 |     value: >
22 |       Thanks for contributing 🎉!
23 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/200-installation.yml:
--------------------------------------------------------------------------------
 1 | name: 🛠️ Installation
 2 | description: Report an issue here when you hit errors during installation.
 3 | title: "[Installation]: "
 4 | labels: ["installation"]
 5 | 
 6 | body:
 7 | - type: markdown
 8 |   attributes:
 9 |     value: >
10 |       #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
11 | - type: textarea
12 |   attributes:
13 |     label: Your current environment
14 |     description: |
15 |       Please run the following and paste the output below.
16 |       ```sh
17 |       wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
18 |       # For security purposes, please feel free to check the contents of collect_env.py before running it.
19 |       python collect_env.py
20 |       ```
21 |       It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
22 |     value: |
23 |       ```text
24 |       The output of `python collect_env.py`
25 |       ```
26 |   validations:
27 |     required: true
28 | - type: textarea
29 |   attributes:
30 |     label: How you are installing vllm
31 |     description: |
32 |       Paste the full command you are trying to execute.
33 |     value: |
34 |       ```sh
35 |       pip install -vvv vllm
36 |       ```
37 | - type: markdown
38 |   attributes:
39 |     value: >
40 |       Thanks for contributing 🎉!
41 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/300-usage.yml:
--------------------------------------------------------------------------------
 1 | name: 💻 Usage
 2 | description: Raise an issue here if you don't know how to use vllm.
 3 | title: "[Usage]: "
 4 | labels: ["usage"]
 5 | 
 6 | body:
 7 | - type: markdown
 8 |   attributes:
 9 |     value: >
10 |       #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
11 | - type: textarea
12 |   attributes:
13 |     label: Your current environment
14 |     description: |
15 |       Please run the following and paste the output below.
16 |       ```sh
17 |       wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
18 |       # For security purposes, please feel free to check the contents of collect_env.py before running it.
19 |       python collect_env.py
20 |       ```
21 |       It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
22 |     value: |
23 |       ```text
24 |       The output of `python collect_env.py`
25 |       ```
26 |   validations:
27 |     required: true
28 | - type: textarea
29 |   attributes:
30 |     label: How would you like to use vllm
31 |     description: |
32 |       A detailed description of how you want to use vllm.
33 |     value: |
34 |       I want to run inference of a [specific model](put link here). I don't know how to integrate it with vllm.
35 | - type: markdown
36 |   attributes:
37 |     value: >
38 |       Thanks for contributing 🎉!
39 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/500-feature request.yml:
--------------------------------------------------------------------------------
 1 | name: 🚀 Feature request
 2 | description: Submit a proposal/request for a new vllm feature
 3 | title: "[Feature]: "
 4 | labels: ["feature request"]
 5 | 
 6 | body:
 7 | - type: markdown
 8 |   attributes:
 9 |     value: >
10 |       #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
11 | - type: textarea
12 |   attributes:
13 |     label: 🚀 The feature, motivation and pitch
14 |     description: >
15 |       A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too.
16 |   validations:
17 |     required: true
18 | - type: textarea
19 |   attributes:
20 |     label: Alternatives
21 |     description: >
22 |       A description of any alternative solutions or features you've considered, if any.
23 | - type: textarea
24 |   attributes:
25 |     label: Additional context
26 |     description: >
27 |       Add any other context or screenshots about the feature request.
28 | - type: markdown
29 |   attributes:
30 |     value: >
31 |       Thanks for contributing 🎉!
32 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/600-new model.yml:
--------------------------------------------------------------------------------
 1 | name: 🤗 Support request for a new model from huggingface
 2 | description: Submit a proposal/request for a new model from huggingface
 3 | title: "[New Model]: "
 4 | labels: ["new model"]
 5 | 
 6 | body:
 7 | - type: markdown
 8 |   attributes:
 9 |     value: >
10 |       #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
11 | 
12 |       #### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model.
13 | - type: textarea
14 |   attributes:
15 |     label: The model to consider.
16 |     description: >
17 |       A huggingface url, pointing to the model, e.g. https://huggingface.co/openai-community/gpt2 .
18 |   validations:
19 |     required: true
20 | - type: textarea
21 |   attributes:
22 |     label: The closest model vllm already supports.
23 |     description: >
24 |       Here is the list of models already supported by vllm: https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models . Which model is the most similar to the model you want to add support for?
25 | - type: textarea
26 |   attributes:
27 |     label: What's your difficulty of supporting the model you want?
28 |     description: >
29 |       For example, any new operators or new architecture?
30 | - type: markdown
31 |   attributes:
32 |     value: >
33 |       Thanks for contributing 🎉!
34 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/700-performance discussion.yml:
--------------------------------------------------------------------------------
 1 | name: ⚡ Discussion on the performance of vllm
 2 | description: Submit a proposal/discussion about the performance of vllm
 3 | title: "[Performance]: "
 4 | labels: ["performance"]
 5 | 
 6 | body:
 7 | - type: markdown
 8 |   attributes:
 9 |     value: >
10 |       #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
11 | - type: textarea
12 |   attributes:
13 |     label: Proposal to improve performance
14 |     description: >
15 |       How do you plan to improve vllm's performance?
16 |   validations:
17 |     required: false
18 | - type: textarea
19 |   attributes:
20 |     label: Report of performance regression
21 |     description: >
22 |       Please provide detailed description of performance comparison to confirm the regression. You may want to run the benchmark script at https://github.com/vllm-project/vllm/tree/main/benchmarks .
23 |   validations:
24 |     required: false
25 | - type: textarea
26 |   attributes:
27 |     label: Misc discussion on performance
28 |     description: >
29 |       Anything about the performance.
30 |   validations:
31 |     required: false
32 | - type: textarea
33 |   attributes:
34 |     label: Your current environment (if you think it is necessary)
35 |     description: |
36 |       Please run the following and paste the output below.
37 |       ```sh
38 |       wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
39 |       # For security purposes, please feel free to check the contents of collect_env.py before running it.
40 |       python collect_env.py
41 |       ```
42 |       It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
43 |     value: |
44 |       ```text
45 |       The output of `python collect_env.py`
46 |       ```
47 |   validations:
48 |     required: false
49 | - type: markdown
50 |   attributes:
51 |     value: >
52 |       Thanks for contributing 🎉!
53 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/750-RFC.yml:
--------------------------------------------------------------------------------
 1 | name: 💬 Request for comments (RFC).
 2 | description: Ask for feedback on major architectural changes or design choices.
 3 | title: "[RFC]: "
 4 | labels: ["RFC"]
 5 | 
 6 | body:
 7 | - type: markdown
 8 |   attributes:
 9 |     value: >
10 |       #### Please take a look at previous [RFCs](https://github.com/vllm-project/vllm/issues?q=label%3ARFC+sort%3Aupdated-desc) for reference.
11 | - type: textarea
12 |   attributes:
13 |     label: Motivation.
14 |     description: >
15 |       The motivation of the RFC.
16 |   validations:
17 |     required: true
18 | - type: textarea
19 |   attributes:
20 |     label: Proposed Change.
21 |     description: >
22 |       The proposed change of the RFC.
23 |   validations:
24 |     required: true
25 | - type: textarea
26 |   attributes:
27 |     label: Feedback Period.
28 |     description: >
29 |       The feedback period of the RFC. Usually at least one week.
30 |   validations:
31 |     required: false
32 | - type: textarea
33 |   attributes:
34 |     label: CC List.
35 |     description: >
36 |       The list of people you want to CC.
37 |   validations:
38 |     required: false
39 | - type: textarea
40 |   attributes:
41 |     label: Any Other Things.
42 |     description: >
43 |       Any other things you would like to mention.
44 |   validations:
45 |     required: false
46 | - type: markdown
47 |   attributes:
48 |     value: >
49 |       Thanks for contributing 🎉!
50 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/800-misc discussion.yml:
--------------------------------------------------------------------------------
 1 | name: 🎲 Misc/random discussions that do not fit into the above categories.
 2 | description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
 3 | title: "[Misc]: "
 4 | labels: ["misc"]
 5 | 
 6 | body:
 7 | - type: markdown
 8 |   attributes:
 9 |     value: >
10 |       #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
11 | - type: textarea
12 |   attributes:
13 |     label: Anything you want to discuss about vllm.
14 |     description: >
15 |       Anything you want to discuss about vllm.
16 |   validations:
17 |     required: true
18 | - type: markdown
19 |   attributes:
20 |     value: >
21 |       Thanks for contributing 🎉!
22 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 | 


--------------------------------------------------------------------------------
/.github/workflows/clang-format.yml:
--------------------------------------------------------------------------------
 1 | name: clang-format
 2 | 
 3 | on:
 4 |   # Trigger the workflow on push or pull request,
 5 |   # but only for the main branch
 6 |   push:
 7 |     branches:
 8 |       - main
 9 |   pull_request:
10 |     branches:
11 |       - main
12 | 
13 | jobs:
14 |   clang-format:
15 |     runs-on: ubuntu-latest
16 |     strategy:
17 |       matrix:
18 |         python-version: ["3.11"]
19 |     steps:
20 |     - uses: actions/checkout@v2
21 |     - name: Set up Python ${{ matrix.python-version }}
22 |       uses: actions/setup-python@v2
23 |       with:
24 |         python-version: ${{ matrix.python-version }}
25 |     - name: Install dependencies
26 |       run: |
27 |         python -m pip install --upgrade pip
28 |         pip install clang-format==18.1.5
29 |     - name: Running clang-format
30 |       run: |
31 |         EXCLUDES=(
32 |             'csrc/moe/topk_softmax_kernels.cu'
33 |             'csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu'
34 |             'csrc/punica/bgmv/bgmv_config.h'
35 |             'csrc/punica/bgmv/bgmv_impl.cuh'
36 |             'csrc/punica/bgmv/vec_dtypes.cuh'
37 |             'csrc/punica/punica_ops.cu'
38 |             'csrc/punica/type_convert.h'
39 |         )
40 |         find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
41 |             | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
42 |             | xargs clang-format --dry-run --Werror


--------------------------------------------------------------------------------
/.github/workflows/mypy.yaml:
--------------------------------------------------------------------------------
 1 | name: mypy
 2 | 
 3 | on:
 4 |   # Trigger the workflow on push or pull request,
 5 |   # but only for the main branch
 6 |   push:
 7 |     branches:
 8 |       - main
 9 |   pull_request:
10 |     branches:
11 |       - main
12 | 
13 | jobs:
14 |   ruff:
15 |     runs-on: ubuntu-latest
16 |     strategy:
17 |       matrix:
18 |         python-version: ["3.8", "3.9", "3.10", "3.11"]
19 |     steps:
20 |     - uses: actions/checkout@v2
21 |     - name: Set up Python ${{ matrix.python-version }}
22 |       uses: actions/setup-python@v2
23 |       with:
24 |         python-version: ${{ matrix.python-version }}
25 |     - name: Install dependencies
26 |       run: |
27 |         python -m pip install --upgrade pip
28 |         pip install mypy==1.9.0
29 |         pip install types-setuptools
30 |         pip install types-PyYAML
31 |         pip install types-requests
32 |         pip install types-setuptools
33 |     - name: Mypy
34 |       run: |
35 |         mypy vllm/attention --config-file pyproject.toml
36 |         mypy vllm/core --config-file pyproject.toml
37 |         mypy vllm/distributed --config-file pyproject.toml
38 |         mypy vllm/entrypoints --config-file pyproject.toml
39 |         mypy vllm/executor --config-file pyproject.toml
40 |         mypy vllm/multimodal --config-file pyproject.toml
41 |         mypy vllm/usage --config-file pyproject.toml
42 |         mypy vllm/*.py --config-file pyproject.toml
43 |         mypy vllm/transformers_utils --config-file pyproject.toml
44 |         mypy vllm/engine  --config-file pyproject.toml
45 |         mypy vllm/worker --config-file pyproject.toml
46 |         mypy vllm/spec_decode --config-file pyproject.toml
47 |         mypy vllm/model_executor  --config-file pyproject.toml
48 |         mypy vllm/lora --config-file pyproject.toml
49 |         mypy vllm/logging --config-file pyproject.toml
50 |         mypy vllm/model_executor --config-file pyproject.toml
51 | 
52 | 


--------------------------------------------------------------------------------
/.github/workflows/ruff.yml:
--------------------------------------------------------------------------------
 1 | name: ruff
 2 | 
 3 | on:
 4 |   # Trigger the workflow on push or pull request,
 5 |   # but only for the main branch
 6 |   push:
 7 |     branches:
 8 |       - main
 9 |   pull_request:
10 |     branches:
11 |       - main
12 | 
13 | jobs:
14 |   ruff:
15 |     runs-on: ubuntu-latest
16 |     strategy:
17 |       matrix:
18 |         python-version: ["3.8", "3.9", "3.10", "3.11"]
19 |     steps:
20 |     - uses: actions/checkout@v2
21 |     - name: Set up Python ${{ matrix.python-version }}
22 |       uses: actions/setup-python@v2
23 |       with:
24 |         python-version: ${{ matrix.python-version }}
25 |     - name: Install dependencies
26 |       run: |
27 |         python -m pip install --upgrade pip
28 |         pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2
29 |     - name: Analysing the code with ruff
30 |       run: |
31 |         ruff .
32 |     - name: Spelling check with codespell
33 |       run: |
34 |         codespell --toml pyproject.toml
35 |     - name: Run isort
36 |       run: |
37 |         isort . --check-only
38 | 


--------------------------------------------------------------------------------
/.github/workflows/scripts/build.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | python_executable=python$1
 4 | cuda_home=/usr/local/cuda-$2
 5 | 
 6 | # Update paths
 7 | PATH=${cuda_home}/bin:$PATH
 8 | LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
 9 | 
10 | # Install requirements
11 | $python_executable -m pip install wheel packaging
12 | $python_executable -m pip install -r requirements-cuda.txt
13 | 
14 | # Limit the number of parallel jobs to avoid OOM
15 | export MAX_JOBS=1
16 | # Make sure punica is built for the release (for LoRA)
17 | export VLLM_INSTALL_PUNICA_KERNELS=1
18 | # Make sure release wheels are built for the following architectures
19 | export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
20 | # Build
21 | $python_executable setup.py bdist_wheel --dist-dir=dist
22 | 


--------------------------------------------------------------------------------
/.github/workflows/scripts/create_release.js:
--------------------------------------------------------------------------------
 1 | // Uses Github's API to create the release and wait for result.
 2 | // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately.
 3 | 
 4 | module.exports = async (github, context, core) => {
 5 | 	try {
 6 | 		const response = await github.rest.repos.createRelease({
 7 | 			draft: false,
 8 | 			generate_release_notes: true,
 9 | 			name: process.env.RELEASE_TAG,
10 | 			owner: context.repo.owner,
11 | 			prerelease: true,
12 | 			repo: context.repo.repo,
13 | 			tag_name: process.env.RELEASE_TAG,
14 | 		});
15 | 
16 | 		core.setOutput('upload_url', response.data.upload_url);
17 | 	} catch (error) {
18 | 		core.setFailed(error.message);
19 | 	}
20 | }


--------------------------------------------------------------------------------
/.github/workflows/scripts/cuda-install.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Replace '.' with '-' ex: 11.8 -> 11-8
 4 | cuda_version=$(echo $1 | tr "." "-")
 5 | # Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004
 6 | OS=$(echo $2 | tr -d ".\-")
 7 | 
 8 | # Installs CUDA
 9 | wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb
10 | sudo dpkg -i cuda-keyring_1.1-1_all.deb
11 | rm cuda-keyring_1.1-1_all.deb
12 | sudo apt -qq update
13 | sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version}
14 | sudo apt clean
15 | 
16 | # Test nvcc
17 | PATH=/usr/local/cuda-$1/bin:${PATH}
18 | nvcc --version
19 | 
20 | # Log gcc, g++, c++ versions
21 | gcc --version
22 | g++ --version
23 | c++ --version
24 | 


--------------------------------------------------------------------------------
/.github/workflows/scripts/env.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # This file installs common linux environment tools
 4 | 
 5 | export LANG C.UTF-8
 6 | 
 7 | # python_version=$1
 8 | 
 9 | sudo    apt-get update && \
10 | sudo    apt-get install -y --no-install-recommends \
11 |         software-properties-common \
12 | 
13 | sudo    apt-get install -y --no-install-recommends \
14 |         build-essential \
15 |         apt-utils \
16 |         ca-certificates \
17 |         wget \
18 |         git \
19 |         vim \
20 |         libssl-dev \
21 |         curl \
22 |         unzip \
23 |         unrar \
24 |         cmake \
25 |         net-tools \
26 |         sudo \
27 |         autotools-dev \
28 |         rsync \
29 |         jq \
30 |         openssh-server \
31 |         tmux \
32 |         screen \
33 |         htop \
34 |         pdsh \
35 |         openssh-client \
36 |         lshw \
37 |         dmidecode \
38 |         util-linux \
39 |         automake \
40 |         autoconf \
41 |         libtool \
42 |         net-tools \
43 |         pciutils \
44 |         libpci-dev \
45 |         libaio-dev \
46 |         libcap2 \
47 |         libtinfo5 \
48 |         fakeroot \
49 |         devscripts \
50 |         debhelper \
51 |         nfs-common
52 | 
53 | # Remove github bloat files to free up disk space
54 | sudo rm -rf "/usr/local/share/boost"
55 | sudo rm -rf "$AGENT_TOOLSDIRECTORY"
56 | sudo rm -rf "/usr/share/dotnet"
57 | 


--------------------------------------------------------------------------------
/.github/workflows/scripts/pytorch-install.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | python_executable=python$1
 4 | pytorch_version=$2
 5 | cuda_version=$3
 6 | 
 7 | # Install torch
 8 | $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya
 9 | $python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./}
10 | 
11 | # Print version information
12 | $python_executable --version
13 | $python_executable -c "import torch; print('PyTorch:', torch.__version__)"
14 | $python_executable -c "import torch; print('CUDA:', torch.version.cuda)"
15 | $python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)"
16 | 


--------------------------------------------------------------------------------
/.github/workflows/yapf.yml:
--------------------------------------------------------------------------------
 1 | name: yapf
 2 | 
 3 | on:
 4 |   # Trigger the workflow on push or pull request,
 5 |   # but only for the main branch
 6 |   push:
 7 |     branches:
 8 |       - main
 9 |   pull_request:
10 |     branches:
11 |       - main
12 | jobs:
13 |   yapf:
14 |     runs-on: ubuntu-latest
15 |     strategy:
16 |       matrix:
17 |         python-version: ["3.8", "3.9", "3.10", "3.11"]
18 |     steps:
19 |     - uses: actions/checkout@v2
20 |     - name: Set up Python ${{ matrix.python-version }}
21 |       uses: actions/setup-python@v2
22 |       with:
23 |         python-version: ${{ matrix.python-version }}
24 |     - name: Install dependencies
25 |       run: |
26 |         python -m pip install --upgrade pip
27 |         pip install yapf==0.32.0
28 |         pip install toml==0.10.2
29 |     - name: Running yapf
30 |       run: |
31 |         yapf --diff --recursive .
32 | 


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | # Read the Docs configuration file
 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 3 | 
 4 | version: 2
 5 | 
 6 | build:
 7 |   os: ubuntu-22.04
 8 |   tools:
 9 |     python: "3.8"
10 | 
11 | sphinx:
12 |    configuration: docs/source/conf.py
13 | 
14 | # If using Sphinx, optionally build your docs in additional formats such as PDF
15 | formats:
16 |    - pdf
17 | 
18 | # Optionally declare the Python requirements required to build your docs
19 | python:
20 |    install:
21 |    - requirements: docs/requirements-docs.txt
22 | 


--------------------------------------------------------------------------------
/.yapfignore:
--------------------------------------------------------------------------------
1 | collect_env.py
2 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing to vLLM
 2 | 
 3 | Thank you for your interest in contributing to vLLM!
 4 | Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large.
 5 | There are several ways you can contribute to the project:
 6 | 
 7 | - Identify and report any issues or bugs.
 8 | - Request or add a new model.
 9 | - Suggest or implement new features.
10 | 
11 | However, remember that contributions aren't just about code.
12 | We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions.
13 | 
14 | Finally, one of the most impactful ways to support us is by raising awareness about vLLM.
15 | Talk about it in your blog posts, highlighting how it's driving your incredible projects.
16 | Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository.
17 | 
18 | 
19 | ## Setup for development
20 | 
21 | ### Build from source
22 | 
23 | ```bash
24 | pip install -e .  # This may take several minutes.
25 | ```
26 | 
27 | ### Testing
28 | 
29 | ```bash
30 | pip install -r requirements-dev.txt
31 | 
32 | # linting and formatting
33 | bash format.sh
34 | # Static type checking
35 | mypy
36 | # Unit tests
37 | pytest tests/
38 | ```
39 | **Note:** Currently, the repository does not pass the mypy tests.
40 | 
41 | 
42 | ## Contributing Guidelines
43 | 
44 | ### Issue Reporting
45 | 
46 | If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it.
47 | If not, please file a new issue, providing as much relevant information as possible.
48 | 
49 | ### Pull Requests & Code Reviews
50 | 
51 | Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE.md) for detailed guide for contribution.
52 | 
53 | ### Thank You
54 | 
55 | Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM.
56 | Your contributions make vLLM a great tool for everyone!
57 | 


--------------------------------------------------------------------------------
/Dockerfile.cpu:
--------------------------------------------------------------------------------
 1 | # This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
 2 | 
 3 | FROM ubuntu:22.04
 4 | 
 5 | RUN apt-get update  -y \
 6 |     && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
 7 |     && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
 8 | 
 9 | RUN pip install --upgrade pip \
10 |     && pip install wheel packaging ninja setuptools>=49.4.0 numpy
11 | 
12 | COPY ./ /workspace/vllm
13 | 
14 | WORKDIR /workspace/vllm
15 | 
16 | RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
17 | 
18 | RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
19 | 
20 | WORKDIR /workspace/
21 | 
22 | CMD ["/bin/bash"]
23 | 


--------------------------------------------------------------------------------
/Dockerfile.neuron:
--------------------------------------------------------------------------------
 1 | # default base image
 2 | ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04"
 3 | 
 4 | FROM $BASE_IMAGE
 5 | 
 6 | RUN echo "Base image is $BASE_IMAGE"
 7 | 
 8 | # Install some basic utilities
 9 | RUN apt-get update && apt-get install python3 python3-pip -y
10 | 
11 | ### Mount Point ###
12 | # When launching the container, mount the code directory to /app
13 | ARG APP_MOUNT=/app
14 | VOLUME [ ${APP_MOUNT} ]
15 | WORKDIR ${APP_MOUNT}
16 | 
17 | RUN python3 -m pip install --upgrade pip
18 | RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
19 | RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
20 | RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
21 | RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
22 | 
23 | COPY ./vllm /app/vllm/vllm
24 | COPY ./setup.py /app/vllm/setup.py
25 | COPY ./requirements-common.txt /app/vllm/requirements-common.txt
26 | COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
27 | 
28 | RUN cd /app/vllm \
29 |     && python3 -m pip install -U -r requirements-neuron.txt
30 | 
31 | ENV VLLM_BUILD_WITH_NEURON 1
32 | RUN cd /app/vllm \
33 |     && pip install -e . \
34 |     && cd ..
35 | 
36 | CMD ["/bin/bash"]
37 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | include LICENSE
 2 | include requirements-common.txt
 3 | include requirements-cuda.txt
 4 | include requirements-rocm.txt
 5 | include requirements-neuron.txt
 6 | include requirements-cpu.txt
 7 | include CMakeLists.txt
 8 | 
 9 | recursive-include cmake *
10 | recursive-include csrc *
11 | 


--------------------------------------------------------------------------------
/benchmarks/README.md:
--------------------------------------------------------------------------------
1 | # Benchmarking vLLM
2 | 
3 | ## Downloading the ShareGPT dataset
4 | 
5 | You can download the dataset by running:
6 | ```bash
7 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
8 | ```
9 | 


--------------------------------------------------------------------------------
/benchmarks/cutlass_benchmarks/weight_shapes.py:
--------------------------------------------------------------------------------
 1 | # Weight Shapes are in the format
 2 | # ([K, N], TP_SPLIT_DIM)
 3 | # Example:
 4 | #  A shape of ([14336, 4096], 0) indicates the following GEMM shape,
 5 | #   - TP1 : K = 14336, N = 4096
 6 | #   - TP2 : K = 7168, N = 4096
 7 | #  A shape of ([4096, 6144], 1) indicates the following GEMM shape,
 8 | #   - TP1 : K = 4096, N = 6144
 9 | #   - TP4 : K = 4096, N = 1536
10 | 
11 | # TP1 shapes
12 | WEIGHT_SHAPES = {
13 |     "mistralai/Mistral-7B-v0.1": [
14 |         ([4096, 6144], 1),
15 |         ([4096, 4096], 0),
16 |         ([4096, 28672], 1),
17 |         ([14336, 4096], 0),
18 |     ],
19 |     "meta-llama/Llama-2-7b-hf": [
20 |         ([4096, 12288], 1),
21 |         ([4096, 4096], 0),
22 |         ([4096, 22016], 1),
23 |         ([11008, 4096], 0),
24 |     ],
25 |     "meta-llama/Llama-2-13b-hf": [
26 |         ([5120, 15360], 1),
27 |         ([5120, 5120], 0),
28 |         ([5120, 27648], 1),
29 |         ([13824, 5120], 0),
30 |     ],
31 |     "meta-llama/Llama-2-70b-hf": [
32 |         ([8192, 10240], 1),
33 |         ([8192, 8192], 0),
34 |         ([8192, 57344], 1),
35 |         ([28672, 8192], 0),
36 |     ],
37 | }
38 | 


--------------------------------------------------------------------------------
/benchmarks/kernels/benchmark_shapes.py:
--------------------------------------------------------------------------------
 1 | WEIGHT_SHAPES = {
 2 |     "ideal": [[4 * 256 * 32, 256 * 32]],
 3 |     "mistralai/Mistral-7B-v0.1/TP1": [
 4 |         [4096, 6144],
 5 |         [4096, 4096],
 6 |         [4096, 28672],
 7 |         [14336, 4096],
 8 |     ],
 9 |     "mistralai/Mistral-7B-v0.1/TP2": [
10 |         [4096, 3072],
11 |         [2048, 4096],
12 |         [4096, 14336],
13 |         [7168, 4096],
14 |     ],
15 |     "mistralai/Mistral-7B-v0.1/TP4": [
16 |         [4096, 1536],
17 |         [1024, 4096],
18 |         [4096, 7168],
19 |         [3584, 4096],
20 |     ],
21 |     "meta-llama/Llama-2-7b-hf/TP1": [
22 |         [4096, 12288],
23 |         [4096, 4096],
24 |         [4096, 22016],
25 |         [11008, 4096],
26 |     ],
27 |     "meta-llama/Llama-2-7b-hf/TP2": [
28 |         [4096, 6144],
29 |         [2048, 4096],
30 |         [4096, 11008],
31 |         [5504, 4096],
32 |     ],
33 |     "meta-llama/Llama-2-7b-hf/TP4": [
34 |         [4096, 3072],
35 |         [1024, 4096],
36 |         [4096, 5504],
37 |         [2752, 4096],
38 |     ],
39 |     "meta-llama/Llama-2-13b-hf/TP1": [
40 |         [5120, 15360],
41 |         [5120, 5120],
42 |         [5120, 27648],
43 |         [13824, 5120],
44 |     ],
45 |     "meta-llama/Llama-2-13b-hf/TP2": [
46 |         [5120, 7680],
47 |         [2560, 5120],
48 |         [5120, 13824],
49 |         [6912, 5120],
50 |     ],
51 |     "meta-llama/Llama-2-13b-hf/TP4": [
52 |         [5120, 3840],
53 |         [1280, 5120],
54 |         [5120, 6912],
55 |         [3456, 5120],
56 |     ],
57 |     "meta-llama/Llama-2-70b-hf/TP1": [
58 |         [8192, 10240],
59 |         [8192, 8192],
60 |         [8192, 57344],
61 |         [28672, 8192],
62 |     ],
63 |     "meta-llama/Llama-2-70b-hf/TP2": [
64 |         [8192, 5120],
65 |         [4096, 8192],
66 |         [8192, 28672],
67 |         [14336, 8192],
68 |     ],
69 |     "meta-llama/Llama-2-70b-hf/TP4": [
70 |         [8192, 2560],
71 |         [2048, 8192],
72 |         [8192, 14336],
73 |         [7168, 8192],
74 |     ],
75 | }
76 | 


--------------------------------------------------------------------------------
/benchmarks/launch_tgi_server.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | PORT=8000
 4 | MODEL=$1
 5 | TOKENS=$2
 6 | 
 7 | docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \
 8 |            -v $PWD/data:/data \
 9 |            ghcr.io/huggingface/text-generation-inference:1.4.0 \
10 |            --model-id $MODEL \
11 |            --sharded false  \
12 |            --max-input-length 1024 \
13 |            --max-total-tokens 2048 \
14 |            --max-best-of 5 \
15 |            --max-concurrent-requests 5000 \
16 |            --max-batch-total-tokens $TOKENS
17 | 


--------------------------------------------------------------------------------
/csrc/attention/attention_dtypes.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "attention_generic.cuh"
4 | #include "dtype_float16.cuh"
5 | #include "dtype_float32.cuh"
6 | #include "dtype_bfloat16.cuh"
7 | #include "dtype_fp8.cuh"
8 | 


--------------------------------------------------------------------------------
/csrc/attention/attention_generic.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Adapted from
 3 |  * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
 4 |  * Copyright (c) 2023, The vLLM team.
 5 |  * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
 6 |  *
 7 |  * Licensed under the Apache License, Version 2.0 (the "License");
 8 |  * you may not use this file except in compliance with the License.
 9 |  * You may obtain a copy of the License at
10 |  *
11 |  *     http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  * Unless required by applicable law or agreed to in writing, software
14 |  * distributed under the License is distributed on an "AS IS" BASIS,
15 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  * See the License for the specific language governing permissions and
17 |  * limitations under the License.
18 |  */
19 | #pragma once
20 | 
21 | #include <stdint.h>
22 | 
23 | namespace vllm {
24 | 
25 | // A vector type to store Q, K, V elements.
26 | template <typename T, int VEC_SIZE>
27 | struct Vec {};
28 | 
29 | // A vector type to store FP32 accumulators.
30 | template <typename T>
31 | struct FloatVec {};
32 | 
33 | // Template vector operations.
34 | template <typename Acc, typename A, typename B>
35 | inline __device__ Acc mul(A a, B b);
36 | 
37 | template <typename T>
38 | inline __device__ float sum(T v);
39 | 
40 | template <typename T>
41 | inline __device__ float dot(T a, T b) {
42 |   return sum(mul<T, T, T>(a, b));
43 | }
44 | 
45 | template <typename A, typename T>
46 | inline __device__ float dot(T a, T b) {
47 |   return sum(mul<A, T, T>(a, b));
48 | }
49 | 
50 | template <typename T>
51 | inline __device__ void zero(T& dst) {
52 |   constexpr int WORDS = sizeof(T) / 4;
53 |   union {
54 |     T raw;
55 |     uint32_t words[WORDS];
56 |   } tmp;
57 | 
58 | #pragma unroll
59 |   for (int ii = 0; ii < WORDS; ++ii) {
60 |     tmp.words[ii] = 0u;
61 |   }
62 |   dst = tmp.raw;
63 | }
64 | 
65 | }  // namespace vllm
66 | 


--------------------------------------------------------------------------------
/csrc/attention/attention_utils.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Adapted from
 3 |  * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
 4 |  * Copyright (c) 2023, The vLLM team.
 5 |  * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
 6 |  *
 7 |  * Licensed under the Apache License, Version 2.0 (the "License");
 8 |  * you may not use this file except in compliance with the License.
 9 |  * You may obtain a copy of the License at
10 |  *
11 |  *     http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  * Unless required by applicable law or agreed to in writing, software
14 |  * distributed under the License is distributed on an "AS IS" BASIS,
15 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  * See the License for the specific language governing permissions and
17 |  * limitations under the License.
18 |  */
19 | #pragma once
20 | 
21 | #include "../cuda_compat.h"
22 | #include "attention_dtypes.h"
23 | 
24 | #include <float.h>
25 | #include <type_traits>
26 | 
27 | namespace vllm {
28 | 
29 | // Q*K^T operation.
30 | template <int THREAD_GROUP_SIZE, typename Vec, int N>
31 | inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
32 |   using A_vec = typename FloatVec<Vec>::Type;
33 |   // Compute the parallel products for Q*K^T (treat vector lanes separately).
34 |   A_vec qk_vec = mul<A_vec, Vec, Vec>(q[0], k[0]);
35 | #pragma unroll
36 |   for (int ii = 1; ii < N; ++ii) {
37 |     qk_vec = fma(q[ii], k[ii], qk_vec);
38 |   }
39 | 
40 |   // Finalize the reduction across lanes.
41 |   float qk = sum(qk_vec);
42 | #pragma unroll
43 |   for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) {
44 |     qk += VLLM_SHFL_XOR_SYNC(qk, mask);
45 |   }
46 |   return qk;
47 | }
48 | 
49 | template <typename T, int THREAD_GROUP_SIZE>
50 | struct Qk_dot {
51 |   template <typename Vec, int N>
52 |   static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) {
53 |     return qk_dot_<THREAD_GROUP_SIZE>(q, k);
54 |   }
55 | };
56 | 
57 | }  // namespace vllm
58 | 


--------------------------------------------------------------------------------
/csrc/attention/dtype_fp8.cuh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "attention_generic.cuh"
 4 | 
 5 | #include <stdint.h>
 6 | #ifdef ENABLE_FP8
 7 |   #ifndef USE_ROCM
 8 |     #include <cuda_fp8.h>
 9 |   #endif  // USE_ROCM
10 | #endif    // ENABLE_FP8
11 | 
12 | namespace vllm {
13 | 
14 | enum class Fp8KVCacheDataType {
15 |   kAuto = 0,
16 |   kFp8E4M3 = 1,
17 |   kFp8E5M2 = 2,
18 | };
19 | 
20 | // fp8 vector types for quantization of kv cache
21 | template <>
22 | struct Vec<uint8_t, 1> {
23 |   using Type = uint8_t;
24 | };
25 | 
26 | template <>
27 | struct Vec<uint8_t, 2> {
28 |   using Type = uint16_t;
29 | };
30 | 
31 | template <>
32 | struct Vec<uint8_t, 4> {
33 |   using Type = uint32_t;
34 | };
35 | 
36 | template <>
37 | struct Vec<uint8_t, 8> {
38 |   using Type = uint2;
39 | };
40 | 
41 | }  // namespace vllm
42 | 


--------------------------------------------------------------------------------
/csrc/cache.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <torch/extension.h>
 4 | 
 5 | #include <map>
 6 | #include <vector>
 7 | 
 8 | void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
 9 |                  const torch::Tensor& block_mapping);
10 | 
11 | void copy_blocks(std::vector<torch::Tensor>& key_caches,
12 |                  std::vector<torch::Tensor>& value_caches,
13 |                  const torch::Tensor& block_mapping);
14 | 
15 | void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
16 |                        torch::Tensor& key_cache, torch::Tensor& value_cache,
17 |                        torch::Tensor& slot_mapping,
18 |                        const std::string& kv_cache_dtype, const float kv_scale);
19 | 
20 | void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,
21 |                              torch::Tensor& key_cache,
22 |                              torch::Tensor& value_cache,
23 |                              torch::Tensor& slot_mapping,
24 |                              const std::string& kv_cache_dtype);
25 | 
26 | // Just for unittest
27 | void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
28 |                  const float scale, const std::string& kv_cache_dtype);
29 | 


--------------------------------------------------------------------------------
/csrc/cpu/pybind.cpp:
--------------------------------------------------------------------------------
 1 | #include "cache.h"
 2 | #include "cuda_utils.h"
 3 | #include "ops.h"
 4 | #include <torch/extension.h>
 5 | 
 6 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
 7 |   // vLLM custom ops
 8 |   pybind11::module ops = m.def_submodule("ops", "vLLM custom operators");
 9 | 
10 |   // Attention ops
11 |   ops.def("paged_attention_v1", &paged_attention_v1,
12 |           "Compute the attention between an input query and the cached "
13 |           "keys/values using PagedAttention.");
14 |   ops.def("paged_attention_v2", &paged_attention_v2, "PagedAttention V2.");
15 | 
16 |   // Activation ops
17 |   ops.def("silu_and_mul", &silu_and_mul, "Activation function used in SwiGLU.");
18 |   ops.def("gelu_and_mul", &gelu_and_mul,
19 |           "Activation function used in GeGLU with `none` approximation.");
20 |   ops.def("gelu_tanh_and_mul", &gelu_tanh_and_mul,
21 |           "Activation function used in GeGLU with `tanh` approximation.");
22 |   ops.def("gelu_new", &gelu_new, "GELU implementation used in GPT-2.");
23 |   ops.def("gelu_fast", &gelu_fast, "Approximate GELU implementation.");
24 | 
25 |   // Layernorm
26 |   ops.def("rms_norm", &rms_norm,
27 |           "Apply Root Mean Square (RMS) Normalization to the input tensor.");
28 | 
29 |   ops.def("fused_add_rms_norm", &fused_add_rms_norm,
30 |           "In-place fused Add and RMS Normalization");
31 | 
32 |   // Rotary embedding
33 |   ops.def("rotary_embedding", &rotary_embedding,
34 |           "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
35 | 
36 |   // Cache ops
37 |   pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
38 |   cache_ops.def("swap_blocks", &swap_blocks,
39 |                 "Swap in (out) the cache blocks from src to dst");
40 |   cache_ops.def("copy_blocks", &copy_blocks,
41 |                 "Copy the cache blocks from src to dst");
42 |   cache_ops.def("reshape_and_cache", &reshape_and_cache,
43 |                 "Reshape the key and value tensors and cache them");
44 | }
45 | 


--------------------------------------------------------------------------------
/csrc/cuda_compat.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #ifdef USE_ROCM
 4 |   #include <hip/hip_runtime.h>
 5 | #endif
 6 | 
 7 | #ifndef USE_ROCM
 8 |   #define WARP_SIZE 32
 9 | #else
10 |   #define WARP_SIZE warpSize
11 | #endif
12 | 
13 | #ifndef USE_ROCM
14 |   #define VLLM_LDG(arg) __ldg(arg)
15 | #else
16 |   #define VLLM_LDG(arg) *(arg)
17 | #endif
18 | 
19 | #ifndef USE_ROCM
20 |   #define VLLM_SHFL_XOR_SYNC(var, lane_mask) \
21 |     __shfl_xor_sync(uint32_t(-1), var, lane_mask)
22 |   #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
23 |     __shfl_xor_sync(uint32_t(-1), var, lane_mask, width)
24 | #else
25 |   #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask)
26 |   #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
27 |     __shfl_xor(var, lane_mask, width)
28 | #endif
29 | 
30 | #ifndef USE_ROCM
31 |   #define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane)
32 | #else
33 |   #define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane)
34 | #endif
35 | 
36 | #ifndef USE_ROCM
37 |   #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) \
38 |     __shfl_down_sync(uint32_t(-1), var, lane_delta)
39 | #else
40 |   #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) __shfl_down(var, lane_delta)
41 | #endif
42 | 
43 | #ifndef USE_ROCM
44 |   #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
45 |     cudaFuncSetAttribute(FUNC, cudaFuncAttributeMaxDynamicSharedMemorySize, VAL)
46 | #else
47 |   #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
48 |     hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL)
49 | #endif
50 | 


--------------------------------------------------------------------------------
/csrc/cuda_utils.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <torch/extension.h>
4 | 
5 | int get_device_attribute(int attribute, int device_id);
6 | 
7 | int get_max_shared_memory_per_block_device_attribute(int device_id);
8 | 


--------------------------------------------------------------------------------
/csrc/cuda_utils_kernels.cu:
--------------------------------------------------------------------------------
 1 | #ifdef USE_ROCM
 2 |   #include <hip/hip_runtime.h>
 3 |   #include <hip/hip_runtime_api.h>
 4 | #endif
 5 | int get_device_attribute(int attribute, int device_id) {
 6 |   int device, value;
 7 |   if (device_id < 0) {
 8 |     cudaGetDevice(&device);
 9 |   } else {
10 |     device = device_id;
11 |   }
12 |   cudaDeviceGetAttribute(&value, static_cast<cudaDeviceAttr>(attribute),
13 |                          device);
14 |   return value;
15 | }
16 | 
17 | int get_max_shared_memory_per_block_device_attribute(int device_id) {
18 |   int attribute;
19 |   // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html
20 |   // cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74
21 | 
22 | #ifdef USE_ROCM
23 |   attribute = hipDeviceAttributeMaxSharedMemoryPerBlock;
24 | #else
25 |   attribute = cudaDevAttrMaxSharedMemoryPerBlockOptin;
26 | #endif
27 | 
28 |   return get_device_attribute(attribute, device_id);
29 | }
30 | 


--------------------------------------------------------------------------------
/csrc/dispatch_utils.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Adapted from
 3 |  * https://github.com/pytorch/pytorch/blob/v2.0.1/aten/src/ATen/Dispatch.h
 4 |  */
 5 | #pragma once
 6 | 
 7 | #include <torch/extension.h>
 8 | 
 9 | #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)         \
10 |   AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
11 |   AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)  \
12 |   AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
13 | 
14 | #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
15 |   AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
16 | 
17 | #define VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...)   \
18 |   AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)    \
19 |   AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)     \
20 |   AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
21 |   AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)
22 | 
23 | #define VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(TYPE, NAME, ...) \
24 |   AT_DISPATCH_SWITCH(TYPE, NAME,                               \
25 |                      VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(__VA_ARGS__))
26 | 
27 | #define VLLM_DISPATCH_CASE_INTEGRAL_TYPES(...)         \
28 |   AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)  \
29 |   AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)  \
30 |   AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \
31 |   AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__)   \
32 |   AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)
33 | 
34 | #define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \
35 |   AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__))
36 | 


--------------------------------------------------------------------------------
/csrc/moe/moe_ops.cpp:
--------------------------------------------------------------------------------
1 | #include "moe_ops.h"
2 | 
3 | #include <torch/extension.h>
4 | 
5 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
6 |   m.def("topk_softmax", &topk_softmax,
7 |         "Apply topk softmax to the gating outputs.");
8 | }
9 | 


--------------------------------------------------------------------------------
/csrc/moe/moe_ops.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <torch/extension.h>
4 | 
5 | void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
6 |                   torch::Tensor& token_expert_indices,
7 |                   torch::Tensor& gating_output);
8 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu:
--------------------------------------------------------------------------------
1 | #include "bgmv_config.h"
2 | #include "bgmv_impl.cuh"
3 | 
4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_bfloat16, nv_bfloat16)
5 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_bfloat16, nv_bfloat16, nv_bfloat16)
6 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu:
--------------------------------------------------------------------------------
1 | #include "bgmv_config.h"
2 | #include "bgmv_impl.cuh"
3 | 
4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, float, nv_bfloat16)
5 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_bfloat16, float, nv_bfloat16)
6 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu:
--------------------------------------------------------------------------------
1 | #include "bgmv_config.h"
2 | #include "bgmv_impl.cuh"
3 | 
4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_half, nv_half)
5 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_half, nv_half, nv_half)
6 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu:
--------------------------------------------------------------------------------
1 | #include "bgmv_config.h"
2 | #include "bgmv_impl.cuh"
3 | 
4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, float, nv_half)
5 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_half, float, nv_half)
6 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu:
--------------------------------------------------------------------------------
1 | #include "bgmv_config.h"
2 | #include "bgmv_impl.cuh"
3 | 
4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_bfloat16, nv_bfloat16)
5 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, float, nv_bfloat16, nv_bfloat16)
6 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu:
--------------------------------------------------------------------------------
1 | #include "bgmv_config.h"
2 | #include "bgmv_impl.cuh"
3 | 
4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_half, nv_half)
5 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, float, nv_half, nv_half)
6 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/generator.py:
--------------------------------------------------------------------------------
 1 | DTYPES = ["fp16", "bf16", "fp32"]
 2 | DTYPE_MAP = {
 3 |     "fp16": "nv_half",
 4 |     "bf16": "nv_bfloat16",
 5 |     "fp32": "float",
 6 | }
 7 | 
 8 | TEMPLATE = """
 9 | #include "bgmv_config.h"
10 | #include "bgmv_impl.cuh"
11 | 
12 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, {input_dtype}, {output_dtype}, {weight_dtype})
13 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, {input_dtype}, {output_dtype}, {weight_dtype})
14 | """.lstrip()  # noqa: E501
15 | 
16 | for input_dtype in DTYPES:
17 |     for output_dtype in DTYPES:
18 |         for weight_dtype in DTYPES:
19 |             if weight_dtype == "fp32":
20 |                 # FP32 weights are not supported.
21 |                 continue
22 |             if output_dtype == "fp32":
23 |                 # LoRA A matrix.
24 |                 if input_dtype != weight_dtype:
25 |                     # NOTE(woosuk): While Punica supports the case where the
26 |                     # input and weight dtypes are different, we only generate
27 |                     # the kernels the same dtypes to reduce the binary size.
28 |                     continue
29 |             elif input_dtype == "fp32":
30 |                 # LoRA B matrix.
31 |                 if output_dtype != weight_dtype:
32 |                     # NOTE(woosuk): While Punica supports the case where the
33 |                     # output and weight dtypes are different, we only generate
34 |                     # the kernels the same dtypes to reduce the binary size.
35 |                     continue
36 |             elif not (input_dtype == output_dtype == weight_dtype):
37 |                 # NOTE(woosuk): While Punica supports mixed data types for
38 |                 # input, output, and weight, we only generate the kernels with
39 |                 # the same data types to reduce the binary size.
40 |                 continue
41 | 
42 |             kernel_definition = TEMPLATE.format(
43 |                 input_dtype=DTYPE_MAP[input_dtype],
44 |                 output_dtype=DTYPE_MAP[output_dtype],
45 |                 weight_dtype=DTYPE_MAP[weight_dtype])
46 |             filename = f"bgmv_{input_dtype}_{output_dtype}_{weight_dtype}.cu"
47 |             with open(filename, "w") as f:
48 |                 f.write(kernel_definition)
49 | 


--------------------------------------------------------------------------------
/csrc/punica/punica_ops.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <torch/extension.h>
 4 | 
 5 | void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w,
 6 |                    torch::Tensor indicies, int64_t layer_idx, float scale);
 7 | 
 8 | void dispatch_bgmv_low_level(torch::Tensor y, torch::Tensor x, torch::Tensor w,
 9 |                              torch::Tensor indicies, int64_t layer_idx,
10 |                              float scale, int64_t h_in, int64_t h_out,
11 |                              int64_t y_offset);
12 | 


--------------------------------------------------------------------------------
/csrc/punica/punica_pybind.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | 
 3 | #include "punica_ops.h"
 4 | 
 5 | //====== pybind ======
 6 | 
 7 | #define DEFINE_pybind(name) m.def(#name, &name, #name);
 8 | 
 9 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
10 |   m.def("dispatch_bgmv", &dispatch_bgmv, "dispatch_bgmv");
11 |   m.def("dispatch_bgmv_low_level", &dispatch_bgmv_low_level,
12 |         "dispatch_bgmv_low_level");
13 | }
14 | 


--------------------------------------------------------------------------------
/csrc/punica/type_convert.h:
--------------------------------------------------------------------------------
 1 | #ifndef CSRC__PUNICA__TYPE_CONVERT_H__
 2 | #define CSRC__PUNICA__TYPE_CONVERT_H__
 3 | 
 4 | #ifndef USE_ROCM
 5 | 
 6 | #include <cuda_bf16.h>
 7 | #include <cuda_fp16.h>
 8 | 
 9 | #else
10 | 
11 | #include <hip/hip_bf16.h>
12 | #include <hip/hip_fp16.h>
13 | 
14 | #define __TYPE_CONVERT__HOST_DEVICE__ __host__ __device__
15 | 
16 | typedef __half nv_half;
17 | typedef __hip_bfloat16 nv_bfloat16;
18 | typedef __hip_bfloat162 nv_bfloat162;
19 | 
20 | __TYPE_CONVERT__HOST_DEVICE__
21 | inline __hip_bfloat162 make_bfloat162(__hip_bfloat16 val) {
22 |   return __hip_bfloat162{val, val};
23 | }
24 | 
25 | __TYPE_CONVERT__HOST_DEVICE__
26 | inline __hip_bfloat162 make_bfloat162(__hip_bfloat16 vall, __hip_bfloat16 valr) {
27 |   return __hip_bfloat162{vall, valr};
28 | }
29 | 
30 | template <typename T_src, typename T_dst>
31 | __TYPE_CONVERT__HOST_DEVICE__
32 | inline T_dst convert_type(T_src val) {
33 |   return static_cast<T_dst>(val);
34 | }
35 | 
36 | template <>
37 | __TYPE_CONVERT__HOST_DEVICE__
38 | inline float convert_type<__half, float>(__half val) {
39 |   return __half2float(val);
40 | }
41 | 
42 | template <>
43 | __TYPE_CONVERT__HOST_DEVICE__
44 | inline __half convert_type<float, __half>(float val) {
45 |   return __float2half(val);
46 | }
47 | 
48 | template <>
49 | __TYPE_CONVERT__HOST_DEVICE__
50 | inline float convert_type<__hip_bfloat16, float>(__hip_bfloat16 val) {
51 |   return __bfloat162float(val);
52 | }
53 | 
54 | template <>
55 | __TYPE_CONVERT__HOST_DEVICE__
56 | inline __hip_bfloat16 convert_type<float, __hip_bfloat16>(float val) {
57 |   return __float2bfloat16(val);
58 | }
59 | 
60 | template <typename T>
61 | __TYPE_CONVERT__HOST_DEVICE__
62 | inline T vllm_add(T a, T b) {
63 |   return a + b;
64 | }
65 | 
66 | template <>
67 | __TYPE_CONVERT__HOST_DEVICE__
68 | inline __half vllm_add<__half>(__half a, __half b) {
69 |   return __hadd(a, b);
70 | }
71 | 
72 | template <>
73 | __TYPE_CONVERT__HOST_DEVICE__
74 | inline __hip_bfloat16 vllm_add<__hip_bfloat16>(__hip_bfloat16 a, __hip_bfloat16 b) {
75 |   return __hadd(a, b);
76 | }
77 | 
78 | #undef __TYPE_CONVERT__HOST_DEVICE__
79 | 
80 | #endif // USE_ROCM
81 | 
82 | #endif // CSRC__PUNICA__TYPE_CONVERT_H__
83 | 


--------------------------------------------------------------------------------
/csrc/quantization/cutlass_w8a8/common.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "cutlass/cutlass.h"
 4 | 
 5 | /**
 6 |  * Helper function for checking CUTLASS errors
 7 |  */
 8 | #define CUTLASS_CHECK(status)                        \
 9 |   {                                                  \
10 |     TORCH_CHECK(status == cutlass::Status::kSuccess, \
11 |                 cutlassGetStatusString(status))      \
12 |   }
13 | 


--------------------------------------------------------------------------------
/csrc/quantization/gptq/compat.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copied from https://github.com/turboderp/exllamav2
 3 | */
 4 | 
 5 | #ifndef _compat_cuh
 6 | #define _compat_cuh
 7 | 
 8 | namespace vllm {
 9 | namespace gptq {
10 | // atomicAdd for half types, to support CC < 7.x
11 | 
12 | __device__ __forceinline__ void atomicAdd_half(half* address, half val) {
13 |   unsigned int* address_as_ui =
14 |       (unsigned int*)((char*)address - ((size_t)address & 2));
15 |   unsigned int old = *address_as_ui;
16 |   unsigned int assumed;
17 | 
18 |   do {
19 |     assumed = old;
20 |     __half_raw hsum;
21 |     hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
22 |     half tmpres = __hadd(hsum, val);
23 |     hsum = __half_raw(tmpres);
24 |     old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16)
25 |                               : (old & 0xffff0000) | hsum.x;
26 |     old = atomicCAS(address_as_ui, assumed, old);
27 |   } while (assumed != old);
28 | }
29 | 
30 | // atomicAdd for half2 types
31 | 
32 | __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val) {
33 |   unsigned int* address_as_ui = (unsigned int*)address;
34 |   unsigned int old = *address_as_ui;
35 |   unsigned int assumed;
36 |   do {
37 |     assumed = old;
38 |     half2 old_val = *((half2*)&old);
39 |     half2 new_val = __hadd2(old_val, val);
40 |     old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val));
41 |   } while (assumed != old);
42 | }
43 | 
44 | //
45 | 
46 | #if defined(__CUDA_ARCH__) || defined(USE_ROCM)
47 |   #if __CUDA_ARCH__ < 700 || defined(USE_ROCM)
48 | 
49 | __device__ __forceinline__ void atomicAdd(half* address, half val) {
50 |   atomicAdd_half(address, val);
51 | }
52 | 
53 |     #if __CUDA_ARCH__ < 600 || defined(USE_ROCM)
54 | __device__ __forceinline__ void atomicAdd(half2* address, half2 val) {
55 |   atomicAdd_half2(address, val);
56 | }
57 |     #endif
58 | 
59 |   #endif
60 | #endif
61 | 
62 | }  // namespace gptq
63 | }  // namespace vllm
64 | #endif
65 | 


--------------------------------------------------------------------------------
/csrc/quantization/gptq/qdq_8.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copied from https://github.com/turboderp/exllamav2
 3 | */
 4 | 
 5 | #ifndef _qdq_8_cuh
 6 | #define _qdq_8_cuh
 7 | 
 8 | #include "qdq_util.cuh"
 9 | 
10 | namespace vllm {
11 | namespace gptq {
12 | 
13 | __forceinline__ __device__ void shuffle_8bit_4(uint32_t* q, int stride) {}
14 | 
15 | __forceinline__ __device__ void dequant_8bit_8(const uint32_t q_0,
16 |                                                const uint32_t q_1,
17 |                                                half2 (&dq)[4], int stride,
18 |                                                const uint32_t zero) {
19 |   half dqh[8];
20 |   for (int i = 0; i < 4; i++) dqh[i] = dq_ns(exb(q_0, i * 8, 0xff), zero);
21 |   for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), zero);
22 | 
23 |   for (int i = 0; i < 4; i++)
24 |     dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]);
25 | }
26 | 
27 | }  // namespace gptq
28 | }  // namespace vllm
29 | 
30 | #endif
31 | 


--------------------------------------------------------------------------------
/csrc/quantization/gptq/qdq_util.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copied from https://github.com/turboderp/exllamav2
 3 | */
 4 | 
 5 | #ifndef _qdq_util_cuh
 6 | #define _qdq_util_cuh
 7 | 
 8 | namespace vllm {
 9 | namespace gptq {
10 | 
11 | union half2_uint32 {
12 |   uint32_t as_uint32;
13 |   half2 as_half2;
14 |   __device__ half2_uint32(uint32_t val) : as_uint32(val) {}
15 |   __device__ half2_uint32(half2 val) : as_half2(val) {}
16 | };
17 | 
18 | union half_uint16 {
19 |   uint16_t as_uint16;
20 |   half as_half;
21 |   __device__ half_uint16(uint16_t val) : as_uint16(val) {}
22 |   __device__ half_uint16(half val) : as_half(val) {}
23 | };
24 | 
25 | // Max_scale premultiplied by 1/256
26 | 
27 | __forceinline__ __device__ half dq_scale(const int qs, const half max_scale) {
28 |   int qs_i = qs + 1;
29 |   half qs_h = __int2half_rn(qs_i * qs_i);
30 |   qs_h = __hmul(qs_h, max_scale);
31 |   return qs_h;
32 | }
33 | 
34 | __forceinline__ __device__ half dq(const int q, const int qzero,
35 |                                    const half scale) {
36 |   return __hmul(__int2half_rn(q - qzero), scale);
37 | }
38 | 
39 | __forceinline__ __device__ half dq_ns(const int q, const int qzero) {
40 |   // return __hsub(__int2half_rn(q), __int2half_rn(qzero));
41 |   return __int2half_rn(q - qzero);
42 | }
43 | 
44 | __forceinline__ __device__ int exb(const uint32_t q, const int shift,
45 |                                    const int mask) {
46 |   return (int)((q >> shift) & mask);
47 | }
48 | 
49 | __forceinline__ __device__ int exb(const uint32_t q1, const uint32_t q0,
50 |                                    const int shift, const int mask) {
51 |   return (int)(__funnelshift_rc(q0, q1, shift) & mask);
52 | }
53 | 
54 | }  // namespace gptq
55 | }  // namespace vllm
56 | #endif
57 | 


--------------------------------------------------------------------------------
/csrc/quantization/marlin/sparse/common/base.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (C) 2024 Roberto Lopez Castro (roberto.lopez.castro@udc.es). All
 3 |  * Rights Reserved.
 4 |  *
 5 |  * Licensed under the Apache License, Version 2.0 (the "License");
 6 |  * you may not use this file except in compliance with the License.
 7 |  * You may obtain a copy of the License at
 8 |  *
 9 |  *       http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | #pragma once
19 | 
20 | namespace marlin_24 {
21 | 
22 | constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; }
23 | 
24 | // Instances of `Vec` are used to organize groups of >>registers<<, as needed
25 | // for instance as inputs to tensor core operations. Consequently, all
26 | // corresponding index accesses must be compile-time constants, which is why we
27 | // extensively use `#pragma unroll` throughout the kernel code to guarantee
28 | // this.
29 | template <typename T, int n>
30 | struct Vec {
31 |   T elems[n];
32 |   __device__ T& operator[](int i) { return elems[i]; }
33 | };
34 | 
35 | template <int M_, int N_, int K_>
36 | struct ShapeBase {
37 |   static constexpr int M = M_, N = N_, K = K_;
38 | };
39 | 
40 | using I4 = Vec<int, 4>;
41 | 
42 | // Matrix fragments for tensor core instructions; their precise layout is
43 | // documented here:
44 | // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type
45 | using FragA = Vec<half2, 4>;
46 | using FragB = Vec<half2, 2>;
47 | using FragM = Vec<uint, 1>;
48 | using FragC = Vec<float, 4>;
49 | using FragS = Vec<half2, 1>;  // quantization scales
50 | 
51 | }  // namespace marlin_24
52 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
 1 | # vLLM documents
 2 | 
 3 | ## Build the docs
 4 | 
 5 | ```bash
 6 | # Install dependencies.
 7 | pip install -r requirements-docs.txt
 8 | 
 9 | # Build the docs.
10 | make clean
11 | make html
12 | ```
13 | 
14 | ## Open the docs with your browser
15 | 
16 | ```bash
17 | python -m http.server -d build/html/
18 | ```
19 | Launch your browser and open localhost:8000.
20 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | 
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | 	echo.
16 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | 	echo.installed, then set the SPHINXBUILD environment variable to point
18 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | 	echo.may add the Sphinx directory to PATH.
20 | 	echo.
21 | 	echo.If you don't have Sphinx installed, grab it from
22 | 	echo.https://www.sphinx-doc.org/
23 | 	exit /b 1
24 | )
25 | 
26 | if "%1" == "" goto help
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/docs/requirements-docs.txt:
--------------------------------------------------------------------------------
 1 | sphinx == 6.2.1
 2 | sphinx-book-theme == 1.0.1
 3 | sphinx-copybutton == 0.5.2
 4 | myst-parser == 2.0.0
 5 | sphinx-argparse
 6 | 
 7 | # packages to install to build the documentation
 8 | pydantic
 9 | -f https://download.pytorch.org/whl/cpu
10 | torch
11 | py-cpuinfo
12 | transformers
13 | openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
14 | 


--------------------------------------------------------------------------------
/docs/source/assets/dev/dockerfile-stages-dependency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/docs/source/assets/dev/dockerfile-stages-dependency.png


--------------------------------------------------------------------------------
/docs/source/assets/kernel/k_vecs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/docs/source/assets/kernel/k_vecs.png


--------------------------------------------------------------------------------
/docs/source/assets/kernel/key.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/docs/source/assets/kernel/key.png


--------------------------------------------------------------------------------
/docs/source/assets/kernel/logits_vec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/docs/source/assets/kernel/logits_vec.png


--------------------------------------------------------------------------------
/docs/source/assets/kernel/q_vecs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/docs/source/assets/kernel/q_vecs.png


--------------------------------------------------------------------------------
/docs/source/assets/kernel/query.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/docs/source/assets/kernel/query.png


--------------------------------------------------------------------------------
/docs/source/assets/kernel/v_vec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/docs/source/assets/kernel/v_vec.png


--------------------------------------------------------------------------------
/docs/source/assets/kernel/value.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/docs/source/assets/kernel/value.png


--------------------------------------------------------------------------------
/docs/source/assets/logos/vllm-logo-only-light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/docs/source/assets/logos/vllm-logo-only-light.png


--------------------------------------------------------------------------------
/docs/source/assets/logos/vllm-logo-text-dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/docs/source/assets/logos/vllm-logo-text-dark.png


--------------------------------------------------------------------------------
/docs/source/assets/logos/vllm-logo-text-light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/docs/source/assets/logos/vllm-logo-text-light.png


--------------------------------------------------------------------------------
/docs/source/community/meetups.rst:
--------------------------------------------------------------------------------
 1 | .. _meetups:
 2 | 
 3 | vLLM Meetups
 4 | ============
 5 | 
 6 | We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
 7 | 
 8 | - `The third vLLM meetup <https://robloxandvllmmeetup2024.splashthat.com/>`__, with Roblox, April 2nd 2024. `[Slides] <https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing>`__
 9 | - `The second vLLM meetup <https://lu.ma/ygxbpzhl>`__, with IBM Research, January 31st 2024. `[Slides] <https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing>`__ `[Video (vLLM Update)] <https://youtu.be/Y0C-DUvEnZQ>`__ `[Video (IBM Research & torch.compile)] <https://youtu.be/m0dMtFLI-dg>`__
10 | - `The first vLLM meetup <https://lu.ma/first-vllm-meetup>`__, with a16z, October 5th 2023. `[Slides] <https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing>`__
11 | 
12 | We are always looking for speakers and sponsors at San Francisco Bay Area and potentially other locations. If you are interested in speaking or sponsoring, please contact us at `vllm-questions@lists.berkeley.edu <mailto:vllm-questions@lists.berkeley.edu>`__.
13 | 


--------------------------------------------------------------------------------
/docs/source/community/sponsors.md:
--------------------------------------------------------------------------------
 1 | # Sponsors
 2 | 
 3 | vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
 4 | 
 5 | <!-- Note: Please sort them in alphabetical order. -->
 6 | <!-- Note: Please keep these consistent with README.md. -->
 7 | 
 8 | - a16z
 9 | - AMD
10 | - Anyscale
11 | - AWS
12 | - Crusoe Cloud
13 | - Databricks
14 | - DeepInfra
15 | - Dropbox
16 | - Lambda Lab
17 | - NVIDIA
18 | - Replicate
19 | - Roblox
20 | - RunPod
21 | - Trainy
22 | - UC Berkeley
23 | - UC San Diego
24 | 
25 | We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.


--------------------------------------------------------------------------------
/docs/source/dev/dockerfile/dockerfile.rst:
--------------------------------------------------------------------------------
 1 | Dockerfile
 2 | ====================
 3 | 
 4 | See `here <https://github.com/vllm-project/vllm/blob/main/Dockerfile>`_ for the main Dockerfile to construct 
 5 | the image for running an OpenAI compatible server with vLLM.
 6 | 
 7 | -  Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
 8 | 
 9 |    - All build stages
10 |    - The default build target (highlighted in grey)
11 |    - External images (with dashed borders)
12 |    
13 |    The edges of the build graph represent:
14 |    
15 |    - FROM ... dependencies (with a solid line and a full arrow head)
16 |    - COPY --from=... dependencies (with a dashed line and an empty arrow head)
17 |    - RUN --mount=(.*)from=... dependencies (with a dotted line and an empty diamond arrow head)
18 | 
19 |    .. figure:: ../../assets/dev/dockerfile-stages-dependency.png
20 |       :alt: query
21 |       :width: 100%
22 |       :align: center
23 | 
24 |    Made using: https://github.com/patrickhoefler/dockerfilegraph
25 | 
26 |    Commands to regenerate the build graph (make sure to run it **from the `root` directory of the vLLM repository** where the dockerfile is present):
27 | 
28 |    .. code:: bash
29 | 
30 |       dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile
31 | 
32 |    or in case you want to run it directly with the docker image:
33 |    
34 |    .. code:: bash
35 | 
36 |       docker run \
37 |          --rm \
38 |          --user "$(id -u):$(id -g)" \
39 |          --workdir /workspace \
40 |          --volume "$(pwd)":/workspace \
41 |          ghcr.io/patrickhoefler/dockerfilegraph:alpine \
42 |          --output png \
43 |          --dpi 200 \
44 |          --max-label-length 50 \
45 |          --filename Dockerfile \
46 |          --legend
47 | 
48 |    (To run it for a different file, you can pass in a different argument to the flag `--filename`.)
49 | 
50 |    


--------------------------------------------------------------------------------
/docs/source/dev/engine/async_llm_engine.rst:
--------------------------------------------------------------------------------
1 | AsyncLLMEngine
2 | =================================
3 | 
4 | .. autoclass:: vllm.AsyncLLMEngine
5 |     :members:
6 |     :show-inheritance:
7 | 


--------------------------------------------------------------------------------
/docs/source/dev/engine/engine_index.rst:
--------------------------------------------------------------------------------
 1 | vLLM Engine
 2 | =================================
 3 | 
 4 | .. automodule:: vllm.engine
 5 | .. currentmodule:: vllm.engine
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 2
 9 |    :caption: Engines
10 | 
11 |    llm_engine
12 |    async_llm_engine
13 | 
14 | 


--------------------------------------------------------------------------------
/docs/source/dev/engine/llm_engine.rst:
--------------------------------------------------------------------------------
1 | LLMEngine
2 | =================================
3 | 
4 | .. autoclass:: vllm.LLMEngine
5 |     :members:
6 |     :show-inheritance:
7 | 


--------------------------------------------------------------------------------
/docs/source/dev/multimodal/multimodal_index.rst:
--------------------------------------------------------------------------------
 1 | Multi-Modality
 2 | ==============
 3 | 
 4 | .. currentmodule:: vllm.multimodal
 5 |     
 6 | vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package.
 7 | 
 8 | :class:`vllm.inputs.PromptStrictInputs` accepts an additional attribute ``multi_modal_data``
 9 | which allows you to pass in multi-modal input alongside text and token prompts.
10 | 
11 | By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model,
12 | you must decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_dummy_data <MultiModalRegistry.register_dummy_data>`,
13 | as well as :meth:`MULTIMODAL_REGISTRY.register_input <MultiModalRegistry.register_input>` for each modality type to support.
14 | 
15 | .. contents::
16 |    :local:
17 |    :backlinks: none
18 | 
19 | Module Contents
20 | +++++++++++++++
21 | 
22 | .. automodule:: vllm.multimodal
23 | 
24 | Registry
25 | --------
26 | 
27 | .. data:: vllm.multimodal.MULTIMODAL_REGISTRY
28 | 
29 |     The global :class:`MultiModalRegistry` which is used by model runners.
30 | 
31 | .. autoclass:: vllm.multimodal.MultiModalRegistry
32 |     :members:
33 |     :show-inheritance:
34 | 
35 | Base Classes
36 | ------------
37 | 
38 | .. autoclass:: vllm.multimodal.MultiModalData
39 |     :members:
40 |     :show-inheritance:
41 | 
42 | .. autoclass:: vllm.multimodal.MultiModalPlugin
43 |     :members:
44 |     :show-inheritance:
45 | 
46 | Image Classes
47 | -------------
48 | 
49 | .. automodule:: vllm.multimodal.image
50 |     :members:
51 |     :show-inheritance:
52 | 


--------------------------------------------------------------------------------
/docs/source/dev/offline_inference/llm.rst:
--------------------------------------------------------------------------------
1 | LLM Class
2 | =========
3 | 
4 | .. autoclass:: vllm.LLM
5 |     :members:
6 |     :show-inheritance:
7 | 


--------------------------------------------------------------------------------
/docs/source/dev/offline_inference/llm_inputs.rst:
--------------------------------------------------------------------------------
 1 | LLM Inputs
 2 | ==========
 3 | 
 4 | .. autodata:: vllm.inputs.PromptStrictInputs
 5 | 
 6 | .. autoclass:: vllm.inputs.TextPrompt
 7 |     :show-inheritance:
 8 |     :members:
 9 |     :member-order: bysource
10 | 
11 | .. autoclass:: vllm.inputs.TokensPrompt
12 |     :show-inheritance:
13 |     :members:
14 |     :member-order: bysource
15 | 


--------------------------------------------------------------------------------
/docs/source/dev/offline_inference/offline_index.rst:
--------------------------------------------------------------------------------
1 | Offline Inference
2 | =================================
3 | 
4 | .. toctree::
5 |    :maxdepth: 1
6 | 
7 |    llm
8 |    llm_inputs
9 | 


--------------------------------------------------------------------------------
/docs/source/dev/sampling_params.rst:
--------------------------------------------------------------------------------
1 | Sampling Parameters
2 | ===================
3 | 
4 | .. autoclass:: vllm.SamplingParams
5 |     :members:
6 | 


--------------------------------------------------------------------------------
/docs/source/getting_started/examples/examples_index.template.rst:
--------------------------------------------------------------------------------
1 | Examples
2 | =================================
3 | 
4 | .. toctree::
5 |    :maxdepth: 1
6 |    :caption: Scripts
7 | 
8 |    %EXAMPLE_DOCS%
9 | 


--------------------------------------------------------------------------------
/docs/source/models/engine_args.rst:
--------------------------------------------------------------------------------
 1 | .. _engine_args:
 2 | 
 3 | Engine Arguments
 4 | ================
 5 | 
 6 | Below, you can find an explanation of every engine argument for vLLM:
 7 | 
 8 | .. argparse::
 9 |     :module: vllm.engine.arg_utils
10 |     :func: _engine_args_parser
11 |     :prog: -m vllm.entrypoints.openai.api_server
12 |     :nodefaultconst:
13 | 
14 | Async Engine Arguments
15 | ----------------------
16 | 
17 | Below are the additional arguments related to the asynchronous engine:
18 | 
19 | .. argparse::
20 |     :module: vllm.engine.arg_utils
21 |     :func: _async_engine_args_parser
22 |     :prog: -m vllm.entrypoints.openai.api_server
23 |     :nodefaultconst:


--------------------------------------------------------------------------------
/docs/source/models/vlm.rst:
--------------------------------------------------------------------------------
 1 | .. _vlm:
 2 | 
 3 | Using VLMs
 4 | ==========
 5 | 
 6 | This document shows you how to run and serve Vision Language Models (VLMs) using vLLM.
 7 | 
 8 | Engine Arguments
 9 | ----------------
10 | 
11 | The following :ref:`engine arguments <engine_args>` are specific to VLMs:
12 | 
13 | .. argparse::
14 |     :module: vllm.engine.arg_utils
15 |     :func: _vlm_engine_args_parser
16 |     :prog: -m vllm.entrypoints.openai.api_server
17 |     :nodefaultconst:
18 | 
19 | Offline Batched Inference
20 | -------------------------
21 | 
22 | To initialize a VLM, the aforementioned arguments must be passed to the ``LLM`` class for instantiating the engine.
23 | 
24 | .. code-block:: python
25 | 
26 |     llm = LLM(
27 |         model="llava-hf/llava-1.5-7b-hf",
28 |         image_input_type="pixel_values",
29 |         image_token_id=32000,
30 |         image_input_shape="1,3,336,336",
31 |         image_feature_size=576,
32 |     )
33 | 
34 | For now, we only support a single image per text prompt. To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:
35 | 
36 | * ``prompt``: The prompt should have a number of ``<image>`` tokens equal to ``image_feature_size``.
37 | * ``multi_modal_data``: This should be an instance of :class:`~vllm.multimodal.image.ImagePixelData` or :class:`~vllm.multimodal.image.ImageFeatureData`.
38 | 
39 | .. code-block:: python
40 | 
41 |     prompt = "<image>" * 576 + (
42 |         "\nUSER: What is the content of this image?\nASSISTANT:")
43 | 
44 |     # Load the image using PIL.Image
45 |     image = ...
46 | 
47 |     outputs = llm.generate({
48 |         "prompt": prompt,
49 |         "multi_modal_data": ImagePixelData(image),
50 |     })
51 | 
52 |     for o in outputs:
53 |         generated_text = o.outputs[0].text
54 |         print(generated_text)
55 | 
56 | A code example can be found in `examples/llava_example.py <https://github.com/vllm-project/vllm/blob/main/examples/llava_example.py>`_.
57 | 


--------------------------------------------------------------------------------
/docs/source/quantization/fp8_e5m2_kvcache.rst:
--------------------------------------------------------------------------------
 1 | .. _fp8_kv_cache:
 2 | 
 3 | FP8 E5M2 KV Cache
 4 | ==================
 5 | 
 6 | The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits.
 7 | The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bflaot16 and fp8 to each other.
 8 | 
 9 | Here is an example of how to enable this feature:
10 | 
11 | .. code-block:: python
12 | 
13 |     from vllm import LLM, SamplingParams
14 |     # Sample prompts.
15 |     prompts = [
16 |         "Hello, my name is",
17 |         "The president of the United States is",
18 |         "The capital of France is",
19 |         "The future of AI is",
20 |     ]
21 |     # Create a sampling params object.
22 |     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
23 |     # Create an LLM.
24 |     llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8")
25 |     # Generate texts from the prompts. The output is a list of RequestOutput objects
26 |     # that contain the prompt, generated text, and other information.
27 |     outputs = llm.generate(prompts, sampling_params)
28 |     # Print the outputs.
29 |     for output in outputs:
30 |         prompt = output.prompt
31 |         generated_text = output.outputs[0].text
32 |         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
33 | 
34 | 
35 | Note, current prefix caching doesn't work with FP8 KV cache enabled, forward_prefix kernel should handle different KV and cache type.
36 | 
37 | 


--------------------------------------------------------------------------------
/docs/source/serving/deploying_with_bentoml.rst:
--------------------------------------------------------------------------------
1 | .. _deploying_with_bentoml:
2 | 
3 | Deploying with BentoML
4 | ======================
5 | 
6 | `BentoML <https://github.com/bentoml/BentoML>`_ allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-complicant image and deploy it on Kubernetes.
7 | 
8 | For details, see the tutorial `vLLM inference in the BentoML documentation <https://docs.bentoml.com/en/latest/use-cases/large-language-models/vllm.html>`_.


--------------------------------------------------------------------------------
/docs/source/serving/deploying_with_kserve.rst:
--------------------------------------------------------------------------------
1 | .. _deploying_with_kserve:
2 | 
3 | Deploying with KServe
4 | ============================
5 | 
6 | vLLM can be deployed with `KServe <https://github.com/kserve/kserve>`_ on Kubernetes for highly scalable distributed model serving.
7 | 
8 | Please see `this guide <https://kserve.github.io/website/latest/modelserving/v1beta1/llm/vllm/>`_ for more details on using vLLM with KServe.
9 | 


--------------------------------------------------------------------------------
/docs/source/serving/deploying_with_lws.rst:
--------------------------------------------------------------------------------
 1 | .. _deploying_with_lws:
 2 | 
 3 | Deploying with LWS
 4 | ============================
 5 | 
 6 | LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads.
 7 | A major use case is for multi-host/multi-node distributed inference.
 8 | 
 9 | vLLM can be deployed with `LWS <https://github.com/kubernetes-sigs/lws>`_ on Kubernetes for distributed model serving.
10 | 
11 | Please see `this guide <https://github.com/kubernetes-sigs/lws/tree/main/docs/examples/vllm>`_ for more details on
12 | deploying vLLM on Kubernetes using LWS.
13 | 


--------------------------------------------------------------------------------
/docs/source/serving/deploying_with_triton.rst:
--------------------------------------------------------------------------------
1 | .. _deploying_with_triton:
2 | 
3 | Deploying with NVIDIA Triton
4 | ============================
5 | 
6 | The `Triton Inference Server <https://github.com/triton-inference-server>`_ hosts a tutorial demonstrating how to quickly deploy a simple `facebook/opt-125m <https://huggingface.co/facebook/opt-125m>`_ model using vLLM. Please see `Deploying a vLLM model in Triton <https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton>`_ for more details.
7 | 


--------------------------------------------------------------------------------
/docs/source/serving/distributed_serving.rst:
--------------------------------------------------------------------------------
 1 | .. _distributed_serving:
 2 | 
 3 | Distributed Inference and Serving
 4 | =================================
 5 | 
 6 | vLLM supports distributed tensor-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm <https://arxiv.org/pdf/1909.08053.pdf>`_. We manage the distributed runtime with `Ray <https://github.com/ray-project/ray>`_. To run distributed inference, install Ray with:
 7 | 
 8 | .. code-block:: console
 9 | 
10 |     $ pip install ray
11 | 
12 | To run multi-GPU inference with the :code:`LLM` class, set the :code:`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs:
13 | 
14 | .. code-block:: python
15 | 
16 |     from vllm import LLM
17 |     llm = LLM("facebook/opt-13b", tensor_parallel_size=4)
18 |     output = llm.generate("San Franciso is a")
19 | 
20 | To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs:
21 | 
22 | .. code-block:: console
23 | 
24 |     $ python -m vllm.entrypoints.api_server \
25 |     $     --model facebook/opt-13b \
26 |     $     --tensor-parallel-size 4
27 | 
28 | To scale vLLM beyond a single machine, start a `Ray runtime <https://docs.ray.io/en/latest/ray-core/starting-ray.html>`_ via CLI before running vLLM:
29 | 
30 | .. code-block:: console
31 | 
32 |     $ # On head node
33 |     $ ray start --head
34 | 
35 |     $ # On worker nodes
36 |     $ ray start --address=<ray-head-address>
37 | 
38 | After that, you can run inference and serving on multiple machines by launching the vLLM process on the head node by setting :code:`tensor_parallel_size` to the number of GPUs to be the total number of GPUs across all machines.


--------------------------------------------------------------------------------
/docs/source/serving/env_vars.rst:
--------------------------------------------------------------------------------
 1 | Environment Variables
 2 | ========================
 3 | 
 4 | vLLM uses the following environment variables to configure the system:
 5 | 
 6 | .. literalinclude:: ../../../vllm/envs.py
 7 |     :language: python
 8 |     :start-after: begin-env-vars-definition
 9 |     :end-before: end-env-vars-definition
10 | 


--------------------------------------------------------------------------------
/docs/source/serving/integrations.rst:
--------------------------------------------------------------------------------
 1 | Integrations
 2 | ------------
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 1
 6 | 
 7 |    run_on_sky
 8 |    deploying_with_kserve
 9 |    deploying_with_triton
10 |    deploying_with_bentoml
11 |    deploying_with_lws
12 |    deploying_with_dstack
13 |    serving_with_langchain
14 | 


--------------------------------------------------------------------------------
/docs/source/serving/metrics.rst:
--------------------------------------------------------------------------------
 1 | Production Metrics
 2 | ==================
 3 | 
 4 | vLLM exposes a number of metrics that can be used to monitor the health of the
 5 | system. These metrics are exposed via the `/metrics` endpoint on the vLLM
 6 | OpenAI compatible API server.
 7 | 
 8 | The following metrics are exposed:
 9 | 
10 | .. literalinclude:: ../../../vllm/engine/metrics.py
11 |     :language: python
12 |     :start-after: begin-metrics-definitions
13 |     :end-before: end-metrics-definitions
14 | 


--------------------------------------------------------------------------------
/docs/source/serving/serving_with_langchain.rst:
--------------------------------------------------------------------------------
 1 | .. _run_on_langchain:
 2 | 
 3 | Serving with Langchain
 4 | ============================
 5 | 
 6 | vLLM is also available via `Langchain <https://github.com/langchain-ai/langchain>`_ .
 7 | 
 8 | To install langchain, run
 9 | 
10 | .. code-block:: console
11 | 
12 |     $ pip install langchain langchain_community -q
13 | 
14 | To run inference on a single or multiple GPUs, use ``VLLM`` class from ``langchain``.
15 | 
16 | .. code-block:: python
17 | 
18 |     from langchain_community.llms import VLLM
19 | 
20 |     llm = VLLM(model="mosaicml/mpt-7b",
21 |                trust_remote_code=True,  # mandatory for hf models
22 |                max_new_tokens=128,
23 |                top_k=10,
24 |                top_p=0.95,
25 |                temperature=0.8,
26 |                # tensor_parallel_size=... # for distributed inference
27 |     )
28 | 
29 |     print(llm("What is the capital of France ?"))
30 | 
31 | Please refer to this `Tutorial <https://python.langchain.com/docs/integrations/llms/vllm>`_ for more details.
32 | 


--------------------------------------------------------------------------------
/docs/source/serving/usage_stats.md:
--------------------------------------------------------------------------------
 1 | # Usage Stats Collection
 2 | 
 3 | vLLM collects anonymous usage data by default to help the engineering team better understand which hardware and model configurations are widely used. This data allows them to prioritize their efforts on the most common workloads. The collected data is transparent, does not contain any sensitive information, and will be publicly released for the community's benefit.
 4 | 
 5 | ## What data is collected?
 6 | 
 7 | You can see the up to date list of data collected by vLLM in the [usage_lib.py](https://github.com/vllm-project/vllm/blob/main/vllm/usage/usage_lib.py).
 8 | 
 9 | Here is an example as of v0.4.0:
10 | 
11 | ```json
12 | {
13 |   "uuid": "fbe880e9-084d-4cab-a395-8984c50f1109",
14 |   "provider": "GCP",
15 |   "num_cpu": 24,
16 |   "cpu_type": "Intel(R) Xeon(R) CPU @ 2.20GHz",
17 |   "cpu_family_model_stepping": "6,85,7",
18 |   "total_memory": 101261135872,
19 |   "architecture": "x86_64",
20 |   "platform": "Linux-5.10.0-28-cloud-amd64-x86_64-with-glibc2.31",
21 |   "gpu_count": 2,
22 |   "gpu_type": "NVIDIA L4",
23 |   "gpu_memory_per_device": 23580639232,
24 |   "model_architecture": "OPTForCausalLM",
25 |   "vllm_version": "0.3.2+cu123",
26 |   "context": "LLM_CLASS",
27 |   "log_time": 1711663373492490000,
28 |   "source": "production",
29 |   "dtype": "torch.float16",
30 |   "tensor_parallel_size": 1,
31 |   "block_size": 16,
32 |   "gpu_memory_utilization": 0.9,
33 |   "quantization": null,
34 |   "kv_cache_dtype": "auto",
35 |   "enable_lora": false,
36 |   "enable_prefix_caching": false,
37 |   "enforce_eager": false,
38 |   "disable_custom_all_reduce": true
39 | }
40 | ```
41 | 
42 | You can preview the collected data by running the following command:
43 | 
44 | ```bash
45 | tail ~/.config/vllm/usage_stats.json
46 | ```
47 | 
48 | ## Opt-out of Usage Stats Collection
49 | 
50 | You can opt-out of usage stats collection by setting the VLLM_NO_USAGE_STATS or DO_NOT_TRACK environment variable, or by creating a ~/.config/vllm/do_not_track file:
51 | 
52 | ```bash
53 | # Any of the following methods can disable usage stats collection
54 | export VLLM_NO_USAGE_STATS=1
55 | export DO_NOT_TRACK=1
56 | mkdir -p ~/.config/vllm && touch ~/.config/vllm/do_not_track
57 | ```
58 | 


--------------------------------------------------------------------------------
/examples/aqlm_example.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | 
 3 | from vllm import LLM, SamplingParams
 4 | 
 5 | 
 6 | def main():
 7 | 
 8 |     parser = argparse.ArgumentParser(description='AQLM examples')
 9 | 
10 |     parser.add_argument('--model',
11 |                         '-m',
12 |                         type=str,
13 |                         default=None,
14 |                         help='model path, as for HF')
15 |     parser.add_argument('--choice',
16 |                         '-c',
17 |                         type=int,
18 |                         default=0,
19 |                         help='known good models by index, [0-4]')
20 |     parser.add_argument('--tensor_parallel_size',
21 |                         '-t',
22 |                         type=int,
23 |                         default=1,
24 |                         help='tensor parallel size')
25 | 
26 |     args = parser.parse_args()
27 | 
28 |     models = [
29 |         "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf",
30 |         "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf",
31 |         "ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf",
32 |         "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf",
33 |         "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf",
34 |     ]
35 | 
36 |     model = LLM(args.model if args.model is not None else models[args.choice],
37 |                 tensor_parallel_size=args.tensor_parallel_size)
38 | 
39 |     sampling_params = SamplingParams(max_tokens=100, temperature=0)
40 |     outputs = model.generate("Hello my name is",
41 |                              sampling_params=sampling_params)
42 |     print(outputs[0].outputs[0].text)
43 | 
44 | 
45 | if __name__ == '__main__':
46 |     main()
47 | 


--------------------------------------------------------------------------------
/examples/fp8/quantizer/README.md:
--------------------------------------------------------------------------------
 1 | ### Quantizer Utilities
 2 | `quantize.py`: NVIDIA Quantization utilities using AMMO, ported from TensorRT-LLM:
 3 | `https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/quantize.py`
 4 | 
 5 | ### Prerequisite
 6 | 
 7 | #### AMMO (AlgorithMic Model Optimization) Installation: nvidia-ammo 0.7.1 or later
 8 | `pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo` 
 9 | 
10 | #### AMMO Download (code and docs)
11 | `https://developer.nvidia.com/downloads/assets/cuda/files/nvidia-ammo/nvidia_ammo-0.5.0.tar.gz`
12 | `https://developer.nvidia.com/downloads/assets/cuda/files/nvidia-ammo/nvidia_ammo-0.7.1.tar.gz`
13 | 
14 | ### Usage
15 | 
16 | #### Run on H100 system for speed if FP8; number of GPUs depends on the model size
17 | 
18 | #### Example: quantize Llama2-7b model from HF to FP8 with FP8 KV Cache:
19 | `python quantize.py --model_dir ./ll2-7b --dtype float16 --qformat fp8 --kv_cache_dtype fp8 --output_dir ./ll2_7b_fp8 --calib_size 512 --tp_size 1`
20 | 
21 | Outputs: model structure, quantized model & parameters (with scaling factors) are in JSON and Safetensors (npz is generated only for the reference)
22 | ```
23 | # ll ./ll2_7b_fp8/
24 | total 19998244
25 | drwxr-xr-x 2 root root        4096 Feb  7 01:08 ./
26 | drwxrwxr-x 8 1060 1061        4096 Feb  7 01:08 ../
27 | -rw-r--r-- 1 root root      176411 Feb  7 01:08 llama_tp1.json
28 | -rw-r--r-- 1 root root 13477087480 Feb  7 01:09 llama_tp1_rank0.npz
29 | -rw-r--r-- 1 root root  7000893272 Feb  7 01:08 rank0.safetensors
30 | #
31 | ```
32 | 
33 | 


--------------------------------------------------------------------------------
/examples/gradio_webserver.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import json
 3 | 
 4 | import gradio as gr
 5 | import requests
 6 | 
 7 | 
 8 | def http_bot(prompt):
 9 |     headers = {"User-Agent": "vLLM Client"}
10 |     pload = {
11 |         "prompt": prompt,
12 |         "stream": True,
13 |         "max_tokens": 128,
14 |     }
15 |     response = requests.post(args.model_url,
16 |                              headers=headers,
17 |                              json=pload,
18 |                              stream=True)
19 | 
20 |     for chunk in response.iter_lines(chunk_size=8192,
21 |                                      decode_unicode=False,
22 |                                      delimiter=b"\0"):
23 |         if chunk:
24 |             data = json.loads(chunk.decode("utf-8"))
25 |             output = data["text"][0]
26 |             yield output
27 | 
28 | 
29 | def build_demo():
30 |     with gr.Blocks() as demo:
31 |         gr.Markdown("# vLLM text completion demo\n")
32 |         inputbox = gr.Textbox(label="Input",
33 |                               placeholder="Enter text and press ENTER")
34 |         outputbox = gr.Textbox(label="Output",
35 |                                placeholder="Generated result from the model")
36 |         inputbox.submit(http_bot, [inputbox], [outputbox])
37 |     return demo
38 | 
39 | 
40 | if __name__ == "__main__":
41 |     parser = argparse.ArgumentParser()
42 |     parser.add_argument("--host", type=str, default=None)
43 |     parser.add_argument("--port", type=int, default=8001)
44 |     parser.add_argument("--model-url",
45 |                         type=str,
46 |                         default="http://localhost:8000/generate")
47 |     args = parser.parse_args()
48 | 
49 |     demo = build_demo()
50 |     demo.queue().launch(server_name=args.host,
51 |                         server_port=args.port,
52 |                         share=True)
53 | 


--------------------------------------------------------------------------------
/examples/offline_inference.py:
--------------------------------------------------------------------------------
 1 | from vllm import LLM, SamplingParams
 2 | 
 3 | # Sample prompts.
 4 | prompts = [
 5 |     "Hello, my name is",
 6 |     "The president of the United States is",
 7 |     "The capital of France is",
 8 |     "The future of AI is",
 9 | ]
10 | # Create a sampling params object.
11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
12 | 
13 | # Create an LLM.
14 | llm = LLM(model="facebook/opt-125m")
15 | # Generate texts from the prompts. The output is a list of RequestOutput objects
16 | # that contain the prompt, generated text, and other information.
17 | outputs = llm.generate(prompts, sampling_params)
18 | # Print the outputs.
19 | for output in outputs:
20 |     prompt = output.prompt
21 |     generated_text = output.outputs[0].text
22 |     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
23 | 


--------------------------------------------------------------------------------
/examples/offline_inference_arctic.py:
--------------------------------------------------------------------------------
 1 | from vllm import LLM, SamplingParams
 2 | 
 3 | # Sample prompts.
 4 | prompts = [
 5 |     "Hello, my name is",
 6 |     "The president of the United States is",
 7 |     "The capital of France is",
 8 |     "The future of AI is",
 9 | ]
10 | # Create a sampling params object.
11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
12 | 
13 | # Create an LLM.
14 | llm = LLM(model="snowflake/snowflake-arctic-instruct",
15 |           quantization="deepspeedfp",
16 |           tensor_parallel_size=8,
17 |           trust_remote_code=True)
18 | # Generate texts from the prompts. The output is a list of RequestOutput objects
19 | # that contain the prompt, generated text, and other information.
20 | 
21 | outputs = llm.generate(prompts, sampling_params)
22 | # Print the outputs.
23 | for output in outputs:
24 |     prompt = output.prompt
25 |     generated_text = output.outputs[0].text
26 |     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
27 | 


--------------------------------------------------------------------------------
/examples/offline_inference_embedding.py:
--------------------------------------------------------------------------------
 1 | from vllm import LLM
 2 | 
 3 | # Sample prompts.
 4 | prompts = [
 5 |     "Hello, my name is",
 6 |     "The president of the United States is",
 7 |     "The capital of France is",
 8 |     "The future of AI is",
 9 | ]
10 | 
11 | # Create an LLM.
12 | model = LLM(model="intfloat/e5-mistral-7b-instruct", enforce_eager=True)
13 | # Generate embedding. The output is a list of EmbeddingRequestOutputs.
14 | outputs = model.encode(prompts)
15 | # Print the outputs.
16 | for output in outputs:
17 |     print(output.outputs.embedding)  # list of 4096 floats
18 | 


--------------------------------------------------------------------------------
/examples/offline_inference_neuron.py:
--------------------------------------------------------------------------------
 1 | from vllm import LLM, SamplingParams
 2 | 
 3 | # Sample prompts.
 4 | prompts = [
 5 |     "Hello, my name is",
 6 |     "The president of the United States is",
 7 |     "The capital of France is",
 8 |     "The future of AI is",
 9 | ]
10 | # Create a sampling params object.
11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
12 | 
13 | # Create an LLM.
14 | llm = LLM(
15 |     model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
16 |     max_num_seqs=8,
17 |     # The max_model_len and block_size arguments are required to be same as
18 |     # max sequence length when targeting neuron device.
19 |     # Currently, this is a known limitation in continuous batching support
20 |     # in transformers-neuronx.
21 |     # TODO(liangfu): Support paged-attention in transformers-neuronx.
22 |     max_model_len=128,
23 |     block_size=128,
24 |     # The device can be automatically detected when AWS Neuron SDK is installed.
25 |     # The device argument can be either unspecified for automated detection,
26 |     # or explicitly assigned.
27 |     device="neuron",
28 |     tensor_parallel_size=2)
29 | # Generate texts from the prompts. The output is a list of RequestOutput objects
30 | # that contain the prompt, generated text, and other information.
31 | outputs = llm.generate(prompts, sampling_params)
32 | # Print the outputs.
33 | for output in outputs:
34 |     prompt = output.prompt
35 |     generated_text = output.outputs[0].text
36 |     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
37 | 


--------------------------------------------------------------------------------
/examples/openai_chat_completion_client.py:
--------------------------------------------------------------------------------
 1 | from openai import OpenAI
 2 | 
 3 | # Modify OpenAI's API key and API base to use vLLM's API server.
 4 | openai_api_key = "EMPTY"
 5 | openai_api_base = "http://localhost:8000/v1"
 6 | 
 7 | client = OpenAI(
 8 |     # defaults to os.environ.get("OPENAI_API_KEY")
 9 |     api_key=openai_api_key,
10 |     base_url=openai_api_base,
11 | )
12 | 
13 | models = client.models.list()
14 | model = models.data[0].id
15 | 
16 | chat_completion = client.chat.completions.create(
17 |     messages=[{
18 |         "role": "system",
19 |         "content": "You are a helpful assistant."
20 |     }, {
21 |         "role": "user",
22 |         "content": "Who won the world series in 2020?"
23 |     }, {
24 |         "role":
25 |         "assistant",
26 |         "content":
27 |         "The Los Angeles Dodgers won the World Series in 2020."
28 |     }, {
29 |         "role": "user",
30 |         "content": "Where was it played?"
31 |     }],
32 |     model=model,
33 | )
34 | 
35 | print("Chat completion results:")
36 | print(chat_completion)
37 | 


--------------------------------------------------------------------------------
/examples/openai_completion_client.py:
--------------------------------------------------------------------------------
 1 | from openai import OpenAI
 2 | 
 3 | # Modify OpenAI's API key and API base to use vLLM's API server.
 4 | openai_api_key = "EMPTY"
 5 | openai_api_base = "http://localhost:8000/v1"
 6 | 
 7 | client = OpenAI(
 8 |     # defaults to os.environ.get("OPENAI_API_KEY")
 9 |     api_key=openai_api_key,
10 |     base_url=openai_api_base,
11 | )
12 | 
13 | models = client.models.list()
14 | model = models.data[0].id
15 | 
16 | # Completion API
17 | stream = False
18 | completion = client.completions.create(
19 |     model=model,
20 |     prompt="A robot may not injure a human being",
21 |     echo=False,
22 |     n=2,
23 |     stream=stream,
24 |     logprobs=3)
25 | 
26 | print("Completion results:")
27 | if stream:
28 |     for c in completion:
29 |         print(c)
30 | else:
31 |     print(completion)
32 | 


--------------------------------------------------------------------------------
/examples/openai_embedding_client.py:
--------------------------------------------------------------------------------
 1 | from openai import OpenAI
 2 | 
 3 | # Modify OpenAI's API key and API base to use vLLM's API server.
 4 | openai_api_key = "EMPTY"
 5 | openai_api_base = "http://localhost:8000/v1"
 6 | 
 7 | client = OpenAI(
 8 |     # defaults to os.environ.get("OPENAI_API_KEY")
 9 |     api_key=openai_api_key,
10 |     base_url=openai_api_base,
11 | )
12 | 
13 | models = client.models.list()
14 | model = models.data[0].id
15 | 
16 | responses = client.embeddings.create(input=[
17 |     "Hello my name is",
18 |     "The best thing about vLLM is that it supports many different models"
19 | ],
20 |                                      model=model)
21 | 
22 | for data in responses.data:
23 |     print(data.embedding)  # list of float of len 4096
24 | 


--------------------------------------------------------------------------------
/examples/openai_example_batch.jsonl:
--------------------------------------------------------------------------------
1 | {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
2 | {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
3 | 


--------------------------------------------------------------------------------
/examples/production_monitoring/docker-compose.yaml:
--------------------------------------------------------------------------------
 1 | # docker-compose.yaml
 2 | version: "3"
 3 | 
 4 | services:
 5 |   prometheus:
 6 |     image: prom/prometheus:latest
 7 |     extra_hosts:
 8 |       - "host.docker.internal:host-gateway"     # allow a direct connection from container to the local machine
 9 |     ports:
10 |       - "9090:9090"   # the default port used by Prometheus
11 |     volumes:
12 |       - ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml # mount Prometheus config file
13 | 
14 |   grafana:
15 |     image: grafana/grafana:latest
16 |     depends_on:
17 |       - prometheus
18 |     ports:
19 |       - "3000:3000" # the default port used by Grafana
20 | 


--------------------------------------------------------------------------------
/examples/production_monitoring/prometheus.yaml:
--------------------------------------------------------------------------------
 1 | # prometheus.yaml
 2 | global:
 3 |   scrape_interval: 5s
 4 |   evaluation_interval: 30s
 5 | 
 6 | scrape_configs:
 7 |   - job_name: vllm
 8 |     static_configs:
 9 |       - targets:
10 |           - 'host.docker.internal:8000'
11 | 


--------------------------------------------------------------------------------
/examples/template_alpaca.jinja:
--------------------------------------------------------------------------------
 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
 2 | 
 3 | {% for message in messages %}
 4 | {% if message['role'] == 'user' %}
 5 | ### Instruction:
 6 | {{ message['content']|trim -}}
 7 | {% if not loop.last %}
 8 | 
 9 | 
10 | {% endif %}
11 | {% elif message['role'] == 'assistant' %}
12 | ### Response:
13 | {{ message['content']|trim -}}
14 | {% if not loop.last %}
15 | 
16 | 
17 | {% endif %}
18 | {% elif message['role'] == 'user_context' %}
19 | ### Input:
20 | {{ message['content']|trim -}}
21 | {% if not loop.last %}
22 | 
23 | 
24 | {% endif %}
25 | {% endif %}
26 | {% endfor %}
27 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
28 | ### Response:
29 | {% endif %}


--------------------------------------------------------------------------------
/examples/template_baichuan.jinja:
--------------------------------------------------------------------------------
 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
 2 | 
 3 | {%- for message in messages -%}
 4 |     {%- if message['role'] == 'user' -%}
 5 |         {{- '<reserved_106>' + message['content'] -}}
 6 |     {%- elif message['role'] == 'assistant' -%}
 7 |         {{- '<reserved_107>' + message['content'] -}}
 8 |     {%- endif -%}
 9 | {%- endfor -%}
10 | 
11 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
12 |     {{- '<reserved_107>' -}}
13 | {% endif %}


--------------------------------------------------------------------------------
/examples/template_chatglm.jinja:
--------------------------------------------------------------------------------
 1 | {%- set counter = namespace(index=0) -%}
 2 | {%- for message in messages -%}
 3 |     {%- if message['role'] == 'user' -%}
 4 |         {{- '[Round ' + counter.index|string + ']\n问：' + message['content'] -}}
 5 |         {%- set counter.index = counter.index + 1 -%}
 6 |     {%- endif -%}
 7 |     {%- if message['role'] == 'assistant' -%}
 8 |         {{- '\n答：' + message['content'] -}}
 9 |         {%- if (loop.last and add_generation_prompt) or not loop.last -%}
10 |             {{- '\n' -}}
11 |         {%- endif -%}
12 |     {%- endif -%}
13 | {%- endfor -%}
14 | 
15 | 
16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
17 |     {{- '\n答：' -}}
18 | {%- endif -%}


--------------------------------------------------------------------------------
/examples/template_chatglm2.jinja:
--------------------------------------------------------------------------------
 1 | {%- set counter = namespace(index=1) -%}
 2 | {%- for message in messages -%}
 3 |     {%- if message['role'] == 'user' -%}
 4 |         {{- '[Round ' + counter.index|string + ']\n\n问：' + message['content'] -}}
 5 |         {%- set counter.index = counter.index + 1 -%}
 6 |     {%- endif -%}
 7 |     {%- if message['role'] == 'assistant' -%}
 8 |         {{- '\n\n答：' + message['content'] -}}
 9 |         {%- if (loop.last and add_generation_prompt) or not loop.last -%}
10 |             {{- '\n\n' -}}
11 |         {%- endif -%}
12 |     {%- endif -%}
13 | {%- endfor -%}
14 | 
15 | 
16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
17 |     {{- '\n\n答：' -}}
18 | {%- endif -%}


--------------------------------------------------------------------------------
/examples/template_chatml.jinja:
--------------------------------------------------------------------------------
1 | {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
2 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}


--------------------------------------------------------------------------------
/examples/template_falcon.jinja:
--------------------------------------------------------------------------------
 1 | {%- for message in messages -%}
 2 |     {%- if message['role'] == 'user' -%}
 3 |         {{- 'User: ' + message['content'] -}}
 4 |     {%- elif message['role'] == 'assistant' -%}
 5 |         {{- 'Assistant: ' + message['content'] -}}
 6 |     {%- endif -%}
 7 |     {%- if (loop.last and add_generation_prompt) or not loop.last -%}
 8 |         {{- '\n' -}}
 9 |     {%- endif -%}
10 | {%- endfor -%}
11 | 
12 | 
13 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
14 |     {{- 'Assistant:' -}}
15 | {% endif %}


--------------------------------------------------------------------------------
/examples/template_falcon_180b.jinja:
--------------------------------------------------------------------------------
 1 | {%- for message in messages -%}
 2 |     {%- if message['role'] == 'system' -%}
 3 |         {{- 'System: ' + message['content'] -}}
 4 |     {%- elif message['role'] == 'user' -%}
 5 |         {{- 'User: ' + message['content'] -}}
 6 |     {%- elif message['role'] == 'assistant' -%}
 7 |         {{- 'Falcon: ' + message['content'] -}}
 8 |     {%- endif -%}
 9 |     {%- if (loop.last and add_generation_prompt) or not loop.last -%}
10 |         {{- '\n' -}}
11 |     {%- endif -%}
12 | {%- endfor -%}
13 | 
14 | 
15 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
16 |     {{- 'Falcon:' -}}
17 | {% endif %}


--------------------------------------------------------------------------------
/examples/template_inkbot.jinja:
--------------------------------------------------------------------------------
 1 | <#meta#>
 2 | - Date: {{ (messages|selectattr('role', 'equalto', 'meta-current_date')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-current_date')|list) else '' }}
 3 | - Task: {{ (messages|selectattr('role', 'equalto', 'meta-task_name')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-task_name')|list) else '' }}
 4 | <#system#>
 5 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
 6 | <#chat#>
 7 | {% for message in messages %}
 8 | {% if message['role'] == 'user' %}
 9 | <#user#>
10 | {{ message['content']|trim -}}
11 | {% if not loop.last %}
12 | 
13 | {% endif %}
14 | {% elif message['role'] == 'assistant' %}
15 | <#bot#>
16 | {{ message['content']|trim -}}
17 | {% if not loop.last %}
18 | 
19 | {% endif %}
20 | {% elif message['role'] == 'user_context' %}
21 | <#user_context#>
22 | {{ message['content']|trim -}}
23 | {% if not loop.last %}
24 | 
25 | {% endif %}
26 | {% endif %}
27 | {% endfor %}
28 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
29 | <#bot#>
30 | {% endif %}


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | # Should be mirrored in requirements-build.txt
 3 | requires = [
 4 |     "cmake>=3.21",
 5 |     "ninja",
 6 |     "packaging",
 7 |     "setuptools >= 49.4.0",
 8 |     "wheel",
 9 | ]
10 | build-backend = "setuptools.build_meta"
11 | 
12 | [tool.ruff]
13 | # Allow lines to be as long as 80.
14 | line-length = 80
15 | exclude = [
16 |     # External file, leaving license intact
17 |     "examples/fp8/quantizer/quantize.py"
18 | ]
19 | 
20 | [tool.ruff.lint]
21 | select = [
22 |     # pycodestyle
23 |     "E",
24 |     # Pyflakes
25 |     "F",
26 |     # pyupgrade
27 |     # "UP",
28 |     # flake8-bugbear
29 |     "B",
30 |     # flake8-simplify
31 |     "SIM",
32 |     # isort
33 |     # "I",
34 |     "G",
35 | ]
36 | ignore = [
37 |     # star imports
38 |     "F405", "F403",
39 |     # lambda expression assignment
40 |     "E731",
41 |     # Loop control variable not used within loop body
42 |     "B007",
43 | ]
44 | 
45 | [tool.mypy]
46 | python_version = "3.8"
47 | 
48 | ignore_missing_imports = true
49 | check_untyped_defs = true
50 | follow_imports = "skip"
51 | 
52 | files = "vllm"
53 | # TODO(woosuk): Include the code from Megatron and HuggingFace.
54 | exclude = [
55 |     "vllm/model_executor/parallel_utils/|vllm/model_executor/models/",
56 |     # Ignore triton kernels in ops.
57 |     'vllm/attention/ops/.*\.py$'
58 | ]
59 | 
60 | [tool.codespell]
61 | ignore-words-list = "dout, te, indicies, subtile"
62 | skip = "./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build"
63 | 
64 | [tool.isort]
65 | use_parentheses = true
66 | skip_gitignore = true
67 | 
68 | [tool.pytest.ini_options]
69 | markers = [
70 |     "skip_global_cleanup",
71 |     "llm: run tests for vLLM API only",
72 |     "openai: run tests for OpenAI API only",
73 | ]
74 | 


--------------------------------------------------------------------------------
/requirements-build.txt:
--------------------------------------------------------------------------------
1 | # Should be mirrored in pyproject.toml
2 | cmake>=3.21
3 | ninja
4 | packaging
5 | setuptools>=49.4.0
6 | wheel
7 | 


--------------------------------------------------------------------------------
/requirements-common.txt:
--------------------------------------------------------------------------------
 1 | cmake >= 3.21
 2 | ninja  # For faster builds.
 3 | psutil
 4 | sentencepiece  # Required for LLaMA tokenizer.
 5 | numpy
 6 | requests
 7 | py-cpuinfo
 8 | transformers >= 4.40.0  # Required for StarCoder2 & Llava, Llama 3.
 9 | tokenizers >= 0.19.1  # Required for Llama 3.
10 | fastapi
11 | aiohttp
12 | openai
13 | uvicorn[standard]
14 | pydantic >= 2.0  # Required for OpenAI server.
15 | pillow  # Required for image processing
16 | prometheus_client >= 0.18.0
17 | prometheus-fastapi-instrumentator >= 7.0.0
18 | tiktoken >= 0.6.0  # Required for DBRX tokenizer
19 | lm-format-enforcer == 0.10.1
20 | outlines == 0.0.34 # Requires torch >= 2.1.0
21 | typing_extensions
22 | filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
23 | 


--------------------------------------------------------------------------------
/requirements-cpu.txt:
--------------------------------------------------------------------------------
1 | # Common dependencies
2 | -r requirements-common.txt
3 | 
4 | # Dependencies for x86_64 CPUs
5 | torch == 2.3.0+cpu
6 | triton >= 2.2.0  # FIXME(woosuk): This is a hack to avoid import error.


--------------------------------------------------------------------------------
/requirements-cuda.txt:
--------------------------------------------------------------------------------
1 | # Common dependencies
2 | -r requirements-common.txt
3 | 
4 | # Dependencies for NVIDIA GPUs
5 | ray >= 2.9
6 | nvidia-ml-py # for pynvml package
7 | 


--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
 1 | # formatting
 2 | yapf==0.32.0
 3 | toml==0.10.2
 4 | tomli==2.0.1
 5 | ruff==0.1.5
 6 | codespell==2.2.6
 7 | isort==5.13.2
 8 | clang-format==18.1.5
 9 | 
10 | # type checking
11 | mypy==1.9.0
12 | types-PyYAML
13 | types-requests
14 | types-setuptools
15 | 
16 | # testing
17 | pytest
18 | tensorizer>=2.9.0
19 | pytest-forked
20 | pytest-asyncio
21 | pytest-rerunfailures
22 | pytest-shard
23 | 
24 | # testing utils
25 | awscli
26 | einops # required for MPT
27 | httpx
28 | peft
29 | requests
30 | ray
31 | sentence-transformers # required for embedding
32 | 
33 | # Benchmarking
34 | aiohttp
35 | 
36 | # quantization
37 | bitsandbytes==0.42.0
38 | 


--------------------------------------------------------------------------------
/requirements-neuron.txt:
--------------------------------------------------------------------------------
1 | # Common dependencies
2 | -r requirements-common.txt
3 | 
4 | # Dependencies for Neuron devices
5 | transformers-neuronx >= 0.9.0
6 | torch-neuronx >= 2.1.0
7 | neuronx-cc
8 | 


--------------------------------------------------------------------------------
/requirements-rocm.txt:
--------------------------------------------------------------------------------
1 | # Common dependencies
2 | -r requirements-common.txt
3 | 
4 | # Dependencies for AMD GPUs
5 | ray >= 2.10.0
6 | pytest-asyncio
7 | 


--------------------------------------------------------------------------------
/rocm_patch/rocm_bf16.patch:
--------------------------------------------------------------------------------
 1 | --- amd_hip_bf16.h	2024-02-06 18:28:58.268699142 +0000
 2 | +++ amd_hip_bf16.h.new	2024-02-06 18:28:31.988647133 +0000
 3 | @@ -90,10 +90,10 @@
 4 |  #include "math_fwd.h"              // ocml device functions
 5 |  
 6 |  #if defined(__HIPCC_RTC__)
 7 | -#define __HOST_DEVICE__ __device__
 8 | +#define __HOST_DEVICE__ __device__ static
 9 |  #else
10 |  #include <climits>
11 | -#define __HOST_DEVICE__ __host__ __device__
12 | +#define __HOST_DEVICE__ __host__ __device__ static inline
13 |  #endif
14 |  
15 |  // Since we are using unsigned short to represent data in bfloat16, it can be of different sizes on
16 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/__init__.py


--------------------------------------------------------------------------------
/tests/async_engine/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/async_engine/__init__.py


--------------------------------------------------------------------------------
/tests/async_engine/api_server_async_engine.py:
--------------------------------------------------------------------------------
 1 | """vllm.entrypoints.api_server with some extra logging for testing."""
 2 | import argparse
 3 | from typing import Any, Dict
 4 | 
 5 | import uvicorn
 6 | from fastapi.responses import JSONResponse, Response
 7 | 
 8 | import vllm.entrypoints.api_server
 9 | from vllm.engine.arg_utils import AsyncEngineArgs
10 | from vllm.engine.async_llm_engine import AsyncLLMEngine
11 | 
12 | app = vllm.entrypoints.api_server.app
13 | 
14 | 
15 | class AsyncLLMEngineWithStats(AsyncLLMEngine):
16 | 
17 |     def __init__(self, *args, **kwargs):
18 |         super().__init__(*args, **kwargs)
19 |         self._num_aborts = 0
20 | 
21 |     async def abort(self, request_id: str) -> None:
22 |         await super().abort(request_id)
23 |         self._num_aborts += 1
24 | 
25 |     def testing_stats(self) -> Dict[str, Any]:
26 |         return {"num_aborted_requests": self._num_aborts}
27 | 
28 | 
29 | @app.get("/stats")
30 | def stats() -> Response:
31 |     """Get the statistics of the engine."""
32 |     return JSONResponse(engine.testing_stats())
33 | 
34 | 
35 | if __name__ == "__main__":
36 |     parser = argparse.ArgumentParser()
37 |     parser.add_argument("--host", type=str, default="localhost")
38 |     parser.add_argument("--port", type=int, default=8000)
39 |     parser = AsyncEngineArgs.add_cli_args(parser)
40 |     args = parser.parse_args()
41 | 
42 |     engine_args = AsyncEngineArgs.from_cli_args(args)
43 |     engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
44 |     vllm.entrypoints.api_server.engine = engine
45 |     uvicorn.run(
46 |         app,
47 |         host=args.host,
48 |         port=args.port,
49 |         log_level="debug",
50 |         timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE)
51 | 


--------------------------------------------------------------------------------
/tests/basic_correctness/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/basic_correctness/__init__.py


--------------------------------------------------------------------------------
/tests/basic_correctness/test_basic_correctness.py:
--------------------------------------------------------------------------------
 1 | """Compare the short outputs of HF and vLLM when using greedy sampling.
 2 | 
 3 | Run `pytest tests/basic_correctness/test_basic_correctness.py`.
 4 | """
 5 | import os
 6 | import weakref
 7 | 
 8 | import pytest
 9 | 
10 | from vllm import LLM
11 | 
12 | MODELS = [
13 |     "facebook/opt-125m",
14 |     "meta-llama/Llama-2-7b-hf",
15 | ]
16 | VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND"
17 | 
18 | 
19 | def test_vllm_gc_ed():
20 |     """Verify vllm instance is GC'ed when it is deleted"""
21 |     llm = LLM("facebook/opt-125m")
22 |     weak_llm = weakref.ref(llm)
23 |     del llm
24 |     # If there's any circular reference to vllm, this fails
25 |     # because llm instance is not GC'ed.
26 |     assert weak_llm() is None
27 | 
28 | 
29 | @pytest.mark.parametrize("model", MODELS)
30 | @pytest.mark.parametrize("dtype", ["half"])
31 | @pytest.mark.parametrize("max_tokens", [5])
32 | @pytest.mark.parametrize("enforce_eager", [False, True])
33 | def test_models(
34 |     hf_runner,
35 |     vllm_runner,
36 |     example_prompts,
37 |     model: str,
38 |     dtype: str,
39 |     max_tokens: int,
40 |     enforce_eager: bool,
41 | ) -> None:
42 |     backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
43 |     if backend_by_env_var == "FLASHINFER" and enforce_eager is False:
44 |         pytest.skip("Skipping non-eager test for FlashInferBackend.")
45 | 
46 |     hf_model = hf_runner(model, dtype=dtype)
47 |     hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
48 |     del hf_model
49 | 
50 |     vllm_model = vllm_runner(model,
51 |                              dtype=dtype,
52 |                              enforce_eager=enforce_eager,
53 |                              gpu_memory_utilization=0.7)
54 |     vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
55 |     del vllm_model
56 | 
57 |     for i in range(len(example_prompts)):
58 |         hf_output_ids, hf_output_str = hf_outputs[i]
59 |         vllm_output_ids, vllm_output_str = vllm_outputs[i]
60 |         assert hf_output_str == vllm_output_str, (
61 |             f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
62 |         assert hf_output_ids == vllm_output_ids, (
63 |             f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
64 | 


--------------------------------------------------------------------------------
/tests/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/core/__init__.py


--------------------------------------------------------------------------------
/tests/core/block/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/core/block/__init__.py


--------------------------------------------------------------------------------
/tests/core/block/conftest.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | 
 4 | @pytest.fixture()
 5 | def should_do_global_cleanup_after_test() -> bool:
 6 |     """Disable the global cleanup fixture for tests in this directory. This
 7 |     provides a ~10x speedup for unit tests that don't load a model to GPU.
 8 | 
 9 |     This requires that tests in this directory clean up after themselves if they
10 |     use the GPU.
11 |     """
12 |     return False
13 | 


--------------------------------------------------------------------------------
/tests/core/block/e2e/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/core/block/e2e/__init__.py


--------------------------------------------------------------------------------
/tests/core/block/test_common.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | import pytest
 4 | 
 5 | from vllm.core.block.common import RefCounter
 6 | 
 7 | 
 8 | @pytest.mark.parametrize("seed", list(range(20)))
 9 | @pytest.mark.parametrize("num_incrs", [1, 100])
10 | @pytest.mark.parametrize("num_blocks", [1024])
11 | def test_incr(seed: int, num_incrs: int, num_blocks: int):
12 |     random.seed(seed)
13 | 
14 |     all_block_indices = list(range(num_blocks))
15 |     counter = RefCounter(all_block_indices=all_block_indices)
16 | 
17 |     block_id = random.randint(0, num_blocks - 1)
18 |     for i in range(num_incrs):
19 |         value = counter.incr(block_id)
20 |         assert value == i + 1
21 | 
22 | 
23 | @pytest.mark.parametrize("seed", list(range(20)))
24 | @pytest.mark.parametrize("num_incrs", [1, 100])
25 | @pytest.mark.parametrize("num_blocks", [1024])
26 | def test_incr_decr(seed: int, num_incrs: int, num_blocks: int):
27 |     random.seed(seed)
28 | 
29 |     all_block_indices = list(range(num_blocks))
30 |     counter = RefCounter(all_block_indices=all_block_indices)
31 | 
32 |     block_id = random.randint(0, num_blocks - 1)
33 |     for i in range(num_incrs):
34 |         value = counter.incr(block_id)
35 |         assert value == i + 1
36 | 
37 |     for i in range(num_incrs):
38 |         value = counter.decr(block_id)
39 |         assert value == num_incrs - (i + 1)
40 | 
41 |     with pytest.raises(AssertionError):
42 |         counter.decr(block_id)
43 | 


--------------------------------------------------------------------------------
/tests/distributed/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/distributed/__init__.py


--------------------------------------------------------------------------------
/tests/engine/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/engine/__init__.py


--------------------------------------------------------------------------------
/tests/engine/output_processor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/engine/output_processor/__init__.py


--------------------------------------------------------------------------------
/tests/engine/test_computed_prefix_blocks.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from vllm.engine.arg_utils import EngineArgs
 4 | from vllm.engine.llm_engine import LLMEngine
 5 | from vllm.sampling_params import SamplingParams
 6 | 
 7 | 
 8 | @pytest.mark.parametrize("model", ["facebook/opt-125m"])
 9 | @pytest.mark.parametrize("block_size", [16])
10 | def test_computed_prefix_blocks(model: str, block_size: int):
11 |     # This test checks if we are able to run the engine to completion
12 |     # without triggering asserts.
13 |     # We are in a scenario where all blocks from the second request's prompt
14 |     # are full and already computed when the second request arrives.
15 |     prompt = (
16 |         "You are a helpful assistant. How do I build a car from cardboard and "
17 |         "paper clips? Is there an easy to follow video tutorial available "
18 |         "online for free?")
19 |     prompt2 = (
20 |         " Please recommend to me some resources where I can learn not only to "
21 |         "handle technical difficulties of building a car, but also "
22 |         "decoration.")
23 | 
24 |     engine_args = EngineArgs(model=model,
25 |                              block_size=block_size,
26 |                              enable_prefix_caching=True)
27 | 
28 |     engine = LLMEngine.from_engine_args(engine_args)
29 |     sampling_params = SamplingParams()
30 | 
31 |     engine.add_request("0", prompt + prompt2, sampling_params)
32 |     engine.step()
33 |     engine.add_request("1", prompt, sampling_params)
34 |     engine.step()
35 | 


--------------------------------------------------------------------------------
/tests/engine/test_detokenization.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from vllm.entrypoints.llm import LLM
 4 | from vllm.sampling_params import SamplingParams
 5 | 
 6 | 
 7 | @pytest.mark.parametrize("model", ["facebook/opt-125m"])
 8 | def test_computed_prefix_blocks(model: str):
 9 |     # This test checks if the engine generates completions both with and
10 |     # without optional detokenization, that detokenization includes text
11 |     # and no-detokenization doesn't, and that both completions have the same
12 |     # token_ids.
13 |     prompt = (
14 |         "You are a helpful assistant. How do I build a car from cardboard and "
15 |         "paper clips? Is there an easy to follow video tutorial available "
16 |         "online for free?")
17 | 
18 |     llm = LLM(model=model)
19 |     sampling_params = SamplingParams(max_tokens=10,
20 |                                      temperature=0.0,
21 |                                      detokenize=False)
22 | 
23 |     outputs_no_detokenization = llm.generate(prompt,
24 |                                              sampling_params)[0].outputs[0]
25 |     sampling_params.detokenize = True
26 |     outputs_with_detokenization = llm.generate(prompt,
27 |                                                sampling_params)[0].outputs[0]
28 | 
29 |     assert outputs_no_detokenization.text == ''
30 |     assert outputs_with_detokenization.text != ''
31 |     assert outputs_no_detokenization.token_ids == \
32 |         outputs_with_detokenization.token_ids
33 | 


--------------------------------------------------------------------------------
/tests/engine/test_skip_tokenizer_init.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from vllm.entrypoints.llm import LLM
 4 | from vllm.sampling_params import SamplingParams
 5 | 
 6 | 
 7 | @pytest.mark.parametrize("model", ["facebook/opt-125m"])
 8 | def test_skip_tokenizer_initialization(model: str):
 9 |     # This test checks if the flag skip_tokenizer_init skips the initialization
10 |     # of tokenizer and detokenizer. The generated output is expected to contain
11 |     # token ids.
12 |     llm = LLM(model=model, skip_tokenizer_init=True)
13 |     sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
14 |     with pytest.raises(ValueError) as err:
15 |         llm.generate("abc", sampling_params)
16 |     assert "prompts must be None if" in str(err.value)
17 |     outputs = llm.generate({"prompt_token_ids": [1, 2, 3]},
18 |                            sampling_params=sampling_params)
19 |     assert len(outputs) > 0
20 |     completions = outputs[0].outputs
21 |     assert len(completions) > 0
22 |     assert completions[0].text == ""
23 |     assert completions[0].token_ids
24 | 


--------------------------------------------------------------------------------
/tests/entrypoints/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/entrypoints/__init__.py


--------------------------------------------------------------------------------
/tests/entrypoints/openai/test_serving_chat.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from dataclasses import dataclass
 3 | 
 4 | import pytest
 5 | 
 6 | from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 7 | 
 8 | MODEL_NAME = "openai-community/gpt2"
 9 | CHAT_TEMPLATE = "Dummy chat template for testing {}"
10 | 
11 | pytestmark = pytest.mark.openai
12 | 
13 | 
14 | @dataclass
15 | class MockModelConfig:
16 |     tokenizer = MODEL_NAME
17 |     trust_remote_code = False
18 |     tokenizer_mode = "auto"
19 |     max_model_len = 100
20 |     tokenizer_revision = None
21 |     embedding_mode = False
22 | 
23 | 
24 | @dataclass
25 | class MockEngine:
26 | 
27 |     async def get_model_config(self):
28 |         return MockModelConfig()
29 | 
30 | 
31 | async def _async_serving_chat_init():
32 |     engine = MockEngine()
33 |     model_config = await engine.get_model_config()
34 | 
35 |     serving_completion = OpenAIServingChat(engine,
36 |                                            model_config,
37 |                                            served_model_names=[MODEL_NAME],
38 |                                            response_role="assistant",
39 |                                            chat_template=CHAT_TEMPLATE)
40 |     return serving_completion
41 | 
42 | 
43 | def test_async_serving_chat_init():
44 |     serving_completion = asyncio.run(_async_serving_chat_init())
45 |     assert serving_completion.tokenizer is not None
46 |     assert serving_completion.tokenizer.chat_template == CHAT_TEMPLATE
47 | 


--------------------------------------------------------------------------------
/tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "model_type": "llama",
 3 |     "kv_cache": {
 4 |         "dtype": "float8_e4m3fn",
 5 |         "scaling_factor": {
 6 |             "0": {
 7 |                 "0": 0.0152239128947258,
 8 |                 "1": 0.0188860222697258,
 9 |                 "2": 0.0354178324341774,
10 |                 "3": 0.0376674123108387,
11 |                 "4": 0.0418526791036129,
12 |                 "5": 0.0433175228536129,
13 |                 "6": 0.0397600457072258,
14 |                 "7": 0.0424455925822258,
15 |                 "8": 0.0415387861430645,
16 |                 "9": 0.0408412404358387,
17 |                 "10": 0.0395856611430645,
18 |                 "11": 0.0377371683716774,
19 |                 "12": 0.0400739423930645,
20 |                 "13": 0.040771484375,
21 |                 "14": 0.0393415205180645,
22 |                 "15": 0.0369001142680645,
23 |                 "16": 0.03857421875,
24 |                 "17": 0.0387486070394516,
25 |                 "18": 0.0403180830180645,
26 |                 "19": 0.0396205373108387,
27 |                 "20": 0.0375627800822258,
28 |                 "21": 0.0407366082072258,
29 |                 "22": 0.0432477705180645,
30 |                 "23": 0.0377022884786129,
31 |                 "24": 0.0399693101644516,
32 |                 "25": 0.0374581478536129,
33 |                 "26": 0.0413295216858387,
34 |                 "27": 0.0442243330180645,
35 |                 "28": 0.0424804724752903,
36 |                 "29": 0.0456891767680645,
37 |                 "30": 0.0409109964966774,
38 |                 "31": 0.0482352152466774
39 |             }
40 |         }
41 |     }
42 | }
43 | 


--------------------------------------------------------------------------------
/tests/kernels/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/kernels/__init__.py


--------------------------------------------------------------------------------
/tests/kernels/allclose_default.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | # Reference default values of atol and rtol are from
 4 | # https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67
 5 | default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5}
 6 | default_rtol = {
 7 |     torch.float16: 1e-3,
 8 |     torch.bfloat16: 1.6e-2,
 9 |     torch.float: 1.3e-6
10 | }
11 | 
12 | 
13 | def get_default_atol(output) -> float:
14 |     return default_atol[output.dtype]
15 | 
16 | 
17 | def get_default_rtol(output) -> float:
18 |     return default_rtol[output.dtype]
19 | 


--------------------------------------------------------------------------------
/tests/kernels/conftest.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from vllm.utils import (create_kv_caches_with_random,
 4 |                         create_kv_caches_with_random_flash)
 5 | 
 6 | 
 7 | @pytest.fixture()
 8 | def kv_cache_factory():
 9 |     return create_kv_caches_with_random
10 | 
11 | 
12 | @pytest.fixture()
13 | def kv_cache_factory_flashinfer():
14 |     return create_kv_caches_with_random_flash
15 | 


--------------------------------------------------------------------------------
/tests/kernels/test_int8_quant.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import torch
 3 | 
 4 | from vllm._C import ops
 5 | 
 6 | DTYPES = [torch.half, torch.bfloat16, torch.float]
 7 | HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 8192]  # Arbitrary values for testing
 8 | NUM_TOKENS = [1, 7, 83, 4096]  # Arbitrary values for testing
 9 | SEEDS = [0]
10 | SCALE = [0.1, 0.5, 0.8, 1.2, 2.1]
11 | 
12 | 
13 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
14 | @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
15 | @pytest.mark.parametrize("dtype", DTYPES)
16 | @pytest.mark.parametrize("seed", SEEDS)
17 | @pytest.mark.parametrize("scale", SCALE)
18 | @torch.inference_mode()
19 | def test_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype,
20 |                seed: int, scale: float) -> None:
21 |     torch.random.manual_seed(seed)
22 |     torch.cuda.manual_seed(seed)
23 |     x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
24 | 
25 |     out1 = (x / scale).round().clamp(
26 |         torch.iinfo(torch.int8).min,
27 |         torch.iinfo(torch.int8).max).to(torch.int8)
28 |     out2 = torch.empty_like(x, dtype=torch.int8)
29 |     ops.static_scaled_int8_quant(out2, x, scale)
30 |     assert torch.allclose(out1, out2,
31 |                           atol=1)  # big atol to account for rounding errors
32 | 


--------------------------------------------------------------------------------
/tests/kernels/test_layernorm.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import torch
 3 | 
 4 | from vllm.model_executor.layers.layernorm import RMSNorm
 5 | 
 6 | DTYPES = [torch.half, torch.bfloat16, torch.float]
 7 | NUM_TOKENS = [7, 83, 4096]  # Arbitrary values for testing
 8 | HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192,
 9 |                 8199]  # Arbitrary values for testing
10 | ADD_RESIDUAL = [False, True]
11 | SEEDS = [0]
12 | CUDA_DEVICES = [
13 |     f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
14 | ]
15 | 
16 | 
17 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
18 | @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
19 | @pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
20 | @pytest.mark.parametrize("dtype", DTYPES)
21 | @pytest.mark.parametrize("seed", SEEDS)
22 | @pytest.mark.parametrize("device", CUDA_DEVICES)
23 | @torch.inference_mode()
24 | def test_rms_norm(
25 |     num_tokens: int,
26 |     hidden_size: int,
27 |     add_residual: bool,
28 |     dtype: torch.dtype,
29 |     seed: int,
30 |     device: str,
31 | ) -> None:
32 |     torch.random.manual_seed(seed)
33 |     if torch.cuda.is_available():
34 |         torch.cuda.manual_seed(seed)
35 |     torch.set_default_device(device)
36 |     layer = RMSNorm(hidden_size).to(dtype=dtype)
37 |     layer.weight.data.normal_(mean=1.0, std=0.1)
38 |     scale = 1 / (2 * hidden_size)
39 |     x = torch.randn(num_tokens, hidden_size, dtype=dtype)
40 |     x *= scale
41 |     residual = torch.randn_like(x) * scale if add_residual else None
42 | 
43 |     # NOTE(woosuk): The reference implementation should be executed first
44 |     # because the custom kernel is in-place.
45 |     ref_out = layer._forward(x, residual)
46 |     out = layer(x, residual)
47 |     # NOTE(woosuk): LayerNorm operators (including RMS) typically have larger
48 |     # numerical errors than other operators because they involve reductions.
49 |     # Therefore, we use a larger tolerance.
50 |     if add_residual:
51 |         assert torch.allclose(out[0], ref_out[0], atol=1e-2, rtol=1e-2)
52 |         assert torch.allclose(out[1], ref_out[1], atol=1e-2, rtol=1e-2)
53 |     else:
54 |         assert torch.allclose(out, ref_out, atol=1e-2, rtol=1e-2)
55 | 


--------------------------------------------------------------------------------
/tests/kernels/test_rand.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | import pytest
 4 | import torch
 5 | 
 6 | from vllm.model_executor.layers.ops.rand import seeded_uniform
 7 | from vllm.model_executor.utils import set_random_seed
 8 | 
 9 | 
10 | @pytest.mark.parametrize("dtype",
11 |                          [torch.float32, torch.float16, torch.bfloat16])
12 | @pytest.mark.parametrize("use_3d", [True, False])
13 | def test_seeded_uniform(dtype: torch.dtype, use_3d: bool):
14 |     device = "cuda"
15 |     for seed in range(512):
16 |         set_random_seed(seed)
17 |         rows = random.randint(1, 512)
18 |         cols = random.randint(1, 64000)
19 |         if use_3d:
20 |             third_dim = random.randint(2, 10)
21 |             dims = [rows, third_dim, cols]
22 |         else:
23 |             dims = [rows, cols]
24 |         seeds = torch.randint(torch.iinfo(torch.long).min,
25 |                               torch.iinfo(torch.long).max, (rows, ),
26 |                               device=device)
27 | 
28 |         # Test that the same seed produces the same output
29 |         out = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device)
30 |         out2 = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device)
31 |         torch.testing.assert_close(out, out2)
32 |         # del to save memory
33 |         del out2
34 | 
35 |         out3 = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device)
36 |         torch.testing.assert_close(out, out3)
37 |         # del to save memory
38 |         del out3
39 | 
40 |         # Initialize out tensor with garbage to ensure that it is overwritten
41 |         out_with_tensor = seeded_uniform(
42 |             *dims,
43 |             out=torch.full(
44 |                 (*dims, ),
45 |                 -1,
46 |                 dtype=dtype,
47 |                 device=device,
48 |             ),
49 |             seeds=seeds,
50 |             dtype=dtype,
51 |         )
52 |         torch.testing.assert_close(out, out_with_tensor)
53 | 


--------------------------------------------------------------------------------
/tests/lora/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/lora/__init__.py


--------------------------------------------------------------------------------
/tests/lora/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/lora/data/__init__.py


--------------------------------------------------------------------------------
/tests/lora/test_gemma.py:
--------------------------------------------------------------------------------
 1 | import vllm
 2 | from vllm.lora.request import LoRARequest
 3 | 
 4 | MODEL_PATH = "google/gemma-7b"
 5 | 
 6 | 
 7 | def do_sample(llm, lora_path: str, lora_id: int) -> str:
 8 |     prompts = [
 9 |         "Quote: Imagination is",
10 |         "Quote: Be yourself;",
11 |         "Quote: So many books,",
12 |     ]
13 |     sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
14 |     outputs = llm.generate(
15 |         prompts,
16 |         sampling_params,
17 |         lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
18 |         if lora_id else None)
19 |     # Print the outputs.
20 |     generated_texts = []
21 |     for output in outputs:
22 |         prompt = output.prompt
23 |         generated_text = output.outputs[0].text.strip()
24 |         generated_texts.append(generated_text)
25 |         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
26 |     return generated_texts
27 | 
28 | 
29 | def test_gemma_lora(gemma_lora_files):
30 |     llm = vllm.LLM(MODEL_PATH,
31 |                    max_model_len=1024,
32 |                    enable_lora=True,
33 |                    max_loras=4)
34 | 
35 |     expected_lora_output = [
36 |         "more important than knowledge.\nAuthor: Albert Einstein\n",
37 |         "everyone else is already taken.\nAuthor: Oscar Wilde\n",
38 |         "so little time\nAuthor: Frank Zappa\n",
39 |     ]
40 | 
41 |     output1 = do_sample(llm, gemma_lora_files, lora_id=1)
42 |     for i in range(len(expected_lora_output)):
43 |         assert output1[i].startswith(expected_lora_output[i])
44 |     output2 = do_sample(llm, gemma_lora_files, lora_id=2)
45 |     for i in range(len(expected_lora_output)):
46 |         assert output2[i].startswith(expected_lora_output[i])
47 | 


--------------------------------------------------------------------------------
/tests/metrics/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/metrics/__init__.py


--------------------------------------------------------------------------------
/tests/model_executor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/model_executor/__init__.py


--------------------------------------------------------------------------------
/tests/model_executor/weight_utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import tempfile
 3 | 
 4 | import huggingface_hub.constants
 5 | import pytest
 6 | from huggingface_hub.utils import LocalEntryNotFoundError
 7 | 
 8 | from vllm.model_executor.model_loader.weight_utils import (
 9 |     download_weights_from_hf, enable_hf_transfer)
10 | 
11 | 
12 | def test_hf_transfer_auto_activation():
13 |     if "HF_HUB_ENABLE_HF_TRANSFER" in os.environ:
14 |         # in case it is already set, we can't test the auto activation
15 |         pytest.skip(
16 |             "HF_HUB_ENABLE_HF_TRANSFER is set, can't test auto activation")
17 |     enable_hf_transfer()
18 |     try:
19 |         # enable hf hub transfer if available
20 |         import hf_transfer  # type: ignore # noqa
21 |         HF_TRANFER_ACTIVE = True
22 |     except ImportError:
23 |         HF_TRANFER_ACTIVE = False
24 |     assert (huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER ==
25 |             HF_TRANFER_ACTIVE)
26 | 
27 | 
28 | def test_download_weights_from_hf():
29 |     with tempfile.TemporaryDirectory() as tmpdir:
30 |         # assert LocalEntryNotFoundError error is thrown
31 |         # if offline is set and model is not cached
32 |         huggingface_hub.constants.HF_HUB_OFFLINE = True
33 |         with pytest.raises(LocalEntryNotFoundError):
34 |             download_weights_from_hf("facebook/opt-125m",
35 |                                      allow_patterns=["*.safetensors", "*.bin"],
36 |                                      cache_dir=tmpdir)
37 | 
38 |         # download the model
39 |         huggingface_hub.constants.HF_HUB_OFFLINE = False
40 |         download_weights_from_hf("facebook/opt-125m",
41 |                                  allow_patterns=["*.safetensors", "*.bin"],
42 |                                  cache_dir=tmpdir)
43 | 
44 |         # now it should work offline
45 |         huggingface_hub.constants.HF_HUB_OFFLINE = True
46 |         assert download_weights_from_hf(
47 |             "facebook/opt-125m",
48 |             allow_patterns=["*.safetensors", "*.bin"],
49 |             cache_dir=tmpdir) is not None
50 | 
51 | 
52 | if __name__ == "__main__":
53 |     test_hf_transfer_auto_activation()
54 |     test_download_weights_from_hf()
55 | 


--------------------------------------------------------------------------------
/tests/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/models/__init__.py


--------------------------------------------------------------------------------
/tests/models/test_big_models.py:
--------------------------------------------------------------------------------
 1 | """Compare the outputs of HF and vLLM when using greedy sampling.
 2 | 
 3 | This tests bigger models and use half precision.
 4 | 
 5 | Run `pytest tests/models/test_big_models.py`.
 6 | """
 7 | import pytest
 8 | 
 9 | MODELS = [
10 |     "meta-llama/Llama-2-7b-hf",
11 |     # "mistralai/Mistral-7B-v0.1",  # Tested by test_mistral.py
12 |     # "Deci/DeciLM-7b",  # Broken
13 |     # "tiiuae/falcon-7b",  # Broken
14 |     "EleutherAI/gpt-j-6b",
15 |     # "mosaicml/mpt-7b",  # Broken
16 |     # "Qwen/Qwen1.5-0.5B"  # Broken,
17 | ]
18 | 
19 | 
20 | @pytest.mark.parametrize("model", MODELS)
21 | @pytest.mark.parametrize("dtype", ["half"])
22 | @pytest.mark.parametrize("max_tokens", [32])
23 | def test_models(
24 |     hf_runner,
25 |     vllm_runner,
26 |     example_prompts,
27 |     model: str,
28 |     dtype: str,
29 |     max_tokens: int,
30 | ) -> None:
31 |     hf_model = hf_runner(model, dtype=dtype)
32 |     hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
33 |     del hf_model
34 | 
35 |     vllm_model = vllm_runner(model, dtype=dtype)
36 |     vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
37 |     del vllm_model
38 | 
39 |     for i in range(len(example_prompts)):
40 |         hf_output_ids, hf_output_str = hf_outputs[i]
41 |         vllm_output_ids, vllm_output_str = vllm_outputs[i]
42 |         assert hf_output_str == vllm_output_str, (
43 |             f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
44 |         assert hf_output_ids == vllm_output_ids, (
45 |             f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
46 | 
47 | 
48 | @pytest.mark.parametrize("model", MODELS)
49 | @pytest.mark.parametrize("dtype", ["half"])
50 | def test_model_print(
51 |     vllm_runner,
52 |     model: str,
53 |     dtype: str,
54 | ) -> None:
55 |     vllm_model = vllm_runner(model, dtype=dtype)
56 |     # This test is for verifying whether the model's extra_repr
57 |     # can be printed correctly.
58 |     print(vllm_model.model.llm_engine.model_executor.driver_worker.
59 |           model_runner.model)
60 |     del vllm_model
61 | 


--------------------------------------------------------------------------------
/tests/models/test_embedding.py:
--------------------------------------------------------------------------------
 1 | """Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
 2 | 
 3 | Run `pytest tests/models/test_llama_embedding.py`.
 4 | """
 5 | import pytest
 6 | import torch
 7 | import torch.nn.functional as F
 8 | 
 9 | MODELS = [
10 |     "intfloat/e5-mistral-7b-instruct",
11 | ]
12 | 
13 | 
14 | def compare_embeddings(embeddings1, embeddings2):
15 |     similarities = [
16 |         F.cosine_similarity(torch.tensor(e1), torch.tensor(e2), dim=0)
17 |         for e1, e2 in zip(embeddings1, embeddings2)
18 |     ]
19 |     return similarities
20 | 
21 | 
22 | @pytest.mark.parametrize("model", MODELS)
23 | @pytest.mark.parametrize("dtype", ["half"])
24 | def test_models(
25 |     hf_runner,
26 |     vllm_runner,
27 |     example_prompts,
28 |     model: str,
29 |     dtype: str,
30 | ) -> None:
31 |     hf_model = hf_runner(model, dtype=dtype)
32 |     hf_outputs = hf_model.encode(example_prompts)
33 |     del hf_model
34 | 
35 |     vllm_model = vllm_runner(model, dtype=dtype)
36 |     vllm_outputs = vllm_model.encode(example_prompts)
37 |     del vllm_model
38 | 
39 |     similarities = compare_embeddings(hf_outputs, vllm_outputs)
40 |     all_similarities = torch.stack(similarities)
41 |     tolerance = 1e-2
42 |     assert torch.all((all_similarities <= 1.0 + tolerance)
43 |                      & (all_similarities >= 1.0 - tolerance)
44 |                      ), f"Not all values are within {tolerance} of 1.0"
45 | 


--------------------------------------------------------------------------------
/tests/models/test_mistral.py:
--------------------------------------------------------------------------------
 1 | """Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
 2 | 
 3 | Run `pytest tests/models/test_mistral.py`.
 4 | """
 5 | import pytest
 6 | 
 7 | from .utils import check_logprobs_close
 8 | 
 9 | MODELS = [
10 |     "mistralai/Mistral-7B-Instruct-v0.1",
11 |     "mistralai/Mistral-7B-Instruct-v0.3",
12 | ]
13 | 
14 | 
15 | @pytest.mark.parametrize("model", MODELS)
16 | @pytest.mark.parametrize("dtype", ["bfloat16"])
17 | @pytest.mark.parametrize("max_tokens", [64])
18 | @pytest.mark.parametrize("num_logprobs", [5])
19 | def test_models(
20 |     hf_runner,
21 |     vllm_runner,
22 |     example_prompts,
23 |     model: str,
24 |     dtype: str,
25 |     max_tokens: int,
26 |     num_logprobs: int,
27 | ) -> None:
28 |     # TODO(sang): Sliding window should be tested separately.
29 |     hf_model = hf_runner(model, dtype=dtype)
30 |     hf_outputs = hf_model.generate_greedy_logprobs_limit(
31 |         example_prompts, max_tokens, num_logprobs)
32 |     del hf_model
33 | 
34 |     vllm_model = vllm_runner(model, dtype=dtype)
35 |     vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts,
36 |                                                        max_tokens,
37 |                                                        num_logprobs)
38 |     del vllm_model
39 |     check_logprobs_close(
40 |         outputs_0_lst=hf_outputs,
41 |         outputs_1_lst=vllm_outputs,
42 |         name_0="hf",
43 |         name_1="vllm",
44 |     )
45 | 


--------------------------------------------------------------------------------
/tests/models/test_oot_registration.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | from vllm import LLM, ModelRegistry, SamplingParams
 4 | from vllm.model_executor.models.opt import OPTForCausalLM
 5 | from vllm.model_executor.sampling_metadata import SamplingMetadata
 6 | 
 7 | 
 8 | class MyOPTForCausalLM(OPTForCausalLM):
 9 | 
10 |     def compute_logits(self, hidden_states: torch.Tensor,
11 |                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
12 |         # this dummy model always predicts the first token
13 |         logits = super().compute_logits(hidden_states, sampling_metadata)
14 |         logits.zero_()
15 |         logits[:, 0] += 1.0
16 |         return logits
17 | 
18 | 
19 | def test_oot_registration():
20 |     # register our dummy model
21 |     ModelRegistry.register_model("OPTForCausalLM", MyOPTForCausalLM)
22 |     prompts = ["Hello, my name is", "The text does not matter"]
23 |     sampling_params = SamplingParams(temperature=0)
24 |     llm = LLM(model="facebook/opt-125m")
25 |     first_token = llm.get_tokenizer().decode(0)
26 |     outputs = llm.generate(prompts, sampling_params)
27 | 
28 |     for output in outputs:
29 |         generated_text = output.outputs[0].text
30 |         # make sure only the first token is generated
31 |         rest = generated_text.replace(first_token, "")
32 |         assert rest == ""
33 | 


--------------------------------------------------------------------------------
/tests/models/test_registry.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from vllm.model_executor.models import _MODELS, ModelRegistry
 4 | 
 5 | 
 6 | @pytest.mark.parametrize("model_cls", _MODELS)
 7 | def test_registry_imports(model_cls):
 8 |     # Ensure all model classes can be imported successfully
 9 |     ModelRegistry.load_model_cls(model_cls)
10 | 


--------------------------------------------------------------------------------
/tests/models/utils.py:
--------------------------------------------------------------------------------
 1 | def check_logprobs_close(outputs_0_lst, outputs_1_lst, name_0, name_1):
 2 |     """Compare the logprobs of two sequences generated by different models, 
 3 |     which should be similar but not necessarily equal.
 4 |     """
 5 |     # Loop through responses to each prompt.
 6 |     for prompt_idx, (outputs_0,
 7 |                      outputs_1) in enumerate(zip(outputs_0_lst,
 8 |                                                  outputs_1_lst)):
 9 |         output_ids_0, output_str_0, logprobs_0 = outputs_0
10 |         output_ids_1, output_str_1, logprobs_1 = outputs_1
11 | 
12 |         # Loop through generated tokens.
13 |         for idx, (output_id_0,
14 |                   output_id_1) in enumerate(zip(output_ids_0, output_ids_1)):
15 | 
16 |             # If generated tokens don't match, then
17 |             if output_id_0 != output_id_1:
18 |                 # Each predicted token must be in top N logprobs of the other
19 |                 assert output_id_0 in logprobs_1[idx], (
20 |                     f"Test{prompt_idx}:"
21 |                     f"\n{name_0}:\t{output_str_0!r}"
22 |                     f"\n{name_1}:\t{output_str_1!r}")
23 |                 assert output_id_1 in logprobs_0[idx], (
24 |                     f"Test{prompt_idx}:"
25 |                     f"\n{name_0}:\t{output_str_0!r}"
26 |                     f"\n{name_1}:\t{output_str_1!r}")
27 | 
28 |                 # Break out since sequences will now diverge.
29 |                 break
30 | 


--------------------------------------------------------------------------------
/tests/multimodal/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/multimodal/__init__.py


--------------------------------------------------------------------------------
/tests/prefix_caching/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/prefix_caching/__init__.py


--------------------------------------------------------------------------------
/tests/prefix_caching/test_disable_sliding_window.py:
--------------------------------------------------------------------------------
 1 | """Compare the with and without prefix caching.
 2 | 
 3 | Run `pytest tests/prefix_caching/test_prefix_caching.py`.
 4 | """
 5 | import pytest
 6 | 
 7 | from tests.conftest import cleanup
 8 | from vllm import LLM
 9 | 
10 | MODEL_LEN_LEN = [
11 |     # Example models with sliding window.
12 |     ("bigcode/starcoder2-3b", 4096, 16384),
13 |     # ("mistralai/Mistral-7B-v0.1", 4096, 32768), << OOM in CI
14 | 
15 |     # Confirm model with sliding window works.
16 |     # config has "use_sliding_window": false
17 |     ("Qwen/Qwen1.5-0.5B-Chat", 32768, 32768),
18 |     # config has no sliding window attribute.
19 |     ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", 2048, 2048),
20 | ]
21 | 
22 | 
23 | @pytest.mark.parametrize("model_len_len", MODEL_LEN_LEN)
24 | def test_disable_sliding_window(model_len_len, ):
25 |     model, sliding_len, full_len = model_len_len
26 |     vllm_disabled_model = LLM(model, disable_sliding_window=True)
27 |     vllm_disabled_model.generate("Hi my name is")
28 |     model_config = vllm_disabled_model.llm_engine.model_config
29 |     assert model_config.max_model_len == sliding_len, (
30 |         "Max len expected to equal sliding_len of %s, but got %s", sliding_len,
31 |         model_config.max_model_len)
32 | 
33 |     del vllm_disabled_model
34 |     cleanup()
35 | 
36 |     vllm_enabled_model = LLM(model, disable_sliding_window=False)
37 |     vllm_enabled_model.generate("Hi my name is")
38 |     model_config = vllm_enabled_model.llm_engine.model_config
39 |     assert model_config.max_model_len == full_len, (
40 |         "Max len expected to equal full_len of %s, but got %s", full_len,
41 |         model_config.max_model_len)
42 | 
43 |     del vllm_enabled_model
44 |     cleanup()
45 | 


--------------------------------------------------------------------------------
/tests/prompts/example.txt:
--------------------------------------------------------------------------------
1 | vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.
2 | Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.
3 | Compare and contrast artificial intelligence with human intelligence in terms of processing information.
4 | Describe the basic components of a neural network and how it can be trained.
5 | Write a short story about a robot that dreams for the first time.
6 | Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.
7 | Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.
8 | Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.'
9 | 


--------------------------------------------------------------------------------
/tests/quantization/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/quantization/__init__.py


--------------------------------------------------------------------------------
/tests/quantization/test_compressed_tensors.py:
--------------------------------------------------------------------------------
 1 | """Test model set-up and weight loading for sparseml-quantized models.
 2 | 
 3 | Run `pytest tests/quantization/test_compressed_tensors.py`.
 4 | """
 5 | 
 6 | import torch
 7 | 
 8 | from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
 9 |     CompressedTensorsLinearMethod, CompressedTensorsW8A8StaticTensor)
10 | 
11 | 
12 | def test_compressed_tensors_w8a8_static_setup(vllm_runner):
13 |     model_path = "nm-testing/tinyllama-one-shot-static-quant-test-compressed"
14 |     llm = vllm_runner(model_path, quantization="sparseml", enforce_eager=True)
15 |     model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
16 |     layer = model.model.layers[0]
17 | 
18 |     qkv_proj = layer.self_attn.qkv_proj
19 |     o_proj = layer.self_attn.o_proj
20 |     gate_up_proj = layer.mlp.gate_up_proj
21 |     down_proj = layer.mlp.down_proj
22 | 
23 |     assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
24 |     assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod)
25 |     assert isinstance(gate_up_proj.quant_method, CompressedTensorsLinearMethod)
26 |     assert isinstance(down_proj.quant_method, CompressedTensorsLinearMethod)
27 | 
28 |     assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8StaticTensor)
29 | 
30 |     assert qkv_proj.weight.dtype is torch.int8
31 |     assert o_proj.weight.dtype is torch.int8
32 |     assert gate_up_proj.weight.dtype is torch.int8
33 | 
34 |     assert qkv_proj.weight_scale.shard_splitter is not None
35 |     assert qkv_proj.weight_scale.logical_widths is not None
36 |     assert qkv_proj.input_scale.dtype is torch.float32
37 | 


--------------------------------------------------------------------------------
/tests/quantization/test_fp8.py:
--------------------------------------------------------------------------------
 1 | """Tests whether FP8 computation is enabled correctly.
 2 | 
 3 | Run `pytest tests/quantization/test_fp8.py --forked`.
 4 | """
 5 | import pytest
 6 | import torch
 7 | 
 8 | from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 9 | from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
10 | 
11 | capability = torch.cuda.get_device_capability()
12 | capability = capability[0] * 10 + capability[1]
13 | 
14 | 
15 | @pytest.mark.skipif(
16 |     capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
17 |     reason="FP8 is not supported on this GPU type.")
18 | def test_load_fp16_model(vllm_runner) -> None:
19 |     llm = vllm_runner("facebook/opt-125m", quantization="fp8")
20 | 
21 |     model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
22 |     fc1 = model.model.decoder.layers[0].fc1
23 |     assert isinstance(fc1.quant_method, Fp8LinearMethod)
24 |     assert fc1.weight.dtype == torch.float8_e4m3fn
25 | 


--------------------------------------------------------------------------------
/tests/samplers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/samplers/__init__.py


--------------------------------------------------------------------------------
/tests/samplers/test_beam_search.py:
--------------------------------------------------------------------------------
 1 | """Compare the outputs of HF and vLLM when using beam search.
 2 | 
 3 | Run `pytest tests/samplers/test_beam_search.py`.
 4 | """
 5 | import gc
 6 | 
 7 | import pytest
 8 | import torch
 9 | 
10 | # FIXME(zhuohan): The test can not pass if we:
11 | #   1. Increase max_tokens to 256.
12 | #   2. Increase beam_width to 8.
13 | #   3. Use the model "huggyllama/llama-7b".
14 | MAX_TOKENS = [128]
15 | BEAM_WIDTHS = [4]
16 | MODELS = ["facebook/opt-125m"]
17 | 
18 | 
19 | @pytest.mark.parametrize("model", MODELS)
20 | @pytest.mark.parametrize("dtype", ["half"])
21 | @pytest.mark.parametrize("max_tokens", MAX_TOKENS)
22 | @pytest.mark.parametrize("beam_width", BEAM_WIDTHS)
23 | def test_beam_search_single_input(
24 |     hf_runner,
25 |     vllm_runner,
26 |     example_prompts,
27 |     model: str,
28 |     dtype: str,
29 |     max_tokens: int,
30 |     beam_width: int,
31 | ) -> None:
32 |     example_prompts = example_prompts[:1]
33 |     hf_model = hf_runner(model, dtype=dtype)
34 |     hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
35 |                                                max_tokens)
36 |     del hf_model
37 | 
38 |     vllm_model = vllm_runner(model, dtype=dtype)
39 |     vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width,
40 |                                                    max_tokens)
41 |     del vllm_model
42 |     # NOTE(woosuk): For some reason, the following GC is required to avoid
43 |     # GPU OOM errors in the following tests using `vllm_runner`.
44 |     gc.collect()
45 |     torch.cuda.empty_cache()
46 | 
47 |     for i in range(len(example_prompts)):
48 |         hf_output_ids, _ = hf_outputs[i]
49 |         vllm_output_ids, _ = vllm_outputs[i]
50 |         assert len(hf_output_ids) == len(vllm_output_ids)
51 |         for j in range(len(hf_output_ids)):
52 |             assert hf_output_ids[j] == vllm_output_ids[j], (
53 |                 f"Test{i} output{j}:\nHF: {hf_output_ids}\n"
54 |                 f"vLLM: {vllm_output_ids}")
55 | 


--------------------------------------------------------------------------------
/tests/samplers/test_ignore_eos.py:
--------------------------------------------------------------------------------
 1 | """Make sure ignore_eos works.
 2 | 
 3 | Run `pytest tests/samplers/test_ignore_eos.py`.
 4 | """
 5 | 
 6 | import pytest
 7 | 
 8 | from vllm import SamplingParams
 9 | 
10 | # We also test with llama because it has generation_config to specify EOS
11 | # (past regression).
12 | MODELS = ["facebook/opt-125m", "meta-llama/Llama-2-7b-hf"]
13 | 
14 | 
15 | @pytest.mark.parametrize("model", MODELS)
16 | @pytest.mark.parametrize("dtype", ["half"])
17 | @pytest.mark.parametrize("max_tokens", [512])
18 | def test_ignore_eos(
19 |     vllm_runner,
20 |     example_prompts,
21 |     model: str,
22 |     dtype: str,
23 |     max_tokens: int,
24 | ) -> None:
25 |     vllm_model = vllm_runner(model, dtype=dtype)
26 |     sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True)
27 | 
28 |     for prompt in example_prompts:
29 |         ignore_eos_output = vllm_model.model.generate(
30 |             prompt, sampling_params=sampling_params)
31 |         output_length = len(ignore_eos_output[0].outputs[0].token_ids)
32 |         assert output_length == max_tokens
33 | 


--------------------------------------------------------------------------------
/tests/samplers/test_logits_processor.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import torch
 3 | 
 4 | from vllm import SamplingParams
 5 | 
 6 | MODELS = ["facebook/opt-125m"]
 7 | 
 8 | 
 9 | @pytest.mark.parametrize("model", MODELS)
10 | @pytest.mark.parametrize("dtype", ["half"])
11 | def test_logits_processor_force_generate(
12 |     vllm_runner,
13 |     example_prompts,
14 |     model: str,
15 |     dtype: str,
16 | ) -> None:
17 |     vllm_model = vllm_runner(model, dtype=dtype)
18 |     tokenizer = vllm_model.model.get_tokenizer()
19 |     repeat_times = 2
20 |     enforced_answers = " vLLM"
21 |     vllm_token_ids = tokenizer.encode(enforced_answers,
22 |                                       add_special_tokens=False)
23 |     max_tokens = len(vllm_token_ids) * repeat_times
24 | 
25 |     def pick_vllm(token_ids, logits):
26 |         token_id = vllm_token_ids[len(token_ids) % len(vllm_token_ids)]
27 |         logits[token_id] = torch.finfo(logits.dtype).max
28 |         return logits
29 | 
30 |     params_with_logprobs = SamplingParams(
31 |         logits_processors=[pick_vllm],
32 |         prompt_logprobs=3,
33 |         max_tokens=max_tokens,
34 |     )
35 | 
36 |     # test logits_processors when prompt_logprobs is not None
37 |     vllm_model.model._add_request(
38 |         example_prompts[0],
39 |         params=params_with_logprobs,
40 |     )
41 | 
42 |     # test prompt_logprobs is not None
43 |     vllm_model.model._add_request(
44 |         example_prompts[1],
45 |         params=SamplingParams(
46 |             prompt_logprobs=3,
47 |             max_tokens=max_tokens,
48 |         ),
49 |     )
50 | 
51 |     # test grouped requests
52 |     vllm_model.model._add_request(
53 |         example_prompts[2],
54 |         params=SamplingParams(max_tokens=max_tokens),
55 |     )
56 | 
57 |     outputs = vllm_model.model._run_engine(use_tqdm=False)
58 | 
59 |     assert outputs[0].outputs[0].text == enforced_answers * repeat_times
60 | 


--------------------------------------------------------------------------------
/tests/samplers/test_ranks.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from vllm import SamplingParams
 4 | 
 5 | MODELS = ["facebook/opt-125m"]
 6 | 
 7 | 
 8 | @pytest.mark.parametrize("model", MODELS)
 9 | @pytest.mark.parametrize("dtype", ["half"])
10 | def test_ranks(
11 |     vllm_runner,
12 |     model,
13 |     dtype,
14 |     example_prompts,
15 | ):
16 |     max_tokens = 5
17 |     num_top_logprobs = 5
18 |     num_prompt_logprobs = 5
19 | 
20 |     vllm_model = vllm_runner(model, dtype=dtype, max_logprobs=num_top_logprobs)
21 | 
22 |     ## Test greedy logprobs ranks
23 |     vllm_sampling_params = SamplingParams(temperature=0.0,
24 |                                           top_p=1.0,
25 |                                           max_tokens=max_tokens,
26 |                                           logprobs=num_top_logprobs,
27 |                                           prompt_logprobs=num_prompt_logprobs)
28 |     vllm_results = vllm_model.generate_w_logprobs(example_prompts,
29 |                                                   vllm_sampling_params)
30 |     for result in vllm_results:
31 |         assert result[2] is not None
32 |         assert len(result[2]) == len(result[0])
33 |         # check whether all chosen tokens have ranks = 1
34 |         for token, logprobs in zip(result[0], result[2]):
35 |             assert token in logprobs
36 |             assert logprobs[token].rank == 1
37 | 
38 |     ## Test non-greedy logprobs ranks
39 |     sampling_params = SamplingParams(temperature=1.0,
40 |                                      top_p=1.0,
41 |                                      max_tokens=max_tokens,
42 |                                      logprobs=num_top_logprobs,
43 |                                      prompt_logprobs=num_prompt_logprobs)
44 |     res = vllm_model.generate_w_logprobs(example_prompts, sampling_params)
45 |     for result in res:
46 |         assert result[2] is not None
47 |         assert len(result[2]) == len(result[0])
48 |         # check whether all chosen tokens have ranks
49 |         for token, logprobs in zip(result[0], result[2]):
50 |             assert logprobs[token].rank >= 1
51 | 


--------------------------------------------------------------------------------
/tests/spec_decode/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/spec_decode/__init__.py


--------------------------------------------------------------------------------
/tests/spec_decode/e2e/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/spec_decode/e2e/__init__.py


--------------------------------------------------------------------------------
/tests/spec_decode/e2e/test_integration.py:
--------------------------------------------------------------------------------
 1 | """Tests which cover integration of the speculative decoding framework with
 2 | other features, e.g. cuda graphs.
 3 | """
 4 | 
 5 | import pytest
 6 | 
 7 | from .conftest import run_greedy_equality_correctness_test
 8 | 
 9 | 
10 | @pytest.mark.parametrize(
11 |     "common_llm_kwargs",
12 |     [{
13 |         # Required for spec decode.
14 |         "use_v2_block_manager": True,
15 | 
16 |         # Verify equality when cuda graphs allowed.
17 |         "enforce_eager": False,
18 |         "model": "JackFram/llama-68m",
19 |     }])
20 | @pytest.mark.parametrize(
21 |     "per_test_common_llm_kwargs",
22 |     [
23 |         {
24 |             # Identical models.
25 |             "speculative_model": "JackFram/llama-68m",
26 |             "num_speculative_tokens": 5,
27 |         },
28 |     ])
29 | @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
30 | @pytest.mark.parametrize("test_llm_kwargs", [{}])
31 | @pytest.mark.parametrize("batch_size", [8])
32 | @pytest.mark.parametrize("output_len", [32])
33 | @pytest.mark.parametrize("seed", [1])
34 | def test_spec_decode_cuda_graph(baseline_llm_generator, test_llm_generator,
35 |                                 batch_size, output_len):
36 |     """Verify spec decode equality when cuda graphs are enabled.
37 |     """
38 |     run_greedy_equality_correctness_test(
39 |         baseline_llm_generator,
40 |         test_llm_generator,
41 |         batch_size,
42 |         max_output_len=output_len,
43 |         force_output_len=True,
44 |     )
45 | 


--------------------------------------------------------------------------------
/tests/tensorizer_loader/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/tensorizer_loader/__init__.py


--------------------------------------------------------------------------------
/tests/test_inputs.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | 
 3 | import pytest
 4 | 
 5 | from vllm.inputs import parse_and_batch_prompt
 6 | 
 7 | STRING_INPUTS = [
 8 |     '',
 9 |     'foo',
10 |     'foo bar',
11 |     'foo baz bar',
12 |     'foo bar qux baz',
13 | ]
14 | 
15 | TOKEN_INPUTS = [
16 |     [-1],
17 |     [1],
18 |     [1, 2],
19 |     [1, 3, 4],
20 |     [1, 2, 4, 3],
21 | ]
22 | 
23 | INPUTS_SLICES = [
24 |     slice(None, None, -1),
25 |     slice(None, None, 2),
26 |     slice(None, None, -2),
27 | ]
28 | 
29 | 
30 | def test_parse_single_batch_empty():
31 |     with pytest.raises(ValueError, match="at least one prompt"):
32 |         parse_and_batch_prompt([])
33 | 
34 |     with pytest.raises(ValueError, match="at least one prompt"):
35 |         parse_and_batch_prompt([[]])
36 | 
37 | 
38 | @pytest.mark.parametrize('string_input', STRING_INPUTS)
39 | def test_parse_single_batch_string_consistent(string_input: str):
40 |     assert parse_and_batch_prompt(string_input) \
41 |         == parse_and_batch_prompt([string_input])
42 | 
43 | 
44 | @pytest.mark.parametrize('token_input', TOKEN_INPUTS)
45 | def test_parse_single_batch_token_consistent(token_input: List[int]):
46 |     assert parse_and_batch_prompt(token_input) \
47 |         == parse_and_batch_prompt([token_input])
48 | 
49 | 
50 | @pytest.mark.parametrize('inputs_slice', INPUTS_SLICES)
51 | def test_parse_single_batch_string_slice(inputs_slice: slice):
52 |     assert parse_and_batch_prompt(STRING_INPUTS)[inputs_slice] \
53 |         == parse_and_batch_prompt(STRING_INPUTS[inputs_slice])
54 | 


--------------------------------------------------------------------------------
/tests/test_regression.py:
--------------------------------------------------------------------------------
 1 | """Containing tests that check for regressions in vLLM's behavior.
 2 | 
 3 | It should include tests that are reported by users and making sure they
 4 | will never happen again.
 5 | 
 6 | """
 7 | import gc
 8 | 
 9 | import torch
10 | 
11 | from vllm import LLM, SamplingParams
12 | 
13 | 
14 | def test_duplicated_ignored_sequence_group():
15 |     """https://github.com/vllm-project/vllm/issues/1655"""
16 | 
17 |     sampling_params = SamplingParams(temperature=0.01,
18 |                                      top_p=0.1,
19 |                                      max_tokens=256)
20 |     llm = LLM(model="facebook/opt-125m",
21 |               max_num_batched_tokens=4096,
22 |               tensor_parallel_size=1)
23 |     prompts = ["This is a short prompt", "This is a very long prompt " * 1000]
24 |     outputs = llm.generate(prompts, sampling_params=sampling_params)
25 | 
26 |     assert len(prompts) == len(outputs)
27 | 
28 | 
29 | def test_max_tokens_none():
30 |     sampling_params = SamplingParams(temperature=0.01,
31 |                                      top_p=0.1,
32 |                                      max_tokens=None)
33 |     llm = LLM(model="facebook/opt-125m",
34 |               max_num_batched_tokens=4096,
35 |               tensor_parallel_size=1)
36 |     prompts = ["Just say hello!"]
37 |     outputs = llm.generate(prompts, sampling_params=sampling_params)
38 | 
39 |     assert len(prompts) == len(outputs)
40 | 
41 | 
42 | def test_gc():
43 |     llm = LLM("facebook/opt-125m", enforce_eager=True)
44 |     del llm
45 | 
46 |     gc.collect()
47 |     torch.cuda.empty_cache()
48 | 
49 |     # The memory allocated for model and KV cache should be released.
50 |     # The memory allocated for PyTorch and others should be less than 50MB.
51 |     # Usually, it's around 10MB.
52 |     allocated = torch.cuda.memory_allocated()
53 |     assert allocated < 50 * 1024 * 1024
54 | 
55 | 
56 | if __name__ == "__main__":
57 |     import pytest
58 |     pytest.main([__file__])
59 | 


--------------------------------------------------------------------------------
/tests/test_sampling_params.py:
--------------------------------------------------------------------------------
 1 | """Tests for the SamplingParams class.
 2 | """
 3 | from vllm import SamplingParams
 4 | 
 5 | 
 6 | def test_max_tokens_none():
 7 |     """max_tokens=None should be allowed"""
 8 |     SamplingParams(temperature=0.01, top_p=0.1, max_tokens=None)
 9 | 
10 | 
11 | if __name__ == "__main__":
12 |     import pytest
13 |     pytest.main([__file__])
14 | 


--------------------------------------------------------------------------------
/tests/tokenization/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/tokenization/__init__.py


--------------------------------------------------------------------------------
/tests/tokenization/test_cached_tokenizer.py:
--------------------------------------------------------------------------------
 1 | from copy import deepcopy
 2 | 
 3 | from transformers import AutoTokenizer
 4 | 
 5 | from vllm.transformers_utils.tokenizer import get_cached_tokenizer
 6 | 
 7 | 
 8 | def test_cached_tokenizer():
 9 |     reference_tokenizer = AutoTokenizer.from_pretrained("gpt2")
10 |     reference_tokenizer.add_special_tokens({"cls_token": "<CLS>"})
11 |     reference_tokenizer.add_special_tokens(
12 |         {"additional_special_tokens": ["<SEP>"]})
13 |     cached_tokenizer = get_cached_tokenizer(deepcopy(reference_tokenizer))
14 | 
15 |     assert reference_tokenizer.encode("prompt") == cached_tokenizer.encode(
16 |         "prompt")
17 |     assert set(reference_tokenizer.all_special_ids) == set(
18 |         cached_tokenizer.all_special_ids)
19 |     assert set(reference_tokenizer.all_special_tokens) == set(
20 |         cached_tokenizer.all_special_tokens)
21 |     assert set(reference_tokenizer.all_special_tokens_extended) == set(
22 |         cached_tokenizer.all_special_tokens_extended)
23 | 


--------------------------------------------------------------------------------
/tests/tokenization/test_image_processor.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from transformers.image_processing_utils import BaseImageProcessor
 3 | 
 4 | from vllm.transformers_utils.image_processor import get_image_processor
 5 | 
 6 | IMAGE_PROCESSOR_NAMES = [
 7 |     "llava-hf/llava-1.5-7b-hf",
 8 |     "llava-hf/llava-v1.6-34b-hf",
 9 | ]
10 | 
11 | 
12 | @pytest.mark.parametrize("processor_name", IMAGE_PROCESSOR_NAMES)
13 | def test_image_processor_revision(processor_name: str):
14 |     # Assume that "main" branch always exists
15 |     image_processor = get_image_processor(processor_name, revision="main")
16 |     assert isinstance(image_processor, BaseImageProcessor)
17 | 
18 |     # Assume that "never" branch always does not exist
19 |     with pytest.raises(OSError, match='not a valid git identifier'):
20 |         get_image_processor(processor_name, revision="never")
21 | 


--------------------------------------------------------------------------------
/tests/tokenization/test_tokenizer.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from transformers import PreTrainedTokenizerBase
 3 | 
 4 | from vllm.transformers_utils.tokenizer import get_tokenizer
 5 | 
 6 | TOKENIZER_NAMES = [
 7 |     "facebook/opt-125m",
 8 |     "gpt2",
 9 | ]
10 | 
11 | 
12 | @pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES)
13 | def test_tokenizer_revision(tokenizer_name: str):
14 |     # Assume that "main" branch always exists
15 |     tokenizer = get_tokenizer(tokenizer_name, revision="main")
16 |     assert isinstance(tokenizer, PreTrainedTokenizerBase)
17 | 
18 |     # Assume that "never" branch always does not exist
19 |     with pytest.raises(OSError, match='not a valid git identifier'):
20 |         get_tokenizer(tokenizer_name, revision="never")
21 | 


--------------------------------------------------------------------------------
/tests/worker/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/tests/worker/__init__.py


--------------------------------------------------------------------------------
/vllm/__init__.py:
--------------------------------------------------------------------------------
 1 | """vLLM: a high-throughput and memory-efficient inference engine for LLMs"""
 2 | 
 3 | from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 4 | from vllm.engine.async_llm_engine import AsyncLLMEngine
 5 | from vllm.engine.llm_engine import LLMEngine
 6 | from vllm.entrypoints.llm import LLM
 7 | from vllm.executor.ray_utils import initialize_ray_cluster
 8 | from vllm.inputs import PromptStrictInputs, TextPrompt, TokensPrompt
 9 | from vllm.model_executor.models import ModelRegistry
10 | from vllm.outputs import (CompletionOutput, EmbeddingOutput,
11 |                           EmbeddingRequestOutput, RequestOutput)
12 | from vllm.pooling_params import PoolingParams
13 | from vllm.sampling_params import SamplingParams
14 | 
15 | __version__ = "0.4.3"
16 | 
17 | __all__ = [
18 |     "LLM",
19 |     "ModelRegistry",
20 |     "PromptStrictInputs",
21 |     "TextPrompt",
22 |     "TokensPrompt",
23 |     "SamplingParams",
24 |     "RequestOutput",
25 |     "CompletionOutput",
26 |     "EmbeddingOutput",
27 |     "EmbeddingRequestOutput",
28 |     "LLMEngine",
29 |     "EngineArgs",
30 |     "AsyncLLMEngine",
31 |     "AsyncEngineArgs",
32 |     "initialize_ray_cluster",
33 |     "PoolingParams",
34 | ]
35 | 


--------------------------------------------------------------------------------
/vllm/attention/__init__.py:
--------------------------------------------------------------------------------
 1 | from vllm.attention.backends.abstract import (AttentionBackend,
 2 |                                               AttentionMetadata)
 3 | from vllm.attention.layer import Attention
 4 | from vllm.attention.selector import get_attn_backend
 5 | 
 6 | __all__ = [
 7 |     "Attention",
 8 |     "AttentionBackend",
 9 |     "AttentionMetadata",
10 |     "Attention",
11 |     "get_attn_backend",
12 | ]
13 | 


--------------------------------------------------------------------------------
/vllm/attention/backends/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/attention/backends/__init__.py


--------------------------------------------------------------------------------
/vllm/attention/ops/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/attention/ops/__init__.py


--------------------------------------------------------------------------------
/vllm/attention/ops/blocksparse_attention/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/attention/ops/blocksparse_attention/__init__.py


--------------------------------------------------------------------------------
/vllm/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/core/__init__.py


--------------------------------------------------------------------------------
/vllm/core/block/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/core/block/__init__.py


--------------------------------------------------------------------------------
/vllm/core/block/utils.py:
--------------------------------------------------------------------------------
 1 | """Block manager utils."""
 2 | from vllm.sequence import SequenceGroup
 3 | 
 4 | # Exception strings for non-implemented block manager enc/dec scenarios
 5 | 
 6 | STR_NOT_IMPL_ENC_DEC_SWA = \
 7 |     "Sliding window attention for encoder/decoder models " + \
 8 |                     "is not currently supported."
 9 | 
10 | STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE = \
11 |     "Prefix caching for encoder/decoder models " + \
12 |                     "is not currently supported."
13 | 
14 | 
15 | def _get_block_mgr_sliding_window_attr(block_mgr):
16 |     '''
17 |     BlockManagerV1 and BlockManagerV2 have slightly different
18 |     members related to sliding window attention (SWA). This
19 |     function extracts the appropriate member to use for determining
20 |     whether SWA is enabled.
21 | 
22 |     Arguments:
23 | 
24 |     * block_mgr: BlockManagerV1 or BlockManagerV2 instance
25 |     '''
26 | 
27 |     if hasattr(block_mgr, 'block_sliding_window'):
28 |         return block_mgr.block_sliding_window
29 |     if hasattr(block_mgr, 'max_block_sliding_window'):
30 |         return block_mgr.max_block_sliding_window
31 | 
32 |     raise AttributeError("Block manager instance has neither " + \
33 |                          "block_sliding_window nor " + \
34 |                          "max_block_sliding_window attributes.")
35 | 
36 | 
37 | def check_no_caching_or_swa_for_blockmgr_encdec(
38 |         block_mgr, seq_group: SequenceGroup) -> None:
39 |     '''
40 |     Enforce that prefix caching & sliding-window attention (SWA)
41 |     are currently unsupported *specifically* for encoder/decoder models.
42 | 
43 |     Raises NotImplementedError if unsupported scenario is detected.
44 | 
45 |     Arguments:
46 | 
47 |     * block_mgr: BlockSpaceManager instance
48 |     * seq_group: SequenceGroup passed to block_mgr
49 |     '''
50 | 
51 |     if seq_group.is_encoder_decoder():
52 |         if _get_block_mgr_sliding_window_attr(block_mgr) is not None:
53 |             raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA)
54 | 
55 |         if block_mgr.enable_caching:
56 |             raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE)
57 | 


--------------------------------------------------------------------------------
/vllm/core/policy.py:
--------------------------------------------------------------------------------
 1 | from collections import deque
 2 | from typing import Deque
 3 | 
 4 | from vllm.sequence import SequenceGroup
 5 | 
 6 | 
 7 | class Policy:
 8 | 
 9 |     def get_priority(
10 |         self,
11 |         now: float,
12 |         seq_group: SequenceGroup,
13 |     ) -> float:
14 |         raise NotImplementedError
15 | 
16 |     def sort_by_priority(
17 |         self,
18 |         now: float,
19 |         seq_groups: Deque[SequenceGroup],
20 |     ) -> Deque[SequenceGroup]:
21 |         return deque(
22 |             sorted(
23 |                 seq_groups,
24 |                 key=lambda seq_group: self.get_priority(now, seq_group),
25 |                 reverse=True,
26 |             ))
27 | 
28 | 
29 | class FCFS(Policy):
30 | 
31 |     def get_priority(
32 |         self,
33 |         now: float,
34 |         seq_group: SequenceGroup,
35 |     ) -> float:
36 |         return now - seq_group.metrics.arrival_time
37 | 
38 | 
39 | class PolicyFactory:
40 | 
41 |     _POLICY_REGISTRY = {'fcfs': FCFS}
42 | 
43 |     @classmethod
44 |     def get_policy(cls, policy_name: str, **kwargs) -> Policy:
45 |         return cls._POLICY_REGISTRY[policy_name](**kwargs)
46 | 


--------------------------------------------------------------------------------
/vllm/distributed/__init__.py:
--------------------------------------------------------------------------------
1 | from .communication_op import *
2 | from .parallel_state import *
3 | from .utils import *
4 | 


--------------------------------------------------------------------------------
/vllm/distributed/device_communicators/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/distributed/device_communicators/__init__.py


--------------------------------------------------------------------------------
/vllm/distributed/utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 The vLLM team.
 2 | # Adapted from
 3 | # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py
 4 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 5 | from typing import Sequence
 6 | 
 7 | import torch
 8 | 
 9 | 
10 | def ensure_divisibility(numerator, denominator):
11 |     """Ensure that numerator is divisible by the denominator."""
12 |     assert numerator % denominator == 0, "{} is not divisible by {}".format(
13 |         numerator, denominator)
14 | 
15 | 
16 | def divide(numerator, denominator):
17 |     """Ensure that numerator is divisible by the denominator and return
18 |     the division value."""
19 |     ensure_divisibility(numerator, denominator)
20 |     return numerator // denominator
21 | 
22 | 
23 | def split_tensor_along_last_dim(
24 |     tensor: torch.Tensor,
25 |     num_partitions: int,
26 |     contiguous_split_chunks: bool = False,
27 | ) -> Sequence[torch.Tensor]:
28 |     """ Split a tensor along its last dimension.
29 | 
30 |         Arguments:
31 |             tensor: input tensor.
32 |             num_partitions: number of partitions to split the tensor
33 |             contiguous_split_chunks: If True, make each chunk contiguous
34 |                                      in memory.
35 | 
36 |         Returns:
37 |             A list of Tensors
38 |     """
39 |     # Get the size and dimension.
40 |     last_dim = tensor.dim() - 1
41 |     last_dim_size = divide(tensor.size()[last_dim], num_partitions)
42 |     # Split.
43 |     tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
44 |     # NOTE: torch.split does not create contiguous tensors by default.
45 |     if contiguous_split_chunks:
46 |         return tuple(chunk.contiguous() for chunk in tensor_list)
47 | 
48 |     return tensor_list
49 | 


--------------------------------------------------------------------------------
/vllm/engine/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/engine/__init__.py


--------------------------------------------------------------------------------
/vllm/engine/output_processor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/engine/output_processor/__init__.py


--------------------------------------------------------------------------------
/vllm/engine/output_processor/util.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | from typing import Sequence as GenericSequence
 3 | from typing import Union
 4 | 
 5 | from vllm.sequence import PoolerOutput, SamplerOutput, SequenceGroupOutput
 6 | 
 7 | 
 8 | def create_output_by_sequence_group(
 9 |         outputs: GenericSequence[Union[SamplerOutput, PoolerOutput]],
10 |         num_seq_groups: int) -> List[List[SequenceGroupOutput]]:
11 |     """Helper method which transforms a 2d list organized by
12 |     [step][sequence group] into [sequence group][step].
13 |     """
14 |     output_by_sequence_group: List[List[SequenceGroupOutput]] = [
15 |         [] for _ in range(num_seq_groups)
16 |     ]
17 |     for step in outputs:
18 |         for i, sequence_group_output in enumerate(step):
19 |             output_by_sequence_group[i].append(sequence_group_output)
20 | 
21 |     return output_by_sequence_group
22 | 


--------------------------------------------------------------------------------
/vllm/entrypoints/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/entrypoints/__init__.py


--------------------------------------------------------------------------------
/vllm/entrypoints/openai/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/entrypoints/openai/__init__.py


--------------------------------------------------------------------------------
/vllm/executor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/executor/__init__.py


--------------------------------------------------------------------------------
/vllm/logging/__init__.py:
--------------------------------------------------------------------------------
1 | from vllm.logging.formatter import NewLineFormatter
2 | 
3 | __all__ = [
4 |     "NewLineFormatter",
5 | ]
6 | 


--------------------------------------------------------------------------------
/vllm/logging/formatter.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | 
 4 | class NewLineFormatter(logging.Formatter):
 5 |     """Adds logging prefix to newlines to align multi-line messages."""
 6 | 
 7 |     def __init__(self, fmt, datefmt=None, style="%"):
 8 |         logging.Formatter.__init__(self, fmt, datefmt, style)
 9 | 
10 |     def format(self, record):
11 |         msg = logging.Formatter.format(self, record)
12 |         if record.message != "":
13 |             parts = msg.split(record.message)
14 |             msg = msg.replace("\n", "\r\n" + parts[0])
15 |         return msg
16 | 


--------------------------------------------------------------------------------
/vllm/lora/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/lora/__init__.py


--------------------------------------------------------------------------------
/vllm/lora/request.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | from typing import Optional
 3 | 
 4 | 
 5 | @dataclass
 6 | class LoRARequest:
 7 |     """
 8 |     Request for a LoRA adapter.
 9 | 
10 |     Note that this class should be be used internally. For online
11 |     serving, it is recommended to not allow users to use this class but
12 |     instead provide another layer of abstraction to prevent users from
13 |     accessing unauthorized LoRA adapters.
14 | 
15 |     lora_int_id must be globally unique for a given adapter.
16 |     This is currently not enforced in vLLM.
17 |     """
18 | 
19 |     lora_name: str
20 |     lora_int_id: int
21 |     lora_local_path: str
22 |     long_lora_max_len: Optional[int] = None
23 | 
24 |     def __post_init__(self):
25 |         if self.lora_int_id < 1:
26 |             raise ValueError(
27 |                 f"lora_int_id must be > 0, got {self.lora_int_id}")
28 | 
29 |     def __eq__(self, value: object) -> bool:
30 |         return isinstance(
31 |             value, LoRARequest) and self.lora_int_id == value.lora_int_id
32 | 
33 |     def __hash__(self) -> int:
34 |         return self.lora_int_id
35 | 


--------------------------------------------------------------------------------
/vllm/model_executor/__init__.py:
--------------------------------------------------------------------------------
1 | from vllm.model_executor.sampling_metadata import SamplingMetadata
2 | from vllm.model_executor.utils import set_random_seed
3 | 
4 | __all__ = [
5 |     "SamplingMetadata",
6 |     "set_random_seed",
7 | ]
8 | 


--------------------------------------------------------------------------------
/vllm/model_executor/guided_decoding/__init__.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional, Union
 2 | 
 3 | from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
 4 |                                               CompletionRequest)
 5 | from vllm.model_executor.guided_decoding.lm_format_enforcer_decoding import (
 6 |     get_lm_format_enforcer_guided_decoding_logits_processor)
 7 | from vllm.model_executor.guided_decoding.outlines_decoding import (
 8 |     get_outlines_guided_decoding_logits_processor)
 9 | from vllm.sampling_params import LogitsProcessor
10 | 
11 | 
12 | async def get_guided_decoding_logits_processor(
13 |         guided_decoding_backend: str, request: Union[CompletionRequest,
14 |                                                      ChatCompletionRequest],
15 |         tokenizer) -> Optional[LogitsProcessor]:
16 |     if guided_decoding_backend == 'outlines':
17 |         return await get_outlines_guided_decoding_logits_processor(
18 |             request, tokenizer)
19 |     if guided_decoding_backend == 'lm-format-enforcer':
20 |         return await get_lm_format_enforcer_guided_decoding_logits_processor(
21 |             request, tokenizer)
22 | 
23 |     raise ValueError(
24 |         f"Unknown guided decoding backend '{guided_decoding_backend}'. "
25 |         "Must be one of 'outlines, 'lm-format-enforcer'")
26 | 


--------------------------------------------------------------------------------
/vllm/model_executor/layers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/model_executor/layers/__init__.py


--------------------------------------------------------------------------------
/vllm/model_executor/layers/fused_moe/__init__.py:
--------------------------------------------------------------------------------
 1 | from vllm.model_executor.layers.fused_moe.fused_moe import (
 2 |     fused_experts, fused_moe, fused_topk, get_config_file_name)
 3 | 
 4 | __all__ = [
 5 |     "fused_moe",
 6 |     "fused_topk",
 7 |     "fused_experts",
 8 |     "get_config_file_name",
 9 | ]
10 | 


--------------------------------------------------------------------------------
/vllm/model_executor/layers/fused_moe/configs/README:
--------------------------------------------------------------------------------
 1 | This directory contains tuned configurations for different settings of the fused_moe kernel.
 2 | For different settings of
 3 | - E (number of experts)
 4 | - N (intermediate size)
 5 | - device_name (torch.cuda.get_device_name())
 6 | the JSON file contains a mapping from M (batch size) to the chosen configuration.
 7 | 
 8 | The example configurations provided are for the Mixtral model for TP2 on H100
 9 | and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have
10 | N = 7168 and for TP4 we have N = 3584.
11 | 


--------------------------------------------------------------------------------
/vllm/model_executor/layers/ops/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/model_executor/layers/ops/__init__.py


--------------------------------------------------------------------------------
/vllm/model_executor/layers/pooler.py:
--------------------------------------------------------------------------------
 1 | from enum import IntEnum
 2 | 
 3 | import torch
 4 | import torch.nn as nn
 5 | 
 6 | from vllm.model_executor.pooling_metadata import (PoolingMetadata,
 7 |                                                   PoolingTensors)
 8 | from vllm.sequence import EmbeddingSequenceGroupOutput, PoolerOutput
 9 | 
10 | 
11 | class PoolingType(IntEnum):
12 |     """Enumeration for different types of pooling methods."""
13 |     LAST = 0
14 | 
15 | 
16 | class Pooler(nn.Module):
17 |     """A layer that pools specific information from hidden states.
18 | 
19 |     This layer does the following:
20 |     1. Extracts specific tokens or aggregates data based on pooling method.
21 |     2. Normalizes output if specified.
22 |     3. Returns structured results as `PoolerOutput`.
23 | 
24 |     Attributes:
25 |         pooling_type: The type of pooling to use (LAST, AVERAGE, MAX).
26 |         normalize: Whether to normalize the pooled data.
27 |     """
28 | 
29 |     def __init__(self, pooling_type: PoolingType, normalize: bool):
30 |         super().__init__()
31 |         self.pooling_type = pooling_type
32 |         self.normalize = normalize
33 | 
34 |     def forward(
35 |         self,
36 |         hidden_states: torch.Tensor,
37 |         pooling_metadata: PoolingMetadata,
38 |     ) -> PoolerOutput:
39 |         """Pools specific information from hidden states based on metadata."""
40 |         prompt_lens = PoolingTensors.from_pooling_metadata(
41 |             pooling_metadata, hidden_states.device).prompt_lens
42 | 
43 |         if self.pooling_type == PoolingType.LAST:
44 |             last_token_flat_indices = torch.cumsum(prompt_lens, dim=0) - 1
45 |             pooled_data = hidden_states[last_token_flat_indices]
46 |         else:
47 |             raise ValueError(f"Invalid pooling type: {self.pooling_type}")
48 | 
49 |         if self.normalize:
50 |             pooled_data = nn.functional.normalize(pooled_data, p=2, dim=1)
51 | 
52 |         pooled_outputs = [
53 |             EmbeddingSequenceGroupOutput(data.tolist()) for data in pooled_data
54 |         ]
55 | 
56 |         return PoolerOutput(outputs=pooled_outputs)
57 | 


--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/__init__.py:
--------------------------------------------------------------------------------
 1 | from typing import Dict, Type
 2 | 
 3 | from vllm.model_executor.layers.quantization.aqlm import AQLMConfig
 4 | from vllm.model_executor.layers.quantization.awq import AWQConfig
 5 | from vllm.model_executor.layers.quantization.base_config import (
 6 |     QuantizationConfig)
 7 | from vllm.model_executor.layers.quantization.bitsandbytes import (
 8 |     BitsAndBytesConfig)
 9 | from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
10 |     CompressedTensorsConfig)
11 | from vllm.model_executor.layers.quantization.deepspeedfp import (
12 |     DeepSpeedFPConfig)
13 | from vllm.model_executor.layers.quantization.fp8 import Fp8Config
14 | from vllm.model_executor.layers.quantization.gptq import GPTQConfig
15 | from vllm.model_executor.layers.quantization.gptq_marlin import (
16 |     GPTQMarlinConfig)
17 | from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
18 |     GPTQMarlin24Config)
19 | from vllm.model_executor.layers.quantization.marlin import MarlinConfig
20 | from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
21 | 
22 | QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
23 |     "aqlm": AQLMConfig,
24 |     "awq": AWQConfig,
25 |     "deepspeedfp": DeepSpeedFPConfig,
26 |     "fp8": Fp8Config,
27 |     # The order of gptq methods is important for config.py iteration over
28 |     # override_quantization_method(..)
29 |     "marlin": MarlinConfig,
30 |     "gptq_marlin_24": GPTQMarlin24Config,
31 |     "gptq_marlin": GPTQMarlinConfig,
32 |     "gptq": GPTQConfig,
33 |     "squeezellm": SqueezeLLMConfig,
34 |     "sparseml": CompressedTensorsConfig,
35 |     "bitsandbytes": BitsAndBytesConfig,
36 | }
37 | 
38 | 
39 | def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
40 |     if quantization not in QUANTIZATION_METHODS:
41 |         raise ValueError(f"Invalid quantization method: {quantization}")
42 |     return QUANTIZATION_METHODS[quantization]
43 | 
44 | 
45 | __all__ = [
46 |     "QuantizationConfig",
47 |     "get_quantization_config",
48 |     "QUANTIZATION_METHODS",
49 | ]
50 | 


--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py


--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py:
--------------------------------------------------------------------------------
1 | from .compressed_tensors_scheme import CompressedTensorsScheme  # noqa: F401
2 | from .compressed_tensors_unquantized import (  # noqa: F401
3 |     CompressedTensorsUnquantized)
4 | from .compressed_tensors_w8a8_statictensor import (  # noqa: F401, E501
5 |     CompressedTensorsW8A8StaticTensor)
6 | 


--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC, abstractmethod
 2 | 
 3 | import torch
 4 | 
 5 | __all__ = ["CompressedTensorsScheme"]
 6 | 
 7 | 
 8 | class CompressedTensorsScheme(ABC):
 9 |     """
10 |     Abstract class used to describe the weight creation and forward pass 
11 |     of different quantization schemes supported by CompressedTensors.
12 |     """
13 | 
14 |     @abstractmethod
15 |     def create_weights(self, *args, **kwargs):
16 |         """
17 |         Weight creation for the particular scheme. Inputs to this function 
18 | 
19 |         """
20 |         raise NotImplementedError
21 | 
22 |     @abstractmethod
23 |     def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
24 |         """
25 |         Run the forward pass for the particular scheme. This is where 
26 |         scheme-specific dequant/quant steps/kernels should be applied.
27 | 
28 |         :param layer: toch.nn.Module with the registered weights and 
29 |             other parameters relevant to the particular scheme. 
30 |         :param x: input to the layer
31 | 
32 |         """
33 |         raise NotImplementedError
34 | 


--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py:
--------------------------------------------------------------------------------
 1 | from typing import Callable, List
 2 | 
 3 | import torch
 4 | import torch.nn.functional as F
 5 | from torch.nn import Parameter
 6 | 
 7 | from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
 8 |     CompressedTensorsScheme)
 9 | from vllm.model_executor.utils import set_weight_attrs
10 | 
11 | __all__ = ["CompressedTensorsUnquantized"]
12 | 
13 | 
14 | class CompressedTensorsUnquantized(CompressedTensorsScheme):
15 |     """
16 |     Implements the scheme for all layers which are ignored 
17 |     in the CompressedTensors config. The input and loaded weight are used 
18 |     in a linear transformation.
19 |     """
20 | 
21 |     def create_weights(self, layer: torch.nn.Module,
22 |                        output_partition_sizes: List[int],
23 |                        input_size_per_partition: int,
24 |                        params_dtype: torch.dtype, weight_loader: Callable,
25 |                        **kwargs):
26 | 
27 |         weight = Parameter(torch.empty(sum(output_partition_sizes),
28 |                                        input_size_per_partition,
29 |                                        device="cuda",
30 |                                        dtype=params_dtype),
31 |                            requires_grad=False)
32 | 
33 |         set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
34 |         layer.register_parameter("weight", weight)
35 |         set_weight_attrs(weight, {"weight_loader": weight_loader})
36 | 
37 |     def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
38 |         weight = layer.weight
39 |         return F.linear(x, weight)
40 | 


--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/model_executor/layers/quantization/utils/__init__.py


--------------------------------------------------------------------------------
/vllm/model_executor/model_loader/__init__.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional
 2 | 
 3 | from torch import nn
 4 | 
 5 | from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
 6 |                          ModelConfig, ParallelConfig, SchedulerConfig,
 7 |                          VisionLanguageConfig)
 8 | from vllm.model_executor.model_loader.loader import (BaseModelLoader,
 9 |                                                      get_model_loader)
10 | from vllm.model_executor.model_loader.utils import (
11 |     get_architecture_class_name, get_model_architecture)
12 | 
13 | 
14 | def get_model(*, model_config: ModelConfig, load_config: LoadConfig,
15 |               device_config: DeviceConfig, parallel_config: ParallelConfig,
16 |               scheduler_config: SchedulerConfig,
17 |               lora_config: Optional[LoRAConfig],
18 |               vision_language_config: Optional[VisionLanguageConfig],
19 |               cache_config: CacheConfig) -> nn.Module:
20 |     loader = get_model_loader(load_config)
21 |     return loader.load_model(model_config=model_config,
22 |                              device_config=device_config,
23 |                              lora_config=lora_config,
24 |                              vision_language_config=vision_language_config,
25 |                              parallel_config=parallel_config,
26 |                              scheduler_config=scheduler_config,
27 |                              cache_config=cache_config)
28 | 
29 | 
30 | __all__ = [
31 |     "get_model", "get_model_loader", "BaseModelLoader",
32 |     "get_architecture_class_name", "get_model_architecture"
33 | ]
34 | 


--------------------------------------------------------------------------------
/vllm/model_executor/model_loader/utils.py:
--------------------------------------------------------------------------------
 1 | """Utilities for selecting and loading models."""
 2 | import contextlib
 3 | from typing import Tuple, Type
 4 | 
 5 | import torch
 6 | from torch import nn
 7 | 
 8 | from vllm.config import ModelConfig
 9 | from vllm.model_executor.models import ModelRegistry
10 | 
11 | 
12 | @contextlib.contextmanager
13 | def set_default_torch_dtype(dtype: torch.dtype):
14 |     """Sets the default torch dtype to the given dtype."""
15 |     old_dtype = torch.get_default_dtype()
16 |     torch.set_default_dtype(dtype)
17 |     yield
18 |     torch.set_default_dtype(old_dtype)
19 | 
20 | 
21 | def get_model_architecture(
22 |         model_config: ModelConfig) -> Tuple[Type[nn.Module], str]:
23 |     architectures = getattr(model_config.hf_config, "architectures", [])
24 |     # Special handling for quantized Mixtral.
25 |     # FIXME(woosuk): This is a temporary hack.
26 |     if (model_config.quantization is not None
27 |             and model_config.quantization != "fp8"
28 |             and "MixtralForCausalLM" in architectures):
29 |         architectures = ["QuantMixtralForCausalLM"]
30 | 
31 |     for arch in architectures:
32 |         model_cls = ModelRegistry.load_model_cls(arch)
33 |         if model_cls is not None:
34 |             return (model_cls, arch)
35 |     raise ValueError(
36 |         f"Model architectures {architectures} are not supported for now. "
37 |         f"Supported architectures: {ModelRegistry.get_supported_archs()}")
38 | 
39 | 
40 | def get_architecture_class_name(model_config: ModelConfig) -> str:
41 |     return get_model_architecture(model_config)[1]
42 | 


--------------------------------------------------------------------------------
/vllm/model_executor/models/vlm_base.py:
--------------------------------------------------------------------------------
 1 | from torch import nn
 2 | 
 3 | from vllm.config import VisionLanguageConfig
 4 | 
 5 | 
 6 | class VisionLanguageModelBase(nn.Module):
 7 |     """Base class for all vision language models (VLMs)."""
 8 | 
 9 |     def __init__(self, vision_language_config: VisionLanguageConfig) -> None:
10 |         super().__init__()
11 | 
12 |         self.vision_language_config = vision_language_config
13 | 


--------------------------------------------------------------------------------
/vllm/model_executor/utils.py:
--------------------------------------------------------------------------------
 1 | """Utils for model executor."""
 2 | import random
 3 | from typing import Any, Dict, Optional
 4 | 
 5 | import numpy as np
 6 | import torch
 7 | 
 8 | 
 9 | def set_random_seed(seed: int) -> None:
10 |     random.seed(seed)
11 |     np.random.seed(seed)
12 |     torch.manual_seed(seed)
13 |     if torch.cuda.is_available():
14 |         torch.cuda.manual_seed_all(seed)
15 | 
16 | 
17 | def set_weight_attrs(
18 |     weight: torch.Tensor,
19 |     weight_attrs: Optional[Dict[str, Any]],
20 | ):
21 |     """Set attributes on a weight tensor.
22 | 
23 |     This method is used to set attributes on a weight tensor. This method
24 |     will not overwrite existing attributes.
25 | 
26 |     Args:
27 |         weight: The weight tensor.
28 |         weight_attrs: A dictionary of attributes to set on the weight tensor.
29 |     """
30 |     if weight_attrs is None:
31 |         return
32 |     for key, value in weight_attrs.items():
33 |         assert not hasattr(
34 |             weight, key), (f"Overwriting existing tensor attribute: {key}")
35 |         setattr(weight, key, value)
36 | 


--------------------------------------------------------------------------------
/vllm/multimodal/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import MultiModalData, MultiModalPlugin
2 | from .registry import MULTIMODAL_REGISTRY, MultiModalRegistry
3 | 
4 | __all__ = [
5 |     "MultiModalData", "MultiModalPlugin", "MULTIMODAL_REGISTRY",
6 |     "MultiModalRegistry"
7 | ]
8 | 


--------------------------------------------------------------------------------
/vllm/pooling_params.py:
--------------------------------------------------------------------------------
 1 | from typing import Any, Optional
 2 | 
 3 | 
 4 | class PoolingParams:
 5 |     """Pooling parameters for pooling.
 6 | 
 7 |     Attributes:
 8 |         additional_data: Any additional data needed for pooling.
 9 |     """
10 | 
11 |     def __init__(self, additional_data: Optional[Any] = None):
12 |         self.additional_data = additional_data
13 | 
14 |     def clone(self) -> "PoolingParams":
15 |         """Returns a deep copy of the PoolingParams instance."""
16 |         return PoolingParams(additional_data=self.additional_data, )
17 | 
18 |     def __repr__(self) -> str:
19 |         return (f"PoolingParams("
20 |                 f"additional_metadata={self.additional_data})")
21 | 


--------------------------------------------------------------------------------
/vllm/py.typed:
--------------------------------------------------------------------------------
1 | # Marker file for PEP 561.
2 | # The vllm package uses inline types.
3 | 


--------------------------------------------------------------------------------
/vllm/spec_decode/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/spec_decode/__init__.py


--------------------------------------------------------------------------------
/vllm/transformers_utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/transformers_utils/__init__.py


--------------------------------------------------------------------------------
/vllm/transformers_utils/configs/__init__.py:
--------------------------------------------------------------------------------
 1 | from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
 2 | from vllm.transformers_utils.configs.dbrx import DbrxConfig
 3 | # RWConfig is for the original tiiuae/falcon-40b(-instruct) and
 4 | # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
 5 | # `FalconConfig` class from the official HuggingFace transformers library.
 6 | from vllm.transformers_utils.configs.falcon import RWConfig
 7 | from vllm.transformers_utils.configs.jais import JAISConfig
 8 | from vllm.transformers_utils.configs.mpt import MPTConfig
 9 | 
10 | __all__ = [
11 |     "ChatGLMConfig",
12 |     "DbrxConfig",
13 |     "MPTConfig",
14 |     "RWConfig",
15 |     "JAISConfig",
16 | ]
17 | 


--------------------------------------------------------------------------------
/vllm/transformers_utils/image_processor.py:
--------------------------------------------------------------------------------
 1 | from functools import lru_cache
 2 | from typing import Optional
 3 | 
 4 | from transformers import AutoImageProcessor
 5 | from transformers.image_processing_utils import BaseImageProcessor
 6 | 
 7 | from vllm.logger import init_logger
 8 | 
 9 | logger = init_logger(__name__)
10 | 
11 | 
12 | def get_image_processor(
13 |     processor_name: str,
14 |     *args,
15 |     trust_remote_code: bool = False,
16 |     revision: Optional[str] = None,
17 |     **kwargs,
18 | ) -> BaseImageProcessor:
19 |     """Gets an image processor for the given model name via HuggingFace."""
20 |     try:
21 |         processor: BaseImageProcessor = AutoImageProcessor.from_pretrained(
22 |             processor_name,
23 |             *args,
24 |             trust_remote_code=trust_remote_code,
25 |             revision=revision,
26 |             **kwargs)
27 |     except ValueError as e:
28 |         # If the error pertains to the processor class not existing or not
29 |         # currently being imported, suggest using the --trust-remote-code flag.
30 |         # Unlike AutoTokenizer, AutoImageProcessor does not separate such errors
31 |         if not trust_remote_code:
32 |             err_msg = (
33 |                 "Failed to load the image processor. If the image processor is "
34 |                 "a custom processor not yet available in the HuggingFace "
35 |                 "transformers library, consider setting "
36 |                 "`trust_remote_code=True` in LLM or using the "
37 |                 "`--trust-remote-code` flag in the CLI.")
38 |             raise RuntimeError(err_msg) from e
39 |         else:
40 |             raise e
41 | 
42 |     return processor
43 | 
44 | 
45 | cached_get_image_processor = lru_cache(get_image_processor)
46 | 


--------------------------------------------------------------------------------
/vllm/transformers_utils/tokenizer_group/__init__.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional
 2 | 
 3 | from vllm.config import TokenizerPoolConfig
 4 | from vllm.executor.ray_utils import ray
 5 | from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
 6 |     BaseTokenizerGroup)
 7 | from vllm.transformers_utils.tokenizer_group.tokenizer_group import (
 8 |     TokenizerGroup)
 9 | 
10 | if ray:
11 |     from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import (
12 |         RayTokenizerGroupPool)
13 | else:
14 |     RayTokenizerGroupPool = None  # type: ignore
15 | 
16 | 
17 | def get_tokenizer_group(tokenizer_pool_config: Optional[TokenizerPoolConfig],
18 |                         **init_kwargs) -> BaseTokenizerGroup:
19 |     if tokenizer_pool_config is None:
20 |         return TokenizerGroup(**init_kwargs)
21 |     if tokenizer_pool_config.pool_type == "ray":
22 |         if RayTokenizerGroupPool is None:
23 |             raise ImportError(
24 |                 "RayTokenizerGroupPool is not available. Please install "
25 |                 "the ray package to use the Ray tokenizer group pool.")
26 |         return RayTokenizerGroupPool.from_config(tokenizer_pool_config,
27 |                                                  **init_kwargs)
28 |     else:
29 |         raise ValueError(
30 |             f"Unknown pool type: {tokenizer_pool_config.pool_type}")
31 | 
32 | 
33 | __all__ = ["get_tokenizer_group", "BaseTokenizerGroup"]
34 | 


--------------------------------------------------------------------------------
/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC, abstractmethod
 2 | from typing import List, Optional
 3 | 
 4 | from transformers import PreTrainedTokenizer
 5 | 
 6 | from vllm.lora.request import LoRARequest
 7 | 
 8 | 
 9 | class BaseTokenizerGroup(ABC):
10 |     """A group of tokenizers that can be used for LoRA adapters."""
11 | 
12 |     @abstractmethod
13 |     def ping(self) -> bool:
14 |         """Check if the tokenizer group is alive."""
15 |         pass
16 | 
17 |     @abstractmethod
18 |     def get_max_input_len(self,
19 |                           lora_request: Optional[LoRARequest] = None
20 |                           ) -> Optional[int]:
21 |         """Get the maximum input length for the LoRA request."""
22 |         pass
23 | 
24 |     @abstractmethod
25 |     def encode(self,
26 |                prompt: str,
27 |                request_id: Optional[str] = None,
28 |                lora_request: Optional[LoRARequest] = None) -> List[int]:
29 |         """Encode a prompt using the tokenizer group."""
30 |         pass
31 | 
32 |     @abstractmethod
33 |     async def encode_async(
34 |             self,
35 |             prompt: str,
36 |             request_id: Optional[str] = None,
37 |             lora_request: Optional[LoRARequest] = None) -> List[int]:
38 |         """Encode a prompt using the tokenizer group."""
39 |         pass
40 | 
41 |     @abstractmethod
42 |     def get_lora_tokenizer(
43 |             self,
44 |             lora_request: Optional[LoRARequest] = None
45 |     ) -> "PreTrainedTokenizer":
46 |         """Get a tokenizer for a LoRA request."""
47 |         pass
48 | 
49 |     @abstractmethod
50 |     async def get_lora_tokenizer_async(
51 |             self,
52 |             lora_request: Optional[LoRARequest] = None
53 |     ) -> "PreTrainedTokenizer":
54 |         """Get a tokenizer for a LoRA request."""
55 |         pass
56 | 


--------------------------------------------------------------------------------
/vllm/transformers_utils/tokenizers/__init__.py:
--------------------------------------------------------------------------------
1 | from vllm.transformers_utils.tokenizers.baichuan import BaichuanTokenizer
2 | 
3 | __all__ = [
4 |     "BaichuanTokenizer",
5 | ]
6 | 


--------------------------------------------------------------------------------
/vllm/usage/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/usage/__init__.py


--------------------------------------------------------------------------------
/vllm/worker/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SkyworkAI/vllm/691f77f62751b30a66a0466985a72e6b2551e0f4/vllm/worker/__init__.py


--------------------------------------------------------------------------------